Add printing of table for failed article downloads
- Map errors in `fetch_html` to include the source URL
- Change `article_link` to `article_source`
- Add a `UTF8Error` error kind and a `From<std::str::Utf8Error>` conversion into `PaperoniError`
- Collect errors in `generate_epubs` for displaying in a table
This commit is contained in:
parent
60fb30e8a2
commit
ae1ddb9386
4 changed files with 211 additions and 107 deletions
182
src/epub.rs
182
src/epub.rs
|
@ -13,64 +13,100 @@ use crate::{
|
|||
pub fn generate_epubs(
|
||||
articles: Vec<Extractor>,
|
||||
merged: Option<&String>,
|
||||
) -> Result<(), PaperoniError> {
|
||||
) -> Result<(), Vec<PaperoniError>> {
|
||||
let bar = ProgressBar::new(articles.len() as u64);
|
||||
let style = ProgressStyle::default_bar().template(
|
||||
"{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} epub {pos}/{len:7} {msg:.green}",
|
||||
);
|
||||
bar.set_style(style);
|
||||
bar.set_message("Generating epubs");
|
||||
|
||||
let mut base_table = Table::new();
|
||||
base_table
|
||||
.load_preset(UTF8_FULL)
|
||||
.load_preset(UTF8_HORIZONTAL_BORDERS_ONLY)
|
||||
.set_content_arrangement(ContentArrangement::Dynamic);
|
||||
|
||||
let mut errors: Vec<PaperoniError> = Vec::new();
|
||||
let mut can_print_table = false;
|
||||
|
||||
match merged {
|
||||
Some(name) => {
|
||||
base_table.set_header(vec![Cell::new("Table of Contents")
|
||||
.add_attribute(Attribute::Bold)
|
||||
.set_alignment(CellAlignment::Center)
|
||||
.fg(Color::Green)]);
|
||||
let mut epub = EpubBuilder::new(ZipLibrary::new()?)?;
|
||||
|
||||
let mut epub = match EpubBuilder::new(match ZipLibrary::new() {
|
||||
Ok(zip_library) => zip_library,
|
||||
Err(err) => {
|
||||
let mut paperoni_err: PaperoniError = err.into();
|
||||
paperoni_err.set_article_source(name);
|
||||
errors.push(paperoni_err);
|
||||
return Err(errors);
|
||||
}
|
||||
}) {
|
||||
Ok(epub) => epub,
|
||||
Err(err) => {
|
||||
let mut paperoni_err: PaperoniError = err.into();
|
||||
paperoni_err.set_article_source(name);
|
||||
errors.push(paperoni_err);
|
||||
return Err(errors);
|
||||
}
|
||||
};
|
||||
epub.inline_toc();
|
||||
epub = articles
|
||||
articles
|
||||
.iter()
|
||||
.enumerate()
|
||||
.fold(epub, |mut epub, (idx, article)| {
|
||||
let mut html_buf = Vec::new();
|
||||
extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)
|
||||
.expect("Unable to serialize to xhtml");
|
||||
let html_str = std::str::from_utf8(&html_buf).unwrap();
|
||||
epub.metadata("title", replace_metadata_value(name))
|
||||
.unwrap();
|
||||
let section_name = article.metadata().title();
|
||||
epub.add_content(
|
||||
EpubContent::new(format!("article_{}.xhtml", idx), html_str.as_bytes())
|
||||
.title(replace_metadata_value(section_name)),
|
||||
)
|
||||
.unwrap();
|
||||
.fold(&mut epub, |epub, (idx, article)| {
|
||||
let mut article_result = || -> Result<(), PaperoniError> {
|
||||
let mut html_buf = Vec::new();
|
||||
extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)?;
|
||||
let html_str = std::str::from_utf8(&html_buf)?;
|
||||
epub.metadata("title", replace_metadata_value(name))?;
|
||||
let section_name = article.metadata().title();
|
||||
epub.add_content(
|
||||
EpubContent::new(format!("article_{}.xhtml", idx), html_str.as_bytes())
|
||||
.title(replace_metadata_value(section_name)),
|
||||
)?;
|
||||
|
||||
article.img_urls.iter().for_each(|img| {
|
||||
// TODO: Add error handling
|
||||
let mut file_path = std::env::temp_dir();
|
||||
file_path.push(&img.0);
|
||||
article.img_urls.iter().for_each(|img| {
|
||||
// TODO: Add error handling and return errors as a vec
|
||||
let mut file_path = std::env::temp_dir();
|
||||
file_path.push(&img.0);
|
||||
|
||||
let img_buf = File::open(&file_path).expect("Can't read file");
|
||||
epub.add_resource(
|
||||
file_path.file_name().unwrap(),
|
||||
img_buf,
|
||||
img.1.as_ref().unwrap(),
|
||||
)
|
||||
.unwrap();
|
||||
});
|
||||
let img_buf = File::open(&file_path).expect("Can't read file");
|
||||
epub.add_resource(
|
||||
file_path.file_name().unwrap(),
|
||||
img_buf,
|
||||
img.1.as_ref().unwrap(),
|
||||
)
|
||||
.unwrap();
|
||||
});
|
||||
Ok(())
|
||||
};
|
||||
if let Err(mut error) = article_result() {
|
||||
error.set_article_source(&article.url);
|
||||
errors.push(error);
|
||||
}
|
||||
bar.inc(1);
|
||||
base_table.add_row(vec![article.metadata().title()]);
|
||||
epub
|
||||
});
|
||||
let mut out_file = File::create(&name).unwrap();
|
||||
epub.generate(&mut out_file)?;
|
||||
match epub.generate(&mut out_file) {
|
||||
Ok(_) => (),
|
||||
Err(err) => {
|
||||
let mut paperoni_err: PaperoniError = err.into();
|
||||
paperoni_err.set_article_source(name);
|
||||
errors.push(paperoni_err);
|
||||
return Err(errors);
|
||||
}
|
||||
}
|
||||
|
||||
bar.finish_with_message("Generated epub\n");
|
||||
println!("Created {:?}", name);
|
||||
can_print_table = true;
|
||||
}
|
||||
None => {
|
||||
base_table
|
||||
|
@ -81,48 +117,62 @@ pub fn generate_epubs(
|
|||
.set_content_arrangement(ContentArrangement::Dynamic);
|
||||
|
||||
for article in articles {
|
||||
let mut epub = EpubBuilder::new(ZipLibrary::new()?)?;
|
||||
let file_name = format!(
|
||||
"{}.epub",
|
||||
article
|
||||
.metadata()
|
||||
.title()
|
||||
.replace("/", " ")
|
||||
.replace("\\", " ")
|
||||
);
|
||||
let mut out_file = File::create(&file_name).unwrap();
|
||||
let mut html_buf = Vec::new();
|
||||
extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)
|
||||
.expect("Unable to serialize to xhtml");
|
||||
let html_str = std::str::from_utf8(&html_buf).unwrap();
|
||||
if let Some(author) = article.metadata().byline() {
|
||||
epub.metadata("author", replace_metadata_value(author))?;
|
||||
let mut result = || -> Result<(), PaperoniError> {
|
||||
let mut epub = EpubBuilder::new(ZipLibrary::new()?)?;
|
||||
let file_name = format!(
|
||||
"{}.epub",
|
||||
article
|
||||
.metadata()
|
||||
.title()
|
||||
.replace("/", " ")
|
||||
.replace("\\", " ")
|
||||
);
|
||||
let mut out_file = File::create(&file_name).unwrap();
|
||||
let mut html_buf = Vec::new();
|
||||
extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)
|
||||
.expect("Unable to serialize to xhtml");
|
||||
let html_str = std::str::from_utf8(&html_buf).unwrap();
|
||||
if let Some(author) = article.metadata().byline() {
|
||||
epub.metadata("author", replace_metadata_value(author))?;
|
||||
}
|
||||
epub.metadata("title", replace_metadata_value(article.metadata().title()))?;
|
||||
epub.add_content(EpubContent::new("index.xhtml", html_str.as_bytes()))?;
|
||||
for img in &article.img_urls {
|
||||
let mut file_path = std::env::temp_dir();
|
||||
file_path.push(&img.0);
|
||||
|
||||
let img_buf = File::open(&file_path).expect("Can't read file");
|
||||
epub.add_resource(
|
||||
file_path.file_name().unwrap(),
|
||||
img_buf,
|
||||
img.1.as_ref().unwrap(),
|
||||
)?;
|
||||
}
|
||||
epub.generate(&mut out_file)?;
|
||||
bar.inc(1);
|
||||
|
||||
base_table.add_row(vec![article.metadata().title()]);
|
||||
|
||||
// println!("Created {:?}", file_name);
|
||||
can_print_table = true;
|
||||
Ok(())
|
||||
};
|
||||
if let Err(mut error) = result() {
|
||||
error.set_article_source(&article.url);
|
||||
errors.push(error);
|
||||
}
|
||||
epub.metadata("title", replace_metadata_value(article.metadata().title()))?;
|
||||
epub.add_content(EpubContent::new("index.xhtml", html_str.as_bytes()))?;
|
||||
for img in &article.img_urls {
|
||||
let mut file_path = std::env::temp_dir();
|
||||
file_path.push(&img.0);
|
||||
|
||||
let img_buf = File::open(&file_path).expect("Can't read file");
|
||||
epub.add_resource(
|
||||
file_path.file_name().unwrap(),
|
||||
img_buf,
|
||||
img.1.as_ref().unwrap(),
|
||||
)?;
|
||||
}
|
||||
epub.generate(&mut out_file)?;
|
||||
bar.inc(1);
|
||||
|
||||
base_table.add_row(vec![article.metadata().title()]);
|
||||
|
||||
// println!("Created {:?}", file_name);
|
||||
}
|
||||
bar.finish_with_message("Generated epubs\n");
|
||||
}
|
||||
}
|
||||
println!("{}", base_table);
|
||||
Ok(())
|
||||
if can_print_table {
|
||||
println!("{}", base_table);
|
||||
}
|
||||
if errors.is_empty() {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(errors)
|
||||
}
|
||||
}
|
||||
|
||||
/// Replaces characters that have to be escaped before adding to the epub's metadata
|
||||
|
|
|
@ -8,25 +8,35 @@ pub enum ErrorKind {
|
|||
HTTPError(String),
|
||||
#[error("[IOError]: {0}")]
|
||||
IOError(String),
|
||||
#[error("[UTF8Error]: {0}")]
|
||||
UTF8Error(String),
|
||||
}
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
#[error("{kind}")]
|
||||
pub struct PaperoniError {
|
||||
article_link: Option<String>,
|
||||
article_source: Option<String>,
|
||||
kind: ErrorKind,
|
||||
}
|
||||
|
||||
impl PaperoniError {
|
||||
pub fn with_kind(kind: ErrorKind) -> Self {
|
||||
PaperoniError {
|
||||
article_link: None,
|
||||
article_source: None,
|
||||
kind,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn set_article_link(&mut self, article_link: String) {
|
||||
self.article_link = Some(article_link);
|
||||
pub fn kind(&self) -> &ErrorKind {
|
||||
&self.kind
|
||||
}
|
||||
|
||||
pub fn article_source(&self) -> &Option<String> {
|
||||
&self.article_source
|
||||
}
|
||||
|
||||
pub fn set_article_source(&mut self, article_source: &str) {
|
||||
self.article_source = Some(article_source.to_owned());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -59,3 +69,9 @@ impl From<std::io::Error> for PaperoniError {
|
|||
PaperoniError::with_kind(ErrorKind::IOError(err.to_string()))
|
||||
}
|
||||
}
|
||||
|
||||
impl From<std::str::Utf8Error> for PaperoniError {
|
||||
fn from(err: std::str::Utf8Error) -> Self {
|
||||
PaperoniError::with_kind(ErrorKind::UTF8Error(err.to_string()))
|
||||
}
|
||||
}
|
||||
|
|
77
src/http.rs
77
src/http.rs
|
@ -12,46 +12,53 @@ pub async fn fetch_html(url: &str) -> Result<HTMLResource, PaperoniError> {
|
|||
let client = surf::Client::new();
|
||||
// println!("Fetching...");
|
||||
|
||||
let mut redirect_count: u8 = 0;
|
||||
let base_url = Url::parse(&url)?;
|
||||
let mut url = base_url.clone();
|
||||
while redirect_count < 5 {
|
||||
redirect_count += 1;
|
||||
let req = surf::get(&url);
|
||||
let mut res = client.send(req).await?;
|
||||
if res.status().is_redirection() {
|
||||
if let Some(location) = res.header(surf::http::headers::LOCATION) {
|
||||
match Url::parse(location.last().as_str()) {
|
||||
Ok(valid_url) => url = valid_url,
|
||||
Err(e) => match e {
|
||||
url::ParseError::RelativeUrlWithoutBase => {
|
||||
url = base_url.join(location.last().as_str())?
|
||||
}
|
||||
e => return Err(e.into()),
|
||||
},
|
||||
};
|
||||
}
|
||||
} else if res.status().is_success() {
|
||||
if let Some(mime) = res.content_type() {
|
||||
if mime.essence() == "text/html" {
|
||||
return Ok((url.to_string(), res.body_string().await?));
|
||||
} else {
|
||||
let msg = format!(
|
||||
"Invalid HTTP response. Received {} instead of text/html",
|
||||
mime.essence()
|
||||
);
|
||||
let process_request = async {
|
||||
let mut redirect_count: u8 = 0;
|
||||
let base_url = Url::parse(&url)?;
|
||||
let mut url = base_url.clone();
|
||||
while redirect_count < 5 {
|
||||
redirect_count += 1;
|
||||
let req = surf::get(&url);
|
||||
let mut res = client.send(req).await?;
|
||||
if res.status().is_redirection() {
|
||||
if let Some(location) = res.header(surf::http::headers::LOCATION) {
|
||||
match Url::parse(location.last().as_str()) {
|
||||
Ok(valid_url) => url = valid_url,
|
||||
Err(e) => match e {
|
||||
url::ParseError::RelativeUrlWithoutBase => {
|
||||
url = base_url.join(location.last().as_str())?
|
||||
}
|
||||
e => return Err(e.into()),
|
||||
},
|
||||
};
|
||||
}
|
||||
} else if res.status().is_success() {
|
||||
if let Some(mime) = res.content_type() {
|
||||
if mime.essence() == "text/html" {
|
||||
return Ok((url.to_string(), res.body_string().await?));
|
||||
} else {
|
||||
let msg = format!(
|
||||
"Invalid HTTP response. Received {} instead of text/html",
|
||||
mime.essence()
|
||||
);
|
||||
|
||||
return Err(ErrorKind::HTTPError(msg).into());
|
||||
return Err(ErrorKind::HTTPError(msg).into());
|
||||
}
|
||||
} else {
|
||||
return Err(ErrorKind::HTTPError("Unknown HTTP response".to_owned()).into());
|
||||
}
|
||||
} else {
|
||||
return Err(ErrorKind::HTTPError("Unknown HTTP response".to_owned()).into());
|
||||
let msg = format!("Request failed: HTTP {}", res.status());
|
||||
return Err(ErrorKind::HTTPError(msg).into());
|
||||
}
|
||||
} else {
|
||||
let msg = format!("Request failed: HTTP {}", res.status());
|
||||
return Err(ErrorKind::HTTPError(msg).into());
|
||||
}
|
||||
}
|
||||
Err(ErrorKind::HTTPError("Unable to fetch HTML".to_owned()).into())
|
||||
Err(ErrorKind::HTTPError("Unable to fetch HTML".to_owned()).into())
|
||||
};
|
||||
|
||||
process_request.await.map_err(|mut error: PaperoniError| {
|
||||
error.set_article_source(url);
|
||||
error
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn download_images(
|
||||
|
|
35
src/main.rs
35
src/main.rs
|
@ -3,6 +3,8 @@ extern crate lazy_static;
|
|||
|
||||
use async_std::stream;
|
||||
use async_std::task;
|
||||
use comfy_table::presets::UTF8_HORIZONTAL_BORDERS_ONLY;
|
||||
use comfy_table::{Attribute, Cell, CellAlignment, ContentArrangement, Table};
|
||||
use futures::stream::StreamExt;
|
||||
use indicatif::{ProgressBar, ProgressStyle};
|
||||
use url::Url;
|
||||
|
@ -31,6 +33,7 @@ fn main() {
|
|||
|
||||
fn download(app_config: AppConfig) {
|
||||
let bar = ProgressBar::new(app_config.urls().len() as u64);
|
||||
let mut errors = Vec::new();
|
||||
let style = ProgressStyle::default_bar().template(
|
||||
"{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} link {pos}/{len:7} {msg:.yellow/white}",
|
||||
);
|
||||
|
@ -63,7 +66,7 @@ fn download(app_config: AppConfig) {
|
|||
articles.push(extractor);
|
||||
}
|
||||
}
|
||||
Err(e) => eprintln!("{}", e),
|
||||
Err(e) => errors.push(e),
|
||||
}
|
||||
bar.inc(1);
|
||||
}
|
||||
|
@ -72,6 +75,34 @@ fn download(app_config: AppConfig) {
|
|||
bar.finish_with_message("Downloaded articles");
|
||||
match generate_epubs(articles, app_config.merged()) {
|
||||
Ok(_) => (),
|
||||
Err(e) => eprintln!("{}", e),
|
||||
Err(gen_epub_errors) => {
|
||||
errors.extend(gen_epub_errors);
|
||||
}
|
||||
};
|
||||
if !errors.is_empty() {
|
||||
println!(
|
||||
"{}Failed article downloads{}",
|
||||
Attribute::Bold,
|
||||
Attribute::NormalIntensity
|
||||
);
|
||||
let mut table_failed = Table::new();
|
||||
table_failed
|
||||
.load_preset(UTF8_HORIZONTAL_BORDERS_ONLY)
|
||||
.set_header(vec![
|
||||
Cell::new("Link").set_alignment(CellAlignment::Center),
|
||||
Cell::new("Reason").set_alignment(CellAlignment::Center),
|
||||
])
|
||||
.set_content_arrangement(ContentArrangement::Dynamic);
|
||||
|
||||
for error in errors {
|
||||
table_failed.add_row(vec![
|
||||
error
|
||||
.article_source()
|
||||
.clone()
|
||||
.unwrap_or_else(|| "<unknown link>".to_string()),
|
||||
format!("{}", error.kind()),
|
||||
]);
|
||||
}
|
||||
println!("{}", table_failed);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue