Add printing of table for failed article downloads

- Map errors in `fetch_html` to include the source url
- Change `article_link` to `article_source`
- Add `Into` conversion for `UTF8Error`
- Collect errors in `generate_epubs` for displaying in a table
Author: Kenneth Gitere
Date: 2021-04-20 21:09:38 +03:00
parent 60fb30e8a2
commit ae1ddb9386

4 changed files with 211 additions and 107 deletions


@@ -13,64 +13,100 @@ use crate::{
 pub fn generate_epubs(
     articles: Vec<Extractor>,
     merged: Option<&String>,
-) -> Result<(), PaperoniError> {
+) -> Result<(), Vec<PaperoniError>> {
     let bar = ProgressBar::new(articles.len() as u64);
     let style = ProgressStyle::default_bar().template(
         "{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} epub {pos}/{len:7} {msg:.green}",
     );
     bar.set_style(style);
     bar.set_message("Generating epubs");
     let mut base_table = Table::new();
     base_table
         .load_preset(UTF8_FULL)
         .load_preset(UTF8_HORIZONTAL_BORDERS_ONLY)
         .set_content_arrangement(ContentArrangement::Dynamic);
+    let mut errors: Vec<PaperoniError> = Vec::new();
+    let mut can_print_table = false;
     match merged {
         Some(name) => {
             base_table.set_header(vec![Cell::new("Table of Contents")
                 .add_attribute(Attribute::Bold)
                 .set_alignment(CellAlignment::Center)
                 .fg(Color::Green)]);
-            let mut epub = EpubBuilder::new(ZipLibrary::new()?)?;
+            let mut epub = match EpubBuilder::new(match ZipLibrary::new() {
+                Ok(zip_library) => zip_library,
+                Err(err) => {
+                    let mut paperoni_err: PaperoniError = err.into();
+                    paperoni_err.set_article_source(name);
+                    errors.push(paperoni_err);
+                    return Err(errors);
+                }
+            }) {
+                Ok(epub) => epub,
+                Err(err) => {
+                    let mut paperoni_err: PaperoniError = err.into();
+                    paperoni_err.set_article_source(name);
+                    errors.push(paperoni_err);
+                    return Err(errors);
+                }
+            };
             epub.inline_toc();
-            epub = articles
+            articles
                 .iter()
                 .enumerate()
-                .fold(epub, |mut epub, (idx, article)| {
-                    let mut html_buf = Vec::new();
-                    extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)
-                        .expect("Unable to serialize to xhtml");
-                    let html_str = std::str::from_utf8(&html_buf).unwrap();
-                    epub.metadata("title", replace_metadata_value(name))
-                        .unwrap();
-                    let section_name = article.metadata().title();
-                    epub.add_content(
-                        EpubContent::new(format!("article_{}.xhtml", idx), html_str.as_bytes())
-                            .title(replace_metadata_value(section_name)),
-                    )
-                    .unwrap();
+                .fold(&mut epub, |epub, (idx, article)| {
+                    let mut article_result = || -> Result<(), PaperoniError> {
+                        let mut html_buf = Vec::new();
+                        extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)?;
+                        let html_str = std::str::from_utf8(&html_buf)?;
+                        epub.metadata("title", replace_metadata_value(name))?;
+                        let section_name = article.metadata().title();
+                        epub.add_content(
+                            EpubContent::new(format!("article_{}.xhtml", idx), html_str.as_bytes())
+                                .title(replace_metadata_value(section_name)),
+                        )?;
                         article.img_urls.iter().for_each(|img| {
-                            // TODO: Add error handling
+                            // TODO: Add error handling and return errors as a vec
                             let mut file_path = std::env::temp_dir();
                             file_path.push(&img.0);
                             let img_buf = File::open(&file_path).expect("Can't read file");
                             epub.add_resource(
                                 file_path.file_name().unwrap(),
                                 img_buf,
                                 img.1.as_ref().unwrap(),
                             )
                             .unwrap();
                         });
+                        Ok(())
+                    };
+                    if let Err(mut error) = article_result() {
+                        error.set_article_source(&article.url);
+                        errors.push(error);
+                    }
                     bar.inc(1);
                     base_table.add_row(vec![article.metadata().title()]);
                     epub
                 });
             let mut out_file = File::create(&name).unwrap();
-            epub.generate(&mut out_file)?;
+            match epub.generate(&mut out_file) {
+                Ok(_) => (),
+                Err(err) => {
+                    let mut paperoni_err: PaperoniError = err.into();
+                    paperoni_err.set_article_source(name);
+                    errors.push(paperoni_err);
+                    return Err(errors);
+                }
+            }
             bar.finish_with_message("Generated epub\n");
             println!("Created {:?}", name);
+            can_print_table = true;
         }
         None => {
             base_table
@@ -81,48 +117,62 @@ pub fn generate_epubs(
                 .set_content_arrangement(ContentArrangement::Dynamic);
             for article in articles {
+                let mut result = || -> Result<(), PaperoniError> {
                     let mut epub = EpubBuilder::new(ZipLibrary::new()?)?;
                     let file_name = format!(
                         "{}.epub",
                         article
                             .metadata()
                             .title()
                             .replace("/", " ")
                             .replace("\\", " ")
                     );
                     let mut out_file = File::create(&file_name).unwrap();
                     let mut html_buf = Vec::new();
                     extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)
                         .expect("Unable to serialize to xhtml");
                     let html_str = std::str::from_utf8(&html_buf).unwrap();
                     if let Some(author) = article.metadata().byline() {
                         epub.metadata("author", replace_metadata_value(author))?;
                     }
                     epub.metadata("title", replace_metadata_value(article.metadata().title()))?;
                     epub.add_content(EpubContent::new("index.xhtml", html_str.as_bytes()))?;
                     for img in &article.img_urls {
                         let mut file_path = std::env::temp_dir();
                         file_path.push(&img.0);
                         let img_buf = File::open(&file_path).expect("Can't read file");
                         epub.add_resource(
                             file_path.file_name().unwrap(),
                             img_buf,
                             img.1.as_ref().unwrap(),
                         )?;
                     }
                     epub.generate(&mut out_file)?;
                     bar.inc(1);
                     base_table.add_row(vec![article.metadata().title()]);
                     // println!("Created {:?}", file_name);
+                    can_print_table = true;
+                    Ok(())
+                };
+                if let Err(mut error) = result() {
+                    error.set_article_source(&article.url);
+                    errors.push(error);
+                }
             }
             bar.finish_with_message("Generated epubs\n");
         }
     }
-    println!("{}", base_table);
-    Ok(())
+    if can_print_table {
+        println!("{}", base_table);
+    }
+    if errors.is_empty() {
+        Ok(())
+    } else {
+        Err(errors)
+    }
 }

 /// Replaces characters that have to be escaped before adding to the epub's metadata
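
The pattern above, grouping each article's fallible steps in a closure that returns `Result<(), PaperoniError>`, then tagging and collecting the error, is what lets `generate_epubs` keep going after one article fails and still report every failure at the end. A minimal, self-contained sketch of that shape; `ItemError` and `process_all` are placeholder names for illustration, not part of the codebase:

```rust
// Illustrative only: a stand-in error type with an optional source, like PaperoniError.
#[derive(Debug)]
struct ItemError {
    source: Option<String>,
    message: String,
}

// Process every item, collecting failures instead of bailing out on the first one.
fn process_all(items: &[(String, bool)]) -> Result<(), Vec<ItemError>> {
    let mut errors: Vec<ItemError> = Vec::new();

    for (url, should_fail) in items {
        // Group the fallible steps in a closure so `?` can be used inside it.
        let step = || -> Result<(), ItemError> {
            if *should_fail {
                return Err(ItemError {
                    source: None,
                    message: "could not build epub".into(),
                });
            }
            Ok(())
        };
        // Tag the error with the item's source before collecting it.
        if let Err(mut error) = step() {
            error.source = Some(url.clone());
            errors.push(error);
        }
    }

    if errors.is_empty() {
        Ok(())
    } else {
        Err(errors)
    }
}

fn main() {
    let items = vec![
        ("https://ok.example".to_string(), false),
        ("https://bad.example".to_string(), true),
    ];
    match process_all(&items) {
        Ok(()) => println!("all items processed"),
        Err(errors) => {
            for error in errors {
                println!("{:?} failed: {}", error.source, error.message);
            }
        }
    }
}
```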


@@ -8,25 +8,35 @@ pub enum ErrorKind {
     HTTPError(String),
     #[error("[IOError]: {0}")]
     IOError(String),
+    #[error("[UTF8Error]: {0}")]
+    UTF8Error(String),
 }

 #[derive(Error, Debug)]
 #[error("{kind}")]
 pub struct PaperoniError {
-    article_link: Option<String>,
+    article_source: Option<String>,
     kind: ErrorKind,
 }

 impl PaperoniError {
     pub fn with_kind(kind: ErrorKind) -> Self {
         PaperoniError {
-            article_link: None,
+            article_source: None,
             kind,
         }
     }

-    pub fn set_article_link(&mut self, article_link: String) {
-        self.article_link = Some(article_link);
+    pub fn kind(&self) -> &ErrorKind {
+        &self.kind
+    }
+
+    pub fn article_source(&self) -> &Option<String> {
+        &self.article_source
+    }
+
+    pub fn set_article_source(&mut self, article_source: &str) {
+        self.article_source = Some(article_source.to_owned());
     }
 }

@@ -59,3 +69,9 @@ impl From<std::io::Error> for PaperoniError {
         PaperoniError::with_kind(ErrorKind::IOError(err.to_string()))
     }
 }
+
+impl From<std::str::Utf8Error> for PaperoniError {
+    fn from(err: std::str::Utf8Error) -> Self {
+        PaperoniError::with_kind(ErrorKind::UTF8Error(err.to_string()))
+    }
+}
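
The new `From<std::str::Utf8Error>` impl is what allows `generate_epubs` above to swap `std::str::from_utf8(&html_buf).unwrap()` for the `?` operator. A small sketch of that mechanism with a stand-in error type; `SketchError` is illustrative, not part of the crate:

```rust
// Illustrative stand-in for PaperoniError's UTF8Error variant.
#[derive(Debug)]
enum SketchError {
    Utf8(String),
}

impl From<std::str::Utf8Error> for SketchError {
    fn from(err: std::str::Utf8Error) -> Self {
        SketchError::Utf8(err.to_string())
    }
}

// Because of the From impl, `?` converts the Utf8Error into SketchError automatically.
fn to_str(bytes: &[u8]) -> Result<&str, SketchError> {
    Ok(std::str::from_utf8(bytes)?)
}

fn main() {
    assert!(to_str(b"valid utf-8").is_ok());
    println!("{:?}", to_str(&[0xff, 0xfe]).unwrap_err());
}
```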


@@ -12,46 +12,53 @@ pub async fn fetch_html(url: &str) -> Result<HTMLResource, PaperoniError> {
     let client = surf::Client::new();
     // println!("Fetching...");
+    let process_request = async {
         let mut redirect_count: u8 = 0;
         let base_url = Url::parse(&url)?;
         let mut url = base_url.clone();
         while redirect_count < 5 {
             redirect_count += 1;
             let req = surf::get(&url);
             let mut res = client.send(req).await?;
             if res.status().is_redirection() {
                 if let Some(location) = res.header(surf::http::headers::LOCATION) {
                     match Url::parse(location.last().as_str()) {
                         Ok(valid_url) => url = valid_url,
                         Err(e) => match e {
                             url::ParseError::RelativeUrlWithoutBase => {
                                 url = base_url.join(location.last().as_str())?
                             }
                             e => return Err(e.into()),
                         },
                     };
                 }
             } else if res.status().is_success() {
                 if let Some(mime) = res.content_type() {
                     if mime.essence() == "text/html" {
                         return Ok((url.to_string(), res.body_string().await?));
                     } else {
                         let msg = format!(
                             "Invalid HTTP response. Received {} instead of text/html",
                             mime.essence()
                         );
                         return Err(ErrorKind::HTTPError(msg).into());
                     }
                 } else {
                     return Err(ErrorKind::HTTPError("Unknown HTTP response".to_owned()).into());
                 }
             } else {
                 let msg = format!("Request failed: HTTP {}", res.status());
                 return Err(ErrorKind::HTTPError(msg).into());
             }
         }
         Err(ErrorKind::HTTPError("Unable to fetch HTML".to_owned()).into())
+    };
+    process_request.await.map_err(|mut error: PaperoniError| {
+        error.set_article_source(url);
+        error
+    })
 }

 pub async fn download_images(
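
Moving the whole request flow into an inner `async` block is what makes the single `map_err` at the end possible: every `return Err(...)` now exits only the block, so the source URL is attached in exactly one place. A reduced sketch of the same shape; the names and error type are illustrative, and `futures::executor::block_on` (assuming the `futures` crate with its default executor feature, which this project already depends on) stands in for the real async-std runtime:

```rust
use futures::executor::block_on; // assumed: futures crate with its default executor feature

// Illustrative error type with a source field, standing in for PaperoniError.
#[derive(Debug)]
struct TaggedError {
    source: Option<String>,
    message: String,
}

async fn fetch(url: &str) -> Result<String, TaggedError> {
    // All fallible steps live in an inner async block...
    let process_request = async {
        if url.is_empty() {
            // ...so this early return leaves the block, not `fetch` itself.
            return Err(TaggedError {
                source: None,
                message: "empty URL".into(),
            });
        }
        Ok(format!("<html> fetched from {}", url))
    };

    // ...and a single map_err tags every error with the URL it came from.
    process_request.await.map_err(|mut error| {
        error.source = Some(url.to_owned());
        error
    })
}

fn main() {
    assert!(block_on(fetch("https://example.com")).is_ok());
    let err = block_on(fetch("")).unwrap_err();
    println!("failed source: {:?}, reason: {}", err.source, err.message);
}
```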


@@ -3,6 +3,8 @@ extern crate lazy_static;

 use async_std::stream;
 use async_std::task;
+use comfy_table::presets::UTF8_HORIZONTAL_BORDERS_ONLY;
+use comfy_table::{Attribute, Cell, CellAlignment, ContentArrangement, Table};
 use futures::stream::StreamExt;
 use indicatif::{ProgressBar, ProgressStyle};
 use url::Url;

@@ -31,6 +33,7 @@ fn main() {

 fn download(app_config: AppConfig) {
     let bar = ProgressBar::new(app_config.urls().len() as u64);
+    let mut errors = Vec::new();
     let style = ProgressStyle::default_bar().template(
         "{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} link {pos}/{len:7} {msg:.yellow/white}",
     );

@@ -63,7 +66,7 @@ fn download(app_config: AppConfig) {
                         articles.push(extractor);
                     }
                 }
-                Err(e) => eprintln!("{}", e),
+                Err(e) => errors.push(e),
             }
             bar.inc(1);
         }

@@ -72,6 +75,34 @@ fn download(app_config: AppConfig) {
     bar.finish_with_message("Downloaded articles");
     match generate_epubs(articles, app_config.merged()) {
         Ok(_) => (),
-        Err(e) => eprintln!("{}", e),
+        Err(gen_epub_errors) => {
+            errors.extend(gen_epub_errors);
+        }
     };
+
+    if !errors.is_empty() {
+        println!(
+            "{}Failed article downloads{}",
+            Attribute::Bold,
+            Attribute::NormalIntensity
+        );
+        let mut table_failed = Table::new();
+        table_failed
+            .load_preset(UTF8_HORIZONTAL_BORDERS_ONLY)
+            .set_header(vec![
+                Cell::new("Link").set_alignment(CellAlignment::Center),
+                Cell::new("Reason").set_alignment(CellAlignment::Center),
+            ])
+            .set_content_arrangement(ContentArrangement::Dynamic);
+        for error in errors {
+            table_failed.add_row(vec![
+                error
+                    .article_source()
+                    .clone()
+                    .unwrap_or_else(|| "<unknown link>".to_string()),
+                format!("{}", error.kind()),
+            ]);
+        }
+        println!("{}", table_failed);
+    }
 }