From ae1ddb9386b4e2106054b8c252248a4928c25282 Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Tue, 20 Apr 2021 21:09:38 +0300 Subject: [PATCH] Add printing of table for failed article downloads - Map errors in `fetch_html` to include the source url - Change `article_link` to `article_source` - Add `Into` conversion for `UTF8Error` - Collect errors in `generate_epubs` for displaying in a table --- src/epub.rs | 182 ++++++++++++++++++++++++++++++++------------------ src/errors.rs | 24 +++++-- src/http.rs | 77 +++++++++++---------- src/main.rs | 35 +++++++++- 4 files changed, 211 insertions(+), 107 deletions(-) diff --git a/src/epub.rs b/src/epub.rs index ab6ee9c..83173ab 100644 --- a/src/epub.rs +++ b/src/epub.rs @@ -13,64 +13,100 @@ use crate::{ pub fn generate_epubs( articles: Vec, merged: Option<&String>, -) -> Result<(), PaperoniError> { +) -> Result<(), Vec> { let bar = ProgressBar::new(articles.len() as u64); let style = ProgressStyle::default_bar().template( "{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} epub {pos}/{len:7} {msg:.green}", ); bar.set_style(style); bar.set_message("Generating epubs"); + let mut base_table = Table::new(); base_table .load_preset(UTF8_FULL) .load_preset(UTF8_HORIZONTAL_BORDERS_ONLY) .set_content_arrangement(ContentArrangement::Dynamic); + + let mut errors: Vec = Vec::new(); + let mut can_print_table = false; + match merged { Some(name) => { base_table.set_header(vec![Cell::new("Table of Contents") .add_attribute(Attribute::Bold) .set_alignment(CellAlignment::Center) .fg(Color::Green)]); - let mut epub = EpubBuilder::new(ZipLibrary::new()?)?; + + let mut epub = match EpubBuilder::new(match ZipLibrary::new() { + Ok(zip_library) => zip_library, + Err(err) => { + let mut paperoni_err: PaperoniError = err.into(); + paperoni_err.set_article_source(name); + errors.push(paperoni_err); + return Err(errors); + } + }) { + Ok(epub) => epub, + Err(err) => { + let mut paperoni_err: PaperoniError = err.into(); + paperoni_err.set_article_source(name); + errors.push(paperoni_err); + return Err(errors); + } + }; epub.inline_toc(); - epub = articles + articles .iter() .enumerate() - .fold(epub, |mut epub, (idx, article)| { - let mut html_buf = Vec::new(); - extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf) - .expect("Unable to serialize to xhtml"); - let html_str = std::str::from_utf8(&html_buf).unwrap(); - epub.metadata("title", replace_metadata_value(name)) - .unwrap(); - let section_name = article.metadata().title(); - epub.add_content( - EpubContent::new(format!("article_{}.xhtml", idx), html_str.as_bytes()) - .title(replace_metadata_value(section_name)), - ) - .unwrap(); + .fold(&mut epub, |epub, (idx, article)| { + let mut article_result = || -> Result<(), PaperoniError> { + let mut html_buf = Vec::new(); + extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)?; + let html_str = std::str::from_utf8(&html_buf)?; + epub.metadata("title", replace_metadata_value(name))?; + let section_name = article.metadata().title(); + epub.add_content( + EpubContent::new(format!("article_{}.xhtml", idx), html_str.as_bytes()) + .title(replace_metadata_value(section_name)), + )?; - article.img_urls.iter().for_each(|img| { - // TODO: Add error handling - let mut file_path = std::env::temp_dir(); - file_path.push(&img.0); + article.img_urls.iter().for_each(|img| { + // TODO: Add error handling and return errors as a vec + let mut file_path = std::env::temp_dir(); + file_path.push(&img.0); - let img_buf = File::open(&file_path).expect("Can't read file"); - epub.add_resource( - file_path.file_name().unwrap(), - img_buf, - img.1.as_ref().unwrap(), - ) - .unwrap(); - }); + let img_buf = File::open(&file_path).expect("Can't read file"); + epub.add_resource( + file_path.file_name().unwrap(), + img_buf, + img.1.as_ref().unwrap(), + ) + .unwrap(); + }); + Ok(()) + }; + if let Err(mut error) = article_result() { + error.set_article_source(&article.url); + errors.push(error); + } bar.inc(1); base_table.add_row(vec![article.metadata().title()]); epub }); let mut out_file = File::create(&name).unwrap(); - epub.generate(&mut out_file)?; + match epub.generate(&mut out_file) { + Ok(_) => (), + Err(err) => { + let mut paperoni_err: PaperoniError = err.into(); + paperoni_err.set_article_source(name); + errors.push(paperoni_err); + return Err(errors); + } + } + bar.finish_with_message("Generated epub\n"); println!("Created {:?}", name); + can_print_table = true; } None => { base_table @@ -81,48 +117,62 @@ pub fn generate_epubs( .set_content_arrangement(ContentArrangement::Dynamic); for article in articles { - let mut epub = EpubBuilder::new(ZipLibrary::new()?)?; - let file_name = format!( - "{}.epub", - article - .metadata() - .title() - .replace("/", " ") - .replace("\\", " ") - ); - let mut out_file = File::create(&file_name).unwrap(); - let mut html_buf = Vec::new(); - extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf) - .expect("Unable to serialize to xhtml"); - let html_str = std::str::from_utf8(&html_buf).unwrap(); - if let Some(author) = article.metadata().byline() { - epub.metadata("author", replace_metadata_value(author))?; + let mut result = || -> Result<(), PaperoniError> { + let mut epub = EpubBuilder::new(ZipLibrary::new()?)?; + let file_name = format!( + "{}.epub", + article + .metadata() + .title() + .replace("/", " ") + .replace("\\", " ") + ); + let mut out_file = File::create(&file_name).unwrap(); + let mut html_buf = Vec::new(); + extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf) + .expect("Unable to serialize to xhtml"); + let html_str = std::str::from_utf8(&html_buf).unwrap(); + if let Some(author) = article.metadata().byline() { + epub.metadata("author", replace_metadata_value(author))?; + } + epub.metadata("title", replace_metadata_value(article.metadata().title()))?; + epub.add_content(EpubContent::new("index.xhtml", html_str.as_bytes()))?; + for img in &article.img_urls { + let mut file_path = std::env::temp_dir(); + file_path.push(&img.0); + + let img_buf = File::open(&file_path).expect("Can't read file"); + epub.add_resource( + file_path.file_name().unwrap(), + img_buf, + img.1.as_ref().unwrap(), + )?; + } + epub.generate(&mut out_file)?; + bar.inc(1); + + base_table.add_row(vec![article.metadata().title()]); + + // println!("Created {:?}", file_name); + can_print_table = true; + Ok(()) + }; + if let Err(mut error) = result() { + error.set_article_source(&article.url); + errors.push(error); } - epub.metadata("title", replace_metadata_value(article.metadata().title()))?; - epub.add_content(EpubContent::new("index.xhtml", html_str.as_bytes()))?; - for img in &article.img_urls { - let mut file_path = std::env::temp_dir(); - file_path.push(&img.0); - - let img_buf = File::open(&file_path).expect("Can't read file"); - epub.add_resource( - file_path.file_name().unwrap(), - img_buf, - img.1.as_ref().unwrap(), - )?; - } - epub.generate(&mut out_file)?; - bar.inc(1); - - base_table.add_row(vec![article.metadata().title()]); - - // println!("Created {:?}", file_name); } bar.finish_with_message("Generated epubs\n"); } } - println!("{}", base_table); - Ok(()) + if can_print_table { + println!("{}", base_table); + } + if errors.is_empty() { + Ok(()) + } else { + Err(errors) + } } /// Replaces characters that have to be escaped before adding to the epub's metadata diff --git a/src/errors.rs b/src/errors.rs index f0b3d9c..70a522a 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -8,25 +8,35 @@ pub enum ErrorKind { HTTPError(String), #[error("[IOError]: {0}")] IOError(String), + #[error("[UTF8Error]: {0}")] + UTF8Error(String), } #[derive(Error, Debug)] #[error("{kind}")] pub struct PaperoniError { - article_link: Option, + article_source: Option, kind: ErrorKind, } impl PaperoniError { pub fn with_kind(kind: ErrorKind) -> Self { PaperoniError { - article_link: None, + article_source: None, kind, } } - pub fn set_article_link(&mut self, article_link: String) { - self.article_link = Some(article_link); + pub fn kind(&self) -> &ErrorKind { + &self.kind + } + + pub fn article_source(&self) -> &Option { + &self.article_source + } + + pub fn set_article_source(&mut self, article_source: &str) { + self.article_source = Some(article_source.to_owned()); } } @@ -59,3 +69,9 @@ impl From for PaperoniError { PaperoniError::with_kind(ErrorKind::IOError(err.to_string())) } } + +impl From for PaperoniError { + fn from(err: std::str::Utf8Error) -> Self { + PaperoniError::with_kind(ErrorKind::UTF8Error(err.to_string())) + } +} diff --git a/src/http.rs b/src/http.rs index c945765..6c7f801 100644 --- a/src/http.rs +++ b/src/http.rs @@ -12,46 +12,53 @@ pub async fn fetch_html(url: &str) -> Result { let client = surf::Client::new(); // println!("Fetching..."); - let mut redirect_count: u8 = 0; - let base_url = Url::parse(&url)?; - let mut url = base_url.clone(); - while redirect_count < 5 { - redirect_count += 1; - let req = surf::get(&url); - let mut res = client.send(req).await?; - if res.status().is_redirection() { - if let Some(location) = res.header(surf::http::headers::LOCATION) { - match Url::parse(location.last().as_str()) { - Ok(valid_url) => url = valid_url, - Err(e) => match e { - url::ParseError::RelativeUrlWithoutBase => { - url = base_url.join(location.last().as_str())? - } - e => return Err(e.into()), - }, - }; - } - } else if res.status().is_success() { - if let Some(mime) = res.content_type() { - if mime.essence() == "text/html" { - return Ok((url.to_string(), res.body_string().await?)); - } else { - let msg = format!( - "Invalid HTTP response. Received {} instead of text/html", - mime.essence() - ); + let process_request = async { + let mut redirect_count: u8 = 0; + let base_url = Url::parse(&url)?; + let mut url = base_url.clone(); + while redirect_count < 5 { + redirect_count += 1; + let req = surf::get(&url); + let mut res = client.send(req).await?; + if res.status().is_redirection() { + if let Some(location) = res.header(surf::http::headers::LOCATION) { + match Url::parse(location.last().as_str()) { + Ok(valid_url) => url = valid_url, + Err(e) => match e { + url::ParseError::RelativeUrlWithoutBase => { + url = base_url.join(location.last().as_str())? + } + e => return Err(e.into()), + }, + }; + } + } else if res.status().is_success() { + if let Some(mime) = res.content_type() { + if mime.essence() == "text/html" { + return Ok((url.to_string(), res.body_string().await?)); + } else { + let msg = format!( + "Invalid HTTP response. Received {} instead of text/html", + mime.essence() + ); - return Err(ErrorKind::HTTPError(msg).into()); + return Err(ErrorKind::HTTPError(msg).into()); + } + } else { + return Err(ErrorKind::HTTPError("Unknown HTTP response".to_owned()).into()); } } else { - return Err(ErrorKind::HTTPError("Unknown HTTP response".to_owned()).into()); + let msg = format!("Request failed: HTTP {}", res.status()); + return Err(ErrorKind::HTTPError(msg).into()); } - } else { - let msg = format!("Request failed: HTTP {}", res.status()); - return Err(ErrorKind::HTTPError(msg).into()); } - } - Err(ErrorKind::HTTPError("Unable to fetch HTML".to_owned()).into()) + Err(ErrorKind::HTTPError("Unable to fetch HTML".to_owned()).into()) + }; + + process_request.await.map_err(|mut error: PaperoniError| { + error.set_article_source(url); + error + }) } pub async fn download_images( diff --git a/src/main.rs b/src/main.rs index d3d9cc2..7ac578a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -3,6 +3,8 @@ extern crate lazy_static; use async_std::stream; use async_std::task; +use comfy_table::presets::UTF8_HORIZONTAL_BORDERS_ONLY; +use comfy_table::{Attribute, Cell, CellAlignment, ContentArrangement, Table}; use futures::stream::StreamExt; use indicatif::{ProgressBar, ProgressStyle}; use url::Url; @@ -31,6 +33,7 @@ fn main() { fn download(app_config: AppConfig) { let bar = ProgressBar::new(app_config.urls().len() as u64); + let mut errors = Vec::new(); let style = ProgressStyle::default_bar().template( "{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} link {pos}/{len:7} {msg:.yellow/white}", ); @@ -63,7 +66,7 @@ fn download(app_config: AppConfig) { articles.push(extractor); } } - Err(e) => eprintln!("{}", e), + Err(e) => errors.push(e), } bar.inc(1); } @@ -72,6 +75,34 @@ fn download(app_config: AppConfig) { bar.finish_with_message("Downloaded articles"); match generate_epubs(articles, app_config.merged()) { Ok(_) => (), - Err(e) => eprintln!("{}", e), + Err(gen_epub_errors) => { + errors.extend(gen_epub_errors); + } }; + if !errors.is_empty() { + println!( + "{}Failed article downloads{}", + Attribute::Bold, + Attribute::NormalIntensity + ); + let mut table_failed = Table::new(); + table_failed + .load_preset(UTF8_HORIZONTAL_BORDERS_ONLY) + .set_header(vec![ + Cell::new("Link").set_alignment(CellAlignment::Center), + Cell::new("Reason").set_alignment(CellAlignment::Center), + ]) + .set_content_arrangement(ContentArrangement::Dynamic); + + for error in errors { + table_failed.add_row(vec![ + error + .article_source() + .clone() + .unwrap_or_else(|| "".to_string()), + format!("{}", error.kind()), + ]); + } + println!("{}", table_failed); + } }