From d6cbbe405b9786d40216324d9d9bf2d5200be127 Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Wed, 14 Apr 2021 18:07:39 +0300 Subject: [PATCH 01/24] Fix bug in `inline_css_str_to_map` --- src/moz_readability/mod.rs | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/src/moz_readability/mod.rs b/src/moz_readability/mod.rs index 9b25b79..4d28415 100644 --- a/src/moz_readability/mod.rs +++ b/src/moz_readability/mod.rs @@ -799,6 +799,7 @@ impl Readability { state = State::ReadProp; decl.1 = Some(token.trim().to_string()); tokens.push(decl.clone()); + decl = (None, None); token.clear(); } else { token.push(c); @@ -819,11 +820,18 @@ impl Readability { } } if !token.is_empty() { - decl.1 = Some(token.trim().to_string()); - tokens.push(decl); + match state { + State::ReadVal => { + decl.1 = Some(token.trim().to_string()); + tokens.push(decl); + } + _ => (), + } } + tokens .into_iter() + .filter(|tok_pair| tok_pair.0.is_some() && tok_pair.1.is_some()) .map(|tok_pair| (tok_pair.0.unwrap(), tok_pair.1.unwrap())) .collect() } @@ -2460,12 +2468,24 @@ mod test { css_map.insert("align-items".to_string(), "center".to_string()); css_map.insert("border".to_string(), "2px solid black".to_string()); - let css_str_to_vec = Readability::inline_css_str_to_map(css_str); - assert_eq!(css_map, css_str_to_vec); + let css_str_to_map = Readability::inline_css_str_to_map(css_str); + assert_eq!(css_map, css_str_to_map); let mut css_map = HashMap::new(); css_map.insert("color".to_string(), "red".to_string()); css_map.insert("background-image".to_string(), "url('data:image/jpeg;base64,/wgARCAALABQDASIAAhEBAxEB/8QAFwABAQEBAAAAAAAAAAAAAAAAAgADBP/')".to_string()); assert_eq!(css_map, Readability::inline_css_str_to_map("color: red;background-image: url('data:image/jpeg;base64,/wgARCAALABQDASIAAhEBAxEB/8QAFwABAQEBAAAAAAAAAAAAAAAAAgADBP/')")); + + let empty_map = HashMap::new(); + assert_eq!(empty_map, Readability::inline_css_str_to_map(" \n \t \r")); + assert_eq!(empty_map, Readability::inline_css_str_to_map("color")); + + let mut css_map = HashMap::new(); + css_map.insert("color".to_string(), "red".to_string()); + css_map.insert("height".to_string(), "300px".to_string()); + assert_eq!( + css_map, + Readability::inline_css_str_to_map("color: red;height: 300px;width") + ); } #[test] From 7e9dcfc2b77315fe96c049f097ba2dd075c21c86 Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Sat, 17 Apr 2021 12:04:06 +0300 Subject: [PATCH 02/24] Add custom error types and ignore failed image downloads Using this custom error type, many instances of unwrap are replaced with mapping to errors that are then logged in main.rs. This allows paperoni to stop crashing when downloading articles when the errors are possibly recoverable or should not affect other downloads. This subsequently introduces ignoring the failed image downloads and instead leaving the original URLs intact. 
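The heart of this patch is the set of `From` conversions on the new `PaperoniError` type: once those exist, `?` can replace the old `unwrap()`/`expect()` calls so a failed request surfaces as a recoverable error instead of a panic. A minimal sketch of the pattern, using the names the diff below introduces in src/errors.rs (the `fetch` function itself is illustrative, not part of the patch):

```rust
use thiserror::Error;

#[derive(Error, Debug)]
pub enum ErrorKind {
    #[error("[HTTPError]: {0}")]
    HTTPError(String),
}

#[derive(Error, Debug)]
#[error("{kind}")]
pub struct PaperoniError {
    article_link: Option<String>,
    kind: ErrorKind,
}

impl From<surf::Error> for PaperoniError {
    fn from(err: surf::Error) -> Self {
        PaperoniError {
            article_link: None,
            kind: ErrorKind::HTTPError(err.to_string()),
        }
    }
}

// With the `From` impl in scope, `?` converts a surf::Error into a
// PaperoniError automatically, so the caller can log the failure and
// move on to the next article instead of crashing.
async fn fetch(url: &str) -> Result<String, PaperoniError> {
    let mut res = surf::get(url).await?; // previously: .await.expect(...)
    Ok(res.body_string().await?)
}
```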
--- Cargo.lock | 9 ++-- Cargo.toml | 1 + src/epub.rs | 31 ++++++------ src/errors.rs | 61 +++++++++++++++++++++++ src/extractor.rs | 3 +- src/http.rs | 125 ++++++++++++++++++++++++++++------------------- src/main.rs | 20 ++++++-- 7 files changed, 178 insertions(+), 72 deletions(-) create mode 100644 src/errors.rs diff --git a/Cargo.lock b/Cargo.lock index 8466dbf..2c8d164 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1283,6 +1283,7 @@ dependencies = [ "md5", "regex", "surf", + "thiserror", "url", ] @@ -1960,18 +1961,18 @@ checksum = "8eaa81235c7058867fa8c0e7314f33dcce9c215f535d1913822a2b3f5e289f3c" [[package]] name = "thiserror" -version = "1.0.22" +version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e9ae34b84616eedaaf1e9dd6026dbe00dcafa92aa0c8077cb69df1fcfe5e53e" +checksum = "e0f4a65597094d4483ddaed134f409b2cb7c1beccf25201a9f73c719254fa98e" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.22" +version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ba20f23e85b10754cd195504aebf6a27e2e6cbe28c17778a0c930724628dd56" +checksum = "7765189610d8241a44529806d6fd1f2e0a08734313a35d5b3a556f92b381f3c0" dependencies = [ "proc-macro2", "quote", diff --git a/Cargo.toml b/Cargo.toml index 05660ed..451bf38 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,4 +22,5 @@ lazy_static = "1.4.0" md5 = "0.7.0" regex = "1.4.2" surf = "2.1.0" +thiserror = "1.0.24" url = "2.2.0" diff --git a/src/epub.rs b/src/epub.rs index e6e0376..a7a8cbc 100644 --- a/src/epub.rs +++ b/src/epub.rs @@ -2,12 +2,18 @@ use std::fs::File; use epub_builder::{EpubBuilder, EpubContent, ZipLibrary}; -use crate::extractor::{self, Extractor}; +use crate::{ + errors::PaperoniError, + extractor::{self, Extractor}, +}; -pub fn generate_epubs(articles: Vec, merged: Option<&String>) { +pub fn generate_epubs( + articles: Vec, + merged: Option<&String>, +) -> Result<(), PaperoniError> { match merged { Some(name) => { - let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap(); + let mut epub = EpubBuilder::new(ZipLibrary::new()?)?; epub.inline_toc(); epub = articles .iter() @@ -41,12 +47,12 @@ pub fn generate_epubs(articles: Vec, merged: Option<&String>) { epub }); let mut out_file = File::create(&name).unwrap(); - epub.generate(&mut out_file).unwrap(); + epub.generate(&mut out_file)?; println!("Created {:?}", name); } None => { for article in articles { - let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap(); + let mut epub = EpubBuilder::new(ZipLibrary::new()?)?; let file_name = format!( "{}.epub", article @@ -61,26 +67,23 @@ pub fn generate_epubs(articles: Vec, merged: Option<&String>) { .expect("Unable to serialize to xhtml"); let html_str = std::str::from_utf8(&html_buf).unwrap(); if let Some(author) = article.metadata().byline() { - epub.metadata("author", replace_metadata_value(author)) - .unwrap(); + epub.metadata("author", replace_metadata_value(author))?; } - epub.metadata("title", replace_metadata_value(article.metadata().title())) - .unwrap(); - epub.add_content(EpubContent::new("index.xhtml", html_str.as_bytes())) - .unwrap(); + epub.metadata("title", replace_metadata_value(article.metadata().title()))?; + epub.add_content(EpubContent::new("index.xhtml", html_str.as_bytes()))?; for img in article.img_urls { let mut file_path = std::env::temp_dir(); file_path.push(&img.0); let img_buf = File::open(&file_path).expect("Can't read file"); - epub.add_resource(file_path.file_name().unwrap(), 
img_buf, img.1.unwrap()) - .unwrap(); + epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap())?; } - epub.generate(&mut out_file).unwrap(); + epub.generate(&mut out_file)?; println!("Created {:?}", file_name); } } } + Ok(()) } /// Replaces characters that have to be escaped before adding to the epub's metadata diff --git a/src/errors.rs b/src/errors.rs new file mode 100644 index 0000000..f0b3d9c --- /dev/null +++ b/src/errors.rs @@ -0,0 +1,61 @@ +use thiserror::Error; + +#[derive(Error, Debug)] +pub enum ErrorKind { + #[error("[EpubError]: {0}")] + EpubError(String), + #[error("[HTTPError]: {0}")] + HTTPError(String), + #[error("[IOError]: {0}")] + IOError(String), +} + +#[derive(Error, Debug)] +#[error("{kind}")] +pub struct PaperoniError { + article_link: Option, + kind: ErrorKind, +} + +impl PaperoniError { + pub fn with_kind(kind: ErrorKind) -> Self { + PaperoniError { + article_link: None, + kind, + } + } + + pub fn set_article_link(&mut self, article_link: String) { + self.article_link = Some(article_link); + } +} + +impl From for PaperoniError { + fn from(kind: ErrorKind) -> Self { + PaperoniError::with_kind(kind) + } +} + +impl From for PaperoniError { + fn from(err: epub_builder::Error) -> Self { + PaperoniError::with_kind(ErrorKind::EpubError(err.description().to_owned())) + } +} + +impl From for PaperoniError { + fn from(err: surf::Error) -> Self { + PaperoniError::with_kind(ErrorKind::HTTPError(err.to_string())) + } +} + +impl From for PaperoniError { + fn from(err: url::ParseError) -> Self { + PaperoniError::with_kind(ErrorKind::HTTPError(err.to_string())) + } +} + +impl From for PaperoniError { + fn from(err: std::io::Error) -> Self { + PaperoniError::with_kind(ErrorKind::IOError(err.to_string())) + } +} diff --git a/src/extractor.rs b/src/extractor.rs index 0fcc5e8..64b9a2a 100644 --- a/src/extractor.rs +++ b/src/extractor.rs @@ -2,6 +2,7 @@ use std::collections::HashMap; use kuchiki::{traits::*, NodeRef}; +use crate::errors::PaperoniError; use crate::moz_readability::{MetaData, Readability}; pub type ResourceInfo = (String, Option); @@ -75,7 +76,7 @@ impl Extractor { pub fn serialize_to_xhtml( node_ref: &NodeRef, mut w: &mut W, -) -> Result<(), Box> { +) -> Result<(), PaperoniError> { let mut escape_map = HashMap::new(); escape_map.insert("<", "<"); escape_map.insert(">", ">"); diff --git a/src/http.rs b/src/http.rs index faf9428..9ff7ef8 100644 --- a/src/http.rs +++ b/src/http.rs @@ -3,13 +3,11 @@ use async_std::{fs::File, stream}; use futures::StreamExt; use url::Url; -use crate::extractor::Extractor; +use crate::{errors::ErrorKind, errors::PaperoniError, extractor::Extractor}; type HTMLResource = (String, String); -pub async fn fetch_url( - url: &str, -) -> Result> { +pub async fn fetch_url(url: &str) -> Result { let client = surf::Client::new(); println!("Fetching..."); @@ -37,26 +35,28 @@ pub async fn fetch_url( if mime.essence() == "text/html" { return Ok((url.to_string(), res.body_string().await?)); } else { - return Err(format!( + let msg = format!( "Invalid HTTP response. 
Received {} instead of text/html", mime.essence() - ) - .into()); + ); + + return Err(ErrorKind::HTTPError(msg).into()); } } else { - return Err("Unknown HTTP response".into()); + return Err(ErrorKind::HTTPError("Unknown HTTP response".to_owned()).into()); } } else { - return Err(format!("Request failed: HTTP {}", res.status()).into()); + let msg = format!("Request failed: HTTP {}", res.status()); + return Err(ErrorKind::HTTPError(msg).into()); } } - Err("Unable to fetch HTML".into()) + Err(ErrorKind::HTTPError("Unable to fetch HTML".to_owned()).into()) } pub async fn download_images( extractor: &mut Extractor, article_origin: &Url, -) -> async_std::io::Result<()> { +) -> Result<(), Vec> { if extractor.img_urls.len() > 0 { println!("Downloading images..."); } @@ -71,39 +71,56 @@ pub async fn download_images( ) }) .map(|(url, req)| async move { - let mut img_response = req.await.expect("Unable to retrieve image"); - let img_content: Vec = img_response.body_bytes().await.unwrap(); - let img_mime = img_response - .content_type() - .map(|mime| mime.essence().to_string()); - let img_ext = img_response - .content_type() - .map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string()) - .unwrap(); + match req.await { + Ok(mut img_response) => { + // let mut img_response = req.await.expect("Unable to retrieve image"); + let img_content: Vec = match img_response.body_bytes().await { + Ok(bytes) => bytes, + Err(e) => return Err(e.into()), + }; + let img_mime = img_response + .content_type() + .map(|mime| mime.essence().to_string()); + let img_ext = match img_response + .content_type() + .map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string()) + { + Some(mime_str) => mime_str, + None => { + return Err(ErrorKind::HTTPError( + "Image has no Content-Type".to_owned(), + ) + .into()) + } + }; - let mut img_path = std::env::temp_dir(); - img_path.push(format!("{}.{}", hash_url(&url), &img_ext)); - let mut img_file = File::create(&img_path) - .await - .expect("Unable to create file"); - img_file - .write_all(&img_content) - .await - .expect("Unable to save to file"); + let mut img_path = std::env::temp_dir(); + img_path.push(format!("{}.{}", hash_url(&url), &img_ext)); + let mut img_file = match File::create(&img_path).await { + Ok(file) => file, + Err(e) => return Err(e.into()), + }; + match img_file.write_all(&img_content).await { + Ok(_) => (), + Err(e) => return Err(e.into()), + } - ( - url, - img_path - .file_name() - .map(|os_str_name| { - os_str_name - .to_str() - .expect("Unable to get image file name") - .to_string() - }) - .unwrap(), - img_mime, - ) + Ok(( + url, + img_path + .file_name() + .map(|os_str_name| { + os_str_name + .to_str() + .expect("Unable to get image file name") + .to_string() + }) + .unwrap(), + img_mime, + )) + } + Err(e) => Err(e.into()), + } }); // A utility closure used when update the value of an image source after downloading is successful @@ -124,14 +141,24 @@ pub async fn download_images( (img_path, img_mime) }; - extractor.img_urls = stream::from_iter(imgs_req_iter) + let imgs_req_iter = stream::from_iter(imgs_req_iter) .buffered(10) - .collect::>() - .await - .into_iter() - .map(replace_existing_img_src) - .collect(); - Ok(()) + .collect::>>() + .await; + let mut errors = Vec::new(); + let mut replaced_imgs = Vec::new(); + for img_req_result in imgs_req_iter { + match img_req_result { + Ok(img_req) => replaced_imgs.push(replace_existing_img_src(img_req)), + Err(e) => errors.push(e), + } + } + extractor.img_urls = replaced_imgs; + if errors.is_empty() { + 
Ok(()) + } else { + Err(errors) + } } /// Handles getting the extension from a given MIME subtype. diff --git a/src/main.rs b/src/main.rs index 0467712..7ad2560 100644 --- a/src/main.rs +++ b/src/main.rs @@ -8,6 +8,7 @@ use url::Url; mod cli; mod epub; +mod errors; mod extractor; /// This module is responsible for async HTTP calls for downloading /// the HTML content and images @@ -41,9 +42,17 @@ fn download(app_config: AppConfig) { if extractor.article().is_some() { extractor.extract_img_urls(); - download_images(&mut extractor, &Url::parse(&url).unwrap()) - .await - .expect("Unable to download images"); + + if let Err(img_errors) = + download_images(&mut extractor, &Url::parse(&url).unwrap()).await + { + eprintln!( + "{} image{} failed to download for {}", + img_errors.len(), + if img_errors.len() > 1 { "s" } else { "" }, + url + ); + } articles.push(extractor); } } @@ -52,5 +61,8 @@ fn download(app_config: AppConfig) { } articles }); - generate_epubs(articles, app_config.merged()); + match generate_epubs(articles, app_config.merged()) { + Ok(_) => (), + Err(e) => eprintln!("{}", e), + }; } From 217cd3e44299841afaf62228e5bdd1ca1aadc524 Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Sat, 17 Apr 2021 12:08:24 +0300 Subject: [PATCH 03/24] Minor refactor Change cli to grab version from the Cargo manifest Rename fetch_url to fetch_html --- src/cli.rs | 2 +- src/http.rs | 2 +- src/main.rs | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index 9815e08..a8701bc 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -8,7 +8,7 @@ pub fn cli_init() -> AppConfig { AppSettings::ArgRequiredElseHelp, AppSettings::UnifiedHelpMessage, ]) - .version("0.3.0-alpha1") + .version(clap::crate_version!()) .about( " Paperoni is an article downloader. 
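Note on the cli.rs hunk above: `clap::crate_version!()` reads the version that cargo injects at build time, so the CLI version can no longer drift from the manifest. A rough sketch of what the macro amounts to (binding names are illustrative):

```rust
// crate_version!() expands to the package version declared in Cargo.toml,
// which cargo exposes through the CARGO_PKG_VERSION env var. Bumping the
// manifest is now enough; there is no hard-coded "0.3.0-alpha1" to forget.
let version: &str = clap::crate_version!(); // same as env!("CARGO_PKG_VERSION")
let app = clap::App::new("paperoni").version(version);
```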
diff --git a/src/http.rs b/src/http.rs index 9ff7ef8..9ddf192 100644 --- a/src/http.rs +++ b/src/http.rs @@ -7,7 +7,7 @@ use crate::{errors::ErrorKind, errors::PaperoniError, extractor::Extractor}; type HTMLResource = (String, String); -pub async fn fetch_url(url: &str) -> Result { +pub async fn fetch_html(url: &str) -> Result { let client = surf::Client::new(); println!("Fetching..."); diff --git a/src/main.rs b/src/main.rs index 7ad2560..8936713 100644 --- a/src/main.rs +++ b/src/main.rs @@ -18,7 +18,7 @@ mod moz_readability; use cli::AppConfig; use epub::generate_epubs; use extractor::Extractor; -use http::{download_images, fetch_url}; +use http::{download_images, fetch_html}; fn main() { let app_config = cli::cli_init(); @@ -30,7 +30,7 @@ fn main() { fn download(app_config: AppConfig) { let articles = task::block_on(async { - let urls_iter = app_config.urls().iter().map(|url| fetch_url(url)); + let urls_iter = app_config.urls().iter().map(|url| fetch_html(url)); let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn()); let mut articles = Vec::new(); while let Some(fetch_result) = responses.next().await { From 04a1eed4e2de23ffb4285fe1a9e539d1f998c21e Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Sat, 17 Apr 2021 17:27:38 +0300 Subject: [PATCH 04/24] Add progress indicators for the cli --- Cargo.lock | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 1 + src/epub.rs | 14 +++++++++++++- src/http.rs | 11 ++++++++--- src/main.rs | 15 ++++++++++++--- 5 files changed, 84 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2c8d164..b9edb91 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -403,6 +403,21 @@ dependencies = [ "cache-padded", ] +[[package]] +name = "console" +version = "0.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3993e6445baa160675931ec041a5e03ca84b9c6e32a056150d3aa2bdda0a1f45" +dependencies = [ + "encode_unicode", + "lazy_static", + "libc", + "regex", + "terminal_size", + "unicode-width", + "winapi", +] + [[package]] name = "const_fn" version = "0.4.3" @@ -577,6 +592,12 @@ dependencies = [ "dtoa", ] +[[package]] +name = "encode_unicode" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" + [[package]] name = "encoding_rs" version = "0.8.26" @@ -960,6 +981,18 @@ dependencies = [ "unicode-normalization", ] +[[package]] +name = "indicatif" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7baab56125e25686df467fe470785512329883aab42696d661247aca2a2896e4" +dependencies = [ + "console", + "lazy_static", + "number_prefix", + "regex", +] + [[package]] name = "infer" version = "0.2.3" @@ -1232,6 +1265,12 @@ dependencies = [ "libc", ] +[[package]] +name = "number_prefix" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17b02fc0ff9a9e4b35b3342880f48e896ebf69f2967921fe8646bf5b7125956a" + [[package]] name = "object" version = "0.22.0" @@ -1278,6 +1317,7 @@ dependencies = [ "epub-builder", "futures", "html5ever", + "indicatif", "kuchiki", "lazy_static", "md5", @@ -1944,6 +1984,16 @@ dependencies = [ "utf-8", ] +[[package]] +name = "terminal_size" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86ca8ced750734db02076f44132d802af0b33b09942331f4459dde8636fd2406" +dependencies = [ + "libc", + "winapi", +] + [[package]] name = "textwrap" 
version = "0.11.0" diff --git a/Cargo.toml b/Cargo.toml index 451bf38..d6d2499 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,6 +17,7 @@ clap = "2.33.3" epub-builder = "0.4.8" futures = "0.3.12" html5ever = "0.25.1" +indicatif = "0.15.0" kuchiki = "0.8.1" lazy_static = "1.4.0" md5 = "0.7.0" diff --git a/src/epub.rs b/src/epub.rs index a7a8cbc..73cd7cf 100644 --- a/src/epub.rs +++ b/src/epub.rs @@ -1,6 +1,7 @@ use std::fs::File; use epub_builder::{EpubBuilder, EpubContent, ZipLibrary}; +use indicatif::{ProgressBar, ProgressStyle}; use crate::{ errors::PaperoniError, @@ -11,6 +12,12 @@ pub fn generate_epubs( articles: Vec, merged: Option<&String>, ) -> Result<(), PaperoniError> { + let bar = ProgressBar::new(articles.len() as u64); + let style = ProgressStyle::default_bar().template( + "{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} epub {pos}/{len:7} {msg:.green}", + ); + bar.set_style(style); + bar.set_message("Generating epubs"); match merged { Some(name) => { let mut epub = EpubBuilder::new(ZipLibrary::new()?)?; @@ -33,6 +40,8 @@ pub fn generate_epubs( .unwrap(); article.img_urls.iter().for_each(|img| { + // TODO: Add error handling + bar.inc(1); let mut file_path = std::env::temp_dir(); file_path.push(&img.0); @@ -48,6 +57,7 @@ pub fn generate_epubs( }); let mut out_file = File::create(&name).unwrap(); epub.generate(&mut out_file)?; + bar.finish_with_message("Generated epub\n"); println!("Created {:?}", name); } None => { @@ -79,8 +89,10 @@ pub fn generate_epubs( epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap())?; } epub.generate(&mut out_file)?; - println!("Created {:?}", file_name); + bar.inc(1); + // println!("Created {:?}", file_name); } + bar.finish_with_message("Generated epubs\n"); } } Ok(()) diff --git a/src/http.rs b/src/http.rs index 9ddf192..c945765 100644 --- a/src/http.rs +++ b/src/http.rs @@ -1,6 +1,7 @@ use async_std::io::prelude::*; use async_std::{fs::File, stream}; use futures::StreamExt; +use indicatif::ProgressBar; use url::Url; use crate::{errors::ErrorKind, errors::PaperoniError, extractor::Extractor}; @@ -9,7 +10,7 @@ type HTMLResource = (String, String); pub async fn fetch_html(url: &str) -> Result { let client = surf::Client::new(); - println!("Fetching..."); + // println!("Fetching..."); let mut redirect_count: u8 = 0; let base_url = Url::parse(&url)?; @@ -56,10 +57,12 @@ pub async fn fetch_html(url: &str) -> Result { pub async fn download_images( extractor: &mut Extractor, article_origin: &Url, + bar: &ProgressBar, ) -> Result<(), Vec> { if extractor.img_urls.len() > 0 { - println!("Downloading images..."); + // println!("Downloading images..."); } + let img_count = extractor.img_urls.len(); let imgs_req_iter = extractor .img_urls @@ -70,7 +73,9 @@ pub async fn download_images( surf::Client::new().get(get_absolute_url(&url, article_origin)), ) }) - .map(|(url, req)| async move { + .enumerate() + .map(|(img_idx, (url, req))| async move { + bar.set_message(format!("Downloading images [{}/{}]", img_idx + 1, img_count).as_str()); match req.await { Ok(mut img_response) => { // let mut img_response = req.await.expect("Unable to retrieve image"); diff --git a/src/main.rs b/src/main.rs index 8936713..83884be 100644 --- a/src/main.rs +++ b/src/main.rs @@ -4,6 +4,7 @@ extern crate lazy_static; use async_std::stream; use async_std::task; use futures::stream::StreamExt; +use indicatif::{ProgressBar, ProgressStyle}; use url::Url; mod cli; @@ -29,6 +30,12 @@ fn main() { } fn download(app_config: AppConfig) { + let bar = 
ProgressBar::new(app_config.urls().len() as u64); + let style = ProgressStyle::default_bar().template( + "{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} link {pos}/{len:7} {msg:.yellow/white}", + ); + bar.set_style(style); + bar.enable_steady_tick(500); let articles = task::block_on(async { let urls_iter = app_config.urls().iter().map(|url| fetch_html(url)); let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn()); @@ -36,15 +43,15 @@ fn download(app_config: AppConfig) { while let Some(fetch_result) = responses.next().await { match fetch_result { Ok((url, html)) => { - println!("Extracting"); + // println!("Extracting"); let mut extractor = Extractor::from_html(&html); + bar.set_message("Extracting..."); extractor.extract_content(&url); if extractor.article().is_some() { extractor.extract_img_urls(); - if let Err(img_errors) = - download_images(&mut extractor, &Url::parse(&url).unwrap()).await + download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar).await { eprintln!( "{} image{} failed to download for {}", @@ -58,9 +65,11 @@ fn download(app_config: AppConfig) { } Err(e) => eprintln!("{}", e), } + bar.inc(1); } articles }); + bar.finish_with_message("Downloaded articles"); match generate_epubs(articles, app_config.merged()) { Ok(_) => (), Err(e) => eprintln!("{}", e), From b2174486010a5c82cb3f4c3bfbcb785d1d194301 Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Tue, 20 Apr 2021 14:02:56 +0300 Subject: [PATCH 05/24] Add printing of tables upon successful extraction --- Cargo.lock | 165 +++++++++++++++++++++++++++++++++++-- Cargo.toml | 1 + src/epub.rs | 33 +++++++- src/moz_readability/mod.rs | 3 +- 4 files changed, 193 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b9edb91..8a37731 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -394,6 +394,17 @@ dependencies = [ "vec_map", ] +[[package]] +name = "comfy-table" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17b99e9022e080d384b58d8eaf5976b42a311ff7a9669f8200eb2453c0b2b81a" +dependencies = [ + "crossterm", + "strum", + "strum_macros", +] + [[package]] name = "concurrent-queue" version = "1.2.2" @@ -468,6 +479,31 @@ dependencies = [ "lazy_static", ] +[[package]] +name = "crossterm" +version = "0.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c36c10130df424b2f3552fcc2ddcd9b28a27b1e54b358b45874f88d1ca6888c" +dependencies = [ + "bitflags", + "crossterm_winapi", + "lazy_static", + "libc", + "mio", + "parking_lot", + "signal-hook", + "winapi", +] + +[[package]] +name = "crossterm_winapi" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0da8964ace4d3e4a044fd027919b2237000b24315a37c916f61809f1ff2140b9" +dependencies = [ + "winapi", +] + [[package]] name = "crypto-mac" version = "0.10.0" @@ -872,6 +908,15 @@ dependencies = [ "web-sys", ] +[[package]] +name = "heck" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87cbf45460356b7deeb5e3415b5563308c0a9b057c85e12b06ad551f98d0a6ac" +dependencies = [ + "unicode-segmentation", +] + [[package]] name = "hermit-abi" version = "0.1.17" @@ -1075,9 +1120,9 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.80" +version = "0.2.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d58d1b70b004888f764dfbf6a26a3b0342a1632d33968e4a179d8011c760614" +checksum = 
"9385f66bf6105b241aa65a61cb923ef20efc665cb9f9bb50ac2f0c4b7f378d41" [[package]] name = "libnghttp2-sys" @@ -1204,6 +1249,28 @@ dependencies = [ "autocfg", ] +[[package]] +name = "mio" +version = "0.7.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf80d3e903b34e0bd7282b218398aec54e082c840d9baf8339e0080a0c542956" +dependencies = [ + "libc", + "log 0.4.11", + "miow", + "ntapi", + "winapi", +] + +[[package]] +name = "miow" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9f1c5b025cda876f66ef43a113f91ebc9f4ccef34843000e0adf6ebbab84e21" +dependencies = [ + "winapi", +] + [[package]] name = "mustache" version = "0.9.0" @@ -1236,6 +1303,15 @@ version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb" +[[package]] +name = "ntapi" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f6bb902e437b6d86e03cce10a7e2af662292c5dfef23b65899ea3ac9354ad44" +dependencies = [ + "winapi", +] + [[package]] name = "num-integer" version = "0.1.44" @@ -1314,6 +1390,7 @@ version = "0.3.0-alpha1" dependencies = [ "async-std", "clap", + "comfy-table", "epub-builder", "futures", "html5ever", @@ -1333,6 +1410,31 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "427c3892f9e783d91cc128285287e70a59e206ca452770ece88a76f7a3eddd72" +[[package]] +name = "parking_lot" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d7744ac029df22dca6284efe4e898991d28e3085c706c972bcd7da4a27a15eb" +dependencies = [ + "instant", + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa7a782938e745763fe6907fc6ba86946d72f49fe7e21de074e08128a99fb018" +dependencies = [ + "cfg-if 1.0.0", + "instant", + "libc", + "redox_syscall 0.2.6", + "smallvec", + "winapi", +] + [[package]] name = "percent-encoding" version = "2.1.0" @@ -1596,6 +1698,15 @@ version = "0.1.57" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "41cc0f7e4d5d4544e8861606a285bb08d3e70712ccc7d2b84d7c0ccfaf4b05ce" +[[package]] +name = "redox_syscall" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8270314b5ccceb518e7e578952f0b72b88222d02e8f77f5ecf7abbb673539041" +dependencies = [ + "bitflags", +] + [[package]] name = "regex" version = "1.4.2" @@ -1779,6 +1890,26 @@ dependencies = [ "opaque-debug", ] +[[package]] +name = "signal-hook" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e31d442c16f047a671b5a71e2161d6e68814012b7f5379d269ebd915fac2729" +dependencies = [ + "libc", + "mio", + "signal-hook-registry", +] + +[[package]] +name = "signal-hook-registry" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16f1d0fef1604ba8f7a073c7e701f213e056707210e9020af4528e0101ce11a6" +dependencies = [ + "libc", +] + [[package]] name = "siphasher" version = "0.3.3" @@ -1804,9 +1935,9 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.5.0" +version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7acad6f34eb9e8a259d3283d1e8c1d34d7415943d4895f65cc73813c7396fc85" +checksum = 
"fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e" [[package]] name = "socket2" @@ -1816,7 +1947,7 @@ checksum = "2c29947abdee2a218277abeca306f25789c938e500ea5a9d4b12a5a504466902" dependencies = [ "cfg-if 1.0.0", "libc", - "redox_syscall", + "redox_syscall 0.1.57", "winapi", ] @@ -1924,6 +2055,24 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" +[[package]] +name = "strum" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7318c509b5ba57f18533982607f24070a55d353e90d4cae30c467cdb2ad5ac5c" + +[[package]] +name = "strum_macros" +version = "0.20.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee8bc6b87a5112aeeab1f4a9f7ab634fe6cbefc4850006df31267f4cfb9e3149" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "subtle" version = "2.3.0" @@ -2178,6 +2327,12 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-segmentation" +version = "1.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb0d2e7be6ae3a5fa87eed5fb451aff96f2573d2694942e40543ae0bbe19c796" + [[package]] name = "unicode-width" version = "0.1.8" diff --git a/Cargo.toml b/Cargo.toml index d6d2499..8487fc2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,6 +14,7 @@ readme = "README.md" [dependencies] async-std = "1.7.0" clap = "2.33.3" +comfy-table = "2.1.0" epub-builder = "0.4.8" futures = "0.3.12" html5ever = "0.25.1" diff --git a/src/epub.rs b/src/epub.rs index 73cd7cf..ab6ee9c 100644 --- a/src/epub.rs +++ b/src/epub.rs @@ -1,5 +1,7 @@ use std::fs::File; +use comfy_table::presets::{UTF8_FULL, UTF8_HORIZONTAL_BORDERS_ONLY}; +use comfy_table::{Attribute, Cell, CellAlignment, Color, ContentArrangement, Table}; use epub_builder::{EpubBuilder, EpubContent, ZipLibrary}; use indicatif::{ProgressBar, ProgressStyle}; @@ -18,8 +20,17 @@ pub fn generate_epubs( ); bar.set_style(style); bar.set_message("Generating epubs"); + let mut base_table = Table::new(); + base_table + .load_preset(UTF8_FULL) + .load_preset(UTF8_HORIZONTAL_BORDERS_ONLY) + .set_content_arrangement(ContentArrangement::Dynamic); match merged { Some(name) => { + base_table.set_header(vec![Cell::new("Table of Contents") + .add_attribute(Attribute::Bold) + .set_alignment(CellAlignment::Center) + .fg(Color::Green)]); let mut epub = EpubBuilder::new(ZipLibrary::new()?)?; epub.inline_toc(); epub = articles @@ -41,7 +52,6 @@ pub fn generate_epubs( article.img_urls.iter().for_each(|img| { // TODO: Add error handling - bar.inc(1); let mut file_path = std::env::temp_dir(); file_path.push(&img.0); @@ -53,6 +63,8 @@ pub fn generate_epubs( ) .unwrap(); }); + bar.inc(1); + base_table.add_row(vec![article.metadata().title()]); epub }); let mut out_file = File::create(&name).unwrap(); @@ -61,6 +73,13 @@ pub fn generate_epubs( println!("Created {:?}", name); } None => { + base_table + .set_header(vec![Cell::new("Downloaded articles") + .add_attribute(Attribute::Bold) + .set_alignment(CellAlignment::Center) + .fg(Color::Green)]) + .set_content_arrangement(ContentArrangement::Dynamic); + for article in articles { let mut epub = EpubBuilder::new(ZipLibrary::new()?)?; let file_name = format!( @@ -81,20 +100,28 @@ pub fn generate_epubs( } epub.metadata("title", replace_metadata_value(article.metadata().title()))?; epub.add_content(EpubContent::new("index.xhtml", html_str.as_bytes()))?; - for img in article.img_urls 
{ + for img in &article.img_urls { let mut file_path = std::env::temp_dir(); file_path.push(&img.0); let img_buf = File::open(&file_path).expect("Can't read file"); - epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap())?; + epub.add_resource( + file_path.file_name().unwrap(), + img_buf, + img.1.as_ref().unwrap(), + )?; } epub.generate(&mut out_file)?; bar.inc(1); + + base_table.add_row(vec![article.metadata().title()]); + // println!("Created {:?}", file_name); } bar.finish_with_message("Generated epubs\n"); } } + println!("{}", base_table); Ok(()) } diff --git a/src/moz_readability/mod.rs b/src/moz_readability/mod.rs index 4d28415..fd65620 100644 --- a/src/moz_readability/mod.rs +++ b/src/moz_readability/mod.rs @@ -1585,7 +1585,8 @@ impl Readability { /// Using a variety of metrics (content score, classname, element types), find the content that is most likely to be the stuff /// a user wants to read. Then return it wrapped up in a div. fn grab_article(&mut self) { - println!("Grabbing article"); + // TODO: Add logging for this + // println!("Grabbing article"); // var doc = this._doc; // var isPaging = (page !== null ? true: false); // page = page ? page : this._doc.body; From 60fb30e8a2f4511cc3860cf9b0e208ce6687e2ab Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Tue, 20 Apr 2021 21:06:54 +0300 Subject: [PATCH 06/24] Add url field in Extractor struct --- src/extractor.rs | 12 +++++++----- src/main.rs | 4 ++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/extractor.rs b/src/extractor.rs index 64b9a2a..507ff6a 100644 --- a/src/extractor.rs +++ b/src/extractor.rs @@ -15,22 +15,24 @@ pub struct Extractor { article: Option, pub img_urls: Vec, readability: Readability, + pub url: String, } impl Extractor { /// Create a new instance of an HTML extractor given an HTML string - pub fn from_html(html_str: &str) -> Self { + pub fn from_html(html_str: &str, url: &str) -> Self { Extractor { article: None, img_urls: Vec::new(), readability: Readability::new(html_str), + url: url.to_string(), } } /// Locates and extracts the HTML in a document which is determined to be /// the source of the content - pub fn extract_content(&mut self, url: &str) { - self.readability.parse(url); + pub fn extract_content(&mut self) { + self.readability.parse(&self.url); if let Some(article_node_ref) = &self.readability.article_node { let template = r#" @@ -157,8 +159,8 @@ mod test { #[test] fn test_extract_img_urls() { - let mut extractor = Extractor::from_html(TEST_HTML); - extractor.extract_content("http://example.com/"); + let mut extractor = Extractor::from_html(TEST_HTML, "http://example.com/"); + extractor.extract_content(); extractor.extract_img_urls(); assert!(extractor.img_urls.len() > 0); diff --git a/src/main.rs b/src/main.rs index 83884be..d3d9cc2 100644 --- a/src/main.rs +++ b/src/main.rs @@ -44,9 +44,9 @@ fn download(app_config: AppConfig) { match fetch_result { Ok((url, html)) => { // println!("Extracting"); - let mut extractor = Extractor::from_html(&html); + let mut extractor = Extractor::from_html(&html, &url); bar.set_message("Extracting..."); - extractor.extract_content(&url); + extractor.extract_content(); if extractor.article().is_some() { extractor.extract_img_urls(); From ae1ddb9386b4e2106054b8c252248a4928c25282 Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Tue, 20 Apr 2021 21:09:38 +0300 Subject: [PATCH 07/24] Add printing of table for failed article downloads - Map errors in `fetch_html` to include the source url - Change `article_link` to 
`article_source` - Add `Into` conversion for `UTF8Error` - Collect errors in `generate_epubs` for displaying in a table --- src/epub.rs | 182 ++++++++++++++++++++++++++++++++------------------ src/errors.rs | 24 +++++-- src/http.rs | 77 +++++++++++---------- src/main.rs | 35 +++++++++- 4 files changed, 211 insertions(+), 107 deletions(-) diff --git a/src/epub.rs b/src/epub.rs index ab6ee9c..83173ab 100644 --- a/src/epub.rs +++ b/src/epub.rs @@ -13,64 +13,100 @@ use crate::{ pub fn generate_epubs( articles: Vec, merged: Option<&String>, -) -> Result<(), PaperoniError> { +) -> Result<(), Vec> { let bar = ProgressBar::new(articles.len() as u64); let style = ProgressStyle::default_bar().template( "{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} epub {pos}/{len:7} {msg:.green}", ); bar.set_style(style); bar.set_message("Generating epubs"); + let mut base_table = Table::new(); base_table .load_preset(UTF8_FULL) .load_preset(UTF8_HORIZONTAL_BORDERS_ONLY) .set_content_arrangement(ContentArrangement::Dynamic); + + let mut errors: Vec = Vec::new(); + let mut can_print_table = false; + match merged { Some(name) => { base_table.set_header(vec![Cell::new("Table of Contents") .add_attribute(Attribute::Bold) .set_alignment(CellAlignment::Center) .fg(Color::Green)]); - let mut epub = EpubBuilder::new(ZipLibrary::new()?)?; + + let mut epub = match EpubBuilder::new(match ZipLibrary::new() { + Ok(zip_library) => zip_library, + Err(err) => { + let mut paperoni_err: PaperoniError = err.into(); + paperoni_err.set_article_source(name); + errors.push(paperoni_err); + return Err(errors); + } + }) { + Ok(epub) => epub, + Err(err) => { + let mut paperoni_err: PaperoniError = err.into(); + paperoni_err.set_article_source(name); + errors.push(paperoni_err); + return Err(errors); + } + }; epub.inline_toc(); - epub = articles + articles .iter() .enumerate() - .fold(epub, |mut epub, (idx, article)| { - let mut html_buf = Vec::new(); - extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf) - .expect("Unable to serialize to xhtml"); - let html_str = std::str::from_utf8(&html_buf).unwrap(); - epub.metadata("title", replace_metadata_value(name)) - .unwrap(); - let section_name = article.metadata().title(); - epub.add_content( - EpubContent::new(format!("article_{}.xhtml", idx), html_str.as_bytes()) - .title(replace_metadata_value(section_name)), - ) - .unwrap(); + .fold(&mut epub, |epub, (idx, article)| { + let mut article_result = || -> Result<(), PaperoniError> { + let mut html_buf = Vec::new(); + extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)?; + let html_str = std::str::from_utf8(&html_buf)?; + epub.metadata("title", replace_metadata_value(name))?; + let section_name = article.metadata().title(); + epub.add_content( + EpubContent::new(format!("article_{}.xhtml", idx), html_str.as_bytes()) + .title(replace_metadata_value(section_name)), + )?; - article.img_urls.iter().for_each(|img| { - // TODO: Add error handling - let mut file_path = std::env::temp_dir(); - file_path.push(&img.0); + article.img_urls.iter().for_each(|img| { + // TODO: Add error handling and return errors as a vec + let mut file_path = std::env::temp_dir(); + file_path.push(&img.0); - let img_buf = File::open(&file_path).expect("Can't read file"); - epub.add_resource( - file_path.file_name().unwrap(), - img_buf, - img.1.as_ref().unwrap(), - ) - .unwrap(); - }); + let img_buf = File::open(&file_path).expect("Can't read file"); + epub.add_resource( + file_path.file_name().unwrap(), + img_buf, + 
img.1.as_ref().unwrap(), + ) + .unwrap(); + }); + Ok(()) + }; + if let Err(mut error) = article_result() { + error.set_article_source(&article.url); + errors.push(error); + } bar.inc(1); base_table.add_row(vec![article.metadata().title()]); epub }); let mut out_file = File::create(&name).unwrap(); - epub.generate(&mut out_file)?; + match epub.generate(&mut out_file) { + Ok(_) => (), + Err(err) => { + let mut paperoni_err: PaperoniError = err.into(); + paperoni_err.set_article_source(name); + errors.push(paperoni_err); + return Err(errors); + } + } + bar.finish_with_message("Generated epub\n"); println!("Created {:?}", name); + can_print_table = true; } None => { base_table @@ -81,48 +117,62 @@ pub fn generate_epubs( .set_content_arrangement(ContentArrangement::Dynamic); for article in articles { - let mut epub = EpubBuilder::new(ZipLibrary::new()?)?; - let file_name = format!( - "{}.epub", - article - .metadata() - .title() - .replace("/", " ") - .replace("\\", " ") - ); - let mut out_file = File::create(&file_name).unwrap(); - let mut html_buf = Vec::new(); - extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf) - .expect("Unable to serialize to xhtml"); - let html_str = std::str::from_utf8(&html_buf).unwrap(); - if let Some(author) = article.metadata().byline() { - epub.metadata("author", replace_metadata_value(author))?; + let mut result = || -> Result<(), PaperoniError> { + let mut epub = EpubBuilder::new(ZipLibrary::new()?)?; + let file_name = format!( + "{}.epub", + article + .metadata() + .title() + .replace("/", " ") + .replace("\\", " ") + ); + let mut out_file = File::create(&file_name).unwrap(); + let mut html_buf = Vec::new(); + extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf) + .expect("Unable to serialize to xhtml"); + let html_str = std::str::from_utf8(&html_buf).unwrap(); + if let Some(author) = article.metadata().byline() { + epub.metadata("author", replace_metadata_value(author))?; + } + epub.metadata("title", replace_metadata_value(article.metadata().title()))?; + epub.add_content(EpubContent::new("index.xhtml", html_str.as_bytes()))?; + for img in &article.img_urls { + let mut file_path = std::env::temp_dir(); + file_path.push(&img.0); + + let img_buf = File::open(&file_path).expect("Can't read file"); + epub.add_resource( + file_path.file_name().unwrap(), + img_buf, + img.1.as_ref().unwrap(), + )?; + } + epub.generate(&mut out_file)?; + bar.inc(1); + + base_table.add_row(vec![article.metadata().title()]); + + // println!("Created {:?}", file_name); + can_print_table = true; + Ok(()) + }; + if let Err(mut error) = result() { + error.set_article_source(&article.url); + errors.push(error); } - epub.metadata("title", replace_metadata_value(article.metadata().title()))?; - epub.add_content(EpubContent::new("index.xhtml", html_str.as_bytes()))?; - for img in &article.img_urls { - let mut file_path = std::env::temp_dir(); - file_path.push(&img.0); - - let img_buf = File::open(&file_path).expect("Can't read file"); - epub.add_resource( - file_path.file_name().unwrap(), - img_buf, - img.1.as_ref().unwrap(), - )?; - } - epub.generate(&mut out_file)?; - bar.inc(1); - - base_table.add_row(vec![article.metadata().title()]); - - // println!("Created {:?}", file_name); } bar.finish_with_message("Generated epubs\n"); } } - println!("{}", base_table); - Ok(()) + if can_print_table { + println!("{}", base_table); + } + if errors.is_empty() { + Ok(()) + } else { + Err(errors) + } } /// Replaces characters that have to be escaped before adding to 
the epub's metadata diff --git a/src/errors.rs b/src/errors.rs index f0b3d9c..70a522a 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -8,25 +8,35 @@ pub enum ErrorKind { HTTPError(String), #[error("[IOError]: {0}")] IOError(String), + #[error("[UTF8Error]: {0}")] + UTF8Error(String), } #[derive(Error, Debug)] #[error("{kind}")] pub struct PaperoniError { - article_link: Option, + article_source: Option, kind: ErrorKind, } impl PaperoniError { pub fn with_kind(kind: ErrorKind) -> Self { PaperoniError { - article_link: None, + article_source: None, kind, } } - pub fn set_article_link(&mut self, article_link: String) { - self.article_link = Some(article_link); + pub fn kind(&self) -> &ErrorKind { + &self.kind + } + + pub fn article_source(&self) -> &Option { + &self.article_source + } + + pub fn set_article_source(&mut self, article_source: &str) { + self.article_source = Some(article_source.to_owned()); } } @@ -59,3 +69,9 @@ impl From for PaperoniError { PaperoniError::with_kind(ErrorKind::IOError(err.to_string())) } } + +impl From for PaperoniError { + fn from(err: std::str::Utf8Error) -> Self { + PaperoniError::with_kind(ErrorKind::UTF8Error(err.to_string())) + } +} diff --git a/src/http.rs b/src/http.rs index c945765..6c7f801 100644 --- a/src/http.rs +++ b/src/http.rs @@ -12,46 +12,53 @@ pub async fn fetch_html(url: &str) -> Result { let client = surf::Client::new(); // println!("Fetching..."); - let mut redirect_count: u8 = 0; - let base_url = Url::parse(&url)?; - let mut url = base_url.clone(); - while redirect_count < 5 { - redirect_count += 1; - let req = surf::get(&url); - let mut res = client.send(req).await?; - if res.status().is_redirection() { - if let Some(location) = res.header(surf::http::headers::LOCATION) { - match Url::parse(location.last().as_str()) { - Ok(valid_url) => url = valid_url, - Err(e) => match e { - url::ParseError::RelativeUrlWithoutBase => { - url = base_url.join(location.last().as_str())? - } - e => return Err(e.into()), - }, - }; - } - } else if res.status().is_success() { - if let Some(mime) = res.content_type() { - if mime.essence() == "text/html" { - return Ok((url.to_string(), res.body_string().await?)); - } else { - let msg = format!( - "Invalid HTTP response. Received {} instead of text/html", - mime.essence() - ); + let process_request = async { + let mut redirect_count: u8 = 0; + let base_url = Url::parse(&url)?; + let mut url = base_url.clone(); + while redirect_count < 5 { + redirect_count += 1; + let req = surf::get(&url); + let mut res = client.send(req).await?; + if res.status().is_redirection() { + if let Some(location) = res.header(surf::http::headers::LOCATION) { + match Url::parse(location.last().as_str()) { + Ok(valid_url) => url = valid_url, + Err(e) => match e { + url::ParseError::RelativeUrlWithoutBase => { + url = base_url.join(location.last().as_str())? + } + e => return Err(e.into()), + }, + }; + } + } else if res.status().is_success() { + if let Some(mime) = res.content_type() { + if mime.essence() == "text/html" { + return Ok((url.to_string(), res.body_string().await?)); + } else { + let msg = format!( + "Invalid HTTP response. 
Received {} instead of text/html", + mime.essence() + ); - return Err(ErrorKind::HTTPError(msg).into()); + return Err(ErrorKind::HTTPError(msg).into()); + } + } else { + return Err(ErrorKind::HTTPError("Unknown HTTP response".to_owned()).into()); } } else { - return Err(ErrorKind::HTTPError("Unknown HTTP response".to_owned()).into()); + let msg = format!("Request failed: HTTP {}", res.status()); + return Err(ErrorKind::HTTPError(msg).into()); } - } else { - let msg = format!("Request failed: HTTP {}", res.status()); - return Err(ErrorKind::HTTPError(msg).into()); } - } - Err(ErrorKind::HTTPError("Unable to fetch HTML".to_owned()).into()) + Err(ErrorKind::HTTPError("Unable to fetch HTML".to_owned()).into()) + }; + + process_request.await.map_err(|mut error: PaperoniError| { + error.set_article_source(url); + error + }) } pub async fn download_images( diff --git a/src/main.rs b/src/main.rs index d3d9cc2..7ac578a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -3,6 +3,8 @@ extern crate lazy_static; use async_std::stream; use async_std::task; +use comfy_table::presets::UTF8_HORIZONTAL_BORDERS_ONLY; +use comfy_table::{Attribute, Cell, CellAlignment, ContentArrangement, Table}; use futures::stream::StreamExt; use indicatif::{ProgressBar, ProgressStyle}; use url::Url; @@ -31,6 +33,7 @@ fn main() { fn download(app_config: AppConfig) { let bar = ProgressBar::new(app_config.urls().len() as u64); + let mut errors = Vec::new(); let style = ProgressStyle::default_bar().template( "{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} link {pos}/{len:7} {msg:.yellow/white}", ); @@ -63,7 +66,7 @@ fn download(app_config: AppConfig) { articles.push(extractor); } } - Err(e) => eprintln!("{}", e), + Err(e) => errors.push(e), } bar.inc(1); } @@ -72,6 +75,34 @@ fn download(app_config: AppConfig) { bar.finish_with_message("Downloaded articles"); match generate_epubs(articles, app_config.merged()) { Ok(_) => (), - Err(e) => eprintln!("{}", e), + Err(gen_epub_errors) => { + errors.extend(gen_epub_errors); + } }; + if !errors.is_empty() { + println!( + "{}Failed article downloads{}", + Attribute::Bold, + Attribute::NormalIntensity + ); + let mut table_failed = Table::new(); + table_failed + .load_preset(UTF8_HORIZONTAL_BORDERS_ONLY) + .set_header(vec![ + Cell::new("Link").set_alignment(CellAlignment::Center), + Cell::new("Reason").set_alignment(CellAlignment::Center), + ]) + .set_content_arrangement(ContentArrangement::Dynamic); + + for error in errors { + table_failed.add_row(vec![ + error + .article_source() + .clone() + .unwrap_or_else(|| "".to_string()), + format!("{}", error.kind()), + ]); + } + println!("{}", table_failed); + } } From dbac7c3b69379a31e321b8d622aab3f32366c979 Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Wed, 21 Apr 2021 19:07:08 +0300 Subject: [PATCH 08/24] Refactor `grab_article` to return a Result - Add ReadabilityError field - Refactor `article` getter in Extractor to return a &NodeRef. This relies on the assumption that the article has already been parsed and should otherwise panic. 
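The contract after this patch, as a short usage sketch (assuming `html` and `url` were already fetched; the types match the diff below):

```rust
let mut extractor = Extractor::from_html(&html, &url);
match extractor.extract_content() {
    Ok(()) => {
        // Safe: parsing succeeded, so the article node exists.
        // article() now returns a &NodeRef and panics if called earlier.
        let article_node = extractor.article();
        // ... serialize article_node to XHTML, download images, etc.
    }
    Err(mut e) => {
        // Attach the source URL so the failure can be reported per article.
        e.set_article_source(&url);
    }
}
```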
--- src/epub.rs | 4 ++-- src/errors.rs | 2 ++ src/extractor.rs | 16 +++++++++++----- src/http.rs | 2 -- src/main.rs | 33 +++++++++++++++++++-------------- src/moz_readability/mod.rs | 20 ++++++++++++-------- 6 files changed, 46 insertions(+), 31 deletions(-) diff --git a/src/epub.rs b/src/epub.rs index 83173ab..ac1a934 100644 --- a/src/epub.rs +++ b/src/epub.rs @@ -61,7 +61,7 @@ pub fn generate_epubs( .fold(&mut epub, |epub, (idx, article)| { let mut article_result = || -> Result<(), PaperoniError> { let mut html_buf = Vec::new(); - extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)?; + extractor::serialize_to_xhtml(article.article(), &mut html_buf)?; let html_str = std::str::from_utf8(&html_buf)?; epub.metadata("title", replace_metadata_value(name))?; let section_name = article.metadata().title(); @@ -129,7 +129,7 @@ pub fn generate_epubs( ); let mut out_file = File::create(&file_name).unwrap(); let mut html_buf = Vec::new(); - extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf) + extractor::serialize_to_xhtml(article.article(), &mut html_buf) .expect("Unable to serialize to xhtml"); let html_str = std::str::from_utf8(&html_buf).unwrap(); if let Some(author) = article.metadata().byline() { diff --git a/src/errors.rs b/src/errors.rs index 70a522a..c37ff8e 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -10,6 +10,8 @@ pub enum ErrorKind { IOError(String), #[error("[UTF8Error]: {0}")] UTF8Error(String), + #[error("[ReadabilityError]: {0}")] + ReadabilityError(String), } #[derive(Error, Debug)] diff --git a/src/extractor.rs b/src/extractor.rs index 507ff6a..1f4140f 100644 --- a/src/extractor.rs +++ b/src/extractor.rs @@ -31,8 +31,8 @@ impl Extractor { /// Locates and extracts the HTML in a document which is determined to be /// the source of the content - pub fn extract_content(&mut self) { - self.readability.parse(&self.url); + pub fn extract_content(&mut self) -> Result<(), PaperoniError> { + self.readability.parse(&self.url)?; if let Some(article_node_ref) = &self.readability.article_node { let template = r#" @@ -47,6 +47,7 @@ impl Extractor { body.as_node().append(article_node_ref.clone()); self.article = Some(doc); } + Ok(()) } /// Traverses the DOM tree of the content and retrieves the IMG URLs @@ -64,8 +65,11 @@ impl Extractor { } } - pub fn article(&self) -> Option<&NodeRef> { - self.article.as_ref() + /// Returns the extracted article [NodeRef]. It should only be called *AFTER* calling parse + pub fn article(&self) -> &NodeRef { + self.article.as_ref().expect( + "Article node doesn't exist. 
This may be because the document has not been parsed", + ) } pub fn metadata(&self) -> &MetaData { @@ -160,7 +164,9 @@ mod test { #[test] fn test_extract_img_urls() { let mut extractor = Extractor::from_html(TEST_HTML, "http://example.com/"); - extractor.extract_content(); + extractor + .extract_content() + .expect("Article extraction failed unexpectedly"); extractor.extract_img_urls(); assert!(extractor.img_urls.len() > 0); diff --git a/src/http.rs b/src/http.rs index 6c7f801..bd87213 100644 --- a/src/http.rs +++ b/src/http.rs @@ -141,8 +141,6 @@ pub async fn download_images( let (img_url, img_path, img_mime) = img_item; let img_ref = extractor .article() - .as_mut() - .expect("Unable to get mutable ref") .select_first(&format!("img[src='{}']", img_url)) .expect("Image node does not exist"); let mut img_node = img_ref.attributes.borrow_mut(); diff --git a/src/main.rs b/src/main.rs index 7ac578a..3b1ad47 100644 --- a/src/main.rs +++ b/src/main.rs @@ -49,21 +49,26 @@ fn download(app_config: AppConfig) { // println!("Extracting"); let mut extractor = Extractor::from_html(&html, &url); bar.set_message("Extracting..."); - extractor.extract_content(); - - if extractor.article().is_some() { - extractor.extract_img_urls(); - if let Err(img_errors) = - download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar).await - { - eprintln!( - "{} image{} failed to download for {}", - img_errors.len(), - if img_errors.len() > 1 { "s" } else { "" }, - url - ); + match extractor.extract_content() { + Ok(_) => { + extractor.extract_img_urls(); + if let Err(img_errors) = + download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar) + .await + { + eprintln!( + "{} image{} failed to download for {}", + img_errors.len(), + if img_errors.len() > 1 { "s" } else { "" }, + url + ); + } + articles.push(extractor); + } + Err(mut e) => { + e.set_article_source(&url); + errors.push(e); } - articles.push(extractor); } } Err(e) => errors.push(e), diff --git a/src/moz_readability/mod.rs b/src/moz_readability/mod.rs index fd65620..dc8df9f 100644 --- a/src/moz_readability/mod.rs +++ b/src/moz_readability/mod.rs @@ -9,6 +9,8 @@ use kuchiki::{ }; use url::Url; +use crate::errors::{ErrorKind, PaperoniError}; + const DEFAULT_CHAR_THRESHOLD: usize = 500; const FLAG_STRIP_UNLIKELYS: u32 = 0x1; const FLAG_WEIGHT_CLASSES: u32 = 0x2; @@ -76,14 +78,15 @@ impl Readability { metadata: MetaData::new(), } } - pub fn parse(&mut self, url: &str) { + pub fn parse(&mut self, url: &str) -> Result<(), PaperoniError> { self.unwrap_no_script_tags(); self.remove_scripts(); self.prep_document(); self.metadata = self.get_article_metadata(); self.article_title = self.metadata.title.clone(); - self.grab_article(); + self.grab_article()?; self.post_process_content(url); + Ok(()) } /// Recursively check if node is image, or if node contains exactly only one image @@ -1584,7 +1587,7 @@ impl Readability { /// Using a variety of metrics (content score, classname, element types), find the content that is most likely to be the stuff /// a user wants to read. Then return it wrapped up in a div. 
- fn grab_article(&mut self) { + fn grab_article(&mut self) -> Result<(), PaperoniError> { // TODO: Add logging for this // println!("Grabbing article"); // var doc = this._doc; @@ -1593,8 +1596,7 @@ impl Readability { let page = self.root_node.select_first("body"); if page.is_err() { // TODO:Have error logging for this - println!("Document has no "); - return; + return Err(ErrorKind::ReadabilityError("Document has no ".into()).into()); } let page = page.unwrap(); let mut attempts: Vec = Vec::new(); @@ -2084,8 +2086,10 @@ impl Readability { attempts.push(ExtractAttempt::new(article_content.clone(), text_length)); attempts.sort_by(|a, b| b.length.partial_cmp(&a.length).unwrap()); if attempts.first().as_ref().unwrap().length == 0 { - println!("Unable to extract content"); - break; + return Err(ErrorKind::ReadabilityError( + "Unable to extract content".into(), + ) + .into()); } article_content = attempts[0].article.clone(); parse_successful = true; @@ -2111,7 +2115,7 @@ impl Readability { false }); self.article_node = Some(article_content); - return; + return Ok(()); } } } From 960f114dc6412ee2006fe194a24f61cd1d873905 Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Wed, 21 Apr 2021 19:14:25 +0300 Subject: [PATCH 09/24] Minor fixes in moz_readability - swap unwrap for if let statement in `get_article_metadata` - add default when extracting the title from a possible `` element - fix extracting alternative titles from h1 tags --- src/moz_readability/mod.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/moz_readability/mod.rs b/src/moz_readability/mod.rs index dc8df9f..c3ab1d2 100644 --- a/src/moz_readability/mod.rs +++ b/src/moz_readability/mod.rs @@ -429,8 +429,7 @@ impl Readability { let mut matches = None; if let Some(property) = node_attr.get("property") { matches = regexes::PROPERTY_REGEX.captures(property); - if matches.is_some() { - let captures = matches.as_ref().unwrap(); + if let Some(captures) = &matches { for capture in captures.iter() { let mut name = capture.unwrap().as_str().to_lowercase(); name = regexes::REPLACE_WHITESPACE_REGEX @@ -564,7 +563,7 @@ impl Readability { .root_node .select_first("title") .map(|title| title.text_contents().trim().to_string()) - .expect("This file has no <title> tag to extract a title from"); + .unwrap_or("".to_string()); let orig_title = cur_title.clone(); let mut title_had_hierarchical_separators = false; let word_count = |s: &str| -> usize { s.split_whitespace().count() }; @@ -598,8 +597,8 @@ impl Readability { } } else if cur_title.len() > 150 || cur_title.len() < 15 { let mut h1_nodes = self.root_node.select("h1").unwrap(); - let (_, h1_count) = h1_nodes.size_hint(); - if Some(1) == h1_count { + let h1_count = self.root_node.select("h1").unwrap().count(); + if h1_count == 1 { cur_title = Self::get_inner_text(h1_nodes.next().unwrap().as_node(), None); } } From 313041a1090c9496f31a997eed04fb381ab5f8de Mon Sep 17 00:00:00 2001 From: Kenneth Gitere <gitere81@gmail.com> Date: Thu, 22 Apr 2021 18:01:23 +0300 Subject: [PATCH 10/24] Update dependencies and restore redirect middleware in `download_images` --- Cargo.lock | 111 +++++++++++++++++++++++++++++----------------------- Cargo.toml | 10 ++--- src/http.rs | 4 +- 3 files changed, 69 insertions(+), 56 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8a37731..be621b1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -126,12 +126,15 @@ dependencies = [ [[package]] name = "async-global-executor" -version = "1.4.3" +version = "2.0.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "73079b49cd26b8fd5a15f68fc7707fc78698dc2a3d61430f2a7a9430230dfa04" +checksum = "9586ec52317f36de58453159d48351bc244bc24ced3effc1fce22f3d48664af6" dependencies = [ + "async-channel", "async-executor", "async-io", + "async-mutex", + "blocking", "futures-lite", "num_cpus", "once_cell", @@ -157,6 +160,15 @@ dependencies = [ "winapi", ] +[[package]] +name = "async-lock" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6a8ea61bf9947a1007c5cada31e647dbc77b103c679858150003ba697ea798b" +dependencies = [ + "event-listener", +] + [[package]] name = "async-mutex" version = "1.4.0" @@ -168,14 +180,14 @@ dependencies = [ [[package]] name = "async-std" -version = "1.7.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7e82538bc65a25dbdff70e4c5439d52f068048ab97cdea0acd73f131594caa1" +checksum = "d9f06685bad74e0570f5213741bea82158279a4103d988e57bfada11ad230341" dependencies = [ + "async-channel", "async-global-executor", "async-io", - "async-mutex", - "blocking", + "async-lock", "crossbeam-utils", "futures-channel", "futures-core", @@ -187,7 +199,7 @@ dependencies = [ "memchr", "num_cpus", "once_cell", - "pin-project-lite 0.1.11", + "pin-project-lite 0.2.4", "pin-utils", "slab", "wasm-bindgen-futures", @@ -581,6 +593,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "dashmap" +version = "4.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e77a43b28d0668df09411cb0bc9a8c2adc40f9a048afe863e05fd43251e8e39c" +dependencies = [ + "cfg-if 1.0.0", + "num_cpus", +] + [[package]] name = "data-encoding" version = "2.3.1" @@ -742,9 +764,9 @@ dependencies = [ [[package]] name = "futures" -version = "0.3.12" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da9052a1a50244d8d5aa9bf55cbc2fb6f357c86cc52e46c62ed390a7180cf150" +checksum = "a9d5813545e459ad3ca1bff9915e9ad7f1a47dc6a91b627ce321d5863b7dd253" dependencies = [ "futures-channel", "futures-core", @@ -757,9 +779,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.12" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2d31b7ec7efab6eefc7c57233bb10b847986139d88cc2f5a02a1ae6871a1846" +checksum = "ce79c6a52a299137a6013061e0cf0e688fce5d7f1bc60125f520912fdb29ec25" dependencies = [ "futures-core", "futures-sink", @@ -767,15 +789,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.12" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79e5145dde8da7d1b3892dad07a9c98fc04bc39892b1ecc9692cf53e2b780a65" +checksum = "098cd1c6dda6ca01650f1a37a794245eb73181d0d4d4e955e2f3c37db7af1815" [[package]] name = "futures-executor" -version = "0.3.12" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9e59fdc009a4b3096bf94f740a0f2424c082521f20a9b08c5c07c48d90fd9b9" +checksum = "10f6cb7042eda00f0049b1d2080aa4b93442997ee507eb3828e8bd7577f94c9d" dependencies = [ "futures-core", "futures-task", @@ -784,9 +806,9 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.12" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28be053525281ad8259d47e4de5de657b25e7bac113458555bb4b70bc6870500" +checksum = "365a1a1fb30ea1c03a830fdb2158f5236833ac81fa0ad12fe35b29cddc35cb04" [[package]] name = "futures-lite" @@ -805,9 
+827,9 @@ dependencies = [ [[package]] name = "futures-macro" -version = "0.3.12" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c287d25add322d9f9abdcdc5927ca398917996600182178774032e9f8258fedd" +checksum = "668c6733a182cd7deb4f1de7ba3bf2120823835b3bcfbeacf7d2c4a773c1bb8b" dependencies = [ "proc-macro-hack", "proc-macro2", @@ -817,24 +839,21 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.12" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "caf5c69029bda2e743fddd0582d1083951d65cc9539aebf8812f36c3491342d6" +checksum = "5c5629433c555de3d82861a7a4e3794a4c40040390907cfbfd7143a92a426c23" [[package]] name = "futures-task" -version = "0.3.12" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13de07eb8ea81ae445aca7b69f5f7bf15d7bf4912d8ca37d6645c77ae8a58d86" -dependencies = [ - "once_cell", -] +checksum = "ba7aa51095076f3ba6d9a1f702f74bd05ec65f555d70d2033d55ba8d69f581bc" [[package]] name = "futures-util" -version = "0.3.12" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "632a8cd0f2a4b3fdea1657f08bde063848c3bd00f9bbf6e256b8be78802e624b" +checksum = "3c144ad54d60f23927f0a6b6d816e4271278b64f005ad65e4e35291d2de9c025" dependencies = [ "futures-channel", "futures-core", @@ -982,12 +1001,14 @@ dependencies = [ [[package]] name = "http-client" -version = "6.2.0" +version = "6.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "010092b71b94ee49293995625ce7a607778b8b4099c8088fa84fd66bd3e0f21c" +checksum = "5566ecc26bc6b04e773e680d66141fced78e091ad818e420d726c152b05a64ff" dependencies = [ "async-std", "async-trait", + "cfg-if 1.0.0", + "dashmap", "http-types", "isahc", "log 0.4.11", @@ -1709,21 +1730,20 @@ dependencies = [ [[package]] name = "regex" -version = "1.4.2" +version = "1.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38cf2c13ed4745de91a5eb834e11c00bcc3709e773173b2ce4c56c9fbde04b9c" +checksum = "957056ecddbeba1b26965114e191d2e8589ce74db242b6ea25fc4062427a5c19" dependencies = [ "aho-corasick", "memchr", "regex-syntax", - "thread_local", ] [[package]] name = "regex-syntax" -version = "0.6.21" +version = "0.6.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b181ba2dcf07aaccad5448e8ead58db5b742cf85dfe035e2227f137a539a189" +checksum = "24d5f089152e60f62d28b835fbff2cd2e8dc0baf1ac13343bef92ab7eed84548" [[package]] name = "remove_dir_all" @@ -2081,13 +2101,13 @@ checksum = "343f3f510c2915908f155e94f17220b19ccfacf2a64a2a5d8004f2c3e311e7fd" [[package]] name = "surf" -version = "2.1.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7189c787d96fe18fef704950de76d590022d9d70858a4a201e1f07a0666882ea" +checksum = "2a154d33ca6b5e1fe6fd1c760e5a5cc1202425f6cca2e13229f16a69009f6328" dependencies = [ "async-std", "async-trait", - "cfg-if 0.1.10", + "cfg-if 1.0.0", "encoding_rs", "futures-util", "http-client", @@ -2095,7 +2115,7 @@ dependencies = [ "log 0.4.11", "mime_guess", "once_cell", - "pin-project-lite 0.1.11", + "pin-project-lite 0.2.4", "serde", "serde_json", "web-sys", @@ -2178,15 +2198,6 @@ dependencies = [ "syn", ] -[[package]] -name = "thread_local" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14" -dependencies = [ - "lazy_static", 
-] - [[package]] name = "time" version = "0.1.44" @@ -2357,9 +2368,9 @@ dependencies = [ [[package]] name = "url" -version = "2.2.0" +version = "2.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5909f2b0817350449ed73e8bcd81c8c3c8d9a7a5d8acba4b27db277f1868976e" +checksum = "9ccd964113622c8e9322cfac19eb1004a07e636c545f325da085d5cdde6f1f8b" dependencies = [ "form_urlencoded", "idna", diff --git a/Cargo.toml b/Cargo.toml index 8487fc2..b8fce77 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,17 +12,17 @@ readme = "README.md" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -async-std = "1.7.0" +async-std = "1.9.0" clap = "2.33.3" comfy-table = "2.1.0" epub-builder = "0.4.8" -futures = "0.3.12" +futures = "0.3.14" html5ever = "0.25.1" indicatif = "0.15.0" kuchiki = "0.8.1" lazy_static = "1.4.0" md5 = "0.7.0" -regex = "1.4.2" -surf = "2.1.0" +regex = "1.4.5" +surf = "2.2.0" thiserror = "1.0.24" -url = "2.2.0" +url = "2.2.1" diff --git a/src/http.rs b/src/http.rs index bd87213..3dc1e42 100644 --- a/src/http.rs +++ b/src/http.rs @@ -77,7 +77,9 @@ pub async fn download_images( .map(|(url, _)| { ( url, - surf::Client::new().get(get_absolute_url(&url, article_origin)), + surf::Client::new() + .with(surf::middleware::Redirect::default()) + .get(get_absolute_url(&url, article_origin)), ) }) .enumerate() From b496abb576ab7dda0c559335c2a0344e24fcc085 Mon Sep 17 00:00:00 2001 From: Kenneth Gitere <gitere81@gmail.com> Date: Thu, 22 Apr 2021 19:00:32 +0300 Subject: [PATCH 11/24] Fix serialization issue with poorly defined attribute names --- src/extractor.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/extractor.rs b/src/extractor.rs index 1f4140f..2cf3f25 100644 --- a/src/extractor.rs +++ b/src/extractor.rs @@ -103,6 +103,7 @@ pub fn serialize_to_xhtml<W: std::io::Write>( let attrs_str = attrs .map .iter() + .filter(|(k, _)| &k.local != "\"") .map(|(k, v)| { format!( "{}=\"{}\"", From c0323a6ae42c1567b36dc8dd0250f8ea23f9b7f4 Mon Sep 17 00:00:00 2001 From: Kenneth Gitere <gitere81@gmail.com> Date: Sat, 24 Apr 2021 09:00:18 +0300 Subject: [PATCH 12/24] Minor refactor and add non zero exit upon failure to download any article - Move printing of the successfully downloaded articles into main.rs - Add summary text --- src/epub.rs | 25 +++++++------------------ src/main.rs | 29 +++++++++++++++++++++++++++-- 2 files changed, 34 insertions(+), 20 deletions(-) diff --git a/src/epub.rs b/src/epub.rs index ac1a934..36d766f 100644 --- a/src/epub.rs +++ b/src/epub.rs @@ -1,6 +1,5 @@ use std::fs::File; -use comfy_table::presets::{UTF8_FULL, UTF8_HORIZONTAL_BORDERS_ONLY}; use comfy_table::{Attribute, Cell, CellAlignment, Color, ContentArrangement, Table}; use epub_builder::{EpubBuilder, EpubContent, ZipLibrary}; use indicatif::{ProgressBar, ProgressStyle}; @@ -13,6 +12,7 @@ use crate::{ pub fn generate_epubs( articles: Vec<Extractor>, merged: Option<&String>, + successful_articles_table: &mut Table, ) -> Result<(), Vec<PaperoniError>> { let bar = ProgressBar::new(articles.len() as u64); let style = ProgressStyle::default_bar().template( @@ -21,18 +21,11 @@ pub fn generate_epubs( bar.set_style(style); bar.set_message("Generating epubs"); - let mut base_table = Table::new(); - base_table - .load_preset(UTF8_FULL) - .load_preset(UTF8_HORIZONTAL_BORDERS_ONLY) - .set_content_arrangement(ContentArrangement::Dynamic); - let mut errors: Vec<PaperoniError> = Vec::new(); - let mut can_print_table = false; match merged { 
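// An aside on patch 10 above: its one functional change routes image requests
// through surf's redirect middleware instead of a bare client. In isolation
// (assuming surf 2.2's API exactly as the hunk uses it) the construction is:
//
//     let client = surf::Client::new().with(surf::middleware::Redirect::default());
//     let mut res = client.get(img_url).await?;
//     let bytes = res.body_bytes().await?;
//
// Without the middleware, a 301/302 from an image host surfaces as a
// non-success response and the image download fails outright;
// Redirect::default() follows a bounded number of hops before giving up.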
Some(name) => { - base_table.set_header(vec![Cell::new("Table of Contents") + successful_articles_table.set_header(vec![Cell::new("Table of Contents") .add_attribute(Attribute::Bold) .set_alignment(CellAlignment::Center) .fg(Color::Green)]); @@ -90,7 +83,7 @@ pub fn generate_epubs( errors.push(error); } bar.inc(1); - base_table.add_row(vec![article.metadata().title()]); + successful_articles_table.add_row(vec![article.metadata().title()]); epub }); let mut out_file = File::create(&name).unwrap(); @@ -106,17 +99,16 @@ pub fn generate_epubs( bar.finish_with_message("Generated epub\n"); println!("Created {:?}", name); - can_print_table = true; } None => { - base_table + successful_articles_table .set_header(vec![Cell::new("Downloaded articles") .add_attribute(Attribute::Bold) .set_alignment(CellAlignment::Center) .fg(Color::Green)]) .set_content_arrangement(ContentArrangement::Dynamic); - for article in articles { + for article in &articles { let mut result = || -> Result<(), PaperoniError> { let mut epub = EpubBuilder::new(ZipLibrary::new()?)?; let file_name = format!( @@ -151,10 +143,9 @@ pub fn generate_epubs( epub.generate(&mut out_file)?; bar.inc(1); - base_table.add_row(vec![article.metadata().title()]); + successful_articles_table.add_row(vec![article.metadata().title()]); // println!("Created {:?}", file_name); - can_print_table = true; Ok(()) }; if let Err(mut error) = result() { @@ -165,9 +156,7 @@ pub fn generate_epubs( bar.finish_with_message("Generated epubs\n"); } } - if can_print_table { - println!("{}", base_table); - } + if errors.is_empty() { Ok(()) } else { diff --git a/src/main.rs b/src/main.rs index 3b1ad47..98fa3a6 100644 --- a/src/main.rs +++ b/src/main.rs @@ -3,7 +3,7 @@ extern crate lazy_static; use async_std::stream; use async_std::task; -use comfy_table::presets::UTF8_HORIZONTAL_BORDERS_ONLY; +use comfy_table::presets::{UTF8_FULL, UTF8_HORIZONTAL_BORDERS_ONLY}; use comfy_table::{Attribute, Cell, CellAlignment, ContentArrangement, Table}; use futures::stream::StreamExt; use indicatif::{ProgressBar, ProgressStyle}; @@ -78,12 +78,36 @@ fn download(app_config: AppConfig) { articles }); bar.finish_with_message("Downloaded articles"); - match generate_epubs(articles, app_config.merged()) { + let mut succesful_articles_table = Table::new(); + succesful_articles_table + .load_preset(UTF8_FULL) + .load_preset(UTF8_HORIZONTAL_BORDERS_ONLY) + .set_content_arrangement(ContentArrangement::Dynamic); + match generate_epubs(articles, app_config.merged(), &mut succesful_articles_table) { Ok(_) => (), Err(gen_epub_errors) => { errors.extend(gen_epub_errors); } }; + let successfully_downloaded_count = app_config.urls().len() - errors.len(); + + println!( + "{} articles downloaded successfully. 
{}", + if successfully_downloaded_count == app_config.urls().len() { + "All".to_string() + } else { + successfully_downloaded_count.to_string() + }, + if errors.len() > 0 { + errors.len().to_string() + " failed" + } else { + "".to_string() + } + ); + + if successfully_downloaded_count > 0 { + println!("{}", succesful_articles_table); + } if !errors.is_empty() { println!( "{}Failed article downloads{}", @@ -109,5 +133,6 @@ fn download(app_config: AppConfig) { ]); } println!("{}", table_failed); + std::process::exit(1); } } From 910c45abf7364ae9130411596e64182bab36139d Mon Sep 17 00:00:00 2001 From: Kenneth Gitere <gitere81@gmail.com> Date: Sat, 24 Apr 2021 13:54:47 +0300 Subject: [PATCH 13/24] Add logging configured to send to a file by default --- .gitignore | 3 +- Cargo.lock | 85 ++++++++++++++++++++++++++++++-------- Cargo.toml | 3 ++ src/epub.rs | 13 ++++-- src/http.rs | 23 +++++++++-- src/main.rs | 13 +++++- src/moz_readability/mod.rs | 6 +-- 7 files changed, 116 insertions(+), 30 deletions(-) diff --git a/.gitignore b/.gitignore index 3ae8faf..8e42494 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ /target -*.epub \ No newline at end of file +*.epub +*.log \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index be621b1..c047a45 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -150,7 +150,7 @@ dependencies = [ "fastrand", "futures-lite", "libc", - "log 0.4.11", + "log 0.4.14", "nb-connect", "once_cell", "parking", @@ -195,7 +195,7 @@ dependencies = [ "futures-lite", "gloo-timers", "kv-log-macro", - "log 0.4.11", + "log 0.4.14", "memchr", "num_cpus", "once_cell", @@ -553,6 +553,16 @@ dependencies = [ "syn", ] +[[package]] +name = "ctor" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fbaabec2c953050352311293be5c6aba8e141ba19d6811862b232d6fd020484" +dependencies = [ + "quote", + "syn", +] + [[package]] name = "ctr" version = "0.6.0" @@ -719,6 +729,22 @@ dependencies = [ "miniz_oxide 0.3.7", ] +[[package]] +name = "flexi_logger" +version = "0.17.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ab94b6ac8eb69f1496a6993f26f785b5fd6d99b7416023eb2a6175c0b242b1" +dependencies = [ + "atty", + "chrono", + "glob", + "lazy_static", + "log 0.4.14", + "regex", + "thiserror", + "yansi", +] + [[package]] name = "flume" version = "0.9.2" @@ -914,6 +940,12 @@ version = "0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f6503fe142514ca4799d4c26297c4248239fe8838d827db6bd6065c6ed29a6ce" +[[package]] +name = "glob" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" + [[package]] name = "gloo-timers" version = "0.2.1" @@ -980,7 +1012,7 @@ version = "0.25.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "aafcf38a1a36118242d29b92e1b08ef84e67e4a5ed06e0a80be20e6a32bfed6b" dependencies = [ - "log 0.4.11", + "log 0.4.14", "mac", "markup5ever", "proc-macro2", @@ -1011,7 +1043,7 @@ dependencies = [ "dashmap", "http-types", "isahc", - "log 0.4.11", + "log 0.4.14", ] [[package]] @@ -1087,7 +1119,7 @@ dependencies = [ "flume", "futures-lite", "http", - "log 0.4.11", + "log 0.4.14", "once_cell", "slab", "sluice", @@ -1130,7 +1162,7 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0de8b303297635ad57c9f5059fd9cee7a47f8e8daa09df0fcd07dd39fb22977f" dependencies = [ - "log 0.4.11", + "log 0.4.14", ] 
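Patch 13's logger setup, wired into main.rs further down, reduces to a single
flexi_logger builder chain. As a standalone sketch, assuming flexi_logger
0.17's API exactly as the hunk uses it:

use log::debug;

fn main() {
    // "paperoni=debug" caps paperoni's log output at debug level (error, warn,
    // info and debug records all pass; other crates stay silent);
    // log_to_file() redirects records from stderr to a timestamped log file.
    match flexi_logger::Logger::with_str("paperoni=debug")
        .log_to_file()
        .start()
    {
        Ok(_) => (),
        Err(e) => eprintln!("Unable to start logger!\n{}", e),
    }
    debug!("logger initialized");
}

By default the file lands in the working directory; a later patch in this
series points it at a logs directory under .paperoni in the home directory.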
[[package]] @@ -1182,16 +1214,17 @@ version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e19e8d5c34a3e0e2223db8e060f9e8264aeeb5c5fc64a4ee9965c062211c024b" dependencies = [ - "log 0.4.11", + "log 0.4.14", ] [[package]] name = "log" -version = "0.4.11" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fabed175da42fed1fa0746b0ea71f412aa9d35e76e95e59b192c64b9dc2bf8b" +checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" dependencies = [ - "cfg-if 0.1.10", + "cfg-if 1.0.0", + "value-bag", ] [[package]] @@ -1206,7 +1239,7 @@ version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "aae38d669396ca9b707bfc3db254bc382ddb94f57cc5c235f34623a669a01dab" dependencies = [ - "log 0.4.11", + "log 0.4.14", "phf", "phf_codegen", "serde", @@ -1277,7 +1310,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf80d3e903b34e0bd7282b218398aec54e082c840d9baf8339e0080a0c542956" dependencies = [ "libc", - "log 0.4.11", + "log 0.4.14", "miow", "ntapi", "winapi", @@ -1410,14 +1443,17 @@ name = "paperoni" version = "0.3.0-alpha1" dependencies = [ "async-std", + "atty", "clap", "comfy-table", "epub-builder", + "flexi_logger", "futures", "html5ever", "indicatif", "kuchiki", "lazy_static", + "log 0.4.14", "md5", "regex", "surf", @@ -1568,7 +1604,7 @@ checksum = "a2a7bc6b2a29e632e45451c941832803a18cce6781db04de8a04696cdca8bde4" dependencies = [ "cfg-if 0.1.10", "libc", - "log 0.4.11", + "log 0.4.14", "wepoll-sys", "winapi", ] @@ -1801,7 +1837,7 @@ dependencies = [ "cssparser", "derive_more", "fxhash", - "log 0.4.11", + "log 0.4.14", "matches", "phf", "phf_codegen", @@ -2112,7 +2148,7 @@ dependencies = [ "futures-util", "http-client", "http-types", - "log 0.4.11", + "log 0.4.14", "mime_guess", "once_cell", "pin-project-lite 0.2.4", @@ -2269,7 +2305,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b0987850db3733619253fe60e17cb59b82d37c7e6c0236bb81e4d6b87c879f27" dependencies = [ "cfg-if 0.1.10", - "log 0.4.11", + "log 0.4.14", "pin-project-lite 0.1.11", "tracing-attributes", "tracing-core", @@ -2400,6 +2436,15 @@ dependencies = [ "rand 0.7.3", ] +[[package]] +name = "value-bag" +version = "1.0.0-alpha.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b676010e055c99033117c2343b33a40a30b91fecd6c49055ac9cd2d6c305ab1" +dependencies = [ + "ctor", +] + [[package]] name = "vcpkg" version = "0.2.10" @@ -2460,7 +2505,7 @@ checksum = "f22b422e2a757c35a73774860af8e112bff612ce6cb604224e8e47641a9e4f68" dependencies = [ "bumpalo", "lazy_static", - "log 0.4.11", + "log 0.4.14", "proc-macro2", "quote", "syn", @@ -2549,6 +2594,12 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "yansi" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fc79f4a1e39857fc00c3f662cbf2651c771f00e9c15fe2abc341806bd46bd71" + [[package]] name = "zip" version = "0.5.8" diff --git a/Cargo.toml b/Cargo.toml index b8fce77..7cdfcd4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,14 +13,17 @@ readme = "README.md" [dependencies] async-std = "1.9.0" +atty = "0.2.14" clap = "2.33.3" comfy-table = "2.1.0" epub-builder = "0.4.8" +flexi_logger = "0.17.1" futures = "0.3.14" html5ever = "0.25.1" indicatif = "0.15.0" kuchiki = "0.8.1" lazy_static = "1.4.0" 
+log = "0.4.14" md5 = "0.7.0" regex = "1.4.5" surf = "2.2.0" diff --git a/src/epub.rs b/src/epub.rs index 36d766f..0b46277 100644 --- a/src/epub.rs +++ b/src/epub.rs @@ -3,6 +3,7 @@ use std::fs::File; use comfy_table::{Attribute, Cell, CellAlignment, Color, ContentArrangement, Table}; use epub_builder::{EpubBuilder, EpubContent, ZipLibrary}; use indicatif::{ProgressBar, ProgressStyle}; +use log::{debug, info}; use crate::{ errors::PaperoniError, @@ -19,7 +20,9 @@ pub fn generate_epubs( "{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} epub {pos}/{len:7} {msg:.green}", ); bar.set_style(style); - bar.set_message("Generating epubs"); + if !articles.is_empty() { + bar.set_message("Generating epubs"); + } let mut errors: Vec<PaperoniError> = Vec::new(); @@ -47,6 +50,7 @@ pub fn generate_epubs( return Err(errors); } }; + debug!("Creating {:?}", name); epub.inline_toc(); articles .iter() @@ -62,7 +66,7 @@ pub fn generate_epubs( EpubContent::new(format!("article_{}.xhtml", idx), html_str.as_bytes()) .title(replace_metadata_value(section_name)), )?; - + info!("Adding images for {:?}", name); article.img_urls.iter().for_each(|img| { // TODO: Add error handling and return errors as a vec let mut file_path = std::env::temp_dir(); @@ -76,6 +80,7 @@ pub fn generate_epubs( ) .unwrap(); }); + info!("Added images for {:?}", name); Ok(()) }; if let Err(mut error) = article_result() { @@ -98,6 +103,7 @@ pub fn generate_epubs( } bar.finish_with_message("Generated epub\n"); + debug!("Created {:?}", name); println!("Created {:?}", name); } None => { @@ -119,6 +125,7 @@ pub fn generate_epubs( .replace("/", " ") .replace("\\", " ") ); + debug!("Creating {:?}", file_name); let mut out_file = File::create(&file_name).unwrap(); let mut html_buf = Vec::new(); extractor::serialize_to_xhtml(article.article(), &mut html_buf) @@ -145,7 +152,7 @@ pub fn generate_epubs( successful_articles_table.add_row(vec![article.metadata().title()]); - // println!("Created {:?}", file_name); + debug!("Created {:?}", file_name); Ok(()) }; if let Err(mut error) = result() { diff --git a/src/http.rs b/src/http.rs index 3dc1e42..9bdaa42 100644 --- a/src/http.rs +++ b/src/http.rs @@ -2,6 +2,7 @@ use async_std::io::prelude::*; use async_std::{fs::File, stream}; use futures::StreamExt; use indicatif::ProgressBar; +use log::{debug, info}; use url::Url; use crate::{errors::ErrorKind, errors::PaperoniError, extractor::Extractor}; @@ -10,7 +11,7 @@ type HTMLResource = (String, String); pub async fn fetch_html(url: &str) -> Result<HTMLResource, PaperoniError> { let client = surf::Client::new(); - // println!("Fetching..."); + debug!("Fetching {}", url); let process_request = async { let mut redirect_count: u8 = 0; @@ -23,10 +24,19 @@ pub async fn fetch_html(url: &str) -> Result<HTMLResource, PaperoniError> { if res.status().is_redirection() { if let Some(location) = res.header(surf::http::headers::LOCATION) { match Url::parse(location.last().as_str()) { - Ok(valid_url) => url = valid_url, + Ok(valid_url) => { + info!("Redirecting {} to {}", url, valid_url); + url = valid_url + } Err(e) => match e { url::ParseError::RelativeUrlWithoutBase => { - url = base_url.join(location.last().as_str())? 
+ match base_url.join(location.last().as_str()) { + Ok(joined_url) => { + info!("Redirecting {} to {}", url, joined_url); + url = joined_url; + } + Err(e) => return Err(e.into()), + } } e => return Err(e.into()), }, @@ -35,6 +45,7 @@ pub async fn fetch_html(url: &str) -> Result<HTMLResource, PaperoniError> { } else if res.status().is_success() { if let Some(mime) = res.content_type() { if mime.essence() == "text/html" { + debug!("Successfully fetched {}", url); return Ok((url.to_string(), res.body_string().await?)); } else { let msg = format!( @@ -67,7 +78,11 @@ pub async fn download_images( bar: &ProgressBar, ) -> Result<(), Vec<PaperoniError>> { if extractor.img_urls.len() > 0 { - // println!("Downloading images..."); + debug!( + "Downloading {} images for {}", + extractor.img_urls.len(), + article_origin + ); } let img_count = extractor.img_urls.len(); diff --git a/src/main.rs b/src/main.rs index 98fa3a6..3eee69e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -7,6 +7,7 @@ use comfy_table::presets::{UTF8_FULL, UTF8_HORIZONTAL_BORDERS_ONLY}; use comfy_table::{Attribute, Cell, CellAlignment, ContentArrangement, Table}; use futures::stream::StreamExt; use indicatif::{ProgressBar, ProgressStyle}; +use log::{debug, warn}; use url::Url; mod cli; @@ -27,6 +28,13 @@ fn main() { let app_config = cli::cli_init(); if !app_config.urls().is_empty() { + match flexi_logger::Logger::with_str("paperoni=debug") + .log_to_file() + .start() + { + Ok(_) => (), + Err(e) => eprintln!("Unable to start logger!\n{}", e), + } download(app_config); } } @@ -46,7 +54,7 @@ fn download(app_config: AppConfig) { while let Some(fetch_result) = responses.next().await { match fetch_result { Ok((url, html)) => { - // println!("Extracting"); + debug!("Extracting {}", &url); let mut extractor = Extractor::from_html(&html, &url); bar.set_message("Extracting..."); match extractor.extract_content() { @@ -56,7 +64,7 @@ fn download(app_config: AppConfig) { download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar) .await { - eprintln!( + warn!( "{} image{} failed to download for {}", img_errors.len(), if img_errors.len() > 1 { "s" } else { "" }, @@ -78,6 +86,7 @@ fn download(app_config: AppConfig) { articles }); bar.finish_with_message("Downloaded articles"); + let mut succesful_articles_table = Table::new(); succesful_articles_table .load_preset(UTF8_FULL) diff --git a/src/moz_readability/mod.rs b/src/moz_readability/mod.rs index c3ab1d2..38236d3 100644 --- a/src/moz_readability/mod.rs +++ b/src/moz_readability/mod.rs @@ -7,6 +7,7 @@ use kuchiki::{ traits::*, NodeData, NodeRef, }; +use log::info; use url::Url; use crate::errors::{ErrorKind, PaperoniError}; @@ -1587,14 +1588,12 @@ impl Readability { /// Using a variety of metrics (content score, classname, element types), find the content that is most likely to be the stuff /// a user wants to read. Then return it wrapped up in a div. fn grab_article(&mut self) -> Result<(), PaperoniError> { - // TODO: Add logging for this - // println!("Grabbing article"); + info!("Grabbing article {:?}", self.metadata.title); // var doc = this._doc; // var isPaging = (page !== null ? true: false); // page = page ? 
page : this._doc.body; let page = self.root_node.select_first("body"); if page.is_err() { - // TODO:Have error logging for this return Err(ErrorKind::ReadabilityError("Document has no <body>".into()).into()); } let page = page.unwrap(); @@ -2114,6 +2113,7 @@ impl Readability { false }); self.article_node = Some(article_content); + info!("Successfully grabbed article {:?}", self.metadata.title); return Ok(()); } } From a3de3fb6ff607120c9287ec0533ba53b5898a8ba Mon Sep 17 00:00:00 2001 From: Kenneth Gitere <gitere81@gmail.com> Date: Sat, 24 Apr 2021 13:57:06 +0300 Subject: [PATCH 14/24] Add ImgError struct for representing errors in downloading article images --- src/errors.rs | 47 +++++++++++++++++++++++ src/http.rs | 103 +++++++++++++++++++++++++++----------------------- src/main.rs | 7 ++++ 3 files changed, 110 insertions(+), 47 deletions(-) diff --git a/src/errors.rs b/src/errors.rs index c37ff8e..84d1535 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -14,6 +14,53 @@ pub enum ErrorKind { ReadabilityError(String), } +#[derive(Error, Debug)] +#[error("{kind}")] +/// Used to represent errors from downloading images. Errors from here are used solely for debugging +/// as they are considered recoverable. +pub struct ImgError { + kind: ErrorKind, + url: Option<String>, +} + +impl ImgError { + pub fn with_kind(kind: ErrorKind) -> Self { + ImgError { url: None, kind } + } + + pub fn set_url(&mut self, url: &str) { + self.url = Some(url.to_string()); + } + + pub fn url(&self) -> &Option<String> { + &self.url + } +} + +impl From<ErrorKind> for ImgError { + fn from(kind: ErrorKind) -> Self { + ImgError::with_kind(kind) + } +} + +impl From<surf::Error> for ImgError { + fn from(err: surf::Error) -> Self { + ImgError::with_kind(ErrorKind::HTTPError(err.to_string())) + } +} + +impl From<url::ParseError> for ImgError { + fn from(err: url::ParseError) -> Self { + ImgError::with_kind(ErrorKind::HTTPError(err.to_string())) + } +} + +impl From<std::io::Error> for ImgError { + fn from(err: std::io::Error) -> Self { + ImgError::with_kind(ErrorKind::IOError(err.to_string())) + } +} + #[derive(Error, Debug)] #[error("{kind}")] pub struct PaperoniError { diff --git a/src/http.rs b/src/http.rs index 9bdaa42..bb457b1 100644 --- a/src/http.rs +++ b/src/http.rs @@ -5,8 +5,8 @@ use indicatif::ProgressBar; use log::{debug, info}; use url::Url; -use crate::{errors::ErrorKind, errors::PaperoniError, extractor::Extractor}; - +use crate::errors::{ErrorKind, ImgError, PaperoniError}; +use crate::extractor::Extractor; type HTMLResource = (String, String); pub async fn fetch_html(url: &str) -> Result<HTMLResource, PaperoniError> { @@ -76,7 +76,7 @@ pub async fn download_images( extractor: &mut Extractor, article_origin: &Url, bar: &ProgressBar, -) -> Result<(), Vec<PaperoniError>> { +) -> Result<(), Vec<ImgError>> { if extractor.img_urls.len() > 0 { debug!( "Downloading {} images for {}", @@ -102,53 +102,62 @@ pub async fn download_images( bar.set_message(format!("Downloading images [{}/{}]", img_idx + 1, img_count).as_str()); match req.await { Ok(mut img_response) => { - // let mut img_response = req.await.expect("Unable to retrieve image"); - let img_content: Vec<u8> = match img_response.body_bytes().await { - Ok(bytes) => bytes, - Err(e) => return Err(e.into()), - }; - let img_mime = img_response - .content_type() - .map(|mime| mime.essence().to_string()); - let img_ext = match img_response - .content_type() - .map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string()) - { - Some(mime_str) => mime_str, - None => { 
- return Err(ErrorKind::HTTPError( - "Image has no Content-Type".to_owned(), - ) - .into()) + let process_response = async { + let img_content: Vec<u8> = match img_response.body_bytes().await { + Ok(bytes) => bytes, + Err(e) => return Err(e.into()), + }; + let img_mime = img_response + .content_type() + .map(|mime| mime.essence().to_string()); + let img_ext = match img_response + .content_type() + .map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string()) + { + Some(mime_str) => mime_str, + None => { + return Err(ErrorKind::HTTPError( + "Image has no Content-Type".to_owned(), + ) + .into()) + } + }; + + let mut img_path = std::env::temp_dir(); + img_path.push(format!("{}.{}", hash_url(&url), &img_ext)); + let mut img_file = match File::create(&img_path).await { + Ok(file) => file, + Err(e) => return Err(e.into()), + }; + match img_file.write_all(&img_content).await { + Ok(_) => (), + Err(e) => return Err(e.into()), } - }; - let mut img_path = std::env::temp_dir(); - img_path.push(format!("{}.{}", hash_url(&url), &img_ext)); - let mut img_file = match File::create(&img_path).await { - Ok(file) => file, - Err(e) => return Err(e.into()), + Ok(( + url, + img_path + .file_name() + .map(|os_str_name| { + os_str_name + .to_str() + .expect("Unable to get image file name") + .to_string() + }) + .unwrap(), + img_mime, + )) }; - match img_file.write_all(&img_content).await { - Ok(_) => (), - Err(e) => return Err(e.into()), - } - - Ok(( - url, - img_path - .file_name() - .map(|os_str_name| { - os_str_name - .to_str() - .expect("Unable to get image file name") - .to_string() - }) - .unwrap(), - img_mime, - )) + process_response.await.map_err(|mut e: ImgError| { + e.set_url(url); + e + }) + } + Err(e) => { + let mut img_err: ImgError = e.into(); + img_err.set_url(url); + Err(img_err) } - Err(e) => Err(e.into()), } }); @@ -170,7 +179,7 @@ pub async fn download_images( let imgs_req_iter = stream::from_iter(imgs_req_iter) .buffered(10) - .collect::<Vec<Result<_, PaperoniError>>>() + .collect::<Vec<Result<_, ImgError>>>() .await; let mut errors = Vec::new(); let mut replaced_imgs = Vec::new(); diff --git a/src/main.rs b/src/main.rs index 3eee69e..697b5f3 100644 --- a/src/main.rs +++ b/src/main.rs @@ -70,6 +70,13 @@ fn download(app_config: AppConfig) { if img_errors.len() > 1 { "s" } else { "" }, url ); + for img_error in img_errors { + warn!( + "{}\n\t\tReason {}", + img_error.url().as_ref().unwrap(), + img_error + ); + } } articles.push(extractor); } From 65f8ebda56041b76613cfcc612e47fdef68803b9 Mon Sep 17 00:00:00 2001 From: Kenneth Gitere <gitere81@gmail.com> Date: Sat, 24 Apr 2021 13:58:03 +0300 Subject: [PATCH 15/24] Add logs crate for dealing with printing out the final download summary --- src/logs.rs | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++++ src/main.rs | 51 +++---------------------- 2 files changed, 114 insertions(+), 45 deletions(-) create mode 100644 src/logs.rs diff --git a/src/logs.rs b/src/logs.rs new file mode 100644 index 0000000..c2275c6 --- /dev/null +++ b/src/logs.rs @@ -0,0 +1,108 @@ +use comfy_table::presets::UTF8_HORIZONTAL_BORDERS_ONLY; +use comfy_table::{Attribute, Cell, CellAlignment, ContentArrangement, Table}; +use log::error; + +use crate::errors::PaperoniError; + +pub fn display_summary( + initial_article_count: usize, + succesful_articles_table: Table, + errors: Vec<PaperoniError>, +) { + let successfully_downloaded_count = initial_article_count - errors.len(); + + println!( + "{}", + short_summary( + initial_article_count, + 
successfully_downloaded_count, + errors.len() + ) + ); + + if successfully_downloaded_count > 0 { + println!("{}", succesful_articles_table); + } + if !errors.is_empty() { + println!( + "{}Failed article downloads{}", + Attribute::Bold, + Attribute::NormalIntensity + ); + let mut table_failed = Table::new(); + table_failed + .load_preset(UTF8_HORIZONTAL_BORDERS_ONLY) + .set_header(vec![ + Cell::new("Link").set_alignment(CellAlignment::Center), + Cell::new("Reason").set_alignment(CellAlignment::Center), + ]) + .set_content_arrangement(ContentArrangement::Dynamic); + + for error in errors { + let error_source = error + .article_source() + .clone() + .unwrap_or_else(|| "<unknown link>".to_string()); + table_failed.add_row(vec![&error_source, &format!("{}", error.kind())]); + error!("{}\n - {}", error, error_source); + } + println!("{}", table_failed); + } +} + +/// Returns a string summary of the total number of failed and successful article downloads +fn short_summary(initial_count: usize, successful_count: usize, failed_count: usize) -> String { + if initial_count != successful_count + failed_count { + panic!("initial_count must be equal to the sum of failed and successful count") + } + let get_noun = |count: usize| if count == 1 { "article" } else { "articles" }; + if successful_count == initial_count { + "All articles downloaded successfully".into() + } else if successful_count == 0 { + "All articles failed to download".into() + } else { + format!( + "{} {} downloaded successfully, {} {} failed", + successful_count, + get_noun(successful_count), + failed_count, + get_noun(failed_count) + ) + } +} + +#[cfg(test)] +mod tests { + use super::short_summary; + #[test] + fn test_short_summary() { + assert_eq!( + short_summary(10, 10, 0), + "All articles downloaded successfully".to_string() + ); + assert_eq!( + short_summary(10, 0, 10), + "All articles failed to download".to_string() + ); + assert_eq!( + short_summary(10, 8, 2), + "8 articles downloaded successfully, 2 articles failed".to_string() + ); + assert_eq!( + short_summary(10, 1, 9), + "1 article downloaded successfully, 9 articles failed".to_string() + ); + assert_eq!( + short_summary(7, 6, 1), + "6 articles downloaded successfully, 1 article failed".to_string() + ); + } + + #[test] + #[should_panic( + expected = "initial_count must be equal to the sum of failed and successful count" + )] + fn test_short_summary_panics_on_invalid_input() { + short_summary(0, 12, 43); + } +} diff --git a/src/main.rs b/src/main.rs index 697b5f3..d23311d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -4,7 +4,7 @@ extern crate lazy_static; use async_std::stream; use async_std::task; use comfy_table::presets::{UTF8_FULL, UTF8_HORIZONTAL_BORDERS_ONLY}; -use comfy_table::{Attribute, Cell, CellAlignment, ContentArrangement, Table}; +use comfy_table::{ContentArrangement, Table}; use futures::stream::StreamExt; use indicatif::{ProgressBar, ProgressStyle}; use log::{debug, warn}; @@ -17,12 +17,14 @@ mod extractor; /// This module is responsible for async HTTP calls for downloading /// the HTML content and images mod http; +mod logs; mod moz_readability; use cli::AppConfig; use epub::generate_epubs; use extractor::Extractor; use http::{download_images, fetch_html}; +use logs::display_summary; fn main() { let app_config = cli::cli_init(); @@ -105,50 +107,9 @@ fn download(app_config: AppConfig) { errors.extend(gen_epub_errors); } }; - let successfully_downloaded_count = app_config.urls().len() - errors.len(); - - println!( - "{} articles downloaded successfully. 
{}", - if successfully_downloaded_count == app_config.urls().len() { - "All".to_string() - } else { - successfully_downloaded_count.to_string() - }, - if errors.len() > 0 { - errors.len().to_string() + " failed" - } else { - "".to_string() - } - ); - - if successfully_downloaded_count > 0 { - println!("{}", succesful_articles_table); - } - if !errors.is_empty() { - println!( - "{}Failed article downloads{}", - Attribute::Bold, - Attribute::NormalIntensity - ); - let mut table_failed = Table::new(); - table_failed - .load_preset(UTF8_HORIZONTAL_BORDERS_ONLY) - .set_header(vec![ - Cell::new("Link").set_alignment(CellAlignment::Center), - Cell::new("Reason").set_alignment(CellAlignment::Center), - ]) - .set_content_arrangement(ContentArrangement::Dynamic); - - for error in errors { - table_failed.add_row(vec![ - error - .article_source() - .clone() - .unwrap_or_else(|| "<unknown link>".to_string()), - format!("{}", error.kind()), - ]); - } - println!("{}", table_failed); + let has_errors = !errors.is_empty(); + display_summary(app_config.urls().len(), succesful_articles_table, errors); + if has_errors { std::process::exit(1); } } From a9787d7b5ab4025c1306cb7c68aadfe41814558c Mon Sep 17 00:00:00 2001 From: Kenneth Gitere <gitere81@gmail.com> Date: Sat, 24 Apr 2021 15:13:44 +0300 Subject: [PATCH 16/24] Add colored output and configuring of a paperoni root directory for logs --- Cargo.lock | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++-- Cargo.toml | 2 ++ src/logs.rs | 47 ++++++++++++++++++++++++++++++------------- src/main.rs | 28 +++++++++++++++++++------- 4 files changed, 112 insertions(+), 23 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c047a45..7808f4f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -406,6 +406,17 @@ dependencies = [ "vec_map", ] +[[package]] +name = "colored" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3616f750b84d8f0de8a58bda93e08e2a81ad3f523089b05f1dffecab48c6cbd" +dependencies = [ + "atty", + "lazy_static", + "winapi", +] + [[package]] name = "comfy-table" version = "2.1.0" @@ -639,6 +650,26 @@ dependencies = [ "generic-array", ] +[[package]] +name = "directories" +version = "3.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e69600ff1703123957937708eb27f7a564e48885c537782722ed0ba3189ce1d7" +dependencies = [ + "dirs-sys", +] + +[[package]] +name = "dirs-sys" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03d86534ed367a67548dc68113a0f5db55432fdfbb6e6f9d77704397d95d5780" +dependencies = [ + "libc", + "redox_users", + "winapi", +] + [[package]] name = "discard" version = "1.0.4" @@ -925,6 +956,17 @@ dependencies = [ "wasi 0.9.0+wasi-snapshot-preview1", ] +[[package]] +name = "getrandom" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c9495705279e7140bf035dde1f6e750c162df8b625267cd52cc44e0b156732c8" +dependencies = [ + "cfg-if 1.0.0", + "libc", + "wasi 0.10.0+wasi-snapshot-preview1", +] + [[package]] name = "ghash" version = "0.3.0" @@ -1445,7 +1487,9 @@ dependencies = [ "async-std", "atty", "clap", + "colored", "comfy-table", + "directories", "epub-builder", "flexi_logger", "futures", @@ -1680,7 +1724,7 @@ version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" dependencies = [ - "getrandom", + "getrandom 0.1.15", "libc", "rand_chacha", "rand_core 0.5.1", @@ -1719,7 
+1763,7 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" dependencies = [ - "getrandom", + "getrandom 0.1.15", ] [[package]] @@ -1764,6 +1808,16 @@ dependencies = [ "bitflags", ] +[[package]] +name = "redox_users" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "528532f3d801c87aec9def2add9ca802fe569e44a544afe633765267840abe64" +dependencies = [ + "getrandom 0.2.2", + "redox_syscall 0.2.6", +] + [[package]] name = "regex" version = "1.4.5" diff --git a/Cargo.toml b/Cargo.toml index 7cdfcd4..b37aea1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,7 +15,9 @@ readme = "README.md" async-std = "1.9.0" atty = "0.2.14" clap = "2.33.3" +colored = "2.0.0" comfy-table = "2.1.0" +directories = "3.0.2" epub-builder = "0.4.8" flexi_logger = "0.17.1" futures = "0.3.14" diff --git a/src/logs.rs b/src/logs.rs index c2275c6..6efec8a 100644 --- a/src/logs.rs +++ b/src/logs.rs @@ -1,5 +1,6 @@ +use colored::*; use comfy_table::presets::UTF8_HORIZONTAL_BORDERS_ONLY; -use comfy_table::{Attribute, Cell, CellAlignment, ContentArrangement, Table}; +use comfy_table::{Cell, CellAlignment, ContentArrangement, Table}; use log::error; use crate::errors::PaperoniError; @@ -18,17 +19,14 @@ pub fn display_summary( successfully_downloaded_count, errors.len() ) + .bold() ); if successfully_downloaded_count > 0 { println!("{}", succesful_articles_table); } if !errors.is_empty() { - println!( - "{}Failed article downloads{}", - Attribute::Bold, - Attribute::NormalIntensity - ); + println!("\n{}", "Failed article downloads".bright_red().bold()); let mut table_failed = Table::new(); table_failed .load_preset(UTF8_HORIZONTAL_BORDERS_ONLY) @@ -56,10 +54,14 @@ fn short_summary(initial_count: usize, successful_count: usize, failed_count: us panic!("initial_count must be equal to the sum of failed and successful count") } let get_noun = |count: usize| if count == 1 { "article" } else { "articles" }; - if successful_count == initial_count { - "All articles downloaded successfully".into() + if successful_count == initial_count && successful_count == 1 { + "Article downloaded successfully".green().to_string() + } else if initial_count == failed_count && failed_count == 1 { + "Article failed to download".red().to_string() + } else if successful_count == initial_count { + "All articles downloaded successfully".green().to_string() } else if successful_count == 0 { - "All articles failed to download".into() + "All articles failed to download".red().to_string() } else { format!( "{} {} downloaded successfully, {} {} failed", @@ -68,33 +70,50 @@ fn short_summary(initial_count: usize, successful_count: usize, failed_count: us failed_count, get_noun(failed_count) ) + .yellow() + .to_string() } } #[cfg(test)] mod tests { use super::short_summary; + use colored::*; #[test] fn test_short_summary() { + assert_eq!( + short_summary(1, 1, 0), + "Article downloaded successfully".green().to_string() + ); + assert_eq!( + short_summary(1, 0, 1), + "Article failed to download".red().to_string() + ); assert_eq!( short_summary(10, 10, 0), - "All articles downloaded successfully".to_string() + "All articles downloaded successfully".green().to_string() ); assert_eq!( short_summary(10, 0, 10), - "All articles failed to download".to_string() + "All articles failed to download".red().to_string() ); assert_eq!( short_summary(10, 8, 2), - "8 articles downloaded successfully, 2 articles 
failed".to_string() + "8 articles downloaded successfully, 2 articles failed" + .yellow() + .to_string() ); assert_eq!( short_summary(10, 1, 9), - "1 article downloaded successfully, 9 articles failed".to_string() + "1 article downloaded successfully, 9 articles failed" + .yellow() + .to_string() ); assert_eq!( short_summary(7, 6, 1), - "6 articles downloaded successfully, 1 article failed".to_string() + "6 articles downloaded successfully, 1 article failed" + .yellow() + .to_string() ); } diff --git a/src/main.rs b/src/main.rs index d23311d..7fa1ddb 100644 --- a/src/main.rs +++ b/src/main.rs @@ -5,6 +5,7 @@ use async_std::stream; use async_std::task; use comfy_table::presets::{UTF8_FULL, UTF8_HORIZONTAL_BORDERS_ONLY}; use comfy_table::{ContentArrangement, Table}; +use directories::UserDirs; use futures::stream::StreamExt; use indicatif::{ProgressBar, ProgressStyle}; use log::{debug, warn}; @@ -30,13 +31,26 @@ fn main() { let app_config = cli::cli_init(); if !app_config.urls().is_empty() { - match flexi_logger::Logger::with_str("paperoni=debug") - .log_to_file() - .start() - { - Ok(_) => (), - Err(e) => eprintln!("Unable to start logger!\n{}", e), - } + match UserDirs::new() { + Some(user_dirs) => { + let home_dir = user_dirs.home_dir(); + let paperoni_dir = home_dir.join(".paperoni"); + let log_dir = paperoni_dir.join("logs"); + if !paperoni_dir.is_dir() || !log_dir.is_dir() { + std::fs::create_dir_all(&log_dir) + .expect("Unable to create paperoni directories on home directory for logging purposes"); + } + match flexi_logger::Logger::with_str("paperoni=debug") + .directory(log_dir) + .log_to_file() + .start() + { + Ok(_) => (), + Err(e) => eprintln!("Unable to start logger!\n{}", e), + } + } + None => eprintln!("Unable to get user directories for logging purposes"), + }; download(app_config); } } From 088699b2c362bfad1d4b86188204f7186e9a3f40 Mon Sep 17 00:00:00 2001 From: Kenneth Gitere <gitere81@gmail.com> Date: Sat, 24 Apr 2021 15:50:43 +0300 Subject: [PATCH 17/24] Add debug flag --- src/cli.rs | 20 +++++++++++++++++++- src/main.rs | 41 ++++++++++++++++++++++------------------- 2 files changed, 41 insertions(+), 20 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index a8701bc..9e4b62e 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -38,7 +38,12 @@ It takes a url and downloads the article content from it and saves it to an epub .long("max_conn") .help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8") .long_help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8.\nNOTE: It is advised to use as few connections as needed i.e between 1 and 50. 
Using more connections can end up overloading your network card with too many concurrent requests.") - .takes_value(true)); + .takes_value(true)) + .arg( + Arg::with_name("debug") + .long("debug") + .help("Enable logging of events for debugging") + .takes_value(false)); let arg_matches = app.get_matches(); let mut urls: Vec<String> = match arg_matches.value_of("file") { Some(file_name) => { @@ -84,6 +89,9 @@ It takes a url and downloads the article content from it and saves it to an epub }; app_config.set_merged(file_name); } + if arg_matches.is_present("debug") { + app_config.toggle_debug(true); + } app_config } @@ -91,6 +99,7 @@ pub struct AppConfig { urls: Vec<String>, max_conn: usize, merged: Option<String>, + is_debug: bool, } impl AppConfig { @@ -99,9 +108,14 @@ impl AppConfig { urls: vec![], max_conn, merged: None, + is_debug: false, } } + fn toggle_debug(&mut self, is_debug: bool) { + self.is_debug = is_debug; + } + fn set_urls(&mut self, urls: Vec<String>) { self.urls.extend(urls); } @@ -120,4 +134,8 @@ impl AppConfig { pub fn merged(&self) -> Option<&String> { self.merged.as_ref() } + + pub fn is_debug(&self) -> bool { + self.is_debug + } } diff --git a/src/main.rs b/src/main.rs index 7fa1ddb..ebf2dca 100644 --- a/src/main.rs +++ b/src/main.rs @@ -31,26 +31,29 @@ fn main() { let app_config = cli::cli_init(); if !app_config.urls().is_empty() { - match UserDirs::new() { - Some(user_dirs) => { - let home_dir = user_dirs.home_dir(); - let paperoni_dir = home_dir.join(".paperoni"); - let log_dir = paperoni_dir.join("logs"); - if !paperoni_dir.is_dir() || !log_dir.is_dir() { - std::fs::create_dir_all(&log_dir) - .expect("Unable to create paperoni directories on home directory for logging purposes"); + if app_config.is_debug() { + match UserDirs::new() { + Some(user_dirs) => { + let home_dir = user_dirs.home_dir(); + let paperoni_dir = home_dir.join(".paperoni"); + let log_dir = paperoni_dir.join("logs"); + if !paperoni_dir.is_dir() || !log_dir.is_dir() { + std::fs::create_dir_all(&log_dir) + .expect("Unable to create paperoni directories on home directory for logging purposes"); + } + match flexi_logger::Logger::with_str("paperoni=debug") + .directory(log_dir) + .log_to_file() + .print_message() + .start() + { + Ok(_) => (), + Err(e) => eprintln!("Unable to start logger!\n{}", e), + } } - match flexi_logger::Logger::with_str("paperoni=debug") - .directory(log_dir) - .log_to_file() - .start() - { - Ok(_) => (), - Err(e) => eprintln!("Unable to start logger!\n{}", e), - } - } - None => eprintln!("Unable to get user directories for logging purposes"), - }; + None => eprintln!("Unable to get user directories for logging purposes"), + }; + } download(app_config); } } From 36c3eb65c646efa068cff78e6bde275d3c90970e Mon Sep 17 00:00:00 2001 From: Kenneth Gitere <gitere81@gmail.com> Date: Tue, 27 Apr 2021 20:34:26 +0300 Subject: [PATCH 18/24] Add appendix page for listing the source of the article --- src/epub.rs | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/src/epub.rs b/src/epub.rs index 0b46277..87d6106 100644 --- a/src/epub.rs +++ b/src/epub.rs @@ -91,6 +91,17 @@ pub fn generate_epubs( successful_articles_table.add_row(vec![article.metadata().title()]); epub }); + let appendix = generate_appendix(articles.iter().collect()); + if let Err(err) = epub.add_content( + EpubContent::new("appendix.xhtml", appendix.as_bytes()) + .title(replace_metadata_value("Article Sources")), + ) { + let mut paperoni_err: PaperoniError = err.into(); + 
paperoni_err.set_article_source(name); + errors.push(paperoni_err); + return Err(errors); + } + let mut out_file = File::create(&name).unwrap(); match epub.generate(&mut out_file) { Ok(_) => (), @@ -147,6 +158,11 @@ pub fn generate_epubs( img.1.as_ref().unwrap(), )?; } + let appendix = generate_appendix(vec![&article]); + epub.add_content( + EpubContent::new("appendix.xhtml", appendix.as_bytes()) + .title(replace_metadata_value("Article Source")), + )?; epub.generate(&mut out_file)?; bar.inc(1); @@ -179,6 +195,37 @@ fn replace_metadata_value(value: &str) -> String { .replace(">", ">") } +//TODO: The type signature of the argument should change as it requires that merged articles create an entirely new Vec of references +fn generate_appendix(articles: Vec<&Extractor>) -> String { + let link_tags: String = articles + .iter() + .map(|article| { + let article_name = if !article.metadata().title().is_empty() { + article.metadata().title() + } else { + &article.url + }; + format!( + "<a href=\"{}\">{}</a><br></br>", + replace_metadata_value(&article.url), + replace_metadata_value(article_name) + ) + }) + .collect(); + let template = format!( + r#"<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops"> + <head> + </head> + <body> + <h2>Appendix</h2><h3>Article sources</h3> + {} + </body> +</html>"#, + link_tags + ); + template +} + #[cfg(test)] mod test { use super::replace_metadata_value; From 00d704fdd68bba0776a4ae4552324811f9391bc7 Mon Sep 17 00:00:00 2001 From: Kenneth Gitere <gitere81@gmail.com> Date: Wed, 28 Apr 2021 07:47:45 +0300 Subject: [PATCH 19/24] Move initializing logger to logs module --- Cargo.lock | 1 - Cargo.toml | 2 +- src/logs.rs | 26 ++++++++++++++++++++++++++ src/main.rs | 25 ++----------------------- 4 files changed, 29 insertions(+), 25 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7808f4f..831201d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1485,7 +1485,6 @@ name = "paperoni" version = "0.3.0-alpha1" dependencies = [ "async-std", - "atty", "clap", "colored", "comfy-table", diff --git a/Cargo.toml b/Cargo.toml index b37aea1..af15d18 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,7 @@ readme = "README.md" [dependencies] async-std = "1.9.0" -atty = "0.2.14" +# atty = "0.2.14" clap = "2.33.3" colored = "2.0.0" comfy-table = "2.1.0" diff --git a/src/logs.rs b/src/logs.rs index 6efec8a..84e97e0 100644 --- a/src/logs.rs +++ b/src/logs.rs @@ -1,6 +1,7 @@ use colored::*; use comfy_table::presets::UTF8_HORIZONTAL_BORDERS_ONLY; use comfy_table::{Cell, CellAlignment, ContentArrangement, Table}; +use directories::UserDirs; use log::error; use crate::errors::PaperoniError; @@ -75,6 +76,31 @@ fn short_summary(initial_count: usize, successful_count: usize, failed_count: us } } +pub fn init_logger() { + match UserDirs::new() { + Some(user_dirs) => { + let home_dir = user_dirs.home_dir(); + let paperoni_dir = home_dir.join(".paperoni"); + let log_dir = paperoni_dir.join("logs"); + if !paperoni_dir.is_dir() || !log_dir.is_dir() { + std::fs::create_dir_all(&log_dir).expect( + "Unable to create paperoni directories on home directory for logging purposes", + ); + } + match flexi_logger::Logger::with_str("paperoni=debug") + .directory(log_dir) + .log_to_file() + .print_message() + .start() + { + Ok(_) => (), + Err(e) => eprintln!("Unable to start logger!\n{}", e), + } + } + None => eprintln!("Unable to get user directories for logging purposes"), + }; +} + #[cfg(test)] mod tests { use super::short_summary; diff --git a/src/main.rs b/src/main.rs 
index ebf2dca..b5cd770 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -5,7 +5,6 @@ use async_std::stream;
 use async_std::task;
 use comfy_table::presets::{UTF8_FULL, UTF8_HORIZONTAL_BORDERS_ONLY};
 use comfy_table::{ContentArrangement, Table};
-use directories::UserDirs;
 use futures::stream::StreamExt;
 use indicatif::{ProgressBar, ProgressStyle};
 use log::{debug, warn};
@@ -25,34 +24,14 @@ use cli::AppConfig;
 use epub::generate_epubs;
 use extractor::Extractor;
 use http::{download_images, fetch_html};
-use logs::display_summary;
+use logs::{display_summary, init_logger};
 
 fn main() {
     let app_config = cli::cli_init();
 
     if !app_config.urls().is_empty() {
         if app_config.is_debug() {
-            match UserDirs::new() {
-                Some(user_dirs) => {
-                    let home_dir = user_dirs.home_dir();
-                    let paperoni_dir = home_dir.join(".paperoni");
-                    let log_dir = paperoni_dir.join("logs");
-                    if !paperoni_dir.is_dir() || !log_dir.is_dir() {
-                        std::fs::create_dir_all(&log_dir)
-                            .expect("Unable to create paperoni directories on home directory for logging purposes");
-                    }
-                    match flexi_logger::Logger::with_str("paperoni=debug")
-                        .directory(log_dir)
-                        .log_to_file()
-                        .print_message()
-                        .start()
-                    {
-                        Ok(_) => (),
-                        Err(e) => eprintln!("Unable to start logger!\n{}", e),
-                    }
-                }
-                None => eprintln!("Unable to get user directories for logging purposes"),
-            };
+            init_logger();
         }
         download(app_config);
     }
 }

From ae52cc4e138b817c38c10b8e0795711619bb356e Mon Sep 17 00:00:00 2001
From: Kenneth Gitere <gitere81@gmail.com>
Date: Thu, 29 Apr 2021 19:58:37 +0300
Subject: [PATCH 20/24] Add features for logging and cli

- display of partial downloads in the summary
- custom file name that is displayed after the summary, ensuring it is visible
- log-to-file flag which specifies that logs will be sent to the default directory
- verbose flag (-v) used to configure the log levels
- disabling the progress bars when logging to the terminal is active
---
 Cargo.lock  |   1 +
 Cargo.toml  |   1 +
 src/cli.rs  |  96 ++++++++++++++++++++++------
 src/epub.rs |  22 ++++---
 src/logs.rs | 177 +++++++++++++++++++++++++++++++++++++++++-----------
 src/main.rs |  35 ++++++++---
 6 files changed, 259 insertions(+), 73 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 831201d..a82ccaa 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1485,6 +1485,7 @@ name = "paperoni"
 version = "0.3.0-alpha1"
 dependencies = [
  "async-std",
+ "chrono",
  "clap",
  "colored",
  "comfy-table",
diff --git a/Cargo.toml b/Cargo.toml
index af15d18..655fbb7 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -14,6 +14,7 @@ readme = "README.md"
 [dependencies]
 async-std = "1.9.0"
 # atty = "0.2.14"
+chrono = "0.4.19"
 clap = "2.33.3"
 colored = "2.0.0"
 comfy-table = "2.1.0"
diff --git a/src/cli.rs b/src/cli.rs
index 9e4b62e..abf8226 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -1,6 +1,10 @@
-use std::{fs::File, io::Read};
+use std::{fs::File, io::Read, path::Path};
 
+use chrono::{DateTime, Local};
 use clap::{App, AppSettings, Arg};
+use flexi_logger::LevelFilter as LogLevel;
+
+use crate::logs::init_logger;
 
 pub fn cli_init() -> AppConfig {
     let app = App::new("paperoni")
@@ -12,7 +16,7 @@ pub fn cli_init() -> AppConfig {
         .about(
             "
 Paperoni is an article downloader.
-It takes a url and downloads the article content from it and saves it to an epub.
+It takes a url, downloads the article content from it and saves it to an epub.
         ",
         )
         .arg(
@@ -40,11 +44,27 @@ It takes a url, downloads the article content from it and saves it to an epub.
             .long_help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8.\nNOTE: It is advised to use as few connections as needed i.e between 1 and 50. Using more connections can end up overloading your network card with too many concurrent requests.")
             .takes_value(true))
         .arg(
-            Arg::with_name("debug")
-                .long("debug")
-                .help("Enable logging of events for debugging")
+            Arg::with_name("verbosity")
+                .short("v")
+                .multiple(true)
+                .help("Enables logging of events and sets the verbosity level. Use -h to read about its usage")
+                .long_help(
+"This takes up to 4 levels of verbosity in the following order.
+  - Error (-v)
+  - Warn (-vv)
+  - Info (-vvv)
+  - Debug (-vvvv)
+ When this flag is passed, it disables the progress bars and logs to stderr.
+ If you would like to send the logs to a file (and enable progress bars), pass the log-to-file flag."
+                )
+                .takes_value(false))
+        .arg(
+            Arg::with_name("log-to-file")
+                .long("log-to-file")
+                .help("Enables logging of events to a file located in .paperoni/logs with a default log level of debug. Use -v to specify the logging level")
+                .takes_value(false));
     let arg_matches = app.get_matches();
+
     let mut urls: Vec<String> = match arg_matches.value_of("file") {
         Some(file_name) => {
             if let Ok(mut file) = File::open(file_name) {
@@ -81,17 +101,43 @@ It takes a url, downloads the article content from it and saves it to an epub.
 
     let mut app_config = AppConfig::new(max_conn);
     app_config.set_urls(urls);
+
     if let Some(name) = arg_matches.value_of("output_name") {
+        let file_path = Path::new(name);
+        if !file_path.is_file() {
+            eprintln!("{:?} is not a valid file", name);
+            std::process::exit(1);
+        }
+
         let file_name = if name.ends_with(".epub") && name.len() > 5 {
             name.to_owned()
         } else {
             name.to_owned() + ".epub"
         };
-        app_config.set_merged(file_name);
+        app_config.merged = Some(file_name);
     }
-    if arg_matches.is_present("debug") {
-        app_config.toggle_debug(true);
+
+    if arg_matches.is_present("verbosity") {
+        if !arg_matches.is_present("log-to-file") {
+            app_config.can_disable_progress_bar = true;
+        }
+        let log_levels: [LogLevel; 5] = [
+            LogLevel::Off,
+            LogLevel::Debug,
+            LogLevel::Info,
+            LogLevel::Warn,
+            LogLevel::Error,
+        ];
+        let level = arg_matches.occurrences_of("verbosity").clamp(0, 4) as usize;
+        app_config.log_level = log_levels[level];
     }
+    if arg_matches.is_present("log-to-file") {
+        app_config.log_level = LogLevel::Debug;
+        app_config.is_logging_to_file = true;
+    }
+
+    init_logger(&app_config);
+
     app_config
 }
@@ -99,7 +145,10 @@
 pub struct AppConfig {
     urls: Vec<String>,
     max_conn: usize,
     merged: Option<String>,
-    is_debug: bool,
+    log_level: LogLevel,
+    can_disable_progress_bar: bool,
+    start_time: DateTime<Local>,
+    is_logging_to_file: bool,
 }
 
 impl AppConfig {
@@ -108,22 +157,17 @@ impl AppConfig {
             urls: vec![],
             max_conn,
             merged: None,
-            is_debug: false,
+            log_level: LogLevel::Off,
+            can_disable_progress_bar: false,
+            start_time: Local::now(),
+            is_logging_to_file: false,
         }
     }
 
-    fn toggle_debug(&mut self, is_debug: bool) {
-        self.is_debug = is_debug;
-    }
-
     fn set_urls(&mut self, urls: Vec<String>) {
         self.urls.extend(urls);
     }
 
-    fn set_merged(&mut self, name: String) {
-        self.merged = Some(name);
-    }
-
     pub fn urls(&self) -> &Vec<String> {
         &self.urls
     }
@@ -135,7 +179,19 @@ impl AppConfig {
         self.merged.as_ref()
     }
 
-    pub fn is_debug(&self) -> bool {
-        self.is_debug
+    pub fn log_level(&self) -> LogLevel {
+        self.log_level
+    }
+
+    pub fn can_disable_progress_bar(&self) -> bool {
+        self.can_disable_progress_bar
+    }
+
+    pub fn start_time(&self) -> &DateTime<Local> {
+        &self.start_time
+    }
+
+    pub fn is_logging_to_file(&self) -> bool {
+        self.is_logging_to_file
     }
 }
diff --git a/src/epub.rs b/src/epub.rs
index 87d6106..75f2b9e 100644
--- a/src/epub.rs
+++ b/src/epub.rs
@@ -6,27 +6,33 @@ use indicatif::{ProgressBar, ProgressStyle};
 use log::{debug, info};
 
 use crate::{
+    cli::AppConfig,
     errors::PaperoniError,
     extractor::{self, Extractor},
 };
 
 pub fn generate_epubs(
     articles: Vec<Extractor>,
-    merged: Option<&String>,
+    app_config: &AppConfig,
     successful_articles_table: &mut Table,
 ) -> Result<(), Vec<PaperoniError>> {
-    let bar = ProgressBar::new(articles.len() as u64);
-    let style = ProgressStyle::default_bar().template(
+    let bar = if app_config.can_disable_progress_bar() {
+        ProgressBar::hidden()
+    } else {
+        let enabled_bar = ProgressBar::new(articles.len() as u64);
+        let style = ProgressStyle::default_bar().template(
         "{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} epub {pos}/{len:7} {msg:.green}",
     );
-    bar.set_style(style);
-    if !articles.is_empty() {
-        bar.set_message("Generating epubs");
-    }
+        enabled_bar.set_style(style);
+        if !articles.is_empty() {
+            enabled_bar.set_message("Generating epubs");
+        }
+        enabled_bar
+    };
+
     let mut errors: Vec<PaperoniError> = Vec::new();
 
-    match merged {
+    match app_config.merged() {
         Some(name) => {
             successful_articles_table.set_header(vec![Cell::new("Table of Contents")
                 .add_attribute(Attribute::Bold)
diff --git a/src/logs.rs b/src/logs.rs
index 84e97e0..87b5d1b 100644
--- a/src/logs.rs
+++ b/src/logs.rs
@@ -2,24 +2,28 @@ use colored::*;
 use comfy_table::presets::UTF8_HORIZONTAL_BORDERS_ONLY;
 use comfy_table::{Cell, CellAlignment, ContentArrangement, Table};
 use directories::UserDirs;
+use flexi_logger::LogSpecBuilder;
 use log::error;
 
-use crate::errors::PaperoniError;
+use crate::{cli::AppConfig, errors::PaperoniError};
 
 pub fn display_summary(
     initial_article_count: usize,
     succesful_articles_table: Table,
+    partial_downloads_count: usize,
     errors: Vec<PaperoniError>,
 ) {
-    let successfully_downloaded_count = initial_article_count - errors.len();
+    let successfully_downloaded_count =
+        initial_article_count - partial_downloads_count - errors.len();
 
     println!(
         "{}",
-        short_summary(
+        short_summary(DownloadCount::new(
             initial_article_count,
             successfully_downloaded_count,
+            partial_downloads_count,
             errors.len()
-        )
+        ))
         .bold()
     );
@@ -50,49 +54,128 @@ pub fn display_summary(
 }
 
 /// Returns a string summary of the total number of failed and successful article downloads
-fn short_summary(initial_count: usize, successful_count: usize, failed_count: usize) -> String {
-    if initial_count != successful_count + failed_count {
+fn short_summary(download_count: DownloadCount) -> String {
+    // TODO: Refactor this
+    if download_count.total
+        != download_count.successful + download_count.failed + download_count.partial
+    {
         panic!("initial_count must be equal to the sum of failed and successful count")
     }
     let get_noun = |count: usize| if count == 1 { "article" } else { "articles" };
-    if successful_count == initial_count && successful_count == 1 {
+    if download_count.successful == download_count.total && download_count.successful == 1 {
         "Article downloaded successfully".green().to_string()
-    } else if initial_count == failed_count && failed_count == 1 {
+    } else if download_count.total == download_count.failed && download_count.failed == 1 {
         "Article failed to download".red().to_string()
-    } else if successful_count == initial_count {
+    } else if download_count.total == download_count.partial && download_count.partial == 1 {
+        "Article partially failed to download".yellow().to_string()
+    } else if download_count.successful == download_count.total {
         "All articles downloaded successfully".green().to_string()
-    } else if successful_count == 0 {
+    } else if download_count.failed == download_count.total {
         "All articles failed to download".red().to_string()
-    } else {
+    } else if download_count.partial == download_count.total {
+        "All articles partially failed to download"
+            .yellow()
+            .to_string()
+    } else if download_count.partial == 0 {
         format!(
             "{} {} downloaded successfully, {} {} failed",
-            successful_count,
-            get_noun(successful_count),
-            failed_count,
-            get_noun(failed_count)
+            download_count.successful,
+            get_noun(download_count.successful),
+            download_count.failed,
+            get_noun(download_count.failed)
+        )
+        .yellow()
+        .to_string()
+    } else if download_count.successful == 0
+        && download_count.partial > 0
+        && download_count.failed > 0
+    {
+        format!(
+            "{} {} partially failed to download, {} {} failed",
+            download_count.partial,
+            get_noun(download_count.partial),
+            download_count.failed,
+            get_noun(download_count.failed)
+        )
+        .yellow()
+        .to_string()
+    } else if download_count.failed == 0
+        && download_count.successful > 0
+        && download_count.partial > 0
+    {
+        format!(
+            "{} {} downloaded successfully, {} {} partially failed to download",
+            download_count.successful,
+            get_noun(download_count.successful),
+            download_count.partial,
+            get_noun(download_count.partial)
+        )
+        .yellow()
+        .to_string()
+    } else {
+        format!(
+            "{} {} downloaded successfully, {} {} partially failed to download, {} {} failed",
+            download_count.successful,
+            get_noun(download_count.successful),
+            download_count.partial,
+            get_noun(download_count.partial),
+            download_count.failed,
+            get_noun(download_count.failed)
         )
         .yellow()
         .to_string()
     }
 }
 
-pub fn init_logger() {
+struct DownloadCount {
+    total: usize,
+    successful: usize,
+    partial: usize,
+    failed: usize,
+}
+impl DownloadCount {
+    fn new(total: usize, successful: usize, partial: usize, failed: usize) -> Self {
+        Self {
+            total,
+            successful,
+            partial,
+            failed,
+        }
+    }
+}
+
+pub fn init_logger(app_config: &AppConfig) {
     match UserDirs::new() {
         Some(user_dirs) => {
             let home_dir = user_dirs.home_dir();
             let paperoni_dir = home_dir.join(".paperoni");
             let log_dir = paperoni_dir.join("logs");
-            if !paperoni_dir.is_dir() || !log_dir.is_dir() {
-                std::fs::create_dir_all(&log_dir).expect(
-                    "Unable to create paperoni directories on home directory for logging purposes",
-                );
+
+            let log_spec = LogSpecBuilder::new()
+                .module("paperoni", app_config.log_level())
+                .build();
+            let formatted_timestamp = app_config.start_time().format("%Y-%m-%d_%H-%M-%S");
+            let mut logger = flexi_logger::Logger::with(log_spec);
+
+            if app_config.is_logging_to_file() && (!paperoni_dir.is_dir() || !log_dir.is_dir()) {
+                match std::fs::create_dir_all(&log_dir) {
+                    Ok(_) => (),
+                    Err(e) => {
+                        eprintln!("Unable to create paperoni directories on home directory for logging purposes\n{}", e);
+                        std::process::exit(1);
+                    }
+                };
             }
-            match flexi_logger::Logger::with_str("paperoni=debug")
-                .directory(log_dir)
-                .log_to_file()
-                .print_message()
-                .start()
-            {
+
+            if app_config.is_logging_to_file() {
+                logger = logger
+                    .directory(log_dir)
+                    .discriminant(formatted_timestamp.to_string())
+                    .suppress_timestamp()
+                    .log_to_file();
+            }
+
+            match logger.start() {
                 Ok(_) => (),
                 Err(e) => eprintln!("Unable to start logger!\n{}", e),
             }
@@ -103,44 +186,68 @@ pub fn init_logger() {
 
 #[cfg(test)]
 mod tests {
-    use super::short_summary;
+    use super::{short_summary, DownloadCount};
     use colored::*;
 
     #[test]
     fn test_short_summary() {
         assert_eq!(
-            short_summary(1, 1, 0),
+            short_summary(DownloadCount::new(1, 1, 0, 0)),
             "Article downloaded successfully".green().to_string()
         );
         assert_eq!(
-            short_summary(1, 0, 1),
+            short_summary(DownloadCount::new(1, 0, 0, 1)),
            "Article failed to download".red().to_string()
         );
         assert_eq!(
-            short_summary(10, 10, 0),
+            short_summary(DownloadCount::new(10, 10, 0, 0)),
             "All articles downloaded successfully".green().to_string()
         );
         assert_eq!(
-            short_summary(10, 0, 10),
+            short_summary(DownloadCount::new(10, 0, 0, 10)),
             "All articles failed to download".red().to_string()
         );
         assert_eq!(
-            short_summary(10, 8, 2),
+            short_summary(DownloadCount::new(10, 8, 0, 2)),
             "8 articles downloaded successfully, 2 articles failed"
                 .yellow()
                 .to_string()
         );
         assert_eq!(
-            short_summary(10, 1, 9),
+            short_summary(DownloadCount::new(10, 1, 0, 9)),
             "1 article downloaded successfully, 9 articles failed"
                 .yellow()
                 .to_string()
         );
         assert_eq!(
-            short_summary(7, 6, 1),
+            short_summary(DownloadCount::new(7, 6, 0, 1)),
             "6 articles downloaded successfully, 1 article failed"
                 .yellow()
                 .to_string()
         );
+        assert_eq!(
+            short_summary(DownloadCount::new(7, 4, 2, 1)),
+            "4 articles downloaded successfully, 2 articles partially failed to download, 1 article failed"
+                .yellow()
+                .to_string()
+        );
+        assert_eq!(
+            short_summary(DownloadCount::new(12, 6, 6, 0)),
+            "6 articles downloaded successfully, 6 articles partially failed to download"
+                .yellow()
+                .to_string()
+        );
+        assert_eq!(
+            short_summary(DownloadCount::new(5, 0, 4, 1)),
+            "4 articles partially failed to download, 1 article failed"
+                .yellow()
+                .to_string()
+        );
+        assert_eq!(
+            short_summary(DownloadCount::new(4, 0, 4, 0)),
+            "All articles partially failed to download"
+                .yellow()
+                .to_string()
+        );
     }
 
     #[test]
@@ -148,6 +255,6 @@ mod tests {
         expected = "initial_count must be equal to the sum of failed and successful count"
     )]
     fn test_short_summary_panics_on_invalid_input() {
-        short_summary(0, 12, 43);
+        short_summary(DownloadCount::new(0, 12, 0, 43));
     }
 }
diff --git a/src/main.rs b/src/main.rs
index b5cd770..0f8b34a 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -24,27 +24,30 @@ use cli::AppConfig;
 use epub::generate_epubs;
 use extractor::Extractor;
 use http::{download_images, fetch_html};
-use logs::{display_summary, init_logger};
+use logs::display_summary;
 
 fn main() {
     let app_config = cli::cli_init();
 
     if !app_config.urls().is_empty() {
-        if app_config.is_debug() {
-            init_logger();
-        }
         download(app_config);
     }
 }
 
 fn download(app_config: AppConfig) {
-    let bar = ProgressBar::new(app_config.urls().len() as u64);
     let mut errors = Vec::new();
-    let style = ProgressStyle::default_bar().template(
+    let mut partial_download_count: usize = 0;
+    let bar = if app_config.can_disable_progress_bar() {
+        ProgressBar::hidden()
+    } else {
+        let enabled_bar = ProgressBar::new(app_config.urls().len() as u64);
+        let style = ProgressStyle::default_bar().template(
         "{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} link {pos}/{len:7} {msg:.yellow/white}",
     );
-    bar.set_style(style);
-    bar.enable_steady_tick(500);
+        enabled_bar.set_style(style);
+        enabled_bar.enable_steady_tick(500);
+        enabled_bar
+    };
     let articles = task::block_on(async {
        let urls_iter = app_config.urls().iter().map(|url| fetch_html(url));
         let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn());
@@ -62,6 +65,7 @@ fn download(app_config: AppConfig) {
                         download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar)
                             .await
                     {
+                        partial_download_count += 1;
                         warn!(
                             "{} image{} failed to download for {}",
                             img_errors.len(),
@@ -97,14 +101,25 @@ fn download(app_config: AppConfig) {
         .load_preset(UTF8_FULL)
         .load_preset(UTF8_HORIZONTAL_BORDERS_ONLY)
         .set_content_arrangement(ContentArrangement::Dynamic);
-    match generate_epubs(articles, app_config.merged(), &mut succesful_articles_table) {
+    match generate_epubs(articles, &app_config, &mut succesful_articles_table) {
         Ok(_) => (),
         Err(gen_epub_errors) => {
             errors.extend(gen_epub_errors);
         }
     };
     let has_errors = !errors.is_empty();
-    display_summary(app_config.urls().len(), succesful_articles_table, errors);
+    display_summary(
+        app_config.urls().len(),
+        succesful_articles_table,
+        partial_download_count,
+        errors,
+    );
+    if app_config.is_logging_to_file() {
+        println!(
+            "Log written to paperoni_{}.log\n",
+            app_config.start_time().format("%Y-%m-%d_%H-%M-%S")
+        );
+    }
     if has_errors {
         std::process::exit(1);
     }

From c00582ac29f8d6a38cfc7fffb67aebed5c0f3166 Mon Sep 17 00:00:00 2001
From: Kenneth Gitere <gitere81@gmail.com>
Date: Fri, 30 Apr 2021 06:42:08 +0300
Subject: [PATCH 21/24] Fix verbosity levels ordering

---
 src/cli.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/cli.rs b/src/cli.rs
index abf8226..30c5367 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -123,10 +123,10 @@
         }
         let log_levels: [LogLevel; 5] = [
             LogLevel::Off,
-            LogLevel::Debug,
-            LogLevel::Info,
-            LogLevel::Warn,
             LogLevel::Error,
+            LogLevel::Warn,
+            LogLevel::Info,
+            LogLevel::Debug,
         ];
         let level = arg_matches.occurrences_of("verbosity").clamp(0, 4) as usize;
         app_config.log_level = log_levels[level];

From cae9227ab048603402c99707ce98b6beee5ee2e8 Mon Sep 17 00:00:00 2001
From: Kenneth Gitere <gitere81@gmail.com>
Date: Fri, 30 Apr 2021 06:55:02 +0300
Subject: [PATCH 22/24] Update documentation

---
 Cargo.lock |  2 +-
 Cargo.toml |  2 +-
 README.md  | 43 ++++++++++++++++++++++++++++++++++++++++---
 src/cli.rs |  7 ++-----
 4 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index a82ccaa..ca5456c 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1482,7 +1482,7 @@ dependencies = [
 
 [[package]]
 name = "paperoni"
-version = "0.3.0-alpha1"
+version = "0.4.0-alpha1"
 dependencies = [
  "async-std",
  "chrono",
diff --git a/Cargo.toml b/Cargo.toml
index 655fbb7..3fbd83c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,7 +3,7 @@ description = "A web article downloader"
 homepage = "https://github.com/hipstermojo/paperoni"
 repository = "https://github.com/hipstermojo/paperoni"
 name = "paperoni"
-version = "0.3.0-alpha1"
+version = "0.4.0-alpha1"
 authors = ["Kenneth Gitere <gitere81@gmail.com>"]
 edition = "2018"
 license = "MIT"
diff --git a/README.md b/README.md
index 96e15c5..0e626e0 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,8 @@
 
 <p align="center"><i>Salami not included</i></p>
 
-Paperoni is a web article downloader written in Rust. The downloaded articles are then exported as EPUB files.
+![crates.io](https://img.shields.io/crates/v/paperoni.svg)
+Paperoni is a CLI tool made in Rust for downloading web articles as EPUBs.
 
 > This project is in an alpha release so it might crash when you use it. Please open an [issue on Github](https://github.com/hipstermojo/paperoni/issues/new) if it does crash.
 
@@ -17,7 +18,7 @@ Check the [releases](https://github.com/hipstermojo/paperoni/releases) page for
 Paperoni is published on [crates.io](https://crates.io). If you have [cargo](https://github.com/rust-lang/cargo) installed, then run:
 
 ```sh
-cargo install paperoni --version 0.3.0-alpha1
+cargo install paperoni --version 0.4.0-alpha1
 ```
 
 _Paperoni is still in alpha so the `version` flag has to be passed._
@@ -37,6 +38,27 @@ cargo run -- # pass your url here
 
 ## Usage
 
+```
+USAGE:
+    paperoni [OPTIONS] [urls]...
+
+OPTIONS:
+    -f, --file <file>            Input file containing links
+    -h, --help                   Prints help information
+        --log-to-file            Enables logging of events to a file located in .paperoni/logs with a default log level
+                                 of debug. Use -v to specify the logging level
+        --max_conn <max_conn>    The maximum number of concurrent HTTP connections when downloading articles. Default is
+                                 8
+        --merge <output_name>    Merge multiple articles into a single epub
+    -V, --version                Prints version information
+    -v                           Enables logging of events and sets the verbosity level. Use -h to read about its usage
+
+ARGS:
+    <urls>...    Urls of web articles
+```
+
+To download a single article, pass in its URL:
+
 ```sh
 paperoni https://en.wikipedia.org/wiki/Pepperoni
 ```
@@ -68,10 +90,23 @@ into a single epub using the `merge` flag and specifying the output file.
 paperoni -f links.txt --merge out.epub
 ```
 
+### Logging events
+
+Logging is disabled by default. It can be activated by using either the `-v` flag or the `--log-to-file` flag. If the `--log-to-file` flag is passed, the logs are sent to a file in the default Paperoni directory `.paperoni/logs`, which is in your home directory. The `-v` flag configures the verbosity levels such that:
+
+```
+-v    Logs only the error level
+-vv   Logs up to the warn level
+-vvv  Logs up to the info level
+-vvvv Logs up to the debug level
+```
+
+If only the `-v` flag is passed, the progress bars are disabled. If both `-v` and `--log-to-file` are passed, then the progress bars will still be shown.
+
 ## How it works
 
 The URL passed to Paperoni is fetched and the returned HTML response is passed to the extractor.
-This extractor retrieves a possible article using a port of the [Mozilla Readability algorithm](https://github.com/mozilla/readability). This article is then saved in an EPUB.
+This extractor retrieves a possible article using a [custom port](https://github.com/hipstermojo/paperoni/blob/master/src/moz_readability/mod.rs) of the [Mozilla Readability algorithm](https://github.com/mozilla/readability). This article is then saved in an EPUB.
 
 > The port of the algorithm is still unstable as well so it is not fully compatible with all the websites that can be extracted using Readability.
 
@@ -82,3 +117,5 @@ This program is still in alpha so a number of things won't work:
 - Websites that only run with JavaScript cannot be extracted.
 - Website articles that cannot be extracted by Readability cannot be extracted by Paperoni either.
 - Code snippets on Medium articles that are lazy loaded will not appear in the EPUB.
+
+There are also web pages it won't work on in general, such as Twitter and Reddit threads.
diff --git a/src/cli.rs b/src/cli.rs
index 30c5367..f1f38bc 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -14,10 +14,7 @@ pub fn cli_init() -> AppConfig {
         ])
         .version(clap::crate_version!())
         .about(
-            "
-Paperoni is an article downloader.
-It takes a url, downloads the article content from it and saves it to an epub.
-        ",
+            "Paperoni is a CLI tool made in Rust for downloading web articles as EPUBs",
         )
         .arg(
             Arg::with_name("urls")
@@ -47,7 +44,7 @@ It takes a url, downloads the article content from it and saves it to an epub.
         Arg::with_name("verbosity")
             .short("v")
             .multiple(true)
-            .help("Enables logging of events and sets the verbosity level. Use -h to read about its usage")
+            .help("Enables logging of events and sets the verbosity level. Use --help to read about its usage")
             .long_help(
 "This takes up to 4 levels of verbosity in the following order.
   - Error (-v)

From 4fd71311a191fd283c5bcc8a4344f9e62510075b Mon Sep 17 00:00:00 2001
From: Kenneth Gitere <gitere81@gmail.com>
Date: Fri, 30 Apr 2021 07:47:25 +0300
Subject: [PATCH 23/24] Fix bug when validating the download file name in merged mode

---
 src/cli.rs | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/cli.rs b/src/cli.rs
index f1f38bc..19ce379 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -101,16 +101,24 @@ pub fn cli_init() -> AppConfig {
 
     if let Some(name) = arg_matches.value_of("output_name") {
         let file_path = Path::new(name);
-        if !file_path.is_file() {
-            eprintln!("{:?} is not a valid file", name);
+        if file_path.is_dir() {
+            eprintln!("{:?} is a directory", name);
             std::process::exit(1);
         }
 
-        let file_name = if name.ends_with(".epub") && name.len() > 5 {
+        let file_name = if file_path.extension().is_some() {
             name.to_owned()
         } else {
             name.to_owned() + ".epub"
         };
+
+        match std::fs::File::create(&file_name) {
+            Ok(_) => (),
+            Err(e) => {
+                eprintln!("Unable to create file {:?}\n{}", file_path, e);
+                std::process::exit(1)
+            }
+        }
         app_config.merged = Some(file_name);
     }

From f93017ab7346d8cec32acaa59aea8f5912236226 Mon Sep 17 00:00:00 2001
From: Kenneth Gitere <gitere81@gmail.com>
Date: Fri, 30 Apr 2021 08:21:46 +0300
Subject: [PATCH 24/24] Fix README formatting

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 0e626e0..99c9771 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,9 @@
+![crates.io](https://img.shields.io/crates/v/paperoni.svg)
+
 <p align="center"><img src="./paperoni-dark.png"></p>
 
 <p align="center"><i>Salami not included</i></p>
 
-![crates.io](https://img.shields.io/crates/v/paperoni.svg)
 Paperoni is a CLI tool made in Rust for downloading web articles as EPUBs.
 
 > This project is in an alpha release so it might crash when you use it. Please open an [issue on Github](https://github.com/hipstermojo/paperoni/issues/new) if it does crash.
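
For reference, the verbosity handling that PATCH 20 introduces and PATCH 21 corrects reduces to indexing an array of level filters by the number of `-v` occurrences. Below is a minimal standalone sketch of that mapping only — `level_for` is a hypothetical helper, `occurrences` stands in for clap's `arg_matches.occurrences_of("verbosity")`, and plain strings stand in for the `flexi_logger::LevelFilter` variants:

```rust
// Sketch of the -v occurrence count -> log level mapping from PATCH 20/21.
// Only the clamp-and-index logic is taken from the patches above.
fn level_for(occurrences: u64) -> &'static str {
    let log_levels = ["off", "error", "warn", "info", "debug"];
    // Clamping to 4 means extra -v flags saturate at debug instead of
    // indexing out of bounds.
    log_levels[occurrences.clamp(0, 4) as usize]
}

fn main() {
    assert_eq!(level_for(1), "error"); // -v
    assert_eq!(level_for(2), "warn"); // -vv
    assert_eq!(level_for(3), "info"); // -vvv
    assert_eq!(level_for(4), "debug"); // -vvvv
    assert_eq!(level_for(9), "debug"); // extra flags saturate at debug
    println!("verbosity mapping ok");
}
```

The index-0 ("off") entry is unreachable in practice, since this branch only runs when `-v` was passed at least once.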