From 5fbfb9c8062123a551f037f5069e5f9ee63fb9c2 Mon Sep 17 00:00:00 2001
From: Kenneth Gitere
Date: Tue, 8 Jun 2021 07:42:30 +0300
Subject: [PATCH] refactor: move download function to http module

feat: add rendering of table for partial downloads
feat: add help message for enabling --log-to-file
chore: format flags to kebab-case and shorten --output-directory flag
---
 README.md   | 14 +++++-----
 src/cli.rs  |  6 ++---
 src/http.rs | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++
 src/logs.rs | 22 +++++++++++++++-
 src/main.rs | 74 ++++++++++-------------------------------------------
 5 files changed, 117 insertions(+), 71 deletions(-)

diff --git a/README.md b/README.md
index 873f95a..f38c741 100644
--- a/README.md
+++ b/README.md
@@ -48,26 +48,26 @@ USAGE:
     paperoni [OPTIONS] [urls]...
 
 OPTIONS:
-    -f, --file
+    -f, --file
            Input file containing links
 
-    -h, --help
+    -h, --help
            Prints help information
 
        --log-to-file
            Enables logging of events to a file located in .paperoni/logs with a default log level of debug.
            Use -v to specify the logging level
 
-        --max_conn
+        --max-conn
            The maximum number of concurrent HTTP connections when downloading articles. Default is 8.
            NOTE: It is advised to use as few connections as needed i.e between 1 and 50. Using more
            connections can end up overloading your network card with too many concurrent requests.
 
-    -o, --output_directory
+    -o, --output-dir
            Directory for saving epub documents
 
-        --merge
+        --merge
            Merge multiple articles into a single epub that will be given the name provided
 
-    -V, --version
+    -V, --version
            Prints version information
 
    -v
@@ -80,7 +80,7 @@ OPTIONS:
            If you would like to send the logs to a file (and enable progress bars), pass the log-to-file flag.
 
 ARGS:
-    ...
+    ...
        Urls of web articles
 
 ```
diff --git a/src/cli.rs b/src/cli.rs
index 5827f56..eb6c610 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -47,7 +47,7 @@ impl AppConfig {
             )
             .arg(
                 Arg::with_name("output_directory")
-                    .long("output-directory")
+                    .long("output-dir")
                     .short("o")
                     .help("Directory to store output epub documents")
                     .conflicts_with("output_name")
@@ -61,7 +61,7 @@ impl AppConfig {
                     .conflicts_with("output_directory")
                     .takes_value(true),
             ).arg(
-                Arg::with_name("max_conn")
+                Arg::with_name("max-conn")
                    .long("max_conn")
                    .help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8")
                    .long_help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8.\nNOTE: It is advised to use as few connections as needed i.e between 1 and 50. Using more connections can end up overloading your network card with too many concurrent requests.")
@@ -146,7 +146,7 @@ impl<'a> TryFrom<ArgMatches<'a>> for AppConfig {
                    (None, None) => Err(Error::NoUrls),
                }
            }?)
-            .max_conn(match arg_matches.value_of("max_conn") {
+            .max_conn(match arg_matches.value_of("max-conn") {
                Some(max_conn) => max_conn.parse::<NonZeroUsize>()?.get(),
                None => DEFAULT_MAX_CONN,
            })
diff --git a/src/http.rs b/src/http.rs
index 148fab0..8707977 100644
--- a/src/http.rs
+++ b/src/http.rs
@@ -1,14 +1,72 @@
 use async_std::io::prelude::*;
+use async_std::task;
 use async_std::{fs::File, stream};
 use futures::StreamExt;
 use indicatif::ProgressBar;
+use log::warn;
 use log::{debug, info};
 use url::Url;
 
+use crate::cli::AppConfig;
 use crate::errors::{ErrorKind, ImgError, PaperoniError};
 use crate::extractor::Extractor;
 
 type HTMLResource = (String, String);
 
+pub fn download(
+    app_config: &AppConfig,
+    bar: &ProgressBar,
+    partial_downloads: &mut Vec<PartialDownload>,
+    errors: &mut Vec<PaperoniError>,
+) -> Vec<Extractor> {
+    task::block_on(async {
+        let urls_iter = app_config.urls.iter().map(|url| fetch_html(url));
+        let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn);
+        let mut articles = Vec::new();
+        while let Some(fetch_result) = responses.next().await {
+            match fetch_result {
+                Ok((url, html)) => {
+                    debug!("Extracting {}", &url);
+                    let mut extractor = Extractor::from_html(&html, &url);
+                    bar.set_message("Extracting...");
+                    match extractor.extract_content() {
+                        Ok(_) => {
+                            extractor.extract_img_urls();
+                            if let Err(img_errors) =
+                                download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar)
+                                    .await
+                            {
+                                partial_downloads
+                                    .push(PartialDownload::new(&url, extractor.metadata().title()));
+                                warn!(
+                                    "{} image{} failed to download for {}",
+                                    img_errors.len(),
+                                    if img_errors.len() > 1 { "s" } else { "" },
+                                    url
+                                );
+                                for img_error in img_errors {
+                                    warn!(
+                                        "{}\n\t\tReason {}",
+                                        img_error.url().as_ref().unwrap(),
+                                        img_error
+                                    );
+                                }
+                            }
+                            articles.push(extractor);
+                        }
+                        Err(mut e) => {
+                            e.set_article_source(&url);
+                            errors.push(e);
+                        }
+                    }
+                }
+                Err(e) => errors.push(e),
+            }
+            bar.inc(1);
+        }
+        articles
+    })
+}
+
 pub async fn fetch_html(url: &str) -> Result<HTMLResource, PaperoniError> {
     let client = surf::Client::new();
     debug!("Fetching {}", url);
@@ -210,6 +268,20 @@ pub async fn download_images(
     }
 }
 
+pub struct PartialDownload {
+    pub link: String,
+    pub title: String,
+}
+
+impl PartialDownload {
+    pub fn new(link: &str, title: &str) -> Self {
+        Self {
+            link: link.into(),
+            title: title.into(),
+        }
+    }
+}
+
 /// Handles getting the extension from a given MIME subtype.
 fn map_mime_subtype_to_ext(subtype: &str) -> &str {
     if subtype == ("svg+xml") {
diff --git a/src/logs.rs b/src/logs.rs
index 61e7bc2..722c131 100644
--- a/src/logs.rs
+++ b/src/logs.rs
@@ -12,9 +12,10 @@ use crate::errors::PaperoniError;
 pub fn display_summary(
     initial_article_count: usize,
     succesful_articles_table: Table,
-    partial_downloads_count: usize,
+    partial_downloads: Vec<PartialDownload>,
     errors: Vec<PaperoniError>,
 ) {
+    let partial_downloads_count = partial_downloads.len();
     let successfully_downloaded_count =
         initial_article_count - partial_downloads_count - errors.len();
 
@@ -32,6 +33,24 @@ pub fn display_summary(
     if successfully_downloaded_count > 0 {
         println!("{}", succesful_articles_table);
     }
+
+    if partial_downloads_count > 0 {
+        println!("\n{}", "Partially failed downloads".yellow().bold());
+        let mut table_partial = Table::new();
+        table_partial
+            .load_preset(UTF8_HORIZONTAL_BORDERS_ONLY)
+            .set_header(vec![
+                Cell::new("Link").set_alignment(CellAlignment::Center),
+                Cell::new("Title").set_alignment(CellAlignment::Center),
+            ])
+            .set_content_arrangement(ContentArrangement::Dynamic);
+
+        for partial in partial_downloads {
+            table_partial.add_row(vec![&partial.link, &partial.title]);
+        }
+        println!("{}", table_partial);
+    }
+
     if !errors.is_empty() {
         println!("\n{}", "Failed article downloads".bright_red().bold());
         let mut table_failed = Table::new();
@@ -126,6 +145,7 @@ impl DownloadCount {
 }
 
 use crate::errors::LogError as Error;
+use crate::http::PartialDownload;
 
 pub fn init_logger(
     log_level: LevelFilter,
diff --git a/src/main.rs b/src/main.rs
index fcb0cd4..dc4787d 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -3,14 +3,10 @@ extern crate lazy_static;
 
 use std::process::exit;
 
-use async_std::stream;
-use async_std::task;
 use comfy_table::presets::{UTF8_FULL, UTF8_HORIZONTAL_BORDERS_ONLY};
 use comfy_table::{ContentArrangement, Table};
-use futures::stream::StreamExt;
+use http::download;
 use indicatif::{ProgressBar, ProgressStyle};
-use log::{debug, warn};
-use url::Url;
 
 mod cli;
 mod epub;
@@ -24,8 +20,6 @@ mod moz_readability;
 
 use cli::AppConfig;
 use epub::generate_epubs;
-use extractor::Extractor;
-use http::{download_images, fetch_html};
 use logs::display_summary;
 
 fn main() {
@@ -38,70 +32,25 @@ fn main() {
     };
 
     if !app_config.urls.is_empty() {
-        download(app_config);
+        run(app_config);
     }
 }
 
-fn download(app_config: AppConfig) {
+fn run(app_config: AppConfig) {
     let mut errors = Vec::new();
-    let mut partial_download_count: usize = 0;
+    let mut partial_downloads = Vec::new();
 
     let bar = if app_config.can_disable_progress_bar {
         ProgressBar::hidden()
     } else {
         let enabled_bar = ProgressBar::new(app_config.urls.len() as u64);
         let style = ProgressStyle::default_bar().template(
-        "{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} link {pos}/{len:7} {msg:.yellow/white}",
-    );
+            "{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} link {pos}/{len:7} {msg:.yellow/white}",
+        );
         enabled_bar.set_style(style);
         enabled_bar.enable_steady_tick(500);
         enabled_bar
     };
-    let articles = task::block_on(async {
-        let urls_iter = app_config.urls.iter().map(|url| fetch_html(url));
-        let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn);
-        let mut articles = Vec::new();
-        while let Some(fetch_result) = responses.next().await {
-            match fetch_result {
-                Ok((url, html)) => {
-                    debug!("Extracting {}", &url);
-                    let mut extractor = Extractor::from_html(&html, &url);
-                    bar.set_message("Extracting...");
-                    match extractor.extract_content() {
-                        Ok(_) => {
-                            extractor.extract_img_urls();
-                            if let Err(img_errors) =
-                                download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar)
-                                    .await
-                            {
-                                partial_download_count += 1;
-                                warn!(
-                                    "{} image{} failed to download for {}",
-                                    img_errors.len(),
-                                    if img_errors.len() > 1 { "s" } else { "" },
-                                    url
-                                );
-                                for img_error in img_errors {
-                                    warn!(
-                                        "{}\n\t\tReason {}",
-                                        img_error.url().as_ref().unwrap(),
-                                        img_error
-                                    );
-                                }
-                            }
-                            articles.push(extractor);
-                        }
-                        Err(mut e) => {
-                            e.set_article_source(&url);
-                            errors.push(e);
-                        }
-                    }
-                }
-                Err(e) => errors.push(e),
-            }
-            bar.inc(1);
-        }
-        articles
-    });
+    let articles = download(&app_config, &bar, &mut partial_downloads, &mut errors);
     bar.finish_with_message("Downloaded articles");
     let mut succesful_articles_table = Table::new();
@@ -115,19 +64,24 @@ fn download(app_config: AppConfig) {
             errors.extend(gen_epub_errors);
         }
     };
-    let has_errors = !errors.is_empty();
+
+    let has_errors = !errors.is_empty() || !partial_downloads.is_empty();
     display_summary(
         app_config.urls.len(),
         succesful_articles_table,
-        partial_download_count,
+        partial_downloads,
         errors,
     );
+
     if app_config.is_logging_to_file {
         println!(
             "Log written to paperoni_{}.log\n",
             app_config.start_time.format("%Y-%m-%d_%H-%M-%S")
         );
+    } else if has_errors && !app_config.is_logging_to_file {
+        println!("\nRun paperoni with the --log-to-file flag to create a log file");
     }
+
     if has_errors {
         std::process::exit(1);
     }
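
Below is a minimal sketch, not part of the patch itself, of how the new PartialDownload type added to src/http.rs could be exercised in isolation; the test module name and the link/title values are illustrative assumptions only.

```rust
#[cfg(test)]
mod tests {
    use super::PartialDownload;

    #[test]
    fn partial_download_copies_link_and_title() {
        // Hypothetical inputs; PartialDownload::new copies both &str arguments
        // into owned Strings via .into().
        let partial = PartialDownload::new("https://example.com/article", "Example article");
        assert_eq!(partial.link, "https://example.com/article");
        assert_eq!(partial.title, "Example article");
    }
}
```

Since both fields are plain owned Strings, the partial-downloads table in src/logs.rs can borrow them directly when adding rows, as the patch does with `table_partial.add_row(vec![&partial.link, &partial.title])`.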