refactor: move download function to http module

feat:  add rendering of table for partial downloads
feat:  add help message for enabling --log-to-file
chore: format flags to kebab-case and shorten --output-directory flag
This commit is contained in:
Kenneth Gitere 2021-06-08 07:42:30 +03:00
parent 95bd22f339
commit 5fbfb9c806
5 changed files with 117 additions and 71 deletions

View file

@ -48,26 +48,26 @@ USAGE:
paperoni [OPTIONS] [urls]... paperoni [OPTIONS] [urls]...
OPTIONS: OPTIONS:
-f, --file <file> -f, --file <file>
Input file containing links Input file containing links
-h, --help -h, --help
Prints help information Prints help information
--log-to-file --log-to-file
Enables logging of events to a file located in .paperoni/logs with a default log level of debug. Use -v to Enables logging of events to a file located in .paperoni/logs with a default log level of debug. Use -v to
specify the logging level specify the logging level
--max_conn <max_conn> --max-conn <max_conn>
The maximum number of concurrent HTTP connections when downloading articles. Default is 8. The maximum number of concurrent HTTP connections when downloading articles. Default is 8.
NOTE: It is advised to use as few connections as needed i.e between 1 and 50. Using more connections can end NOTE: It is advised to use as few connections as needed i.e between 1 and 50. Using more connections can end
up overloading your network card with too many concurrent requests. up overloading your network card with too many concurrent requests.
-o, --output_directory <output_directory> -o, --output-dir <output_directory>
Directory for saving epub documents Directory for saving epub documents
--merge <output_name> --merge <output_name>
Merge multiple articles into a single epub that will be given the name provided Merge multiple articles into a single epub that will be given the name provided
-V, --version -V, --version
Prints version information Prints version information
-v -v
@ -80,7 +80,7 @@ OPTIONS:
If you would like to send the logs to a file (and enable progress bars), pass the log-to-file flag. If you would like to send the logs to a file (and enable progress bars), pass the log-to-file flag.
ARGS: ARGS:
<urls>... <urls>...
Urls of web articles Urls of web articles
``` ```

View file

@ -47,7 +47,7 @@ impl AppConfig {
) )
.arg( .arg(
Arg::with_name("output_directory") Arg::with_name("output_directory")
.long("output-directory") .long("output-dir")
.short("o") .short("o")
.help("Directory to store output epub documents") .help("Directory to store output epub documents")
.conflicts_with("output_name") .conflicts_with("output_name")
@ -61,7 +61,7 @@ impl AppConfig {
.conflicts_with("output_directory") .conflicts_with("output_directory")
.takes_value(true), .takes_value(true),
).arg( ).arg(
Arg::with_name("max_conn") Arg::with_name("max-conn")
.long("max_conn") .long("max_conn")
.help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8") .help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8")
.long_help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8.\nNOTE: It is advised to use as few connections as needed i.e between 1 and 50. Using more connections can end up overloading your network card with too many concurrent requests.") .long_help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8.\nNOTE: It is advised to use as few connections as needed i.e between 1 and 50. Using more connections can end up overloading your network card with too many concurrent requests.")
@ -146,7 +146,7 @@ impl<'a> TryFrom<ArgMatches<'a>> for AppConfig {
(None, None) => Err(Error::NoUrls), (None, None) => Err(Error::NoUrls),
} }
}?) }?)
.max_conn(match arg_matches.value_of("max_conn") { .max_conn(match arg_matches.value_of("max-conn") {
Some(max_conn) => max_conn.parse::<NonZeroUsize>()?.get(), Some(max_conn) => max_conn.parse::<NonZeroUsize>()?.get(),
None => DEFAULT_MAX_CONN, None => DEFAULT_MAX_CONN,
}) })

View file

@ -1,14 +1,72 @@
use async_std::io::prelude::*; use async_std::io::prelude::*;
use async_std::task;
use async_std::{fs::File, stream}; use async_std::{fs::File, stream};
use futures::StreamExt; use futures::StreamExt;
use indicatif::ProgressBar; use indicatif::ProgressBar;
use log::warn;
use log::{debug, info}; use log::{debug, info};
use url::Url; use url::Url;
use crate::cli::AppConfig;
use crate::errors::{ErrorKind, ImgError, PaperoniError}; use crate::errors::{ErrorKind, ImgError, PaperoniError};
use crate::extractor::Extractor; use crate::extractor::Extractor;
type HTMLResource = (String, String); type HTMLResource = (String, String);
/// Fetches and extracts every article listed in `app_config.urls`.
///
/// Pages are requested through [`fetch_html`], with the stream buffered to
/// `app_config.max_conn` pending fetches, and the calling thread is blocked
/// until every URL has been processed.
///
/// # Arguments
/// * `app_config` - supplies the URLs to fetch (`urls`) and the buffering
///   limit (`max_conn`).
/// * `bar` - progress bar; its message is set while extracting and it is
///   advanced by one for every URL, whether or not that URL succeeded.
/// * `partial_downloads` - receives one entry (link and title) for every
///   article whose content was extracted but for which `download_images`
///   returned errors.
/// * `errors` - receives the error of every URL that could not be fetched or
///   whose content could not be extracted.
///
/// # Returns
/// The extractors of all articles whose content was extracted, including the
/// ones recorded in `partial_downloads`.
///
/// # Panics
/// Panics if a URL handed back by `fetch_html` cannot be parsed by
/// `Url::parse`.
pub fn download(
    app_config: &AppConfig,
    bar: &ProgressBar,
    partial_downloads: &mut Vec<PartialDownload>,
    errors: &mut Vec<PaperoniError>,
) -> Vec<Extractor> {
    task::block_on(async {
        let urls_iter = app_config.urls.iter().map(|url| fetch_html(url));
        let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn);
        let mut articles = Vec::new();

        while let Some(fetch_result) = responses.next().await {
            match fetch_result {
                Ok((url, html)) => {
                    debug!("Extracting {}", &url);
                    let mut extractor = Extractor::from_html(&html, &url);
                    bar.set_message("Extracting...");
                    match extractor.extract_content() {
                        Ok(_) => {
                            extractor.extract_img_urls();
                            // NOTE(review): assumes a URL that was fetched successfully is
                            // always parseable - confirm against `fetch_html`.
                            let article_url = Url::parse(&url)
                                .expect("URL returned by fetch_html should be parseable");
                            if let Err(img_errors) =
                                download_images(&mut extractor, &article_url, bar).await
                            {
                                partial_downloads
                                    .push(PartialDownload::new(&url, extractor.metadata().title()));
                                warn!(
                                    "{} image{} failed to download for {}",
                                    img_errors.len(),
                                    if img_errors.len() != 1 { "s" } else { "" },
                                    url
                                );
                                for img_error in img_errors {
                                    // A missing image URL must not abort the whole batch from
                                    // inside a log statement, so it is reported with a
                                    // placeholder instead of being unwrapped.
                                    match img_error.url().as_ref() {
                                        Some(img_url) => {
                                            warn!("{}\n\t\tReason {}", img_url, img_error)
                                        }
                                        None => warn!(
                                            "<unknown image url>\n\t\tReason {}",
                                            img_error
                                        ),
                                    }
                                }
                            }
                            // Articles with failed images are still kept.
                            articles.push(extractor);
                        }
                        Err(mut e) => {
                            e.set_article_source(&url);
                            errors.push(e);
                        }
                    }
                }
                Err(e) => errors.push(e),
            }
            bar.inc(1);
        }
        articles
    })
}
pub async fn fetch_html(url: &str) -> Result<HTMLResource, PaperoniError> { pub async fn fetch_html(url: &str) -> Result<HTMLResource, PaperoniError> {
let client = surf::Client::new(); let client = surf::Client::new();
debug!("Fetching {}", url); debug!("Fetching {}", url);
@ -210,6 +268,20 @@ pub async fn download_images(
} }
} }
/// An article whose content was extracted but whose image downloads reported
/// errors.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PartialDownload {
    /// Link of the article.
    pub link: String,
    /// Title of the article.
    pub title: String,
}

impl PartialDownload {
    /// Creates a record from the article's `link` and `title`, copying both
    /// into owned strings.
    pub fn new(link: &str, title: &str) -> Self {
        Self {
            link: link.to_owned(),
            title: title.to_owned(),
        }
    }
}
/// Handles getting the extension from a given MIME subtype. /// Handles getting the extension from a given MIME subtype.
fn map_mime_subtype_to_ext(subtype: &str) -> &str { fn map_mime_subtype_to_ext(subtype: &str) -> &str {
if subtype == ("svg+xml") { if subtype == ("svg+xml") {

View file

@ -12,9 +12,10 @@ use crate::errors::PaperoniError;
pub fn display_summary( pub fn display_summary(
initial_article_count: usize, initial_article_count: usize,
succesful_articles_table: Table, succesful_articles_table: Table,
partial_downloads_count: usize, partial_downloads: Vec<PartialDownload>,
errors: Vec<PaperoniError>, errors: Vec<PaperoniError>,
) { ) {
let partial_downloads_count = partial_downloads.len();
let successfully_downloaded_count = let successfully_downloaded_count =
initial_article_count - partial_downloads_count - errors.len(); initial_article_count - partial_downloads_count - errors.len();
@ -32,6 +33,24 @@ pub fn display_summary(
if successfully_downloaded_count > 0 { if successfully_downloaded_count > 0 {
println!("{}", succesful_articles_table); println!("{}", succesful_articles_table);
} }
if partial_downloads_count > 0 {
println!("\n{}", "Partially failed downloads".yellow().bold());
let mut table_partial = Table::new();
table_partial
.load_preset(UTF8_HORIZONTAL_BORDERS_ONLY)
.set_header(vec![
Cell::new("Link").set_alignment(CellAlignment::Center),
Cell::new("Title").set_alignment(CellAlignment::Center),
])
.set_content_arrangement(ContentArrangement::Dynamic);
for partial in partial_downloads {
table_partial.add_row(vec![&partial.link, &partial.title]);
}
println!("{}", table_partial);
}
if !errors.is_empty() { if !errors.is_empty() {
println!("\n{}", "Failed article downloads".bright_red().bold()); println!("\n{}", "Failed article downloads".bright_red().bold());
let mut table_failed = Table::new(); let mut table_failed = Table::new();
@ -126,6 +145,7 @@ impl DownloadCount {
} }
use crate::errors::LogError as Error; use crate::errors::LogError as Error;
use crate::http::PartialDownload;
pub fn init_logger( pub fn init_logger(
log_level: LevelFilter, log_level: LevelFilter,

View file

@ -3,14 +3,10 @@ extern crate lazy_static;
use std::process::exit; use std::process::exit;
use async_std::stream;
use async_std::task;
use comfy_table::presets::{UTF8_FULL, UTF8_HORIZONTAL_BORDERS_ONLY}; use comfy_table::presets::{UTF8_FULL, UTF8_HORIZONTAL_BORDERS_ONLY};
use comfy_table::{ContentArrangement, Table}; use comfy_table::{ContentArrangement, Table};
use futures::stream::StreamExt; use http::download;
use indicatif::{ProgressBar, ProgressStyle}; use indicatif::{ProgressBar, ProgressStyle};
use log::{debug, warn};
use url::Url;
mod cli; mod cli;
mod epub; mod epub;
@ -24,8 +20,6 @@ mod moz_readability;
use cli::AppConfig; use cli::AppConfig;
use epub::generate_epubs; use epub::generate_epubs;
use extractor::Extractor;
use http::{download_images, fetch_html};
use logs::display_summary; use logs::display_summary;
fn main() { fn main() {
@ -38,70 +32,25 @@ fn main() {
}; };
if !app_config.urls.is_empty() { if !app_config.urls.is_empty() {
download(app_config); run(app_config);
} }
} }
fn download(app_config: AppConfig) { fn run(app_config: AppConfig) {
let mut errors = Vec::new(); let mut errors = Vec::new();
let mut partial_download_count: usize = 0; let mut partial_downloads = Vec::new();
let bar = if app_config.can_disable_progress_bar { let bar = if app_config.can_disable_progress_bar {
ProgressBar::hidden() ProgressBar::hidden()
} else { } else {
let enabled_bar = ProgressBar::new(app_config.urls.len() as u64); let enabled_bar = ProgressBar::new(app_config.urls.len() as u64);
let style = ProgressStyle::default_bar().template( let style = ProgressStyle::default_bar().template(
"{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} link {pos}/{len:7} {msg:.yellow/white}", "{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} link {pos}/{len:7} {msg:.yellow/white}",
); );
enabled_bar.set_style(style); enabled_bar.set_style(style);
enabled_bar.enable_steady_tick(500); enabled_bar.enable_steady_tick(500);
enabled_bar enabled_bar
}; };
let articles = task::block_on(async { let articles = download(&app_config, &bar, &mut partial_downloads, &mut errors);
let urls_iter = app_config.urls.iter().map(|url| fetch_html(url));
let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn);
let mut articles = Vec::new();
while let Some(fetch_result) = responses.next().await {
match fetch_result {
Ok((url, html)) => {
debug!("Extracting {}", &url);
let mut extractor = Extractor::from_html(&html, &url);
bar.set_message("Extracting...");
match extractor.extract_content() {
Ok(_) => {
extractor.extract_img_urls();
if let Err(img_errors) =
download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar)
.await
{
partial_download_count += 1;
warn!(
"{} image{} failed to download for {}",
img_errors.len(),
if img_errors.len() > 1 { "s" } else { "" },
url
);
for img_error in img_errors {
warn!(
"{}\n\t\tReason {}",
img_error.url().as_ref().unwrap(),
img_error
);
}
}
articles.push(extractor);
}
Err(mut e) => {
e.set_article_source(&url);
errors.push(e);
}
}
}
Err(e) => errors.push(e),
}
bar.inc(1);
}
articles
});
bar.finish_with_message("Downloaded articles"); bar.finish_with_message("Downloaded articles");
let mut succesful_articles_table = Table::new(); let mut succesful_articles_table = Table::new();
@ -115,19 +64,24 @@ fn download(app_config: AppConfig) {
errors.extend(gen_epub_errors); errors.extend(gen_epub_errors);
} }
}; };
let has_errors = !errors.is_empty();
let has_errors = !errors.is_empty() || !partial_downloads.is_empty();
display_summary( display_summary(
app_config.urls.len(), app_config.urls.len(),
succesful_articles_table, succesful_articles_table,
partial_download_count, partial_downloads,
errors, errors,
); );
if app_config.is_logging_to_file { if app_config.is_logging_to_file {
println!( println!(
"Log written to paperoni_{}.log\n", "Log written to paperoni_{}.log\n",
app_config.start_time.format("%Y-%m-%d_%H-%M-%S") app_config.start_time.format("%Y-%m-%d_%H-%M-%S")
); );
} else if has_errors && !app_config.is_logging_to_file {
println!("\nRun paperoni with the --log-to-file flag to create a log file");
} }
if has_errors { if has_errors {
std::process::exit(1); std::process::exit(1);
} }