refactor: move download function to http module

feat: add rendering of table for partial downloads
feat: add help message for enabling --log-to-file
chore: format flags to kebab-case and shorten --output-directory flag
Kenneth Gitere 2021-06-08 07:42:30 +03:00
parent 95bd22f339
commit 5fbfb9c806
5 changed files with 117 additions and 71 deletions

README.md

@@ -48,26 +48,26 @@ USAGE:
paperoni [OPTIONS] [urls]...
OPTIONS:
-f, --file <file>
Input file containing links
-h, --help
Prints help information
--log-to-file
Enables logging of events to a file located in .paperoni/logs with a default log level of debug. Use -v to
specify the logging level
--max_conn <max_conn>
--max-conn <max_conn>
The maximum number of concurrent HTTP connections when downloading articles. Default is 8.
NOTE: It is advised to use as few connections as needed i.e between 1 and 50. Using more connections can end
up overloading your network card with too many concurrent requests.
-o, --output_directory <output_directory>
-o, --output-dir <output_directory>
Directory for saving epub documents
--merge <output_name>
Merge multiple articles into a single epub that will be given the name provided
-V, --version
Prints version information
-v
@@ -80,7 +80,7 @@ OPTIONS:
If you would like to send the logs to a file (and enable progress bars), pass the --log-to-file flag.
ARGS:
<urls>...
Urls of web articles
```
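With the renamed flags, a typical invocation now reads `paperoni --max-conn 4 -o ./articles --log-to-file -v https://example.com/some-article` (the connection count, directory, and URL here are illustrative).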

src/cli.rs

@@ -47,7 +47,7 @@ impl AppConfig {
)
.arg(
Arg::with_name("output_directory")
.long("output-directory")
.long("output-dir")
.short("o")
.help("Directory to store output epub documents")
.conflicts_with("output_name")
@@ -61,7 +61,7 @@ impl AppConfig {
.conflicts_with("output_directory")
.takes_value(true),
).arg(
Arg::with_name("max_conn")
Arg::with_name("max-conn")
.long("max_conn")
.help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8")
.long_help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8.\nNOTE: It is advised to use as few connections as needed i.e between 1 and 50. Using more connections can end up overloading your network card with too many concurrent requests.")
@@ -146,7 +146,7 @@ impl<'a> TryFrom<ArgMatches<'a>> for AppConfig {
(None, None) => Err(Error::NoUrls),
}
}?)
.max_conn(match arg_matches.value_of("max_conn") {
.max_conn(match arg_matches.value_of("max-conn") {
Some(max_conn) => max_conn.parse::<NonZeroUsize>()?.get(),
None => DEFAULT_MAX_CONN,
})
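Both cli.rs hunks rename the same identifier: the arg is registered as `max-conn`, so it must also be looked up as `max-conn`. A minimal sketch of that pattern in isolation, assuming the clap 2.x API the diff uses (`Arg::with_name`/`value_of`); the app name, long flag, and error handling are illustrative:

```rust
use std::num::NonZeroUsize;

use clap::{App, Arg};

const DEFAULT_MAX_CONN: usize = 8;

fn main() {
    let matches = App::new("paperoni-demo")
        .arg(Arg::with_name("max-conn").long("max-conn").takes_value(true))
        .get_matches();
    // The lookup key must match the registered id exactly: a stale
    // `value_of("max_conn")` would return None and silently use the default.
    let max_conn = match matches.value_of("max-conn") {
        // NonZeroUsize rejects non-numeric input as well as zero.
        Some(value) => value.parse::<NonZeroUsize>().expect("a positive integer").get(),
        None => DEFAULT_MAX_CONN,
    };
    println!("using up to {} connections", max_conn);
}
```

That coupling is why the `TryFrom<ArgMatches>` hunk has to change in the same commit as the `Arg::with_name` one.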

src/http.rs

@@ -1,14 +1,72 @@
use async_std::io::prelude::*;
use async_std::task;
use async_std::{fs::File, stream};
use futures::StreamExt;
use indicatif::ProgressBar;
use log::warn;
use log::{debug, info};
use url::Url;
use crate::cli::AppConfig;
use crate::errors::{ErrorKind, ImgError, PaperoniError};
use crate::extractor::Extractor;
type HTMLResource = (String, String);
pub fn download(
app_config: &AppConfig,
bar: &ProgressBar,
partial_downloads: &mut Vec<PartialDownload>,
errors: &mut Vec<PaperoniError>,
) -> Vec<Extractor> {
task::block_on(async {
let urls_iter = app_config.urls.iter().map(|url| fetch_html(url));
let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn);
let mut articles = Vec::new();
while let Some(fetch_result) = responses.next().await {
match fetch_result {
Ok((url, html)) => {
debug!("Extracting {}", &url);
let mut extractor = Extractor::from_html(&html, &url);
bar.set_message("Extracting...");
match extractor.extract_content() {
Ok(_) => {
extractor.extract_img_urls();
if let Err(img_errors) =
download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar)
.await
{
partial_downloads
.push(PartialDownload::new(&url, extractor.metadata().title()));
warn!(
"{} image{} failed to download for {}",
img_errors.len(),
if img_errors.len() > 1 { "s" } else { "" },
url
);
for img_error in img_errors {
warn!(
"{}\n\t\tReason {}",
img_error.url().as_ref().unwrap(),
img_error
);
}
}
articles.push(extractor);
}
Err(mut e) => {
e.set_article_source(&url);
errors.push(e);
}
}
}
Err(e) => errors.push(e),
}
bar.inc(1);
}
articles
})
}
pub async fn fetch_html(url: &str) -> Result<HTMLResource, PaperoniError> {
let client = surf::Client::new();
debug!("Fetching {}", url);
@@ -210,6 +268,20 @@ pub async fn download_images(
}
}
pub struct PartialDownload {
pub link: String,
pub title: String,
}
impl PartialDownload {
pub fn new(link: &str, title: &str) -> Self {
Self {
link: link.into(),
title: title.into(),
}
}
}
/// Handles getting the extension from a given MIME subtype.
fn map_mime_subtype_to_ext(subtype: &str) -> &str {
if subtype == ("svg+xml") {

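The core of the relocated `download` function is `stream::from_iter(...).buffered(max_conn)`. A stripped-down sketch of just that mechanism, with a stub standing in for `fetch_html` and invented URLs:

```rust
use async_std::{stream, task};
use futures::StreamExt;

// Stand-in for `fetch_html`: resolves to a (url, body) tuple like the
// real `HTMLResource` alias.
async fn fetch(url: &str) -> Result<(String, String), String> {
    Ok((url.to_string(), format!("<html>{}</html>", url)))
}

fn main() {
    let urls = vec!["https://example.com/a", "https://example.com/b"];
    task::block_on(async {
        // `buffered(2)` keeps at most two fetches in flight and yields
        // results in input order, which is how `download` caps concurrency
        // at `app_config.max_conn`.
        let fetches = urls.iter().map(|url| fetch(url));
        let mut responses = stream::from_iter(fetches).buffered(2);
        while let Some(result) = responses.next().await {
            match result {
                Ok((url, html)) => println!("{}: {} bytes", url, html.len()),
                Err(e) => eprintln!("failed: {}", e),
            }
        }
    });
}
```

Because the buffered stream is driven from a single task, the per-article bookkeeping (`partial_downloads`, `errors`) can stay in plain `Vec`s passed in as `&mut` borrows, with no extra synchronization.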
src/logs.rs

@@ -12,9 +12,10 @@ use crate::errors::PaperoniError;
pub fn display_summary(
initial_article_count: usize,
succesful_articles_table: Table,
partial_downloads_count: usize,
partial_downloads: Vec<PartialDownload>,
errors: Vec<PaperoniError>,
) {
let partial_downloads_count = partial_downloads.len();
let successfully_downloaded_count =
initial_article_count - partial_downloads_count - errors.len();
@@ -32,6 +33,24 @@ pub fn display_summary(
if successfully_downloaded_count > 0 {
println!("{}", succesful_articles_table);
}
if partial_downloads_count > 0 {
println!("\n{}", "Partially failed downloads".yellow().bold());
let mut table_partial = Table::new();
table_partial
.load_preset(UTF8_HORIZONTAL_BORDERS_ONLY)
.set_header(vec![
Cell::new("Link").set_alignment(CellAlignment::Center),
Cell::new("Title").set_alignment(CellAlignment::Center),
])
.set_content_arrangement(ContentArrangement::Dynamic);
for partial in partial_downloads {
table_partial.add_row(vec![&partial.link, &partial.title]);
}
println!("{}", table_partial);
}
if !errors.is_empty() {
println!("\n{}", "Failed article downloads".bright_red().bold());
let mut table_failed = Table::new();
@@ -126,6 +145,7 @@ impl DownloadCount {
}
use crate::errors::LogError as Error;
use crate::http::PartialDownload;
pub fn init_logger(
log_level: LevelFilter,

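A self-contained sketch of the new partial-downloads table, using the same comfy-table calls as the hunk above; the row data is invented:

```rust
use comfy_table::presets::UTF8_HORIZONTAL_BORDERS_ONLY;
use comfy_table::{Cell, CellAlignment, ContentArrangement, Table};

fn main() {
    let mut table_partial = Table::new();
    table_partial
        .load_preset(UTF8_HORIZONTAL_BORDERS_ONLY)
        .set_header(vec![
            Cell::new("Link").set_alignment(CellAlignment::Center),
            Cell::new("Title").set_alignment(CellAlignment::Center),
        ])
        // Dynamic arrangement lets long article URLs wrap instead of
        // overflowing the terminal width.
        .set_content_arrangement(ContentArrangement::Dynamic);
    table_partial.add_row(vec![
        "https://example.com/some-article",
        "Some Article Title",
    ]);
    println!("{}", table_partial);
}
```

Passing the whole `Vec<PartialDownload>` into `display_summary`, rather than the old bare count, is what makes this table possible: links and titles now survive until summary time instead of being reduced to a number at the call site.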
src/main.rs

@@ -3,14 +3,10 @@ extern crate lazy_static;
use std::process::exit;
use async_std::stream;
use async_std::task;
use comfy_table::presets::{UTF8_FULL, UTF8_HORIZONTAL_BORDERS_ONLY};
use comfy_table::{ContentArrangement, Table};
use futures::stream::StreamExt;
use http::download;
use indicatif::{ProgressBar, ProgressStyle};
use log::{debug, warn};
use url::Url;
mod cli;
mod epub;
@@ -24,8 +20,6 @@ mod moz_readability;
use cli::AppConfig;
use epub::generate_epubs;
use extractor::Extractor;
use http::{download_images, fetch_html};
use logs::display_summary;
fn main() {
@@ -38,70 +38,25 @@ fn main() {
};
if !app_config.urls.is_empty() {
download(app_config);
run(app_config);
}
}
fn download(app_config: AppConfig) {
fn run(app_config: AppConfig) {
let mut errors = Vec::new();
let mut partial_download_count: usize = 0;
let mut partial_downloads = Vec::new();
let bar = if app_config.can_disable_progress_bar {
ProgressBar::hidden()
} else {
let enabled_bar = ProgressBar::new(app_config.urls.len() as u64);
let style = ProgressStyle::default_bar().template(
"{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} link {pos}/{len:7} {msg:.yellow/white}",
);
"{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} link {pos}/{len:7} {msg:.yellow/white}",
);
enabled_bar.set_style(style);
enabled_bar.enable_steady_tick(500);
enabled_bar
};
let articles = task::block_on(async {
let urls_iter = app_config.urls.iter().map(|url| fetch_html(url));
let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn);
let mut articles = Vec::new();
while let Some(fetch_result) = responses.next().await {
match fetch_result {
Ok((url, html)) => {
debug!("Extracting {}", &url);
let mut extractor = Extractor::from_html(&html, &url);
bar.set_message("Extracting...");
match extractor.extract_content() {
Ok(_) => {
extractor.extract_img_urls();
if let Err(img_errors) =
download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar)
.await
{
partial_download_count += 1;
warn!(
"{} image{} failed to download for {}",
img_errors.len(),
if img_errors.len() > 1 { "s" } else { "" },
url
);
for img_error in img_errors {
warn!(
"{}\n\t\tReason {}",
img_error.url().as_ref().unwrap(),
img_error
);
}
}
articles.push(extractor);
}
Err(mut e) => {
e.set_article_source(&url);
errors.push(e);
}
}
}
Err(e) => errors.push(e),
}
bar.inc(1);
}
articles
});
let articles = download(&app_config, &bar, &mut partial_downloads, &mut errors);
bar.finish_with_message("Downloaded articles");
let mut succesful_articles_table = Table::new();
@@ -115,19 +64,24 @@ fn download(app_config: AppConfig) {
errors.extend(gen_epub_errors);
}
};
let has_errors = !errors.is_empty();
let has_errors = !errors.is_empty() || !partial_downloads.is_empty();
display_summary(
app_config.urls.len(),
succesful_articles_table,
partial_download_count,
partial_downloads,
errors,
);
if app_config.is_logging_to_file {
println!(
"Log written to paperoni_{}.log\n",
app_config.start_time.format("%Y-%m-%d_%H-%M-%S")
);
} else if has_errors && !app_config.is_logging_to_file {
println!("\nRun paperoni with the --log-to-file flag to create a log file");
}
if has_errors {
std::process::exit(1);
}
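The slimmed-down `run` keeps progress-bar setup in main.rs and hands the bar to `http::download` by reference. A minimal sketch of that bar on its own, assuming the indicatif 0.15-era API the diff uses (`template` returning the style directly, `enable_steady_tick` taking milliseconds); the work loop is simulated:

```rust
use indicatif::{ProgressBar, ProgressStyle};
use std::{thread, time::Duration};

fn main() {
    let bar = ProgressBar::new(4);
    bar.set_style(ProgressStyle::default_bar().template(
        "{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {pos}/{len} {msg:.yellow/white}",
    ));
    // A steady tick keeps the spinner animating even while a slow
    // download makes no `inc` calls.
    bar.enable_steady_tick(500);
    for _ in 0..4 {
        bar.set_message("Extracting...");
        thread::sleep(Duration::from_millis(250)); // stand-in for one download
        bar.inc(1);
    }
    bar.finish_with_message("Downloaded articles");
}
```

Note also that `has_errors` now includes partial downloads, so a run that produced epubs but dropped some images still exits with status 1, which scripts can detect without parsing the summary tables.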