refactor: move download function to http module
feat: add rendering of table for partial downloads
feat: add help message for enabling --log-to-file
chore: format flags to kebab-case and shorten --output-directory flag
parent 95bd22f339
commit 5fbfb9c806

5 changed files with 117 additions and 71 deletions
@@ -57,11 +57,11 @@ OPTIONS:
         --log-to-file
             Enables logging of events to a file located in .paperoni/logs with a default log level of debug. Use -v to
             specify the logging level
-        --max_conn <max_conn>
+        --max-conn <max_conn>
             The maximum number of concurrent HTTP connections when downloading articles. Default is 8.
             NOTE: It is advised to use as few connections as needed i.e between 1 and 50. Using more connections can end
             up overloading your network card with too many concurrent requests.
-    -o, --output_directory <output_directory>
+    -o, --output-dir <output_directory>
             Directory for saving epub documents
 
         --merge <output_name>
@@ -47,7 +47,7 @@ impl AppConfig {
             )
             .arg(
                 Arg::with_name("output_directory")
-                    .long("output-directory")
+                    .long("output-dir")
                     .short("o")
                     .help("Directory to store output epub documents")
                     .conflicts_with("output_name")
@@ -61,7 +61,7 @@ impl AppConfig {
                     .conflicts_with("output_directory")
                     .takes_value(true),
             ).arg(
-                Arg::with_name("max_conn")
+                Arg::with_name("max-conn")
                     .long("max_conn")
                     .help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8")
                     .long_help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8.\nNOTE: It is advised to use as few connections as needed i.e between 1 and 50. Using more connections can end up overloading your network card with too many concurrent requests.")
@@ -146,7 +146,7 @@ impl<'a> TryFrom<ArgMatches<'a>> for AppConfig {
                 (None, None) => Err(Error::NoUrls),
             }
         }?)
-        .max_conn(match arg_matches.value_of("max_conn") {
+        .max_conn(match arg_matches.value_of("max-conn") {
             Some(max_conn) => max_conn.parse::<NonZeroUsize>()?.get(),
             None => DEFAULT_MAX_CONN,
         })
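Aside, not part of the diff: the --max-conn value is parsed through NonZeroUsize rather than plain usize, so a value of 0 is rejected at argument parsing instead of producing a download stream that can never make progress. A minimal sketch of that behaviour:

use std::num::NonZeroUsize;

fn main() {
    // A positive value parses, and .get() unwraps it to the inner usize.
    assert_eq!("8".parse::<NonZeroUsize>().map(NonZeroUsize::get).ok(), Some(8));
    // "0" is not representable as NonZeroUsize, so parsing fails and the
    // `?` in .max_conn(...) surfaces it as a CLI error.
    assert!("0".parse::<NonZeroUsize>().is_err());
}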
src/http.rs (72 lines changed)
@@ -1,14 +1,72 @@
 use async_std::io::prelude::*;
+use async_std::task;
 use async_std::{fs::File, stream};
 use futures::StreamExt;
 use indicatif::ProgressBar;
+use log::warn;
 use log::{debug, info};
 use url::Url;
 
+use crate::cli::AppConfig;
 use crate::errors::{ErrorKind, ImgError, PaperoniError};
 use crate::extractor::Extractor;
 type HTMLResource = (String, String);
 
+pub fn download(
+    app_config: &AppConfig,
+    bar: &ProgressBar,
+    partial_downloads: &mut Vec<PartialDownload>,
+    errors: &mut Vec<PaperoniError>,
+) -> Vec<Extractor> {
+    task::block_on(async {
+        let urls_iter = app_config.urls.iter().map(|url| fetch_html(url));
+        let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn);
+        let mut articles = Vec::new();
+        while let Some(fetch_result) = responses.next().await {
+            match fetch_result {
+                Ok((url, html)) => {
+                    debug!("Extracting {}", &url);
+                    let mut extractor = Extractor::from_html(&html, &url);
+                    bar.set_message("Extracting...");
+                    match extractor.extract_content() {
+                        Ok(_) => {
+                            extractor.extract_img_urls();
+                            if let Err(img_errors) =
+                                download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar)
+                                    .await
+                            {
+                                partial_downloads
+                                    .push(PartialDownload::new(&url, extractor.metadata().title()));
+                                warn!(
+                                    "{} image{} failed to download for {}",
+                                    img_errors.len(),
+                                    if img_errors.len() > 1 { "s" } else { "" },
+                                    url
+                                );
+                                for img_error in img_errors {
+                                    warn!(
+                                        "{}\n\t\tReason {}",
+                                        img_error.url().as_ref().unwrap(),
+                                        img_error
+                                    );
+                                }
+                            }
+                            articles.push(extractor);
+                        }
+                        Err(mut e) => {
+                            e.set_article_source(&url);
+                            errors.push(e);
+                        }
+                    }
+                }
+                Err(e) => errors.push(e),
+            }
+            bar.inc(1);
+        }
+        articles
+    })
+}
+
 pub async fn fetch_html(url: &str) -> Result<HTMLResource, PaperoniError> {
     let client = surf::Client::new();
     debug!("Fetching {}", url);
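Aside, not part of the diff: the core of the moved download() is the stream::from_iter(...).buffered(max_conn) pattern, which is what lets --max-conn cap the number of in-flight requests. A self-contained sketch of the pattern under the same async-std + futures setup (the fetch stub and example URLs are placeholders, not code from this commit):

use async_std::{stream, task};
use futures::StreamExt;

// Stand-in for fetch_html: resolves to a String instead of an HTTP body.
async fn fetch(url: &str) -> String {
    format!("<html from {}>", url)
}

fn main() {
    task::block_on(async {
        let urls = ["https://a.example", "https://b.example", "https://c.example"];
        // buffered(2) keeps at most two fetch futures in flight at a time;
        // results are yielded in the order the URLs were given.
        let mut responses = stream::from_iter(urls.iter().map(|u| fetch(u))).buffered(2);
        while let Some(html) = responses.next().await {
            println!("{}", html);
        }
    });
}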
@@ -210,6 +268,20 @@ pub async fn download_images(
     }
 }
 
+pub struct PartialDownload {
+    pub link: String,
+    pub title: String,
+}
+
+impl PartialDownload {
+    pub fn new(link: &str, title: &str) -> Self {
+        Self {
+            link: link.into(),
+            title: title.into(),
+        }
+    }
+}
+
 /// Handles getting the extension from a given MIME subtype.
 fn map_mime_subtype_to_ext(subtype: &str) -> &str {
     if subtype == ("svg+xml") {
src/logs.rs (22 lines changed)
@@ -12,9 +12,10 @@ use crate::errors::PaperoniError;
 pub fn display_summary(
     initial_article_count: usize,
     succesful_articles_table: Table,
-    partial_downloads_count: usize,
+    partial_downloads: Vec<PartialDownload>,
     errors: Vec<PaperoniError>,
 ) {
+    let partial_downloads_count = partial_downloads.len();
     let successfully_downloaded_count =
         initial_article_count - partial_downloads_count - errors.len();
 
@@ -32,6 +33,24 @@ pub fn display_summary(
     if successfully_downloaded_count > 0 {
         println!("{}", succesful_articles_table);
     }
+
+    if partial_downloads_count > 0 {
+        println!("\n{}", "Partially failed downloads".yellow().bold());
+        let mut table_partial = Table::new();
+        table_partial
+            .load_preset(UTF8_HORIZONTAL_BORDERS_ONLY)
+            .set_header(vec![
+                Cell::new("Link").set_alignment(CellAlignment::Center),
+                Cell::new("Title").set_alignment(CellAlignment::Center),
+            ])
+            .set_content_arrangement(ContentArrangement::Dynamic);
+
+        for partial in partial_downloads {
+            table_partial.add_row(vec![&partial.link, &partial.title]);
+        }
+        println!("{}", table_partial);
+    }
+
 if !errors.is_empty() {
     println!("\n{}", "Failed article downloads".bright_red().bold());
     let mut table_failed = Table::new();
@@ -126,6 +145,7 @@ impl DownloadCount {
 }
 
 use crate::errors::LogError as Error;
+use crate::http::PartialDownload;
 
 pub fn init_logger(
     log_level: LevelFilter,
src/main.rs (70 lines changed)
@@ -3,14 +3,10 @@ extern crate lazy_static;
 
 use std::process::exit;
 
-use async_std::stream;
-use async_std::task;
 use comfy_table::presets::{UTF8_FULL, UTF8_HORIZONTAL_BORDERS_ONLY};
 use comfy_table::{ContentArrangement, Table};
-use futures::stream::StreamExt;
+use http::download;
 use indicatif::{ProgressBar, ProgressStyle};
-use log::{debug, warn};
-use url::Url;
 
 mod cli;
 mod epub;
@@ -24,8 +20,6 @@ mod moz_readability;
 
 use cli::AppConfig;
 use epub::generate_epubs;
-use extractor::Extractor;
-use http::{download_images, fetch_html};
 use logs::display_summary;
 
 fn main() {
@@ -38,13 +32,13 @@ fn main() {
     };
 
     if !app_config.urls.is_empty() {
-        download(app_config);
+        run(app_config);
     }
 }
 
-fn download(app_config: AppConfig) {
+fn run(app_config: AppConfig) {
     let mut errors = Vec::new();
-    let mut partial_download_count: usize = 0;
+    let mut partial_downloads = Vec::new();
     let bar = if app_config.can_disable_progress_bar {
         ProgressBar::hidden()
     } else {
@@ -56,52 +50,7 @@ fn download(app_config: AppConfig) {
         enabled_bar.enable_steady_tick(500);
         enabled_bar
     };
-    let articles = task::block_on(async {
-        let urls_iter = app_config.urls.iter().map(|url| fetch_html(url));
-        let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn);
-        let mut articles = Vec::new();
-        while let Some(fetch_result) = responses.next().await {
-            match fetch_result {
-                Ok((url, html)) => {
-                    debug!("Extracting {}", &url);
-                    let mut extractor = Extractor::from_html(&html, &url);
-                    bar.set_message("Extracting...");
-                    match extractor.extract_content() {
-                        Ok(_) => {
-                            extractor.extract_img_urls();
-                            if let Err(img_errors) =
-                                download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar)
-                                    .await
-                            {
-                                partial_download_count += 1;
-                                warn!(
-                                    "{} image{} failed to download for {}",
-                                    img_errors.len(),
-                                    if img_errors.len() > 1 { "s" } else { "" },
-                                    url
-                                );
-                                for img_error in img_errors {
-                                    warn!(
-                                        "{}\n\t\tReason {}",
-                                        img_error.url().as_ref().unwrap(),
-                                        img_error
-                                    );
-                                }
-                            }
-                            articles.push(extractor);
-                        }
-                        Err(mut e) => {
-                            e.set_article_source(&url);
-                            errors.push(e);
-                        }
-                    }
-                }
-                Err(e) => errors.push(e),
-            }
-            bar.inc(1);
-        }
-        articles
-    });
+    let articles = download(&app_config, &bar, &mut partial_downloads, &mut errors);
     bar.finish_with_message("Downloaded articles");
 
     let mut succesful_articles_table = Table::new();
@@ -115,19 +64,24 @@
             errors.extend(gen_epub_errors);
         }
     };
-    let has_errors = !errors.is_empty();
+    let has_errors = !errors.is_empty() || !partial_downloads.is_empty();
     display_summary(
         app_config.urls.len(),
         succesful_articles_table,
-        partial_download_count,
+        partial_downloads,
         errors,
     );
 
     if app_config.is_logging_to_file {
         println!(
             "Log written to paperoni_{}.log\n",
             app_config.start_time.format("%Y-%m-%d_%H-%M-%S")
         );
+    } else if has_errors && !app_config.is_logging_to_file {
+        println!("\nRun paperoni with the --log-to-file flag to create a log file");
     }
 
     if has_errors {
         std::process::exit(1);
     }