2020-10-12 19:33:01 +01:00
|
|
|
#[macro_use]
|
|
|
|
extern crate lazy_static;
|
|
|
|
|
2021-06-01 10:23:22 +01:00
|
|
|
use std::process::exit;
|
|
|
|
|
2021-02-06 14:03:02 +00:00
|
|
|
use async_std::stream;
|
2020-11-23 06:39:56 +00:00
|
|
|
use async_std::task;
|
2021-04-24 07:00:18 +01:00
|
|
|
use comfy_table::presets::{UTF8_FULL, UTF8_HORIZONTAL_BORDERS_ONLY};
|
2021-04-24 11:58:03 +01:00
|
|
|
use comfy_table::{ContentArrangement, Table};
|
2021-02-06 14:03:02 +00:00
|
|
|
use futures::stream::StreamExt;
|
2021-04-17 15:27:38 +01:00
|
|
|
use indicatif::{ProgressBar, ProgressStyle};
|
2021-04-24 11:54:47 +01:00
|
|
|
use log::{debug, warn};
|
2020-05-02 16:33:45 +01:00
|
|
|
use url::Url;
|
2020-04-30 09:05:53 +01:00
|
|
|
|
2020-05-16 08:09:44 +01:00
|
|
|
mod cli;
|
2021-02-06 09:59:03 +00:00
|
|
|
mod epub;
|
2021-04-17 10:04:06 +01:00
|
|
|
mod errors;
|
2020-05-01 14:17:59 +01:00
|
|
|
mod extractor;
|
2021-02-06 09:59:03 +00:00
|
|
|
/// This module is responsible for async HTTP calls for downloading
|
|
|
|
/// the HTML content and images
|
|
|
|
mod http;
|
2021-04-24 11:58:03 +01:00
|
|
|
mod logs;
|
2020-08-31 17:30:09 +01:00
|
|
|
mod moz_readability;
|
2020-05-01 14:17:59 +01:00
|
|
|
|
2021-02-06 14:03:02 +00:00
|
|
|
use cli::AppConfig;
|
2021-02-11 10:51:21 +00:00
|
|
|
use epub::generate_epubs;
|
2021-02-06 14:03:02 +00:00
|
|
|
use extractor::Extractor;
|
2021-04-17 10:08:24 +01:00
|
|
|
use http::{download_images, fetch_html};
|
2021-04-29 17:58:37 +01:00
|
|
|
use logs::display_summary;
|
2021-02-06 09:59:03 +00:00
|
|
|
|
2020-04-30 09:05:53 +01:00
|
|
|
fn main() {
|
2021-06-01 10:23:22 +01:00
|
|
|
let app_config = match cli::AppConfig::init_with_cli() {
|
|
|
|
Ok(app_config) => app_config,
|
|
|
|
Err(err) => {
|
|
|
|
eprintln!("{}", err);
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
};
|
2021-02-01 08:28:07 +00:00
|
|
|
|
2021-06-01 10:23:22 +01:00
|
|
|
if !app_config.urls.is_empty() {
|
2021-02-06 14:03:02 +00:00
|
|
|
download(app_config);
|
2020-05-16 08:09:44 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-02-06 14:03:02 +00:00
|
|
|
fn download(app_config: AppConfig) {
|
2021-04-20 19:09:38 +01:00
|
|
|
let mut errors = Vec::new();
|
2021-04-29 17:58:37 +01:00
|
|
|
let mut partial_download_count: usize = 0;
|
2021-06-01 10:23:22 +01:00
|
|
|
let bar = if app_config.can_disable_progress_bar {
|
2021-04-29 17:58:37 +01:00
|
|
|
ProgressBar::hidden()
|
|
|
|
} else {
|
2021-06-01 10:23:22 +01:00
|
|
|
let enabled_bar = ProgressBar::new(app_config.urls.len() as u64);
|
2021-04-29 17:58:37 +01:00
|
|
|
let style = ProgressStyle::default_bar().template(
|
2021-04-17 15:27:38 +01:00
|
|
|
"{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} link {pos}/{len:7} {msg:.yellow/white}",
|
|
|
|
);
|
2021-04-29 17:58:37 +01:00
|
|
|
enabled_bar.set_style(style);
|
|
|
|
enabled_bar.enable_steady_tick(500);
|
|
|
|
enabled_bar
|
|
|
|
};
|
2021-02-11 10:51:21 +00:00
|
|
|
let articles = task::block_on(async {
|
2021-06-01 10:23:22 +01:00
|
|
|
let urls_iter = app_config.urls.iter().map(|url| fetch_html(url));
|
|
|
|
let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn);
|
2021-02-11 10:51:21 +00:00
|
|
|
let mut articles = Vec::new();
|
2021-02-06 14:03:02 +00:00
|
|
|
while let Some(fetch_result) = responses.next().await {
|
|
|
|
match fetch_result {
|
2021-01-24 14:49:42 +00:00
|
|
|
Ok((url, html)) => {
|
2021-04-24 11:54:47 +01:00
|
|
|
debug!("Extracting {}", &url);
|
2021-04-20 19:06:54 +01:00
|
|
|
let mut extractor = Extractor::from_html(&html, &url);
|
2021-04-17 15:27:38 +01:00
|
|
|
bar.set_message("Extracting...");
|
2021-04-21 17:07:08 +01:00
|
|
|
match extractor.extract_content() {
|
|
|
|
Ok(_) => {
|
|
|
|
extractor.extract_img_urls();
|
|
|
|
if let Err(img_errors) =
|
|
|
|
download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar)
|
|
|
|
.await
|
|
|
|
{
|
2021-04-29 17:58:37 +01:00
|
|
|
partial_download_count += 1;
|
2021-04-24 11:54:47 +01:00
|
|
|
warn!(
|
2021-04-21 17:07:08 +01:00
|
|
|
"{} image{} failed to download for {}",
|
|
|
|
img_errors.len(),
|
|
|
|
if img_errors.len() > 1 { "s" } else { "" },
|
|
|
|
url
|
|
|
|
);
|
2021-04-24 11:57:06 +01:00
|
|
|
for img_error in img_errors {
|
|
|
|
warn!(
|
|
|
|
"{}\n\t\tReason {}",
|
|
|
|
img_error.url().as_ref().unwrap(),
|
|
|
|
img_error
|
|
|
|
);
|
|
|
|
}
|
2021-04-21 17:07:08 +01:00
|
|
|
}
|
|
|
|
articles.push(extractor);
|
|
|
|
}
|
|
|
|
Err(mut e) => {
|
|
|
|
e.set_article_source(&url);
|
|
|
|
errors.push(e);
|
2021-04-17 10:04:06 +01:00
|
|
|
}
|
2021-01-24 14:49:42 +00:00
|
|
|
}
|
2020-10-22 13:22:56 +01:00
|
|
|
}
|
2021-04-20 19:09:38 +01:00
|
|
|
Err(e) => errors.push(e),
|
2020-10-22 10:12:30 +01:00
|
|
|
}
|
2021-04-17 15:27:38 +01:00
|
|
|
bar.inc(1);
|
2020-05-05 10:24:11 +01:00
|
|
|
}
|
2021-02-11 10:51:21 +00:00
|
|
|
articles
|
|
|
|
});
|
2021-04-17 15:27:38 +01:00
|
|
|
bar.finish_with_message("Downloaded articles");
|
2021-04-24 11:54:47 +01:00
|
|
|
|
2021-04-24 07:00:18 +01:00
|
|
|
let mut succesful_articles_table = Table::new();
|
|
|
|
succesful_articles_table
|
|
|
|
.load_preset(UTF8_FULL)
|
|
|
|
.load_preset(UTF8_HORIZONTAL_BORDERS_ONLY)
|
|
|
|
.set_content_arrangement(ContentArrangement::Dynamic);
|
2021-04-29 17:58:37 +01:00
|
|
|
match generate_epubs(articles, &app_config, &mut succesful_articles_table) {
|
2021-04-17 10:04:06 +01:00
|
|
|
Ok(_) => (),
|
2021-04-20 19:09:38 +01:00
|
|
|
Err(gen_epub_errors) => {
|
|
|
|
errors.extend(gen_epub_errors);
|
|
|
|
}
|
2021-04-17 10:04:06 +01:00
|
|
|
};
|
2021-04-24 11:58:03 +01:00
|
|
|
let has_errors = !errors.is_empty();
|
2021-04-29 17:58:37 +01:00
|
|
|
display_summary(
|
2021-06-01 10:23:22 +01:00
|
|
|
app_config.urls.len(),
|
2021-04-29 17:58:37 +01:00
|
|
|
succesful_articles_table,
|
|
|
|
partial_download_count,
|
|
|
|
errors,
|
|
|
|
);
|
2021-06-01 10:23:22 +01:00
|
|
|
if app_config.is_logging_to_file {
|
2021-04-29 17:58:37 +01:00
|
|
|
println!(
|
|
|
|
"Log written to paperoni_{}.log\n",
|
2021-06-01 10:23:22 +01:00
|
|
|
app_config.start_time.format("%Y-%m-%d_%H-%M-%S")
|
2021-04-29 17:58:37 +01:00
|
|
|
);
|
|
|
|
}
|
2021-04-24 11:58:03 +01:00
|
|
|
if has_errors {
|
2021-04-24 07:00:18 +01:00
|
|
|
std::process::exit(1);
|
2021-04-20 19:09:38 +01:00
|
|
|
}
|
2020-04-30 09:05:53 +01:00
|
|
|
}
|