From 3a8160412c0640a87b1d079499ff5dce8ea463a8 Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Mon, 17 May 2021 22:12:10 +0300 Subject: [PATCH 01/17] refactor `short_summary` function in logs.rs to be less redundant --- src/logs.rs | 156 +++++++++++++++++++++++++--------------------------- 1 file changed, 75 insertions(+), 81 deletions(-) diff --git a/src/logs.rs b/src/logs.rs index 87b5d1b..4d735b1 100644 --- a/src/logs.rs +++ b/src/logs.rs @@ -55,76 +55,55 @@ pub fn display_summary( /// Returns a string summary of the total number of failed and successful article downloads fn short_summary(download_count: DownloadCount) -> String { - // TODO: Refactor this if download_count.total != download_count.successful + download_count.failed + download_count.partial { panic!("initial_count must be equal to the sum of failed and successful count") } let get_noun = |count: usize| if count == 1 { "article" } else { "articles" }; - if download_count.successful == download_count.total && download_count.successful == 1 { - "Article downloaded successfully".green().to_string() - } else if download_count.total == download_count.failed && download_count.failed == 1 { - "Article failed to download".red().to_string() - } else if download_count.total == download_count.partial && download_count.partial == 1 { - "Article partially failed to download".yellow().to_string() - } else if download_count.successful == download_count.total { - "All articles downloaded successfully".green().to_string() - } else if download_count.failed == download_count.total { - "All articles failed to download".red().to_string() - } else if download_count.partial == download_count.total { - "All articles partially failed to download" - .yellow() - .to_string() - } else if download_count.partial == 0 { - format!( - "{} {} downloaded successfully, {} {} failed", - download_count.successful, - get_noun(download_count.successful), - download_count.failed, - get_noun(download_count.failed) - ) - .yellow() - .to_string() - } else if download_count.successful == 0 - && download_count.partial > 0 - && download_count.failed > 0 - { - format!( - "{} {} partially failed to download, {} {} failed", - download_count.partial, - get_noun(download_count.partial), - download_count.failed, - get_noun(download_count.failed) - ) - .yellow() - .to_string() - } else if download_count.failed == 0 - && download_count.successful > 0 - && download_count.partial > 0 - { - format!( - "{} {} downloaded successfully, {} {} partially failed to download", - download_count.successful, - get_noun(download_count.successful), - download_count.partial, - get_noun(download_count.partial) - ) - .yellow() + let get_summary = |count, label, color: Color| { + if count == 0 { + return "".to_string(); + }; + + { + if count == 1 && count == download_count.total { + "Article".to_string() + label + } else if count == download_count.total { + "All ".to_string() + get_noun(count) + label + } else { + count.to_string() + " " + get_noun(count) + label + } + } + .color(color) .to_string() + }; + + let mut summary = get_summary( + download_count.successful, + " downloaded successfully", + Color::BrightGreen, + ); + + let partial_summary = get_summary( + download_count.partial, + " partially failed to download", + Color::Yellow, + ); + + if !summary.is_empty() && !partial_summary.is_empty() { + summary = summary + ", " + &partial_summary; } else { - format!( - "{} {} downloaded successfully, {} {} partially failed to download, {} {} failed", - download_count.successful, - 
get_noun(download_count.successful), - download_count.partial, - get_noun(download_count.partial), - download_count.failed, - get_noun(download_count.failed) - ) - .yellow() - .to_string() + summary = summary + &partial_summary; } + + let failed_summary = get_summary(download_count.failed, " failed to download", Color::Red); + if !summary.is_empty() && !failed_summary.is_empty() { + summary = summary + ", " + &failed_summary; + } else { + summary = summary + &failed_summary; + } + summary } struct DownloadCount { @@ -192,7 +171,7 @@ mod tests { fn test_short_summary() { assert_eq!( short_summary(DownloadCount::new(1, 1, 0, 0)), - "Article downloaded successfully".green().to_string() + "Article downloaded successfully".bright_green().to_string() ); assert_eq!( short_summary(DownloadCount::new(1, 0, 0, 1)), @@ -200,7 +179,9 @@ mod tests { ); assert_eq!( short_summary(DownloadCount::new(10, 10, 0, 0)), - "All articles downloaded successfully".green().to_string() + "All articles downloaded successfully" + .bright_green() + .to_string() ); assert_eq!( short_summary(DownloadCount::new(10, 0, 0, 10)), @@ -208,39 +189,52 @@ mod tests { ); assert_eq!( short_summary(DownloadCount::new(10, 8, 0, 2)), - "8 articles downloaded successfully, 2 articles failed" - .yellow() - .to_string() + format!( + "{}, {}", + "8 articles downloaded successfully".bright_green(), + "2 articles failed to download".red() + ) ); assert_eq!( short_summary(DownloadCount::new(10, 1, 0, 9)), - "1 article downloaded successfully, 9 articles failed" - .yellow() - .to_string() + format!( + "{}, {}", + "1 article downloaded successfully".bright_green(), + "9 articles failed to download".red() + ) ); assert_eq!( short_summary(DownloadCount::new(7, 6, 0, 1)), - "6 articles downloaded successfully, 1 article failed" - .yellow() - .to_string() + format!( + "{}, {}", + "6 articles downloaded successfully".bright_green(), + "1 article failed to download".red() + ) ); assert_eq!( short_summary(DownloadCount::new(7, 4, 2, 1)), - "4 articles downloaded successfully, 2 articles partially failed to download, 1 article failed" - .yellow() - .to_string() + format!( + "{}, {}, {}", + "4 articles downloaded successfully".bright_green(), + "2 articles partially failed to download".yellow(), + "1 article failed to download".red() + ) ); assert_eq!( short_summary(DownloadCount::new(12, 6, 6, 0)), - "6 articles downloaded successfully, 6 articles partially failed to download" - .yellow() - .to_string() + format!( + "{}, {}", + "6 articles downloaded successfully".bright_green(), + "6 articles partially failed to download".yellow() + ) ); assert_eq!( short_summary(DownloadCount::new(5, 0, 4, 1)), - "4 articles partially failed to download, 1 article failed" - .yellow() - .to_string() + format!( + "{}, {}", + "4 articles partially failed to download".yellow(), + "1 article failed to download".red() + ) ); assert_eq!( short_summary(DownloadCount::new(4, 0, 4, 0)), From 8c9783b596d6f8c132295b0939efd04874178f17 Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Tue, 18 May 2021 18:08:31 +0300 Subject: [PATCH 02/17] feat: add header level table of contents for articles --- src/epub.rs | 94 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 80 insertions(+), 14 deletions(-) diff --git a/src/epub.rs b/src/epub.rs index 75f2b9e..5e5d377 100644 --- a/src/epub.rs +++ b/src/epub.rs @@ -1,8 +1,10 @@ +use std::collections::HashMap; use std::fs::File; use comfy_table::{Attribute, Cell, CellAlignment, Color, ContentArrangement, Table}; -use 
epub_builder::{EpubBuilder, EpubContent, ZipLibrary}; +use epub_builder::{EpubBuilder, EpubContent, TocElement, ZipLibrary}; use indicatif::{ProgressBar, ProgressStyle}; +use kuchiki::NodeRef; use log::{debug, info}; use crate::{ @@ -63,15 +65,22 @@ pub fn generate_epubs( .enumerate() .fold(&mut epub, |epub, (idx, article)| { let mut article_result = || -> Result<(), PaperoniError> { - let mut html_buf = Vec::new(); - extractor::serialize_to_xhtml(article.article(), &mut html_buf)?; - let html_str = std::str::from_utf8(&html_buf)?; - epub.metadata("title", replace_metadata_value(name))?; + let mut xhtml_buf = Vec::new(); + extractor::serialize_to_xhtml(article.article(), &mut xhtml_buf)?; + let xhtml_str = std::str::from_utf8(&xhtml_buf)?; let section_name = article.metadata().title(); - epub.add_content( - EpubContent::new(format!("article_{}.xhtml", idx), html_str.as_bytes()) - .title(replace_metadata_value(section_name)), - )?; + let content_url = format!("article_{}.xhtml", idx); + let mut content = EpubContent::new(&content_url, xhtml_str.as_bytes()) + .title(replace_metadata_value(section_name)); + let header_level_tocs = + get_header_level_toc_vec(&content_url, article.article()); + + for toc_element in header_level_tocs { + content = content.child(toc_element); + } + + epub.metadata("title", replace_metadata_value(name))?; + epub.add_content(content)?; info!("Adding images for {:?}", name); article.img_urls.iter().for_each(|img| { // TODO: Add error handling and return errors as a vec @@ -144,15 +153,28 @@ pub fn generate_epubs( ); debug!("Creating {:?}", file_name); let mut out_file = File::create(&file_name).unwrap(); - let mut html_buf = Vec::new(); - extractor::serialize_to_xhtml(article.article(), &mut html_buf) + let mut xhtml_buf = Vec::new(); + extractor::serialize_to_xhtml(article.article(), &mut xhtml_buf) .expect("Unable to serialize to xhtml"); - let html_str = std::str::from_utf8(&html_buf).unwrap(); + let xhtml_str = std::str::from_utf8(&xhtml_buf).unwrap(); + let header_level_tocs = + get_header_level_toc_vec("index.xhtml", article.article()); + if let Some(author) = article.metadata().byline() { epub.metadata("author", replace_metadata_value(author))?; } - epub.metadata("title", replace_metadata_value(article.metadata().title()))?; - epub.add_content(EpubContent::new("index.xhtml", html_str.as_bytes()))?; + let title = replace_metadata_value(article.metadata().title()); + epub.metadata("title", &title)?; + + let mut content = + EpubContent::new("index.xhtml", xhtml_str.as_bytes()).title(title); + + for toc_element in header_level_tocs { + content = content.child(toc_element); + } + + epub.add_content(content)?; + for img in &article.img_urls { let mut file_path = std::env::temp_dir(); file_path.push(&img.0); @@ -232,6 +254,50 @@ fn generate_appendix(articles: Vec<&Extractor>) -> String { template } +/// Returns a vector of `TocElement` from a NodeRef used for adding to the Table of Contents for navigation +fn get_header_level_toc_vec(content_url: &str, article: &NodeRef) -> Vec { + // TODO: Test this + let mut headers_vec = Vec::new(); + + let mut header_levels = HashMap::new(); + header_levels.insert("h1", 1); + header_levels.insert("h2", 2); + header_levels.insert("h3", 3); + + let headings = article + .select("h1, h2, h3") + .expect("Unable to create selector for headings"); + + let mut prev_toc: Option = None; + + for heading in headings { + // TODO: Create a new function that adds an id attribute to heading tags before this function is called + let elem_attrs = 
heading.attributes.borrow(); + let elem_name: &str = &heading.name.local; + let id = elem_attrs + .get("id") + .map(|val| val.to_string()) + .unwrap_or(heading.text_contents().replace(" ", "-")); + let toc = TocElement::new(format!("{}#{}", content_url, id), heading.text_contents()) + .level(header_levels[elem_name]); + if let Some(prev_toc_element) = prev_toc { + if prev_toc_element.level <= toc.level { + headers_vec.push(prev_toc_element); + prev_toc = Some(toc); + } else { + prev_toc = Some(prev_toc_element.child(toc)) + } + } else { + prev_toc = Some(toc); + } + } + + if let Some(toc_element) = prev_toc { + headers_vec.push(toc_element); + } + + headers_vec +} #[cfg(test)] mod test { use super::replace_metadata_value; From 13ad14e73d54f21d0b4ae114f9dead66b1edb163 Mon Sep 17 00:00:00 2001 From: Mikhail Gorbachev Date: Tue, 1 Jun 2021 12:23:22 +0300 Subject: [PATCH 03/17] Add `output_dir` to cli argument - Add `output_dir` to cli argument - This argument allows you to save output files in a special folder, not just current dir - Refactor 'cli.rs' - Add `Builder` for `AppConfig` - Add `Error` instead separated panics - Upgrade dependencies --- Cargo.lock | 159 +++++++++++++++++++------ Cargo.toml | 13 +- README.md | 23 ++-- rust-toolchain | 1 + src/cli.rs | 315 ++++++++++++++++++++++++++++--------------------- src/epub.rs | 13 +- src/http.rs | 10 +- src/logs.rs | 45 +------ src/main.rs | 26 ++-- 9 files changed, 354 insertions(+), 251 deletions(-) create mode 100644 rust-toolchain diff --git a/Cargo.lock b/Cargo.lock index d07b91c..c2bcea0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,5 +1,7 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. +version = 3 + [[package]] name = "addr2line" version = "0.14.1" @@ -71,9 +73,9 @@ dependencies = [ [[package]] name = "aho-corasick" -version = "0.7.15" +version = "0.7.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7404febffaa47dac81aa44dba71523c9d069b1bdc50a77db41195149e17f68e5" +checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" dependencies = [ "memchr", ] @@ -389,7 +391,7 @@ dependencies = [ "ansi_term", "atty", "bitflags", - "strsim", + "strsim 0.8.0", "textwrap", "unicode-width", "vec_map", @@ -435,9 +437,7 @@ dependencies = [ "encode_unicode", "lazy_static", "libc", - "regex", "terminal_size", - "unicode-width", "winapi", ] @@ -614,6 +614,41 @@ dependencies = [ "winapi", ] +[[package]] +name = "darling" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f2c43f534ea4b0b049015d00269734195e6d3f0f6635cb692251aca6f9f8b3c" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e91455b86830a1c21799d94524df0845183fa55bafd9aa137b01c7d1065fa36" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim 0.10.0", + "syn", +] + +[[package]] +name = "darling_macro" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29b5acf0dea37a7f66f7b25d2c5e93fd46f8f6968b1a5d7a3e02e97768afc95a" +dependencies = [ + "darling_core", + "quote", + "syn", +] + [[package]] name = "dashmap" version = "4.0.2" @@ -630,6 +665,37 @@ version = "2.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3ee2393c4a91429dffb4bedf19f4d6abf27d8a732c8ce4980305d782e5426d57" +[[package]] +name = 
"derive_builder" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d13202debe11181040ae9063d739fa32cfcaaebe2275fe387703460ae2365b30" +dependencies = [ + "derive_builder_macro", +] + +[[package]] +name = "derive_builder_core" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66e616858f6187ed828df7c64a6d71720d83767a7f19740b2d1b6fe6327b36e5" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "derive_builder_macro" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "58a94ace95092c5acb1e97a7e846b310cfbd499652f72297da7493f618a98d73" +dependencies = [ + "derive_builder_core", + "syn", +] + [[package]] name = "derive_more" version = "0.99.13" @@ -822,9 +888,9 @@ dependencies = [ [[package]] name = "futures" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9d5813545e459ad3ca1bff9915e9ad7f1a47dc6a91b627ce321d5863b7dd253" +checksum = "0e7e43a803dae2fa37c1f6a8fe121e1f7bf9548b4dfc0522a42f34145dadfc27" dependencies = [ "futures-channel", "futures-core", @@ -837,9 +903,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce79c6a52a299137a6013061e0cf0e688fce5d7f1bc60125f520912fdb29ec25" +checksum = "e682a68b29a882df0545c143dc3646daefe80ba479bcdede94d5a703de2871e2" dependencies = [ "futures-core", "futures-sink", @@ -847,15 +913,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "098cd1c6dda6ca01650f1a37a794245eb73181d0d4d4e955e2f3c37db7af1815" +checksum = "0402f765d8a89a26043b889b26ce3c4679d268fa6bb22cd7c6aad98340e179d1" [[package]] name = "futures-executor" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10f6cb7042eda00f0049b1d2080aa4b93442997ee507eb3828e8bd7577f94c9d" +checksum = "badaa6a909fac9e7236d0620a2f57f7664640c56575b71a7552fbd68deafab79" dependencies = [ "futures-core", "futures-task", @@ -864,9 +930,9 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "365a1a1fb30ea1c03a830fdb2158f5236833ac81fa0ad12fe35b29cddc35cb04" +checksum = "acc499defb3b348f8d8f3f66415835a9131856ff7714bf10dadfc4ec4bdb29a1" [[package]] name = "futures-lite" @@ -885,10 +951,11 @@ dependencies = [ [[package]] name = "futures-macro" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "668c6733a182cd7deb4f1de7ba3bf2120823835b3bcfbeacf7d2c4a773c1bb8b" +checksum = "a4c40298486cdf52cc00cd6d6987892ba502c7656a16a4192a9992b1ccedd121" dependencies = [ + "autocfg", "proc-macro-hack", "proc-macro2", "quote", @@ -897,22 +964,23 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c5629433c555de3d82861a7a4e3794a4c40040390907cfbfd7143a92a426c23" +checksum = "a57bead0ceff0d6dde8f465ecd96c9338121bb7717d3e7b108059531870c4282" [[package]] name = "futures-task" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"ba7aa51095076f3ba6d9a1f702f74bd05ec65f555d70d2033d55ba8d69f581bc" +checksum = "8a16bef9fc1a4dddb5bee51c989e3fbba26569cbb0e31f5b303c184e3dd33dae" [[package]] name = "futures-util" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c144ad54d60f23927f0a6b6d816e4271278b64f005ad65e4e35291d2de9c025" +checksum = "feb5c238d27e2bf94ffdfd27b2c29e3df4a68c4193bb6427384259e2bf191967" dependencies = [ + "autocfg", "futures-channel", "futures-core", "futures-io", @@ -1112,6 +1180,12 @@ dependencies = [ "url", ] +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + [[package]] name = "idna" version = "0.2.3" @@ -1125,9 +1199,9 @@ dependencies = [ [[package]] name = "indicatif" -version = "0.15.0" +version = "0.16.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7baab56125e25686df467fe470785512329883aab42696d661247aca2a2896e4" +checksum = "2d207dc617c7a380ab07ff572a6e52fa202a2a8f355860ac9c38e23f8196be1b" dependencies = [ "console", "lazy_static", @@ -1305,9 +1379,9 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "memchr" -version = "2.3.4" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ee1c47aaa256ecabcaea351eae4a9b01ef39ed810004e298d2511ed284b1525" +checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc" [[package]] name = "mime" @@ -1419,9 +1493,9 @@ dependencies = [ [[package]] name = "number_prefix" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17b02fc0ff9a9e4b35b3342880f48e896ebf69f2967921fe8646bf5b7125956a" +checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" [[package]] name = "object" @@ -1469,6 +1543,7 @@ dependencies = [ "clap", "colored", "comfy-table", + "derive_builder", "directories", "epub-builder", "flexi_logger", @@ -1829,9 +1904,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.4.6" +version = "1.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a26af418b574bd56588335b3a3659a65725d4e636eb1016c2f9e3b38c7cc759" +checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" dependencies = [ "aho-corasick", "memchr", @@ -1840,9 +1915,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.6.23" +version = "0.6.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24d5f089152e60f62d28b835fbff2cd2e8dc0baf1ac13343bef92ab7eed84548" +checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" [[package]] name = "remove_dir_all" @@ -2172,6 +2247,12 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" +[[package]] +name = "strsim" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" + [[package]] name = "strum" version = "0.20.0" @@ -2277,18 +2358,18 @@ checksum = "8eaa81235c7058867fa8c0e7314f33dcce9c215f535d1913822a2b3f5e289f3c" [[package]] name = "thiserror" -version = "1.0.24" +version = "1.0.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"e0f4a65597094d4483ddaed134f409b2cb7c1beccf25201a9f73c719254fa98e" +checksum = "fa6f76457f59514c7eeb4e59d891395fab0b2fd1d40723ae737d64153392e9c6" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.24" +version = "1.0.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7765189610d8241a44529806d6fd1f2e0a08734313a35d5b3a556f92b381f3c0" +checksum = "8a36768c0fbf1bb15eca10defa29526bda730a2376c2ab4393ccfa16fb1a318d" dependencies = [ "proc-macro2", "quote", @@ -2465,9 +2546,9 @@ dependencies = [ [[package]] name = "url" -version = "2.2.1" +version = "2.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ccd964113622c8e9322cfac19eb1004a07e636c545f325da085d5cdde6f1f8b" +checksum = "a507c383b2d33b5fc35d1861e77e6b383d158b2da5e14fe51b83dfedf6fd578c" dependencies = [ "form_urlencoded", "idna", diff --git a/Cargo.toml b/Cargo.toml index 8b8b6e6..e3f6055 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,23 +12,24 @@ readme = "README.md" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -async-std = "1.9.0" # atty = "0.2.14" +async-std = "1.9.0" chrono = "0.4.19" clap = "2.33.3" colored = "2.0.0" comfy-table = "2.1.0" +derive_builder = "0.10.2" directories = "3.0.2" epub-builder = "0.4.8" flexi_logger = "0.17.1" -futures = "0.3.14" +futures = "0.3.15" html5ever = "0.25.1" -indicatif = "0.15.0" +indicatif = "0.16.2" kuchiki = "0.8.1" lazy_static = "1.4.0" log = "0.4.14" md5 = "0.7.0" -regex = "1.4.5" +regex = "1.5.4" surf = "2.2.0" -thiserror = "1.0.24" -url = "2.2.1" +thiserror = "1.0.25" +url = "2.2.2" diff --git a/README.md b/README.md index 2cfba38..aa94d06 100644 --- a/README.md +++ b/README.md @@ -48,15 +48,20 @@ USAGE: paperoni [OPTIONS] [urls]... OPTIONS: - -f, --file Input file containing links - -h, --help Prints help information - --log-to-file Enables logging of events to a file located in .paperoni/logs with a default log level - of debug. Use -v to specify the logging level - --max_conn The maximum number of concurrent HTTP connections when downloading articles. Default is - 8 - --merge Merge multiple articles into a single epub - -V, --version Prints version information - -v Enables logging of events and set the verbosity level. Use -h to read on its usage + -f, --file Input file containing links + -h, --help Prints help information + --log-to-file + Enables logging of events to a file located in .paperoni/logs with a default log level of debug. Use -v to + specify the logging level + --max_conn + The maximum number of concurrent HTTP connections when downloading articles. Default is 8 + + -o, --output_directory Directory for store output epub documents + --merge Merge multiple articles into a single epub + -V, --version Prints version information + -v + Enables logging of events and set the verbosity level. Use --help to read on its usage + ARGS: ... 
Urls of web articles diff --git a/rust-toolchain b/rust-toolchain new file mode 100644 index 0000000..154cb93 --- /dev/null +++ b/rust-toolchain @@ -0,0 +1 @@ +1.52.1 diff --git a/src/cli.rs b/src/cli.rs index 19ce379..63c4d89 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -1,13 +1,56 @@ -use std::{fs::File, io::Read, path::Path}; +use std::{ + collections::HashSet, + num::{NonZeroUsize, ParseIntError}, + path::Path, +}; use chrono::{DateTime, Local}; -use clap::{App, AppSettings, Arg}; -use flexi_logger::LevelFilter as LogLevel; +use clap::{App, AppSettings, Arg, ArgMatches}; +use flexi_logger::{FlexiLoggerError, LevelFilter as LogLevel}; +use std::fs; +use thiserror::Error; -use crate::logs::init_logger; +const DEFAULT_MAX_CONN: usize = 8; -pub fn cli_init() -> AppConfig { - let app = App::new("paperoni") +#[derive(derive_builder::Builder)] +pub struct AppConfig { + /// Urls for store in epub + pub urls: Vec, + pub max_conn: usize, + /// Path to file of multiple articles into a single epub + pub merged: Option, + pub output_directory: Option, + pub log_level: LogLevel, + pub can_disable_progress_bar: bool, + pub start_time: DateTime, + pub is_logging_to_file: bool, +} + +#[derive(Debug, Error)] +pub enum Error { + #[error("Failed to open file with urls: {0}")] + UrlFileError(#[from] std::io::Error), + #[error("Failed to parse max connection value: {0}")] + InvalidMaxConnectionCount(#[from] ParseIntError), + #[error("No urls for parse")] + NoUrls, + #[error("No urls for parse")] + AppBuildError(#[from] AppConfigBuilderError), + #[error("Invalid output path name for merged epubs: {0}")] + InvalidOutputPath(String), + #[error("Log error: {0}")] + LogDirectoryError(String), + #[error(transparent)] + LogError(#[from] FlexiLoggerError), + #[error("Wrong output directory")] + WrongOutputDirectory, + #[error("Output directory not exists")] + OutputDirectoryNotExists, +} + +impl AppConfig { + pub fn init_with_cli() -> Result { + let app = App::new("paperoni") .settings(&[ AppSettings::ArgRequiredElseHelp, AppSettings::UnifiedHelpMessage, @@ -28,11 +71,21 @@ pub fn cli_init() -> AppConfig { .help("Input file containing links") .takes_value(true), ) + .arg( + Arg::with_name("output_directory") + .long("output_directory") + .short("o") + .help("Directory for store output epub documents") + .conflicts_with("output_name") + .long_help("Directory for saving epub documents") + .takes_value(true), + ) .arg( Arg::with_name("output_name") .long("merge") .help("Merge multiple articles into a single epub") .long_help("Merge multiple articles into a single epub that will be given the name provided") + .conflicts_with("output_directory") .takes_value(true), ).arg( Arg::with_name("max_conn") @@ -60,143 +113,135 @@ pub fn cli_init() -> AppConfig { .long("log-to-file") .help("Enables logging of events to a file located in .paperoni/logs with a default log level of debug. 
Use -v to specify the logging level") .takes_value(false)); - let arg_matches = app.get_matches(); - let mut urls: Vec = match arg_matches.value_of("file") { - Some(file_name) => { - if let Ok(mut file) = File::open(file_name) { - let mut content = String::new(); - match file.read_to_string(&mut content) { - Ok(_) => content - .lines() - .filter(|line| !line.is_empty()) - .map(|line| line.to_owned()) - .collect(), - Err(_) => vec![], + Self::try_from(app.get_matches()) + } + + fn init_merge_file(self) -> Result { + self.merged + .as_deref() + .map(fs::File::create) + .transpose() + .err() + .map(|err| Err(Error::InvalidOutputPath(err.to_string()))) + .unwrap_or(Ok(self)) + } + + fn init_logger(self) -> Result { + use directories::UserDirs; + use flexi_logger::LogSpecBuilder; + + match UserDirs::new() { + Some(user_dirs) => { + let home_dir = user_dirs.home_dir(); + let paperoni_dir = home_dir.join(".paperoni"); + let log_dir = paperoni_dir.join("logs"); + + let log_spec = LogSpecBuilder::new() + .module("paperoni", self.log_level) + .build(); + let formatted_timestamp = self.start_time.format("%Y-%m-%d_%H-%M-%S"); + let mut logger = flexi_logger::Logger::with(log_spec); + + if self.is_logging_to_file && (!paperoni_dir.is_dir() || !log_dir.is_dir()) { + if let Err(e) = fs::create_dir_all(&log_dir) { + return Err(Error::LogDirectoryError(format!("Unable to create paperoni directories on home directory for logging purposes\n{}",e))); + } } - } else { - println!("Unable to open file: {}", file_name); - vec![] + if self.is_logging_to_file { + logger = logger + .directory(log_dir) + .discriminant(formatted_timestamp.to_string()) + .suppress_timestamp() + .log_to_file(); + } + logger.start()?; + Ok(self) } + None => Err(Error::LogDirectoryError( + "Unable to get user directories for logging purposes".to_string(), + )), } - None => vec![], - }; - - if let Some(vals) = arg_matches.values_of("urls") { - urls.extend( - vals.filter(|val| !val.is_empty()) - .map(|val| val.to_string()), - ); } - - let max_conn = arg_matches - .value_of("max_conn") - .map(|conn_str| conn_str.parse::().ok()) - .flatten() - .map(|max| if max > 0 { max } else { 1 }) - .unwrap_or(8); - - let mut app_config = AppConfig::new(max_conn); - app_config.set_urls(urls); - - if let Some(name) = arg_matches.value_of("output_name") { - let file_path = Path::new(name); - if file_path.is_dir() { - eprintln!("{:?} is a directory", name); - std::process::exit(1); - } - - let file_name = if file_path.extension().is_some() { - name.to_owned() - } else { - name.to_owned() + ".epub" - }; - - match std::fs::File::create(&file_name) { - Ok(_) => (), - Err(e) => { - eprintln!("Unable to create file {:?}\n{}", file_path, e); - std::process::exit(1) - } - } - app_config.merged = Some(file_name); - } - - if arg_matches.is_present("verbosity") { - if !arg_matches.is_present("log-to-file") { - app_config.can_disable_progress_bar = true; - } - let log_levels: [LogLevel; 5] = [ - LogLevel::Off, - LogLevel::Error, - LogLevel::Warn, - LogLevel::Info, - LogLevel::Debug, - ]; - let level = arg_matches.occurrences_of("verbosity").clamp(0, 4) as usize; - app_config.log_level = log_levels[level]; - } - if arg_matches.is_present("log-to-file") { - app_config.log_level = LogLevel::Debug; - app_config.is_logging_to_file = true; - } - - init_logger(&app_config); - - app_config } -pub struct AppConfig { - urls: Vec, - max_conn: usize, - merged: Option, - log_level: LogLevel, - can_disable_progress_bar: bool, - start_time: DateTime, - is_logging_to_file: bool, +use 
std::convert::TryFrom; + +impl<'a> TryFrom> for AppConfig { + type Error = Error; + + fn try_from(arg_matches: ArgMatches<'a>) -> Result { + AppConfigBuilder::default() + .urls({ + let url_filter = |url: &str| { + let url = url.trim(); + if !url.is_empty() { + Some(url.to_owned()) + } else { + None + } + }; + match ( + arg_matches + .values_of("urls") + .and_then(|urls| urls.map(url_filter).collect::>>()), + arg_matches + .value_of("file") + .map(fs::read_to_string) + .transpose()? + .and_then(|content| { + content + .lines() + .map(url_filter) + .collect::>>() + }), + ) { + (Some(direct_urls), Some(file_urls)) => Ok(direct_urls + .union(&file_urls) + .map(ToOwned::to_owned) + .collect::>()), + (Some(urls), None) | (None, Some(urls)) => Ok(urls.into_iter().collect()), + (None, None) => Err(Error::NoUrls), + } + }?) + .max_conn(match arg_matches.value_of("max_conn") { + Some(max_conn) => max_conn.parse::()?.get(), + None => DEFAULT_MAX_CONN, + }) + .merged(arg_matches.value_of("output_name").map(ToOwned::to_owned)) + .can_disable_progress_bar( + arg_matches.is_present("verbosity") && !arg_matches.is_present("log-to-file"), + ) + .log_level(match arg_matches.occurrences_of("verbosity") { + 0 => LogLevel::Off, + 1 => LogLevel::Error, + 2 => LogLevel::Warn, + 3 => LogLevel::Info, + 4..=u64::MAX => LogLevel::Debug, + }) + .is_logging_to_file(arg_matches.is_present("log-to_file")) + .output_directory( + arg_matches + .value_of("output_directory") + .map(|output_directory| { + let path = Path::new(output_directory); + if !path.exists() { + Err(Error::OutputDirectoryNotExists) + } else if !path.is_dir() { + Err(Error::WrongOutputDirectory) + } else { + Ok(output_directory.to_owned()) + } + }) + .transpose()?, + ) + .start_time(Local::now()) + .try_init() + } } -impl AppConfig { - fn new(max_conn: usize) -> Self { - Self { - urls: vec![], - max_conn, - merged: None, - log_level: LogLevel::Off, - can_disable_progress_bar: false, - start_time: Local::now(), - is_logging_to_file: false, - } - } - - fn set_urls(&mut self, urls: Vec) { - self.urls.extend(urls); - } - - pub fn urls(&self) -> &Vec { - &self.urls - } - pub fn max_conn(&self) -> usize { - self.max_conn - } - - pub fn merged(&self) -> Option<&String> { - self.merged.as_ref() - } - - pub fn log_level(&self) -> LogLevel { - self.log_level - } - - pub fn can_disable_progress_bar(&self) -> bool { - self.can_disable_progress_bar - } - - pub fn start_time(&self) -> &DateTime { - &self.start_time - } - - pub fn is_logging_to_file(&self) -> bool { - self.is_logging_to_file +impl AppConfigBuilder { + pub fn try_init(&self) -> Result { + self.build()?.init_logger()?.init_merge_file() } } diff --git a/src/epub.rs b/src/epub.rs index 75f2b9e..06dfeff 100644 --- a/src/epub.rs +++ b/src/epub.rs @@ -16,7 +16,7 @@ pub fn generate_epubs( app_config: &AppConfig, successful_articles_table: &mut Table, ) -> Result<(), Vec> { - let bar = if app_config.can_disable_progress_bar() { + let bar = if app_config.can_disable_progress_bar { ProgressBar::hidden() } else { let enabled_bar = ProgressBar::new(articles.len() as u64); @@ -32,8 +32,8 @@ pub fn generate_epubs( let mut errors: Vec = Vec::new(); - match app_config.merged() { - Some(name) => { + match app_config.merged { + Some(ref name) => { successful_articles_table.set_header(vec![Cell::new("Table of Contents") .add_attribute(Attribute::Bold) .set_alignment(CellAlignment::Center) @@ -103,7 +103,7 @@ pub fn generate_epubs( .title(replace_metadata_value("Article Sources")), ) { let mut paperoni_err: PaperoniError 
= err.into(); - paperoni_err.set_article_source(name); + paperoni_err.set_article_source(&name); errors.push(paperoni_err); return Err(errors); } @@ -113,7 +113,7 @@ pub fn generate_epubs( Ok(_) => (), Err(err) => { let mut paperoni_err: PaperoniError = err.into(); - paperoni_err.set_article_source(name); + paperoni_err.set_article_source(&name); errors.push(paperoni_err); return Err(errors); } @@ -135,7 +135,8 @@ pub fn generate_epubs( let mut result = || -> Result<(), PaperoniError> { let mut epub = EpubBuilder::new(ZipLibrary::new()?)?; let file_name = format!( - "{}.epub", + "{}/{}.epub", + app_config.output_directory.as_deref().unwrap_or("."), article .metadata() .title() diff --git a/src/http.rs b/src/http.rs index efd64b8..148fab0 100644 --- a/src/http.rs +++ b/src/http.rs @@ -153,7 +153,11 @@ pub async fn download_images( }) .enumerate() .map(|(img_idx, (url, req))| async move { - bar.set_message(format!("Downloading images [{}/{}]", img_idx + 1, img_count).as_str()); + bar.set_message(format!( + "Downloading images [{}/{}]", + img_idx + 1, + img_count + )); match req.await { Ok(mut img_response) => { let process_response = @@ -234,9 +238,9 @@ fn get_absolute_url(url: &str, request_url: &Url) -> String { .unwrap() .join(url) .unwrap() - .into_string() + .into() } else { - request_url.join(url).unwrap().into_string() + request_url.join(url).unwrap().into() } } diff --git a/src/logs.rs b/src/logs.rs index 87b5d1b..0feb27f 100644 --- a/src/logs.rs +++ b/src/logs.rs @@ -1,11 +1,9 @@ use colored::*; use comfy_table::presets::UTF8_HORIZONTAL_BORDERS_ONLY; use comfy_table::{Cell, CellAlignment, ContentArrangement, Table}; -use directories::UserDirs; -use flexi_logger::LogSpecBuilder; use log::error; -use crate::{cli::AppConfig, errors::PaperoniError}; +use crate::errors::PaperoniError; pub fn display_summary( initial_article_count: usize, @@ -143,47 +141,6 @@ impl DownloadCount { } } } - -pub fn init_logger(app_config: &AppConfig) { - match UserDirs::new() { - Some(user_dirs) => { - let home_dir = user_dirs.home_dir(); - let paperoni_dir = home_dir.join(".paperoni"); - let log_dir = paperoni_dir.join("logs"); - - let log_spec = LogSpecBuilder::new() - .module("paperoni", app_config.log_level()) - .build(); - let formatted_timestamp = app_config.start_time().format("%Y-%m-%d_%H-%M-%S"); - let mut logger = flexi_logger::Logger::with(log_spec); - - if app_config.is_logging_to_file() && (!paperoni_dir.is_dir() || !log_dir.is_dir()) { - match std::fs::create_dir_all(&log_dir) { - Ok(_) => (), - Err(e) => { - eprintln!("Unable to create paperoni directories on home directory for logging purposes\n{}",e); - std::process::exit(1); - } - }; - } - - if app_config.is_logging_to_file() { - logger = logger - .directory(log_dir) - .discriminant(formatted_timestamp.to_string()) - .suppress_timestamp() - .log_to_file(); - } - - match logger.start() { - Ok(_) => (), - Err(e) => eprintln!("Unable to start logger!\n{}", e), - } - } - None => eprintln!("Unable to get user directories for logging purposes"), - }; -} - #[cfg(test)] mod tests { use super::{short_summary, DownloadCount}; diff --git a/src/main.rs b/src/main.rs index 0f8b34a..fcb0cd4 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,6 +1,8 @@ #[macro_use] extern crate lazy_static; +use std::process::exit; + use async_std::stream; use async_std::task; use comfy_table::presets::{UTF8_FULL, UTF8_HORIZONTAL_BORDERS_ONLY}; @@ -27,9 +29,15 @@ use http::{download_images, fetch_html}; use logs::display_summary; fn main() { - let app_config = 
cli::cli_init(); + let app_config = match cli::AppConfig::init_with_cli() { + Ok(app_config) => app_config, + Err(err) => { + eprintln!("{}", err); + exit(1); + } + }; - if !app_config.urls().is_empty() { + if !app_config.urls.is_empty() { download(app_config); } } @@ -37,10 +45,10 @@ fn main() { fn download(app_config: AppConfig) { let mut errors = Vec::new(); let mut partial_download_count: usize = 0; - let bar = if app_config.can_disable_progress_bar() { + let bar = if app_config.can_disable_progress_bar { ProgressBar::hidden() } else { - let enabled_bar = ProgressBar::new(app_config.urls().len() as u64); + let enabled_bar = ProgressBar::new(app_config.urls.len() as u64); let style = ProgressStyle::default_bar().template( "{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} link {pos}/{len:7} {msg:.yellow/white}", ); @@ -49,8 +57,8 @@ fn download(app_config: AppConfig) { enabled_bar }; let articles = task::block_on(async { - let urls_iter = app_config.urls().iter().map(|url| fetch_html(url)); - let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn()); + let urls_iter = app_config.urls.iter().map(|url| fetch_html(url)); + let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn); let mut articles = Vec::new(); while let Some(fetch_result) = responses.next().await { match fetch_result { @@ -109,15 +117,15 @@ fn download(app_config: AppConfig) { }; let has_errors = !errors.is_empty(); display_summary( - app_config.urls().len(), + app_config.urls.len(), succesful_articles_table, partial_download_count, errors, ); - if app_config.is_logging_to_file() { + if app_config.is_logging_to_file { println!( "Log written to paperoni_{}.log\n", - app_config.start_time().format("%Y-%m-%d_%H-%M-%S") + app_config.start_time.format("%Y-%m-%d_%H-%M-%S") ); } if has_errors { From fd161455b4cb03e03c52f25d4bbf736723179a3c Mon Sep 17 00:00:00 2001 From: Philip Wrenn Date: Sat, 5 Jun 2021 23:17:55 -0400 Subject: [PATCH 04/17] Removed unwrap to prevent unexpected panic. --- src/moz_readability/mod.rs | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/moz_readability/mod.rs b/src/moz_readability/mod.rs index 705fa55..7549f24 100644 --- a/src/moz_readability/mod.rs +++ b/src/moz_readability/mod.rs @@ -1609,13 +1609,8 @@ impl Readability { // // class name "comment", etc), and turn divs into P tags where they have been // // used inappropriately (as in, where they contain no other block level elements.) 
let mut elements_to_score: Vec = Vec::new(); - let mut node = Some( - self.root_node - .select_first("html") - .unwrap() - .as_node() - .clone(), - ); + let mut node = self.root_node.select_first("html") + .ok().map(|n| n.as_node().clone()); while let Some(node_ref) = node { let node_elem = node_ref.as_element().unwrap(); From 8220cf29f718e103b9a7d8a47967dd862dbda4d3 Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Sun, 6 Jun 2021 10:10:00 +0300 Subject: [PATCH 05/17] Change function `replace_metadata_value` to `replace_escaped_characters` --- src/epub.rs | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/src/epub.rs b/src/epub.rs index 5e5d377..6c2d5f3 100644 --- a/src/epub.rs +++ b/src/epub.rs @@ -71,7 +71,7 @@ pub fn generate_epubs( let section_name = article.metadata().title(); let content_url = format!("article_{}.xhtml", idx); let mut content = EpubContent::new(&content_url, xhtml_str.as_bytes()) - .title(replace_metadata_value(section_name)); + .title(replace_escaped_characters(section_name)); let header_level_tocs = get_header_level_toc_vec(&content_url, article.article()); @@ -79,7 +79,7 @@ pub fn generate_epubs( content = content.child(toc_element); } - epub.metadata("title", replace_metadata_value(name))?; + epub.metadata("title", replace_escaped_characters(name))?; epub.add_content(content)?; info!("Adding images for {:?}", name); article.img_urls.iter().for_each(|img| { @@ -109,7 +109,7 @@ pub fn generate_epubs( let appendix = generate_appendix(articles.iter().collect()); if let Err(err) = epub.add_content( EpubContent::new("appendix.xhtml", appendix.as_bytes()) - .title(replace_metadata_value("Article Sources")), + .title(replace_escaped_characters("Article Sources")), ) { let mut paperoni_err: PaperoniError = err.into(); paperoni_err.set_article_source(name); @@ -161,9 +161,9 @@ pub fn generate_epubs( get_header_level_toc_vec("index.xhtml", article.article()); if let Some(author) = article.metadata().byline() { - epub.metadata("author", replace_metadata_value(author))?; + epub.metadata("author", replace_escaped_characters(author))?; } - let title = replace_metadata_value(article.metadata().title()); + let title = replace_escaped_characters(article.metadata().title()); epub.metadata("title", &title)?; let mut content = @@ -189,7 +189,7 @@ pub fn generate_epubs( let appendix = generate_appendix(vec![&article]); epub.add_content( EpubContent::new("appendix.xhtml", appendix.as_bytes()) - .title(replace_metadata_value("Article Source")), + .title(replace_escaped_characters("Article Source")), )?; epub.generate(&mut out_file)?; bar.inc(1); @@ -216,7 +216,7 @@ pub fn generate_epubs( } /// Replaces characters that have to be escaped before adding to the epub's metadata -fn replace_metadata_value(value: &str) -> String { +fn replace_escaped_characters(value: &str) -> String { value .replace("&", "&") .replace("<", "<") @@ -235,8 +235,8 @@ fn generate_appendix(articles: Vec<&Extractor>) -> String { }; format!( "{}

", - replace_metadata_value(&article.url), - replace_metadata_value(article_name) + replace_escaped_characters(&article.url), + replace_escaped_characters(article_name) ) }) .collect(); @@ -300,20 +300,22 @@ fn get_header_level_toc_vec(content_url: &str, article: &NodeRef) -> Vec Date: Sun, 6 Jun 2021 13:01:57 +0300 Subject: [PATCH 06/17] Add `generate_header_ids` function Add h4 to header level ToC and update implementation Add tests --- src/epub.rs | 200 ++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 179 insertions(+), 21 deletions(-) diff --git a/src/epub.rs b/src/epub.rs index 6c2d5f3..9797342 100644 --- a/src/epub.rs +++ b/src/epub.rs @@ -254,55 +254,80 @@ fn generate_appendix(articles: Vec<&Extractor>) -> String { template } +/// Adds an id attribute to header elements and assigns a value based on +/// the hash of the text content. Headers with id attributes are not modified. +/// The headers here are known to have text because the grabbed article from +/// readability removes headers with no text. +fn generate_header_ids(root_node: &NodeRef) { + let headers = root_node + .select("h1, h2, h3, h4") + .expect("Unable to create selector for headings"); + let headers_no_id = headers.filter(|node_data_ref| { + let attrs = node_data_ref.attributes.borrow(); + !attrs.contains("id") + }); + for header in headers_no_id { + let mut attrs = header.attributes.borrow_mut(); + let text = header.text_contents(); + // The value of the id begins with an underscore because the hexadecimal + // digest might start with a number which would make it an invalid id + // when querying with selectors + let value = format!("_{:x}", md5::compute(text)); + attrs.insert("id", value); + } +} + /// Returns a vector of `TocElement` from a NodeRef used for adding to the Table of Contents for navigation fn get_header_level_toc_vec(content_url: &str, article: &NodeRef) -> Vec { - // TODO: Test this + generate_header_ids(article); + let mut headers_vec = Vec::new(); let mut header_levels = HashMap::new(); header_levels.insert("h1", 1); header_levels.insert("h2", 2); header_levels.insert("h3", 3); + header_levels.insert("h4", 4); let headings = article - .select("h1, h2, h3") + .select("h1, h2, h3, h4") .expect("Unable to create selector for headings"); - let mut prev_toc: Option = None; + let mut last_toc_elem_level: Option = None; for heading in headings { // TODO: Create a new function that adds an id attribute to heading tags before this function is called let elem_attrs = heading.attributes.borrow(); let elem_name: &str = &heading.name.local; - let id = elem_attrs - .get("id") - .map(|val| val.to_string()) - .unwrap_or(heading.text_contents().replace(" ", "-")); - let toc = TocElement::new(format!("{}#{}", content_url, id), heading.text_contents()) - .level(header_levels[elem_name]); - if let Some(prev_toc_element) = prev_toc { - if prev_toc_element.level <= toc.level { - headers_vec.push(prev_toc_element); - prev_toc = Some(toc); + let elem_level = header_levels[elem_name]; + let id = elem_attrs.get("id").map(|val| val.to_string()).unwrap(); + let toc = TocElement::new( + format!("{}#{}", content_url, id), + replace_escaped_characters(&heading.text_contents()), + ); + + if let Some(last_elem_level) = last_toc_elem_level { + if elem_level <= last_elem_level { + last_toc_elem_level = Some(elem_level); + headers_vec.push(toc); } else { - prev_toc = Some(prev_toc_element.child(toc)) + match headers_vec.last_mut() { + Some(toc_elem) => *toc_elem = toc_elem.clone().child(toc), + _ => unreachable!(), 
+ } } } else { - prev_toc = Some(toc); + last_toc_elem_level = Some(elem_level); + headers_vec.push(toc); } } - - if let Some(toc_element) = prev_toc { - headers_vec.push(toc_element); - } - headers_vec } #[cfg(test)] mod test { use kuchiki::traits::*; - use super::{get_header_level_toc_vec, replace_escaped_characters}; + use super::{generate_header_ids, get_header_level_toc_vec, replace_escaped_characters}; #[test] fn test_replace_escaped_characters() { @@ -319,4 +344,137 @@ mod test { "Author Name <author@mail.example>" ); } + + #[test] + fn test_generate_header_ids() { + let html_str = r#" + + + +

+                <h1>Heading 1</h1>
+                <h2>Heading 2</h2>
+                <h2 id="heading-2-again">Heading 2 again</h2>
+                <h4>Heading 4</h4>
+                <h1>Heading 1 again</h1>
+                <h3>Heading 3</h3>
+ + + "#; + let doc = kuchiki::parse_html().one(html_str); + generate_header_ids(&doc); + + let mut headers = doc.select("h1, h2, h3, h4").unwrap(); + let all_headers_have_ids = headers.all(|node_data_ref| { + let attrs = node_data_ref.attributes.borrow(); + if let Some(id) = attrs.get("id") { + !id.trim().is_empty() + } else { + false + } + }); + assert_eq!(true, all_headers_have_ids); + + let selector = format!("h1#_{:x}", md5::compute("Heading 1")); + assert_eq!(true, doc.select_first(&selector).is_ok()); + + let selector = format!("h1#_{:x}", md5::compute("Heading 1 again")); + assert_eq!(true, doc.select_first(&selector).is_ok()); + + let selector = "h2#heading-2-again"; + assert_eq!(true, doc.select_first(selector).is_ok()); + } + + #[test] + fn test_get_header_level_toc_vec() { + // NOTE: Due to `TocElement` not implementing PartialEq, the tests here + // will need to be manually written to cover for this + let html_str = r#" + + + +

+                <p>Lorem ipsum</p>
+ + + "#; + let doc = kuchiki::parse_html().one(html_str); + + let toc_vec = get_header_level_toc_vec("index.xhtml", &doc); + assert_eq!(0, toc_vec.len()); + + let html_str = r#" + + + +

+                <h1>Heading 1</h1>
+                <p>Lorem ipsum</p>
+                <div>
+                    <h2>Heading 2</h2>
+                    <p>Lorem ipsum</p>
+                    <p>Lorem ipsum</p>
+                </div>
+                <h3>Subheading 3</h3>
+                <p>Lorem ipsum</p>
+                <h1>Second Heading 1</h1>
+                <p>Lorem ipsum</p>
+ + + "#; + let doc = kuchiki::parse_html().one(html_str); + + let toc_vec = get_header_level_toc_vec("index.xhtml", &doc); + assert_eq!(2, toc_vec.len()); + + let first_h1_toc = toc_vec.first().unwrap(); + assert_eq!("Heading 1", first_h1_toc.title); + assert_eq!(1, first_h1_toc.children.len()); + + let h2_toc = first_h1_toc.children.first().unwrap(); + assert_eq!("Heading 2", h2_toc.title); + assert_eq!(1, h2_toc.children.len()); + + let h3_toc = h2_toc.children.first().unwrap(); + assert_eq!("Subheading 3", h3_toc.title); + assert_eq!(0, h3_toc.children.len()); + + let last_h1_toc = toc_vec.last().unwrap(); + assert_eq!("Second Heading 1", last_h1_toc.title); + assert_eq!(0, last_h1_toc.children.len()); + + let html_str = r#" + + + +

+                <h1>Heading 1</h1>
+                <p>Lorem ipsum</p>
+                <div>
+                    <h2>Heading 2</h2>
+                    <p>Lorem ipsum</p>
+                    <p>Lorem ipsum</p>
+                    <h3>Subheading 3</h3>
+                    <p>Lorem ipsum</p>
+                </div>
+                <h2>Heading 2</h2>
+                <p>Lorem ipsum</p>
+                <h4>Subheading 4</h4>
+                <h2>Conclusion</h2>
+ + + "#; + let doc = kuchiki::parse_html().one(html_str); + + let toc_vec = get_header_level_toc_vec("index.xhtml", &doc); + assert_eq!(1, toc_vec.len()); + + let h1_toc = toc_vec.first().unwrap(); + assert_eq!("Heading 1", h1_toc.title); + assert_eq!(3, h1_toc.children.len()); + + let first_h2_toc = h1_toc.children.first().unwrap(); + assert_eq!("Heading 2", first_h2_toc.title); + assert_eq!(1, first_h2_toc.children.len()); + + let h3_toc = first_h2_toc.children.first().unwrap(); + assert_eq!("Subheading 3", h3_toc.title); + assert_eq!(0, h3_toc.children.len()); + } } From aa9258e12205f5d8a99a83a6d7b9d5b7eafbcf18 Mon Sep 17 00:00:00 2001 From: Mikhail Gorbachev Date: Sun, 6 Jun 2021 13:20:08 +0300 Subject: [PATCH 07/17] Fix from PR#15 - refactor comments - move `cli::Error` to `errors::ErrorCli` - removed mixing of order of input urls - move pure functionality if `init_logger` to clear function --- README.md | 36 ++++++++++++---- src/cli.rs | 113 +++++++++++++------------------------------------- src/errors.rs | 31 ++++++++++++++ src/logs.rs | 46 ++++++++++++++++++++ 4 files changed, 133 insertions(+), 93 deletions(-) diff --git a/README.md b/README.md index aa94d06..873f95a 100644 --- a/README.md +++ b/README.md @@ -48,23 +48,41 @@ USAGE: paperoni [OPTIONS] [urls]... OPTIONS: - -f, --file Input file containing links - -h, --help Prints help information + -f, --file + Input file containing links + + -h, --help + Prints help information + --log-to-file Enables logging of events to a file located in .paperoni/logs with a default log level of debug. Use -v to specify the logging level --max_conn - The maximum number of concurrent HTTP connections when downloading articles. Default is 8 + The maximum number of concurrent HTTP connections when downloading articles. Default is 8. + NOTE: It is advised to use as few connections as needed i.e between 1 and 50. Using more connections can end + up overloading your network card with too many concurrent requests. + -o, --output_directory + Directory for saving epub documents + + --merge + Merge multiple articles into a single epub that will be given the name provided + + -V, --version + Prints version information - -o, --output_directory Directory for store output epub documents - --merge Merge multiple articles into a single epub - -V, --version Prints version information -v - Enables logging of events and set the verbosity level. Use --help to read on its usage - + This takes upto 4 levels of verbosity in the following order. + - Error (-v) + - Warn (-vv) + - Info (-vvv) + - Debug (-vvvv) + When this flag is passed, it disables the progress bars and logs to stderr. + If you would like to send the logs to a file (and enable progress bars), pass the log-to-file flag. ARGS: - ... Urls of web articles + ... 
+ Urls of web articles + ``` To download a single article pass in its URL diff --git a/src/cli.rs b/src/cli.rs index 63c4d89..5827f56 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -1,14 +1,10 @@ -use std::{ - collections::HashSet, - num::{NonZeroUsize, ParseIntError}, - path::Path, -}; +use std::{collections::BTreeSet, fs, num::NonZeroUsize, path::Path}; use chrono::{DateTime, Local}; use clap::{App, AppSettings, Arg, ArgMatches}; -use flexi_logger::{FlexiLoggerError, LevelFilter as LogLevel}; -use std::fs; -use thiserror::Error; +use flexi_logger::LevelFilter as LogLevel; + +type Error = crate::errors::CliError; const DEFAULT_MAX_CONN: usize = 8; @@ -26,28 +22,6 @@ pub struct AppConfig { pub is_logging_to_file: bool, } -#[derive(Debug, Error)] -pub enum Error { - #[error("Failed to open file with urls: {0}")] - UrlFileError(#[from] std::io::Error), - #[error("Failed to parse max connection value: {0}")] - InvalidMaxConnectionCount(#[from] ParseIntError), - #[error("No urls for parse")] - NoUrls, - #[error("No urls for parse")] - AppBuildError(#[from] AppConfigBuilderError), - #[error("Invalid output path name for merged epubs: {0}")] - InvalidOutputPath(String), - #[error("Log error: {0}")] - LogDirectoryError(String), - #[error(transparent)] - LogError(#[from] FlexiLoggerError), - #[error("Wrong output directory")] - WrongOutputDirectory, - #[error("Output directory not exists")] - OutputDirectoryNotExists, -} - impl AppConfig { pub fn init_with_cli() -> Result { let app = App::new("paperoni") @@ -73,11 +47,10 @@ impl AppConfig { ) .arg( Arg::with_name("output_directory") - .long("output_directory") + .long("output-directory") .short("o") - .help("Directory for store output epub documents") + .help("Directory to store output epub documents") .conflicts_with("output_name") - .long_help("Directory for saving epub documents") .takes_value(true), ) .arg( @@ -128,40 +101,10 @@ impl AppConfig { } fn init_logger(self) -> Result { - use directories::UserDirs; - use flexi_logger::LogSpecBuilder; - - match UserDirs::new() { - Some(user_dirs) => { - let home_dir = user_dirs.home_dir(); - let paperoni_dir = home_dir.join(".paperoni"); - let log_dir = paperoni_dir.join("logs"); - - let log_spec = LogSpecBuilder::new() - .module("paperoni", self.log_level) - .build(); - let formatted_timestamp = self.start_time.format("%Y-%m-%d_%H-%M-%S"); - let mut logger = flexi_logger::Logger::with(log_spec); - - if self.is_logging_to_file && (!paperoni_dir.is_dir() || !log_dir.is_dir()) { - if let Err(e) = fs::create_dir_all(&log_dir) { - return Err(Error::LogDirectoryError(format!("Unable to create paperoni directories on home directory for logging purposes\n{}",e))); - } - } - if self.is_logging_to_file { - logger = logger - .directory(log_dir) - .discriminant(formatted_timestamp.to_string()) - .suppress_timestamp() - .log_to_file(); - } - logger.start()?; - Ok(self) - } - None => Err(Error::LogDirectoryError( - "Unable to get user directories for logging purposes".to_string(), - )), - } + use crate::logs; + logs::init_logger(self.log_level, &self.start_time, self.is_logging_to_file) + .map(|_| self) + .map_err(Error::LogError) } } @@ -181,21 +124,20 @@ impl<'a> TryFrom> for AppConfig { None } }; - match ( - arg_matches - .values_of("urls") - .and_then(|urls| urls.map(url_filter).collect::>>()), - arg_matches - .value_of("file") - .map(fs::read_to_string) - .transpose()? 
- .and_then(|content| { - content - .lines() - .map(url_filter) - .collect::>>() - }), - ) { + let direct_urls = arg_matches + .values_of("urls") + .and_then(|urls| urls.map(url_filter).collect::>>()); + let file_urls = arg_matches + .value_of("file") + .map(fs::read_to_string) + .transpose()? + .and_then(|content| { + content + .lines() + .map(url_filter) + .collect::>>() + }); + match (direct_urls, file_urls) { (Some(direct_urls), Some(file_urls)) => Ok(direct_urls .union(&file_urls) .map(ToOwned::to_owned) @@ -219,7 +161,7 @@ impl<'a> TryFrom> for AppConfig { 3 => LogLevel::Info, 4..=u64::MAX => LogLevel::Debug, }) - .is_logging_to_file(arg_matches.is_present("log-to_file")) + .is_logging_to_file(arg_matches.is_present("log-to-file")) .output_directory( arg_matches .value_of("output_directory") @@ -242,6 +184,9 @@ impl<'a> TryFrom> for AppConfig { impl AppConfigBuilder { pub fn try_init(&self) -> Result { - self.build()?.init_logger()?.init_merge_file() + self.build() + .map_err(Error::AppBuildError)? + .init_logger()? + .init_merge_file() } } diff --git a/src/errors.rs b/src/errors.rs index 84d1535..eb8cbe1 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -1,3 +1,6 @@ +use std::fmt::{Debug, Display}; + +use flexi_logger::FlexiLoggerError; use thiserror::Error; #[derive(Error, Debug)] @@ -124,3 +127,31 @@ impl From for PaperoniError { PaperoniError::with_kind(ErrorKind::UTF8Error(err.to_string())) } } + +#[derive(Debug, Error)] +pub enum LogError { + #[error(transparent)] + FlexiError(#[from] FlexiLoggerError), + #[error("Wrong log directory: {0}")] + LogDirectoryError(String), +} + +#[derive(Debug, Error)] +pub enum CliError { + #[error("Failed to open file with urls: {0}")] + UrlFileError(#[from] std::io::Error), + #[error("Failed to parse max connection value: {0}")] + InvalidMaxConnectionCount(#[from] std::num::ParseIntError), + #[error("No urls were provided")] + NoUrls, + #[error("Failed to build cli application: {0}")] + AppBuildError(BuilderError), + #[error("Invalid output path name for merged epubs: {0}")] + InvalidOutputPath(String), + #[error("Wrong output directory")] + WrongOutputDirectory, + #[error("Output directory not exists")] + OutputDirectoryNotExists, + #[error("Unable to start logger!\n{0}")] + LogError(#[from] LogError), +} diff --git a/src/logs.rs b/src/logs.rs index 0feb27f..e8f89de 100644 --- a/src/logs.rs +++ b/src/logs.rs @@ -1,6 +1,10 @@ +use std::fs; + +use chrono::{DateTime, Local}; use colored::*; use comfy_table::presets::UTF8_HORIZONTAL_BORDERS_ONLY; use comfy_table::{Cell, CellAlignment, ContentArrangement, Table}; +use flexi_logger::LevelFilter; use log::error; use crate::errors::PaperoniError; @@ -141,6 +145,48 @@ impl DownloadCount { } } } + +use crate::errors::LogError as Error; + +pub fn init_logger( + log_level: LevelFilter, + start_time: &DateTime, + is_logging_to_file: bool, +) -> Result<(), Error> { + use directories::UserDirs; + use flexi_logger::LogSpecBuilder; + + match UserDirs::new() { + Some(user_dirs) => { + let home_dir = user_dirs.home_dir(); + let paperoni_dir = home_dir.join(".paperoni"); + let log_dir = paperoni_dir.join("logs"); + + let log_spec = LogSpecBuilder::new().module("paperoni", log_level).build(); + let formatted_timestamp = start_time.format("%Y-%m-%d_%H-%M-%S"); + let mut logger = flexi_logger::Logger::with(log_spec); + + if is_logging_to_file && (!paperoni_dir.is_dir() || !log_dir.is_dir()) { + if let Err(e) = fs::create_dir_all(&log_dir) { + return Err(Error::LogDirectoryError(format!("Unable to create paperoni 
directories on home directory for logging purposes\n{}",e))); + } + } + if is_logging_to_file { + logger = logger + .directory(log_dir) + .discriminant(formatted_timestamp.to_string()) + .suppress_timestamp() + .log_to_file(); + } + logger.start()?; + Ok(()) + } + None => Err(Error::LogDirectoryError( + "Unable to get user directories for logging purposes".to_string(), + )), + } +} + #[cfg(test)] mod tests { use super::{short_summary, DownloadCount}; From 67e86e4d74a745904a422f20a52c53312086f5a1 Mon Sep 17 00:00:00 2001 From: Mikhail Gorbachev Date: Sun, 6 Jun 2021 15:52:30 +0300 Subject: [PATCH 08/17] Refactor `LogError` --- src/errors.rs | 6 ++++-- src/logs.rs | 12 ++++-------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/src/errors.rs b/src/errors.rs index eb8cbe1..a479268 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -132,8 +132,10 @@ impl From for PaperoniError { pub enum LogError { #[error(transparent)] FlexiError(#[from] FlexiLoggerError), - #[error("Wrong log directory: {0}")] - LogDirectoryError(String), + #[error("Unable to get user directories for logging purposes")] + UserDirectoriesError, + #[error("Can't create log directory: {0}")] + CreateLogDirectoryError(#[from] std::io::Error), } #[derive(Debug, Error)] diff --git a/src/logs.rs b/src/logs.rs index e8f89de..526921d 100644 --- a/src/logs.rs +++ b/src/logs.rs @@ -166,12 +166,10 @@ pub fn init_logger( let formatted_timestamp = start_time.format("%Y-%m-%d_%H-%M-%S"); let mut logger = flexi_logger::Logger::with(log_spec); - if is_logging_to_file && (!paperoni_dir.is_dir() || !log_dir.is_dir()) { - if let Err(e) = fs::create_dir_all(&log_dir) { - return Err(Error::LogDirectoryError(format!("Unable to create paperoni directories on home directory for logging purposes\n{}",e))); - } - } if is_logging_to_file { + if !paperoni_dir.is_dir() || !log_dir.is_dir() { + fs::create_dir_all(&log_dir)?; + } logger = logger .directory(log_dir) .discriminant(formatted_timestamp.to_string()) @@ -181,9 +179,7 @@ pub fn init_logger( logger.start()?; Ok(()) } - None => Err(Error::LogDirectoryError( - "Unable to get user directories for logging purposes".to_string(), - )), + None => Err(Error::UserDirectoriesError), } } From 5b41e785b8812889d36a14a7219063656b47fdd7 Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Mon, 7 Jun 2021 22:42:14 +0300 Subject: [PATCH 09/17] Fix `get_header_level_toc_vec` --- src/epub.rs | 99 ++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 76 insertions(+), 23 deletions(-) diff --git a/src/epub.rs b/src/epub.rs index 9797342..d84a403 100644 --- a/src/epub.rs +++ b/src/epub.rs @@ -279,48 +279,101 @@ fn generate_header_ids(root_node: &NodeRef) { /// Returns a vector of `TocElement` from a NodeRef used for adding to the Table of Contents for navigation fn get_header_level_toc_vec(content_url: &str, article: &NodeRef) -> Vec { - generate_header_ids(article); + // Depth starts from 1 + const HEADER_LEVEL_MAX_DEPTH: usize = 4; + let mut headers_vec: Vec = Vec::new(); - let mut headers_vec = Vec::new(); - - let mut header_levels = HashMap::new(); + let mut header_levels = HashMap::with_capacity(HEADER_LEVEL_MAX_DEPTH); header_levels.insert("h1", 1); header_levels.insert("h2", 2); header_levels.insert("h3", 3); header_levels.insert("h4", 4); + generate_header_ids(article); + let headings = article .select("h1, h2, h3, h4") .expect("Unable to create selector for headings"); - let mut last_toc_elem_level: Option = None; + // The header list will be generated using some sort of 
backtracking algorithm + // There will be a stack of maximum size 4 (since it only goes to h4 now) + let mut stack: Vec> = std::iter::repeat(None) + .take(HEADER_LEVEL_MAX_DEPTH) + .collect::<_>(); for heading in headings { - // TODO: Create a new function that adds an id attribute to heading tags before this function is called - let elem_attrs = heading.attributes.borrow(); let elem_name: &str = &heading.name.local; - let elem_level = header_levels[elem_name]; - let id = elem_attrs.get("id").map(|val| val.to_string()).unwrap(); - let toc = TocElement::new( - format!("{}#{}", content_url, id), - replace_escaped_characters(&heading.text_contents()), - ); + let attrs = heading.attributes.borrow(); + let id = attrs + .get("id") + .map(ToOwned::to_owned) + .expect("Unable to get id value in get_header_level_toc_vec"); + let url = format!("{}#{}", content_url, id); - if let Some(last_elem_level) = last_toc_elem_level { - if elem_level <= last_elem_level { - last_toc_elem_level = Some(elem_level); - headers_vec.push(toc); + let level = header_levels[elem_name]; + let index = level - 1; + + if let Some(mut existing_toc) = stack.get_mut(index).take().cloned().flatten() { + // If a toc element already exists at that header level, consume all the toc elements + // of a lower hierarchy e.g if the existing toc is a h2, then the h3 and h4 in the stack + // will be consumed. + // We collapse the children by folding from the right to the left of the stack. + let descendants_levels = HEADER_LEVEL_MAX_DEPTH - level; + let folded_descendants = stack + .iter_mut() + .rev() + .take(descendants_levels) + .map(|toc_elem| toc_elem.take()) + .filter(|toc_elem| toc_elem.is_some()) + .map(|toc_elem| toc_elem.unwrap()) + .reduce(|child, parent| parent.child(child)); + + if let Some(child) = folded_descendants { + existing_toc = existing_toc.child(child); + }; + + // Find the nearest ancestor to embed into. + // If this toc_elem was a h1, then just add it to the headers_vec + if index == 0 { + headers_vec.push(existing_toc); } else { - match headers_vec.last_mut() { - Some(toc_elem) => *toc_elem = toc_elem.clone().child(toc), - _ => unreachable!(), + // Otherwise, find the nearest ancestor to add it to. 
If none exists, add it to the headers_vec + let first_ancestor = stack + .iter_mut() + .take(level - 1) + .map(|toc_elem| toc_elem.as_mut()) + .rfind(|toc_elem| toc_elem.is_some()) + .flatten(); + + match first_ancestor { + Some(ancestor_toc_elem) => { + *ancestor_toc_elem = ancestor_toc_elem.clone().child(existing_toc); + } + None => { + headers_vec.push(existing_toc); + } } } - } else { - last_toc_elem_level = Some(elem_level); - headers_vec.push(toc); + } + + if let Some(toc_elem) = stack.get_mut(index) { + *toc_elem = Some(TocElement::new( + url, + replace_escaped_characters(&heading.text_contents()), + )); } } + + let folded_stack = stack + .into_iter() + .rev() + .filter(|toc_elem| toc_elem.is_some()) + .map(|opt_toc_elem| opt_toc_elem.unwrap()) + .reduce(|child, parent| parent.child(child)); + if let Some(toc_elem) = folded_stack { + headers_vec.push(toc_elem) + } + headers_vec } #[cfg(test)] From 5fbfb9c8062123a551f037f5069e5f9ee63fb9c2 Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Tue, 8 Jun 2021 07:42:30 +0300 Subject: [PATCH 10/17] refactor: move download function to http module feat: add rendering of table for partial downloads feat: add help message for enabling --log-to-file chore: format flags to kebab-case and shorten --output-directory flag --- README.md | 14 +++++----- src/cli.rs | 6 ++--- src/http.rs | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++ src/logs.rs | 22 +++++++++++++++- src/main.rs | 74 ++++++++++------------------------------------------- 5 files changed, 117 insertions(+), 71 deletions(-) diff --git a/README.md b/README.md index 873f95a..f38c741 100644 --- a/README.md +++ b/README.md @@ -48,26 +48,26 @@ USAGE: paperoni [OPTIONS] [urls]... OPTIONS: - -f, --file + -f, --file Input file containing links - -h, --help + -h, --help Prints help information --log-to-file Enables logging of events to a file located in .paperoni/logs with a default log level of debug. Use -v to specify the logging level - --max_conn + --max-conn The maximum number of concurrent HTTP connections when downloading articles. Default is 8. NOTE: It is advised to use as few connections as needed i.e between 1 and 50. Using more connections can end up overloading your network card with too many concurrent requests. - -o, --output_directory + -o, --output-dir Directory for saving epub documents - --merge + --merge Merge multiple articles into a single epub that will be given the name provided - -V, --version + -V, --version Prints version information -v @@ -80,7 +80,7 @@ OPTIONS: If you would like to send the logs to a file (and enable progress bars), pass the log-to-file flag. ARGS: - ... + ... Urls of web articles ``` diff --git a/src/cli.rs b/src/cli.rs index 5827f56..eb6c610 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -47,7 +47,7 @@ impl AppConfig { ) .arg( Arg::with_name("output_directory") - .long("output-directory") + .long("output-dir") .short("o") .help("Directory to store output epub documents") .conflicts_with("output_name") @@ -61,7 +61,7 @@ impl AppConfig { .conflicts_with("output_directory") .takes_value(true), ).arg( - Arg::with_name("max_conn") + Arg::with_name("max-conn") .long("max_conn") .help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8") .long_help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8.\nNOTE: It is advised to use as few connections as needed i.e between 1 and 50. 
Using more connections can end up overloading your network card with too many concurrent requests.") @@ -146,7 +146,7 @@ impl<'a> TryFrom> for AppConfig { (None, None) => Err(Error::NoUrls), } }?) - .max_conn(match arg_matches.value_of("max_conn") { + .max_conn(match arg_matches.value_of("max-conn") { Some(max_conn) => max_conn.parse::()?.get(), None => DEFAULT_MAX_CONN, }) diff --git a/src/http.rs b/src/http.rs index 148fab0..8707977 100644 --- a/src/http.rs +++ b/src/http.rs @@ -1,14 +1,72 @@ use async_std::io::prelude::*; +use async_std::task; use async_std::{fs::File, stream}; use futures::StreamExt; use indicatif::ProgressBar; +use log::warn; use log::{debug, info}; use url::Url; +use crate::cli::AppConfig; use crate::errors::{ErrorKind, ImgError, PaperoniError}; use crate::extractor::Extractor; type HTMLResource = (String, String); +pub fn download( + app_config: &AppConfig, + bar: &ProgressBar, + partial_downloads: &mut Vec, + errors: &mut Vec, +) -> Vec { + task::block_on(async { + let urls_iter = app_config.urls.iter().map(|url| fetch_html(url)); + let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn); + let mut articles = Vec::new(); + while let Some(fetch_result) = responses.next().await { + match fetch_result { + Ok((url, html)) => { + debug!("Extracting {}", &url); + let mut extractor = Extractor::from_html(&html, &url); + bar.set_message("Extracting..."); + match extractor.extract_content() { + Ok(_) => { + extractor.extract_img_urls(); + if let Err(img_errors) = + download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar) + .await + { + partial_downloads + .push(PartialDownload::new(&url, extractor.metadata().title())); + warn!( + "{} image{} failed to download for {}", + img_errors.len(), + if img_errors.len() > 1 { "s" } else { "" }, + url + ); + for img_error in img_errors { + warn!( + "{}\n\t\tReason {}", + img_error.url().as_ref().unwrap(), + img_error + ); + } + } + articles.push(extractor); + } + Err(mut e) => { + e.set_article_source(&url); + errors.push(e); + } + } + } + Err(e) => errors.push(e), + } + bar.inc(1); + } + articles + }) +} + pub async fn fetch_html(url: &str) -> Result { let client = surf::Client::new(); debug!("Fetching {}", url); @@ -210,6 +268,20 @@ pub async fn download_images( } } +pub struct PartialDownload { + pub link: String, + pub title: String, +} + +impl PartialDownload { + pub fn new(link: &str, title: &str) -> Self { + Self { + link: link.into(), + title: title.into(), + } + } +} + /// Handles getting the extension from a given MIME subtype. 
fn map_mime_subtype_to_ext(subtype: &str) -> &str { if subtype == ("svg+xml") { diff --git a/src/logs.rs b/src/logs.rs index 61e7bc2..722c131 100644 --- a/src/logs.rs +++ b/src/logs.rs @@ -12,9 +12,10 @@ use crate::errors::PaperoniError; pub fn display_summary( initial_article_count: usize, succesful_articles_table: Table, - partial_downloads_count: usize, + partial_downloads: Vec, errors: Vec, ) { + let partial_downloads_count = partial_downloads.len(); let successfully_downloaded_count = initial_article_count - partial_downloads_count - errors.len(); @@ -32,6 +33,24 @@ pub fn display_summary( if successfully_downloaded_count > 0 { println!("{}", succesful_articles_table); } + + if partial_downloads_count > 0 { + println!("\n{}", "Partially failed downloads".yellow().bold()); + let mut table_partial = Table::new(); + table_partial + .load_preset(UTF8_HORIZONTAL_BORDERS_ONLY) + .set_header(vec![ + Cell::new("Link").set_alignment(CellAlignment::Center), + Cell::new("Title").set_alignment(CellAlignment::Center), + ]) + .set_content_arrangement(ContentArrangement::Dynamic); + + for partial in partial_downloads { + table_partial.add_row(vec![&partial.link, &partial.title]); + } + println!("{}", table_partial); + } + if !errors.is_empty() { println!("\n{}", "Failed article downloads".bright_red().bold()); let mut table_failed = Table::new(); @@ -126,6 +145,7 @@ impl DownloadCount { } use crate::errors::LogError as Error; +use crate::http::PartialDownload; pub fn init_logger( log_level: LevelFilter, diff --git a/src/main.rs b/src/main.rs index fcb0cd4..dc4787d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -3,14 +3,10 @@ extern crate lazy_static; use std::process::exit; -use async_std::stream; -use async_std::task; use comfy_table::presets::{UTF8_FULL, UTF8_HORIZONTAL_BORDERS_ONLY}; use comfy_table::{ContentArrangement, Table}; -use futures::stream::StreamExt; +use http::download; use indicatif::{ProgressBar, ProgressStyle}; -use log::{debug, warn}; -use url::Url; mod cli; mod epub; @@ -24,8 +20,6 @@ mod moz_readability; use cli::AppConfig; use epub::generate_epubs; -use extractor::Extractor; -use http::{download_images, fetch_html}; use logs::display_summary; fn main() { @@ -38,70 +32,25 @@ fn main() { }; if !app_config.urls.is_empty() { - download(app_config); + run(app_config); } } -fn download(app_config: AppConfig) { +fn run(app_config: AppConfig) { let mut errors = Vec::new(); - let mut partial_download_count: usize = 0; + let mut partial_downloads = Vec::new(); let bar = if app_config.can_disable_progress_bar { ProgressBar::hidden() } else { let enabled_bar = ProgressBar::new(app_config.urls.len() as u64); let style = ProgressStyle::default_bar().template( - "{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} link {pos}/{len:7} {msg:.yellow/white}", - ); + "{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} link {pos}/{len:7} {msg:.yellow/white}", + ); enabled_bar.set_style(style); enabled_bar.enable_steady_tick(500); enabled_bar }; - let articles = task::block_on(async { - let urls_iter = app_config.urls.iter().map(|url| fetch_html(url)); - let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn); - let mut articles = Vec::new(); - while let Some(fetch_result) = responses.next().await { - match fetch_result { - Ok((url, html)) => { - debug!("Extracting {}", &url); - let mut extractor = Extractor::from_html(&html, &url); - bar.set_message("Extracting..."); - match extractor.extract_content() { - Ok(_) => { - extractor.extract_img_urls(); - if let 
Err(img_errors) = - download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar) - .await - { - partial_download_count += 1; - warn!( - "{} image{} failed to download for {}", - img_errors.len(), - if img_errors.len() > 1 { "s" } else { "" }, - url - ); - for img_error in img_errors { - warn!( - "{}\n\t\tReason {}", - img_error.url().as_ref().unwrap(), - img_error - ); - } - } - articles.push(extractor); - } - Err(mut e) => { - e.set_article_source(&url); - errors.push(e); - } - } - } - Err(e) => errors.push(e), - } - bar.inc(1); - } - articles - }); + let articles = download(&app_config, &bar, &mut partial_downloads, &mut errors); bar.finish_with_message("Downloaded articles"); let mut succesful_articles_table = Table::new(); @@ -115,19 +64,24 @@ fn download(app_config: AppConfig) { errors.extend(gen_epub_errors); } }; - let has_errors = !errors.is_empty(); + + let has_errors = !errors.is_empty() || !partial_downloads.is_empty(); display_summary( app_config.urls.len(), succesful_articles_table, - partial_download_count, + partial_downloads, errors, ); + if app_config.is_logging_to_file { println!( "Log written to paperoni_{}.log\n", app_config.start_time.format("%Y-%m-%d_%H-%M-%S") ); + } else if has_errors && !app_config.is_logging_to_file { + println!("\nRun paperoni with the --log-to-file flag to create a log file"); } + if has_errors { std::process::exit(1); } From 8691b0166f9d046cb2490b6754834fdf87059732 Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Tue, 8 Jun 2021 20:35:52 +0300 Subject: [PATCH 11/17] fix: fix panic when unwrapping a base URI chore: add message when downloading articles to a specified output-dir --- src/main.rs | 11 +++++++++++ src/moz_readability/mod.rs | 29 +++++++++++++++++++++++------ 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/src/main.rs b/src/main.rs index dc4787d..e378115 100644 --- a/src/main.rs +++ b/src/main.rs @@ -39,6 +39,16 @@ fn main() { fn run(app_config: AppConfig) { let mut errors = Vec::new(); let mut partial_downloads = Vec::new(); + + if let Some(dir_name) = &app_config.output_directory { + let noun = if app_config.urls.len() > 1 { + "articles" + } else { + "article" + }; + println!("Downloading {} to {}", noun, dir_name); + } + let bar = if app_config.can_disable_progress_bar { ProgressBar::hidden() } else { @@ -50,6 +60,7 @@ fn run(app_config: AppConfig) { enabled_bar.enable_steady_tick(500); enabled_bar }; + let articles = download(&app_config, &bar, &mut partial_downloads, &mut errors); bar.finish_with_message("Downloaded articles"); diff --git a/src/moz_readability/mod.rs b/src/moz_readability/mod.rs index 7549f24..0f4fc66 100644 --- a/src/moz_readability/mod.rs +++ b/src/moz_readability/mod.rs @@ -659,10 +659,24 @@ impl Readability { .map(|node_ref| { let node_attrs = node_ref.attributes.borrow(); let href = node_attrs.get("href").unwrap(); - if href.trim() == "/" { - document_uri.join("/").unwrap() - } else { - Url::parse(href).unwrap() + + match Url::parse(href) { + Ok(url) => url, + Err(e) => match e { + url::ParseError::RelativeUrlWithoutBase => { + match document_uri.join(href) { + Ok(joined_url) => joined_url, + Err(e) => panic!( + "{:} unable to parse url {:?} on element {}", + e, href, &node_ref.name.local + ), + } + } + e => panic!( + "{:} unable to parse url {:?} on element {}", + e, href, &node_ref.name.local + ), + }, } }) .next() @@ -1609,8 +1623,11 @@ impl Readability { // // class name "comment", etc), and turn divs into P tags where they have been // // used inappropriately (as in, where they 
contain no other block level elements.) let mut elements_to_score: Vec = Vec::new(); - let mut node = self.root_node.select_first("html") - .ok().map(|n| n.as_node().clone()); + let mut node = self + .root_node + .select_first("html") + .ok() + .map(|n| n.as_node().clone()); while let Some(node_ref) = node { let node_elem = node_ref.as_element().unwrap(); From d50bbdfb580ff97487d2e0fbd3d5f2e8b29a0b71 Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Wed, 9 Jun 2021 07:26:52 +0300 Subject: [PATCH 12/17] fix: minor fixes - restore default debug level when logging to file - return early from generating epubs if there are no articles - fix serialization bug in creating attributes --- src/cli.rs | 8 +++++++- src/epub.rs | 4 ++++ src/extractor.rs | 2 +- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index eb6c610..22cc156 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -155,7 +155,13 @@ impl<'a> TryFrom> for AppConfig { arg_matches.is_present("verbosity") && !arg_matches.is_present("log-to-file"), ) .log_level(match arg_matches.occurrences_of("verbosity") { - 0 => LogLevel::Off, + 0 => { + if !arg_matches.is_present("log-to-file") { + LogLevel::Off + } else { + LogLevel::Debug + } + } 1 => LogLevel::Error, 2 => LogLevel::Warn, 3 => LogLevel::Info, diff --git a/src/epub.rs b/src/epub.rs index 667e2f7..e573260 100644 --- a/src/epub.rs +++ b/src/epub.rs @@ -18,6 +18,10 @@ pub fn generate_epubs( app_config: &AppConfig, successful_articles_table: &mut Table, ) -> Result<(), Vec> { + if articles.is_empty() { + return Ok(()); + } + let bar = if app_config.can_disable_progress_bar { ProgressBar::hidden() } else { diff --git a/src/extractor.rs b/src/extractor.rs index 2cf3f25..110357b 100644 --- a/src/extractor.rs +++ b/src/extractor.rs @@ -103,7 +103,7 @@ pub fn serialize_to_xhtml( let attrs_str = attrs .map .iter() - .filter(|(k, _)| &k.local != "\"") + .filter(|(k, _)| !k.local.contains("\"")) .map(|(k, v)| { format!( "{}=\"{}\"", From 4247fab1eacf91c686d0e278154a7affe7470786 Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Wed, 9 Jun 2021 08:04:50 +0300 Subject: [PATCH 13/17] feat: add css library for EPUB exports --- README.md | 4 ++++ src/assets/writ.min.css | 7 +++++++ src/epub.rs | 23 ++++++++++++++++++++--- src/extractor.rs | 1 + 4 files changed, 32 insertions(+), 3 deletions(-) create mode 100644 src/assets/writ.min.css diff --git a/README.md b/README.md index f38c741..d80bbe7 100644 --- a/README.md +++ b/README.md @@ -118,6 +118,10 @@ into a single epub using the `merge` flag and specifying the output file. paperoni -f links.txt --merge out.epub ``` +### Recommended fonts + +The styling on the EPUB files comes from the [writ.css](https://github.com/causal-agent/writ) library. This uses Palatino as the serif font which you can get online for free. However, you can use whichever serif fonts you have installed. + ### Logging events Logging is disabled by default. This can be activated by either using the `-v` flag or `--log-to-file` flag. If the `--log-to-file` flag is passed the logs are sent to a file in the default Paperoni directory `.paperoni/logs` which is on your home directory. The `-v` flag configures the verbosity levels such that: diff --git a/src/assets/writ.min.css b/src/assets/writ.min.css new file mode 100644 index 0000000..afef597 --- /dev/null +++ b/src/assets/writ.min.css @@ -0,0 +1,7 @@ +/*! 
+ * Writ v1.0.4 + * + * Copyright © 2015, Curtis McEnroe + * + * https://cmcenroe.me/writ/LICENSE (ISC) + */dd,hr,ol ol,ol ul,ul ol,ul ul{margin:0}pre,table{overflow-x:auto}a,ins{text-decoration:none}html{font-family:Palatino,Georgia,Lucida Bright,Book Antiqua,serif;font-size:16px;line-height:1.5rem}code,kbd,pre,samp{font-family:Consolas,Liberation Mono,Menlo,Courier,monospace;font-size:.833rem;color:#111}kbd{font-weight:700}h1,h2,h3,h4,h5,h6,th{font-weight:400}h1{font-size:2.488em}h2{font-size:2.074em}h3{font-size:1.728em}h4{font-size:1.44em}h5{font-size:1.2em}h6{font-size:1em}small{font-size:.833em}h1,h2,h3{line-height:3rem}blockquote,dl,h1,h2,h3,h4,h5,h6,ol,p,pre,table,ul{margin:1.5rem 0 0}pre,table{margin-bottom:-1px}hr{border:none;padding:1.5rem 0 0}table{line-height:calc(1.5rem - 1px);width:100%;border-collapse:collapse}pre{margin-top:calc(1.5rem - 1px)}body{color:#222;margin:1.5rem 1ch}a,a code,header nav a:visited{color:#00e}a:visited,a:visited code{color:#60b}mark{color:inherit;background-color:#fe0}code,pre,samp,tfoot,thead{background-color:rgba(0,0,0,.05)}blockquote,ins,main aside{border:rgba(0,0,0,.05) solid}blockquote,main aside{border-width:0 0 0 .5ch}code,pre,samp{border:rgba(0,0,0,.1) solid}td,th{border:solid #dbdbdb}body>header{text-align:center}body>footer,main{display:block;max-width:78ch;margin:auto}main aside,main figure{float:right;margin:1.5rem 0 0 1ch}main aside{max-width:26ch;padding:0 0 0 .5ch}blockquote{margin-right:3ch;margin-left:1.5ch;padding:0 0 0 1ch}pre{border-width:1px;border-radius:2px;padding:0 .5ch}pre code{border:none;padding:0;background-color:transparent;white-space:inherit}code,ins,samp,td,th{border-width:1px}img{max-width:100%}dd,ol,ul{padding:0 0 0 3ch}ul>li{list-style-type:disc}li ul>li{list-style-type:circle}li li ul>li{list-style-type:square}ol>li{list-style-type:decimal}li ol>li{list-style-type:lower-roman}li li ol>li{list-style-type:lower-alpha}nav ul{padding:0;list-style-type:none}nav ul li{display:inline;padding-left:1ch;white-space:nowrap}nav ul li:first-child{padding-left:0}ins,mark{padding:1px}td,th{padding:0 .5ch}sub,sup{font-size:.75em;line-height:1em}code,samp{border-radius:2px;padding:.1em .2em;white-space:nowrap} diff --git a/src/epub.rs b/src/epub.rs index e573260..6dc28b6 100644 --- a/src/epub.rs +++ b/src/epub.rs @@ -3,9 +3,10 @@ use std::fs::File; use comfy_table::{Attribute, Cell, CellAlignment, Color, ContentArrangement, Table}; use epub_builder::{EpubBuilder, EpubContent, TocElement, ZipLibrary}; +use html5ever::tendril::fmt::Slice; use indicatif::{ProgressBar, ProgressStyle}; use kuchiki::NodeRef; -use log::{debug, info}; +use log::{debug, error, info}; use crate::{ cli::AppConfig, @@ -27,8 +28,8 @@ pub fn generate_epubs( } else { let enabled_bar = ProgressBar::new(articles.len() as u64); let style = ProgressStyle::default_bar().template( - "{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} epub {pos}/{len:7} {msg:.green}", - ); + "{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} epub {pos}/{len:7} {msg:.green}", + ); enabled_bar.set_style(style); if !articles.is_empty() { enabled_bar.set_message("Generating epubs"); @@ -36,6 +37,8 @@ pub fn generate_epubs( enabled_bar }; + let stylesheet = include_bytes!("./assets/writ.min.css"); + let mut errors: Vec = Vec::new(); match app_config.merged { @@ -64,6 +67,16 @@ pub fn generate_epubs( }; debug!("Creating {:?}", name); epub.inline_toc(); + match epub.stylesheet(stylesheet.as_bytes()) { + Ok(_) => (), + Err(e) => { + error!("Unable to add stylesheets to epub 
file"); + let mut paperoni_err: PaperoniError = e.into(); + paperoni_err.set_article_source(name); + errors.push(paperoni_err); + return Err(errors); + } + } articles .iter() .enumerate() @@ -168,6 +181,9 @@ pub fn generate_epubs( if let Some(author) = article.metadata().byline() { epub.metadata("author", replace_escaped_characters(author))?; } + + epub.stylesheet(stylesheet.as_bytes())?; + let title = replace_escaped_characters(article.metadata().title()); epub.metadata("title", &title)?; @@ -248,6 +264,7 @@ fn generate_appendix(articles: Vec<&Extractor>) -> String { let template = format!( r#" +

Appendix
Article sources
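As a side note on how the pieces added in this commit fit together, here is a minimal sketch of how the bundled `writ.min.css` and an article's serialized XHTML end up in a generated EPUB, using the `EpubBuilder`, `EpubContent`, `TocElement`, and `ZipLibrary` types imported in `src/epub.rs` above. The helper name `build_single_article_epub`, the single-article flow, and the `unwrap`-based error handling are illustrative assumptions and not code from this patch.

```rust
use std::fs::File;

use epub_builder::{EpubBuilder, EpubContent, TocElement, ZipLibrary};

/// Illustrative helper (not part of this patch): builds a one-article EPUB
/// roughly the way `generate_epubs` does — embed the bundled stylesheet, then
/// add the serialized XHTML as a content document with its header-level ToC
/// entries attached as children.
fn build_single_article_epub(title: &str, xhtml: &[u8], toc: Vec<TocElement>, out: &mut File) {
    // writ.min.css is compiled into the binary and embedded into every EPUB;
    // epub-builder writes it out as stylesheet.css inside the archive.
    let stylesheet = include_bytes!("./assets/writ.min.css");

    let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
    epub.metadata("title", title).unwrap();
    epub.stylesheet(&stylesheet[..]).unwrap();

    // One XHTML content document per article; child ToC elements give readers
    // header-level navigation inside the chapter.
    let mut content = EpubContent::new("index.xhtml", xhtml).title(title);
    for elem in toc {
        content = content.child(elem);
    }
    epub.add_content(content).unwrap();

    epub.generate(out).unwrap();
}
```

Because epub-builder stores that stylesheet as `stylesheet.css` inside the archive, the article XHTML templates can reference it from their `<head>`, which is what ties this CSS addition to the templates in `epub.rs` and `extractor.rs`.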

diff --git a/src/extractor.rs b/src/extractor.rs index 110357b..ef470d9 100644 --- a/src/extractor.rs +++ b/src/extractor.rs @@ -37,6 +37,7 @@ impl Extractor { let template = r#" + From 282d2297541752e36a16fdca457cd1d559ea02dd Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Thu, 10 Jun 2021 20:16:31 +0300 Subject: [PATCH 14/17] fix: fix ordering issue with merged articles This commit adds the itertools crate which is used to dedup the Vec when downloading urls fix: fix error message feat: change the serif and mono fonts declarations --- Cargo.lock | 16 ++++++++++++++++ Cargo.toml | 1 + README.md | 4 ---- src/assets/writ.min.css | 2 +- src/cli.rs | 31 ++++++++++++++++--------------- src/errors.rs | 2 +- src/extractor.rs | 23 ++++++++++++++--------- 7 files changed, 49 insertions(+), 30 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c2bcea0..e1c29ad 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -758,6 +758,12 @@ dependencies = [ "dtoa", ] +[[package]] +name = "either" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" + [[package]] name = "encode_unicode" version = "0.3.6" @@ -1247,6 +1253,15 @@ dependencies = [ "waker-fn", ] +[[package]] +name = "itertools" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69ddb889f9d0d08a67338271fa9b62996bc788c7796a5c18cf057420aaed5eaf" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "0.4.7" @@ -1550,6 +1565,7 @@ dependencies = [ "futures", "html5ever", "indicatif", + "itertools", "kuchiki", "lazy_static", "log 0.4.14", diff --git a/Cargo.toml b/Cargo.toml index e3f6055..d8305fa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,6 +25,7 @@ flexi_logger = "0.17.1" futures = "0.3.15" html5ever = "0.25.1" indicatif = "0.16.2" +itertools = "0.10.1" kuchiki = "0.8.1" lazy_static = "1.4.0" log = "0.4.14" diff --git a/README.md b/README.md index d80bbe7..f38c741 100644 --- a/README.md +++ b/README.md @@ -118,10 +118,6 @@ into a single epub using the `merge` flag and specifying the output file. paperoni -f links.txt --merge out.epub ``` -### Recommended fonts - -The styling on the EPUB files comes from the [writ.css](https://github.com/causal-agent/writ) library. This uses Palatino as the serif font which you can get online for free. However, you can use whichever serif fonts you have installed. - ### Logging events Logging is disabled by default. This can be activated by either using the `-v` flag or `--log-to-file` flag. If the `--log-to-file` flag is passed the logs are sent to a file in the default Paperoni directory `.paperoni/logs` which is on your home directory. 
The `-v` flag configures the verbosity levels such that: diff --git a/src/assets/writ.min.css b/src/assets/writ.min.css index afef597..1c9c0b4 100644 --- a/src/assets/writ.min.css +++ b/src/assets/writ.min.css @@ -4,4 +4,4 @@ * Copyright © 2015, Curtis McEnroe * * https://cmcenroe.me/writ/LICENSE (ISC) - */dd,hr,ol ol,ol ul,ul ol,ul ul{margin:0}pre,table{overflow-x:auto}a,ins{text-decoration:none}html{font-family:Palatino,Georgia,Lucida Bright,Book Antiqua,serif;font-size:16px;line-height:1.5rem}code,kbd,pre,samp{font-family:Consolas,Liberation Mono,Menlo,Courier,monospace;font-size:.833rem;color:#111}kbd{font-weight:700}h1,h2,h3,h4,h5,h6,th{font-weight:400}h1{font-size:2.488em}h2{font-size:2.074em}h3{font-size:1.728em}h4{font-size:1.44em}h5{font-size:1.2em}h6{font-size:1em}small{font-size:.833em}h1,h2,h3{line-height:3rem}blockquote,dl,h1,h2,h3,h4,h5,h6,ol,p,pre,table,ul{margin:1.5rem 0 0}pre,table{margin-bottom:-1px}hr{border:none;padding:1.5rem 0 0}table{line-height:calc(1.5rem - 1px);width:100%;border-collapse:collapse}pre{margin-top:calc(1.5rem - 1px)}body{color:#222;margin:1.5rem 1ch}a,a code,header nav a:visited{color:#00e}a:visited,a:visited code{color:#60b}mark{color:inherit;background-color:#fe0}code,pre,samp,tfoot,thead{background-color:rgba(0,0,0,.05)}blockquote,ins,main aside{border:rgba(0,0,0,.05) solid}blockquote,main aside{border-width:0 0 0 .5ch}code,pre,samp{border:rgba(0,0,0,.1) solid}td,th{border:solid #dbdbdb}body>header{text-align:center}body>footer,main{display:block;max-width:78ch;margin:auto}main aside,main figure{float:right;margin:1.5rem 0 0 1ch}main aside{max-width:26ch;padding:0 0 0 .5ch}blockquote{margin-right:3ch;margin-left:1.5ch;padding:0 0 0 1ch}pre{border-width:1px;border-radius:2px;padding:0 .5ch}pre code{border:none;padding:0;background-color:transparent;white-space:inherit}code,ins,samp,td,th{border-width:1px}img{max-width:100%}dd,ol,ul{padding:0 0 0 3ch}ul>li{list-style-type:disc}li ul>li{list-style-type:circle}li li ul>li{list-style-type:square}ol>li{list-style-type:decimal}li ol>li{list-style-type:lower-roman}li li ol>li{list-style-type:lower-alpha}nav ul{padding:0;list-style-type:none}nav ul li{display:inline;padding-left:1ch;white-space:nowrap}nav ul li:first-child{padding-left:0}ins,mark{padding:1px}td,th{padding:0 .5ch}sub,sup{font-size:.75em;line-height:1em}code,samp{border-radius:2px;padding:.1em .2em;white-space:nowrap} + */dd,hr,ol ol,ol ul,ul ol,ul ul{margin:0}pre,table{overflow-x:auto}a,ins{text-decoration:none}html{font-family:Georgia,Lucida Bright,Book Antiqua,serif;font-size:16px;line-height:1.5rem}code,kbd,pre,samp{font-family:Fira Code,Liberation Mono,Menlo,Courier,monospace;font-size:.833rem;color:#111}kbd{font-weight:700}h1,h2,h3,h4,h5,h6,th{font-weight:400}h1{font-size:2.488em}h2{font-size:2.074em}h3{font-size:1.728em}h4{font-size:1.44em}h5{font-size:1.2em}h6{font-size:1em}small{font-size:.833em}h1,h2,h3{line-height:3rem}blockquote,dl,h1,h2,h3,h4,h5,h6,ol,p,pre,table,ul{margin:1.5rem 0 0}pre,table{margin-bottom:-1px}hr{border:none;padding:1.5rem 0 0}table{line-height:calc(1.5rem - 1px);width:100%;border-collapse:collapse}pre{margin-top:calc(1.5rem - 1px)}body{color:#222;margin:1.5rem 1ch}a,a code,header nav a:visited{color:#00e}a:visited,a:visited code{color:#60b}mark{color:inherit;background-color:#fe0}code,pre,samp,tfoot,thead{background-color:rgba(0,0,0,.05)}blockquote,ins,main aside{border:rgba(0,0,0,.05) solid}blockquote,main aside{border-width:0 0 0 .5ch}code,pre,samp{border:rgba(0,0,0,.1) solid}td,th{border:solid 
#dbdbdb}body>header{text-align:center}body>footer,main{display:block;max-width:78ch;margin:auto}main aside,main figure{float:right;margin:1.5rem 0 0 1ch}main aside{max-width:26ch;padding:0 0 0 .5ch}blockquote{margin-right:3ch;margin-left:1.5ch;padding:0 0 0 1ch}pre{border-width:1px;border-radius:2px;padding:0 .5ch}pre code{border:none;padding:0;background-color:transparent;white-space:inherit}code,ins,samp,td,th{border-width:1px}img{max-width:100%}dd,ol,ul{padding:0 0 0 3ch}ul>li{list-style-type:disc}li ul>li{list-style-type:circle}li li ul>li{list-style-type:square}ol>li{list-style-type:decimal}li ol>li{list-style-type:lower-roman}li li ol>li{list-style-type:lower-alpha}nav ul{padding:0;list-style-type:none}nav ul li{display:inline;padding-left:1ch;white-space:nowrap}nav ul li:first-child{padding-left:0}ins,mark{padding:1px}td,th{padding:0 .5ch}sub,sup{font-size:.75em;line-height:1em}code,samp{border-radius:2px;padding:.1em .2em;white-space:nowrap} diff --git a/src/cli.rs b/src/cli.rs index 22cc156..763898f 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -1,8 +1,9 @@ -use std::{collections::BTreeSet, fs, num::NonZeroUsize, path::Path}; +use std::{fs, num::NonZeroUsize, path::Path}; use chrono::{DateTime, Local}; use clap::{App, AppSettings, Arg, ArgMatches}; use flexi_logger::LevelFilter as LogLevel; +use itertools::Itertools; type Error = crate::errors::CliError; @@ -126,24 +127,24 @@ impl<'a> TryFrom> for AppConfig { }; let direct_urls = arg_matches .values_of("urls") - .and_then(|urls| urls.map(url_filter).collect::>>()); + .and_then(|urls| urls.map(url_filter).collect::>>()) + .unwrap_or(Vec::new()); let file_urls = arg_matches .value_of("file") .map(fs::read_to_string) .transpose()? - .and_then(|content| { - content - .lines() - .map(url_filter) - .collect::>>() - }); - match (direct_urls, file_urls) { - (Some(direct_urls), Some(file_urls)) => Ok(direct_urls - .union(&file_urls) - .map(ToOwned::to_owned) - .collect::>()), - (Some(urls), None) | (None, Some(urls)) => Ok(urls.into_iter().collect()), - (None, None) => Err(Error::NoUrls), + .and_then(|content| content.lines().map(url_filter).collect::>>()) + .unwrap_or(Vec::new()); + + let urls = [direct_urls, file_urls] + .concat() + .into_iter() + .unique() + .collect_vec(); + if !urls.is_empty() { + Ok(urls) + } else { + Err(Error::NoUrls) } }?) 
.max_conn(match arg_matches.value_of("max-conn") { diff --git a/src/errors.rs b/src/errors.rs index a479268..17ae34a 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -152,7 +152,7 @@ pub enum CliError { InvalidOutputPath(String), #[error("Wrong output directory")] WrongOutputDirectory, - #[error("Output directory not exists")] + #[error("Output directory does not exist")] OutputDirectoryNotExists, #[error("Unable to start logger!\n{0}")] LogError(#[from] LogError), diff --git a/src/extractor.rs b/src/extractor.rs index ef470d9..632f567 100644 --- a/src/extractor.rs +++ b/src/extractor.rs @@ -1,5 +1,6 @@ use std::collections::HashMap; +use itertools::Itertools; use kuchiki::{traits::*, NodeRef}; use crate::errors::PaperoniError; @@ -54,15 +55,19 @@ impl Extractor { /// Traverses the DOM tree of the content and retrieves the IMG URLs pub fn extract_img_urls(&mut self) { if let Some(content_ref) = &self.article { - for img_ref in content_ref.select("img").unwrap() { - img_ref.as_node().as_element().map(|img_elem| { - img_elem.attributes.borrow().get("src").map(|img_url| { - if !(img_url.is_empty() || img_url.starts_with("data:image")) { - self.img_urls.push((img_url.to_string(), None)) - } - }) - }); - } + self.img_urls = content_ref + .select("img") + .unwrap() + .filter_map(|img_ref| { + let attrs = img_ref.attributes.borrow(); + attrs + .get("src") + .filter(|val| !(val.is_empty() || val.starts_with("data:image"))) + .map(ToString::to_string) + }) + .unique() + .map(|val| (val, None)) + .collect(); } } From c6c10689eb916ab92d6590eae950b075f2023893 Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Wed, 16 Jun 2021 18:09:05 +0300 Subject: [PATCH 15/17] fix: fix broken links in toc generation the fix involves ensuring the ToC is generated prior to serialization because it mutates the document and will not work otherwise. chore: add .vscode config to .gitignore --- .gitignore | 3 +- src/epub.rs | 83 ++++++++++++++++++++++++++++++++++++++++-------- src/extractor.rs | 59 ---------------------------------- 3 files changed, 71 insertions(+), 74 deletions(-) diff --git a/.gitignore b/.gitignore index 8e42494..2b8060a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ /target *.epub -*.log \ No newline at end of file +*.log +.vscode/ \ No newline at end of file diff --git a/src/epub.rs b/src/epub.rs index 6dc28b6..e54801e 100644 --- a/src/epub.rs +++ b/src/epub.rs @@ -8,11 +8,11 @@ use indicatif::{ProgressBar, ProgressStyle}; use kuchiki::NodeRef; use log::{debug, error, info}; -use crate::{ - cli::AppConfig, - errors::PaperoniError, - extractor::{self, Extractor}, -}; +use crate::{cli::AppConfig, errors::PaperoniError, extractor::Extractor}; + +lazy_static! 
{ + static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r#"(&|<|>|'|")"#).unwrap(); +} pub fn generate_epubs( articles: Vec, @@ -82,16 +82,17 @@ pub fn generate_epubs( .enumerate() .fold(&mut epub, |epub, (idx, article)| { let mut article_result = || -> Result<(), PaperoniError> { - let mut xhtml_buf = Vec::new(); - extractor::serialize_to_xhtml(article.article(), &mut xhtml_buf)?; - let xhtml_str = std::str::from_utf8(&xhtml_buf)?; - let section_name = article.metadata().title(); let content_url = format!("article_{}.xhtml", idx); - let mut content = EpubContent::new(&content_url, xhtml_str.as_bytes()) - .title(replace_escaped_characters(section_name)); + let mut xhtml_buf = Vec::new(); let header_level_tocs = get_header_level_toc_vec(&content_url, article.article()); + serialize_to_xhtml(article.article(), &mut xhtml_buf)?; + let xhtml_str = std::str::from_utf8(&xhtml_buf)?; + let section_name = article.metadata().title(); + let mut content = EpubContent::new(&content_url, xhtml_str.as_bytes()) + .title(replace_escaped_characters(section_name)); + for toc_element in header_level_tocs { content = content.child(toc_element); } @@ -172,11 +173,11 @@ pub fn generate_epubs( debug!("Creating {:?}", file_name); let mut out_file = File::create(&file_name).unwrap(); let mut xhtml_buf = Vec::new(); - extractor::serialize_to_xhtml(article.article(), &mut xhtml_buf) - .expect("Unable to serialize to xhtml"); - let xhtml_str = std::str::from_utf8(&xhtml_buf).unwrap(); let header_level_tocs = get_header_level_toc_vec("index.xhtml", article.article()); + serialize_to_xhtml(article.article(), &mut xhtml_buf) + .expect("Unable to serialize to xhtml"); + let xhtml_str = std::str::from_utf8(&xhtml_buf).unwrap(); if let Some(author) = article.metadata().byline() { epub.metadata("author", replace_escaped_characters(author))?; @@ -398,6 +399,60 @@ fn get_header_level_toc_vec(content_url: &str, article: &NodeRef) -> Vec( + node_ref: &NodeRef, + mut w: &mut W, +) -> Result<(), PaperoniError> { + let mut escape_map = HashMap::new(); + escape_map.insert("<", "<"); + escape_map.insert(">", ">"); + escape_map.insert("&", "&"); + escape_map.insert("\"", """); + escape_map.insert("'", "'"); + for edge in node_ref.traverse_inclusive() { + match edge { + kuchiki::iter::NodeEdge::Start(n) => match n.data() { + kuchiki::NodeData::Text(rc_text) => { + let text = rc_text.borrow(); + let esc_text = ESC_SEQ_REGEX + .replace_all(&text, |captures: ®ex::Captures| escape_map[&captures[1]]); + write!(&mut w, "{}", esc_text)?; + } + kuchiki::NodeData::Element(elem_data) => { + let attrs = elem_data.attributes.borrow(); + let attrs_str = attrs + .map + .iter() + .filter(|(k, _)| !k.local.contains("\"")) + .map(|(k, v)| { + format!( + "{}=\"{}\"", + k.local, + ESC_SEQ_REGEX + .replace_all(&v.value, |captures: ®ex::Captures| { + escape_map[&captures[1]] + }) + ) + }) + .fold("".to_string(), |acc, val| acc + " " + &val); + write!(&mut w, "<{}{}>", &elem_data.name.local, attrs_str)?; + } + _ => (), + }, + kuchiki::iter::NodeEdge::End(n) => match n.data() { + kuchiki::NodeData::Element(elem_data) => { + write!(&mut w, "", &elem_data.name.local)?; + } + _ => (), + }, + } + } + Ok(()) +} + #[cfg(test)] mod test { use kuchiki::traits::*; diff --git a/src/extractor.rs b/src/extractor.rs index 632f567..f427e0f 100644 --- a/src/extractor.rs +++ b/src/extractor.rs @@ -1,5 +1,3 @@ -use std::collections::HashMap; - use itertools::Itertools; use kuchiki::{traits::*, NodeRef}; @@ -8,10 +6,6 @@ use crate::moz_readability::{MetaData, 
Readability}; pub type ResourceInfo = (String, Option); -lazy_static! { - static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r#"(&|<|>|'|")"#).unwrap(); -} - pub struct Extractor { article: Option, pub img_urls: Vec, @@ -83,59 +77,6 @@ impl Extractor { } } -/// Serializes a NodeRef to a string that is XHTML compatible -/// The only DOM nodes serialized are Text and Element nodes -pub fn serialize_to_xhtml( - node_ref: &NodeRef, - mut w: &mut W, -) -> Result<(), PaperoniError> { - let mut escape_map = HashMap::new(); - escape_map.insert("<", "<"); - escape_map.insert(">", ">"); - escape_map.insert("&", "&"); - escape_map.insert("\"", """); - escape_map.insert("'", "'"); - for edge in node_ref.traverse_inclusive() { - match edge { - kuchiki::iter::NodeEdge::Start(n) => match n.data() { - kuchiki::NodeData::Text(rc_text) => { - let text = rc_text.borrow(); - let esc_text = ESC_SEQ_REGEX - .replace_all(&text, |captures: ®ex::Captures| escape_map[&captures[1]]); - write!(&mut w, "{}", esc_text)?; - } - kuchiki::NodeData::Element(elem_data) => { - let attrs = elem_data.attributes.borrow(); - let attrs_str = attrs - .map - .iter() - .filter(|(k, _)| !k.local.contains("\"")) - .map(|(k, v)| { - format!( - "{}=\"{}\"", - k.local, - ESC_SEQ_REGEX - .replace_all(&v.value, |captures: ®ex::Captures| { - escape_map[&captures[1]] - }) - ) - }) - .fold("".to_string(), |acc, val| acc + " " + &val); - write!(&mut w, "<{}{}>", &elem_data.name.local, attrs_str)?; - } - _ => (), - }, - kuchiki::iter::NodeEdge::End(n) => match n.data() { - kuchiki::NodeData::Element(elem_data) => { - write!(&mut w, "", &elem_data.name.local)?; - } - _ => (), - }, - } - } - Ok(()) -} - #[cfg(test)] mod test { use super::*; From 754365a42aa56673cda9d1ba7abe9be2c4151b0b Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Thu, 17 Jun 2021 17:32:53 +0300 Subject: [PATCH 16/17] feat: add `inline-toc` flag --- src/cli.rs | 11 ++++++++++- src/epub.rs | 6 +++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index 763898f..3ffd4f7 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -21,6 +21,7 @@ pub struct AppConfig { pub can_disable_progress_bar: bool, pub start_time: DateTime, pub is_logging_to_file: bool, + pub inline_toc: bool, } impl AppConfig { @@ -86,7 +87,14 @@ impl AppConfig { Arg::with_name("log-to-file") .long("log-to-file") .help("Enables logging of events to a file located in .paperoni/logs with a default log level of debug. Use -v to specify the logging level") - .takes_value(false)); + .takes_value(false)) + .arg( + Arg::with_name("inline-toc") + .long("inline-toc") + .requires("output_name") + .help("Add an inlined Table of Contents page at the start of the merged article.") + .long_help("Add an inlined Table of Contents page at the start of the merged article. 
This does not affect the Table of Contents navigation") + ); Self::try_from(app.get_matches()) } @@ -169,6 +177,7 @@ impl<'a> TryFrom> for AppConfig { 4..=u64::MAX => LogLevel::Debug, }) .is_logging_to_file(arg_matches.is_present("log-to-file")) + .inline_toc(arg_matches.is_present("inline-toc")) .output_directory( arg_matches .value_of("output_directory") diff --git a/src/epub.rs b/src/epub.rs index e54801e..f3e37f4 100644 --- a/src/epub.rs +++ b/src/epub.rs @@ -66,7 +66,11 @@ pub fn generate_epubs( } }; debug!("Creating {:?}", name); - epub.inline_toc(); + + if app_config.inline_toc { + epub.inline_toc(); + } + match epub.stylesheet(stylesheet.as_bytes()) { Ok(_) => (), Err(e) => { From 92c97ca2cfe5a0454a92d1a86b77f36f92426893 Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Thu, 24 Jun 2021 08:26:40 +0300 Subject: [PATCH 17/17] fix: add .epub extension as fallback chore: update dependencies and update README chore: bump version --- Cargo.lock | 50 ++++++++++++++++++++++++++++++-------------------- Cargo.toml | 6 +++--- README.md | 18 ++++++++++++++++-- src/cli.rs | 8 +++++++- src/logs.rs | 13 +++++++------ 5 files changed, 63 insertions(+), 32 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e1c29ad..8489e38 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -410,9 +410,9 @@ dependencies = [ [[package]] name = "comfy-table" -version = "2.1.0" +version = "3.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17b99e9022e080d384b58d8eaf5976b42a311ff7a9669f8200eb2453c0b2b81a" +checksum = "c93d79ba722818d1a6aedfbe2cf4889330c856d0c6772951efbbf3dd283c070a" dependencies = [ "crossterm", "strum", @@ -504,25 +504,25 @@ dependencies = [ [[package]] name = "crossterm" -version = "0.19.0" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c36c10130df424b2f3552fcc2ddcd9b28a27b1e54b358b45874f88d1ca6888c" +checksum = "c0ebde6a9dd5e331cd6c6f48253254d117642c31653baa475e394657c59c1f7d" dependencies = [ "bitflags", "crossterm_winapi", - "lazy_static", "libc", "mio", "parking_lot", "signal-hook", + "signal-hook-mio", "winapi", ] [[package]] name = "crossterm_winapi" -version = "0.7.0" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0da8964ace4d3e4a044fd027919b2237000b24315a37c916f61809f1ff2140b9" +checksum = "3a6966607622438301997d3dac0d2f6e9a90c68bb6bc1785ea98456ab93c0507" dependencies = [ "winapi", ] @@ -835,9 +835,9 @@ dependencies = [ [[package]] name = "flexi_logger" -version = "0.17.1" +version = "0.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33ab94b6ac8eb69f1496a6993f26f785b5fd6d99b7416023eb2a6175c0b242b1" +checksum = "8ba2265890613939b533fa11c3728651531419ac549ccf527896201581f23991" dependencies = [ "atty", "chrono", @@ -1551,7 +1551,7 @@ dependencies = [ [[package]] name = "paperoni" -version = "0.4.1-alpha1" +version = "0.5.0-alpha1" dependencies = [ "async-std", "chrono", @@ -2102,20 +2102,30 @@ dependencies = [ [[package]] name = "signal-hook" -version = "0.1.17" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e31d442c16f047a671b5a71e2161d6e68814012b7f5379d269ebd915fac2729" +checksum = "470c5a6397076fae0094aaf06a08e6ba6f37acb77d3b1b91ea92b4d6c8650c39" dependencies = [ "libc", - "mio", "signal-hook-registry", ] [[package]] -name = "signal-hook-registry" -version = "1.3.0" +name = "signal-hook-mio" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "16f1d0fef1604ba8f7a073c7e701f213e056707210e9020af4528e0101ce11a6" +checksum = "29fd5867f1c4f2c5be079aee7a2adf1152ebb04a4bc4d341f504b7dece607ed4" +dependencies = [ + "libc", + "mio", + "signal-hook", +] + +[[package]] +name = "signal-hook-registry" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51e73328dc4ac0c7ccbda3a494dfa03df1de2f46018127f60c693f2648455b0" dependencies = [ "libc", ] @@ -2271,15 +2281,15 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" [[package]] name = "strum" -version = "0.20.0" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7318c509b5ba57f18533982607f24070a55d353e90d4cae30c467cdb2ad5ac5c" +checksum = "aaf86bbcfd1fa9670b7a129f64fc0c9fcbbfe4f1bc4210e9e98fe71ffc12cde2" [[package]] name = "strum_macros" -version = "0.20.1" +version = "0.21.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee8bc6b87a5112aeeab1f4a9f7ab634fe6cbefc4850006df31267f4cfb9e3149" +checksum = "d06aaeeee809dbc59eb4556183dd927df67db1540de5be8d3ec0b6636358a5ec" dependencies = [ "heck", "proc-macro2", diff --git a/Cargo.toml b/Cargo.toml index d8305fa..3594149 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ description = "A web article downloader" homepage = "https://github.com/hipstermojo/paperoni" repository = "https://github.com/hipstermojo/paperoni" name = "paperoni" -version = "0.4.1-alpha1" +version = "0.5.0-alpha1" authors = ["Kenneth Gitere "] edition = "2018" license = "MIT" @@ -17,11 +17,11 @@ async-std = "1.9.0" chrono = "0.4.19" clap = "2.33.3" colored = "2.0.0" -comfy-table = "2.1.0" +comfy-table = "3.0.0" derive_builder = "0.10.2" directories = "3.0.2" epub-builder = "0.4.8" -flexi_logger = "0.17.1" +flexi_logger = "0.18.0" futures = "0.3.15" html5ever = "0.25.1" indicatif = "0.16.2" diff --git a/README.md b/README.md index f38c741..5c547b0 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ -Paperoni is a CLI tool made in Rust for downloading web articles as EPUBs. +Paperoni is a CLI tool made in Rust for downloading web articles as EPUBs. There is provisional\* support for exporting to PDF as well. > This project is in an alpha release so it might crash when you use it. Please open an [issue on Github](https://github.com/hipstermojo/paperoni/issues/new) if it does crash. @@ -23,7 +23,7 @@ Check the [releases](https://github.com/hipstermojo/paperoni/releases) page for Paperoni is published on [crates.io](https://crates.io). If you have [cargo](https://github.com/rust-lang/cargo) installed, then run: ```sh -cargo install paperoni --version 0.4.1-alpha1 +cargo install paperoni --version 0.5.0-alpha1 ``` _Paperoni is still in alpha so the `version` flag has to be passed._ @@ -54,6 +54,9 @@ OPTIONS: -h, --help Prints help information + --inline-toc + Add an inlined Table of Contents page at the start of the merged article. + --log-to-file Enables logging of events to a file located in .paperoni/logs with a default log level of debug. Use -v to specify the logging level @@ -147,3 +150,14 @@ This program is still in alpha so a number of things won't work: - Code snippets on Medium articles that are lazy loaded will not appear in the EPUB. There are also web pages it won't work on in general such as Twitter and Reddit threads. + +## PDF exports + +As of version 0.5-alpha1, you can now export to PDF using a third party tool. 
This requires that you install [Calibre](https://calibre-ebook.com/) which comes with a ebook conversion. You can convert the epub to a pdf through the terminal with `ebook-convert`: + +```sh +# Assuming the downloaded epub was called foo.epub +ebook-convert foo.epub foo.pdf +``` + +Alternatively, you can use the Calibre GUI to do the file conversion. diff --git a/src/cli.rs b/src/cli.rs index 3ffd4f7..62937e7 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -159,7 +159,13 @@ impl<'a> TryFrom> for AppConfig { Some(max_conn) => max_conn.parse::()?.get(), None => DEFAULT_MAX_CONN, }) - .merged(arg_matches.value_of("output_name").map(ToOwned::to_owned)) + .merged(arg_matches.value_of("output_name").map(|name| { + if name.ends_with(".epub") { + name.to_owned() + } else { + name.to_string() + ".epub" + } + })) .can_disable_progress_bar( arg_matches.is_present("verbosity") && !arg_matches.is_present("log-to-file"), ) diff --git a/src/logs.rs b/src/logs.rs index 722c131..a0f51d8 100644 --- a/src/logs.rs +++ b/src/logs.rs @@ -4,7 +4,7 @@ use chrono::{DateTime, Local}; use colored::*; use comfy_table::presets::UTF8_HORIZONTAL_BORDERS_ONLY; use comfy_table::{Cell, CellAlignment, ContentArrangement, Table}; -use flexi_logger::LevelFilter; +use flexi_logger::{FileSpec, LevelFilter}; use log::error; use crate::errors::PaperoniError; @@ -169,11 +169,12 @@ pub fn init_logger( if !paperoni_dir.is_dir() || !log_dir.is_dir() { fs::create_dir_all(&log_dir)?; } - logger = logger - .directory(log_dir) - .discriminant(formatted_timestamp.to_string()) - .suppress_timestamp() - .log_to_file(); + logger = logger.log_to_file( + FileSpec::default() + .directory(log_dir) + .discriminant(formatted_timestamp.to_string()) + .suppress_timestamp(), + ); } logger.start()?; Ok(())