From d67169425d0c0d82d4570a4d921f72fec8597d1c Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Fri, 16 Jul 2021 07:45:20 +0300 Subject: [PATCH 1/6] fix: fix serialization of element attributes --- src/epub.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/epub.rs b/src/epub.rs index f3e37f4..79f8689 100644 --- a/src/epub.rs +++ b/src/epub.rs @@ -12,6 +12,7 @@ use crate::{cli::AppConfig, errors::PaperoniError, extractor::Extractor}; lazy_static! { static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r#"(&|<|>|'|")"#).unwrap(); + static ref VALID_ATTR_CHARS_REGEX: regex::Regex = regex::Regex::new(r#"[a-z0-9\-_]"#).unwrap(); } pub fn generate_epubs( @@ -292,6 +293,10 @@ fn generate_header_ids(root_node: &NodeRef) { let headers_no_id = headers.filter(|node_data_ref| { let attrs = node_data_ref.attributes.borrow(); !attrs.contains("id") + || attrs + .get("id") + .map(|val| !VALID_ATTR_CHARS_REGEX.is_match(&val)) + .unwrap() }); for header in headers_no_id { let mut attrs = header.attributes.borrow_mut(); @@ -430,7 +435,10 @@ fn serialize_to_xhtml( let attrs_str = attrs .map .iter() - .filter(|(k, _)| !k.local.contains("\"")) + .filter(|(k, _)| { + let attr_key: &str = &k.local; + attr_key.is_ascii() && VALID_ATTR_CHARS_REGEX.is_match(attr_key) + }) .map(|(k, v)| { format!( "{}=\"{}\"", From d1d1a0f3f4040434345f59ab5293f5ee53035d1e Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Thu, 22 Jul 2021 08:47:03 +0300 Subject: [PATCH 2/6] feat: add no-css and no-header-css flags for #19 refactor: change to yaml configuration for the CLI refactor: change all flags to kebab case --- Cargo.lock | 7 +++ Cargo.toml | 2 +- src/assets/body.min.css | 7 +++ src/assets/headers.min.css | 7 +++ src/assets/writ.min.css | 7 --- src/cli.rs | 94 +++++++++----------------------------- src/cli_config.yml | 66 ++++++++++++++++++++++++++ src/epub.rs | 27 +++++++++-- 8 files changed, 133 insertions(+), 84 deletions(-) create mode 100644 src/assets/body.min.css create mode 100644 src/assets/headers.min.css delete mode 100644 src/assets/writ.min.css create mode 100644 src/cli_config.yml diff --git a/Cargo.lock b/Cargo.lock index 8489e38..8a52a90 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -395,6 +395,7 @@ dependencies = [ "textwrap", "unicode-width", "vec_map", + "yaml-rust", ] [[package]] @@ -2756,6 +2757,12 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "yaml-rust" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e66366e18dc58b46801afbf2ca7661a9f59cc8c5962c29892b6039b4f86fa992" + [[package]] name = "yansi" version = "0.5.0" diff --git a/Cargo.toml b/Cargo.toml index 3594149..bde8df1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,7 +15,7 @@ readme = "README.md" # atty = "0.2.14" async-std = "1.9.0" chrono = "0.4.19" -clap = "2.33.3" +clap = {version = "2.33.3", features = ["yaml"]} colored = "2.0.0" comfy-table = "3.0.0" derive_builder = "0.10.2" diff --git a/src/assets/body.min.css b/src/assets/body.min.css new file mode 100644 index 0000000..4e17cec --- /dev/null +++ b/src/assets/body.min.css @@ -0,0 +1,7 @@ +/*! 
+ * Writ v1.0.4 + * + * Copyright © 2015, Curtis McEnroe + * + * https://cmcenroe.me/writ/LICENSE (ISC) + */dd,hr,ol ol,ol ul,ul ol,ul ul{margin:0}pre,table{overflow-x:auto}a,ins{text-decoration:none}html{font-family:Georgia,Lucida Bright,Book Antiqua,serif;font-size:16px;line-height:1.5rem}code,kbd,pre,samp{font-family:Fira Code,Liberation Mono,Menlo,Courier,monospace;font-size:.833rem;color:#111}kbd{font-weight:700}small{font-size:.833em}th{font-weight:400}blockquote,dl,ol,p,pre,table,ul{margin:1.5rem 0 0}pre,table{margin-bottom:-1px}hr{border:none;padding:1.5rem 0 0}table{line-height:calc(1.5rem - 1px);width:100%;border-collapse:collapse}pre{margin-top:calc(1.5rem - 1px)}body{color:#222;margin:1.5rem 1ch}a,a code,header nav a:visited{color:#00e}a:visited,a:visited code{color:#60b}mark{color:inherit;background-color:#fe0}code,pre,samp,tfoot,thead{background-color:rgba(0,0,0,.05)}blockquote,ins,main aside{border:rgba(0,0,0,.05) solid}blockquote,main aside{border-width:0 0 0 .5ch}code,pre,samp{border:rgba(0,0,0,.1) solid}td,th{border:solid #dbdbdb}body>header{text-align:center}body>footer,main{display:block;max-width:78ch;margin:auto}main aside,main figure{float:right;margin:1.5rem 0 0 1ch}main aside{max-width:26ch;padding:0 0 0 .5ch}blockquote{margin-right:3ch;margin-left:1.5ch;padding:0 0 0 1ch}pre{border-width:1px;border-radius:2px;padding:0 .5ch}pre code{border:none;padding:0;background-color:transparent;white-space:inherit}code,ins,samp,td,th{border-width:1px}img{max-width:100%}dd,ol,ul{padding:0 0 0 3ch}ul>li{list-style-type:disc}li ul>li{list-style-type:circle}li li ul>li{list-style-type:square}ol>li{list-style-type:decimal}li ol>li{list-style-type:lower-roman}li li ol>li{list-style-type:lower-alpha}nav ul{padding:0;list-style-type:none}nav ul li{display:inline;padding-left:1ch;white-space:nowrap}nav ul li:first-child{padding-left:0}ins,mark{padding:1px}td,th{padding:0 .5ch}sub,sup{font-size:.75em;line-height:1em}code,samp{border-radius:2px;padding:.1em .2em;white-space:nowrap} \ No newline at end of file diff --git a/src/assets/headers.min.css b/src/assets/headers.min.css new file mode 100644 index 0000000..ca3056c --- /dev/null +++ b/src/assets/headers.min.css @@ -0,0 +1,7 @@ +/*! + * Writ v1.0.4 + * + * Copyright © 2015, Curtis McEnroe + * + * https://cmcenroe.me/writ/LICENSE (ISC) + */h1,h2,h3,h4,h5,h6,th{font-weight:400}h1{font-size:2.488em}h2{font-size:2.074em}h3{font-size:1.728em}h4{font-size:1.44em}h5{font-size:1.2em}h6{font-size:1em}h1,h2,h3{line-height:3rem}h1,h2,h3,h4,h5,h6{margin:1.5rem 0 0} \ No newline at end of file diff --git a/src/assets/writ.min.css b/src/assets/writ.min.css deleted file mode 100644 index 1c9c0b4..0000000 --- a/src/assets/writ.min.css +++ /dev/null @@ -1,7 +0,0 @@ -/*! 
- * Writ v1.0.4 - * - * Copyright © 2015, Curtis McEnroe - * - * https://cmcenroe.me/writ/LICENSE (ISC) - */dd,hr,ol ol,ol ul,ul ol,ul ul{margin:0}pre,table{overflow-x:auto}a,ins{text-decoration:none}html{font-family:Georgia,Lucida Bright,Book Antiqua,serif;font-size:16px;line-height:1.5rem}code,kbd,pre,samp{font-family:Fira Code,Liberation Mono,Menlo,Courier,monospace;font-size:.833rem;color:#111}kbd{font-weight:700}h1,h2,h3,h4,h5,h6,th{font-weight:400}h1{font-size:2.488em}h2{font-size:2.074em}h3{font-size:1.728em}h4{font-size:1.44em}h5{font-size:1.2em}h6{font-size:1em}small{font-size:.833em}h1,h2,h3{line-height:3rem}blockquote,dl,h1,h2,h3,h4,h5,h6,ol,p,pre,table,ul{margin:1.5rem 0 0}pre,table{margin-bottom:-1px}hr{border:none;padding:1.5rem 0 0}table{line-height:calc(1.5rem - 1px);width:100%;border-collapse:collapse}pre{margin-top:calc(1.5rem - 1px)}body{color:#222;margin:1.5rem 1ch}a,a code,header nav a:visited{color:#00e}a:visited,a:visited code{color:#60b}mark{color:inherit;background-color:#fe0}code,pre,samp,tfoot,thead{background-color:rgba(0,0,0,.05)}blockquote,ins,main aside{border:rgba(0,0,0,.05) solid}blockquote,main aside{border-width:0 0 0 .5ch}code,pre,samp{border:rgba(0,0,0,.1) solid}td,th{border:solid #dbdbdb}body>header{text-align:center}body>footer,main{display:block;max-width:78ch;margin:auto}main aside,main figure{float:right;margin:1.5rem 0 0 1ch}main aside{max-width:26ch;padding:0 0 0 .5ch}blockquote{margin-right:3ch;margin-left:1.5ch;padding:0 0 0 1ch}pre{border-width:1px;border-radius:2px;padding:0 .5ch}pre code{border:none;padding:0;background-color:transparent;white-space:inherit}code,ins,samp,td,th{border-width:1px}img{max-width:100%}dd,ol,ul{padding:0 0 0 3ch}ul>li{list-style-type:disc}li ul>li{list-style-type:circle}li li ul>li{list-style-type:square}ol>li{list-style-type:decimal}li ol>li{list-style-type:lower-roman}li li ol>li{list-style-type:lower-alpha}nav ul{padding:0;list-style-type:none}nav ul li{display:inline;padding-left:1ch;white-space:nowrap}nav ul li:first-child{padding-left:0}ins,mark{padding:1px}td,th{padding:0 .5ch}sub,sup{font-size:.75em;line-height:1em}code,samp{border-radius:2px;padding:.1em .2em;white-space:nowrap} diff --git a/src/cli.rs b/src/cli.rs index 62937e7..c66f0de 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -1,7 +1,7 @@ use std::{fs, num::NonZeroUsize, path::Path}; use chrono::{DateTime, Local}; -use clap::{App, AppSettings, Arg, ArgMatches}; +use clap::{load_yaml, App, ArgMatches}; use flexi_logger::LevelFilter as LogLevel; use itertools::Itertools; @@ -22,80 +22,13 @@ pub struct AppConfig { pub start_time: DateTime, pub is_logging_to_file: bool, pub inline_toc: bool, + pub css_config: CSSConfig, } impl AppConfig { pub fn init_with_cli() -> Result { - let app = App::new("paperoni") - .settings(&[ - AppSettings::ArgRequiredElseHelp, - AppSettings::UnifiedHelpMessage, - ]) - .version(clap::crate_version!()) - .about( - "Paperoni is a CLI tool made in Rust for downloading web articles as EPUBs", - ) - .arg( - Arg::with_name("urls") - .help("Urls of web articles") - .multiple(true), - ) - .arg( - Arg::with_name("file") - .short("f") - .long("file") - .help("Input file containing links") - .takes_value(true), - ) - .arg( - Arg::with_name("output_directory") - .long("output-dir") - .short("o") - .help("Directory to store output epub documents") - .conflicts_with("output_name") - .takes_value(true), - ) - .arg( - Arg::with_name("output_name") - .long("merge") - .help("Merge multiple articles into a single epub") - .long_help("Merge 
multiple articles into a single epub that will be given the name provided")
-                    .conflicts_with("output_directory")
-                    .takes_value(true),
-            ).arg(
-                Arg::with_name("max-conn")
-                    .long("max_conn")
-                    .help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8")
-                    .long_help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8.\nNOTE: It is advised to use as few connections as needed i.e between 1 and 50. Using more connections can end up overloading your network card with too many concurrent requests.")
-                    .takes_value(true))
-            .arg(
-                Arg::with_name("verbosity")
-                    .short("v")
-                    .multiple(true)
-                    .help("Enables logging of events and set the verbosity level. Use --help to read on its usage")
-                    .long_help(
-"This takes upto 4 levels of verbosity in the following order.
-    - Error (-v)
-    - Warn (-vv)
-    - Info (-vvv)
-    - Debug (-vvvv)
-    When this flag is passed, it disables the progress bars and logs to stderr.
-    If you would like to send the logs to a file (and enable progress bars), pass the log-to-file flag."
-                    )
-                    .takes_value(false))
-            .arg(
-                Arg::with_name("log-to-file")
-                    .long("log-to-file")
-                    .help("Enables logging of events to a file located in .paperoni/logs with a default log level of debug. Use -v to specify the logging level")
-                    .takes_value(false))
-            .arg(
-                Arg::with_name("inline-toc")
-                    .long("inline-toc")
-                    .requires("output_name")
-                    .help("Add an inlined Table of Contents page at the start of the merged article.")
-                    .long_help("Add an inlined Table of Contents page at the start of the merged article. This does not affect the Table of Contents navigation")
-            );
-
+        let yaml_config = load_yaml!("cli_config.yml");
+        let app = App::from_yaml(yaml_config).version(clap::crate_version!());
         Self::try_from(app.get_matches())
     }
@@ -159,7 +92,7 @@ impl<'a> TryFrom<ArgMatches<'a>> for AppConfig {
             Some(max_conn) => max_conn.parse::<NonZeroUsize>()?.get(),
             None => DEFAULT_MAX_CONN,
         })
-        .merged(arg_matches.value_of("output_name").map(|name| {
+        .merged(arg_matches.value_of("output-name").map(|name| {
             if name.ends_with(".epub") {
                 name.to_owned()
             } else {
@@ -200,6 +133,16 @@ impl<'a> TryFrom<ArgMatches<'a>> for AppConfig {
                 .transpose()?,
         )
         .start_time(Local::now())
+        .css_config(
+            match (
+                arg_matches.is_present("no-css"),
+                arg_matches.is_present("no-header-css"),
+            ) {
+                (true, _) => CSSConfig::None,
+                (_, true) => CSSConfig::NoHeaders,
+                _ => CSSConfig::All,
+            },
+        )
         .try_init()
     }
 }
@@ -212,3 +155,10 @@ impl AppConfigBuilder {
             .init_merge_file()
     }
 }
+
+#[derive(Clone, Debug)]
+pub enum CSSConfig {
+    All,
+    NoHeaders,
+    None,
+}
diff --git a/src/cli_config.yml b/src/cli_config.yml
new file mode 100644
index 0000000..a1d9424
--- /dev/null
+++ b/src/cli_config.yml
@@ -0,0 +1,66 @@
+name: paperoni
+about: Paperoni is a CLI tool made in Rust for downloading web articles as EPUBs
+settings:
+  - ArgRequiredElseHelp
+  - UnifiedHelpMessage
+args:
+  - urls:
+      help: Urls of web articles
+      multiple: true
+  - file:
+      short: f
+      long: file
+      help: Input file containing links
+      takes_value: true
+  - output_directory:
+      short: o
+      long: output-dir
+      help: Directory to store output epub documents
+      conflicts_with: output-name
+      takes_value: true
+  - output-name:
+      long: merge
+      help: Merge multiple articles into a single epub
+      long_help: Merge multiple articles into a single epub that will be given the name provided
+      conflicts_with: output_directory
+      takes_value: true
+  - max-conn:
+      long: max-conn
+      help: The maximum number of concurrent HTTP connections when downloading articles. Default is 8
+      long_help: "The maximum number of concurrent HTTP connections when downloading articles. Default is 8.\nNOTE: It is advised to use as few connections as needed i.e between 1 and 50. Using more connections can end up overloading your network card with too many concurrent requests."
+      takes_value: true
+  - verbosity:
+      short: v
+      multiple: true
+      help: Enables logging of events and sets the verbosity level. Use --help to read about its usage
+      long_help: "This takes upto 4 levels of verbosity in the following order.
+        \n- Error (-v)
+        \n- Warn (-vv)
+        \n- Info (-vvv)
+        \n- Debug (-vvvv)
+        \nWhen this flag is passed, it disables the progress bars and logs to stderr.
+        \nIf you would like to send the logs to a file (and enable progress bars), pass the log-to-file flag."
+      takes_value: false
+  - log-to-file:
+      long: log-to-file
+      help: Enables logging of events to a file located in .paperoni/logs with a default log level of debug. Use -v to specify the logging level
+      takes_value: false
+  - inline-toc:
+      long: inline-toc
+      requires: output-name
+      help: Add an inlined Table of Contents page at the start of the merged article.
+      long_help: Add an inlined Table of Contents page at the start of the merged article. This does not affect the Table of Contents navigation"
+  - no-css:
+      long: no-css
+      conflicts_with: no-header-css
+      help: Removes the stylesheets used in the EPUB generation. Pass --help to learn more
+      long_help: "Removes the stylesheets used in the EPUB generation.
+        \nThe EPUB file will then be laid out based on your e-reader's default stylesheets.
+        \nImages and code blocks may overflow when this flag is set and layout of generated
+        \nPDFs will be affected. Use --no-header-css if you want to only disable the styling on headers."
+      takes_value: false
+  - no-header-css:
+      long: no-header-css
+      conflicts_with: no-css
+      help: Removes the header CSS styling but preserves styling of images and code blocks. To remove all the default CSS, use --no-css instead.
+ takes_value: false diff --git a/src/epub.rs b/src/epub.rs index 79f8689..c522470 100644 --- a/src/epub.rs +++ b/src/epub.rs @@ -38,7 +38,8 @@ pub fn generate_epubs( enabled_bar }; - let stylesheet = include_bytes!("./assets/writ.min.css"); + let body_stylesheet = include_bytes!("./assets/body.min.css"); + let header_stylesheet = include_bytes!("./assets/headers.min.css"); let mut errors: Vec = Vec::new(); @@ -72,7 +73,7 @@ pub fn generate_epubs( epub.inline_toc(); } - match epub.stylesheet(stylesheet.as_bytes()) { + match add_stylesheets(&mut epub, body_stylesheet, header_stylesheet, app_config) { Ok(_) => (), Err(e) => { error!("Unable to add stylesheets to epub file"); @@ -188,8 +189,7 @@ pub fn generate_epubs( epub.metadata("author", replace_escaped_characters(author))?; } - epub.stylesheet(stylesheet.as_bytes())?; - + add_stylesheets(&mut epub, body_stylesheet, header_stylesheet, app_config)?; let title = replace_escaped_characters(article.metadata().title()); epub.metadata("title", &title)?; @@ -250,6 +250,25 @@ fn replace_escaped_characters(value: &str) -> String { .replace(">", ">") } +fn add_stylesheets( + epub: &mut EpubBuilder, + body_stylesheet: &[u8], + header_stylesheet: &[u8], + app_config: &AppConfig, +) -> Result<(), epub_builder::Error> { + match app_config.css_config { + crate::cli::CSSConfig::All => { + epub.stylesheet([header_stylesheet, body_stylesheet].concat().as_bytes())?; + Ok(()) + } + crate::cli::CSSConfig::NoHeaders => { + epub.stylesheet(body_stylesheet.as_bytes())?; + Ok(()) + } + _ => Ok(()), + } +} + //TODO: The type signature of the argument should change as it requires that merged articles create an entirely new Vec of references fn generate_appendix(articles: Vec<&Extractor>) -> String { let link_tags: String = articles From 2f4da824bac01c9b8af4e3656e462439c2fe9cc0 Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Sat, 24 Jul 2021 12:03:36 +0300 Subject: [PATCH 3/6] feat: add HTML exports with inlining of images fix: typo fix refactor: refactor `add_stylesheets` function --- .gitignore | 3 + Cargo.lock | 1 + Cargo.toml | 3 +- src/cli.rs | 26 ++- src/cli_config.yml | 16 ++ src/epub.rs | 26 +-- src/extractor.rs | 3 +- src/html.rs | 391 +++++++++++++++++++++++++++++++++++++++++++++ src/logs.rs | 4 +- src/main.rs | 29 +++- 10 files changed, 477 insertions(+), 25 deletions(-) create mode 100644 src/html.rs diff --git a/.gitignore b/.gitignore index 2b8060a..5e612be 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,7 @@ /target *.epub +# Only ignore top level html files which may be made when testing +/*.html +*.pdf *.log .vscode/ \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 8a52a90..2824333 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1555,6 +1555,7 @@ name = "paperoni" version = "0.5.0-alpha1" dependencies = [ "async-std", + "base64", "chrono", "clap", "colored", diff --git a/Cargo.toml b/Cargo.toml index bde8df1..fced683 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,8 +14,9 @@ readme = "README.md" [dependencies] # atty = "0.2.14" async-std = "1.9.0" +base64 = "0.13.0" chrono = "0.4.19" -clap = {version = "2.33.3", features = ["yaml"]} +clap = { version = "2.33.3", features = ["yaml"] } colored = "2.0.0" comfy-table = "3.0.0" derive_builder = "0.10.2" diff --git a/src/cli.rs b/src/cli.rs index c66f0de..67f8ace 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -11,10 +11,10 @@ const DEFAULT_MAX_CONN: usize = 8; #[derive(derive_builder::Builder)] pub struct AppConfig { - /// Urls for store in epub + /// Article urls pub urls: Vec, 
pub max_conn: usize, - /// Path to file of multiple articles into a single epub + /// Path to file of multiple articles into a single article pub merged: Option, pub output_directory: Option, pub log_level: LogLevel, @@ -23,6 +23,8 @@ pub struct AppConfig { pub is_logging_to_file: bool, pub inline_toc: bool, pub css_config: CSSConfig, + pub export_type: ExportType, + pub is_inlining_images: bool, } impl AppConfig { @@ -93,10 +95,11 @@ impl<'a> TryFrom> for AppConfig { None => DEFAULT_MAX_CONN, }) .merged(arg_matches.value_of("output-name").map(|name| { - if name.ends_with(".epub") { + let file_ext = format!(".{}", arg_matches.value_of("export").unwrap()); + if name.ends_with(&file_ext) { name.to_owned() } else { - name.to_string() + ".epub" + name.to_string() + &file_ext } })) .can_disable_progress_bar( @@ -143,6 +146,15 @@ impl<'a> TryFrom> for AppConfig { _ => CSSConfig::All, }, ) + .export_type({ + let export_type = arg_matches.value_of("export").unwrap(); + if export_type == "html" { + ExportType::HTML + } else { + ExportType::EPUB + } + }) + .is_inlining_images(arg_matches.is_present("inline-images")) .try_init() } } @@ -162,3 +174,9 @@ pub enum CSSConfig { NoHeaders, None, } + +#[derive(Clone, Debug)] +pub enum ExportType { + HTML, + EPUB, +} diff --git a/src/cli_config.yml b/src/cli_config.yml index a1d9424..88be611 100644 --- a/src/cli_config.yml +++ b/src/cli_config.yml @@ -64,3 +64,19 @@ args: conflicts_with: no-css help: Removes the header CSS styling but preserves styling of images and codeblocks. To remove all the default CSS, use --no-css instead. takes_value: false + - export: + long: export + help: Specify the file type of the export. The type must be in lower case. + possible_values: [html, epub] + value_name: type + takes_value: true + default_value: epub + - inline-images: + long: inline-images + help: Inlines the article images when exporting to HTML using base64. Pass --help to learn more. + long_help: "Inlines the article images when exporting to HTML using base64. + \nThis is used when you do not want a separate folder created for images during HTML export. + \nNOTE: It uses base64 encoding on the images which results in larger HTML export sizes as each image + increases in size by about 25%-33%." + takes_value: false + requires: export diff --git a/src/epub.rs b/src/epub.rs index c522470..8c280f1 100644 --- a/src/epub.rs +++ b/src/epub.rs @@ -12,7 +12,7 @@ use crate::{cli::AppConfig, errors::PaperoniError, extractor::Extractor}; lazy_static! 
{ static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r#"(&|<|>|'|")"#).unwrap(); - static ref VALID_ATTR_CHARS_REGEX: regex::Regex = regex::Regex::new(r#"[a-z0-9\-_]"#).unwrap(); + static ref VALID_ATTR_CHARS_REGEX: regex::Regex = regex::Regex::new(r#"[a-z0-9\-_:]"#).unwrap(); } pub fn generate_epubs( @@ -38,9 +38,6 @@ pub fn generate_epubs( enabled_bar }; - let body_stylesheet = include_bytes!("./assets/body.min.css"); - let header_stylesheet = include_bytes!("./assets/headers.min.css"); - let mut errors: Vec = Vec::new(); match app_config.merged { @@ -73,7 +70,7 @@ pub fn generate_epubs( epub.inline_toc(); } - match add_stylesheets(&mut epub, body_stylesheet, header_stylesheet, app_config) { + match add_stylesheets(&mut epub, app_config) { Ok(_) => (), Err(e) => { error!("Unable to add stylesheets to epub file"); @@ -148,6 +145,8 @@ pub fn generate_epubs( let mut paperoni_err: PaperoniError = err.into(); paperoni_err.set_article_source(&name); errors.push(paperoni_err); + error!("Failed to generate epub: {}", name); + bar.finish_with_message("epub generation failed\n"); return Err(errors); } } @@ -189,7 +188,7 @@ pub fn generate_epubs( epub.metadata("author", replace_escaped_characters(author))?; } - add_stylesheets(&mut epub, body_stylesheet, header_stylesheet, app_config)?; + add_stylesheets(&mut epub, app_config)?; let title = replace_escaped_characters(article.metadata().title()); epub.metadata("title", &title)?; @@ -206,7 +205,7 @@ pub fn generate_epubs( let mut file_path = std::env::temp_dir(); file_path.push(&img.0); - let img_buf = File::open(&file_path).expect("Can't read file"); + let img_buf = File::open(&file_path).expect("Can't read image file"); epub.add_resource( file_path.file_name().unwrap(), img_buf, @@ -252,10 +251,10 @@ fn replace_escaped_characters(value: &str) -> String { fn add_stylesheets( epub: &mut EpubBuilder, - body_stylesheet: &[u8], - header_stylesheet: &[u8], app_config: &AppConfig, ) -> Result<(), epub_builder::Error> { + let body_stylesheet: &[u8] = include_bytes!("./assets/body.min.css"); + let header_stylesheet: &[u8] = include_bytes!("./assets/headers.min.css"); match app_config.css_config { crate::cli::CSSConfig::All => { epub.stylesheet([header_stylesheet, body_stylesheet].concat().as_bytes())?; @@ -434,6 +433,15 @@ fn serialize_to_xhtml( node_ref: &NodeRef, mut w: &mut W, ) -> Result<(), PaperoniError> { + { + // Add XHTML attributes + let html_elem = node_ref + .select_first("html") + .expect("Unable to get element in article"); + let mut html_attrs = html_elem.attributes.borrow_mut(); + html_attrs.insert("xmlns", "http://www.w3.org/1999/xhtml".into()); + html_attrs.insert("xmlns:epub", "http://www.idpf.org/2007/ops".into()); + } let mut escape_map = HashMap::new(); escape_map.insert("<", "<"); escape_map.insert(">", ">"); diff --git a/src/extractor.rs b/src/extractor.rs index f427e0f..9df5168 100644 --- a/src/extractor.rs +++ b/src/extractor.rs @@ -30,7 +30,8 @@ impl Extractor { self.readability.parse(&self.url)?; if let Some(article_node_ref) = &self.readability.article_node { let template = r#" - + + diff --git a/src/html.rs b/src/html.rs new file mode 100644 index 0000000..a26fe85 --- /dev/null +++ b/src/html.rs @@ -0,0 +1,391 @@ +use std::{ + collections::{BTreeMap, HashSet}, + fs::{self, File}, + path::Path, +}; + +use base64::encode; +use comfy_table::{Attribute, Cell, CellAlignment, Color, ContentArrangement, Table}; +use html5ever::{LocalName, Namespace, QualName}; +use indicatif::{ProgressBar, ProgressStyle}; +use 
kuchiki::{traits::*, NodeRef}; +use log::{debug, error, info}; + +use crate::{ + cli::{self, AppConfig}, + errors::PaperoniError, + extractor::Extractor, + moz_readability::MetaData, +}; + +const HEAD_ELEM_NOT_FOUND: &str = + "Unable to get element to inline css. Ensure that the root node is the HTML document."; +const BASE_HTML_TEMPLATE: &str = r#" + + + + + +"#; + +pub fn generate_html_exports( + articles: Vec, + app_config: &AppConfig, + successful_articles_table: &mut Table, +) -> Result<(), Vec> { + if articles.is_empty() { + return Ok(()); + } + + let bar = if app_config.can_disable_progress_bar { + ProgressBar::hidden() + } else { + let enabled_bar = ProgressBar::new(articles.len() as u64); + let style = ProgressStyle::default_bar().template( + "{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} html {pos}/{len:7} {msg:.green}", + ); + enabled_bar.set_style(style); + if !articles.is_empty() { + enabled_bar.set_message("Generating html files"); + } + enabled_bar + }; + + let mut errors: Vec = Vec::new(); + + match app_config.merged { + Some(ref name) => { + successful_articles_table.set_header(vec![Cell::new("Table of Contents") + .add_attribute(Attribute::Bold) + .set_alignment(CellAlignment::Center) + .fg(Color::Green)]); + + debug!("Creating {:?}", name); + + let base_html_elem = kuchiki::parse_html().one(BASE_HTML_TEMPLATE); + let body_elem = base_html_elem.select_first("body").unwrap(); + let base_path = Path::new(app_config.output_directory.as_deref().unwrap_or(".")); + let img_dirs_path_name = name.trim_end_matches(".html"); + let imgs_dir_path = base_path.join(img_dirs_path_name); + + if !(app_config.is_inlining_images || imgs_dir_path.exists()) { + info!("Creating imgs dir in {:?} for {}", imgs_dir_path, name); + if let Err(e) = std::fs::create_dir(&imgs_dir_path) { + error!("Unable to create imgs dir for HTML file"); + let err: PaperoniError = e.into(); + errors.push(err); + return Err(errors); + }; + } + + for (idx, article) in articles.iter().enumerate() { + let article_elem = article + .article() + .select_first("div[id=\"readability-page-1\"]") + .unwrap(); + + let title = article.metadata().title(); + + let mut elem_attr = article_elem.attributes.borrow_mut(); + if let Some(id_attr) = elem_attr.get_mut("id") { + *id_attr = format!("readability-page-{}", idx); + } + + for (img_url, mime_type_opt) in &article.img_urls { + if app_config.is_inlining_images { + info!("Inlining images for {}", title); + let result = update_imgs_base64( + article, + img_url, + mime_type_opt.as_deref().unwrap_or("image/*"), + ); + + if let Err(e) = result { + let mut err: PaperoniError = e.into(); + err.set_article_source(title); + error!("Unable to copy images to imgs dir for {}", title); + errors.push(err); + } + + info!("Completed inlining images for {}", title); + } else { + info!("Copying images to imgs dir for {}", title); + let result = update_img_urls(article, &imgs_dir_path).map_err(|e| { + let mut err: PaperoniError = e.into(); + err.set_article_source(title); + err + }); + if let Err(e) = result { + error!("Unable to copy images to imgs dir for {}", title); + errors.push(e); + } else { + info!("Successfully copied images to imgs dir for {}", title); + } + } + } + bar.inc(1); + successful_articles_table.add_row(vec![title]); + body_elem.as_node().append(article_elem.as_node().clone()); + debug!("Added {} to the export HTML file", title); + } + + insert_title_elem(&base_html_elem, name); + insert_appendix( + &base_html_elem, + articles + .iter() + .map(|article| 
(article.metadata(), article.url.as_str())) + .collect(), + ); + inline_css(&base_html_elem, app_config); + + info!("Added title, footer and inlined styles for {}", name); + + info!("Creating export HTML file: {}", name); + if let Err(mut err) = File::create(name) + .and_then(|mut out_file| base_html_elem.serialize(&mut out_file)) + .map_err(|e| -> PaperoniError { e.into() }) + { + error!("Failed to serialize articles to file: {}", name); + err.set_article_source(&name); + errors.push(err); + bar.finish_with_message("html generation failed"); + return Err(errors); + }; + + bar.finish_with_message("Generated html file\n"); + debug!("Created {:?}", name); + println!("Created {:?}", name); + } + None => { + successful_articles_table + .set_header(vec![Cell::new("Downloaded articles") + .add_attribute(Attribute::Bold) + .set_alignment(CellAlignment::Center) + .fg(Color::Green)]) + .set_content_arrangement(ContentArrangement::Dynamic); + + let mut file_names: HashSet = HashSet::new(); + + for article in &articles { + let mut file_name = format!( + "{}/{}.html", + app_config.output_directory.as_deref().unwrap_or("."), + article + .metadata() + .title() + .replace("/", " ") + .replace("\\", " ") + ); + + if file_names.contains(&file_name) { + info!("Article name {:?} already exists", file_name); + file_name = format!( + "{}/{}_{}.html", + app_config.output_directory.as_deref().unwrap_or("."), + article + .metadata() + .title() + .replace("/", " ") + .replace("\\", " "), + file_names.len() + ); + info!("Renamed to {:?}", file_name); + } + file_names.insert(file_name.clone()); + + debug!("Creating {:?}", file_name); + let export_article = || -> Result<(), PaperoniError> { + let mut out_file = File::create(&file_name)?; + + if app_config.is_inlining_images { + for (img_url, mime_type_opt) in &article.img_urls { + update_imgs_base64( + article, + img_url, + mime_type_opt.as_deref().unwrap_or("image/*"), + )? 
+ } + } else { + let base_path = + Path::new(app_config.output_directory.as_deref().unwrap_or(".")); + let imgs_dir_name = article.metadata().title(); + + if !base_path.join(imgs_dir_name).exists() { + std::fs::create_dir(base_path.join(imgs_dir_name))?; + } + + let imgs_dir_path = base_path.join(imgs_dir_name); + update_img_urls(article, &imgs_dir_path)?; + } + + let utf8_encoding = + NodeRef::new_element(create_qualname("meta"), BTreeMap::new()); + if let Some(elem_node) = utf8_encoding.as_element() { + let mut elem_attrs = elem_node.attributes.borrow_mut(); + elem_attrs.insert("charset", "UTF-8".into()); + } + + if let Ok(head_elem) = article.article().select_first("head") { + let head_elem_node = head_elem.as_node(); + head_elem_node.append(utf8_encoding); + }; + + insert_title_elem(article.article(), article.metadata().title()); + insert_appendix(article.article(), vec![(article.metadata(), &article.url)]); + inline_css(article.article(), app_config); + + article.article().serialize(&mut out_file)?; + Ok(()) + }; + + if let Err(mut err) = export_article() { + err.set_article_source(&article.url); + errors.push(err); + } + debug!("Created {:?}", file_name); + + bar.inc(1); + successful_articles_table.add_row(vec![article.metadata().title()]); + } + bar.finish_with_message("Generated HTML files\n"); + } + } + + if errors.is_empty() { + Ok(()) + } else { + Err(errors) + } +} + +fn create_qualname(name: &str) -> QualName { + QualName::new( + None, + Namespace::from("http://www.w3.org/1999/xhtml"), + LocalName::from(name), + ) +} + +/// Updates the src attribute of `` elements with a base64 encoded string of the image data +fn update_imgs_base64( + article: &Extractor, + img_url: &str, + mime_type: &str, +) -> Result<(), std::io::Error> { + let temp_dir = std::env::temp_dir(); + let img_path = temp_dir.join(img_url); + let img_bytes = std::fs::read(img_path)?; + let img_base64_str = format!("data:image:{};base64,{}", mime_type, encode(img_bytes)); + + let img_elems = article + .article() + .select(&format!("img[src=\"{}\"]", img_url)) + .unwrap(); + for img_elem in img_elems { + let mut img_attr = img_elem.attributes.borrow_mut(); + if let Some(src_attr) = img_attr.get_mut("src") { + *src_attr = img_base64_str.clone(); + } + } + Ok(()) +} + +/// Updates the src attribute of `` elements to the new `imgs_dir_path` and copies the image to the new file location +fn update_img_urls(article: &Extractor, imgs_dir_path: &Path) -> Result<(), std::io::Error> { + let temp_dir = std::env::temp_dir(); + for (img_url, _) in &article.img_urls { + let (from, to) = (temp_dir.join(img_url), imgs_dir_path.join(img_url)); + info!("Copying {:?} to {:?}", from, to); + fs::copy(from, to)?; + let img_elems = article + .article() + .select(&format!("img[src=\"{}\"]", img_url)) + .unwrap(); + for img_elem in img_elems { + let mut img_attr = img_elem.attributes.borrow_mut(); + if let Some(src_attr) = img_attr.get_mut("src") { + *src_attr = imgs_dir_path.join(img_url).to_str().unwrap().into(); + } + } + } + Ok(()) +} + +/// Creates a `` element in an HTML document with the value set to the article's title +fn insert_title_elem(root_node: &NodeRef, title: &str) { + let title_content = NodeRef::new_text(title); + let title_elem = NodeRef::new_element(create_qualname("title"), BTreeMap::new()); + title_elem.append(title_content); + match root_node.select_first("head") { + Ok(head_elem) => { + head_elem.as_node().append(title_elem); + } + Err(_) => { + debug!("{}", HEAD_ELEM_NOT_FOUND); + let html_elem = 
root_node.select_first("html").unwrap(); + let head_elem = NodeRef::new_element(create_qualname("head"), BTreeMap::new()); + head_elem.append(title_elem); + html_elem.as_node().prepend(head_elem); + } + } +} + +/// Creates the appendix in an HTML document where article sources are added in a `<footer>` element +fn insert_appendix(root_node: &NodeRef, article_links: Vec<(&MetaData, &str)>) { + let link_tags: String = article_links + .iter() + .map(|(meta_data, url)| { + let article_name = if !meta_data.title().is_empty() { + meta_data.title() + } else { + url + }; + format!("<a href=\"{}\">{}</a><br></br>", url, article_name) + }) + .collect(); + let footer_inner_html = format!("<h2>Appendix</h2><h2>Article sources</h3>{}", link_tags); + let footer_elem = + kuchiki::parse_fragment(create_qualname("footer"), Vec::new()).one(footer_inner_html); + root_node.append(footer_elem); +} + +/// Inlines the CSS stylesheets into the HTML article node +fn inline_css(root_node: &NodeRef, app_config: &AppConfig) { + let body_stylesheet = include_str!("./assets/body.min.css"); + let header_stylesheet = include_str!("./assets/headers.min.css"); + let mut css_str = String::new(); + match app_config.css_config { + cli::CSSConfig::NoHeaders => { + css_str.push_str(body_stylesheet); + } + cli::CSSConfig::All => { + css_str.push_str(body_stylesheet); + css_str.push_str(header_stylesheet); + } + cli::CSSConfig::None => { + return; + } + } + let css_html_str = format!("<style>{}</style>", css_str); + let style_container = + kuchiki::parse_fragment(create_qualname("div"), Vec::new()).one(css_html_str); + let style_elem = style_container.select_first("style").unwrap(); + match root_node.select_first("head") { + Ok(head_elem) => { + head_elem.as_node().prepend(style_elem.as_node().to_owned()); + } + Err(_) => { + debug!("{}", HEAD_ELEM_NOT_FOUND); + let html_elem = root_node.select_first("html").unwrap(); + let head_elem = NodeRef::new_element(create_qualname("head"), BTreeMap::new()); + head_elem.prepend(style_elem.as_node().to_owned()); + html_elem.as_node().prepend(head_elem); + } + } + + // Remove the <link> of the stylesheet since styles are now inlined + if let Ok(style_link_elem) = root_node.select_first("link[href=\"stylesheet.css\"]") { + style_link_elem.as_node().detach(); + }; +} diff --git a/src/logs.rs b/src/logs.rs index a0f51d8..7ce58dc 100644 --- a/src/logs.rs +++ b/src/logs.rs @@ -11,7 +11,7 @@ use crate::errors::PaperoniError; pub fn display_summary( initial_article_count: usize, - succesful_articles_table: Table, + successful_articles_table: Table, partial_downloads: Vec<PartialDownload>, errors: Vec<PaperoniError>, ) { @@ -31,7 +31,7 @@ pub fn display_summary( ); if successfully_downloaded_count > 0 { - println!("{}", succesful_articles_table); + println!("{}", successful_articles_table); } if partial_downloads_count > 0 { diff --git a/src/main.rs b/src/main.rs index e378115..925beca 100644 --- a/src/main.rs +++ b/src/main.rs @@ -12,6 +12,7 @@ mod cli; mod epub; mod errors; mod extractor; +mod html; /// This module is responsible for async HTTP calls for downloading /// the HTML content and images mod http; @@ -20,6 +21,7 @@ mod moz_readability; use cli::AppConfig; use epub::generate_epubs; +use html::generate_html_exports; use logs::display_summary; fn main() { @@ -64,22 +66,33 @@ fn run(app_config: AppConfig) { let articles = download(&app_config, &bar, &mut partial_downloads, &mut errors); bar.finish_with_message("Downloaded articles"); - let mut succesful_articles_table = Table::new(); - 
succesful_articles_table + let mut successful_articles_table = Table::new(); + successful_articles_table .load_preset(UTF8_FULL) .load_preset(UTF8_HORIZONTAL_BORDERS_ONLY) .set_content_arrangement(ContentArrangement::Dynamic); - match generate_epubs(articles, &app_config, &mut succesful_articles_table) { - Ok(_) => (), - Err(gen_epub_errors) => { - errors.extend(gen_epub_errors); + + match app_config.export_type { + cli::ExportType::EPUB => { + match generate_epubs(articles, &app_config, &mut successful_articles_table) { + Ok(_) => (), + Err(gen_epub_errors) => { + errors.extend(gen_epub_errors); + } + }; } - }; + cli::ExportType::HTML => { + match generate_html_exports(articles, &app_config, &mut successful_articles_table) { + Ok(_) => (), + Err(gen_html_errors) => errors.extend(gen_html_errors), + } + } + } let has_errors = !errors.is_empty() || !partial_downloads.is_empty(); display_summary( app_config.urls.len(), - succesful_articles_table, + successful_articles_table, partial_downloads, errors, ); From eac28da798569989175a44de1947dfdeff0f454a Mon Sep 17 00:00:00 2001 From: Kenneth Gitere <gitere81@gmail.com> Date: Sat, 24 Jul 2021 12:35:30 +0300 Subject: [PATCH 4/6] fix: add validation when passing `--inline-toc` feat: add coloring when displaying CLI errors --- src/cli.rs | 12 +++++++++++- src/errors.rs | 2 ++ src/main.rs | 3 ++- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index 67f8ace..12a6357 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -119,7 +119,17 @@ impl<'a> TryFrom<ArgMatches<'a>> for AppConfig { 4..=u64::MAX => LogLevel::Debug, }) .is_logging_to_file(arg_matches.is_present("log-to-file")) - .inline_toc(arg_matches.is_present("inline-toc")) + .inline_toc( + (if arg_matches.is_present("inline-toc") { + if arg_matches.value_of("export") == Some("epub") { + Ok(true) + } else { + Err(Error::WrongExportInliningToC) + } + } else { + Ok(false) + })?, + ) .output_directory( arg_matches .value_of("output_directory") diff --git a/src/errors.rs b/src/errors.rs index 17ae34a..8fbfeea 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -156,4 +156,6 @@ pub enum CliError<BuilderError: Debug + Display> { OutputDirectoryNotExists, #[error("Unable to start logger!\n{0}")] LogError(#[from] LogError), + #[error("The --inline-toc can only be used exporting to epub")] + WrongExportInliningToC, } diff --git a/src/main.rs b/src/main.rs index 925beca..d82ac95 100644 --- a/src/main.rs +++ b/src/main.rs @@ -3,6 +3,7 @@ extern crate lazy_static; use std::process::exit; +use colored::Colorize; use comfy_table::presets::{UTF8_FULL, UTF8_HORIZONTAL_BORDERS_ONLY}; use comfy_table::{ContentArrangement, Table}; use http::download; @@ -28,7 +29,7 @@ fn main() { let app_config = match cli::AppConfig::init_with_cli() { Ok(app_config) => app_config, Err(err) => { - eprintln!("{}", err); + eprintln!("{}: {}", "ERROR".bold().bright_red(), err); exit(1); } }; From e6f901eb5a8bd2bdfc95fe4424dddb2b3c6d8ff9 Mon Sep 17 00:00:00 2001 From: Kenneth Gitere <gitere81@gmail.com> Date: Sat, 24 Jul 2021 12:43:40 +0300 Subject: [PATCH 5/6] refactor: rename `Extractor` to `Article` --- src/epub.rs | 14 +++++++------- src/extractor.rs | 28 ++++++++++++++-------------- src/html.rs | 24 ++++++++++++------------ src/http.rs | 10 +++++----- 4 files changed, 38 insertions(+), 38 deletions(-) diff --git a/src/epub.rs b/src/epub.rs index 8c280f1..d589ff4 100644 --- a/src/epub.rs +++ b/src/epub.rs @@ -8,7 +8,7 @@ use indicatif::{ProgressBar, ProgressStyle}; use kuchiki::NodeRef; use log::{debug, 
error, info}; -use crate::{cli::AppConfig, errors::PaperoniError, extractor::Extractor}; +use crate::{cli::AppConfig, errors::PaperoniError, extractor::Article}; lazy_static! { static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r#"(&|<|>|'|")"#).unwrap(); @@ -16,7 +16,7 @@ lazy_static! { } pub fn generate_epubs( - articles: Vec<Extractor>, + articles: Vec<Article>, app_config: &AppConfig, successful_articles_table: &mut Table, ) -> Result<(), Vec<PaperoniError>> { @@ -88,9 +88,9 @@ pub fn generate_epubs( let content_url = format!("article_{}.xhtml", idx); let mut xhtml_buf = Vec::new(); let header_level_tocs = - get_header_level_toc_vec(&content_url, article.article()); + get_header_level_toc_vec(&content_url, article.node_ref()); - serialize_to_xhtml(article.article(), &mut xhtml_buf)?; + serialize_to_xhtml(article.node_ref(), &mut xhtml_buf)?; let xhtml_str = std::str::from_utf8(&xhtml_buf)?; let section_name = article.metadata().title(); let mut content = EpubContent::new(&content_url, xhtml_str.as_bytes()) @@ -179,8 +179,8 @@ pub fn generate_epubs( let mut out_file = File::create(&file_name).unwrap(); let mut xhtml_buf = Vec::new(); let header_level_tocs = - get_header_level_toc_vec("index.xhtml", article.article()); - serialize_to_xhtml(article.article(), &mut xhtml_buf) + get_header_level_toc_vec("index.xhtml", article.node_ref()); + serialize_to_xhtml(article.node_ref(), &mut xhtml_buf) .expect("Unable to serialize to xhtml"); let xhtml_str = std::str::from_utf8(&xhtml_buf).unwrap(); @@ -269,7 +269,7 @@ fn add_stylesheets<T: epub_builder::Zip>( } //TODO: The type signature of the argument should change as it requires that merged articles create an entirely new Vec of references -fn generate_appendix(articles: Vec<&Extractor>) -> String { +fn generate_appendix(articles: Vec<&Article>) -> String { let link_tags: String = articles .iter() .map(|article| { diff --git a/src/extractor.rs b/src/extractor.rs index 9df5168..b16373a 100644 --- a/src/extractor.rs +++ b/src/extractor.rs @@ -6,18 +6,18 @@ use crate::moz_readability::{MetaData, Readability}; pub type ResourceInfo = (String, Option<String>); -pub struct Extractor { - article: Option<NodeRef>, +pub struct Article { + node_ref_opt: Option<NodeRef>, pub img_urls: Vec<ResourceInfo>, readability: Readability, pub url: String, } -impl Extractor { +impl Article { /// Create a new instance of an HTML extractor given an HTML string pub fn from_html(html_str: &str, url: &str) -> Self { - Extractor { - article: None, + Self { + node_ref_opt: None, img_urls: Vec::new(), readability: Readability::new(html_str), url: url.to_string(), @@ -42,14 +42,14 @@ impl Extractor { let doc = kuchiki::parse_html().one(template); let body = doc.select_first("body").unwrap(); body.as_node().append(article_node_ref.clone()); - self.article = Some(doc); + self.node_ref_opt = Some(doc); } Ok(()) } /// Traverses the DOM tree of the content and retrieves the IMG URLs pub fn extract_img_urls(&mut self) { - if let Some(content_ref) = &self.article { + if let Some(content_ref) = &self.node_ref_opt { self.img_urls = content_ref .select("img") .unwrap() @@ -67,8 +67,8 @@ impl Extractor { } /// Returns the extracted article [NodeRef]. It should only be called *AFTER* calling parse - pub fn article(&self) -> &NodeRef { - self.article.as_ref().expect( + pub fn node_ref(&self) -> &NodeRef { + self.node_ref_opt.as_ref().expect( "Article node doesn't exist. 
This may be because the document has not been parsed", ) } @@ -112,16 +112,16 @@ mod test { #[test] fn test_extract_img_urls() { - let mut extractor = Extractor::from_html(TEST_HTML, "http://example.com/"); - extractor + let mut article = Article::from_html(TEST_HTML, "http://example.com/"); + article .extract_content() .expect("Article extraction failed unexpectedly"); - extractor.extract_img_urls(); + article.extract_img_urls(); - assert!(extractor.img_urls.len() > 0); + assert!(article.img_urls.len() > 0); assert_eq!( vec![("http://example.com/img.jpg".to_string(), None)], - extractor.img_urls + article.img_urls ); } } diff --git a/src/html.rs b/src/html.rs index a26fe85..7b761d2 100644 --- a/src/html.rs +++ b/src/html.rs @@ -14,7 +14,7 @@ use log::{debug, error, info}; use crate::{ cli::{self, AppConfig}, errors::PaperoniError, - extractor::Extractor, + extractor::Article, moz_readability::MetaData, }; @@ -29,7 +29,7 @@ const BASE_HTML_TEMPLATE: &str = r#"<!DOCTYPE html> </html>"#; pub fn generate_html_exports( - articles: Vec<Extractor>, + articles: Vec<Article>, app_config: &AppConfig, successful_articles_table: &mut Table, ) -> Result<(), Vec<PaperoniError>> { @@ -80,7 +80,7 @@ pub fn generate_html_exports( for (idx, article) in articles.iter().enumerate() { let article_elem = article - .article() + .node_ref() .select_first("div[id=\"readability-page-1\"]") .unwrap(); @@ -226,16 +226,16 @@ pub fn generate_html_exports( elem_attrs.insert("charset", "UTF-8".into()); } - if let Ok(head_elem) = article.article().select_first("head") { + if let Ok(head_elem) = article.node_ref().select_first("head") { let head_elem_node = head_elem.as_node(); head_elem_node.append(utf8_encoding); }; - insert_title_elem(article.article(), article.metadata().title()); - insert_appendix(article.article(), vec![(article.metadata(), &article.url)]); - inline_css(article.article(), app_config); + insert_title_elem(article.node_ref(), article.metadata().title()); + insert_appendix(article.node_ref(), vec![(article.metadata(), &article.url)]); + inline_css(article.node_ref(), app_config); - article.article().serialize(&mut out_file)?; + article.node_ref().serialize(&mut out_file)?; Ok(()) }; @@ -269,7 +269,7 @@ fn create_qualname(name: &str) -> QualName { /// Updates the src attribute of `<img>` elements with a base64 encoded string of the image data fn update_imgs_base64( - article: &Extractor, + article: &Article, img_url: &str, mime_type: &str, ) -> Result<(), std::io::Error> { @@ -279,7 +279,7 @@ fn update_imgs_base64( let img_base64_str = format!("data:image:{};base64,{}", mime_type, encode(img_bytes)); let img_elems = article - .article() + .node_ref() .select(&format!("img[src=\"{}\"]", img_url)) .unwrap(); for img_elem in img_elems { @@ -292,14 +292,14 @@ fn update_imgs_base64( } /// Updates the src attribute of `<img>` elements to the new `imgs_dir_path` and copies the image to the new file location -fn update_img_urls(article: &Extractor, imgs_dir_path: &Path) -> Result<(), std::io::Error> { +fn update_img_urls(article: &Article, imgs_dir_path: &Path) -> Result<(), std::io::Error> { let temp_dir = std::env::temp_dir(); for (img_url, _) in &article.img_urls { let (from, to) = (temp_dir.join(img_url), imgs_dir_path.join(img_url)); info!("Copying {:?} to {:?}", from, to); fs::copy(from, to)?; let img_elems = article - .article() + .node_ref() .select(&format!("img[src=\"{}\"]", img_url)) .unwrap(); for img_elem in img_elems { diff --git a/src/http.rs b/src/http.rs index 8707977..15cdb3c 100644 --- 
a/src/http.rs +++ b/src/http.rs @@ -9,7 +9,7 @@ use url::Url; use crate::cli::AppConfig; use crate::errors::{ErrorKind, ImgError, PaperoniError}; -use crate::extractor::Extractor; +use crate::extractor::Article; type HTMLResource = (String, String); pub fn download( @@ -17,7 +17,7 @@ pub fn download( bar: &ProgressBar, partial_downloads: &mut Vec<PartialDownload>, errors: &mut Vec<PaperoniError>, -) -> Vec<Extractor> { +) -> Vec<Article> { task::block_on(async { let urls_iter = app_config.urls.iter().map(|url| fetch_html(url)); let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn); @@ -26,7 +26,7 @@ pub fn download( match fetch_result { Ok((url, html)) => { debug!("Extracting {}", &url); - let mut extractor = Extractor::from_html(&html, &url); + let mut extractor = Article::from_html(&html, &url); bar.set_message("Extracting..."); match extractor.extract_content() { Ok(_) => { @@ -185,7 +185,7 @@ async fn process_img_response<'a>( } pub async fn download_images( - extractor: &mut Extractor, + extractor: &mut Article, article_origin: &Url, bar: &ProgressBar, ) -> Result<(), Vec<ImgError>> { @@ -237,7 +237,7 @@ pub async fn download_images( let replace_existing_img_src = |img_item: ImgItem| -> (String, Option<String>) { let (img_url, img_path, img_mime) = img_item; let img_ref = extractor - .article() + .node_ref() .select_first(&format!("img[src='{}']", img_url)) .expect("Image node does not exist"); let mut img_node = img_ref.attributes.borrow_mut(); From 40cf5b06c95005cd004b47454cc57d9ce0eb55cc Mon Sep 17 00:00:00 2001 From: Kenneth Gitere <gitere81@gmail.com> Date: Sat, 24 Jul 2021 13:29:14 +0300 Subject: [PATCH 6/6] chore: update README chore: bump version --- Cargo.lock | 2 +- Cargo.toml | 2 +- README.md | 108 ++++++++++++++++++++++++++++++++++++++------- src/cli_config.yml | 2 +- 4 files changed, 95 insertions(+), 19 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2824333..a40cb8e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1552,7 +1552,7 @@ dependencies = [ [[package]] name = "paperoni" -version = "0.5.0-alpha1" +version = "0.6.0-alpha1" dependencies = [ "async-std", "base64", diff --git a/Cargo.toml b/Cargo.toml index fced683..ec637c9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ description = "A web article downloader" homepage = "https://github.com/hipstermojo/paperoni" repository = "https://github.com/hipstermojo/paperoni" name = "paperoni" -version = "0.5.0-alpha1" +version = "0.6.0-alpha1" authors = ["Kenneth Gitere <gitere81@gmail.com>"] edition = "2018" license = "MIT" diff --git a/README.md b/README.md index 5c547b0..8f08536 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ </a> </div> -Paperoni is a CLI tool made in Rust for downloading web articles as EPUBs. There is provisional<sup><a href="#pdf-exports">\*</a></sup> support for exporting to PDF as well. +Paperoni is a CLI tool made in Rust for downloading web articles as EPUB or HTML files. There is provisional<sup><a href="#pdf-exports">\*</a></sup> support for exporting to PDF as well. > This project is in an alpha release so it might crash when you use it. Please open an [issue on Github](https://github.com/hipstermojo/paperoni/issues/new) if it does crash. @@ -23,7 +23,7 @@ Check the [releases](https://github.com/hipstermojo/paperoni/releases) page for Paperoni is published on [crates.io](https://crates.io). 
If you have [cargo](https://github.com/rust-lang/cargo) installed, then run:

```sh
-cargo install paperoni --version 0.5.0-alpha1
+cargo install paperoni --version 0.6.0-alpha1
```

_Paperoni is still in alpha so the `version` flag has to be passed._

@@ -48,39 +48,54 @@
 USAGE:
     paperoni [OPTIONS] [urls]...

 OPTIONS:
+        --export <type>
+            Specify the file type of the export. The type must be in lower case. [default: epub]  [possible values:
+            html, epub]
     -f, --file <file>
             Input file containing links

     -h, --help
             Prints help information

+        --inline-images
+            Inlines the article images when exporting to HTML using base64.
+            This is used when you do not want a separate folder created for images during HTML export.
+            NOTE: It uses base64 encoding on the images which results in larger HTML export sizes as each image
+            increases in size by about 25%-33%.
         --inline-toc
-            Add an inlined Table of Contents page at the start of the merged article.
-
+            Add an inlined Table of Contents page at the start of the merged article. This does not affect the Table
             of Contents navigation

         --log-to-file
             Enables logging of events to a file located in .paperoni/logs with a default log level of debug. Use -v to
             specify the logging level

-        --max-conn <max_conn>
+        --max-conn <max-conn>
             The maximum number of concurrent HTTP connections when downloading articles. Default is 8.
             NOTE: It is advised to use as few connections as needed i.e between 1 and 50. Using more connections can
             end up overloading your network card with too many concurrent requests.
+        --no-css
+            Removes the stylesheets used in the EPUB generation.
+            The EPUB file will then be laid out based on your e-reader's default stylesheets.
+            Images and code blocks may overflow when this flag is set and layout of generated
+            PDFs will be affected. Use --no-header-css if you want to only disable the styling on headers.
+        --no-header-css
+            Removes the header CSS styling but preserves styling of images and code blocks. To remove all the default
+            CSS, use --no-css instead.
-        --merge <output_name>
+        --merge <output-name>
             Merge multiple articles into a single epub that will be given the name provided

-    -o, --output-dir <output_directory>
-            Directory for saving epub documents
-
+    -o, --output-dir <output_directory>
+            Directory to store output epub documents
+
     -V, --version
             Prints version information

     -v
             This takes upto 4 levels of verbosity in the following order.
-              - Error (-v)
-              - Warn (-vv)
-              - Info (-vvv)
-              - Debug (-vvvv)
-              When this flag is passed, it disables the progress bars and logs to stderr.
-              If you would like to send the logs to a file (and enable progress bars), pass the log-to-file flag.
+            - Error (-v)
+            - Warn (-vv)
+            - Info (-vvv)
+            - Debug (-vvvv)
+            When this flag is passed, it disables the progress bars and logs to stderr.
+            If you would like to send the logs to a file (and enable progress bars), pass the log-to-file flag.

 ARGS:
     <urls>...
@@ -112,6 +127,41 @@
 These can also be read from a file using the `-f/--file` flag.

 ```sh
 paperoni -f links.txt
 ```

+### Exporting articles
+
+By default, Paperoni exports to EPUB files but you can change to HTML by passing the `--export html` flag.
+
+```sh
+paperoni https://en.wikipedia.org/wiki/Pepperoni --export html
+```
+
+HTML exports allow you to read the articles as plain HTML documents on your browser but can also be used to convert to PDF as explained [here](#html-to-pdf).
+
+When exporting to HTML, Paperoni will download the article's images to a folder named after the article.
+Therefore the folder structure would look like this for the command run above:
+
+```
+.
+├── Pepperoni - Wikipedia
+│   ├── 1a9f886e9b58db72e0003a2cd52681d8.png
+│   ├── 216f8a4265a1ceb3f8cfba4c2f9057b1.jpeg
+│   ...
+└── Pepperoni - Wikipedia.html
+```
+
+If you would instead prefer to have the images inlined directly into the HTML export, pass the `--inline-images` flag, i.e.:
+
+```sh
+paperoni https://en.wikipedia.org/wiki/Pepperoni --export html --inline-images
+```
+
+This is especially useful when exporting multiple links.
+
+**NOTE**: The inlining of images for HTML exports uses base64 encoding which is known to increase the overall size of images by about 25% to 33%.
+
+### Disabling CSS
+
+The `no-css` and `no-header-css` flags can be used to remove the default styling added by Paperoni. Refer to `--help` to see the usage of the flags.
+
 ### Merging articles

 By default, Paperoni generates an epub file for each link. You can also merge multiple links
@@ -153,7 +203,11 @@
 There are also web pages it won't work on in general such as Twitter and Reddit.

 ## PDF exports

-As of version 0.5-alpha1, you can now export to PDF using a third party tool. This requires that you install [Calibre](https://calibre-ebook.com/) which comes with a ebook conversion. You can convert the epub to a pdf through the terminal with `ebook-convert`:
+PDF conversion can be done using a third party tool. There are two options to do so:
+
+### EPUB to PDF
+
+This requires that you install [Calibre](https://calibre-ebook.com/) which comes with an ebook conversion tool. You can convert the epub to a pdf through the terminal with `ebook-convert`:

 ```sh
 # Assuming the downloaded epub was called foo.epub
 ebook-convert foo.epub foo.pdf
 ```

 Alternatively, you can use the Calibre GUI to do the file conversion.
+
+### HTML to PDF
+
+The recommended approach is to use [Weasyprint](https://weasyprint.org/start/), a free and open-source tool that converts HTML documents to PDF. It is available on Linux, MacOS and Windows. Using the CLI, it can be done as follows:
+
+```sh
+paperoni https://en.wikipedia.org/wiki/Pepperoni --export html
+weasyprint "Pepperoni - Wikipedia.html" Pepperoni.pdf
+```
+
+Inlining images is not mandatory as Weasyprint will be able to find the files on its own.
+
+### Comparison of PDF conversion methods
+
+Either of the conversion methods is sufficient for most use cases. The main differences are listed below:
+
+|                      | EPUB to PDF     | HTML to PDF      |
+| -------------------- | --------------- | ---------------- |
+| Wrapping code blocks | Yes             | No               |
+| CSS customization    | No              | Yes              |
+| Generated file size  | Slightly larger | Slightly smaller |
+
+The difference in file size is due to the additional fonts added to the PDF file by `ebook-convert`.
diff --git a/src/cli_config.yml b/src/cli_config.yml
index 88be611..4f86d52 100644
--- a/src/cli_config.yml
+++ b/src/cli_config.yml
@@ -49,7 +49,7 @@ args:
       long: inline-toc
       requires: output-name
       help: Add an inlined Table of Contents page at the start of the merged article.
-      long_help: Add an inlined Table of Contents page at the start of the merged article. This does not affect the Table of Contents navigation"
+      long_help: Add an inlined Table of Contents page at the start of the merged article. This does not affect the Table of Contents navigation
   - no-css:
       long: no-css
       conflicts_with: no-header-css