From 9c2232e37f25815d7335211f50a0e4c404257c8a Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Tue, 27 Jul 2021 17:08:58 +0300 Subject: [PATCH 1/5] fix: add validation when passing inline-images flag --- src/cli.rs | 12 +++++++++++- src/errors.rs | 4 +++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index 12a6357..9578a49 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -164,7 +164,17 @@ impl<'a> TryFrom> for AppConfig { ExportType::EPUB } }) - .is_inlining_images(arg_matches.is_present("inline-images")) + .is_inlining_images( + (if arg_matches.is_present("inline-images") { + if arg_matches.value_of("export") == Some("html") { + Ok(true) + } else { + Err(Error::WrongExportInliningImages) + } + } else { + Ok(false) + })?, + ) .try_init() } } diff --git a/src/errors.rs b/src/errors.rs index 8fbfeea..5b1cc25 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -156,6 +156,8 @@ pub enum CliError { OutputDirectoryNotExists, #[error("Unable to start logger!\n{0}")] LogError(#[from] LogError), - #[error("The --inline-toc can only be used exporting to epub")] + #[error("The --inline-toc flag can only be used when exporting to epub")] WrongExportInliningToC, + #[error("The --inline-images flag can only be used when exporting to html")] + WrongExportInliningImages, } From 0357eaebb602ffbb726d68429ce5b5c8f9fd866c Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Tue, 27 Jul 2021 18:39:00 +0300 Subject: [PATCH 2/5] fix: fix insert_appendix function when inserting HTML nodes refactor: remove check for `` in inline_css The `` element is automatically added when parsing an HTML document, therefore, the program should panic if it still does not find the `` element --- src/html.rs | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/src/html.rs b/src/html.rs index 7b761d2..d6b26fd 100644 --- a/src/html.rs +++ b/src/html.rs @@ -344,10 +344,15 @@ fn insert_appendix(root_node: &NodeRef, article_links: Vec<(&MetaData, &str)>) { format!("{}

", url, article_name) }) .collect(); - let footer_inner_html = format!("

Appendix

Article sources

{}", link_tags); - let footer_elem = - kuchiki::parse_fragment(create_qualname("footer"), Vec::new()).one(footer_inner_html); - root_node.append(footer_elem); + let footer_inner_html = format!( + "

Appendix

Article sources

{}
", + link_tags + ); + let footer_container = + kuchiki::parse_fragment(create_qualname("div"), Vec::new()).one(footer_inner_html); + let footer_elem = footer_container.select_first("footer").unwrap(); + + root_node.append(footer_elem.as_node().clone()); } /// Inlines the CSS stylesheets into the HTML article node @@ -371,18 +376,9 @@ fn inline_css(root_node: &NodeRef, app_config: &AppConfig) { let style_container = kuchiki::parse_fragment(create_qualname("div"), Vec::new()).one(css_html_str); let style_elem = style_container.select_first("style").unwrap(); - match root_node.select_first("head") { - Ok(head_elem) => { - head_elem.as_node().prepend(style_elem.as_node().to_owned()); - } - Err(_) => { - debug!("{}", HEAD_ELEM_NOT_FOUND); - let html_elem = root_node.select_first("html").unwrap(); - let head_elem = NodeRef::new_element(create_qualname("head"), BTreeMap::new()); - head_elem.prepend(style_elem.as_node().to_owned()); - html_elem.as_node().prepend(head_elem); - } - } + let head_elem = root_node.select_first("head").expect(HEAD_ELEM_NOT_FOUND); + head_elem.as_node().prepend(style_elem.as_node().to_owned()); +} // Remove the of the stylesheet since styles are now inlined if let Ok(style_link_elem) = root_node.select_first("link[href=\"stylesheet.css\"]") { From 0b19376f59b2275422a717f0fda7c3dcb572d6c3 Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Tue, 27 Jul 2021 18:43:08 +0300 Subject: [PATCH 3/5] test: add tests for html module --- src/html.rs | 120 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 114 insertions(+), 6 deletions(-) diff --git a/src/html.rs b/src/html.rs index d6b26fd..b2425bf 100644 --- a/src/html.rs +++ b/src/html.rs @@ -12,7 +12,7 @@ use kuchiki::{traits::*, NodeRef}; use log::{debug, error, info}; use crate::{ - cli::{self, AppConfig}, + cli::{self, AppConfig, CSSConfig}, errors::PaperoniError, extractor::Article, moz_readability::MetaData, @@ -137,7 +137,8 @@ pub fn generate_html_exports( .map(|article| (article.metadata(), article.url.as_str())) .collect(), ); - inline_css(&base_html_elem, app_config); + inline_css(&base_html_elem, &app_config.css_config); + remove_existing_stylesheet_link(&base_html_elem); info!("Added title, footer and inlined styles for {}", name); @@ -233,7 +234,8 @@ pub fn generate_html_exports( insert_title_elem(article.node_ref(), article.metadata().title()); insert_appendix(article.node_ref(), vec![(article.metadata(), &article.url)]); - inline_css(article.node_ref(), app_config); + inline_css(article.node_ref(), &app_config.css_config); + remove_existing_stylesheet_link(article.node_ref()); article.node_ref().serialize(&mut out_file)?; Ok(()) @@ -356,11 +358,11 @@ fn insert_appendix(root_node: &NodeRef, article_links: Vec<(&MetaData, &str)>) { } /// Inlines the CSS stylesheets into the HTML article node -fn inline_css(root_node: &NodeRef, app_config: &AppConfig) { +fn inline_css(root_node: &NodeRef, css_config: &CSSConfig) { let body_stylesheet = include_str!("./assets/body.min.css"); let header_stylesheet = include_str!("./assets/headers.min.css"); let mut css_str = String::new(); - match app_config.css_config { + match css_config { cli::CSSConfig::NoHeaders => { css_str.push_str(body_stylesheet); } @@ -380,8 +382,114 @@ fn inline_css(root_node: &NodeRef, app_config: &AppConfig) { head_elem.as_node().prepend(style_elem.as_node().to_owned()); } - // Remove the of the stylesheet since styles are now inlined +/// Removes the of the stylesheet. This is used when inlining styles +fn remove_existing_stylesheet_link(root_node: &NodeRef) { if let Ok(style_link_elem) = root_node.select_first("link[href=\"stylesheet.css\"]") { style_link_elem.as_node().detach(); }; } + +#[cfg(test)] +mod test { + use super::*; + #[test] + fn test_insert_title_elem() { + let title = "Sample title"; + let html_str = r#""#; + let doc = kuchiki::parse_html().one(html_str); + assert_eq!(0, doc.select("title").unwrap().count()); + + insert_title_elem(&doc, title); + assert_eq!(1, doc.select("title").unwrap().count()); + assert_eq!(title, doc.select_first("title").unwrap().text_contents()); + } + + #[test] + fn test_create_qualname() { + let name = "div"; + assert_eq!( + create_qualname(name), + QualName::new( + None, + Namespace::from("http://www.w3.org/1999/xhtml"), + LocalName::from(name) + ) + ); + } + + #[test] + fn test_inline_css() { + let html_str = r#" + + +

Lorem ipsum sample text goes here.

+ + "#; + let doc = kuchiki::parse_html().one(html_str); + let body_stylesheet = include_str!("./assets/body.min.css"); + let header_stylesheet = include_str!("./assets/headers.min.css"); + assert_eq!(0, doc.select("style").unwrap().count()); + + inline_css(&doc, &CSSConfig::None); + assert_eq!(0, doc.select("style").unwrap().count()); + + inline_css(&doc, &CSSConfig::NoHeaders); + assert_eq!(1, doc.select("style").unwrap().count()); + let style_elem = doc.select_first("style").unwrap(); + assert_eq!(body_stylesheet, style_elem.text_contents()); + + let doc = kuchiki::parse_html().one(html_str); + inline_css(&doc, &CSSConfig::All); + assert_eq!(1, doc.select("style").unwrap().count()); + let style_elem = doc.select_first("style").unwrap(); + assert_eq!( + format!("{}{}", body_stylesheet, header_stylesheet), + style_elem.text_contents() + ); + } + + #[test] + fn test_remove_existing_stylesheet_link() { + let html_str = r#" + +

Lorem ipsum sample text goes here.

"#; + let doc = kuchiki::parse_html().one(html_str); + assert_eq!(1, doc.select("link").unwrap().count()); + remove_existing_stylesheet_link(&doc); + assert_eq!(0, doc.select("link").unwrap().count()); + } + + #[test] + fn test_insert_appendix() { + let html_str = r#" + + +

Lorem ipsum sample text goes here.

+ + "#; + let doc = kuchiki::parse_html().one(html_str); + let meta_data = MetaData::new(); + + assert_eq!(0, doc.select("footer").unwrap().count()); + + insert_appendix(&doc, vec![(&meta_data, "http://example.org")]); + + assert_eq!(1, doc.select("footer").unwrap().count()); + assert_eq!(1, doc.select("footer > h2").unwrap().count()); + assert_eq!( + "Appendix", + doc.select_first("footer > h2").unwrap().text_contents() + ); + assert_eq!(1, doc.select("footer > h3").unwrap().count()); + assert_eq!( + "Article sources", + doc.select_first("footer > h3").unwrap().text_contents() + ); + assert_eq!(1, doc.select("a").unwrap().count()); + + let anchor_elem = doc.select_first("a").unwrap(); + assert_eq!("http://example.org", anchor_elem.text_contents()); + let anchor_attrs = anchor_elem.attributes.borrow(); + assert_eq!(Some("http://example.org"), anchor_attrs.get("href")); + } +} From 07479afeac8c7729ebddd95d6f45083768839c41 Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Wed, 28 Jul 2021 09:10:22 +0300 Subject: [PATCH 4/5] refactor: refactor `update_imgs_base64` chore: add doc comment on ResourceType alias fix: add error when image MIME type is invalid on an image --- src/extractor.rs | 1 + src/html.rs | 91 ++++++++++++++++++++++-------------------------- src/http.rs | 9 +++++ 3 files changed, 51 insertions(+), 50 deletions(-) diff --git a/src/extractor.rs b/src/extractor.rs index b16373a..fcd13a4 100644 --- a/src/extractor.rs +++ b/src/extractor.rs @@ -4,6 +4,7 @@ use kuchiki::{traits::*, NodeRef}; use crate::errors::PaperoniError; use crate::moz_readability::{MetaData, Readability}; +/// A tuple of the url and an Option of the resource's MIME type pub type ResourceInfo = (String, Option); pub struct Article { diff --git a/src/html.rs b/src/html.rs index b2425bf..b43df6a 100644 --- a/src/html.rs +++ b/src/html.rs @@ -91,38 +91,33 @@ pub fn generate_html_exports( *id_attr = format!("readability-page-{}", idx); } - for (img_url, mime_type_opt) in &article.img_urls { - if app_config.is_inlining_images { - info!("Inlining images for {}", title); - let result = update_imgs_base64( - article, - img_url, - mime_type_opt.as_deref().unwrap_or("image/*"), - ); + if app_config.is_inlining_images { + info!("Inlining images for {}", title); + let result = update_imgs_base64(article); - if let Err(e) = result { - let mut err: PaperoniError = e.into(); - err.set_article_source(title); - error!("Unable to copy images to imgs dir for {}", title); - errors.push(err); - } + if let Err(e) = result { + let mut err: PaperoniError = e.into(); + err.set_article_source(title); + error!("Unable to copy images to imgs dir for {}", title); + errors.push(err); + } - info!("Completed inlining images for {}", title); + info!("Completed inlining images for {}", title); + } else { + info!("Copying images to imgs dir for {}", title); + let result = update_img_urls(article, &imgs_dir_path).map_err(|e| { + let mut err: PaperoniError = e.into(); + err.set_article_source(title); + err + }); + if let Err(e) = result { + error!("Unable to copy images to imgs dir for {}", title); + errors.push(e); } else { - info!("Copying images to imgs dir for {}", title); - let result = update_img_urls(article, &imgs_dir_path).map_err(|e| { - let mut err: PaperoniError = e.into(); - err.set_article_source(title); - err - }); - if let Err(e) = result { - error!("Unable to copy images to imgs dir for {}", title); - errors.push(e); - } else { - info!("Successfully copied images to imgs dir for {}", title); - } + info!("Successfully copied images to imgs dir for {}", title); } } + bar.inc(1); successful_articles_table.add_row(vec![title]); body_elem.as_node().append(article_elem.as_node().clone()); @@ -200,13 +195,7 @@ pub fn generate_html_exports( let mut out_file = File::create(&file_name)?; if app_config.is_inlining_images { - for (img_url, mime_type_opt) in &article.img_urls { - update_imgs_base64( - article, - img_url, - mime_type_opt.as_deref().unwrap_or("image/*"), - )? - } + update_imgs_base64(article)?; } else { let base_path = Path::new(app_config.output_directory.as_deref().unwrap_or(".")); @@ -270,24 +259,26 @@ fn create_qualname(name: &str) -> QualName { } /// Updates the src attribute of `` elements with a base64 encoded string of the image data -fn update_imgs_base64( - article: &Article, - img_url: &str, - mime_type: &str, -) -> Result<(), std::io::Error> { +fn update_imgs_base64(article: &Article) -> Result<(), std::io::Error> { let temp_dir = std::env::temp_dir(); - let img_path = temp_dir.join(img_url); - let img_bytes = std::fs::read(img_path)?; - let img_base64_str = format!("data:image:{};base64,{}", mime_type, encode(img_bytes)); + for (img_url, mime_type) in &article.img_urls { + let img_path = temp_dir.join(img_url); + let img_bytes = std::fs::read(img_path)?; + let img_base64_str = format!( + "data:image:{};base64,{}", + mime_type.as_deref().unwrap_or("image/*"), + encode(img_bytes) + ); - let img_elems = article - .node_ref() - .select(&format!("img[src=\"{}\"]", img_url)) - .unwrap(); - for img_elem in img_elems { - let mut img_attr = img_elem.attributes.borrow_mut(); - if let Some(src_attr) = img_attr.get_mut("src") { - *src_attr = img_base64_str.clone(); + let img_elems = article + .node_ref() + .select(&format!("img[src=\"{}\"]", img_url)) + .unwrap(); + for img_elem in img_elems { + let mut img_attr = img_elem.attributes.borrow_mut(); + if let Some(src_attr) = img_attr.get_mut("src") { + *src_attr = img_base64_str.clone(); + } } } Ok(()) diff --git a/src/http.rs b/src/http.rs index 15cdb3c..1a1206d 100644 --- a/src/http.rs +++ b/src/http.rs @@ -150,6 +150,15 @@ async fn process_img_response<'a>( let img_mime = img_response .content_type() .map(|mime| mime.essence().to_string()); + if let Some(mime_str) = &img_mime { + if !mime_str.starts_with("image/") { + return Err(ErrorKind::HTTPError(format!( + "Invalid image MIME type: {} for {}", + mime_str, url + )) + .into()); + } + } let img_ext = match img_response .content_type() .map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string()) From d4a23088a9ab0aa689d0581d85a8e78ab5294838 Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Tue, 24 Aug 2021 07:20:23 +0300 Subject: [PATCH 5/5] test: add cli tests --- src/cli.rs | 141 +++++++++++++++++++++++++++++++++++++++++++-- src/cli_config.yml | 3 +- src/errors.rs | 16 +++++ 3 files changed, 154 insertions(+), 6 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index 9578a49..faba4a5 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -9,13 +9,14 @@ type Error = crate::errors::CliError; const DEFAULT_MAX_CONN: usize = 8; -#[derive(derive_builder::Builder)] +#[derive(derive_builder::Builder, Debug)] pub struct AppConfig { /// Article urls pub urls: Vec, pub max_conn: usize, /// Path to file of multiple articles into a single article pub merged: Option, + // TODO: Change type to Path pub output_directory: Option, pub log_level: LogLevel, pub can_disable_progress_bar: bool, @@ -95,7 +96,7 @@ impl<'a> TryFrom> for AppConfig { None => DEFAULT_MAX_CONN, }) .merged(arg_matches.value_of("output-name").map(|name| { - let file_ext = format!(".{}", arg_matches.value_of("export").unwrap()); + let file_ext = format!(".{}", arg_matches.value_of("export").unwrap_or("epub")); if name.ends_with(&file_ext) { name.to_owned() } else { @@ -132,10 +133,11 @@ impl<'a> TryFrom> for AppConfig { ) .output_directory( arg_matches - .value_of("output_directory") + .value_of("output-directory") .map(|output_directory| { let path = Path::new(output_directory); if !path.exists() { + // TODO: Create the directory Err(Error::OutputDirectoryNotExists) } else if !path.is_dir() { Err(Error::WrongOutputDirectory) @@ -157,7 +159,7 @@ impl<'a> TryFrom> for AppConfig { }, ) .export_type({ - let export_type = arg_matches.value_of("export").unwrap(); + let export_type = arg_matches.value_of("export").unwrap_or("epub"); if export_type == "html" { ExportType::HTML } else { @@ -200,3 +202,134 @@ pub enum ExportType { HTML, EPUB, } + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_clap_config_errors() { + let yaml_config = load_yaml!("cli_config.yml"); + let app = App::from_yaml(yaml_config); + + // It returns Ok when only a url is passed + let result = app + .clone() + .get_matches_from_safe(vec!["paperoni", "http://example.org"]); + assert!(result.is_ok()); + + // It returns an error when no args are passed + let result = app.clone().get_matches_from_safe(vec!["paperoni"]); + assert!(result.is_err()); + assert_eq!( + clap::ErrorKind::MissingArgumentOrSubcommand, + result.unwrap_err().kind + ); + + // It returns an error when both output-dir and merge are used + let result = app.clone().get_matches_from_safe(vec![ + "paperoni", + "http://example.org", + "--merge", + "foo", + "--output-dir", + "~", + ]); + assert!(result.is_err()); + assert_eq!(clap::ErrorKind::ArgumentConflict, result.unwrap_err().kind); + + // It returns an error when both no-css and no-header-css are used + let result = app.clone().get_matches_from_safe(vec![ + "paperoni", + "http://example.org", + "--no-css", + "--no-header-css", + ]); + assert!(result.is_err()); + assert_eq!(clap::ErrorKind::ArgumentConflict, result.unwrap_err().kind); + + // It returns an error when inline-toc is used without merge + let result = app.clone().get_matches_from_safe(vec![ + "paperoni", + "http://example.org", + "--inline-toc", + ]); + assert!(result.is_err()); + assert_eq!( + clap::ErrorKind::MissingRequiredArgument, + result.unwrap_err().kind + ); + + // It returns an error when inline-images is used without export + let result = app.clone().get_matches_from_safe(vec![ + "paperoni", + "http://example.org", + "--inline-images", + ]); + assert!(result.is_err()); + assert_eq!( + clap::ErrorKind::MissingRequiredArgument, + result.unwrap_err().kind + ); + + // It returns an error when export is given an invalid value + let result = app.clone().get_matches_from_safe(vec![ + "paperoni", + "http://example.org", + "--export", + "pdf", + ]); + assert!(result.is_err()); + assert_eq!(clap::ErrorKind::InvalidValue, result.unwrap_err().kind); + + // It returns an error when a max-conn is given a negative number. + let result = app.clone().get_matches_from_safe(vec![ + "paperoni", + "http://example.org", + "--max-conn", + "-1", + ]); + assert!(result.is_err()); + // The cli is configured not to accept negative numbers so passing "-1" would have it be read as a flag called 1 + assert_eq!(clap::ErrorKind::UnknownArgument, result.unwrap_err().kind); + } + + #[test] + fn test_init_with_cli() { + let yaml_config = load_yaml!("cli_config.yml"); + let app = App::from_yaml(yaml_config); + + // It returns an error when the urls passed are whitespace + let matches = app.clone().get_matches_from(vec!["paperoni", ""]); + let app_config = AppConfig::try_from(matches); + assert!(app_config.is_err()); + assert_eq!(Error::NoUrls, app_config.unwrap_err()); + + // It returns an error when inline-toc is used when exporting to HTML + let matches = app.clone().get_matches_from(vec![ + "paperoni", + "http://example.org", + "--merge", + "foo", + "--export", + "html", + "--inline-toc", + ]); + let app_config = AppConfig::try_from(matches); + assert!(app_config.is_err()); + assert_eq!(Error::WrongExportInliningToC, app_config.unwrap_err()); + // It returns an Ok when inline-toc is used when exporting to epub + let matches = app.clone().get_matches_from(vec![ + "paperoni", + "http://example.org", + "--merge", + "foo", + "--export", + "epub", + "--inline-toc", + ]); + assert!(AppConfig::try_from(matches).is_ok()); + + // It returns an error when inline-images is used when exporting to epub + } +} diff --git a/src/cli_config.yml b/src/cli_config.yml index 4f86d52..8de70e1 100644 --- a/src/cli_config.yml +++ b/src/cli_config.yml @@ -12,7 +12,7 @@ args: long: file help: Input file containing links takes_value: true - - output_directory: + - output-directory: short: o long: output-dir help: Directory to store output epub documents @@ -70,7 +70,6 @@ args: possible_values: [html, epub] value_name: type takes_value: true - default_value: epub - inline-images: long: inline-images help: Inlines the article images when exporting to HTML using base64. Pass --help to learn more. diff --git a/src/errors.rs b/src/errors.rs index 5b1cc25..c615477 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -138,6 +138,14 @@ pub enum LogError { CreateLogDirectoryError(#[from] std::io::Error), } +// dumb hack to allow for comparing errors in testing. +// derive macros cannot be used because underlying errors like io::Error do not derive PartialEq +impl PartialEq for LogError { + fn eq(&self, other: &Self) -> bool { + format!("{:?}", self) == format!("{:?}", other) + } +} + #[derive(Debug, Error)] pub enum CliError { #[error("Failed to open file with urls: {0}")] @@ -161,3 +169,11 @@ pub enum CliError { #[error("The --inline-images flag can only be used when exporting to html")] WrongExportInliningImages, } + +// dumb hack to allow for comparing errors in testing. +// derive macros cannot be used because underlying errors like io::Error do not derive PartialEq +impl PartialEq for CliError { + fn eq(&self, other: &Self) -> bool { + format!("{:?}", self) == format!("{:?}", other) + } +}