Merge pull request #21 from hipstermojo/dev

v0.6.1 release
This commit is contained in:
Kenneth Gitere 2021-08-24 07:24:56 +03:00 committed by GitHub
commit dc16f9f52b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 345 additions and 80 deletions

View file

@ -9,13 +9,14 @@ type Error = crate::errors::CliError<AppConfigBuilderError>;
const DEFAULT_MAX_CONN: usize = 8; const DEFAULT_MAX_CONN: usize = 8;
#[derive(derive_builder::Builder)] #[derive(derive_builder::Builder, Debug)]
pub struct AppConfig { pub struct AppConfig {
/// Article urls /// Article urls
pub urls: Vec<String>, pub urls: Vec<String>,
pub max_conn: usize, pub max_conn: usize,
/// Path to file of multiple articles into a single article /// Path to file of multiple articles into a single article
pub merged: Option<String>, pub merged: Option<String>,
// TODO: Change type to Path
pub output_directory: Option<String>, pub output_directory: Option<String>,
pub log_level: LogLevel, pub log_level: LogLevel,
pub can_disable_progress_bar: bool, pub can_disable_progress_bar: bool,
@ -95,7 +96,7 @@ impl<'a> TryFrom<ArgMatches<'a>> for AppConfig {
None => DEFAULT_MAX_CONN, None => DEFAULT_MAX_CONN,
}) })
.merged(arg_matches.value_of("output-name").map(|name| { .merged(arg_matches.value_of("output-name").map(|name| {
let file_ext = format!(".{}", arg_matches.value_of("export").unwrap()); let file_ext = format!(".{}", arg_matches.value_of("export").unwrap_or("epub"));
if name.ends_with(&file_ext) { if name.ends_with(&file_ext) {
name.to_owned() name.to_owned()
} else { } else {
@ -132,10 +133,11 @@ impl<'a> TryFrom<ArgMatches<'a>> for AppConfig {
) )
.output_directory( .output_directory(
arg_matches arg_matches
.value_of("output_directory") .value_of("output-directory")
.map(|output_directory| { .map(|output_directory| {
let path = Path::new(output_directory); let path = Path::new(output_directory);
if !path.exists() { if !path.exists() {
// TODO: Create the directory
Err(Error::OutputDirectoryNotExists) Err(Error::OutputDirectoryNotExists)
} else if !path.is_dir() { } else if !path.is_dir() {
Err(Error::WrongOutputDirectory) Err(Error::WrongOutputDirectory)
@ -157,14 +159,24 @@ impl<'a> TryFrom<ArgMatches<'a>> for AppConfig {
}, },
) )
.export_type({ .export_type({
let export_type = arg_matches.value_of("export").unwrap(); let export_type = arg_matches.value_of("export").unwrap_or("epub");
if export_type == "html" { if export_type == "html" {
ExportType::HTML ExportType::HTML
} else { } else {
ExportType::EPUB ExportType::EPUB
} }
}) })
.is_inlining_images(arg_matches.is_present("inline-images")) .is_inlining_images(
(if arg_matches.is_present("inline-images") {
if arg_matches.value_of("export") == Some("html") {
Ok(true)
} else {
Err(Error::WrongExportInliningImages)
}
} else {
Ok(false)
})?,
)
.try_init() .try_init()
} }
} }
@ -190,3 +202,134 @@ pub enum ExportType {
HTML, HTML,
EPUB, EPUB,
} }
#[cfg(test)]
mod test {
use super::*;
/// Checks the argument-level validation performed by clap itself (driven by `cli_config.yml`),
/// i.e. everything that is rejected before an `AppConfig` is ever built.
#[test]
fn test_clap_config_errors() {
    let cli_yaml = load_yaml!("cli_config.yml");
    let app = App::from_yaml(cli_yaml);
    // Every case parses against its own clone of the app, so cases cannot affect each other.
    let parse = |argv: &[&str]| app.clone().get_matches_from_safe(argv.to_vec());

    // A lone url is a valid invocation
    let outcome = parse(&["paperoni", "http://example.org"]);
    assert!(outcome.is_ok());

    // Passing no arguments at all is rejected
    let outcome = parse(&["paperoni"]);
    assert!(outcome.is_err());
    assert_eq!(
        clap::ErrorKind::MissingArgumentOrSubcommand,
        outcome.unwrap_err().kind
    );

    // output-dir and merge cannot be combined
    let outcome = parse(&[
        "paperoni",
        "http://example.org",
        "--merge",
        "foo",
        "--output-dir",
        "~",
    ]);
    assert!(outcome.is_err());
    assert_eq!(clap::ErrorKind::ArgumentConflict, outcome.unwrap_err().kind);

    // no-css and no-header-css cannot be combined
    let outcome = parse(&[
        "paperoni",
        "http://example.org",
        "--no-css",
        "--no-header-css",
    ]);
    assert!(outcome.is_err());
    assert_eq!(clap::ErrorKind::ArgumentConflict, outcome.unwrap_err().kind);

    // inline-toc is rejected when merge is absent
    let outcome = parse(&["paperoni", "http://example.org", "--inline-toc"]);
    assert!(outcome.is_err());
    assert_eq!(
        clap::ErrorKind::MissingRequiredArgument,
        outcome.unwrap_err().kind
    );

    // inline-images is rejected when export is absent
    let outcome = parse(&["paperoni", "http://example.org", "--inline-images"]);
    assert!(outcome.is_err());
    assert_eq!(
        clap::ErrorKind::MissingRequiredArgument,
        outcome.unwrap_err().kind
    );

    // export only accepts its listed values
    let outcome = parse(&["paperoni", "http://example.org", "--export", "pdf"]);
    assert!(outcome.is_err());
    assert_eq!(clap::ErrorKind::InvalidValue, outcome.unwrap_err().kind);

    // A negative max-conn is rejected.
    let outcome = parse(&["paperoni", "http://example.org", "--max-conn", "-1"]);
    assert!(outcome.is_err());
    // The cli does not accept negative numbers, so "-1" is read as an unknown flag named `1`
    assert_eq!(clap::ErrorKind::UnknownArgument, outcome.unwrap_err().kind);
}
/// Checks the validation done while converting parsed CLI matches into an `AppConfig`,
/// i.e. the rules that clap's own configuration cannot express.
#[test]
fn test_init_with_cli() {
    let yaml_config = load_yaml!("cli_config.yml");
    let app = App::from_yaml(yaml_config);

    // It returns an error when the urls passed are whitespace
    let matches = app.clone().get_matches_from(vec!["paperoni", ""]);
    let app_config = AppConfig::try_from(matches);
    assert!(app_config.is_err());
    assert_eq!(Error::NoUrls, app_config.unwrap_err());

    // It returns an error when inline-toc is used when exporting to HTML
    let matches = app.clone().get_matches_from(vec![
        "paperoni",
        "http://example.org",
        "--merge",
        "foo",
        "--export",
        "html",
        "--inline-toc",
    ]);
    let app_config = AppConfig::try_from(matches);
    assert!(app_config.is_err());
    assert_eq!(Error::WrongExportInliningToC, app_config.unwrap_err());

    // It returns an Ok when inline-toc is used when exporting to epub
    let matches = app.clone().get_matches_from(vec![
        "paperoni",
        "http://example.org",
        "--merge",
        "foo",
        "--export",
        "epub",
        "--inline-toc",
    ]);
    assert!(AppConfig::try_from(matches).is_ok());

    // It returns an error when inline-images is used when exporting to epub.
    // `--export` is passed explicitly so that clap accepts the arguments and the
    // rejection comes from the `AppConfig` conversion itself.
    let matches = app.clone().get_matches_from(vec![
        "paperoni",
        "http://example.org",
        "--export",
        "epub",
        "--inline-images",
    ]);
    let app_config = AppConfig::try_from(matches);
    assert!(app_config.is_err());
    assert_eq!(Error::WrongExportInliningImages, app_config.unwrap_err());
}
}

View file

@ -12,7 +12,7 @@ args:
long: file long: file
help: Input file containing links help: Input file containing links
takes_value: true takes_value: true
- output_directory: - output-directory:
short: o short: o
long: output-dir long: output-dir
help: Directory to store output epub documents help: Directory to store output epub documents
@ -70,7 +70,6 @@ args:
possible_values: [html, epub] possible_values: [html, epub]
value_name: type value_name: type
takes_value: true takes_value: true
default_value: epub
- inline-images: - inline-images:
long: inline-images long: inline-images
help: Inlines the article images when exporting to HTML using base64. Pass --help to learn more. help: Inlines the article images when exporting to HTML using base64. Pass --help to learn more.

View file

@ -138,6 +138,14 @@ pub enum LogError {
CreateLogDirectoryError(#[from] std::io::Error), CreateLogDirectoryError(#[from] std::io::Error),
} }
// Test-support workaround so that errors can be compared with `assert_eq!`.
// `PartialEq` cannot be derived because wrapped errors such as `io::Error` do not implement it,
// so two values are considered equal when their `Debug` output is identical.
impl PartialEq for LogError {
    fn eq(&self, other: &Self) -> bool {
        let lhs = format!("{:?}", self);
        let rhs = format!("{:?}", other);
        lhs == rhs
    }
}
#[derive(Debug, Error)] #[derive(Debug, Error)]
pub enum CliError<BuilderError: Debug + Display> { pub enum CliError<BuilderError: Debug + Display> {
#[error("Failed to open file with urls: {0}")] #[error("Failed to open file with urls: {0}")]
@ -156,6 +164,16 @@ pub enum CliError<BuilderError: Debug + Display> {
OutputDirectoryNotExists, OutputDirectoryNotExists,
#[error("Unable to start logger!\n{0}")] #[error("Unable to start logger!\n{0}")]
LogError(#[from] LogError), LogError(#[from] LogError),
#[error("The --inline-toc can only be used exporting to epub")] #[error("The --inline-toc flag can only be used when exporting to epub")]
WrongExportInliningToC, WrongExportInliningToC,
#[error("The --inline-images flag can only be used when exporting to html")]
WrongExportInliningImages,
}
// dumb hack to allow for comparing errors in testing.
// derive macros cannot be used because underlying errors like io::Error do not derive PartialEq
impl<T: Debug + Display> PartialEq for CliError<T> {
fn eq(&self, other: &Self) -> bool {
format!("{:?}", self) == format!("{:?}", other)
}
} }

View file

@ -4,6 +4,7 @@ use kuchiki::{traits::*, NodeRef};
use crate::errors::PaperoniError; use crate::errors::PaperoniError;
use crate::moz_readability::{MetaData, Readability}; use crate::moz_readability::{MetaData, Readability};
/// A tuple of the url and an Option of the resource's MIME type
pub type ResourceInfo = (String, Option<String>); pub type ResourceInfo = (String, Option<String>);
pub struct Article { pub struct Article {

View file

@ -12,7 +12,7 @@ use kuchiki::{traits::*, NodeRef};
use log::{debug, error, info}; use log::{debug, error, info};
use crate::{ use crate::{
cli::{self, AppConfig}, cli::{self, AppConfig, CSSConfig},
errors::PaperoniError, errors::PaperoniError,
extractor::Article, extractor::Article,
moz_readability::MetaData, moz_readability::MetaData,
@ -91,38 +91,33 @@ pub fn generate_html_exports(
*id_attr = format!("readability-page-{}", idx); *id_attr = format!("readability-page-{}", idx);
} }
for (img_url, mime_type_opt) in &article.img_urls { if app_config.is_inlining_images {
if app_config.is_inlining_images { info!("Inlining images for {}", title);
info!("Inlining images for {}", title); let result = update_imgs_base64(article);
let result = update_imgs_base64(
article,
img_url,
mime_type_opt.as_deref().unwrap_or("image/*"),
);
if let Err(e) = result { if let Err(e) = result {
let mut err: PaperoniError = e.into(); let mut err: PaperoniError = e.into();
err.set_article_source(title); err.set_article_source(title);
error!("Unable to copy images to imgs dir for {}", title); error!("Unable to copy images to imgs dir for {}", title);
errors.push(err); errors.push(err);
} }
info!("Completed inlining images for {}", title); info!("Completed inlining images for {}", title);
} else {
info!("Copying images to imgs dir for {}", title);
let result = update_img_urls(article, &imgs_dir_path).map_err(|e| {
let mut err: PaperoniError = e.into();
err.set_article_source(title);
err
});
if let Err(e) = result {
error!("Unable to copy images to imgs dir for {}", title);
errors.push(e);
} else { } else {
info!("Copying images to imgs dir for {}", title); info!("Successfully copied images to imgs dir for {}", title);
let result = update_img_urls(article, &imgs_dir_path).map_err(|e| {
let mut err: PaperoniError = e.into();
err.set_article_source(title);
err
});
if let Err(e) = result {
error!("Unable to copy images to imgs dir for {}", title);
errors.push(e);
} else {
info!("Successfully copied images to imgs dir for {}", title);
}
} }
} }
bar.inc(1); bar.inc(1);
successful_articles_table.add_row(vec![title]); successful_articles_table.add_row(vec![title]);
body_elem.as_node().append(article_elem.as_node().clone()); body_elem.as_node().append(article_elem.as_node().clone());
@ -137,7 +132,8 @@ pub fn generate_html_exports(
.map(|article| (article.metadata(), article.url.as_str())) .map(|article| (article.metadata(), article.url.as_str()))
.collect(), .collect(),
); );
inline_css(&base_html_elem, app_config); inline_css(&base_html_elem, &app_config.css_config);
remove_existing_stylesheet_link(&base_html_elem);
info!("Added title, footer and inlined styles for {}", name); info!("Added title, footer and inlined styles for {}", name);
@ -199,13 +195,7 @@ pub fn generate_html_exports(
let mut out_file = File::create(&file_name)?; let mut out_file = File::create(&file_name)?;
if app_config.is_inlining_images { if app_config.is_inlining_images {
for (img_url, mime_type_opt) in &article.img_urls { update_imgs_base64(article)?;
update_imgs_base64(
article,
img_url,
mime_type_opt.as_deref().unwrap_or("image/*"),
)?
}
} else { } else {
let base_path = let base_path =
Path::new(app_config.output_directory.as_deref().unwrap_or(".")); Path::new(app_config.output_directory.as_deref().unwrap_or("."));
@ -233,7 +223,8 @@ pub fn generate_html_exports(
insert_title_elem(article.node_ref(), article.metadata().title()); insert_title_elem(article.node_ref(), article.metadata().title());
insert_appendix(article.node_ref(), vec![(article.metadata(), &article.url)]); insert_appendix(article.node_ref(), vec![(article.metadata(), &article.url)]);
inline_css(article.node_ref(), app_config); inline_css(article.node_ref(), &app_config.css_config);
remove_existing_stylesheet_link(article.node_ref());
article.node_ref().serialize(&mut out_file)?; article.node_ref().serialize(&mut out_file)?;
Ok(()) Ok(())
@ -268,24 +259,26 @@ fn create_qualname(name: &str) -> QualName {
} }
/// Updates the src attribute of `<img>` elements with a base64 encoded string of the image data /// Updates the src attribute of `<img>` elements with a base64 encoded string of the image data
fn update_imgs_base64( fn update_imgs_base64(article: &Article) -> Result<(), std::io::Error> {
article: &Article,
img_url: &str,
mime_type: &str,
) -> Result<(), std::io::Error> {
let temp_dir = std::env::temp_dir(); let temp_dir = std::env::temp_dir();
let img_path = temp_dir.join(img_url); for (img_url, mime_type) in &article.img_urls {
let img_bytes = std::fs::read(img_path)?; let img_path = temp_dir.join(img_url);
let img_base64_str = format!("data:image:{};base64,{}", mime_type, encode(img_bytes)); let img_bytes = std::fs::read(img_path)?;
let img_base64_str = format!(
"data:image:{};base64,{}",
mime_type.as_deref().unwrap_or("image/*"),
encode(img_bytes)
);
let img_elems = article let img_elems = article
.node_ref() .node_ref()
.select(&format!("img[src=\"{}\"]", img_url)) .select(&format!("img[src=\"{}\"]", img_url))
.unwrap(); .unwrap();
for img_elem in img_elems { for img_elem in img_elems {
let mut img_attr = img_elem.attributes.borrow_mut(); let mut img_attr = img_elem.attributes.borrow_mut();
if let Some(src_attr) = img_attr.get_mut("src") { if let Some(src_attr) = img_attr.get_mut("src") {
*src_attr = img_base64_str.clone(); *src_attr = img_base64_str.clone();
}
} }
} }
Ok(()) Ok(())
@ -344,18 +337,23 @@ fn insert_appendix(root_node: &NodeRef, article_links: Vec<(&MetaData, &str)>) {
format!("<a href=\"{}\">{}</a><br></br>", url, article_name) format!("<a href=\"{}\">{}</a><br></br>", url, article_name)
}) })
.collect(); .collect();
let footer_inner_html = format!("<h2>Appendix</h2><h2>Article sources</h3>{}", link_tags); let footer_inner_html = format!(
let footer_elem = "<footer><h2>Appendix</h2><h3>Article sources</h3>{}</footer>",
kuchiki::parse_fragment(create_qualname("footer"), Vec::new()).one(footer_inner_html); link_tags
root_node.append(footer_elem); );
let footer_container =
kuchiki::parse_fragment(create_qualname("div"), Vec::new()).one(footer_inner_html);
let footer_elem = footer_container.select_first("footer").unwrap();
root_node.append(footer_elem.as_node().clone());
} }
/// Inlines the CSS stylesheets into the HTML article node /// Inlines the CSS stylesheets into the HTML article node
fn inline_css(root_node: &NodeRef, app_config: &AppConfig) { fn inline_css(root_node: &NodeRef, css_config: &CSSConfig) {
let body_stylesheet = include_str!("./assets/body.min.css"); let body_stylesheet = include_str!("./assets/body.min.css");
let header_stylesheet = include_str!("./assets/headers.min.css"); let header_stylesheet = include_str!("./assets/headers.min.css");
let mut css_str = String::new(); let mut css_str = String::new();
match app_config.css_config { match css_config {
cli::CSSConfig::NoHeaders => { cli::CSSConfig::NoHeaders => {
css_str.push_str(body_stylesheet); css_str.push_str(body_stylesheet);
} }
@ -371,21 +369,118 @@ fn inline_css(root_node: &NodeRef, app_config: &AppConfig) {
let style_container = let style_container =
kuchiki::parse_fragment(create_qualname("div"), Vec::new()).one(css_html_str); kuchiki::parse_fragment(create_qualname("div"), Vec::new()).one(css_html_str);
let style_elem = style_container.select_first("style").unwrap(); let style_elem = style_container.select_first("style").unwrap();
match root_node.select_first("head") { let head_elem = root_node.select_first("head").expect(HEAD_ELEM_NOT_FOUND);
Ok(head_elem) => { head_elem.as_node().prepend(style_elem.as_node().to_owned());
head_elem.as_node().prepend(style_elem.as_node().to_owned()); }
}
Err(_) => {
debug!("{}", HEAD_ELEM_NOT_FOUND);
let html_elem = root_node.select_first("html").unwrap();
let head_elem = NodeRef::new_element(create_qualname("head"), BTreeMap::new());
head_elem.prepend(style_elem.as_node().to_owned());
html_elem.as_node().prepend(head_elem);
}
}
// Remove the <link> of the stylesheet since styles are now inlined /// Removes the <link> of the stylesheet. This is used when inlining styles
fn remove_existing_stylesheet_link(root_node: &NodeRef) {
if let Ok(style_link_elem) = root_node.select_first("link[href=\"stylesheet.css\"]") { if let Ok(style_link_elem) = root_node.select_first("link[href=\"stylesheet.css\"]") {
style_link_elem.as_node().detach(); style_link_elem.as_node().detach();
}; };
} }
#[cfg(test)]
mod test {
use super::*;
/// A document without a `<title>` gains exactly one, holding the given text.
#[test]
fn test_insert_title_elem() {
    let expected_title = "Sample title";
    let markup = r#"<html><head><meta charset="UTF-8"/></head><body></body></html>"#;
    let document = kuchiki::parse_html().one(markup);
    let title_count = || document.select("title").unwrap().count();

    assert_eq!(0, title_count());
    insert_title_elem(&document, expected_title);
    assert_eq!(1, title_count());

    let title_elem = document.select_first("title").unwrap();
    assert_eq!(expected_title, title_elem.text_contents());
}
/// `create_qualname` yields a prefix-less name in the XHTML namespace.
#[test]
fn test_create_qualname() {
    let tag = "div";
    let expected = QualName::new(
        None,
        Namespace::from("http://www.w3.org/1999/xhtml"),
        LocalName::from(tag),
    );
    assert_eq!(create_qualname(tag), expected);
}
/// Exercises `inline_css` for each `CSSConfig` variant used by the exporter:
/// `None` adds no `<style>`, `NoHeaders` adds the body stylesheet only and
/// `All` adds the body stylesheet followed by the header stylesheet.
#[test]
fn test_inline_css() {
    let markup = r#"<html>
<head><meta charset="UTF-8"/></head>
<body>
<p>Lorem ipsum sample text goes here.</p>
</body>
</html>"#;
    let body_css = include_str!("./assets/body.min.css");
    let header_css = include_str!("./assets/headers.min.css");

    // `None` followed by `NoHeaders` on the same document
    let first_doc = kuchiki::parse_html().one(markup);
    assert_eq!(0, first_doc.select("style").unwrap().count());
    inline_css(&first_doc, &CSSConfig::None);
    assert_eq!(0, first_doc.select("style").unwrap().count());
    inline_css(&first_doc, &CSSConfig::NoHeaders);
    assert_eq!(1, first_doc.select("style").unwrap().count());
    let body_only_style = first_doc.select_first("style").unwrap();
    assert_eq!(body_css, body_only_style.text_contents());

    // `All` on a freshly parsed document
    let second_doc = kuchiki::parse_html().one(markup);
    inline_css(&second_doc, &CSSConfig::All);
    assert_eq!(1, second_doc.select("style").unwrap().count());
    let combined_style = second_doc.select_first("style").unwrap();
    let expected_css = format!("{}{}", body_css, header_css);
    assert_eq!(expected_css, combined_style.text_contents());
}
/// The `<link href="stylesheet.css">` element is detached from the document.
#[test]
fn test_remove_existing_stylesheet_link() {
    let markup = r#"<html>
<head><link href="stylesheet.css"></link></head>
<body><p>Lorem ipsum sample text goes here.</p></body></html>"#;
    let document = kuchiki::parse_html().one(markup);
    let link_count = || document.select("link").unwrap().count();

    assert_eq!(1, link_count());
    remove_existing_stylesheet_link(&document);
    assert_eq!(0, link_count());
}
/// `insert_appendix` adds a single `<footer>` containing the "Appendix" heading,
/// the "Article sources" sub-heading and one link per article.
#[test]
fn test_insert_appendix() {
    let markup = r#"<html>
<head><meta charset="UTF-8"/></head>
<body>
<p>Lorem ipsum sample text goes here.</p>
</body>
</html>"#;
    let document = kuchiki::parse_html().one(markup);
    let metadata = MetaData::new();
    let count_of = |selector: &str| document.select(selector).unwrap().count();
    let text_of = |selector: &str| document.select_first(selector).unwrap().text_contents();

    assert_eq!(0, count_of("footer"));
    insert_appendix(&document, vec![(&metadata, "http://example.org")]);
    assert_eq!(1, count_of("footer"));

    // Headings
    assert_eq!(1, count_of("footer > h2"));
    assert_eq!("Appendix", text_of("footer > h2"));
    assert_eq!(1, count_of("footer > h3"));
    assert_eq!("Article sources", text_of("footer > h3"));

    // The single article link
    assert_eq!(1, count_of("a"));
    let link_elem = document.select_first("a").unwrap();
    assert_eq!("http://example.org", link_elem.text_contents());
    let link_attrs = link_elem.attributes.borrow();
    assert_eq!(Some("http://example.org"), link_attrs.get("href"));
}
}

View file

@ -150,6 +150,15 @@ async fn process_img_response<'a>(
let img_mime = img_response let img_mime = img_response
.content_type() .content_type()
.map(|mime| mime.essence().to_string()); .map(|mime| mime.essence().to_string());
if let Some(mime_str) = &img_mime {
if !mime_str.starts_with("image/") {
return Err(ErrorKind::HTTPError(format!(
"Invalid image MIME type: {} for {}",
mime_str, url
))
.into());
}
}
let img_ext = match img_response let img_ext = match img_response
.content_type() .content_type()
.map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string()) .map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())