Merge pull request #21 from hipstermojo/dev

v0.6.1 release
This commit is contained in:
Kenneth Gitere 2021-08-24 07:24:56 +03:00 committed by GitHub
commit dc16f9f52b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 345 additions and 80 deletions

View file

@ -9,13 +9,14 @@ type Error = crate::errors::CliError<AppConfigBuilderError>;
const DEFAULT_MAX_CONN: usize = 8;
#[derive(derive_builder::Builder)]
#[derive(derive_builder::Builder, Debug)]
pub struct AppConfig {
/// Article urls
pub urls: Vec<String>,
pub max_conn: usize,
/// Path to the file that multiple articles are merged into as a single article
pub merged: Option<String>,
// TODO: Change type to Path
pub output_directory: Option<String>,
pub log_level: LogLevel,
pub can_disable_progress_bar: bool,
@ -95,7 +96,7 @@ impl<'a> TryFrom<ArgMatches<'a>> for AppConfig {
None => DEFAULT_MAX_CONN,
})
.merged(arg_matches.value_of("output-name").map(|name| {
let file_ext = format!(".{}", arg_matches.value_of("export").unwrap());
let file_ext = format!(".{}", arg_matches.value_of("export").unwrap_or("epub"));
if name.ends_with(&file_ext) {
name.to_owned()
} else {
@ -132,10 +133,11 @@ impl<'a> TryFrom<ArgMatches<'a>> for AppConfig {
)
.output_directory(
arg_matches
.value_of("output_directory")
.value_of("output-directory")
.map(|output_directory| {
let path = Path::new(output_directory);
if !path.exists() {
// TODO: Create the directory
Err(Error::OutputDirectoryNotExists)
} else if !path.is_dir() {
Err(Error::WrongOutputDirectory)
@ -157,14 +159,24 @@ impl<'a> TryFrom<ArgMatches<'a>> for AppConfig {
},
)
.export_type({
let export_type = arg_matches.value_of("export").unwrap();
let export_type = arg_matches.value_of("export").unwrap_or("epub");
if export_type == "html" {
ExportType::HTML
} else {
ExportType::EPUB
}
})
.is_inlining_images(arg_matches.is_present("inline-images"))
.is_inlining_images(
(if arg_matches.is_present("inline-images") {
if arg_matches.value_of("export") == Some("html") {
Ok(true)
} else {
Err(Error::WrongExportInliningImages)
}
} else {
Ok(false)
})?,
)
.try_init()
}
}
@ -190,3 +202,134 @@ pub enum ExportType {
HTML,
EPUB,
}
#[cfg(test)]
mod test {
    use super::*;

    /// Checks the validation that clap performs on the raw command line, i.e. the
    /// errors reported by `get_matches_from_safe` before any `AppConfig` is built.
    #[test]
    fn test_clap_config_errors() {
        let yaml_config = load_yaml!("cli_config.yml");
        let app = App::from_yaml(yaml_config);

        // It returns Ok when only a url is passed
        let result = app
            .clone()
            .get_matches_from_safe(vec!["paperoni", "http://example.org"]);
        assert!(result.is_ok());

        // It returns an error when no args are passed
        let result = app.clone().get_matches_from_safe(vec!["paperoni"]);
        assert!(result.is_err());
        assert_eq!(
            clap::ErrorKind::MissingArgumentOrSubcommand,
            result.unwrap_err().kind
        );

        // It returns an error when both output-dir and merge are used
        let result = app.clone().get_matches_from_safe(vec![
            "paperoni",
            "http://example.org",
            "--merge",
            "foo",
            "--output-dir",
            "~",
        ]);
        assert!(result.is_err());
        assert_eq!(clap::ErrorKind::ArgumentConflict, result.unwrap_err().kind);

        // It returns an error when both no-css and no-header-css are used
        let result = app.clone().get_matches_from_safe(vec![
            "paperoni",
            "http://example.org",
            "--no-css",
            "--no-header-css",
        ]);
        assert!(result.is_err());
        assert_eq!(clap::ErrorKind::ArgumentConflict, result.unwrap_err().kind);

        // It returns an error when inline-toc is used without merge
        let result = app.clone().get_matches_from_safe(vec![
            "paperoni",
            "http://example.org",
            "--inline-toc",
        ]);
        assert!(result.is_err());
        assert_eq!(
            clap::ErrorKind::MissingRequiredArgument,
            result.unwrap_err().kind
        );

        // It returns an error when inline-images is used without export
        let result = app.clone().get_matches_from_safe(vec![
            "paperoni",
            "http://example.org",
            "--inline-images",
        ]);
        assert!(result.is_err());
        assert_eq!(
            clap::ErrorKind::MissingRequiredArgument,
            result.unwrap_err().kind
        );

        // It returns an error when export is given an invalid value
        let result = app.clone().get_matches_from_safe(vec![
            "paperoni",
            "http://example.org",
            "--export",
            "pdf",
        ]);
        assert!(result.is_err());
        assert_eq!(clap::ErrorKind::InvalidValue, result.unwrap_err().kind);

        // It returns an error when a max-conn is given a negative number.
        let result = app.clone().get_matches_from_safe(vec![
            "paperoni",
            "http://example.org",
            "--max-conn",
            "-1",
        ]);
        assert!(result.is_err());
        // The cli is configured not to accept negative numbers so passing "-1" would have it be read as a flag called 1
        assert_eq!(clap::ErrorKind::UnknownArgument, result.unwrap_err().kind);
    }

    /// Checks the validation performed by `AppConfig::try_from` on arguments that
    /// clap itself accepted.
    #[test]
    fn test_init_with_cli() {
        let yaml_config = load_yaml!("cli_config.yml");
        let app = App::from_yaml(yaml_config);

        // It returns an error when the urls passed are whitespace
        let matches = app.clone().get_matches_from(vec!["paperoni", ""]);
        let app_config = AppConfig::try_from(matches);
        assert!(app_config.is_err());
        assert_eq!(Error::NoUrls, app_config.unwrap_err());

        // It returns an error when inline-toc is used when exporting to HTML
        let matches = app.clone().get_matches_from(vec![
            "paperoni",
            "http://example.org",
            "--merge",
            "foo",
            "--export",
            "html",
            "--inline-toc",
        ]);
        let app_config = AppConfig::try_from(matches);
        assert!(app_config.is_err());
        assert_eq!(Error::WrongExportInliningToC, app_config.unwrap_err());

        // It returns an Ok when inline-toc is used when exporting to epub
        let matches = app.clone().get_matches_from(vec![
            "paperoni",
            "http://example.org",
            "--merge",
            "foo",
            "--export",
            "epub",
            "--inline-toc",
        ]);
        assert!(AppConfig::try_from(matches).is_ok());

        // It returns an error when inline-images is used when exporting to epub
        let matches = app.clone().get_matches_from(vec![
            "paperoni",
            "http://example.org",
            "--export",
            "epub",
            "--inline-images",
        ]);
        let app_config = AppConfig::try_from(matches);
        assert!(app_config.is_err());
        assert_eq!(Error::WrongExportInliningImages, app_config.unwrap_err());
    }
}

View file

@ -12,7 +12,7 @@ args:
long: file
help: Input file containing links
takes_value: true
- output_directory:
- output-directory:
short: o
long: output-dir
help: Directory to store output epub documents
@ -70,7 +70,6 @@ args:
possible_values: [html, epub]
value_name: type
takes_value: true
default_value: epub
- inline-images:
long: inline-images
help: Inlines the article images when exporting to HTML using base64. Pass --help to learn more.

View file

@ -138,6 +138,14 @@ pub enum LogError {
CreateLogDirectoryError(#[from] std::io::Error),
}
// Test-oriented equality: two errors compare equal when their `Debug` output is identical.
// `PartialEq` cannot be derived because wrapped errors such as io::Error do not implement it.
impl PartialEq for LogError {
    fn eq(&self, other: &Self) -> bool {
        let lhs = format!("{:?}", self);
        let rhs = format!("{:?}", other);
        lhs == rhs
    }
}
#[derive(Debug, Error)]
pub enum CliError<BuilderError: Debug + Display> {
#[error("Failed to open file with urls: {0}")]
@ -156,6 +164,16 @@ pub enum CliError<BuilderError: Debug + Display> {
OutputDirectoryNotExists,
#[error("Unable to start logger!\n{0}")]
LogError(#[from] LogError),
#[error("The --inline-toc can only be used exporting to epub")]
#[error("The --inline-toc flag can only be used when exporting to epub")]
WrongExportInliningToC,
#[error("The --inline-images flag can only be used when exporting to html")]
WrongExportInliningImages,
}
// Test-oriented equality: two errors compare equal when their `Debug` output is identical.
// `PartialEq` cannot be derived because wrapped errors such as io::Error do not implement it.
impl<T: Debug + Display> PartialEq for CliError<T> {
    fn eq(&self, other: &Self) -> bool {
        let lhs = format!("{:?}", self);
        let rhs = format!("{:?}", other);
        lhs == rhs
    }
}

View file

@ -4,6 +4,7 @@ use kuchiki::{traits::*, NodeRef};
use crate::errors::PaperoniError;
use crate::moz_readability::{MetaData, Readability};
/// Describes a resource as a `(url, mime_type)` pair: the first element is the
/// resource's url and the second is its MIME type, or `None` when no MIME type is recorded.
pub type ResourceInfo = (String, Option<String>);
pub struct Article {

View file

@ -12,7 +12,7 @@ use kuchiki::{traits::*, NodeRef};
use log::{debug, error, info};
use crate::{
cli::{self, AppConfig},
cli::{self, AppConfig, CSSConfig},
errors::PaperoniError,
extractor::Article,
moz_readability::MetaData,
@ -91,14 +91,9 @@ pub fn generate_html_exports(
*id_attr = format!("readability-page-{}", idx);
}
for (img_url, mime_type_opt) in &article.img_urls {
if app_config.is_inlining_images {
info!("Inlining images for {}", title);
let result = update_imgs_base64(
article,
img_url,
mime_type_opt.as_deref().unwrap_or("image/*"),
);
let result = update_imgs_base64(article);
if let Err(e) = result {
let mut err: PaperoniError = e.into();
@ -122,7 +117,7 @@ pub fn generate_html_exports(
info!("Successfully copied images to imgs dir for {}", title);
}
}
}
bar.inc(1);
successful_articles_table.add_row(vec![title]);
body_elem.as_node().append(article_elem.as_node().clone());
@ -137,7 +132,8 @@ pub fn generate_html_exports(
.map(|article| (article.metadata(), article.url.as_str()))
.collect(),
);
inline_css(&base_html_elem, app_config);
inline_css(&base_html_elem, &app_config.css_config);
remove_existing_stylesheet_link(&base_html_elem);
info!("Added title, footer and inlined styles for {}", name);
@ -199,13 +195,7 @@ pub fn generate_html_exports(
let mut out_file = File::create(&file_name)?;
if app_config.is_inlining_images {
for (img_url, mime_type_opt) in &article.img_urls {
update_imgs_base64(
article,
img_url,
mime_type_opt.as_deref().unwrap_or("image/*"),
)?
}
update_imgs_base64(article)?;
} else {
let base_path =
Path::new(app_config.output_directory.as_deref().unwrap_or("."));
@ -233,7 +223,8 @@ pub fn generate_html_exports(
insert_title_elem(article.node_ref(), article.metadata().title());
insert_appendix(article.node_ref(), vec![(article.metadata(), &article.url)]);
inline_css(article.node_ref(), app_config);
inline_css(article.node_ref(), &app_config.css_config);
remove_existing_stylesheet_link(article.node_ref());
article.node_ref().serialize(&mut out_file)?;
Ok(())
@ -268,15 +259,16 @@ fn create_qualname(name: &str) -> QualName {
}
/// Updates the src attribute of `<img>` elements with a base64 encoded string of the image data
fn update_imgs_base64(
article: &Article,
img_url: &str,
mime_type: &str,
) -> Result<(), std::io::Error> {
fn update_imgs_base64(article: &Article) -> Result<(), std::io::Error> {
let temp_dir = std::env::temp_dir();
for (img_url, mime_type) in &article.img_urls {
let img_path = temp_dir.join(img_url);
let img_bytes = std::fs::read(img_path)?;
let img_base64_str = format!("data:image:{};base64,{}", mime_type, encode(img_bytes));
let img_base64_str = format!(
"data:image:{};base64,{}",
mime_type.as_deref().unwrap_or("image/*"),
encode(img_bytes)
);
let img_elems = article
.node_ref()
@ -288,6 +280,7 @@ fn update_imgs_base64(
*src_attr = img_base64_str.clone();
}
}
}
Ok(())
}
@ -344,18 +337,23 @@ fn insert_appendix(root_node: &NodeRef, article_links: Vec<(&MetaData, &str)>) {
format!("<a href=\"{}\">{}</a><br></br>", url, article_name)
})
.collect();
let footer_inner_html = format!("<h2>Appendix</h2><h2>Article sources</h3>{}", link_tags);
let footer_elem =
kuchiki::parse_fragment(create_qualname("footer"), Vec::new()).one(footer_inner_html);
root_node.append(footer_elem);
let footer_inner_html = format!(
"<footer><h2>Appendix</h2><h3>Article sources</h3>{}</footer>",
link_tags
);
let footer_container =
kuchiki::parse_fragment(create_qualname("div"), Vec::new()).one(footer_inner_html);
let footer_elem = footer_container.select_first("footer").unwrap();
root_node.append(footer_elem.as_node().clone());
}
/// Inlines the CSS stylesheets into the HTML article node
fn inline_css(root_node: &NodeRef, app_config: &AppConfig) {
fn inline_css(root_node: &NodeRef, css_config: &CSSConfig) {
let body_stylesheet = include_str!("./assets/body.min.css");
let header_stylesheet = include_str!("./assets/headers.min.css");
let mut css_str = String::new();
match app_config.css_config {
match css_config {
cli::CSSConfig::NoHeaders => {
css_str.push_str(body_stylesheet);
}
@ -371,21 +369,118 @@ fn inline_css(root_node: &NodeRef, app_config: &AppConfig) {
let style_container =
kuchiki::parse_fragment(create_qualname("div"), Vec::new()).one(css_html_str);
let style_elem = style_container.select_first("style").unwrap();
match root_node.select_first("head") {
Ok(head_elem) => {
let head_elem = root_node.select_first("head").expect(HEAD_ELEM_NOT_FOUND);
head_elem.as_node().prepend(style_elem.as_node().to_owned());
}
Err(_) => {
debug!("{}", HEAD_ELEM_NOT_FOUND);
let html_elem = root_node.select_first("html").unwrap();
let head_elem = NodeRef::new_element(create_qualname("head"), BTreeMap::new());
head_elem.prepend(style_elem.as_node().to_owned());
html_elem.as_node().prepend(head_elem);
}
}
}
// Remove the <link> of the stylesheet since styles are now inlined
/// Detaches the first `<link>` element whose `href` is `stylesheet.css` from the tree
/// rooted at `root_node`. Nothing happens when no such element is found.
/// This is used when the styles are inlined instead.
fn remove_existing_stylesheet_link(root_node: &NodeRef) {
    let stylesheet_selector = "link[href=\"stylesheet.css\"]";
    match root_node.select_first(stylesheet_selector) {
        Ok(link_elem) => link_elem.as_node().detach(),
        Err(_) => {}
    }
}
#[cfg(test)]
mod test {
    use super::*;

    /// Minimal document shared by the tests that only need a `<head>` and a `<body>`.
    const SAMPLE_HTML: &str = r#"<html>
<head><meta charset="UTF-8"/></head>
<body>
<p>Lorem ipsum sample text goes here.</p>
</body>
</html>"#;

    /// A document without a `<title>` gains exactly one, holding the given text.
    #[test]
    fn test_insert_title_elem() {
        let expected_title = "Sample title";
        let markup = r#"<html><head><meta charset="UTF-8"/></head><body></body></html>"#;
        let document = kuchiki::parse_html().one(markup);
        assert_eq!(0, document.select("title").unwrap().count());

        insert_title_elem(&document, expected_title);

        assert_eq!(1, document.select("title").unwrap().count());
        let title_elem = document.select_first("title").unwrap();
        assert_eq!(expected_title, title_elem.text_contents());
    }

    /// The qualified name is built in the XHTML namespace with no prefix.
    #[test]
    fn test_create_qualname() {
        let tag = "div";
        let expected = QualName::new(
            None,
            Namespace::from("http://www.w3.org/1999/xhtml"),
            LocalName::from(tag),
        );
        assert_eq!(create_qualname(tag), expected);
    }

    /// Each `CSSConfig` variant inlines the expected stylesheet contents (or none at all).
    #[test]
    fn test_inline_css() {
        let body_css = include_str!("./assets/body.min.css");
        let headers_css = include_str!("./assets/headers.min.css");

        let document = kuchiki::parse_html().one(SAMPLE_HTML);
        assert_eq!(0, document.select("style").unwrap().count());

        // No stylesheet is requested, so no <style> element may appear
        inline_css(&document, &CSSConfig::None);
        assert_eq!(0, document.select("style").unwrap().count());

        // Only the body stylesheet is expected
        inline_css(&document, &CSSConfig::NoHeaders);
        assert_eq!(1, document.select("style").unwrap().count());
        let inlined_style = document.select_first("style").unwrap();
        assert_eq!(body_css, inlined_style.text_contents());

        // Start from a fresh document: body and header stylesheets are concatenated
        let document = kuchiki::parse_html().one(SAMPLE_HTML);
        inline_css(&document, &CSSConfig::All);
        assert_eq!(1, document.select("style").unwrap().count());
        let inlined_style = document.select_first("style").unwrap();
        let expected_css = [body_css, headers_css].concat();
        assert_eq!(expected_css, inlined_style.text_contents());
    }

    /// The `<link>` pointing at `stylesheet.css` is gone after the call.
    #[test]
    fn test_remove_existing_stylesheet_link() {
        let markup = r#"<html>
<head><link href="stylesheet.css"></link></head>
<body><p>Lorem ipsum sample text goes here.</p></body></html>"#;
        let document = kuchiki::parse_html().one(markup);
        assert_eq!(1, document.select("link").unwrap().count());

        remove_existing_stylesheet_link(&document);

        assert_eq!(0, document.select("link").unwrap().count());
    }

    /// The appendix is a single `<footer>` with both headings and one link per article.
    #[test]
    fn test_insert_appendix() {
        let source_url = "http://example.org";
        let document = kuchiki::parse_html().one(SAMPLE_HTML);
        let metadata = MetaData::new();
        assert_eq!(0, document.select("footer").unwrap().count());

        insert_appendix(&document, vec![(&metadata, source_url)]);

        assert_eq!(1, document.select("footer").unwrap().count());

        assert_eq!(1, document.select("footer > h2").unwrap().count());
        let appendix_heading = document.select_first("footer > h2").unwrap();
        assert_eq!("Appendix", appendix_heading.text_contents());

        assert_eq!(1, document.select("footer > h3").unwrap().count());
        let sources_heading = document.select_first("footer > h3").unwrap();
        assert_eq!("Article sources", sources_heading.text_contents());

        assert_eq!(1, document.select("a").unwrap().count());
        let link_elem = document.select_first("a").unwrap();
        assert_eq!("http://example.org", link_elem.text_contents());
        let link_attrs = link_elem.attributes.borrow();
        assert_eq!(Some("http://example.org"), link_attrs.get("href"));
    }
}

View file

@ -150,6 +150,15 @@ async fn process_img_response<'a>(
let img_mime = img_response
.content_type()
.map(|mime| mime.essence().to_string());
if let Some(mime_str) = &img_mime {
if !mime_str.starts_with("image/") {
return Err(ErrorKind::HTTPError(format!(
"Invalid image MIME type: {} for {}",
mime_str, url
))
.into());
}
}
let img_ext = match img_response
.content_type()
.map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())