commit dc16f9f52b
6 changed files with 345 additions and 80 deletions
153 src/cli.rs
@@ -9,13 +9,14 @@ type Error = crate::errors::CliError<AppConfigBuilderError>;

const DEFAULT_MAX_CONN: usize = 8;

#[derive(derive_builder::Builder)]
#[derive(derive_builder::Builder, Debug)]
pub struct AppConfig {
/// Article urls
pub urls: Vec<String>,
pub max_conn: usize,
/// Path to file of multiple articles into a single article
pub merged: Option<String>,
// TODO: Change type to Path
pub output_directory: Option<String>,
pub log_level: LogLevel,
pub can_disable_progress_bar: bool,
@@ -95,7 +96,7 @@ impl<'a> TryFrom<ArgMatches<'a>> for AppConfig {
None => DEFAULT_MAX_CONN,
})
.merged(arg_matches.value_of("output-name").map(|name| {
let file_ext = format!(".{}", arg_matches.value_of("export").unwrap());
let file_ext = format!(".{}", arg_matches.value_of("export").unwrap_or("epub"));
if name.ends_with(&file_ext) {
name.to_owned()
} else {
@@ -132,10 +133,11 @@ impl<'a> TryFrom<ArgMatches<'a>> for AppConfig {
)
.output_directory(
arg_matches
.value_of("output_directory")
.value_of("output-directory")
.map(|output_directory| {
let path = Path::new(output_directory);
if !path.exists() {
// TODO: Create the directory
Err(Error::OutputDirectoryNotExists)
} else if !path.is_dir() {
Err(Error::WrongOutputDirectory)
@@ -157,14 +159,24 @@ impl<'a> TryFrom<ArgMatches<'a>> for AppConfig {
},
)
.export_type({
let export_type = arg_matches.value_of("export").unwrap();
let export_type = arg_matches.value_of("export").unwrap_or("epub");
if export_type == "html" {
ExportType::HTML
} else {
ExportType::EPUB
}
})
.is_inlining_images(arg_matches.is_present("inline-images"))
.is_inlining_images(
(if arg_matches.is_present("inline-images") {
if arg_matches.value_of("export") == Some("html") {
Ok(true)
} else {
Err(Error::WrongExportInliningImages)
}
} else {
Ok(false)
})?,
)
.try_init()
}
}
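Aside (not part of the commit above): the new inline-images check builds a Result and immediately propagates it with `?`, the same validate-then-propagate pattern used for the other builder fields. A minimal standalone sketch of that idea, using a hypothetical helper name (check_inline_images is not in the codebase):

    // Hypothetical sketch: validate a flag combination up front, then let `?` propagate the error.
    fn check_inline_images(is_present: bool, export: Option<&str>) -> Result<bool, &'static str> {
        match (is_present, export) {
            (true, Some("html")) => Ok(true),
            (true, _) => Err("--inline-images can only be used when exporting to html"),
            (false, _) => Ok(false),
        }
    }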
@@ -190,3 +202,134 @@ pub enum ExportType {
HTML,
EPUB,
}

#[cfg(test)]
mod test {
use super::*;

#[test]
fn test_clap_config_errors() {
let yaml_config = load_yaml!("cli_config.yml");
let app = App::from_yaml(yaml_config);

// It returns Ok when only a url is passed
let result = app
.clone()
.get_matches_from_safe(vec!["paperoni", "http://example.org"]);
assert!(result.is_ok());

// It returns an error when no args are passed
let result = app.clone().get_matches_from_safe(vec!["paperoni"]);
assert!(result.is_err());
assert_eq!(
clap::ErrorKind::MissingArgumentOrSubcommand,
result.unwrap_err().kind
);

// It returns an error when both output-dir and merge are used
let result = app.clone().get_matches_from_safe(vec![
"paperoni",
"http://example.org",
"--merge",
"foo",
"--output-dir",
"~",
]);
assert!(result.is_err());
assert_eq!(clap::ErrorKind::ArgumentConflict, result.unwrap_err().kind);

// It returns an error when both no-css and no-header-css are used
let result = app.clone().get_matches_from_safe(vec![
"paperoni",
"http://example.org",
"--no-css",
"--no-header-css",
]);
assert!(result.is_err());
assert_eq!(clap::ErrorKind::ArgumentConflict, result.unwrap_err().kind);

// It returns an error when inline-toc is used without merge
let result = app.clone().get_matches_from_safe(vec![
"paperoni",
"http://example.org",
"--inline-toc",
]);
assert!(result.is_err());
assert_eq!(
clap::ErrorKind::MissingRequiredArgument,
result.unwrap_err().kind
);

// It returns an error when inline-images is used without export
let result = app.clone().get_matches_from_safe(vec![
"paperoni",
"http://example.org",
"--inline-images",
]);
assert!(result.is_err());
assert_eq!(
clap::ErrorKind::MissingRequiredArgument,
result.unwrap_err().kind
);

// It returns an error when export is given an invalid value
let result = app.clone().get_matches_from_safe(vec![
"paperoni",
"http://example.org",
"--export",
"pdf",
]);
assert!(result.is_err());
assert_eq!(clap::ErrorKind::InvalidValue, result.unwrap_err().kind);

// It returns an error when a max-conn is given a negative number.
let result = app.clone().get_matches_from_safe(vec![
"paperoni",
"http://example.org",
"--max-conn",
"-1",
]);
assert!(result.is_err());
// The cli is configured not to accept negative numbers so passing "-1" would have it be read as a flag called 1
assert_eq!(clap::ErrorKind::UnknownArgument, result.unwrap_err().kind);
}
#[test]
fn test_init_with_cli() {
let yaml_config = load_yaml!("cli_config.yml");
let app = App::from_yaml(yaml_config);

// It returns an error when the urls passed are whitespace
let matches = app.clone().get_matches_from(vec!["paperoni", ""]);
let app_config = AppConfig::try_from(matches);
assert!(app_config.is_err());
assert_eq!(Error::NoUrls, app_config.unwrap_err());

// It returns an error when inline-toc is used when exporting to HTML
let matches = app.clone().get_matches_from(vec![
"paperoni",
"http://example.org",
"--merge",
"foo",
"--export",
"html",
"--inline-toc",
]);
let app_config = AppConfig::try_from(matches);
assert!(app_config.is_err());
assert_eq!(Error::WrongExportInliningToC, app_config.unwrap_err());

// It returns an Ok when inline-toc is used when exporting to epub
let matches = app.clone().get_matches_from(vec![
"paperoni",
"http://example.org",
"--merge",
"foo",
"--export",
"epub",
"--inline-toc",
]);
assert!(AppConfig::try_from(matches).is_ok());

// It returns an error when inline-images is used when exporting to epub
}
}
@@ -12,7 +12,7 @@ args:
long: file
help: Input file containing links
takes_value: true
- output_directory:
- output-directory:
short: o
long: output-dir
help: Directory to store output epub documents
@@ -70,7 +70,6 @@ args:
possible_values: [html, epub]
value_name: type
takes_value: true
default_value: epub
- inline-images:
long: inline-images
help: Inlines the article images when exporting to HTML using base64. Pass --help to learn more.
@@ -138,6 +138,14 @@ pub enum LogError {
CreateLogDirectoryError(#[from] std::io::Error),
}

// dumb hack to allow for comparing errors in testing.
// derive macros cannot be used because underlying errors like io::Error do not derive PartialEq
impl PartialEq for LogError {
fn eq(&self, other: &Self) -> bool {
format!("{:?}", self) == format!("{:?}", other)
}
}

#[derive(Debug, Error)]
pub enum CliError<BuilderError: Debug + Display> {
#[error("Failed to open file with urls: {0}")]
@@ -156,6 +164,16 @@ pub enum CliError<BuilderError: Debug + Display> {
OutputDirectoryNotExists,
#[error("Unable to start logger!\n{0}")]
LogError(#[from] LogError),
#[error("The --inline-toc can only be used exporting to epub")]
#[error("The --inline-toc flag can only be used when exporting to epub")]
WrongExportInliningToC,
#[error("The --inline-images flag can only be used when exporting to html")]
WrongExportInliningImages,
}

// dumb hack to allow for comparing errors in testing.
// derive macros cannot be used because underlying errors like io::Error do not derive PartialEq
impl<T: Debug + Display> PartialEq for CliError<T> {
fn eq(&self, other: &Self) -> bool {
format!("{:?}", self) == format!("{:?}", other)
}
}
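Aside (not part of the commit above): this Debug-based PartialEq is what lets the new cli tests compare error values directly with assert_eq!. A minimal sketch of the effect, assuming the Error alias for CliError<AppConfigBuilderError> from cli.rs:

    // Two values of the same variant compare equal because their Debug strings match.
    let a: Error = Error::OutputDirectoryNotExists;
    let b: Error = Error::OutputDirectoryNotExists;
    assert_eq!(a, b); // effectively format!("{:?}", a) == format!("{:?}", b)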
@@ -4,6 +4,7 @@ use kuchiki::{traits::*, NodeRef};
use crate::errors::PaperoniError;
use crate::moz_readability::{MetaData, Readability};

/// A tuple of the url and an Option of the resource's MIME type
pub type ResourceInfo = (String, Option<String>);

pub struct Article {
175 src/html.rs
@@ -12,7 +12,7 @@ use kuchiki::{traits::*, NodeRef};
use log::{debug, error, info};

use crate::{
cli::{self, AppConfig},
cli::{self, AppConfig, CSSConfig},
errors::PaperoniError,
extractor::Article,
moz_readability::MetaData,
@@ -91,14 +91,9 @@ pub fn generate_html_exports(
*id_attr = format!("readability-page-{}", idx);
}

for (img_url, mime_type_opt) in &article.img_urls {
if app_config.is_inlining_images {
info!("Inlining images for {}", title);
let result = update_imgs_base64(
article,
img_url,
mime_type_opt.as_deref().unwrap_or("image/*"),
);
let result = update_imgs_base64(article);

if let Err(e) = result {
let mut err: PaperoniError = e.into();
@@ -122,7 +117,7 @@ pub fn generate_html_exports(
info!("Successfully copied images to imgs dir for {}", title);
}
}
}

bar.inc(1);
successful_articles_table.add_row(vec![title]);
body_elem.as_node().append(article_elem.as_node().clone());
@@ -137,7 +132,8 @@ pub fn generate_html_exports(
.map(|article| (article.metadata(), article.url.as_str()))
.collect(),
);
inline_css(&base_html_elem, app_config);
inline_css(&base_html_elem, &app_config.css_config);
remove_existing_stylesheet_link(&base_html_elem);

info!("Added title, footer and inlined styles for {}", name);
@@ -199,13 +195,7 @@ pub fn generate_html_exports(
let mut out_file = File::create(&file_name)?;

if app_config.is_inlining_images {
for (img_url, mime_type_opt) in &article.img_urls {
update_imgs_base64(
article,
img_url,
mime_type_opt.as_deref().unwrap_or("image/*"),
)?
}
update_imgs_base64(article)?;
} else {
let base_path =
Path::new(app_config.output_directory.as_deref().unwrap_or("."));
@@ -233,7 +223,8 @@ pub fn generate_html_exports(

insert_title_elem(article.node_ref(), article.metadata().title());
insert_appendix(article.node_ref(), vec![(article.metadata(), &article.url)]);
inline_css(article.node_ref(), app_config);
inline_css(article.node_ref(), &app_config.css_config);
remove_existing_stylesheet_link(article.node_ref());

article.node_ref().serialize(&mut out_file)?;
Ok(())
@@ -268,15 +259,16 @@ fn create_qualname(name: &str) -> QualName {
}

/// Updates the src attribute of `<img>` elements with a base64 encoded string of the image data
fn update_imgs_base64(
article: &Article,
img_url: &str,
mime_type: &str,
) -> Result<(), std::io::Error> {
fn update_imgs_base64(article: &Article) -> Result<(), std::io::Error> {
let temp_dir = std::env::temp_dir();
for (img_url, mime_type) in &article.img_urls {
let img_path = temp_dir.join(img_url);
let img_bytes = std::fs::read(img_path)?;
let img_base64_str = format!("data:image:{};base64,{}", mime_type, encode(img_bytes));
let img_base64_str = format!(
"data:image:{};base64,{}",
mime_type.as_deref().unwrap_or("image/*"),
encode(img_bytes)
);

let img_elems = article
.node_ref()
@@ -288,6 +280,7 @@ fn update_imgs_base64(
*src_attr = img_base64_str.clone();
}
}
}
Ok(())
}
@@ -344,18 +337,23 @@ fn insert_appendix(root_node: &NodeRef, article_links: Vec<(&MetaData, &str)>) {
format!("<a href=\"{}\">{}</a><br></br>", url, article_name)
})
.collect();
let footer_inner_html = format!("<h2>Appendix</h2><h2>Article sources</h3>{}", link_tags);
let footer_elem =
kuchiki::parse_fragment(create_qualname("footer"), Vec::new()).one(footer_inner_html);
root_node.append(footer_elem);
let footer_inner_html = format!(
"<footer><h2>Appendix</h2><h3>Article sources</h3>{}</footer>",
link_tags
);
let footer_container =
kuchiki::parse_fragment(create_qualname("div"), Vec::new()).one(footer_inner_html);
let footer_elem = footer_container.select_first("footer").unwrap();

root_node.append(footer_elem.as_node().clone());
}

/// Inlines the CSS stylesheets into the HTML article node
fn inline_css(root_node: &NodeRef, app_config: &AppConfig) {
fn inline_css(root_node: &NodeRef, css_config: &CSSConfig) {
let body_stylesheet = include_str!("./assets/body.min.css");
let header_stylesheet = include_str!("./assets/headers.min.css");
let mut css_str = String::new();
match app_config.css_config {
match css_config {
cli::CSSConfig::NoHeaders => {
css_str.push_str(body_stylesheet);
}
@@ -371,21 +369,118 @@ fn inline_css(root_node: &NodeRef, app_config: &AppConfig) {
let style_container =
kuchiki::parse_fragment(create_qualname("div"), Vec::new()).one(css_html_str);
let style_elem = style_container.select_first("style").unwrap();
match root_node.select_first("head") {
Ok(head_elem) => {
let head_elem = root_node.select_first("head").expect(HEAD_ELEM_NOT_FOUND);
head_elem.as_node().prepend(style_elem.as_node().to_owned());
}
Err(_) => {
debug!("{}", HEAD_ELEM_NOT_FOUND);
let html_elem = root_node.select_first("html").unwrap();
let head_elem = NodeRef::new_element(create_qualname("head"), BTreeMap::new());
head_elem.prepend(style_elem.as_node().to_owned());
html_elem.as_node().prepend(head_elem);
}
}

// Remove the <link> of the stylesheet since styles are now inlined
/// Removes the <link> of the stylesheet. This is used when inlining styles
fn remove_existing_stylesheet_link(root_node: &NodeRef) {
if let Ok(style_link_elem) = root_node.select_first("link[href=\"stylesheet.css\"]") {
style_link_elem.as_node().detach();
};
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn test_insert_title_elem() {
let title = "Sample title";
let html_str = r#"<html><head><meta charset="UTF-8"/></head><body></body></html>"#;
let doc = kuchiki::parse_html().one(html_str);
assert_eq!(0, doc.select("title").unwrap().count());

insert_title_elem(&doc, title);
assert_eq!(1, doc.select("title").unwrap().count());
assert_eq!(title, doc.select_first("title").unwrap().text_contents());
}

#[test]
fn test_create_qualname() {
let name = "div";
assert_eq!(
create_qualname(name),
QualName::new(
None,
Namespace::from("http://www.w3.org/1999/xhtml"),
LocalName::from(name)
)
);
}

#[test]
fn test_inline_css() {
let html_str = r#"<html>
<head><meta charset="UTF-8"/></head>
<body>
<p>Lorem ipsum sample text goes here.</p>
</body>
</html>"#;
let doc = kuchiki::parse_html().one(html_str);
let body_stylesheet = include_str!("./assets/body.min.css");
let header_stylesheet = include_str!("./assets/headers.min.css");
assert_eq!(0, doc.select("style").unwrap().count());

inline_css(&doc, &CSSConfig::None);
assert_eq!(0, doc.select("style").unwrap().count());

inline_css(&doc, &CSSConfig::NoHeaders);
assert_eq!(1, doc.select("style").unwrap().count());
let style_elem = doc.select_first("style").unwrap();
assert_eq!(body_stylesheet, style_elem.text_contents());

let doc = kuchiki::parse_html().one(html_str);
inline_css(&doc, &CSSConfig::All);
assert_eq!(1, doc.select("style").unwrap().count());
let style_elem = doc.select_first("style").unwrap();
assert_eq!(
format!("{}{}", body_stylesheet, header_stylesheet),
style_elem.text_contents()
);
}
#[test]
fn test_remove_existing_stylesheet_link() {
let html_str = r#"<html>
<head><link href="stylesheet.css"></link></head>
<body><p>Lorem ipsum sample text goes here.</p></body></html>"#;
let doc = kuchiki::parse_html().one(html_str);
assert_eq!(1, doc.select("link").unwrap().count());
remove_existing_stylesheet_link(&doc);
assert_eq!(0, doc.select("link").unwrap().count());
}

#[test]
fn test_insert_appendix() {
let html_str = r#"<html>
<head><meta charset="UTF-8"/></head>
<body>
<p>Lorem ipsum sample text goes here.</p>
</body>
</html>"#;
let doc = kuchiki::parse_html().one(html_str);
let meta_data = MetaData::new();

assert_eq!(0, doc.select("footer").unwrap().count());

insert_appendix(&doc, vec![(&meta_data, "http://example.org")]);

assert_eq!(1, doc.select("footer").unwrap().count());
assert_eq!(1, doc.select("footer > h2").unwrap().count());
assert_eq!(
"Appendix",
doc.select_first("footer > h2").unwrap().text_contents()
);
assert_eq!(1, doc.select("footer > h3").unwrap().count());
assert_eq!(
"Article sources",
doc.select_first("footer > h3").unwrap().text_contents()
);
assert_eq!(1, doc.select("a").unwrap().count());

let anchor_elem = doc.select_first("a").unwrap();
assert_eq!("http://example.org", anchor_elem.text_contents());
let anchor_attrs = anchor_elem.attributes.borrow();
assert_eq!(Some("http://example.org"), anchor_attrs.get("href"));
}
}
@@ -150,6 +150,15 @@ async fn process_img_response<'a>(
let img_mime = img_response
.content_type()
.map(|mime| mime.essence().to_string());
if let Some(mime_str) = &img_mime {
if !mime_str.starts_with("image/") {
return Err(ErrorKind::HTTPError(format!(
"Invalid image MIME type: {} for {}",
mime_str, url
))
.into());
}
}
let img_ext = match img_response
.content_type()
.map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
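Aside (not part of the commit above): the new check rejects responses whose MIME essence (the type/subtype pair without parameters) is not an image type. A minimal standalone sketch of the same test on a plain string, with a hypothetical helper name:

    // Hypothetical sketch: "image/png" passes, "text/html" is rejected.
    fn is_image_mime(essence: &str) -> bool {
        essence.starts_with("image/")
    }
    assert!(is_image_mime("image/png"));
    assert!(!is_image_mime("text/html"));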