From 2f4da824bac01c9b8af4e3656e462439c2fe9cc0 Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Sat, 24 Jul 2021 12:03:36 +0300 Subject: [PATCH] feat: add HTML exports with inlining of images fix: typo fix refactor: refactor `add_stylesheets` function --- .gitignore | 3 + Cargo.lock | 1 + Cargo.toml | 3 +- src/cli.rs | 26 ++- src/cli_config.yml | 16 ++ src/epub.rs | 26 +-- src/extractor.rs | 3 +- src/html.rs | 391 +++++++++++++++++++++++++++++++++++++++++++++ src/logs.rs | 4 +- src/main.rs | 29 +++- 10 files changed, 477 insertions(+), 25 deletions(-) create mode 100644 src/html.rs diff --git a/.gitignore b/.gitignore index 2b8060a..5e612be 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,7 @@ /target *.epub +# Only ignore top level html files which may be made when testing +/*.html +*.pdf *.log .vscode/ \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 8a52a90..2824333 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1555,6 +1555,7 @@ name = "paperoni" version = "0.5.0-alpha1" dependencies = [ "async-std", + "base64", "chrono", "clap", "colored", diff --git a/Cargo.toml b/Cargo.toml index bde8df1..fced683 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,8 +14,9 @@ readme = "README.md" [dependencies] # atty = "0.2.14" async-std = "1.9.0" +base64 = "0.13.0" chrono = "0.4.19" -clap = {version = "2.33.3", features = ["yaml"]} +clap = { version = "2.33.3", features = ["yaml"] } colored = "2.0.0" comfy-table = "3.0.0" derive_builder = "0.10.2" diff --git a/src/cli.rs b/src/cli.rs index c66f0de..67f8ace 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -11,10 +11,10 @@ const DEFAULT_MAX_CONN: usize = 8; #[derive(derive_builder::Builder)] pub struct AppConfig { - /// Urls for store in epub + /// Article urls pub urls: Vec, pub max_conn: usize, - /// Path to file of multiple articles into a single epub + /// Path to file of multiple articles into a single article pub merged: Option, pub output_directory: Option, pub log_level: LogLevel, @@ -23,6 +23,8 @@ pub struct AppConfig { pub is_logging_to_file: bool, pub inline_toc: bool, pub css_config: CSSConfig, + pub export_type: ExportType, + pub is_inlining_images: bool, } impl AppConfig { @@ -93,10 +95,11 @@ impl<'a> TryFrom> for AppConfig { None => DEFAULT_MAX_CONN, }) .merged(arg_matches.value_of("output-name").map(|name| { - if name.ends_with(".epub") { + let file_ext = format!(".{}", arg_matches.value_of("export").unwrap()); + if name.ends_with(&file_ext) { name.to_owned() } else { - name.to_string() + ".epub" + name.to_string() + &file_ext } })) .can_disable_progress_bar( @@ -143,6 +146,15 @@ impl<'a> TryFrom> for AppConfig { _ => CSSConfig::All, }, ) + .export_type({ + let export_type = arg_matches.value_of("export").unwrap(); + if export_type == "html" { + ExportType::HTML + } else { + ExportType::EPUB + } + }) + .is_inlining_images(arg_matches.is_present("inline-images")) .try_init() } } @@ -162,3 +174,9 @@ pub enum CSSConfig { NoHeaders, None, } + +#[derive(Clone, Debug)] +pub enum ExportType { + HTML, + EPUB, +} diff --git a/src/cli_config.yml b/src/cli_config.yml index a1d9424..88be611 100644 --- a/src/cli_config.yml +++ b/src/cli_config.yml @@ -64,3 +64,19 @@ args: conflicts_with: no-css help: Removes the header CSS styling but preserves styling of images and codeblocks. To remove all the default CSS, use --no-css instead. takes_value: false + - export: + long: export + help: Specify the file type of the export. The type must be in lower case. + possible_values: [html, epub] + value_name: type + takes_value: true + default_value: epub + - inline-images: + long: inline-images + help: Inlines the article images when exporting to HTML using base64. Pass --help to learn more. + long_help: "Inlines the article images when exporting to HTML using base64. + \nThis is used when you do not want a separate folder created for images during HTML export. + \nNOTE: It uses base64 encoding on the images which results in larger HTML export sizes as each image + increases in size by about 25%-33%." + takes_value: false + requires: export diff --git a/src/epub.rs b/src/epub.rs index c522470..8c280f1 100644 --- a/src/epub.rs +++ b/src/epub.rs @@ -12,7 +12,7 @@ use crate::{cli::AppConfig, errors::PaperoniError, extractor::Extractor}; lazy_static! { static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r#"(&|<|>|'|")"#).unwrap(); - static ref VALID_ATTR_CHARS_REGEX: regex::Regex = regex::Regex::new(r#"[a-z0-9\-_]"#).unwrap(); + static ref VALID_ATTR_CHARS_REGEX: regex::Regex = regex::Regex::new(r#"[a-z0-9\-_:]"#).unwrap(); } pub fn generate_epubs( @@ -38,9 +38,6 @@ pub fn generate_epubs( enabled_bar }; - let body_stylesheet = include_bytes!("./assets/body.min.css"); - let header_stylesheet = include_bytes!("./assets/headers.min.css"); - let mut errors: Vec = Vec::new(); match app_config.merged { @@ -73,7 +70,7 @@ pub fn generate_epubs( epub.inline_toc(); } - match add_stylesheets(&mut epub, body_stylesheet, header_stylesheet, app_config) { + match add_stylesheets(&mut epub, app_config) { Ok(_) => (), Err(e) => { error!("Unable to add stylesheets to epub file"); @@ -148,6 +145,8 @@ pub fn generate_epubs( let mut paperoni_err: PaperoniError = err.into(); paperoni_err.set_article_source(&name); errors.push(paperoni_err); + error!("Failed to generate epub: {}", name); + bar.finish_with_message("epub generation failed\n"); return Err(errors); } } @@ -189,7 +188,7 @@ pub fn generate_epubs( epub.metadata("author", replace_escaped_characters(author))?; } - add_stylesheets(&mut epub, body_stylesheet, header_stylesheet, app_config)?; + add_stylesheets(&mut epub, app_config)?; let title = replace_escaped_characters(article.metadata().title()); epub.metadata("title", &title)?; @@ -206,7 +205,7 @@ pub fn generate_epubs( let mut file_path = std::env::temp_dir(); file_path.push(&img.0); - let img_buf = File::open(&file_path).expect("Can't read file"); + let img_buf = File::open(&file_path).expect("Can't read image file"); epub.add_resource( file_path.file_name().unwrap(), img_buf, @@ -252,10 +251,10 @@ fn replace_escaped_characters(value: &str) -> String { fn add_stylesheets( epub: &mut EpubBuilder, - body_stylesheet: &[u8], - header_stylesheet: &[u8], app_config: &AppConfig, ) -> Result<(), epub_builder::Error> { + let body_stylesheet: &[u8] = include_bytes!("./assets/body.min.css"); + let header_stylesheet: &[u8] = include_bytes!("./assets/headers.min.css"); match app_config.css_config { crate::cli::CSSConfig::All => { epub.stylesheet([header_stylesheet, body_stylesheet].concat().as_bytes())?; @@ -434,6 +433,15 @@ fn serialize_to_xhtml( node_ref: &NodeRef, mut w: &mut W, ) -> Result<(), PaperoniError> { + { + // Add XHTML attributes + let html_elem = node_ref + .select_first("html") + .expect("Unable to get element in article"); + let mut html_attrs = html_elem.attributes.borrow_mut(); + html_attrs.insert("xmlns", "http://www.w3.org/1999/xhtml".into()); + html_attrs.insert("xmlns:epub", "http://www.idpf.org/2007/ops".into()); + } let mut escape_map = HashMap::new(); escape_map.insert("<", "<"); escape_map.insert(">", ">"); diff --git a/src/extractor.rs b/src/extractor.rs index f427e0f..9df5168 100644 --- a/src/extractor.rs +++ b/src/extractor.rs @@ -30,7 +30,8 @@ impl Extractor { self.readability.parse(&self.url)?; if let Some(article_node_ref) = &self.readability.article_node { let template = r#" - + + diff --git a/src/html.rs b/src/html.rs new file mode 100644 index 0000000..a26fe85 --- /dev/null +++ b/src/html.rs @@ -0,0 +1,391 @@ +use std::{ + collections::{BTreeMap, HashSet}, + fs::{self, File}, + path::Path, +}; + +use base64::encode; +use comfy_table::{Attribute, Cell, CellAlignment, Color, ContentArrangement, Table}; +use html5ever::{LocalName, Namespace, QualName}; +use indicatif::{ProgressBar, ProgressStyle}; +use kuchiki::{traits::*, NodeRef}; +use log::{debug, error, info}; + +use crate::{ + cli::{self, AppConfig}, + errors::PaperoniError, + extractor::Extractor, + moz_readability::MetaData, +}; + +const HEAD_ELEM_NOT_FOUND: &str = + "Unable to get element to inline css. Ensure that the root node is the HTML document."; +const BASE_HTML_TEMPLATE: &str = r#" + + + + + +"#; + +pub fn generate_html_exports( + articles: Vec, + app_config: &AppConfig, + successful_articles_table: &mut Table, +) -> Result<(), Vec> { + if articles.is_empty() { + return Ok(()); + } + + let bar = if app_config.can_disable_progress_bar { + ProgressBar::hidden() + } else { + let enabled_bar = ProgressBar::new(articles.len() as u64); + let style = ProgressStyle::default_bar().template( + "{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} html {pos}/{len:7} {msg:.green}", + ); + enabled_bar.set_style(style); + if !articles.is_empty() { + enabled_bar.set_message("Generating html files"); + } + enabled_bar + }; + + let mut errors: Vec = Vec::new(); + + match app_config.merged { + Some(ref name) => { + successful_articles_table.set_header(vec![Cell::new("Table of Contents") + .add_attribute(Attribute::Bold) + .set_alignment(CellAlignment::Center) + .fg(Color::Green)]); + + debug!("Creating {:?}", name); + + let base_html_elem = kuchiki::parse_html().one(BASE_HTML_TEMPLATE); + let body_elem = base_html_elem.select_first("body").unwrap(); + let base_path = Path::new(app_config.output_directory.as_deref().unwrap_or(".")); + let img_dirs_path_name = name.trim_end_matches(".html"); + let imgs_dir_path = base_path.join(img_dirs_path_name); + + if !(app_config.is_inlining_images || imgs_dir_path.exists()) { + info!("Creating imgs dir in {:?} for {}", imgs_dir_path, name); + if let Err(e) = std::fs::create_dir(&imgs_dir_path) { + error!("Unable to create imgs dir for HTML file"); + let err: PaperoniError = e.into(); + errors.push(err); + return Err(errors); + }; + } + + for (idx, article) in articles.iter().enumerate() { + let article_elem = article + .article() + .select_first("div[id=\"readability-page-1\"]") + .unwrap(); + + let title = article.metadata().title(); + + let mut elem_attr = article_elem.attributes.borrow_mut(); + if let Some(id_attr) = elem_attr.get_mut("id") { + *id_attr = format!("readability-page-{}", idx); + } + + for (img_url, mime_type_opt) in &article.img_urls { + if app_config.is_inlining_images { + info!("Inlining images for {}", title); + let result = update_imgs_base64( + article, + img_url, + mime_type_opt.as_deref().unwrap_or("image/*"), + ); + + if let Err(e) = result { + let mut err: PaperoniError = e.into(); + err.set_article_source(title); + error!("Unable to copy images to imgs dir for {}", title); + errors.push(err); + } + + info!("Completed inlining images for {}", title); + } else { + info!("Copying images to imgs dir for {}", title); + let result = update_img_urls(article, &imgs_dir_path).map_err(|e| { + let mut err: PaperoniError = e.into(); + err.set_article_source(title); + err + }); + if let Err(e) = result { + error!("Unable to copy images to imgs dir for {}", title); + errors.push(e); + } else { + info!("Successfully copied images to imgs dir for {}", title); + } + } + } + bar.inc(1); + successful_articles_table.add_row(vec![title]); + body_elem.as_node().append(article_elem.as_node().clone()); + debug!("Added {} to the export HTML file", title); + } + + insert_title_elem(&base_html_elem, name); + insert_appendix( + &base_html_elem, + articles + .iter() + .map(|article| (article.metadata(), article.url.as_str())) + .collect(), + ); + inline_css(&base_html_elem, app_config); + + info!("Added title, footer and inlined styles for {}", name); + + info!("Creating export HTML file: {}", name); + if let Err(mut err) = File::create(name) + .and_then(|mut out_file| base_html_elem.serialize(&mut out_file)) + .map_err(|e| -> PaperoniError { e.into() }) + { + error!("Failed to serialize articles to file: {}", name); + err.set_article_source(&name); + errors.push(err); + bar.finish_with_message("html generation failed"); + return Err(errors); + }; + + bar.finish_with_message("Generated html file\n"); + debug!("Created {:?}", name); + println!("Created {:?}", name); + } + None => { + successful_articles_table + .set_header(vec![Cell::new("Downloaded articles") + .add_attribute(Attribute::Bold) + .set_alignment(CellAlignment::Center) + .fg(Color::Green)]) + .set_content_arrangement(ContentArrangement::Dynamic); + + let mut file_names: HashSet = HashSet::new(); + + for article in &articles { + let mut file_name = format!( + "{}/{}.html", + app_config.output_directory.as_deref().unwrap_or("."), + article + .metadata() + .title() + .replace("/", " ") + .replace("\\", " ") + ); + + if file_names.contains(&file_name) { + info!("Article name {:?} already exists", file_name); + file_name = format!( + "{}/{}_{}.html", + app_config.output_directory.as_deref().unwrap_or("."), + article + .metadata() + .title() + .replace("/", " ") + .replace("\\", " "), + file_names.len() + ); + info!("Renamed to {:?}", file_name); + } + file_names.insert(file_name.clone()); + + debug!("Creating {:?}", file_name); + let export_article = || -> Result<(), PaperoniError> { + let mut out_file = File::create(&file_name)?; + + if app_config.is_inlining_images { + for (img_url, mime_type_opt) in &article.img_urls { + update_imgs_base64( + article, + img_url, + mime_type_opt.as_deref().unwrap_or("image/*"), + )? + } + } else { + let base_path = + Path::new(app_config.output_directory.as_deref().unwrap_or(".")); + let imgs_dir_name = article.metadata().title(); + + if !base_path.join(imgs_dir_name).exists() { + std::fs::create_dir(base_path.join(imgs_dir_name))?; + } + + let imgs_dir_path = base_path.join(imgs_dir_name); + update_img_urls(article, &imgs_dir_path)?; + } + + let utf8_encoding = + NodeRef::new_element(create_qualname("meta"), BTreeMap::new()); + if let Some(elem_node) = utf8_encoding.as_element() { + let mut elem_attrs = elem_node.attributes.borrow_mut(); + elem_attrs.insert("charset", "UTF-8".into()); + } + + if let Ok(head_elem) = article.article().select_first("head") { + let head_elem_node = head_elem.as_node(); + head_elem_node.append(utf8_encoding); + }; + + insert_title_elem(article.article(), article.metadata().title()); + insert_appendix(article.article(), vec![(article.metadata(), &article.url)]); + inline_css(article.article(), app_config); + + article.article().serialize(&mut out_file)?; + Ok(()) + }; + + if let Err(mut err) = export_article() { + err.set_article_source(&article.url); + errors.push(err); + } + debug!("Created {:?}", file_name); + + bar.inc(1); + successful_articles_table.add_row(vec![article.metadata().title()]); + } + bar.finish_with_message("Generated HTML files\n"); + } + } + + if errors.is_empty() { + Ok(()) + } else { + Err(errors) + } +} + +fn create_qualname(name: &str) -> QualName { + QualName::new( + None, + Namespace::from("http://www.w3.org/1999/xhtml"), + LocalName::from(name), + ) +} + +/// Updates the src attribute of `` elements with a base64 encoded string of the image data +fn update_imgs_base64( + article: &Extractor, + img_url: &str, + mime_type: &str, +) -> Result<(), std::io::Error> { + let temp_dir = std::env::temp_dir(); + let img_path = temp_dir.join(img_url); + let img_bytes = std::fs::read(img_path)?; + let img_base64_str = format!("data:image:{};base64,{}", mime_type, encode(img_bytes)); + + let img_elems = article + .article() + .select(&format!("img[src=\"{}\"]", img_url)) + .unwrap(); + for img_elem in img_elems { + let mut img_attr = img_elem.attributes.borrow_mut(); + if let Some(src_attr) = img_attr.get_mut("src") { + *src_attr = img_base64_str.clone(); + } + } + Ok(()) +} + +/// Updates the src attribute of `` elements to the new `imgs_dir_path` and copies the image to the new file location +fn update_img_urls(article: &Extractor, imgs_dir_path: &Path) -> Result<(), std::io::Error> { + let temp_dir = std::env::temp_dir(); + for (img_url, _) in &article.img_urls { + let (from, to) = (temp_dir.join(img_url), imgs_dir_path.join(img_url)); + info!("Copying {:?} to {:?}", from, to); + fs::copy(from, to)?; + let img_elems = article + .article() + .select(&format!("img[src=\"{}\"]", img_url)) + .unwrap(); + for img_elem in img_elems { + let mut img_attr = img_elem.attributes.borrow_mut(); + if let Some(src_attr) = img_attr.get_mut("src") { + *src_attr = imgs_dir_path.join(img_url).to_str().unwrap().into(); + } + } + } + Ok(()) +} + +/// Creates a `` element in an HTML document with the value set to the article's title +fn insert_title_elem(root_node: &NodeRef, title: &str) { + let title_content = NodeRef::new_text(title); + let title_elem = NodeRef::new_element(create_qualname("title"), BTreeMap::new()); + title_elem.append(title_content); + match root_node.select_first("head") { + Ok(head_elem) => { + head_elem.as_node().append(title_elem); + } + Err(_) => { + debug!("{}", HEAD_ELEM_NOT_FOUND); + let html_elem = root_node.select_first("html").unwrap(); + let head_elem = NodeRef::new_element(create_qualname("head"), BTreeMap::new()); + head_elem.append(title_elem); + html_elem.as_node().prepend(head_elem); + } + } +} + +/// Creates the appendix in an HTML document where article sources are added in a `<footer>` element +fn insert_appendix(root_node: &NodeRef, article_links: Vec<(&MetaData, &str)>) { + let link_tags: String = article_links + .iter() + .map(|(meta_data, url)| { + let article_name = if !meta_data.title().is_empty() { + meta_data.title() + } else { + url + }; + format!("<a href=\"{}\">{}</a><br></br>", url, article_name) + }) + .collect(); + let footer_inner_html = format!("<h2>Appendix</h2><h2>Article sources</h3>{}", link_tags); + let footer_elem = + kuchiki::parse_fragment(create_qualname("footer"), Vec::new()).one(footer_inner_html); + root_node.append(footer_elem); +} + +/// Inlines the CSS stylesheets into the HTML article node +fn inline_css(root_node: &NodeRef, app_config: &AppConfig) { + let body_stylesheet = include_str!("./assets/body.min.css"); + let header_stylesheet = include_str!("./assets/headers.min.css"); + let mut css_str = String::new(); + match app_config.css_config { + cli::CSSConfig::NoHeaders => { + css_str.push_str(body_stylesheet); + } + cli::CSSConfig::All => { + css_str.push_str(body_stylesheet); + css_str.push_str(header_stylesheet); + } + cli::CSSConfig::None => { + return; + } + } + let css_html_str = format!("<style>{}</style>", css_str); + let style_container = + kuchiki::parse_fragment(create_qualname("div"), Vec::new()).one(css_html_str); + let style_elem = style_container.select_first("style").unwrap(); + match root_node.select_first("head") { + Ok(head_elem) => { + head_elem.as_node().prepend(style_elem.as_node().to_owned()); + } + Err(_) => { + debug!("{}", HEAD_ELEM_NOT_FOUND); + let html_elem = root_node.select_first("html").unwrap(); + let head_elem = NodeRef::new_element(create_qualname("head"), BTreeMap::new()); + head_elem.prepend(style_elem.as_node().to_owned()); + html_elem.as_node().prepend(head_elem); + } + } + + // Remove the <link> of the stylesheet since styles are now inlined + if let Ok(style_link_elem) = root_node.select_first("link[href=\"stylesheet.css\"]") { + style_link_elem.as_node().detach(); + }; +} diff --git a/src/logs.rs b/src/logs.rs index a0f51d8..7ce58dc 100644 --- a/src/logs.rs +++ b/src/logs.rs @@ -11,7 +11,7 @@ use crate::errors::PaperoniError; pub fn display_summary( initial_article_count: usize, - succesful_articles_table: Table, + successful_articles_table: Table, partial_downloads: Vec<PartialDownload>, errors: Vec<PaperoniError>, ) { @@ -31,7 +31,7 @@ pub fn display_summary( ); if successfully_downloaded_count > 0 { - println!("{}", succesful_articles_table); + println!("{}", successful_articles_table); } if partial_downloads_count > 0 { diff --git a/src/main.rs b/src/main.rs index e378115..925beca 100644 --- a/src/main.rs +++ b/src/main.rs @@ -12,6 +12,7 @@ mod cli; mod epub; mod errors; mod extractor; +mod html; /// This module is responsible for async HTTP calls for downloading /// the HTML content and images mod http; @@ -20,6 +21,7 @@ mod moz_readability; use cli::AppConfig; use epub::generate_epubs; +use html::generate_html_exports; use logs::display_summary; fn main() { @@ -64,22 +66,33 @@ fn run(app_config: AppConfig) { let articles = download(&app_config, &bar, &mut partial_downloads, &mut errors); bar.finish_with_message("Downloaded articles"); - let mut succesful_articles_table = Table::new(); - succesful_articles_table + let mut successful_articles_table = Table::new(); + successful_articles_table .load_preset(UTF8_FULL) .load_preset(UTF8_HORIZONTAL_BORDERS_ONLY) .set_content_arrangement(ContentArrangement::Dynamic); - match generate_epubs(articles, &app_config, &mut succesful_articles_table) { - Ok(_) => (), - Err(gen_epub_errors) => { - errors.extend(gen_epub_errors); + + match app_config.export_type { + cli::ExportType::EPUB => { + match generate_epubs(articles, &app_config, &mut successful_articles_table) { + Ok(_) => (), + Err(gen_epub_errors) => { + errors.extend(gen_epub_errors); + } + }; } - }; + cli::ExportType::HTML => { + match generate_html_exports(articles, &app_config, &mut successful_articles_table) { + Ok(_) => (), + Err(gen_html_errors) => errors.extend(gen_html_errors), + } + } + } let has_errors = !errors.is_empty() || !partial_downloads.is_empty(); display_summary( app_config.urls.len(), - succesful_articles_table, + successful_articles_table, partial_downloads, errors, );