feat: add HTML exports with inlining of images
fix: typo fix refactor: refactor `add_stylesheets` function
This commit is contained in:
parent
d1d1a0f3f4
commit
2f4da824ba
10 changed files with 477 additions and 25 deletions
3
.gitignore
vendored
3
.gitignore
vendored
|
@ -1,4 +1,7 @@
|
||||||
/target
|
/target
|
||||||
*.epub
|
*.epub
|
||||||
|
# Only ignore top level html files which may be made when testing
|
||||||
|
/*.html
|
||||||
|
*.pdf
|
||||||
*.log
|
*.log
|
||||||
.vscode/
|
.vscode/
|
1
Cargo.lock
generated
1
Cargo.lock
generated
|
@ -1555,6 +1555,7 @@ name = "paperoni"
|
||||||
version = "0.5.0-alpha1"
|
version = "0.5.0-alpha1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-std",
|
"async-std",
|
||||||
|
"base64",
|
||||||
"chrono",
|
"chrono",
|
||||||
"clap",
|
"clap",
|
||||||
"colored",
|
"colored",
|
||||||
|
|
|
@ -14,6 +14,7 @@ readme = "README.md"
|
||||||
[dependencies]
|
[dependencies]
|
||||||
# atty = "0.2.14"
|
# atty = "0.2.14"
|
||||||
async-std = "1.9.0"
|
async-std = "1.9.0"
|
||||||
|
base64 = "0.13.0"
|
||||||
chrono = "0.4.19"
|
chrono = "0.4.19"
|
||||||
clap = { version = "2.33.3", features = ["yaml"] }
|
clap = { version = "2.33.3", features = ["yaml"] }
|
||||||
colored = "2.0.0"
|
colored = "2.0.0"
|
||||||
|
|
26
src/cli.rs
26
src/cli.rs
|
@ -11,10 +11,10 @@ const DEFAULT_MAX_CONN: usize = 8;
|
||||||
|
|
||||||
#[derive(derive_builder::Builder)]
|
#[derive(derive_builder::Builder)]
|
||||||
pub struct AppConfig {
|
pub struct AppConfig {
|
||||||
/// Urls for store in epub
|
/// Article urls
|
||||||
pub urls: Vec<String>,
|
pub urls: Vec<String>,
|
||||||
pub max_conn: usize,
|
pub max_conn: usize,
|
||||||
/// Path to file of multiple articles into a single epub
|
/// Path to file of multiple articles into a single article
|
||||||
pub merged: Option<String>,
|
pub merged: Option<String>,
|
||||||
pub output_directory: Option<String>,
|
pub output_directory: Option<String>,
|
||||||
pub log_level: LogLevel,
|
pub log_level: LogLevel,
|
||||||
|
@ -23,6 +23,8 @@ pub struct AppConfig {
|
||||||
pub is_logging_to_file: bool,
|
pub is_logging_to_file: bool,
|
||||||
pub inline_toc: bool,
|
pub inline_toc: bool,
|
||||||
pub css_config: CSSConfig,
|
pub css_config: CSSConfig,
|
||||||
|
pub export_type: ExportType,
|
||||||
|
pub is_inlining_images: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl AppConfig {
|
impl AppConfig {
|
||||||
|
@ -93,10 +95,11 @@ impl<'a> TryFrom<ArgMatches<'a>> for AppConfig {
|
||||||
None => DEFAULT_MAX_CONN,
|
None => DEFAULT_MAX_CONN,
|
||||||
})
|
})
|
||||||
.merged(arg_matches.value_of("output-name").map(|name| {
|
.merged(arg_matches.value_of("output-name").map(|name| {
|
||||||
if name.ends_with(".epub") {
|
let file_ext = format!(".{}", arg_matches.value_of("export").unwrap());
|
||||||
|
if name.ends_with(&file_ext) {
|
||||||
name.to_owned()
|
name.to_owned()
|
||||||
} else {
|
} else {
|
||||||
name.to_string() + ".epub"
|
name.to_string() + &file_ext
|
||||||
}
|
}
|
||||||
}))
|
}))
|
||||||
.can_disable_progress_bar(
|
.can_disable_progress_bar(
|
||||||
|
@ -143,6 +146,15 @@ impl<'a> TryFrom<ArgMatches<'a>> for AppConfig {
|
||||||
_ => CSSConfig::All,
|
_ => CSSConfig::All,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
.export_type({
|
||||||
|
let export_type = arg_matches.value_of("export").unwrap();
|
||||||
|
if export_type == "html" {
|
||||||
|
ExportType::HTML
|
||||||
|
} else {
|
||||||
|
ExportType::EPUB
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.is_inlining_images(arg_matches.is_present("inline-images"))
|
||||||
.try_init()
|
.try_init()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -162,3 +174,9 @@ pub enum CSSConfig {
|
||||||
NoHeaders,
|
NoHeaders,
|
||||||
None,
|
None,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// File format that the downloaded articles are exported to.
///
/// The value is selected from the `--export` CLI argument: `html` maps to
/// [`ExportType::HTML`] and every other accepted value maps to
/// [`ExportType::EPUB`].
///
/// The enum carries no data, so it is `Copy` and can be compared directly.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum ExportType {
    /// Export the articles as HTML documents.
    HTML,
    /// Export the articles as EPUB documents.
    EPUB,
}
|
||||||
|
|
|
@ -64,3 +64,19 @@ args:
|
||||||
conflicts_with: no-css
|
conflicts_with: no-css
|
||||||
help: Removes the header CSS styling but preserves styling of images and codeblocks. To remove all the default CSS, use --no-css instead.
|
help: Removes the header CSS styling but preserves styling of images and codeblocks. To remove all the default CSS, use --no-css instead.
|
||||||
takes_value: false
|
takes_value: false
|
||||||
|
- export:
|
||||||
|
long: export
|
||||||
|
help: Specify the file type of the export. The type must be in lower case.
|
||||||
|
possible_values: [html, epub]
|
||||||
|
value_name: type
|
||||||
|
takes_value: true
|
||||||
|
default_value: epub
|
||||||
|
- inline-images:
|
||||||
|
long: inline-images
|
||||||
|
help: Inlines the article images when exporting to HTML using base64. Pass --help to learn more.
|
||||||
|
long_help: "Inlines the article images when exporting to HTML using base64.
|
||||||
|
\nThis is used when you do not want a separate folder created for images during HTML export.
|
||||||
|
\nNOTE: It uses base64 encoding on the images which results in larger HTML export sizes as each image
|
||||||
|
increases in size by about 25%-33%."
|
||||||
|
takes_value: false
|
||||||
|
requires: export
|
||||||
|
|
26
src/epub.rs
26
src/epub.rs
|
@ -12,7 +12,7 @@ use crate::{cli::AppConfig, errors::PaperoniError, extractor::Extractor};
|
||||||
|
|
||||||
lazy_static! {
|
lazy_static! {
|
||||||
static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r#"(&|<|>|'|")"#).unwrap();
|
static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r#"(&|<|>|'|")"#).unwrap();
|
||||||
static ref VALID_ATTR_CHARS_REGEX: regex::Regex = regex::Regex::new(r#"[a-z0-9\-_]"#).unwrap();
|
static ref VALID_ATTR_CHARS_REGEX: regex::Regex = regex::Regex::new(r#"[a-z0-9\-_:]"#).unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn generate_epubs(
|
pub fn generate_epubs(
|
||||||
|
@ -38,9 +38,6 @@ pub fn generate_epubs(
|
||||||
enabled_bar
|
enabled_bar
|
||||||
};
|
};
|
||||||
|
|
||||||
let body_stylesheet = include_bytes!("./assets/body.min.css");
|
|
||||||
let header_stylesheet = include_bytes!("./assets/headers.min.css");
|
|
||||||
|
|
||||||
let mut errors: Vec<PaperoniError> = Vec::new();
|
let mut errors: Vec<PaperoniError> = Vec::new();
|
||||||
|
|
||||||
match app_config.merged {
|
match app_config.merged {
|
||||||
|
@ -73,7 +70,7 @@ pub fn generate_epubs(
|
||||||
epub.inline_toc();
|
epub.inline_toc();
|
||||||
}
|
}
|
||||||
|
|
||||||
match add_stylesheets(&mut epub, body_stylesheet, header_stylesheet, app_config) {
|
match add_stylesheets(&mut epub, app_config) {
|
||||||
Ok(_) => (),
|
Ok(_) => (),
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
error!("Unable to add stylesheets to epub file");
|
error!("Unable to add stylesheets to epub file");
|
||||||
|
@ -148,6 +145,8 @@ pub fn generate_epubs(
|
||||||
let mut paperoni_err: PaperoniError = err.into();
|
let mut paperoni_err: PaperoniError = err.into();
|
||||||
paperoni_err.set_article_source(&name);
|
paperoni_err.set_article_source(&name);
|
||||||
errors.push(paperoni_err);
|
errors.push(paperoni_err);
|
||||||
|
error!("Failed to generate epub: {}", name);
|
||||||
|
bar.finish_with_message("epub generation failed\n");
|
||||||
return Err(errors);
|
return Err(errors);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -189,7 +188,7 @@ pub fn generate_epubs(
|
||||||
epub.metadata("author", replace_escaped_characters(author))?;
|
epub.metadata("author", replace_escaped_characters(author))?;
|
||||||
}
|
}
|
||||||
|
|
||||||
add_stylesheets(&mut epub, body_stylesheet, header_stylesheet, app_config)?;
|
add_stylesheets(&mut epub, app_config)?;
|
||||||
let title = replace_escaped_characters(article.metadata().title());
|
let title = replace_escaped_characters(article.metadata().title());
|
||||||
epub.metadata("title", &title)?;
|
epub.metadata("title", &title)?;
|
||||||
|
|
||||||
|
@ -206,7 +205,7 @@ pub fn generate_epubs(
|
||||||
let mut file_path = std::env::temp_dir();
|
let mut file_path = std::env::temp_dir();
|
||||||
file_path.push(&img.0);
|
file_path.push(&img.0);
|
||||||
|
|
||||||
let img_buf = File::open(&file_path).expect("Can't read file");
|
let img_buf = File::open(&file_path).expect("Can't read image file");
|
||||||
epub.add_resource(
|
epub.add_resource(
|
||||||
file_path.file_name().unwrap(),
|
file_path.file_name().unwrap(),
|
||||||
img_buf,
|
img_buf,
|
||||||
|
@ -252,10 +251,10 @@ fn replace_escaped_characters(value: &str) -> String {
|
||||||
|
|
||||||
fn add_stylesheets<T: epub_builder::Zip>(
|
fn add_stylesheets<T: epub_builder::Zip>(
|
||||||
epub: &mut EpubBuilder<T>,
|
epub: &mut EpubBuilder<T>,
|
||||||
body_stylesheet: &[u8],
|
|
||||||
header_stylesheet: &[u8],
|
|
||||||
app_config: &AppConfig,
|
app_config: &AppConfig,
|
||||||
) -> Result<(), epub_builder::Error> {
|
) -> Result<(), epub_builder::Error> {
|
||||||
|
let body_stylesheet: &[u8] = include_bytes!("./assets/body.min.css");
|
||||||
|
let header_stylesheet: &[u8] = include_bytes!("./assets/headers.min.css");
|
||||||
match app_config.css_config {
|
match app_config.css_config {
|
||||||
crate::cli::CSSConfig::All => {
|
crate::cli::CSSConfig::All => {
|
||||||
epub.stylesheet([header_stylesheet, body_stylesheet].concat().as_bytes())?;
|
epub.stylesheet([header_stylesheet, body_stylesheet].concat().as_bytes())?;
|
||||||
|
@ -434,6 +433,15 @@ fn serialize_to_xhtml<W: std::io::Write>(
|
||||||
node_ref: &NodeRef,
|
node_ref: &NodeRef,
|
||||||
mut w: &mut W,
|
mut w: &mut W,
|
||||||
) -> Result<(), PaperoniError> {
|
) -> Result<(), PaperoniError> {
|
||||||
|
{
|
||||||
|
// Add XHTML attributes
|
||||||
|
let html_elem = node_ref
|
||||||
|
.select_first("html")
|
||||||
|
.expect("Unable to get <html> element in article");
|
||||||
|
let mut html_attrs = html_elem.attributes.borrow_mut();
|
||||||
|
html_attrs.insert("xmlns", "http://www.w3.org/1999/xhtml".into());
|
||||||
|
html_attrs.insert("xmlns:epub", "http://www.idpf.org/2007/ops".into());
|
||||||
|
}
|
||||||
let mut escape_map = HashMap::new();
|
let mut escape_map = HashMap::new();
|
||||||
escape_map.insert("<", "<");
|
escape_map.insert("<", "<");
|
||||||
escape_map.insert(">", ">");
|
escape_map.insert(">", ">");
|
||||||
|
|
|
@ -30,7 +30,8 @@ impl Extractor {
|
||||||
self.readability.parse(&self.url)?;
|
self.readability.parse(&self.url)?;
|
||||||
if let Some(article_node_ref) = &self.readability.article_node {
|
if let Some(article_node_ref) = &self.readability.article_node {
|
||||||
let template = r#"
|
let template = r#"
|
||||||
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
<head>
|
<head>
|
||||||
<link rel="stylesheet" href="stylesheet.css" type="text/css"></link>
|
<link rel="stylesheet" href="stylesheet.css" type="text/css"></link>
|
||||||
</head>
|
</head>
|
||||||
|
|
391
src/html.rs
Normal file
391
src/html.rs
Normal file
|
@ -0,0 +1,391 @@
|
||||||
|
use std::{
|
||||||
|
collections::{BTreeMap, HashSet},
|
||||||
|
fs::{self, File},
|
||||||
|
path::Path,
|
||||||
|
};
|
||||||
|
|
||||||
|
use base64::encode;
|
||||||
|
use comfy_table::{Attribute, Cell, CellAlignment, Color, ContentArrangement, Table};
|
||||||
|
use html5ever::{LocalName, Namespace, QualName};
|
||||||
|
use indicatif::{ProgressBar, ProgressStyle};
|
||||||
|
use kuchiki::{traits::*, NodeRef};
|
||||||
|
use log::{debug, error, info};
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
cli::{self, AppConfig},
|
||||||
|
errors::PaperoniError,
|
||||||
|
extractor::Extractor,
|
||||||
|
moz_readability::MetaData,
|
||||||
|
};
|
||||||
|
|
||||||
|
const HEAD_ELEM_NOT_FOUND: &str =
|
||||||
|
"Unable to get <head> element to inline css. Ensure that the root node is the HTML document.";
|
||||||
|
const BASE_HTML_TEMPLATE: &str = r#"<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
</head>
|
||||||
|
<body></body>
|
||||||
|
</html>"#;
|
||||||
|
|
||||||
|
pub fn generate_html_exports(
|
||||||
|
articles: Vec<Extractor>,
|
||||||
|
app_config: &AppConfig,
|
||||||
|
successful_articles_table: &mut Table,
|
||||||
|
) -> Result<(), Vec<PaperoniError>> {
|
||||||
|
if articles.is_empty() {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
let bar = if app_config.can_disable_progress_bar {
|
||||||
|
ProgressBar::hidden()
|
||||||
|
} else {
|
||||||
|
let enabled_bar = ProgressBar::new(articles.len() as u64);
|
||||||
|
let style = ProgressStyle::default_bar().template(
|
||||||
|
"{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} html {pos}/{len:7} {msg:.green}",
|
||||||
|
);
|
||||||
|
enabled_bar.set_style(style);
|
||||||
|
if !articles.is_empty() {
|
||||||
|
enabled_bar.set_message("Generating html files");
|
||||||
|
}
|
||||||
|
enabled_bar
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut errors: Vec<PaperoniError> = Vec::new();
|
||||||
|
|
||||||
|
match app_config.merged {
|
||||||
|
Some(ref name) => {
|
||||||
|
successful_articles_table.set_header(vec![Cell::new("Table of Contents")
|
||||||
|
.add_attribute(Attribute::Bold)
|
||||||
|
.set_alignment(CellAlignment::Center)
|
||||||
|
.fg(Color::Green)]);
|
||||||
|
|
||||||
|
debug!("Creating {:?}", name);
|
||||||
|
|
||||||
|
let base_html_elem = kuchiki::parse_html().one(BASE_HTML_TEMPLATE);
|
||||||
|
let body_elem = base_html_elem.select_first("body").unwrap();
|
||||||
|
let base_path = Path::new(app_config.output_directory.as_deref().unwrap_or("."));
|
||||||
|
let img_dirs_path_name = name.trim_end_matches(".html");
|
||||||
|
let imgs_dir_path = base_path.join(img_dirs_path_name);
|
||||||
|
|
||||||
|
if !(app_config.is_inlining_images || imgs_dir_path.exists()) {
|
||||||
|
info!("Creating imgs dir in {:?} for {}", imgs_dir_path, name);
|
||||||
|
if let Err(e) = std::fs::create_dir(&imgs_dir_path) {
|
||||||
|
error!("Unable to create imgs dir for HTML file");
|
||||||
|
let err: PaperoniError = e.into();
|
||||||
|
errors.push(err);
|
||||||
|
return Err(errors);
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
for (idx, article) in articles.iter().enumerate() {
|
||||||
|
let article_elem = article
|
||||||
|
.article()
|
||||||
|
.select_first("div[id=\"readability-page-1\"]")
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let title = article.metadata().title();
|
||||||
|
|
||||||
|
let mut elem_attr = article_elem.attributes.borrow_mut();
|
||||||
|
if let Some(id_attr) = elem_attr.get_mut("id") {
|
||||||
|
*id_attr = format!("readability-page-{}", idx);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (img_url, mime_type_opt) in &article.img_urls {
|
||||||
|
if app_config.is_inlining_images {
|
||||||
|
info!("Inlining images for {}", title);
|
||||||
|
let result = update_imgs_base64(
|
||||||
|
article,
|
||||||
|
img_url,
|
||||||
|
mime_type_opt.as_deref().unwrap_or("image/*"),
|
||||||
|
);
|
||||||
|
|
||||||
|
if let Err(e) = result {
|
||||||
|
let mut err: PaperoniError = e.into();
|
||||||
|
err.set_article_source(title);
|
||||||
|
error!("Unable to copy images to imgs dir for {}", title);
|
||||||
|
errors.push(err);
|
||||||
|
}
|
||||||
|
|
||||||
|
info!("Completed inlining images for {}", title);
|
||||||
|
} else {
|
||||||
|
info!("Copying images to imgs dir for {}", title);
|
||||||
|
let result = update_img_urls(article, &imgs_dir_path).map_err(|e| {
|
||||||
|
let mut err: PaperoniError = e.into();
|
||||||
|
err.set_article_source(title);
|
||||||
|
err
|
||||||
|
});
|
||||||
|
if let Err(e) = result {
|
||||||
|
error!("Unable to copy images to imgs dir for {}", title);
|
||||||
|
errors.push(e);
|
||||||
|
} else {
|
||||||
|
info!("Successfully copied images to imgs dir for {}", title);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
bar.inc(1);
|
||||||
|
successful_articles_table.add_row(vec![title]);
|
||||||
|
body_elem.as_node().append(article_elem.as_node().clone());
|
||||||
|
debug!("Added {} to the export HTML file", title);
|
||||||
|
}
|
||||||
|
|
||||||
|
insert_title_elem(&base_html_elem, name);
|
||||||
|
insert_appendix(
|
||||||
|
&base_html_elem,
|
||||||
|
articles
|
||||||
|
.iter()
|
||||||
|
.map(|article| (article.metadata(), article.url.as_str()))
|
||||||
|
.collect(),
|
||||||
|
);
|
||||||
|
inline_css(&base_html_elem, app_config);
|
||||||
|
|
||||||
|
info!("Added title, footer and inlined styles for {}", name);
|
||||||
|
|
||||||
|
info!("Creating export HTML file: {}", name);
|
||||||
|
if let Err(mut err) = File::create(name)
|
||||||
|
.and_then(|mut out_file| base_html_elem.serialize(&mut out_file))
|
||||||
|
.map_err(|e| -> PaperoniError { e.into() })
|
||||||
|
{
|
||||||
|
error!("Failed to serialize articles to file: {}", name);
|
||||||
|
err.set_article_source(&name);
|
||||||
|
errors.push(err);
|
||||||
|
bar.finish_with_message("html generation failed");
|
||||||
|
return Err(errors);
|
||||||
|
};
|
||||||
|
|
||||||
|
bar.finish_with_message("Generated html file\n");
|
||||||
|
debug!("Created {:?}", name);
|
||||||
|
println!("Created {:?}", name);
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
successful_articles_table
|
||||||
|
.set_header(vec![Cell::new("Downloaded articles")
|
||||||
|
.add_attribute(Attribute::Bold)
|
||||||
|
.set_alignment(CellAlignment::Center)
|
||||||
|
.fg(Color::Green)])
|
||||||
|
.set_content_arrangement(ContentArrangement::Dynamic);
|
||||||
|
|
||||||
|
let mut file_names: HashSet<String> = HashSet::new();
|
||||||
|
|
||||||
|
for article in &articles {
|
||||||
|
let mut file_name = format!(
|
||||||
|
"{}/{}.html",
|
||||||
|
app_config.output_directory.as_deref().unwrap_or("."),
|
||||||
|
article
|
||||||
|
.metadata()
|
||||||
|
.title()
|
||||||
|
.replace("/", " ")
|
||||||
|
.replace("\\", " ")
|
||||||
|
);
|
||||||
|
|
||||||
|
if file_names.contains(&file_name) {
|
||||||
|
info!("Article name {:?} already exists", file_name);
|
||||||
|
file_name = format!(
|
||||||
|
"{}/{}_{}.html",
|
||||||
|
app_config.output_directory.as_deref().unwrap_or("."),
|
||||||
|
article
|
||||||
|
.metadata()
|
||||||
|
.title()
|
||||||
|
.replace("/", " ")
|
||||||
|
.replace("\\", " "),
|
||||||
|
file_names.len()
|
||||||
|
);
|
||||||
|
info!("Renamed to {:?}", file_name);
|
||||||
|
}
|
||||||
|
file_names.insert(file_name.clone());
|
||||||
|
|
||||||
|
debug!("Creating {:?}", file_name);
|
||||||
|
let export_article = || -> Result<(), PaperoniError> {
|
||||||
|
let mut out_file = File::create(&file_name)?;
|
||||||
|
|
||||||
|
if app_config.is_inlining_images {
|
||||||
|
for (img_url, mime_type_opt) in &article.img_urls {
|
||||||
|
update_imgs_base64(
|
||||||
|
article,
|
||||||
|
img_url,
|
||||||
|
mime_type_opt.as_deref().unwrap_or("image/*"),
|
||||||
|
)?
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
let base_path =
|
||||||
|
Path::new(app_config.output_directory.as_deref().unwrap_or("."));
|
||||||
|
let imgs_dir_name = article.metadata().title();
|
||||||
|
|
||||||
|
if !base_path.join(imgs_dir_name).exists() {
|
||||||
|
std::fs::create_dir(base_path.join(imgs_dir_name))?;
|
||||||
|
}
|
||||||
|
|
||||||
|
let imgs_dir_path = base_path.join(imgs_dir_name);
|
||||||
|
update_img_urls(article, &imgs_dir_path)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
let utf8_encoding =
|
||||||
|
NodeRef::new_element(create_qualname("meta"), BTreeMap::new());
|
||||||
|
if let Some(elem_node) = utf8_encoding.as_element() {
|
||||||
|
let mut elem_attrs = elem_node.attributes.borrow_mut();
|
||||||
|
elem_attrs.insert("charset", "UTF-8".into());
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Ok(head_elem) = article.article().select_first("head") {
|
||||||
|
let head_elem_node = head_elem.as_node();
|
||||||
|
head_elem_node.append(utf8_encoding);
|
||||||
|
};
|
||||||
|
|
||||||
|
insert_title_elem(article.article(), article.metadata().title());
|
||||||
|
insert_appendix(article.article(), vec![(article.metadata(), &article.url)]);
|
||||||
|
inline_css(article.article(), app_config);
|
||||||
|
|
||||||
|
article.article().serialize(&mut out_file)?;
|
||||||
|
Ok(())
|
||||||
|
};
|
||||||
|
|
||||||
|
if let Err(mut err) = export_article() {
|
||||||
|
err.set_article_source(&article.url);
|
||||||
|
errors.push(err);
|
||||||
|
}
|
||||||
|
debug!("Created {:?}", file_name);
|
||||||
|
|
||||||
|
bar.inc(1);
|
||||||
|
successful_articles_table.add_row(vec![article.metadata().title()]);
|
||||||
|
}
|
||||||
|
bar.finish_with_message("Generated HTML files\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if errors.is_empty() {
|
||||||
|
Ok(())
|
||||||
|
} else {
|
||||||
|
Err(errors)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn create_qualname(name: &str) -> QualName {
|
||||||
|
QualName::new(
|
||||||
|
None,
|
||||||
|
Namespace::from("http://www.w3.org/1999/xhtml"),
|
||||||
|
LocalName::from(name),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Updates the src attribute of `<img>` elements with a base64 encoded string of the image data
|
||||||
|
fn update_imgs_base64(
|
||||||
|
article: &Extractor,
|
||||||
|
img_url: &str,
|
||||||
|
mime_type: &str,
|
||||||
|
) -> Result<(), std::io::Error> {
|
||||||
|
let temp_dir = std::env::temp_dir();
|
||||||
|
let img_path = temp_dir.join(img_url);
|
||||||
|
let img_bytes = std::fs::read(img_path)?;
|
||||||
|
let img_base64_str = format!("data:image:{};base64,{}", mime_type, encode(img_bytes));
|
||||||
|
|
||||||
|
let img_elems = article
|
||||||
|
.article()
|
||||||
|
.select(&format!("img[src=\"{}\"]", img_url))
|
||||||
|
.unwrap();
|
||||||
|
for img_elem in img_elems {
|
||||||
|
let mut img_attr = img_elem.attributes.borrow_mut();
|
||||||
|
if let Some(src_attr) = img_attr.get_mut("src") {
|
||||||
|
*src_attr = img_base64_str.clone();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Updates the src attribute of `<img>` elements to the new `imgs_dir_path` and copies the image to the new file location
|
||||||
|
fn update_img_urls(article: &Extractor, imgs_dir_path: &Path) -> Result<(), std::io::Error> {
|
||||||
|
let temp_dir = std::env::temp_dir();
|
||||||
|
for (img_url, _) in &article.img_urls {
|
||||||
|
let (from, to) = (temp_dir.join(img_url), imgs_dir_path.join(img_url));
|
||||||
|
info!("Copying {:?} to {:?}", from, to);
|
||||||
|
fs::copy(from, to)?;
|
||||||
|
let img_elems = article
|
||||||
|
.article()
|
||||||
|
.select(&format!("img[src=\"{}\"]", img_url))
|
||||||
|
.unwrap();
|
||||||
|
for img_elem in img_elems {
|
||||||
|
let mut img_attr = img_elem.attributes.borrow_mut();
|
||||||
|
if let Some(src_attr) = img_attr.get_mut("src") {
|
||||||
|
*src_attr = imgs_dir_path.join(img_url).to_str().unwrap().into();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates a `<title>` element in an HTML document with the value set to the article's title
|
||||||
|
fn insert_title_elem(root_node: &NodeRef, title: &str) {
|
||||||
|
let title_content = NodeRef::new_text(title);
|
||||||
|
let title_elem = NodeRef::new_element(create_qualname("title"), BTreeMap::new());
|
||||||
|
title_elem.append(title_content);
|
||||||
|
match root_node.select_first("head") {
|
||||||
|
Ok(head_elem) => {
|
||||||
|
head_elem.as_node().append(title_elem);
|
||||||
|
}
|
||||||
|
Err(_) => {
|
||||||
|
debug!("{}", HEAD_ELEM_NOT_FOUND);
|
||||||
|
let html_elem = root_node.select_first("html").unwrap();
|
||||||
|
let head_elem = NodeRef::new_element(create_qualname("head"), BTreeMap::new());
|
||||||
|
head_elem.append(title_elem);
|
||||||
|
html_elem.as_node().prepend(head_elem);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates the appendix in an HTML document where article sources are added in a `<footer>` element
|
||||||
|
fn insert_appendix(root_node: &NodeRef, article_links: Vec<(&MetaData, &str)>) {
|
||||||
|
let link_tags: String = article_links
|
||||||
|
.iter()
|
||||||
|
.map(|(meta_data, url)| {
|
||||||
|
let article_name = if !meta_data.title().is_empty() {
|
||||||
|
meta_data.title()
|
||||||
|
} else {
|
||||||
|
url
|
||||||
|
};
|
||||||
|
format!("<a href=\"{}\">{}</a><br></br>", url, article_name)
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
let footer_inner_html = format!("<h2>Appendix</h2><h2>Article sources</h3>{}", link_tags);
|
||||||
|
let footer_elem =
|
||||||
|
kuchiki::parse_fragment(create_qualname("footer"), Vec::new()).one(footer_inner_html);
|
||||||
|
root_node.append(footer_elem);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Inlines the CSS stylesheets into the HTML article node
|
||||||
|
fn inline_css(root_node: &NodeRef, app_config: &AppConfig) {
|
||||||
|
let body_stylesheet = include_str!("./assets/body.min.css");
|
||||||
|
let header_stylesheet = include_str!("./assets/headers.min.css");
|
||||||
|
let mut css_str = String::new();
|
||||||
|
match app_config.css_config {
|
||||||
|
cli::CSSConfig::NoHeaders => {
|
||||||
|
css_str.push_str(body_stylesheet);
|
||||||
|
}
|
||||||
|
cli::CSSConfig::All => {
|
||||||
|
css_str.push_str(body_stylesheet);
|
||||||
|
css_str.push_str(header_stylesheet);
|
||||||
|
}
|
||||||
|
cli::CSSConfig::None => {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let css_html_str = format!("<style>{}</style>", css_str);
|
||||||
|
let style_container =
|
||||||
|
kuchiki::parse_fragment(create_qualname("div"), Vec::new()).one(css_html_str);
|
||||||
|
let style_elem = style_container.select_first("style").unwrap();
|
||||||
|
match root_node.select_first("head") {
|
||||||
|
Ok(head_elem) => {
|
||||||
|
head_elem.as_node().prepend(style_elem.as_node().to_owned());
|
||||||
|
}
|
||||||
|
Err(_) => {
|
||||||
|
debug!("{}", HEAD_ELEM_NOT_FOUND);
|
||||||
|
let html_elem = root_node.select_first("html").unwrap();
|
||||||
|
let head_elem = NodeRef::new_element(create_qualname("head"), BTreeMap::new());
|
||||||
|
head_elem.prepend(style_elem.as_node().to_owned());
|
||||||
|
html_elem.as_node().prepend(head_elem);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove the <link> of the stylesheet since styles are now inlined
|
||||||
|
if let Ok(style_link_elem) = root_node.select_first("link[href=\"stylesheet.css\"]") {
|
||||||
|
style_link_elem.as_node().detach();
|
||||||
|
};
|
||||||
|
}
|
|
@ -11,7 +11,7 @@ use crate::errors::PaperoniError;
|
||||||
|
|
||||||
pub fn display_summary(
|
pub fn display_summary(
|
||||||
initial_article_count: usize,
|
initial_article_count: usize,
|
||||||
succesful_articles_table: Table,
|
successful_articles_table: Table,
|
||||||
partial_downloads: Vec<PartialDownload>,
|
partial_downloads: Vec<PartialDownload>,
|
||||||
errors: Vec<PaperoniError>,
|
errors: Vec<PaperoniError>,
|
||||||
) {
|
) {
|
||||||
|
@ -31,7 +31,7 @@ pub fn display_summary(
|
||||||
);
|
);
|
||||||
|
|
||||||
if successfully_downloaded_count > 0 {
|
if successfully_downloaded_count > 0 {
|
||||||
println!("{}", succesful_articles_table);
|
println!("{}", successful_articles_table);
|
||||||
}
|
}
|
||||||
|
|
||||||
if partial_downloads_count > 0 {
|
if partial_downloads_count > 0 {
|
||||||
|
|
21
src/main.rs
21
src/main.rs
|
@ -12,6 +12,7 @@ mod cli;
|
||||||
mod epub;
|
mod epub;
|
||||||
mod errors;
|
mod errors;
|
||||||
mod extractor;
|
mod extractor;
|
||||||
|
mod html;
|
||||||
/// This module is responsible for async HTTP calls for downloading
|
/// This module is responsible for async HTTP calls for downloading
|
||||||
/// the HTML content and images
|
/// the HTML content and images
|
||||||
mod http;
|
mod http;
|
||||||
|
@ -20,6 +21,7 @@ mod moz_readability;
|
||||||
|
|
||||||
use cli::AppConfig;
|
use cli::AppConfig;
|
||||||
use epub::generate_epubs;
|
use epub::generate_epubs;
|
||||||
|
use html::generate_html_exports;
|
||||||
use logs::display_summary;
|
use logs::display_summary;
|
||||||
|
|
||||||
fn main() {
|
fn main() {
|
||||||
|
@ -64,22 +66,33 @@ fn run(app_config: AppConfig) {
|
||||||
let articles = download(&app_config, &bar, &mut partial_downloads, &mut errors);
|
let articles = download(&app_config, &bar, &mut partial_downloads, &mut errors);
|
||||||
bar.finish_with_message("Downloaded articles");
|
bar.finish_with_message("Downloaded articles");
|
||||||
|
|
||||||
let mut succesful_articles_table = Table::new();
|
let mut successful_articles_table = Table::new();
|
||||||
succesful_articles_table
|
successful_articles_table
|
||||||
.load_preset(UTF8_FULL)
|
.load_preset(UTF8_FULL)
|
||||||
.load_preset(UTF8_HORIZONTAL_BORDERS_ONLY)
|
.load_preset(UTF8_HORIZONTAL_BORDERS_ONLY)
|
||||||
.set_content_arrangement(ContentArrangement::Dynamic);
|
.set_content_arrangement(ContentArrangement::Dynamic);
|
||||||
match generate_epubs(articles, &app_config, &mut succesful_articles_table) {
|
|
||||||
|
match app_config.export_type {
|
||||||
|
cli::ExportType::EPUB => {
|
||||||
|
match generate_epubs(articles, &app_config, &mut successful_articles_table) {
|
||||||
Ok(_) => (),
|
Ok(_) => (),
|
||||||
Err(gen_epub_errors) => {
|
Err(gen_epub_errors) => {
|
||||||
errors.extend(gen_epub_errors);
|
errors.extend(gen_epub_errors);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
}
|
||||||
|
cli::ExportType::HTML => {
|
||||||
|
match generate_html_exports(articles, &app_config, &mut successful_articles_table) {
|
||||||
|
Ok(_) => (),
|
||||||
|
Err(gen_html_errors) => errors.extend(gen_html_errors),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let has_errors = !errors.is_empty() || !partial_downloads.is_empty();
|
let has_errors = !errors.is_empty() || !partial_downloads.is_empty();
|
||||||
display_summary(
|
display_summary(
|
||||||
app_config.urls.len(),
|
app_config.urls.len(),
|
||||||
succesful_articles_table,
|
successful_articles_table,
|
||||||
partial_downloads,
|
partial_downloads,
|
||||||
errors,
|
errors,
|
||||||
);
|
);
|
||||||
|
|
Reference in a new issue