fix: fix broken links in toc generation

the fix ensures the ToC is generated before the document is serialized:
ToC generation mutates the document, so serializing first captures XHTML
without the anchors the ToC links point to, leaving every link broken.
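
a minimal, self-contained sketch of the ordering bug. the helpers are hypothetical
stand-ins, not paperoni's API, and the exact mutation (tagging headings with anchor
ids) is an assumption about what ToC generation does:

// Hypothetical stand-in for ToC generation: it mutates the document by
// tagging headings with the anchor ids the ToC links will target.
fn add_toc_anchors(doc: &mut Vec<String>) {
    for (i, line) in doc.iter_mut().enumerate() {
        if let Some(rest) = line.strip_prefix("<h2>") {
            *line = format!("<h2 id=\"heading-{}\">{}", i, rest);
        }
    }
}

fn main() {
    let mut doc = vec!["<h2>Intro</h2>".to_string(), "<p>Body</p>".to_string()];

    // buggy order: serialize first, then generate the ToC --
    // the snapshot has no anchor ids, so every ToC link is dead
    let stale = doc.join("\n");

    // fixed order (what this commit enforces): mutate, then serialize
    add_toc_anchors(&mut doc);
    let fresh = doc.join("\n");

    assert!(!stale.contains("id="));
    assert!(fresh.contains("id=\"heading-0\""));
}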

chore: add .vscode config to .gitignore
Kenneth Gitere 2021-06-16 18:09:05 +03:00
parent 282d229754
commit c6c10689eb
3 changed files with 71 additions and 74 deletions

.gitignore

@@ -1,3 +1,4 @@
 /target
 *.epub
 *.log
+.vscode/

src/epub.rs

@@ -8,11 +8,11 @@ use indicatif::{ProgressBar, ProgressStyle};
 use kuchiki::NodeRef;
 use log::{debug, error, info};
 
-use crate::{
-    cli::AppConfig,
-    errors::PaperoniError,
-    extractor::{self, Extractor},
-};
+use crate::{cli::AppConfig, errors::PaperoniError, extractor::Extractor};
+
+lazy_static! {
+    static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r#"(&|<|>|'|")"#).unwrap();
+}
 
 pub fn generate_epubs(
     articles: Vec<Extractor>,
@@ -82,16 +82,17 @@ pub fn generate_epubs(
             .enumerate()
             .fold(&mut epub, |epub, (idx, article)| {
                 let mut article_result = || -> Result<(), PaperoniError> {
-                    let mut xhtml_buf = Vec::new();
-                    extractor::serialize_to_xhtml(article.article(), &mut xhtml_buf)?;
-                    let xhtml_str = std::str::from_utf8(&xhtml_buf)?;
-                    let section_name = article.metadata().title();
                     let content_url = format!("article_{}.xhtml", idx);
-                    let mut content = EpubContent::new(&content_url, xhtml_str.as_bytes())
-                        .title(replace_escaped_characters(section_name));
+                    let mut xhtml_buf = Vec::new();
                     let header_level_tocs =
                         get_header_level_toc_vec(&content_url, article.article());
+
+                    serialize_to_xhtml(article.article(), &mut xhtml_buf)?;
+                    let xhtml_str = std::str::from_utf8(&xhtml_buf)?;
+                    let section_name = article.metadata().title();
+                    let mut content = EpubContent::new(&content_url, xhtml_str.as_bytes())
+                        .title(replace_escaped_characters(section_name));
 
                     for toc_element in header_level_tocs {
                         content = content.child(toc_element);
                     }
@@ -172,11 +173,11 @@ pub fn generate_epubs(
                 debug!("Creating {:?}", file_name);
                 let mut out_file = File::create(&file_name).unwrap();
                 let mut xhtml_buf = Vec::new();
-                extractor::serialize_to_xhtml(article.article(), &mut xhtml_buf)
-                    .expect("Unable to serialize to xhtml");
-                let xhtml_str = std::str::from_utf8(&xhtml_buf).unwrap();
                 let header_level_tocs =
                     get_header_level_toc_vec("index.xhtml", article.article());
+                serialize_to_xhtml(article.article(), &mut xhtml_buf)
+                    .expect("Unable to serialize to xhtml");
+                let xhtml_str = std::str::from_utf8(&xhtml_buf).unwrap();
 
                 if let Some(author) = article.metadata().byline() {
                     epub.metadata("author", replace_escaped_characters(author))?;
@@ -398,6 +399,60 @@ fn get_header_level_toc_vec(content_url: &str, article: &NodeRef) -> Vec<TocElem
     headers_vec
 }
 
+/// Serializes a NodeRef to a string that is XHTML compatible
+/// The only DOM nodes serialized are Text and Element nodes
+fn serialize_to_xhtml<W: std::io::Write>(
+    node_ref: &NodeRef,
+    mut w: &mut W,
+) -> Result<(), PaperoniError> {
+    let mut escape_map = HashMap::new();
+    escape_map.insert("<", "&lt;");
+    escape_map.insert(">", "&gt;");
+    escape_map.insert("&", "&amp;");
+    escape_map.insert("\"", "&quot;");
+    escape_map.insert("'", "&apos;");
+    for edge in node_ref.traverse_inclusive() {
+        match edge {
+            kuchiki::iter::NodeEdge::Start(n) => match n.data() {
+                kuchiki::NodeData::Text(rc_text) => {
+                    let text = rc_text.borrow();
+                    let esc_text = ESC_SEQ_REGEX
+                        .replace_all(&text, |captures: &regex::Captures| escape_map[&captures[1]]);
+                    write!(&mut w, "{}", esc_text)?;
+                }
+                kuchiki::NodeData::Element(elem_data) => {
+                    let attrs = elem_data.attributes.borrow();
+                    let attrs_str = attrs
+                        .map
+                        .iter()
+                        .filter(|(k, _)| !k.local.contains("\""))
+                        .map(|(k, v)| {
+                            format!(
+                                "{}=\"{}\"",
+                                k.local,
+                                ESC_SEQ_REGEX
+                                    .replace_all(&v.value, |captures: &regex::Captures| {
+                                        escape_map[&captures[1]]
+                                    })
+                            )
+                        })
+                        .fold("".to_string(), |acc, val| acc + " " + &val);
+                    write!(&mut w, "<{}{}>", &elem_data.name.local, attrs_str)?;
+                }
+                _ => (),
+            },
+            kuchiki::iter::NodeEdge::End(n) => match n.data() {
+                kuchiki::NodeData::Element(elem_data) => {
+                    write!(&mut w, "</{}>", &elem_data.name.local)?;
+                }
+                _ => (),
+            },
+        }
+    }
+    Ok(())
+}
+
 #[cfg(test)]
 mod test {
     use kuchiki::traits::*;

src/extractor.rs

@@ -1,5 +1,3 @@
-use std::collections::HashMap;
-
 use itertools::Itertools;
 use kuchiki::{traits::*, NodeRef};
@@ -8,10 +6,6 @@ use crate::moz_readability::{MetaData, Readability};
 pub type ResourceInfo = (String, Option<String>);
 
-lazy_static! {
-    static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r#"(&|<|>|'|")"#).unwrap();
-}
-
 pub struct Extractor {
     article: Option<NodeRef>,
     pub img_urls: Vec<ResourceInfo>,
@@ -83,59 +77,6 @@ impl Extractor {
     }
 }
 
-/// Serializes a NodeRef to a string that is XHTML compatible
-/// The only DOM nodes serialized are Text and Element nodes
-pub fn serialize_to_xhtml<W: std::io::Write>(
-    node_ref: &NodeRef,
-    mut w: &mut W,
-) -> Result<(), PaperoniError> {
-    let mut escape_map = HashMap::new();
-    escape_map.insert("<", "&lt;");
-    escape_map.insert(">", "&gt;");
-    escape_map.insert("&", "&amp;");
-    escape_map.insert("\"", "&quot;");
-    escape_map.insert("'", "&apos;");
-    for edge in node_ref.traverse_inclusive() {
-        match edge {
-            kuchiki::iter::NodeEdge::Start(n) => match n.data() {
-                kuchiki::NodeData::Text(rc_text) => {
-                    let text = rc_text.borrow();
-                    let esc_text = ESC_SEQ_REGEX
-                        .replace_all(&text, |captures: &regex::Captures| escape_map[&captures[1]]);
-                    write!(&mut w, "{}", esc_text)?;
-                }
-                kuchiki::NodeData::Element(elem_data) => {
-                    let attrs = elem_data.attributes.borrow();
-                    let attrs_str = attrs
-                        .map
-                        .iter()
-                        .filter(|(k, _)| !k.local.contains("\""))
-                        .map(|(k, v)| {
-                            format!(
-                                "{}=\"{}\"",
-                                k.local,
-                                ESC_SEQ_REGEX
-                                    .replace_all(&v.value, |captures: &regex::Captures| {
-                                        escape_map[&captures[1]]
-                                    })
-                            )
-                        })
-                        .fold("".to_string(), |acc, val| acc + " " + &val);
-                    write!(&mut w, "<{}{}>", &elem_data.name.local, attrs_str)?;
-                }
-                _ => (),
-            },
-            kuchiki::iter::NodeEdge::End(n) => match n.data() {
-                kuchiki::NodeData::Element(elem_data) => {
-                    write!(&mut w, "</{}>", &elem_data.name.local)?;
-                }
-                _ => (),
-            },
-        }
-    }
-    Ok(())
-}
-
 #[cfg(test)]
 mod test {
     use super::*;