fix: fix broken links in toc generation

the fix ensures the ToC is generated prior to serialization, because ToC
generation mutates the document and the generated links will not resolve otherwise.

chore: add .vscode config to .gitignore
Kenneth Gitere 2021-06-16 18:09:05 +03:00
parent 282d229754
commit c6c10689eb
3 changed files with 71 additions and 74 deletions
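
Why the order matters, as a minimal sketch in plain Rust (the types below are hypothetical stand-ins, not paperoni's actual API): generating the ToC assigns anchor ids to headings in the document, so serializing before the ToC runs produces XHTML without the anchors its links point at.

// Hypothetical stand-ins for the real document/ToC types, to show the
// order dependency only.
struct Doc {
    headings: Vec<(String, Option<String>)>, // (text, anchor id)
}

// Generating the ToC mutates the document: each heading gets an id that
// the ToC links point at.
fn generate_toc(doc: &mut Doc) -> Vec<String> {
    doc.headings
        .iter_mut()
        .enumerate()
        .map(|(i, (text, id))| {
            let anchor = format!("heading_{}", i);
            *id = Some(anchor.clone());
            format!("<a href=\"#{}\">{}</a>", anchor, text)
        })
        .collect()
}

fn serialize(doc: &Doc) -> String {
    doc.headings
        .iter()
        .map(|(text, id)| match id {
            Some(id) => format!("<h2 id=\"{}\">{}</h2>", id, text),
            None => format!("<h2>{}</h2>", text), // no anchor: the ToC link is broken
        })
        .collect()
}

fn main() {
    let mut doc = Doc {
        headings: vec![("Intro".to_string(), None)],
    };
    // Correct order, as in this commit: mutate first, then serialize.
    let toc = generate_toc(&mut doc);
    let xhtml = serialize(&doc);
    assert!(toc[0].contains("#heading_0"));
    assert!(xhtml.contains("id=\"heading_0\"")); // anchor exists because the ToC ran first
}

With the two calls swapped, the second assert fails: the serialized output has no id attributes even though the ToC links still point at #heading_0.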

.gitignore
@@ -1,3 +1,4 @@
 /target
 *.epub
 *.log
+.vscode/

src/epub.rs

@@ -8,11 +8,11 @@ use indicatif::{ProgressBar, ProgressStyle};
 use kuchiki::NodeRef;
 use log::{debug, error, info};
 
-use crate::{
-    cli::AppConfig,
-    errors::PaperoniError,
-    extractor::{self, Extractor},
-};
+use crate::{cli::AppConfig, errors::PaperoniError, extractor::Extractor};
+
+lazy_static! {
+    static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r#"(&|<|>|'|")"#).unwrap();
+}
 
 pub fn generate_epubs(
     articles: Vec<Extractor>,
@@ -82,16 +82,17 @@ pub fn generate_epubs(
                 .enumerate()
                 .fold(&mut epub, |epub, (idx, article)| {
                     let mut article_result = || -> Result<(), PaperoniError> {
-                        let mut xhtml_buf = Vec::new();
-                        extractor::serialize_to_xhtml(article.article(), &mut xhtml_buf)?;
-                        let xhtml_str = std::str::from_utf8(&xhtml_buf)?;
-                        let section_name = article.metadata().title();
                         let content_url = format!("article_{}.xhtml", idx);
-                        let mut content = EpubContent::new(&content_url, xhtml_str.as_bytes())
-                            .title(replace_escaped_characters(section_name));
+                        let mut xhtml_buf = Vec::new();
                         let header_level_tocs =
                             get_header_level_toc_vec(&content_url, article.article());
+                        serialize_to_xhtml(article.article(), &mut xhtml_buf)?;
+                        let xhtml_str = std::str::from_utf8(&xhtml_buf)?;
+                        let section_name = article.metadata().title();
+                        let mut content = EpubContent::new(&content_url, xhtml_str.as_bytes())
+                            .title(replace_escaped_characters(section_name));
 
                         for toc_element in header_level_tocs {
                             content = content.child(toc_element);
                         }
@@ -172,11 +173,11 @@ pub fn generate_epubs(
             debug!("Creating {:?}", file_name);
             let mut out_file = File::create(&file_name).unwrap();
             let mut xhtml_buf = Vec::new();
-            extractor::serialize_to_xhtml(article.article(), &mut xhtml_buf)
-                .expect("Unable to serialize to xhtml");
-            let xhtml_str = std::str::from_utf8(&xhtml_buf).unwrap();
             let header_level_tocs =
                 get_header_level_toc_vec("index.xhtml", article.article());
+            serialize_to_xhtml(article.article(), &mut xhtml_buf)
+                .expect("Unable to serialize to xhtml");
+            let xhtml_str = std::str::from_utf8(&xhtml_buf).unwrap();
 
             if let Some(author) = article.metadata().byline() {
                 epub.metadata("author", replace_escaped_characters(author))?;
@@ -398,6 +399,60 @@ fn get_header_level_toc_vec(content_url: &str, article: &NodeRef) -> Vec<TocElement> {
     headers_vec
 }
 
+/// Serializes a NodeRef to a string that is XHTML compatible
+/// The only DOM nodes serialized are Text and Element nodes
+fn serialize_to_xhtml<W: std::io::Write>(
+    node_ref: &NodeRef,
+    mut w: &mut W,
+) -> Result<(), PaperoniError> {
+    let mut escape_map = HashMap::new();
+    escape_map.insert("<", "&lt;");
+    escape_map.insert(">", "&gt;");
+    escape_map.insert("&", "&amp;");
+    escape_map.insert("\"", "&quot;");
+    escape_map.insert("'", "&apos;");
+
+    for edge in node_ref.traverse_inclusive() {
+        match edge {
+            kuchiki::iter::NodeEdge::Start(n) => match n.data() {
+                kuchiki::NodeData::Text(rc_text) => {
+                    let text = rc_text.borrow();
+                    let esc_text = ESC_SEQ_REGEX
+                        .replace_all(&text, |captures: &regex::Captures| escape_map[&captures[1]]);
+                    write!(&mut w, "{}", esc_text)?;
+                }
+                kuchiki::NodeData::Element(elem_data) => {
+                    let attrs = elem_data.attributes.borrow();
+                    let attrs_str = attrs
+                        .map
+                        .iter()
+                        .filter(|(k, _)| !k.local.contains("\""))
+                        .map(|(k, v)| {
+                            format!(
+                                "{}=\"{}\"",
+                                k.local,
+                                ESC_SEQ_REGEX
+                                    .replace_all(&v.value, |captures: &regex::Captures| {
+                                        escape_map[&captures[1]]
+                                    })
+                            )
+                        })
+                        .fold("".to_string(), |acc, val| acc + " " + &val);
+                    write!(&mut w, "<{}{}>", &elem_data.name.local, attrs_str)?;
+                }
+                _ => (),
+            },
+            kuchiki::iter::NodeEdge::End(n) => match n.data() {
+                kuchiki::NodeData::Element(elem_data) => {
+                    write!(&mut w, "</{}>", &elem_data.name.local)?;
+                }
+                _ => (),
+            },
+        }
+    }
+    Ok(())
+}
+
 #[cfg(test)]
 mod test {
     use kuchiki::traits::*;
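
For reference, the escaping step in the serialize_to_xhtml function added above can be exercised on its own. This is a self-contained sketch with the same regex and escape map as the diff (it assumes only the regex crate; the input string is invented for illustration):

// requires the `regex` crate
use std::collections::HashMap;

fn main() {
    // Same pattern and map as ESC_SEQ_REGEX / escape_map above.
    let esc_seq = regex::Regex::new(r#"(&|<|>|'|")"#).unwrap();
    let mut escape_map = HashMap::new();
    escape_map.insert("&", "&amp;");
    escape_map.insert("<", "&lt;");
    escape_map.insert(">", "&gt;");
    escape_map.insert("\"", "&quot;");
    escape_map.insert("'", "&apos;");

    // A closure Replacer looks up each captured character in the map.
    let raw = r#"Tom & Jerry <say> "hi""#;
    let escaped = esc_seq.replace_all(raw, |caps: &regex::Captures| escape_map[&caps[1]]);
    assert_eq!(escaped, "Tom &amp; Jerry &lt;say&gt; &quot;hi&quot;");
}

Each of the five reserved characters in text or attribute values is rewritten to its XHTML entity; everything else passes through untouched.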

src/extractor.rs

@@ -1,5 +1,3 @@
-use std::collections::HashMap;
-
 use itertools::Itertools;
 use kuchiki::{traits::*, NodeRef};
@@ -8,10 +6,6 @@ use crate::moz_readability::{MetaData, Readability};
 pub type ResourceInfo = (String, Option<String>);
 
-lazy_static! {
-    static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r#"(&|<|>|'|")"#).unwrap();
-}
-
 pub struct Extractor {
     article: Option<NodeRef>,
     pub img_urls: Vec<ResourceInfo>,
@@ -83,59 +77,6 @@ impl Extractor {
     }
 }
 
-/// Serializes a NodeRef to a string that is XHTML compatible
-/// The only DOM nodes serialized are Text and Element nodes
-pub fn serialize_to_xhtml<W: std::io::Write>(
-    node_ref: &NodeRef,
-    mut w: &mut W,
-) -> Result<(), PaperoniError> {
-    let mut escape_map = HashMap::new();
-    escape_map.insert("<", "&lt;");
-    escape_map.insert(">", "&gt;");
-    escape_map.insert("&", "&amp;");
-    escape_map.insert("\"", "&quot;");
-    escape_map.insert("'", "&apos;");
-
-    for edge in node_ref.traverse_inclusive() {
-        match edge {
-            kuchiki::iter::NodeEdge::Start(n) => match n.data() {
-                kuchiki::NodeData::Text(rc_text) => {
-                    let text = rc_text.borrow();
-                    let esc_text = ESC_SEQ_REGEX
-                        .replace_all(&text, |captures: &regex::Captures| escape_map[&captures[1]]);
-                    write!(&mut w, "{}", esc_text)?;
-                }
-                kuchiki::NodeData::Element(elem_data) => {
-                    let attrs = elem_data.attributes.borrow();
-                    let attrs_str = attrs
-                        .map
-                        .iter()
-                        .filter(|(k, _)| !k.local.contains("\""))
-                        .map(|(k, v)| {
-                            format!(
-                                "{}=\"{}\"",
-                                k.local,
-                                ESC_SEQ_REGEX
-                                    .replace_all(&v.value, |captures: &regex::Captures| {
-                                        escape_map[&captures[1]]
-                                    })
-                            )
-                        })
-                        .fold("".to_string(), |acc, val| acc + " " + &val);
-                    write!(&mut w, "<{}{}>", &elem_data.name.local, attrs_str)?;
-                }
-                _ => (),
-            },
-            kuchiki::iter::NodeEdge::End(n) => match n.data() {
-                kuchiki::NodeData::Element(elem_data) => {
-                    write!(&mut w, "</{}>", &elem_data.name.local)?;
-                }
-                _ => (),
-            },
-        }
-    }
-    Ok(())
-}
-
 #[cfg(test)]
 mod test {
     use super::*;