From a1156e10fc18b78cdc83afdbc7aff34e973dcf0f Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Sun, 6 Jun 2021 13:01:57 +0300 Subject: [PATCH] Add `generate_header_ids` function Add h4 to header level ToC and update implementation Add tests --- src/epub.rs | 200 ++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 179 insertions(+), 21 deletions(-) diff --git a/src/epub.rs b/src/epub.rs index 6c2d5f3..9797342 100644 --- a/src/epub.rs +++ b/src/epub.rs @@ -254,55 +254,80 @@ fn generate_appendix(articles: Vec<&Extractor>) -> String { template } +/// Adds an id attribute to header elements and assigns a value based on +/// the hash of the text content. Headers with id attributes are not modified. +/// The headers here are known to have text because the grabbed article from +/// readability removes headers with no text. +fn generate_header_ids(root_node: &NodeRef) { + let headers = root_node + .select("h1, h2, h3, h4") + .expect("Unable to create selector for headings"); + let headers_no_id = headers.filter(|node_data_ref| { + let attrs = node_data_ref.attributes.borrow(); + !attrs.contains("id") + }); + for header in headers_no_id { + let mut attrs = header.attributes.borrow_mut(); + let text = header.text_contents(); + // The value of the id begins with an underscore because the hexadecimal + // digest might start with a number which would make it an invalid id + // when querying with selectors + let value = format!("_{:x}", md5::compute(text)); + attrs.insert("id", value); + } +} + /// Returns a vector of `TocElement` from a NodeRef used for adding to the Table of Contents for navigation fn get_header_level_toc_vec(content_url: &str, article: &NodeRef) -> Vec { - // TODO: Test this + generate_header_ids(article); + let mut headers_vec = Vec::new(); let mut header_levels = HashMap::new(); header_levels.insert("h1", 1); header_levels.insert("h2", 2); header_levels.insert("h3", 3); + header_levels.insert("h4", 4); let headings = article - .select("h1, h2, h3") + .select("h1, h2, h3, h4") .expect("Unable to create selector for headings"); - let mut prev_toc: Option = None; + let mut last_toc_elem_level: Option = None; for heading in headings { // TODO: Create a new function that adds an id attribute to heading tags before this function is called let elem_attrs = heading.attributes.borrow(); let elem_name: &str = &heading.name.local; - let id = elem_attrs - .get("id") - .map(|val| val.to_string()) - .unwrap_or(heading.text_contents().replace(" ", "-")); - let toc = TocElement::new(format!("{}#{}", content_url, id), heading.text_contents()) - .level(header_levels[elem_name]); - if let Some(prev_toc_element) = prev_toc { - if prev_toc_element.level <= toc.level { - headers_vec.push(prev_toc_element); - prev_toc = Some(toc); + let elem_level = header_levels[elem_name]; + let id = elem_attrs.get("id").map(|val| val.to_string()).unwrap(); + let toc = TocElement::new( + format!("{}#{}", content_url, id), + replace_escaped_characters(&heading.text_contents()), + ); + + if let Some(last_elem_level) = last_toc_elem_level { + if elem_level <= last_elem_level { + last_toc_elem_level = Some(elem_level); + headers_vec.push(toc); } else { - prev_toc = Some(prev_toc_element.child(toc)) + match headers_vec.last_mut() { + Some(toc_elem) => *toc_elem = toc_elem.clone().child(toc), + _ => unreachable!(), + } } } else { - prev_toc = Some(toc); + last_toc_elem_level = Some(elem_level); + headers_vec.push(toc); } } - - if let Some(toc_element) = prev_toc { - headers_vec.push(toc_element); - } - headers_vec } #[cfg(test)] mod test { use kuchiki::traits::*; - use super::{get_header_level_toc_vec, replace_escaped_characters}; + use super::{generate_header_ids, get_header_level_toc_vec, replace_escaped_characters}; #[test] fn test_replace_escaped_characters() { @@ -319,4 +344,137 @@ mod test { "Author Name <author@mail.example>" ); } + + #[test] + fn test_generate_header_ids() { + let html_str = r#" + + + +

Heading 1

+

Heading 2

+

Heading 2 again

+

Heading 4

+

Heading 1 again

+

Heading 3

+ + + "#; + let doc = kuchiki::parse_html().one(html_str); + generate_header_ids(&doc); + + let mut headers = doc.select("h1, h2, h3, h4").unwrap(); + let all_headers_have_ids = headers.all(|node_data_ref| { + let attrs = node_data_ref.attributes.borrow(); + if let Some(id) = attrs.get("id") { + !id.trim().is_empty() + } else { + false + } + }); + assert_eq!(true, all_headers_have_ids); + + let selector = format!("h1#_{:x}", md5::compute("Heading 1")); + assert_eq!(true, doc.select_first(&selector).is_ok()); + + let selector = format!("h1#_{:x}", md5::compute("Heading 1 again")); + assert_eq!(true, doc.select_first(&selector).is_ok()); + + let selector = "h2#heading-2-again"; + assert_eq!(true, doc.select_first(selector).is_ok()); + } + + #[test] + fn test_get_header_level_toc_vec() { + // NOTE: Due to `TocElement` not implementing PartialEq, the tests here + // will need to be manually written to cover for this + let html_str = r#" + + + +

Lorem ipsum

+ + + "#; + let doc = kuchiki::parse_html().one(html_str); + + let toc_vec = get_header_level_toc_vec("index.xhtml", &doc); + assert_eq!(0, toc_vec.len()); + + let html_str = r#" + + + +

Heading 1

+

Lorem ipsum

+
+

Heading 2

+

Lorem ipsum

+

Lorem ipsum

+
+

Subheading 3

+

Lorem ipsum

+

Second Heading 1

+

Lorem ipsum

+ + + "#; + let doc = kuchiki::parse_html().one(html_str); + + let toc_vec = get_header_level_toc_vec("index.xhtml", &doc); + assert_eq!(2, toc_vec.len()); + + let first_h1_toc = toc_vec.first().unwrap(); + assert_eq!("Heading 1", first_h1_toc.title); + assert_eq!(1, first_h1_toc.children.len()); + + let h2_toc = first_h1_toc.children.first().unwrap(); + assert_eq!("Heading 2", h2_toc.title); + assert_eq!(1, h2_toc.children.len()); + + let h3_toc = h2_toc.children.first().unwrap(); + assert_eq!("Subheading 3", h3_toc.title); + assert_eq!(0, h3_toc.children.len()); + + let last_h1_toc = toc_vec.last().unwrap(); + assert_eq!("Second Heading 1", last_h1_toc.title); + assert_eq!(0, last_h1_toc.children.len()); + + let html_str = r#" + + + +

Heading 1

+

Lorem ipsum

+
+

Heading 2

+

Lorem ipsum

+

Lorem ipsum

+

Subheading 3

+

Lorem ipsum

+
+

Heading 2

+

Lorem ipsum

+

Subheading 4

+

Conclusion

+ + + "#; + let doc = kuchiki::parse_html().one(html_str); + + let toc_vec = get_header_level_toc_vec("index.xhtml", &doc); + assert_eq!(1, toc_vec.len()); + + let h1_toc = toc_vec.first().unwrap(); + assert_eq!("Heading 1", h1_toc.title); + assert_eq!(3, h1_toc.children.len()); + + let first_h2_toc = h1_toc.children.first().unwrap(); + assert_eq!("Heading 2", first_h2_toc.title); + assert_eq!(1, first_h2_toc.children.len()); + + let h3_toc = first_h2_toc.children.first().unwrap(); + assert_eq!("Subheading 3", h3_toc.title); + assert_eq!(0, h3_toc.children.len()); + } }