Add generate_header_ids function
Add h4 to the header-level ToC and update the implementation
Add tests
This commit is contained in:
parent
8220cf29f7
commit
a1156e10fc
1 changed files with 179 additions and 21 deletions
200
src/epub.rs
200
src/epub.rs
|
@ -254,55 +254,80 @@ fn generate_appendix(articles: Vec<&Extractor>) -> String {
|
||||||
template
|
template
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Adds an id attribute to header elements and assigns a value based on
|
||||||
|
/// the hash of the text content. Headers with id attributes are not modified.
|
||||||
|
/// The headers here are known to have text because the grabbed article from
|
||||||
|
/// readability removes headers with no text.
|
||||||
|
fn generate_header_ids(root_node: &NodeRef) {
|
||||||
|
let headers = root_node
|
||||||
|
.select("h1, h2, h3, h4")
|
||||||
|
.expect("Unable to create selector for headings");
|
||||||
|
let headers_no_id = headers.filter(|node_data_ref| {
|
||||||
|
let attrs = node_data_ref.attributes.borrow();
|
||||||
|
!attrs.contains("id")
|
||||||
|
});
|
||||||
|
for header in headers_no_id {
|
||||||
|
let mut attrs = header.attributes.borrow_mut();
|
||||||
|
let text = header.text_contents();
|
||||||
|
// The value of the id begins with an underscore because the hexadecimal
|
||||||
|
// digest might start with a number which would make it an invalid id
|
||||||
|
// when querying with selectors
|
||||||
|
let value = format!("_{:x}", md5::compute(text));
|
||||||
|
attrs.insert("id", value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Returns a vector of `TocElement` from a NodeRef used for adding to the Table of Contents for navigation
|
/// Returns a vector of `TocElement` from a NodeRef used for adding to the Table of Contents for navigation
|
||||||
fn get_header_level_toc_vec(content_url: &str, article: &NodeRef) -> Vec<TocElement> {
|
fn get_header_level_toc_vec(content_url: &str, article: &NodeRef) -> Vec<TocElement> {
|
||||||
// TODO: Test this
|
generate_header_ids(article);
|
||||||
|
|
||||||
let mut headers_vec = Vec::new();
|
let mut headers_vec = Vec::new();
|
||||||
|
|
||||||
let mut header_levels = HashMap::new();
|
let mut header_levels = HashMap::new();
|
||||||
header_levels.insert("h1", 1);
|
header_levels.insert("h1", 1);
|
||||||
header_levels.insert("h2", 2);
|
header_levels.insert("h2", 2);
|
||||||
header_levels.insert("h3", 3);
|
header_levels.insert("h3", 3);
|
||||||
|
header_levels.insert("h4", 4);
|
||||||
|
|
||||||
let headings = article
|
let headings = article
|
||||||
.select("h1, h2, h3")
|
.select("h1, h2, h3, h4")
|
||||||
.expect("Unable to create selector for headings");
|
.expect("Unable to create selector for headings");
|
||||||
|
|
||||||
let mut prev_toc: Option<TocElement> = None;
|
let mut last_toc_elem_level: Option<i32> = None;
|
||||||
|
|
||||||
for heading in headings {
|
for heading in headings {
|
||||||
// TODO: Create a new function that adds an id attribute to heading tags before this function is called
|
// TODO: Create a new function that adds an id attribute to heading tags before this function is called
|
||||||
let elem_attrs = heading.attributes.borrow();
|
let elem_attrs = heading.attributes.borrow();
|
||||||
let elem_name: &str = &heading.name.local;
|
let elem_name: &str = &heading.name.local;
|
||||||
let id = elem_attrs
|
let elem_level = header_levels[elem_name];
|
||||||
.get("id")
|
let id = elem_attrs.get("id").map(|val| val.to_string()).unwrap();
|
||||||
.map(|val| val.to_string())
|
let toc = TocElement::new(
|
||||||
.unwrap_or(heading.text_contents().replace(" ", "-"));
|
format!("{}#{}", content_url, id),
|
||||||
let toc = TocElement::new(format!("{}#{}", content_url, id), heading.text_contents())
|
replace_escaped_characters(&heading.text_contents()),
|
||||||
.level(header_levels[elem_name]);
|
);
|
||||||
if let Some(prev_toc_element) = prev_toc {
|
|
||||||
if prev_toc_element.level <= toc.level {
|
if let Some(last_elem_level) = last_toc_elem_level {
|
||||||
headers_vec.push(prev_toc_element);
|
if elem_level <= last_elem_level {
|
||||||
prev_toc = Some(toc);
|
last_toc_elem_level = Some(elem_level);
|
||||||
|
headers_vec.push(toc);
|
||||||
} else {
|
} else {
|
||||||
prev_toc = Some(prev_toc_element.child(toc))
|
match headers_vec.last_mut() {
|
||||||
|
Some(toc_elem) => *toc_elem = toc_elem.clone().child(toc),
|
||||||
|
_ => unreachable!(),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
prev_toc = Some(toc);
|
last_toc_elem_level = Some(elem_level);
|
||||||
|
headers_vec.push(toc);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(toc_element) = prev_toc {
|
|
||||||
headers_vec.push(toc_element);
|
|
||||||
}
|
|
||||||
|
|
||||||
headers_vec
|
headers_vec
|
||||||
}
|
}
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod test {
|
mod test {
|
||||||
use kuchiki::traits::*;
|
use kuchiki::traits::*;
|
||||||
|
|
||||||
use super::{get_header_level_toc_vec, replace_escaped_characters};
|
use super::{generate_header_ids, get_header_level_toc_vec, replace_escaped_characters};
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_replace_escaped_characters() {
|
fn test_replace_escaped_characters() {
|
||||||
|
@ -319,4 +344,137 @@ mod test {
|
||||||
"Author Name <author@mail.example>"
|
"Author Name <author@mail.example>"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn test_generate_header_ids() {
    // Mix of headings with and without ids, at every supported level.
    let html_str = r#"
        <!DOCTYPE html>
        <html>
            <body>
                <h1>Heading 1</h1>
                <h2 id="heading-2">Heading 2</h2>
                <h2 id="heading-2-again">Heading 2 again</h2>
                <h4>Heading 4</h4>
                <h1>Heading 1 again</h1>
                <h3 class="heading">Heading 3</h3>
            </body>
        </html>
    "#;
    let doc = kuchiki::parse_html().one(html_str);
    generate_header_ids(&doc);

    // Every heading must end up with a non-blank id.
    let all_headers_have_ids = doc
        .select("h1, h2, h3, h4")
        .unwrap()
        .all(|node_data_ref| {
            node_data_ref
                .attributes
                .borrow()
                .get("id")
                .map_or(false, |id| !id.trim().is_empty())
        });
    assert!(all_headers_have_ids);

    // Headings without an id get `_` followed by the md5 digest of their text.
    let generated = [
        ("h1", "Heading 1"),
        ("h4", "Heading 4"),
        ("h1", "Heading 1 again"),
        ("h3", "Heading 3"),
    ];
    for &(tag, text) in generated.iter() {
        let selector = format!("{}#_{:x}", tag, md5::compute(text));
        assert!(
            doc.select_first(&selector).is_ok(),
            "no element matches {}",
            selector
        );
    }

    // Headings that already had an id keep it unchanged.
    for &selector in ["h2#heading-2", "h2#heading-2-again"].iter() {
        assert!(
            doc.select_first(selector).is_ok(),
            "no element matches {}",
            selector
        );
    }
}
|
||||||
|
|
||||||
|
#[test]
fn test_get_header_level_toc_vec() {
    // NOTE: Due to `TocElement` not implementing PartialEq, the tests here
    // will need to be manually written to cover for this

    // A document without headings yields an empty table of contents.
    let html_str = r#"
        <!DOCTYPE html>
        <html>
            <body>
                <p>Lorem ipsum</p>
            </body>
        </html>
    "#;
    let doc = kuchiki::parse_html().one(html_str);

    let toc_vec = get_header_level_toc_vec("index.xhtml", &doc);
    assert_eq!(0, toc_vec.len());

    // h1 > h2 > h3 chain followed by a second h1.
    let html_str = r#"
        <!DOCTYPE html>
        <html>
            <body>
                <h1 id="heading-1">Heading 1</h1>
                <p>Lorem ipsum</p>
                <div>
                    <h2 id="heading-2">Heading 2</h2>
                    <p>Lorem ipsum</p>
                    <p>Lorem ipsum</p>
                </div>
                <h3 id="subheading-3">Subheading 3</h3>
                <p>Lorem ipsum</p>
                <h1 id="second-heading-1">Second Heading 1</h1>
                <p>Lorem ipsum</p>
            </body>
        </html>
    "#;
    let doc = kuchiki::parse_html().one(html_str);

    let toc_vec = get_header_level_toc_vec("index.xhtml", &doc);
    assert_eq!(2, toc_vec.len());

    let first_h1_toc = toc_vec.first().unwrap();
    assert_eq!("Heading 1", first_h1_toc.title);
    assert_eq!(1, first_h1_toc.children.len());

    let h2_toc = first_h1_toc.children.first().unwrap();
    assert_eq!("Heading 2", h2_toc.title);
    assert_eq!(1, h2_toc.children.len());

    let h3_toc = h2_toc.children.first().unwrap();
    assert_eq!("Subheading 3", h3_toc.title);
    assert_eq!(0, h3_toc.children.len());

    let last_h1_toc = toc_vec.last().unwrap();
    assert_eq!("Second Heading 1", last_h1_toc.title);
    assert_eq!(0, last_h1_toc.children.len());

    // Single h1 with three h2 children; an h4 directly below an h2 (skipping
    // h3) still nests under that h2.
    let html_str = r#"
        <!DOCTYPE html>
        <html>
            <body>
                <h1 id="heading-1">Heading 1</h1>
                <p>Lorem ipsum</p>
                <div>
                    <h2 id="heading-2">Heading 2</h2>
                    <p>Lorem ipsum</p>
                    <p>Lorem ipsum</p>
                    <h3 id="subheading-3">Subheading 3</h3>
                    <p>Lorem ipsum</p>
                </div>
                <h2 id="heading-2-again">Heading 2</h2>
                <p>Lorem ipsum</p>
                <h4 id="subheading-4">Subheading 4</h4>
                <h2 id="conclusion">Conclusion</h2>
            </body>
        </html>
    "#;
    let doc = kuchiki::parse_html().one(html_str);

    let toc_vec = get_header_level_toc_vec("index.xhtml", &doc);
    assert_eq!(1, toc_vec.len());

    let h1_toc = toc_vec.first().unwrap();
    assert_eq!("Heading 1", h1_toc.title);
    assert_eq!(3, h1_toc.children.len());

    let first_h2_toc = h1_toc.children.first().unwrap();
    assert_eq!("Heading 2", first_h2_toc.title);
    assert_eq!(1, first_h2_toc.children.len());

    let h3_toc = first_h2_toc.children.first().unwrap();
    assert_eq!("Subheading 3", h3_toc.title);
    assert_eq!(0, h3_toc.children.len());

    let second_h2_toc = &h1_toc.children[1];
    assert_eq!("Heading 2", second_h2_toc.title);
    assert_eq!(1, second_h2_toc.children.len());

    let h4_toc = second_h2_toc.children.first().unwrap();
    assert_eq!("Subheading 4", h4_toc.title);
    assert_eq!(0, h4_toc.children.len());

    let conclusion_toc = h1_toc.children.last().unwrap();
    assert_eq!("Conclusion", conclusion_toc.title);
    assert_eq!(0, conclusion_toc.children.len());
}
|
||||||
}
|
}
|
||||||
|
|
Reference in a new issue