fix: fix broken links in ToC generation
The fix ensures that the ToC is generated before serialization, because ToC generation mutates the document and the links will not work otherwise.
chore: add .vscode config to .gitignore
This commit is contained in:
parent
282d229754
commit
c6c10689eb
3 changed files with 71 additions and 74 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -1,3 +1,4 @@
|
||||||
/target
|
/target
|
||||||
*.epub
|
*.epub
|
||||||
*.log
|
*.log
|
||||||
|
.vscode/
|
83
src/epub.rs
83
src/epub.rs
|
@ -8,11 +8,11 @@ use indicatif::{ProgressBar, ProgressStyle};
|
||||||
use kuchiki::NodeRef;
|
use kuchiki::NodeRef;
|
||||||
use log::{debug, error, info};
|
use log::{debug, error, info};
|
||||||
|
|
||||||
use crate::{
|
use crate::{cli::AppConfig, errors::PaperoniError, extractor::Extractor};
|
||||||
cli::AppConfig,
|
|
||||||
errors::PaperoniError,
|
lazy_static! {
|
||||||
extractor::{self, Extractor},
|
static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r#"(&|<|>|'|")"#).unwrap();
|
||||||
};
|
}
|
||||||
|
|
||||||
pub fn generate_epubs(
|
pub fn generate_epubs(
|
||||||
articles: Vec<Extractor>,
|
articles: Vec<Extractor>,
|
||||||
|
@ -82,16 +82,17 @@ pub fn generate_epubs(
|
||||||
.enumerate()
|
.enumerate()
|
||||||
.fold(&mut epub, |epub, (idx, article)| {
|
.fold(&mut epub, |epub, (idx, article)| {
|
||||||
let mut article_result = || -> Result<(), PaperoniError> {
|
let mut article_result = || -> Result<(), PaperoniError> {
|
||||||
let mut xhtml_buf = Vec::new();
|
|
||||||
extractor::serialize_to_xhtml(article.article(), &mut xhtml_buf)?;
|
|
||||||
let xhtml_str = std::str::from_utf8(&xhtml_buf)?;
|
|
||||||
let section_name = article.metadata().title();
|
|
||||||
let content_url = format!("article_{}.xhtml", idx);
|
let content_url = format!("article_{}.xhtml", idx);
|
||||||
let mut content = EpubContent::new(&content_url, xhtml_str.as_bytes())
|
let mut xhtml_buf = Vec::new();
|
||||||
.title(replace_escaped_characters(section_name));
|
|
||||||
let header_level_tocs =
|
let header_level_tocs =
|
||||||
get_header_level_toc_vec(&content_url, article.article());
|
get_header_level_toc_vec(&content_url, article.article());
|
||||||
|
|
||||||
|
serialize_to_xhtml(article.article(), &mut xhtml_buf)?;
|
||||||
|
let xhtml_str = std::str::from_utf8(&xhtml_buf)?;
|
||||||
|
let section_name = article.metadata().title();
|
||||||
|
let mut content = EpubContent::new(&content_url, xhtml_str.as_bytes())
|
||||||
|
.title(replace_escaped_characters(section_name));
|
||||||
|
|
||||||
for toc_element in header_level_tocs {
|
for toc_element in header_level_tocs {
|
||||||
content = content.child(toc_element);
|
content = content.child(toc_element);
|
||||||
}
|
}
|
||||||
|
@ -172,11 +173,11 @@ pub fn generate_epubs(
|
||||||
debug!("Creating {:?}", file_name);
|
debug!("Creating {:?}", file_name);
|
||||||
let mut out_file = File::create(&file_name).unwrap();
|
let mut out_file = File::create(&file_name).unwrap();
|
||||||
let mut xhtml_buf = Vec::new();
|
let mut xhtml_buf = Vec::new();
|
||||||
extractor::serialize_to_xhtml(article.article(), &mut xhtml_buf)
|
|
||||||
.expect("Unable to serialize to xhtml");
|
|
||||||
let xhtml_str = std::str::from_utf8(&xhtml_buf).unwrap();
|
|
||||||
let header_level_tocs =
|
let header_level_tocs =
|
||||||
get_header_level_toc_vec("index.xhtml", article.article());
|
get_header_level_toc_vec("index.xhtml", article.article());
|
||||||
|
serialize_to_xhtml(article.article(), &mut xhtml_buf)
|
||||||
|
.expect("Unable to serialize to xhtml");
|
||||||
|
let xhtml_str = std::str::from_utf8(&xhtml_buf).unwrap();
|
||||||
|
|
||||||
if let Some(author) = article.metadata().byline() {
|
if let Some(author) = article.metadata().byline() {
|
||||||
epub.metadata("author", replace_escaped_characters(author))?;
|
epub.metadata("author", replace_escaped_characters(author))?;
|
||||||
|
@ -398,6 +399,60 @@ fn get_header_level_toc_vec(content_url: &str, article: &NodeRef) -> Vec<TocElem
|
||||||
|
|
||||||
headers_vec
|
headers_vec
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Serializes a NodeRef to a string that is XHTML compatible
|
||||||
|
/// The only DOM nodes serialized are Text and Element nodes
|
||||||
|
fn serialize_to_xhtml<W: std::io::Write>(
|
||||||
|
node_ref: &NodeRef,
|
||||||
|
mut w: &mut W,
|
||||||
|
) -> Result<(), PaperoniError> {
|
||||||
|
let mut escape_map = HashMap::new();
|
||||||
|
escape_map.insert("<", "&lt;");
|
||||||
|
escape_map.insert(">", "&gt;");
|
||||||
|
escape_map.insert("&", "&amp;");
|
||||||
|
escape_map.insert("\"", "&quot;");
|
||||||
|
escape_map.insert("'", "&apos;");
|
||||||
|
for edge in node_ref.traverse_inclusive() {
|
||||||
|
match edge {
|
||||||
|
kuchiki::iter::NodeEdge::Start(n) => match n.data() {
|
||||||
|
kuchiki::NodeData::Text(rc_text) => {
|
||||||
|
let text = rc_text.borrow();
|
||||||
|
let esc_text = ESC_SEQ_REGEX
|
||||||
|
.replace_all(&text, |captures: &regex::Captures| escape_map[&captures[1]]);
|
||||||
|
write!(&mut w, "{}", esc_text)?;
|
||||||
|
}
|
||||||
|
kuchiki::NodeData::Element(elem_data) => {
|
||||||
|
let attrs = elem_data.attributes.borrow();
|
||||||
|
let attrs_str = attrs
|
||||||
|
.map
|
||||||
|
.iter()
|
||||||
|
.filter(|(k, _)| !k.local.contains("\""))
|
||||||
|
.map(|(k, v)| {
|
||||||
|
format!(
|
||||||
|
"{}=\"{}\"",
|
||||||
|
k.local,
|
||||||
|
ESC_SEQ_REGEX
|
||||||
|
.replace_all(&v.value, |captures: &regex::Captures| {
|
||||||
|
escape_map[&captures[1]]
|
||||||
|
})
|
||||||
|
)
|
||||||
|
})
|
||||||
|
.fold("".to_string(), |acc, val| acc + " " + &val);
|
||||||
|
write!(&mut w, "<{}{}>", &elem_data.name.local, attrs_str)?;
|
||||||
|
}
|
||||||
|
_ => (),
|
||||||
|
},
|
||||||
|
kuchiki::iter::NodeEdge::End(n) => match n.data() {
|
||||||
|
kuchiki::NodeData::Element(elem_data) => {
|
||||||
|
write!(&mut w, "</{}>", &elem_data.name.local)?;
|
||||||
|
}
|
||||||
|
_ => (),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod test {
|
mod test {
|
||||||
use kuchiki::traits::*;
|
use kuchiki::traits::*;
|
||||||
|
|
|
@ -1,5 +1,3 @@
|
||||||
use std::collections::HashMap;
|
|
||||||
|
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
use kuchiki::{traits::*, NodeRef};
|
use kuchiki::{traits::*, NodeRef};
|
||||||
|
|
||||||
|
@ -8,10 +6,6 @@ use crate::moz_readability::{MetaData, Readability};
|
||||||
|
|
||||||
pub type ResourceInfo = (String, Option<String>);
|
pub type ResourceInfo = (String, Option<String>);
|
||||||
|
|
||||||
lazy_static! {
|
|
||||||
static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r#"(&|<|>|'|")"#).unwrap();
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct Extractor {
|
pub struct Extractor {
|
||||||
article: Option<NodeRef>,
|
article: Option<NodeRef>,
|
||||||
pub img_urls: Vec<ResourceInfo>,
|
pub img_urls: Vec<ResourceInfo>,
|
||||||
|
@ -83,59 +77,6 @@ impl Extractor {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Serializes a NodeRef to a string that is XHTML compatible
|
|
||||||
/// The only DOM nodes serialized are Text and Element nodes
|
|
||||||
pub fn serialize_to_xhtml<W: std::io::Write>(
|
|
||||||
node_ref: &NodeRef,
|
|
||||||
mut w: &mut W,
|
|
||||||
) -> Result<(), PaperoniError> {
|
|
||||||
let mut escape_map = HashMap::new();
|
|
||||||
escape_map.insert("<", "&lt;");
|
|
||||||
escape_map.insert(">", "&gt;");
|
|
||||||
escape_map.insert("&", "&amp;");
|
|
||||||
escape_map.insert("\"", "&quot;");
|
|
||||||
escape_map.insert("'", "&apos;");
|
|
||||||
for edge in node_ref.traverse_inclusive() {
|
|
||||||
match edge {
|
|
||||||
kuchiki::iter::NodeEdge::Start(n) => match n.data() {
|
|
||||||
kuchiki::NodeData::Text(rc_text) => {
|
|
||||||
let text = rc_text.borrow();
|
|
||||||
let esc_text = ESC_SEQ_REGEX
|
|
||||||
.replace_all(&text, |captures: &regex::Captures| escape_map[&captures[1]]);
|
|
||||||
write!(&mut w, "{}", esc_text)?;
|
|
||||||
}
|
|
||||||
kuchiki::NodeData::Element(elem_data) => {
|
|
||||||
let attrs = elem_data.attributes.borrow();
|
|
||||||
let attrs_str = attrs
|
|
||||||
.map
|
|
||||||
.iter()
|
|
||||||
.filter(|(k, _)| !k.local.contains("\""))
|
|
||||||
.map(|(k, v)| {
|
|
||||||
format!(
|
|
||||||
"{}=\"{}\"",
|
|
||||||
k.local,
|
|
||||||
ESC_SEQ_REGEX
|
|
||||||
.replace_all(&v.value, |captures: &regex::Captures| {
|
|
||||||
escape_map[&captures[1]]
|
|
||||||
})
|
|
||||||
)
|
|
||||||
})
|
|
||||||
.fold("".to_string(), |acc, val| acc + " " + &val);
|
|
||||||
write!(&mut w, "<{}{}>", &elem_data.name.local, attrs_str)?;
|
|
||||||
}
|
|
||||||
_ => (),
|
|
||||||
},
|
|
||||||
kuchiki::iter::NodeEdge::End(n) => match n.data() {
|
|
||||||
kuchiki::NodeData::Element(elem_data) => {
|
|
||||||
write!(&mut w, "</{}>", &elem_data.name.local)?;
|
|
||||||
}
|
|
||||||
_ => (),
|
|
||||||
},
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod test {
|
mod test {
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|
Reference in a new issue