From 5f99bddc102fcff3d5bdd7fbf8aa9d9b3e30898a Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Tue, 24 Nov 2020 14:54:23 +0300 Subject: [PATCH] Add custom serializer for XHTML --- src/extractor.rs | 56 ++++++++++++++++++++++++++++++++++ src/main.rs | 9 ++---- src/moz_readability/regexes.rs | 4 --- 3 files changed, 58 insertions(+), 11 deletions(-) diff --git a/src/extractor.rs b/src/extractor.rs index 1c10dcc..7d04bdd 100644 --- a/src/extractor.rs +++ b/src/extractor.rs @@ -1,3 +1,5 @@ +use std::collections::HashMap; + use async_std::fs::File; use async_std::io::prelude::*; use async_std::task; @@ -8,6 +10,10 @@ use crate::moz_readability::{MetaData, Readability}; pub type ResourceInfo = (String, Option<String>); +lazy_static! { + static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r"(&|<|>)").unwrap(); +} + pub struct Extractor { article: Option<NodeRef>, pub img_urls: Vec<ResourceInfo>, @@ -165,6 +171,56 @@ fn get_absolute_url(url: &str, request_url: &Url) -> String { } } +/// Serializes a NodeRef to a string that is XHTML compatible +/// The only DOM nodes serialized are Text and Element nodes +pub fn serialize_to_xhtml<W: std::io::Write>( + node_ref: &NodeRef, + mut w: &mut W, +) -> Result<(), Box<dyn std::error::Error>> { + let mut escape_map = HashMap::new(); + escape_map.insert("<", "&lt;"); + escape_map.insert(">", "&gt;"); + escape_map.insert("&", "&amp;"); + for edge in node_ref.traverse_inclusive() { + match edge { + kuchiki::iter::NodeEdge::Start(n) => match n.data() { + kuchiki::NodeData::Text(rc_text) => { + let text = rc_text.borrow(); + let esc_text = ESC_SEQ_REGEX + .replace_all(&text, |captures: &regex::Captures| escape_map[&captures[1]]); + write!(&mut w, "{}", esc_text)?; + } + kuchiki::NodeData::Element(elem_data) => { + let attrs = elem_data.attributes.borrow(); + let attrs_str = attrs + .map + .iter() + .map(|(k, v)| { + format!( + "{}=\"{}\"", + k.local, + ESC_SEQ_REGEX + .replace_all(&v.value, |captures: &regex::Captures| { + escape_map[&captures[1]] + }) + ) + }) + .fold("".to_string(), |acc, val| acc + " " + &val); 
+ write!(&mut w, "<{}{}>", &elem_data.name.local, attrs_str)?; + } + _ => (), + }, + kuchiki::iter::NodeEdge::End(n) => match n.data() { + kuchiki::NodeData::Element(elem_data) => { + write!(&mut w, "</{}>", &elem_data.name.local)?; + } + _ => (), + }, + } + } + Ok(()) +} + #[cfg(test)] mod test { use super::*; diff --git a/src/main.rs b/src/main.rs index ae81f8d..e61bef0 100644 --- a/src/main.rs +++ b/src/main.rs @@ -56,14 +56,9 @@ fn download(urls: Vec<String>) { let file_name = format!("{}.epub", extractor.metadata().title()); let mut out_file = File::create(&file_name).unwrap(); let mut html_buf = Vec::new(); - extractor - .article() - .unwrap() - .serialize(&mut html_buf) - .expect("Unable to serialize"); + extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf) + .expect("Unable to serialize to xhtml"); let html_buf = std::str::from_utf8(&html_buf).unwrap(); - let html_buf = moz_readability::regexes::REPLACE_SELF_CLOSING_REGEX - .replace_all(html_buf, "$tag/>"); let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap(); if let Some(author) = extractor.metadata().byline() { epub.metadata("author", author.replace("&", "&amp;")) diff --git a/src/moz_readability/regexes.rs b/src/moz_readability/regexes.rs index c4aa0d9..01b7e9e 100644 --- a/src/moz_readability/regexes.rs +++ b/src/moz_readability/regexes.rs @@ -132,8 +132,4 @@ lazy_static! { pub static ref REPLACE_END_SEPARATOR_REGEX: Regex = Regex::new(r"(?i)[^\|\-\\/>»]*[\|\-\\/>»](?P<end>.*)").unwrap(); pub static ref REPLACE_MULTI_SEPARATOR_REGEX: Regex = Regex::new(r"[\|\-\\/>»]+").unwrap(); - pub static ref REPLACE_SELF_CLOSING_REGEX: Regex = Regex::new( - r#"(?P<tag><(?:area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr)(?: [a-z\-]+=["'][\sa-zA-Z0-9\./\-_#]+["']|[a-z\-]+)*)>"# - ) - .unwrap(); }