From 5f99bddc102fcff3d5bdd7fbf8aa9d9b3e30898a Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Tue, 24 Nov 2020 14:54:23 +0300 Subject: [PATCH] Add custom serializer for XHTML --- src/extractor.rs | 56 ++++++++++++++++++++++++++++++++++ src/main.rs | 9 ++---- src/moz_readability/regexes.rs | 4 --- 3 files changed, 58 insertions(+), 11 deletions(-) diff --git a/src/extractor.rs b/src/extractor.rs index 1c10dcc..7d04bdd 100644 --- a/src/extractor.rs +++ b/src/extractor.rs @@ -1,3 +1,5 @@ +use std::collections::HashMap; + use async_std::fs::File; use async_std::io::prelude::*; use async_std::task; @@ -8,6 +10,10 @@ use crate::moz_readability::{MetaData, Readability}; pub type ResourceInfo = (String, Option<String>); +lazy_static! { + static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r"(&|<|>)").unwrap(); +} + pub struct Extractor { article: Option<NodeRef>, pub img_urls: Vec<ResourceInfo>, @@ -165,6 +171,56 @@ fn get_absolute_url(url: &str, request_url: &Url) -> String { } } +/// Serializes a NodeRef to a string that is XHTML compatible +/// The only DOM nodes serialized are Text and Element nodes +pub fn serialize_to_xhtml<W: std::io::Write>( + node_ref: &NodeRef, + mut w: &mut W, +) -> Result<(), Box<dyn std::error::Error>> { + let mut escape_map = HashMap::new(); + escape_map.insert("<", "&lt;"); + escape_map.insert(">", "&gt;"); + escape_map.insert("&", "&amp;"); + for edge in node_ref.traverse_inclusive() { + match edge { + kuchiki::iter::NodeEdge::Start(n) => match n.data() { + kuchiki::NodeData::Text(rc_text) => { + let text = rc_text.borrow(); + let esc_text = ESC_SEQ_REGEX + .replace_all(&text, |captures: &regex::Captures| escape_map[&captures[1]]); + write!(&mut w, "{}", esc_text)?; + } + kuchiki::NodeData::Element(elem_data) => { + let attrs = elem_data.attributes.borrow(); + let attrs_str = attrs + .map + .iter() + .map(|(k, v)| { + format!( + "{}=\"{}\"", + k.local, + ESC_SEQ_REGEX + .replace_all(&v.value, |captures: &regex::Captures| { + escape_map[&captures[1]] + }) + ) + }) + .fold("".to_string(), |acc, val| acc + " " + &val); 
+ write!(&mut w, "<{}{}>", &elem_data.name.local, attrs_str)?; + } + _ => (), + }, + kuchiki::iter::NodeEdge::End(n) => match n.data() { + kuchiki::NodeData::Element(elem_data) => { + write!(&mut w, "</{}>", &elem_data.name.local)?; + } + _ => (), + }, + } + } + Ok(()) +} + #[cfg(test)] mod test { use super::*; diff --git a/src/main.rs b/src/main.rs index ae81f8d..e61bef0 100644 --- a/src/main.rs +++ b/src/main.rs @@ -56,14 +56,9 @@ fn download(urls: Vec<String>) { let file_name = format!("{}.epub", extractor.metadata().title()); let mut out_file = File::create(&file_name).unwrap(); let mut html_buf = Vec::new(); - extractor - .article() - .unwrap() - .serialize(&mut html_buf) - .expect("Unable to serialize"); + extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf) + .expect("Unable to serialize to xhtml"); let html_buf = std::str::from_utf8(&html_buf).unwrap(); - let html_buf = moz_readability::regexes::REPLACE_SELF_CLOSING_REGEX - .replace_all(html_buf, "$tag/>"); let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap(); if let Some(author) = extractor.metadata().byline() { epub.metadata("author", author.replace("&", "&amp;")) diff --git a/src/moz_readability/regexes.rs b/src/moz_readability/regexes.rs index c4aa0d9..01b7e9e 100644 --- a/src/moz_readability/regexes.rs +++ b/src/moz_readability/regexes.rs @@ -132,8 +132,4 @@ lazy_static! { pub static ref REPLACE_END_SEPARATOR_REGEX: Regex = Regex::new(r"(?i)[^\|\-\\/>»]*[\|\-\\/>»](?P<end>.*)").unwrap(); pub static ref REPLACE_MULTI_SEPARATOR_REGEX: Regex = Regex::new(r"[\|\-\\/>»]+").unwrap(); - pub static ref REPLACE_SELF_CLOSING_REGEX: Regex = Regex::new( - r#"(?P<tag><(?:area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr)(?: [a-z\-]+=["'][\sa-zA-Z0-9\./\-_#]+["']|[a-z\-]+)*)>"# - ) - .unwrap(); }