From 350447d1c4d2932aed3569684d39da14f55c5468 Mon Sep 17 00:00:00 2001
From: Kenneth Gitere <gitere81@gmail.com>
Date: Wed, 21 Oct 2020 16:21:53 +0300
Subject: [PATCH] Change calls on replacing regexes to `replace_all`

Add `fix_relative_uris`, `clean_classes`, `clean_readability_attrs`
and `post_process_content`
---
 src/moz_readability/mod.rs     | 358 ++++++++++++++++++++++++++++++++-
 src/moz_readability/regexes.rs |   2 +
 2 files changed, 353 insertions(+), 7 deletions(-)

diff --git a/src/moz_readability/mod.rs b/src/moz_readability/mod.rs
index c1e86e2..ab38cdf 100644
--- a/src/moz_readability/mod.rs
+++ b/src/moz_readability/mod.rs
@@ -1,4 +1,4 @@
-use std::collections::{BTreeMap, HashMap};
+use std::collections::{BTreeMap, HashMap, HashSet};
 use std::str::FromStr;
 
 use html5ever::{LocalName, Namespace, QualName};
@@ -7,6 +7,7 @@ use kuchiki::{
     traits::*,
     NodeData, NodeRef,
 };
+use url::Url;
 
 const SHARE_ELEMENT_THRESHOLD: usize = 500;
 const READABILITY_SCORE: &'static str = "readability-score";
@@ -68,13 +69,14 @@ impl Readability {
             article_dir: None,
         }
     }
-    pub fn parse(&mut self) {
+    pub fn parse(&mut self, url: &str) {
         self.unwrap_no_script_tags();
         self.remove_scripts();
         self.prep_document();
         let meta_data = self.get_article_metadata();
         self.article_title = meta_data.title.clone();
         self.grab_article();
+        self.post_process_content(url); // final clean-up pass; `url` is the document's own URI
     }
 
     /// Recursively check if node is image, or if node contains exactly only one image
@@ -435,8 +437,8 @@ impl Readability {
                         let name_val = name_attr.unwrap();
                         if regexes::is_match_name_pattern(name_val) {
                             let name = name_val.to_lowercase();
-                            let name = regexes::REPLACE_WHITESPACE_REGEX.replace(&name, "");
-                            let name = regexes::REPLACE_DOT_REGEX.replace(&name, ":");
+                            let name = regexes::REPLACE_WHITESPACE_REGEX.replace_all(&name, "");
+                            let name = regexes::REPLACE_DOT_REGEX.replace_all(&name, ":");
                             values.insert(name.to_string(), content.trim().to_string());
                         }
                     }
@@ -590,21 +592,162 @@ impl Readability {
             }
         }
         cur_title = regexes::NORMALIZE_REGEX
-            .replace(cur_title.trim(), " ")
+            .replace_all(cur_title.trim(), " ")
             .to_string();
         let cur_word_count = word_count(&cur_title);
 
         if cur_word_count <= 4
             && (!title_had_hierarchical_separators
                 || cur_word_count
-                    != word_count(&regexes::REPLACE_MULTI_SEPARATOR_REGEX.replace(&orig_title, ""))
-                        - 1)
+                    != word_count(
+                        &regexes::REPLACE_MULTI_SEPARATOR_REGEX.replace_all(&orig_title, ""),
+                    ) - 1)
         {
             cur_title = orig_title;
         }
         cur_title
     }
 
+    /// Reduces every `class` attribute in the article subtree to the space-separated classes
+    /// found in `classes_to_preserve`, removing the attribute entirely when none remain.
+    fn clean_classes(&mut self) {
+        // TODO: This should be accessed from Self; until then no class is preserved.
+        let classes_to_preserve: HashSet<&str> = HashSet::new();
+        if let Some(article_node) = &mut self.article_node {
+            for elem in article_node.inclusive_descendants().elements() {
+                let mut elem_attrs = elem.attributes.borrow_mut();
+                if let Some(class_list) = elem_attrs.get_mut("class") {
+                    let kept_classes = class_list
+                        .split_whitespace()
+                        .filter(|class| classes_to_preserve.contains(class));
+                    let filtered_class = kept_classes.collect::<Vec<_>>().join(" ");
+                    if filtered_class.is_empty() {
+                        elem_attrs.remove("class");
+                    } else {
+                        *class_list = filtered_class;
+                    }
+                }
+            }
+        }
+    }
+
+    /// Makes the URIs in the article absolute: the `href` of every `<a>` and the `src`, `poster`
+    /// and `srcset` of every media element; `#ref` URIs are kept while no `<base>` redirects them.
+    /// `javascript:` links are unwrapped. Panics if `document_uri` fails `Url::parse`.
+    fn fix_relative_uris(&mut self, document_uri: &str) {
+        if let Some(article_node) = &mut self.article_node {
+            let document_uri =
+                Url::parse(document_uri).expect("Unable to parse the document's URI");
+            // The first <base href>, resolved against the document URI, wins; else the document URI.
+            let base_uri = self
+                .root_node
+                .select("base")
+                .unwrap()
+                .find(|node_ref| node_ref.attributes.borrow().contains("href"))
+                .and_then(|node_ref| {
+                    let node_attrs = node_ref.attributes.borrow();
+                    document_uri.join(node_attrs.get("href")?).ok()
+                })
+                .unwrap_or_else(|| document_uri.clone());
+            let to_absolute_uri = |uri_str: &str| -> String {
+                if base_uri == document_uri && uri_str.starts_with('#') {
+                    return uri_str.to_string();
+                }
+
+                match Url::parse(uri_str) {
+                    Ok(new_uri) if new_uri.has_host() => new_uri.to_string(),
+                    Ok(_) => uri_str.to_string(), // absolute but host-less: keep verbatim
+                    Err(_) => base_uri
+                        .join(uri_str)
+                        .map(|joined_uri| joined_uri.to_string())
+                        .unwrap_or_else(|_| uri_str.to_string()),
+                }
+            };
+            let mut links = article_node.select("a").unwrap().filter(|a_ref| {
+                let link_attrs = a_ref.attributes.borrow();
+                link_attrs.contains("href")
+            });
+            // Advance before editing: presumably detaching a link would disturb the traversal.
+            let mut link = links.next();
+            while let Some(link_ref) = link {
+                link = links.next();
+                let mut link_attrs = link_ref.attributes.borrow_mut();
+                let href = link_attrs.get("href").map(|val| val.to_string()).unwrap();
+                if href.starts_with("javascript:") { // drop the link, keep what it wrapped
+                    let link_node = link_ref.as_node();
+                    let only_text_child = link_node.children().count() == 1
+                        && link_node
+                            .first_child()
+                            .map_or(false, |node_ref| node_ref.as_text().is_some());
+                    let replacement = if only_text_child {
+                        NodeRef::new_text(link_node.text_contents())
+                    } else {
+                        let container = NodeRef::new_element(
+                            QualName::new(None, Namespace::from(HTML_NS), LocalName::from("span")),
+                            BTreeMap::new(),
+                        );
+                        let mut children = link_node.children();
+                        let mut child = children.next();
+                        while let Some(child_ref) = child {
+                            child = children.next();
+                            container.append(child_ref);
+                        }
+                        container
+                    };
+                    link_node.insert_before(replacement);
+                    link_node.detach();
+                } else {
+                    link_attrs.insert("href", to_absolute_uri(&href));
+                }
+            }
+            let media_nodes = article_node
+                .select("img, picture, figure, video, audio, source")
+                .unwrap();
+            for media_node in media_nodes {
+                let mut media_attrs = media_node.attributes.borrow_mut();
+                if let Some(src) = media_attrs.get_mut("src") {
+                    *src = to_absolute_uri(&src);
+                }
+
+                if let Some(poster) = media_attrs.get_mut("poster") {
+                    *poster = to_absolute_uri(&poster);
+                }
+
+                if let Some(srcset) = media_attrs.get_mut("srcset") {
+                    let new_srcset = regexes::SRCSET_CAPTURE_REGEX.replace_all(
+                        &srcset,
+                        |captures: &regex::Captures| {
+                            // Group 2 (the `2x`/`480w` descriptor) is optional; indexing it
+                            // directly would panic for candidates that carry no descriptor.
+                            let descriptor = captures.get(2).map_or("", |m| m.as_str());
+                            to_absolute_uri(&captures[1]) + descriptor + &captures[3]
+                        },
+                    );
+                    *srcset = new_srcset.to_string();
+                }
+            }
+        }
+    }
+
+    /// Drops the `readability-*` bookkeeping attributes from every element of the article.
+    fn clean_readability_attrs(&mut self) {
+        let roots = self.article_node.iter();
+        for element in roots.flat_map(|root| root.inclusive_descendants().elements()) {
+            let mut attrs = element.attributes.borrow_mut();
+            for name in &[READABILITY_SCORE, "readability-data-table"] {
+                attrs.remove(*name);
+            }
+        }
+    }
+
+    /// Runs the final clean-up passes over the article; relative URIs resolve against `url`.
+    fn post_process_content(&mut self, url: &str) {
+        self.fix_relative_uris(url); // panics if `url` is not a parseable URL
+        // TODO: Add flag check (presumably to make class cleaning optional; confirm)
+        self.clean_classes();
+        self.clean_readability_attrs();
+    }
+
     /// Converts an inline CSS string to a [HashMap] of property and value(s)
     fn inline_css_str_to_map(css_str: &str) -> HashMap<&str, &str> {
         css_str
@@ -3462,4 +3605,205 @@ characters. For that reason, this <p> tag could not be a byline because it's too
         result.title = "A Longer Title".to_string();
         assert_eq!(result, doc.get_article_metadata());
     }
+
+    #[test]
+    fn test_fix_relative_uris() { // relative `href`/`src` become absolute; `#ref` links survive
+        let html_str = r##"
+        <!DOCTYPE html>
+        <html>
+            <body>
+                <h1><a href="../home.html">Go back</a></h1>
+                <img id="ex-1" src="https://example.image.com/images/1.jpg" alt="Ex 1">
+                <img id="ex-2" src="https://example.image.com/images/2.jpg" alt="Ex 2">
+                <img id="ex-3" src="../images/2.jpg" alt="Ex 3">
+                <img id="ex-4" src="./images/1.jpg" alt="Ex 4">
+                <img id="ex-5" src="https://images.com/images/1.jpg" alt="Ex 5">
+                <p><a href="#ex-1">First image</a></p>
+            </body>
+        </html>
+        "##;
+        let mut doc = Readability::new(html_str);
+        doc.article_node = doc
+            .root_node
+            .select_first("body")
+            .ok()
+            .map(|node_ref| node_ref.as_node().clone()); // use <body> as the article
+        doc.fix_relative_uris("https://example.image.com/blog/"); // no <base> in the fixture
+
+        let node = doc.root_node.select_first("img#ex-1").unwrap(); // already absolute
+        let node_attrs = node.attributes.borrow();
+        assert_eq!(
+            Some("https://example.image.com/images/1.jpg"),
+            node_attrs.get("src")
+        );
+
+        let node = doc.root_node.select_first("img#ex-2").unwrap();
+        let node_attrs = node.attributes.borrow();
+        assert_eq!(
+            Some("https://example.image.com/images/2.jpg"),
+            node_attrs.get("src")
+        );
+
+        let node = doc.root_node.select_first("img#ex-3").unwrap(); // `../` climbs out of /blog/
+        let node_attrs = node.attributes.borrow();
+        assert_eq!(
+            Some("https://example.image.com/images/2.jpg"),
+            node_attrs.get("src")
+        );
+
+        let node = doc.root_node.select_first("img#ex-4").unwrap(); // `./` stays inside /blog/
+        let node_attrs = node.attributes.borrow();
+        assert_eq!(
+            Some("https://example.image.com/blog/images/1.jpg"),
+            node_attrs.get("src")
+        );
+
+        let node = doc.root_node.select_first("img#ex-5").unwrap();
+        let node_attrs = node.attributes.borrow();
+        assert_eq!(
+            Some("https://images.com/images/1.jpg"),
+            node_attrs.get("src")
+        );
+
+        let node = doc.root_node.select_first("p a").unwrap(); // fragment link is left as is
+        let node_attrs = node.attributes.borrow();
+        assert_eq!(Some("#ex-1"), node_attrs.get("href"));
+
+        let node = doc.root_node.select_first("h1 a").unwrap(); // `../home.html` -> site root
+        let node_attrs = node.attributes.borrow();
+        assert_eq!(
+            Some("https://example.image.com/home.html"),
+            node_attrs.get("href")
+        );
+    }
+
+    #[test]
+    fn test_clean_classes() { // every `class` attribute under the article must be removed
+        // TODO: This test will later be edited to ensure it checks to only remove certain classes
+        let html_str = r#"
+        <!DOCTYPE html>
+        <html>
+            <body>
+                <p class="a b c d">One</p>
+                <p class="b c d e">Two</p>
+                <div class="a b c div">Three</div>
+                <div class="b c d e">Four</div>
+                <ul class="a b c d">
+                    <li class="a b c d">One</li>
+                    <li class="b c d e">Two</li>
+                    <li class="b c d e">Three</li>
+                </ul>
+            </body>
+        </html>
+        "#;
+        let mut doc = Readability::new(html_str);
+        doc.article_node = doc
+            .root_node
+            .select_first("body")
+            .ok()
+            .map(|node_ref| node_ref.as_node().clone()); // use <body> as the article
+        doc.clean_classes();
+
+        assert_eq!(
+            true,
+            doc.root_node
+                .inclusive_descendants()
+                .elements()
+                .all(|node_elem| {
+                    let node_attrs = node_elem.attributes.borrow();
+                    !node_attrs.contains("class") // checked on the whole document tree
+                })
+        );
+    }
+
+    #[test]
+    fn test_clean_readability_attrs() { // scoring attributes are stripped from the article
+        let html_str = r#"
+        <!DOCTYPE html>
+        <html>
+            <body>
+                <div readability-score="0.921487">
+                    <p readability-score="0.8102">Welcome to this awesome blog post. Only good content is here. No spam.</p>
+                    <p readability-score="0.6004">Let's look at some statistics</p>
+                    <table readability-score="0.719275" readability-data-table="true">
+                        <caption>Monthly savings</caption>
+                        <tr>
+                            <th>Month</th>
+                            <th>Savings</th>
+                        </tr>
+                        <tr>
+                            <td>January</td>
+                            <td>$100</td>
+                        </tr>
+                        <tr>
+                            <td>February</td>
+                            <td>$50</td>
+                        </tr>
+                    </table>
+                </div>
+            </body>
+        </html>
+        "#;
+        let mut doc = Readability::new(html_str);
+        doc.article_node = doc
+            .root_node
+            .select_first("body")
+            .ok()
+            .map(|node_ref| node_ref.as_node().clone()); // use <body> as the article
+        doc.clean_readability_attrs();
+        assert_eq!(
+            true,
+            doc.root_node
+                .inclusive_descendants()
+                .elements()
+                .all(|node| {
+                    let node_attrs = node.attributes.borrow();
+                    node_attrs.map.len() == 0 // the fixture has no other attributes
+                })
+        );
+    }
+
+    #[test]
+    fn test_post_process_content() { // end-to-end check of the combined clean-up passes
+        let html_str = r##"
+        <!DOCTYPE html>
+        <html>
+            <body>
+                <p class="a b c d">One</p>
+                <p class="b c d e">Two</p>
+                <div class="a b c div">Three</div>
+                <div class="b c d e">
+                    <img src="./img.jpg" class="lazy">
+                </div>
+                <ul class="a b c d">
+                    <li class="a b c d"><a href="#home">One</a></li>
+                    <li class="b c d e">Two</li>
+                    <li class="b c d e">Three</li>
+                </ul>
+            </body>
+        </html>
+        "##;
+        let mut doc = Readability::new(html_str);
+        doc.article_node = doc
+            .root_node
+            .select_first("body")
+            .ok()
+            .map(|node_ref| node_ref.as_node().clone()); // use <body> as the article
+        doc.post_process_content("https://foo.blog/post/");
+        let has_class_attr = doc
+            .root_node
+            .inclusive_descendants()
+            .elements()
+            .any(|node_ref| {
+                let attrs = node_ref.attributes.borrow();
+                attrs.contains("class")
+            });
+        assert_eq!(false, has_class_attr); // all classes stripped
+        let a_node = doc.root_node.select_first("a").unwrap();
+        let a_node_attrs = a_node.attributes.borrow();
+        assert_eq!(Some("#home"), a_node_attrs.get("href")); // fragment link untouched
+        let img_node = doc.root_node.select_first("img").unwrap();
+        let img_attrs = img_node.attributes.borrow();
+        assert_eq!(Some("https://foo.blog/post/img.jpg"), img_attrs.get("src"));
+    }
 }
diff --git a/src/moz_readability/regexes.rs b/src/moz_readability/regexes.rs
index 1a49c77..01b7e9e 100644
--- a/src/moz_readability/regexes.rs
+++ b/src/moz_readability/regexes.rs
@@ -119,6 +119,8 @@ lazy_static! {
         r"(?i)\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|site_name)\s*"
     )
     .unwrap();
+    pub static ref SRCSET_CAPTURE_REGEX: Regex =
+        Regex::new(r"(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))").unwrap();
     pub static ref REPLACE_WHITESPACE_REGEX: Regex = Regex::new(r"\s").unwrap();
     pub static ref REPLACE_DOT_REGEX: Regex = Regex::new(r"\.").unwrap();
     pub static ref REPLACE_HTML_ESCAPE_REGEX: Regex =