Move MetaAttr to moz_readability and rename to MetaData

Add get_article_metadata, get_article_title and unescape_html_entities and their tests
2020-10-20 22:23:31 +03:00 · 2020-10-20 22:23:31 +03:00 · aacb442b7a
commit aacb442b7a
parent d99b1c687b
3 changed files with 429 additions and 35 deletions
--- a/src/extractor.rs
+++ b/src/extractor.rs
@ -66,8 +66,6 @@ impl Extractor {
            .extract_attr_val("html", "lang", |lang| lang.to_string())
            .unwrap_or("en".to_string());

-        let meta_attrs = MetaAttr::new(author, description, lang, tags, title);
-
        // Extract the article

        let article_ref = self.root_node.select_first("article").unwrap();
@ -192,33 +190,6 @@ fn get_absolute_url(url: &str, request_url: &Url) -> String {
    }
 }

-#[derive(Debug)]
-pub struct MetaAttr {
-    author: Option<String>,
-    description: Option<String>,
-    language: String,
-    tags: Option<Vec<String>>,
-    title: String,
-}
-
-impl MetaAttr {
-    pub fn new(
-        author: Option<String>,
-        description: Option<String>,
-        language: String,
-        tags: Option<Vec<String>>,
-        title: String,
-    ) -> Self {
-        MetaAttr {
-            author,
-            description,
-            language,
-            tags,
-            title,
-        }
-    }
-}
-
 #[cfg(test)]
 mod test {
    use super::*;
--- a/src/moz_readability/mod.rs
+++ b/src/moz_readability/mod.rs
@ -1,6 +1,5 @@
 use std::collections::{BTreeMap, HashMap};
-
-use crate::extractor::MetaAttr;
+use std::str::FromStr;

 use html5ever::{LocalName, Namespace, QualName};
 use kuchiki::{
@ -73,7 +72,8 @@ impl Readability {
        self.unwrap_no_script_tags();
        self.remove_scripts();
        self.prep_document();
-        // TODO: Add implementation for get_article_metadata
+        let meta_data = self.get_article_metadata();
+        self.article_title = meta_data.title.clone();
        self.grab_article();
    }

@ -404,8 +404,205 @@ impl Readability {
    }

    ///Attempts to get excerpt and byline metadata for the article. @return Object with optional "excerpt" and "byline" properties
-    fn get_article_metadata(&self) -> MetaAttr {
-        unimplemented!()
+    ///
+    /// Every `<meta>` element that has a `content` attribute is inspected. When its `property`
+    /// attribute contains one or more matches of `regexes::PROPERTY_REGEX`, each whole match
+    /// (lowercased, whitespace removed) becomes a key for the trimmed content. Otherwise, when
+    /// its `name` attribute satisfies `regexes::is_match_name_pattern`, the name (lowercased,
+    /// whitespace removed, `.` turned into `:`) becomes the key.
+    ///
+    /// Title, byline and excerpt are then taken from the first key present in a fixed order of
+    /// preference; the title falls back to `get_article_title` when no meta tag supplies one.
+    /// All returned fields have their common HTML entities unescaped.
+    fn get_article_metadata(&self) -> MetaData {
+        /// Returns the value stored under the first of `keys` that is present in `values`.
+        fn first_value(values: &HashMap<String, String>, keys: &[&str]) -> Option<String> {
+            keys.iter().find_map(|key| values.get(*key).cloned())
+        }
+
+        const TITLE_KEYS: &[&str] = &[
+            "dc:title",
+            "dcterm:title",
+            "og:title",
+            "weibo:article:title",
+            "weibo:webpage:title",
+            "title",
+            "twitter:title",
+        ];
+        const BYLINE_KEYS: &[&str] = &["dc:creator", "dcterm:creator", "author"];
+        const EXCERPT_KEYS: &[&str] = &[
+            "dc:description",
+            "dcterm:description",
+            "og:description",
+            "weibo:article:description",
+            "weibo:webpage:description",
+            "description",
+            "twitter:description",
+        ];
+
+        let mut values: HashMap<String, String> = HashMap::new();
+        if let Ok(meta_elems) = self.root_node.select("meta") {
+            for node_ref in meta_elems {
+                let node_attr = node_ref.attributes.borrow();
+                let content = match node_attr.get("content") {
+                    Some(content) => content.trim(),
+                    None => continue,
+                };
+
+                // Only whole matches are used as keys. Iterating the capture groups would
+                // also register the bare prefix ("og") and the bare suffix ("title").
+                let mut matched_property = false;
+                if let Some(property) = node_attr.get("property") {
+                    for found in regexes::PROPERTY_REGEX.find_iter(property) {
+                        matched_property = true;
+                        let name = regexes::REPLACE_WHITESPACE_REGEX
+                            .replace_all(&found.as_str().to_lowercase(), "")
+                            .to_string();
+                        values.insert(name, content.to_string());
+                    }
+                }
+                if matched_property {
+                    continue;
+                }
+
+                if let Some(name_val) = node_attr.get("name") {
+                    if regexes::is_match_name_pattern(name_val) {
+                        let name = name_val.to_lowercase();
+                        // `replace_all` so that every whitespace character and every dot is
+                        // normalised, not just the first one.
+                        let name = regexes::REPLACE_WHITESPACE_REGEX.replace_all(&name, "");
+                        let name = regexes::REPLACE_DOT_REGEX.replace_all(&name, ":");
+                        values.insert(name.to_string(), content.to_string());
+                    }
+                }
+            }
+        }
+
+        let mut meta_data = MetaData::new();
+        meta_data.title =
+            first_value(&values, TITLE_KEYS).unwrap_or_else(|| self.get_article_title());
+        meta_data.byline = first_value(&values, BYLINE_KEYS);
+        meta_data.excerpt = first_value(&values, EXCERPT_KEYS);
+        meta_data.site_name = values.get("og:site_name").cloned();
+
+        Self::unescape_html_entities(&mut meta_data.title);
+        if let Some(byline) = meta_data.byline.as_mut() {
+            Self::unescape_html_entities(byline);
+        }
+        if let Some(excerpt) = meta_data.excerpt.as_mut() {
+            Self::unescape_html_entities(excerpt);
+        }
+        if let Some(site_name) = meta_data.site_name.as_mut() {
+            Self::unescape_html_entities(site_name);
+        }
+
+        meta_data
+    }
+
+    /// Converts some of the common HTML entities in string to their corresponding characters.
+    ///
+    /// The named entities `&lt;`, `&gt;`, `&amp;`, `&quot;` and `&apos;` are replaced first,
+    /// then numeric character references matched by `regexes::REPLACE_HEX_REGEX`
+    /// (`&#NNNN;` and `&#xHHHH;`). `value` is rewritten in place; an empty string is left
+    /// untouched.
+    ///
+    /// A numeric reference whose digits cannot be parsed as a `u16` is kept verbatim instead
+    /// of causing a panic. A value that is not a valid character on its own (a lone UTF-16
+    /// surrogate) is turned into U+FFFD by `String::from_utf16_lossy`.
+    fn unescape_html_entities(value: &mut String) {
+        if value.is_empty() {
+            return;
+        }
+
+        let named = regexes::REPLACE_HTML_ESCAPE_REGEX
+            .replace_all(value.as_str(), |captures: &regex::Captures| {
+                match &captures[1] {
+                    "lt" => "<".to_string(),
+                    "gt" => ">".to_string(),
+                    "amp" => "&".to_string(),
+                    "quot" => "\"".to_string(),
+                    "apos" => "'".to_string(),
+                    // Not expected with the current pattern; keep the text as written.
+                    _ => captures[0].to_string(),
+                }
+            })
+            .into_owned();
+
+        let numeric = regexes::REPLACE_HEX_REGEX
+            .replace_all(&named, |captures: &regex::Captures| {
+                // Group 1 holds hexadecimal digits, group 2 decimal digits.
+                let parsed = captures
+                    .get(1)
+                    .map(|hex| u16::from_str_radix(hex.as_str(), 16))
+                    .or_else(|| captures.get(2).map(|dec| u16::from_str(dec.as_str())));
+                match parsed {
+                    Some(Ok(code_unit)) => String::from_utf16_lossy(&[code_unit]),
+                    // Digits that do not parse: leave the reference unchanged.
+                    _ => captures[0].to_string(),
+                }
+            })
+            .into_owned();
+
+        *value = numeric;
+    }
+
+    /// Get the article title as an H1.
+    ///
+    /// Starts from the trimmed text of the first `<title>` element (an empty string when the
+    /// document has none) and tries to strip site names from it:
+    ///
+    /// * if the title contains a separator surrounded by spaces, the part before the last
+    ///   separator is kept, or the part after the first separator when fewer than three
+    ///   words would remain;
+    /// * otherwise, if it contains `": "` and no `<h1>`/`<h2>` has exactly the same text, the
+    ///   part after the last colon is kept, with fallbacks when that part is too short or the
+    ///   part before the first colon is longer than five words;
+    /// * otherwise, if it is longer than 150 or shorter than 15 characters and the document
+    ///   has exactly one `<h1>`, that heading's text is used.
+    ///
+    /// When the result has four words or fewer, the original title is returned instead,
+    /// unless a hierarchical separator was present and exactly one word was removed.
+    fn get_article_title(&self) -> String {
+        let orig_title = self
+            .root_node
+            .select_first("title")
+            .map(|title| title.text_contents().trim().to_string())
+            .unwrap_or_default();
+        let word_count = |s: &str| -> usize { s.split_whitespace().count() };
+        let mut cur_title = orig_title.clone();
+        let mut title_had_hierarchical_separators = false;
+        // Measured in characters so that non-ASCII titles are not penalised for byte length.
+        let title_len = orig_title.chars().count();
+
+        if regexes::is_match_title_separator(&orig_title) {
+            title_had_hierarchical_separators = regexes::is_match_has_title_separator(&orig_title);
+            cur_title = regexes::REPLACE_START_SEPARATOR_REGEX
+                .replace_all(&orig_title, "$start")
+                .to_string();
+            if word_count(&cur_title) < 3 {
+                cur_title = regexes::REPLACE_END_SEPARATOR_REGEX
+                    .replace_all(&orig_title, "$end")
+                    .to_string();
+            }
+        } else if orig_title.contains(": ") {
+            let is_match_heading = self
+                .root_node
+                .select("h1, h2")
+                .map(|mut headings| {
+                    headings.any(|heading| heading.text_contents().trim() == orig_title.as_str())
+                })
+                .unwrap_or(false);
+            if !is_match_heading {
+                // Both lookups succeed because this branch requires a colon in the title.
+                if let (Some(first), Some(last)) = (orig_title.find(':'), orig_title.rfind(':')) {
+                    let after_last = &orig_title[last + 1..];
+                    cur_title = if word_count(after_last) < 3 {
+                        orig_title[first + 1..].to_string()
+                    } else if word_count(&orig_title[..first]) > 5 {
+                        orig_title.clone()
+                    } else {
+                        after_last.to_string()
+                    };
+                }
+            }
+        } else if title_len > 150 || title_len < 15 {
+            // Pull the first two matches to check for "exactly one"; `size_hint` does not
+            // count the matches of a filtering selector iterator.
+            if let Ok(mut h1_nodes) = self.root_node.select("h1") {
+                if let (Some(only_h1), None) = (h1_nodes.next(), h1_nodes.next()) {
+                    cur_title = Self::get_inner_text(only_h1.as_node(), None);
+                }
+            }
+        }
+
+        cur_title = regexes::NORMALIZE_REGEX
+            .replace_all(cur_title.trim(), " ")
+            .to_string();
+        let cur_word_count = word_count(&cur_title);
+
+        if cur_word_count <= 4 {
+            let separator_free_words =
+                word_count(&regexes::REPLACE_MULTI_SEPARATOR_REGEX.replace_all(&orig_title, ""));
+            // `cur + 1 != n` is `cur != n - 1` written so that it cannot underflow.
+            if !title_had_hierarchical_separators || cur_word_count + 1 != separator_free_words {
+                cur_title = orig_title;
+            }
+        }
+        cur_title
    }

    /// Converts an inline CSS string to a [HashMap] of property and value(s)
@ -1672,10 +1869,28 @@ impl Readability {
        }
    }
 }
+/// Metadata collected for an article from its `<meta>` tags and `<title>`.
+#[derive(Debug, Default, PartialEq)]
+pub struct MetaData {
+    /// Author line, taken from a creator/author meta tag when one is present.
+    byline: Option<String>,
+    /// Short description, taken from a description meta tag when one is present.
+    excerpt: Option<String>,
+    /// Value of the `og:site_name` meta tag when one is present.
+    site_name: Option<String>,
+    /// Title of the article; empty until it has been filled in.
+    title: String,
+}
+
+impl MetaData {
+    /// Creates metadata with no byline, excerpt or site name and an empty title.
+    pub fn new() -> Self {
+        Self::default()
+    }
+}

 #[cfg(test)]
 mod test {
-    use super::{Readability, SizeInfo, HTML_NS, READABILITY_SCORE};
+    use super::{MetaData, Readability, SizeInfo, HTML_NS, READABILITY_SCORE};
    use html5ever::{LocalName, Namespace, QualName};
    use kuchiki::traits::*;
    use kuchiki::NodeRef;
@ -3075,4 +3290,176 @@ characters. For that reason, this <p> tag could not be a byline because it's too
                .local
        );
    }
+
+    #[test]
+    fn test_get_article_title() {
+        // Parses the markup and returns the title chosen for it.
+        let title_of = |markup: &str| Readability::new(markup).get_article_title();
+
+        // Short title without any separator.
+        let plain = title_of(
+            r#"
+        <!DOCTYPE html>
+        <html>
+            <head>
+                <title>Porting Readability to Rust</title>
+            </head>
+            <body>
+                <p></p>
+            </body>
+        </html>
+        "#,
+        );
+        assert_eq!("Porting Readability to Rust", plain);
+
+        // Title with a colon and no heading in the body.
+        let colon_without_heading = title_of(
+            r#"
+        <!DOCTYPE html>
+        <html>
+            <head>
+                <title>Crates.io: The Rust package repository</title>
+            </head>
+            <body>
+                <p></p>
+            </body>
+        </html>
+        "#,
+        );
+        assert_eq!(
+            "Crates.io: The Rust package repository",
+            colon_without_heading
+        );
+
+        // Title with a colon that is repeated verbatim in an <h1>.
+        let colon_with_same_heading = title_of(
+            r#"
+        <!DOCTYPE html>
+        <html>
+            <head>
+                <title>Crates.io: The Rust package repository</title>
+            </head>
+            <body>
+                <h1>Crates.io: The Rust package repository</h1>
+            </body>
+        </html>
+        "#,
+        );
+        assert_eq!(
+            "Crates.io: The Rust package repository",
+            colon_with_same_heading
+        );
+
+        // Title with a colon next to an <h1> whose text differs.
+        let colon_with_other_heading = title_of(
+            r#"
+        <!DOCTYPE html>
+        <html>
+            <head>
+                <title>Crates.io: A package repository</title>
+            </head>
+            <body>
+                <h1>Crates.io: A Rust package repository</h1>
+            </body>
+        </html>
+        "#,
+        );
+        assert_eq!("Crates.io: A package repository", colon_with_other_heading);
+
+        // Backslash separator with a single word after it.
+        let backslash_separator = title_of(
+            r#"
+        <!DOCTYPE html>
+        <html>
+            <head>
+                <title>Foo developer \ Blog</title>
+            </head>
+            <body>
+                <p></p>
+            </body>
+        </html>
+        "#,
+        );
+        assert_eq!("Foo developer \\ Blog", backslash_separator);
+
+        // Guillemet separator followed by a long remainder.
+        let guillemet_separator = title_of(
+            r#"
+        <!DOCTYPE html>
+        <html>
+            <head>
+                <title>Foo developer » Blog Post on Foo bar stuff</title>
+            </head>
+            <body>
+                <p></p>
+            </body>
+        </html>
+        "#,
+        );
+        assert_eq!("Blog Post on Foo bar stuff", guillemet_separator);
+
+        // Very short title next to a single <h1>.
+        let short_title = title_of(
+            r#"
+        <!DOCTYPE html>
+        <html>
+            <head>
+                <title>Blog</title>
+            </head>
+            <body>
+                <h1>Getting started with Rust</h1>
+            </body>
+        </html>
+        "#,
+        );
+        assert_eq!("Blog", short_title);
+    }
+
+    #[test]
+    fn test_unescape_html_entities() {
+        // Pairs of (escaped input, text expected after unescaping): two named entities,
+        // decimal references and hexadecimal references.
+        let cases = [
+            ("Therefore, 5 &gt; 3", "Therefore, 5 > 3"),
+            ("Logical AND (&amp;&amp;)", "Logical AND (&&)"),
+            ("&#117; &#43; &#101; = &#252;", "u + e = ü"),
+            ("&#x0158;&#x016d;&#x0161;&#x0163;", "Řŭšţ"),
+        ];
+        for (escaped, expected) in cases.iter() {
+            let mut text = String::from(*escaped);
+            Readability::unescape_html_entities(&mut text);
+            assert_eq!(*expected, text);
+        }
+    }
+
+    #[test]
+    fn test_get_article_metadata() {
+        // Only `name`-based meta tags; the title comes from the <title> element.
+        let name_only_markup = r#"
+        <!DOCTYPE html>
+        <html>
+            <head>
+                <meta charset="utf-8"/>
+                <meta name="description" content="A post on how hard it is to work with text."/>
+                <meta name="viewport" content="width=device-width"/>
+                <title>Foo Coder / Blog on the difficulty of using utf-8</title>
+                <meta name="author" content="Foo Coder"/>
+            </head>
+            <body></body>
+        </html>
+        "#;
+        let expected_from_names = MetaData {
+            byline: Some("Foo Coder".to_string()),
+            excerpt: Some("A post on how hard it is to work with text.".to_string()),
+            title: "Blog on the difficulty of using utf-8".to_string(),
+            ..MetaData::new()
+        };
+        assert_eq!(
+            expected_from_names,
+            Readability::new(name_only_markup).get_article_metadata()
+        );
+
+        // A mix of `name` and `property` meta tags with escaped characters in the content.
+        let mixed_markup = r#"
+        <!DOCTYPE html>
+        <html>
+            <head>
+                <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+                <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" user-scalable="no" />
+                <meta name="title" content="A Long Title" />
+                <meta name="description" content="Foo bar baz bo&#223;" />
+                <meta property="og:site_name" content="Blog Place" />
+                <meta property="og:title" content="A Longer Title" />
+                <meta property="og:description" content="Foo bar baz bo&#223;" />
+                <meta name="author" content="F&#x00f6;o Coder" />
+                <meta name="dc:creator" content="F&#x00f6;o Coder" />
+                <meta name="twitter:card" content="summary_large_image" />
+                <title>The Longest Title</title>
+            </head>
+        </html>
+        "#;
+        let expected_from_mixed = MetaData {
+            byline: Some("Föo Coder".to_string()),
+            excerpt: Some("Foo bar baz boß".to_string()),
+            site_name: Some("Blog Place".to_string()),
+            title: "A Longer Title".to_string(),
+        };
+        assert_eq!(
+            expected_from_mixed,
+            Readability::new(mixed_markup).get_article_metadata()
+        );
+    }
 }
--- a/src/moz_readability/regexes.rs
+++ b/src/moz_readability/regexes.rs
@ -89,9 +89,45 @@ pub fn is_match_src_regex(match_str: &str) -> bool {
    SRC_REGEX.is_match(match_str)
 }

+/// Reports whether `match_str`, taken as a whole, names a supported metadata field.
+///
+/// Accepted values are `author`, `creator`, `description`, `title` or `site_name`,
+/// optionally preceded by one of the prefixes `dc`, `dcterm`, `og`, `twitter`,
+/// `weibo:article` or `weibo:webpage` and a `.` or `:`. Matching ignores case and tolerates
+/// whitespace around each part.
+///
+/// The pattern is anchored at both ends so that values which merely end in one of the field
+/// names (for example `subtitle`) are rejected.
+pub fn is_match_name_pattern(match_str: &str) -> bool {
+    lazy_static! {
+        static ref NAME_PATTERN_REGEX: Regex = Regex::new(r"(?i)^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|site_name)\s*$").unwrap();
+    }
+    NAME_PATTERN_REGEX.is_match(match_str)
+}
+
+/// Reports whether `match_str` contains one of the characters `|`, `-`, `\`, `/`, `>` or `»`
+/// with a space directly before and after it.
+pub fn is_match_title_separator(match_str: &str) -> bool {
+    lazy_static! {
+        static ref SPACED_SEPARATOR: Regex = Regex::new(r" [\|\-\\/>»] ").unwrap();
+    }
+    let found = SPACED_SEPARATOR.is_match(match_str);
+    found
+}
+
+/// Reports whether `match_str` contains one of the characters `\`, `/`, `>` or `»` with a
+/// space directly before and after it.
+pub fn is_match_has_title_separator(match_str: &str) -> bool {
+    lazy_static! {
+        static ref SPACED_HIERARCHY_SEPARATOR: Regex = Regex::new(r" [\\/>»] ").unwrap();
+    }
+    let found = SPACED_HIERARCHY_SEPARATOR.is_match(match_str);
+    found
+}
+
 lazy_static! {
    pub static ref NORMALIZE_REGEX: Regex = Regex::new(r"\s{2,}").unwrap();
    pub static ref B64_DATA_URL_REGEX: Regex =
        Regex::new(r"(?i)^data:\s*([^\s;,]+)\s*;\s*base64\s*").unwrap();
    pub static ref BASE64_REGEX: Regex = Regex::new(r"(?i)base64\s*").unwrap();
+    /// Matches a `prefix:field` pair (prefix `dc`, `dcterm`, `og` or `twitter`) anywhere in
+    /// a string, ignoring case and surrounding whitespace.
+    pub static ref PROPERTY_REGEX: Regex = Regex::new(
+        r"(?i)\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|site_name)\s*"
+    )
+    .unwrap();
+    /// Matches a single whitespace character.
+    pub static ref REPLACE_WHITESPACE_REGEX: Regex = Regex::new(r"\s").unwrap();
+    /// Matches a literal dot.
+    pub static ref REPLACE_DOT_REGEX: Regex = Regex::new(r"\.").unwrap();
+    /// Matches the named entities `&quot;`, `&amp;`, `&apos;`, `&lt;` and `&gt;`; group 1 is
+    /// the entity name.
+    pub static ref REPLACE_HTML_ESCAPE_REGEX: Regex =
+        Regex::new("&(quot|amp|apos|lt|gt);").unwrap();
+    /// Matches numeric character references of up to four digits: group 1 holds the digits
+    /// of a hexadecimal reference (`&#x…;`), group 2 those of a decimal one (`&#…;`).
+    /// Only real hexadecimal digits are accepted so that group 1 always parses in base 16.
+    pub static ref REPLACE_HEX_REGEX: Regex =
+        Regex::new(r"(?i)&#(?:x([0-9a-f]{1,4})|([0-9]{1,4}));").unwrap();
+    /// Captures, as `start`, everything before the last separator that is followed by a
+    /// space.
+    pub static ref REPLACE_START_SEPARATOR_REGEX: Regex =
+        Regex::new(r"(?i)(?P<start>.*)[\|\-\\/>»] .*").unwrap();
+    /// Captures, as `end`, everything after the first separator.
+    pub static ref REPLACE_END_SEPARATOR_REGEX: Regex =
+        Regex::new(r"(?i)[^\|\-\\/>»]*[\|\-\\/>»](?P<end>.*)").unwrap();
+    /// Matches a run of one or more separator characters.
+    pub static ref REPLACE_MULTI_SEPARATOR_REGEX: Regex = Regex::new(r"[\|\-\\/>»]+").unwrap();
 }