diff --git a/src/extractor.rs b/src/extractor.rs index b393f5b..787836f 100644 --- a/src/extractor.rs +++ b/src/extractor.rs @@ -66,8 +66,6 @@ impl Extractor { .extract_attr_val("html", "lang", |lang| lang.to_string()) .unwrap_or("en".to_string()); - let meta_attrs = MetaAttr::new(author, description, lang, tags, title); - // Extract the article let article_ref = self.root_node.select_first("article").unwrap(); @@ -192,33 +190,6 @@ fn get_absolute_url(url: &str, request_url: &Url) -> String { } } -#[derive(Debug)] -pub struct MetaAttr { - author: Option, - description: Option, - language: String, - tags: Option>, - title: String, -} - -impl MetaAttr { - pub fn new( - author: Option, - description: Option, - language: String, - tags: Option>, - title: String, - ) -> Self { - MetaAttr { - author, - description, - language, - tags, - title, - } - } -} - #[cfg(test)] mod test { use super::*; diff --git a/src/moz_readability/mod.rs b/src/moz_readability/mod.rs index 7c10d7a..c1e86e2 100644 --- a/src/moz_readability/mod.rs +++ b/src/moz_readability/mod.rs @@ -1,6 +1,5 @@ use std::collections::{BTreeMap, HashMap}; - -use crate::extractor::MetaAttr; +use std::str::FromStr; use html5ever::{LocalName, Namespace, QualName}; use kuchiki::{ @@ -73,7 +72,8 @@ impl Readability { self.unwrap_no_script_tags(); self.remove_scripts(); self.prep_document(); - // TODO: Add implementation for get_article_metadata + let meta_data = self.get_article_metadata(); + self.article_title = meta_data.title.clone(); self.grab_article(); } @@ -404,8 +404,205 @@ impl Readability { } ///Attempts to get excerpt and byline metadata for the article. @return Object with optional "excerpt" and "byline" properties - fn get_article_metadata(&self) -> MetaAttr { - unimplemented!() + fn get_article_metadata(&self) -> MetaData { + let mut values: HashMap = HashMap::new(); + let mut meta_data = MetaData::new(); + if let Ok(meta_elems) = self.root_node.select("meta") { + meta_elems + .filter(|node_ref| { + let node_attr = node_ref.attributes.borrow(); + node_attr.get("content").is_some() + }) + .for_each(|node_ref| { + let node_attr = node_ref.attributes.borrow(); + let content = node_attr.get("content").unwrap(); + let name_attr = node_attr.get("name"); + let mut matches = None; + if let Some(property) = node_attr.get("property") { + matches = regexes::PROPERTY_REGEX.captures(property); + if matches.is_some() { + let captures = matches.as_ref().unwrap(); + for capture in captures.iter() { + let mut name = capture.unwrap().as_str().to_lowercase(); + name = regexes::REPLACE_WHITESPACE_REGEX + .replace_all(&name, "") + .to_string(); + values.insert(name, content.trim().to_string()); + } + } + } + if matches.is_none() && name_attr.is_some() { + let name_val = name_attr.unwrap(); + if regexes::is_match_name_pattern(name_val) { + let name = name_val.to_lowercase(); + let name = regexes::REPLACE_WHITESPACE_REGEX.replace(&name, ""); + let name = regexes::REPLACE_DOT_REGEX.replace(&name, ":"); + values.insert(name.to_string(), content.trim().to_string()); + } + } + }); + } + + let meta_title_keys = [ + "dc:title", + "dcterm:title", + "og:title", + "weibo:article:title", + "weibo:webpage:title", + "title", + "twitter:title", + ]; + meta_data.title = if let Some(key) = meta_title_keys + .iter() + .find(|key| values.contains_key(**key)) + { + values.get(*key).map(|title| title.to_owned()).unwrap() + } else { + self.get_article_title() + }; + + let meta_byline_keys = ["dc:creator", "dcterm:creator", "author"]; + meta_data.byline = { + let possible_key = meta_byline_keys + .iter() + .find(|key| values.contains_key(**key)); + if let Some(actual_key) = possible_key { + values.get(*actual_key).map(|byline| byline.to_owned()) + } else { + None + } + }; + + let meta_excerpt_keys = [ + "dc:description", + "dcterm:description", + "og:description", + "weibo:article:description", + "weibo:webpage:description", + "description", + "twitter:description", + ]; + meta_data.excerpt = { + let possible_key = meta_excerpt_keys + .iter() + .find(|key| values.contains_key(**key)); + if let Some(actual_key) = possible_key { + values.get(*actual_key).map(|excerpt| excerpt.to_owned()) + } else { + None + } + }; + + meta_data.site_name = values + .get("og:site_name") + .map(|site_name| site_name.to_owned()); + + Self::unescape_html_entities(&mut meta_data.title); + if meta_data.byline.is_some() { + Self::unescape_html_entities(&mut meta_data.byline.as_mut().unwrap()); + } + + if meta_data.excerpt.is_some() { + Self::unescape_html_entities(&mut meta_data.excerpt.as_mut().unwrap()); + } + + if meta_data.site_name.is_some() { + Self::unescape_html_entities(&mut meta_data.site_name.as_mut().unwrap()); + } + + meta_data + } + + /// Converts some of the common HTML entities in string to their corresponding characters. + fn unescape_html_entities(value: &mut String) { + if !value.is_empty() { + // TODO: Extract this + let mut html_escape_map: HashMap<&str, &str> = HashMap::new(); + html_escape_map.insert("lt", "<"); + html_escape_map.insert("gt", ">"); + html_escape_map.insert("amp", "&"); + html_escape_map.insert("quot", "\""); + html_escape_map.insert("apos", "'"); + let mut new_value = regexes::REPLACE_HTML_ESCAPE_REGEX + .replace_all(&value, |captures: ®ex::Captures| { + html_escape_map[&captures[1]].to_string() + }) + .to_string(); + new_value = regexes::REPLACE_HEX_REGEX + .replace_all(&new_value, |captures: ®ex::Captures| { + let num = if let Some(hex_capture) = captures.get(1) { + u16::from_str_radix(hex_capture.as_str(), 16) + } else if let Some(dec_capture) = captures.get(2) { + u16::from_str(dec_capture.as_str()) + } else { + unreachable!("Unable to match any of the captures"); + }; + String::from_utf16_lossy(&[num.unwrap()]) + }) + .to_string(); + *value = new_value; + } + } + + /// Get the article title as an H1. + fn get_article_title(&self) -> String { + let mut cur_title = self + .root_node + .select_first("title") + .map(|title| title.text_contents().trim().to_string()) + .expect("This file has no tag to extract a title from"); + let orig_title = cur_title.clone(); + let mut title_had_hierarchical_separators = false; + let word_count = |s: &str| -> usize { s.split_whitespace().count() }; + if regexes::is_match_title_separator(&cur_title) { + title_had_hierarchical_separators = regexes::is_match_has_title_separator(&cur_title); + cur_title = regexes::REPLACE_START_SEPARATOR_REGEX + .replace_all(&orig_title, "$start") + .to_string(); + if word_count(&cur_title) < 3 { + cur_title = regexes::REPLACE_END_SEPARATOR_REGEX + .replace_all(&orig_title, "$end") + .to_string(); + } + } else if cur_title.contains(": ") { + let trimmed_title = cur_title.trim(); + let is_match_heading = self + .root_node + .select("h1, h2") + .unwrap() + .any(|heading| heading.text_contents().trim() == trimmed_title); + if !is_match_heading { + let mut idx = orig_title.rfind(":").unwrap() + 1; + let mut new_title = &orig_title[idx..]; + if word_count(new_title) < 3 { + idx = orig_title.find(":").unwrap() + 1; + new_title = &orig_title[idx..]; + } else if word_count(&orig_title[0..orig_title.find(":").unwrap()]) > 5 { + new_title = &orig_title; + } + cur_title = new_title.to_string(); + } + } else if cur_title.len() > 150 || cur_title.len() < 15 { + let mut h1_nodes = self.root_node.select("h1").unwrap(); + let (_, h1_count) = h1_nodes.size_hint(); + if Some(1) == h1_count { + cur_title = Self::get_inner_text(h1_nodes.next().unwrap().as_node(), None); + } + } + cur_title = regexes::NORMALIZE_REGEX + .replace(cur_title.trim(), " ") + .to_string(); + let cur_word_count = word_count(&cur_title); + + if cur_word_count <= 4 + && (!title_had_hierarchical_separators + || cur_word_count + != word_count(®exes::REPLACE_MULTI_SEPARATOR_REGEX.replace(&orig_title, "")) + - 1) + { + cur_title = orig_title; + } + cur_title } /// Converts an inline CSS string to a [HashMap] of property and value(s) @@ -1672,10 +1869,28 @@ impl Readability { } } } +#[derive(Debug, PartialEq)] +pub struct MetaData { + byline: Option<String>, + excerpt: Option<String>, + site_name: Option<String>, + title: String, +} + +impl MetaData { + pub fn new() -> Self { + MetaData { + byline: None, + excerpt: None, + site_name: None, + title: "".into(), + } + } +} #[cfg(test)] mod test { - use super::{Readability, SizeInfo, HTML_NS, READABILITY_SCORE}; + use super::{MetaData, Readability, SizeInfo, HTML_NS, READABILITY_SCORE}; use html5ever::{LocalName, Namespace, QualName}; use kuchiki::traits::*; use kuchiki::NodeRef; @@ -3075,4 +3290,176 @@ characters. For that reason, this <p> tag could not be a byline because it's too .local ); } + + #[test] + fn test_get_article_title() { + let mut html_str = r#" + <!DOCTYPE html> + <html> + <head> + <title>Porting Readability to Rust + + +

+ + + "#; + let doc = Readability::new(html_str); + assert_eq!("Porting Readability to Rust", doc.get_article_title()); + + html_str = r#" + + + + Crates.io: The Rust package repository + + +

+ + + "#; + let doc = Readability::new(html_str); + assert_eq!( + "Crates.io: The Rust package repository", + doc.get_article_title() + ); + + html_str = r#" + + + + Crates.io: The Rust package repository + + +

Crates.io: The Rust package repository

+ + + "#; + let doc = Readability::new(html_str); + assert_eq!( + "Crates.io: The Rust package repository", + doc.get_article_title() + ); + + html_str = r#" + + + + Crates.io: A package repository + + +

Crates.io: A Rust package repository

+ + + "#; + let doc = Readability::new(html_str); + assert_eq!("Crates.io: A package repository", doc.get_article_title()); + + html_str = r#" + + + + Foo developer \ Blog + + +

+ + + "#; + let doc = Readability::new(html_str); + assert_eq!("Foo developer \\ Blog", doc.get_article_title()); + + html_str = r#" + + + + Foo developer » Blog Post on Foo bar stuff + + +

+ + + "#; + let doc = Readability::new(html_str); + assert_eq!("Blog Post on Foo bar stuff", doc.get_article_title()); + + html_str = r#" + + + + Blog + + +

Getting started with Rust

+ + + "#; + let doc = Readability::new(html_str); + assert_eq!("Blog", doc.get_article_title()); + } + + #[test] + fn test_unescape_html_entities() { + let mut input = "Therefore, 5 > 3".to_string(); + Readability::unescape_html_entities(&mut input); + assert_eq!("Therefore, 5 > 3", &input); + input = "Logical AND (&&)".to_string(); + Readability::unescape_html_entities(&mut input); + assert_eq!("Logical AND (&&)", &input); + input = "u + e = ü".to_string(); + Readability::unescape_html_entities(&mut input); + assert_eq!("u + e = ü", input); + input = "Řŭšţ".to_string(); + Readability::unescape_html_entities(&mut input); + assert_eq!("Řŭšţ", input); + } + + #[test] + fn test_get_article_metadata() { + let mut html_str = r#" + + + + + + + Foo Coder / Blog on the difficulty of using utf-8 + + + + + "#; + let doc = Readability::new(html_str); + let mut result = MetaData::new(); + result.byline = Some("Foo Coder".to_string()); + result.excerpt = Some("A post on how hard it is to work with text.".to_string()); + result.title = "Blog on the difficulty of using utf-8".to_string(); + assert_eq!(result, doc.get_article_metadata()); + + html_str = r#" + + + + + + + + + + + + + + The Longest Title + + + "#; + let doc = Readability::new(html_str); + result = MetaData::new(); + result.byline = Some("Föo Coder".to_string()); + result.excerpt = Some("Foo bar baz boß".to_string()); + result.site_name = Some("Blog Place".to_string()); + result.title = "A Longer Title".to_string(); + assert_eq!(result, doc.get_article_metadata()); + } } diff --git a/src/moz_readability/regexes.rs b/src/moz_readability/regexes.rs index 6b7c063..1a49c77 100644 --- a/src/moz_readability/regexes.rs +++ b/src/moz_readability/regexes.rs @@ -89,9 +89,45 @@ pub fn is_match_src_regex(match_str: &str) -> bool { SRC_REGEX.is_match(match_str) } +pub fn is_match_name_pattern(match_str: &str) -> bool { + lazy_static! { + static ref NAME_PATTERN_REGEX: Regex = Regex::new(r"(?i)\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|site_name)\s*$").unwrap(); + } + NAME_PATTERN_REGEX.is_match(match_str) +} + +pub fn is_match_title_separator(match_str: &str) -> bool { + lazy_static! { + static ref TITLE_SEPARATOR_REGEX: Regex = Regex::new(r" [\|\-\\/>»] ").unwrap(); + } + TITLE_SEPARATOR_REGEX.is_match(match_str) +} + +pub fn is_match_has_title_separator(match_str: &str) -> bool { + lazy_static! { + static ref HAS_TITLE_SEPARATOR_REGEX: Regex = Regex::new(r" [\\/>»] ").unwrap(); + } + HAS_TITLE_SEPARATOR_REGEX.is_match(match_str) +} + lazy_static! { pub static ref NORMALIZE_REGEX: Regex = Regex::new(r"\s{2,}").unwrap(); pub static ref B64_DATA_URL_REGEX: Regex = Regex::new(r"(?i)^data:\s*([^\s;,]+)\s*;\s*base64\s*").unwrap(); pub static ref BASE64_REGEX: Regex = Regex::new(r"(?i)base64\s*").unwrap(); + pub static ref PROPERTY_REGEX: Regex = Regex::new( + r"(?i)\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|site_name)\s*" + ) + .unwrap(); + pub static ref REPLACE_WHITESPACE_REGEX: Regex = Regex::new(r"\s").unwrap(); + pub static ref REPLACE_DOT_REGEX: Regex = Regex::new(r"\.").unwrap(); + pub static ref REPLACE_HTML_ESCAPE_REGEX: Regex = + Regex::new("&(quot|amp|apos|lt|gt);").unwrap(); + pub static ref REPLACE_HEX_REGEX: Regex = + Regex::new(r"(?i)&#(?:x([0-9a-z]{1,4})|([0-9]{1,4}));").unwrap(); + pub static ref REPLACE_START_SEPARATOR_REGEX: Regex = + Regex::new(r"(?i)(?P.*)[\|\-\\/>»] .*").unwrap(); + pub static ref REPLACE_END_SEPARATOR_REGEX: Regex = + Regex::new(r"(?i)[^\|\-\\/>»]*[\|\-\\/>»](?P.*)").unwrap(); + pub static ref REPLACE_MULTI_SEPARATOR_REGEX: Regex = Regex::new(r"[\|\-\\/>»]+").unwrap(); }