From 350447d1c4d2932aed3569684d39da14f55c5468 Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Wed, 21 Oct 2020 16:21:53 +0300 Subject: [PATCH] Change calls on replacing regexes to `replace_all` Add `fix_relative_uris`, `clean_classes`, `clean_readability_attrs` and `post_process_content` --- src/moz_readability/mod.rs | 358 ++++++++++++++++++++++++++++++++- src/moz_readability/regexes.rs | 2 + 2 files changed, 353 insertions(+), 7 deletions(-) diff --git a/src/moz_readability/mod.rs b/src/moz_readability/mod.rs index c1e86e2..ab38cdf 100644 --- a/src/moz_readability/mod.rs +++ b/src/moz_readability/mod.rs @@ -1,4 +1,4 @@ -use std::collections::{BTreeMap, HashMap}; +use std::collections::{BTreeMap, HashMap, HashSet}; use std::str::FromStr; use html5ever::{LocalName, Namespace, QualName}; @@ -7,6 +7,7 @@ use kuchiki::{ traits::*, NodeData, NodeRef, }; +use url::Url; const SHARE_ELEMENT_THRESHOLD: usize = 500; const READABILITY_SCORE: &'static str = "readability-score"; @@ -68,13 +69,14 @@ impl Readability { article_dir: None, } } - pub fn parse(&mut self) { + pub fn parse(&mut self, url: &str) { self.unwrap_no_script_tags(); self.remove_scripts(); self.prep_document(); let meta_data = self.get_article_metadata(); self.article_title = meta_data.title.clone(); self.grab_article(); + self.post_process_content(url); } /// Recursively check if node is image, or if node contains exactly only one image @@ -435,8 +437,8 @@ impl Readability { let name_val = name_attr.unwrap(); if regexes::is_match_name_pattern(name_val) { let name = name_val.to_lowercase(); - let name = regexes::REPLACE_WHITESPACE_REGEX.replace(&name, ""); - let name = regexes::REPLACE_DOT_REGEX.replace(&name, ":"); + let name = regexes::REPLACE_WHITESPACE_REGEX.replace_all(&name, ""); + let name = regexes::REPLACE_DOT_REGEX.replace_all(&name, ":"); values.insert(name.to_string(), content.trim().to_string()); } } @@ -590,21 +592,162 @@ impl Readability { } } cur_title = regexes::NORMALIZE_REGEX - 
.replace(cur_title.trim(), " ") + .replace_all(cur_title.trim(), " ") .to_string(); let cur_word_count = word_count(&cur_title); if cur_word_count <= 4 && (!title_had_hierarchical_separators || cur_word_count - != word_count(&regexes::REPLACE_MULTI_SEPARATOR_REGEX.replace(&orig_title, "")) - - 1) + != word_count( + &regexes::REPLACE_MULTI_SEPARATOR_REGEX.replace_all(&orig_title, ""), + ) - 1) { cur_title = orig_title; } cur_title } + /// Removes the class="" attribute from every element in the given subtree, except those that + /// match CLASSES_TO_PRESERVE and the classesToPreserve array from the options object. + fn clean_classes(&mut self) { + // TODO: This should accessed from Self + let classes_to_preserve: HashSet<&str> = HashSet::new(); + if let Some(article_node) = &mut self.article_node { + for elem in article_node.inclusive_descendants().elements() { + let mut elem_attrs = elem.attributes.borrow_mut(); + if let Some(class_list) = elem_attrs.get_mut("class") { + let filtered_class: String = class_list + .split_whitespace() + .filter(|class| classes_to_preserve.contains(class)) + .fold("".to_string(), |acc, x| acc + " " + x); + if filtered_class.is_empty() { + elem_attrs.remove("class"); + } else { + *class_list = filtered_class; + } + } + } + } + } + + /// Converts each <a> and <img> uri in the given element to an absolute URI, ignoring #ref URIs.
+ fn fix_relative_uris(&mut self, document_uri: &str) { + if let Some(article_node) = &mut self.article_node { + let document_uri = + Url::parse(document_uri).expect("Unable to parse the document's URI"); + let base_uri = self + .root_node + .select("base") + .unwrap() + .filter(|node_ref| { + let node_attrs = node_ref.attributes.borrow(); + node_attrs.contains("href") + }) + .map(|node_ref| { + let node_attrs = node_ref.attributes.borrow(); + Url::parse(node_attrs.get("href").unwrap()).unwrap() + }) + .next() + .unwrap_or(document_uri.clone()); + let to_absolute_uri = |uri_str: &str| -> String { + if base_uri == document_uri && uri_str.starts_with("#") { + return uri_str.to_string(); + } + + if let Ok(new_uri) = Url::parse(uri_str) { + if new_uri.has_host() { + return new_uri.to_string(); + } + } else if let Ok(joined_uri) = base_uri.join(uri_str) { + return joined_uri.to_string(); + } + + uri_str.to_string() + }; + let mut links = article_node.select("a").unwrap().filter(|a_ref| { + let link_attrs = a_ref.attributes.borrow(); + link_attrs.contains("href") + }); + let mut link = links.next(); + while let Some(link_ref) = link { + link = links.next(); + let mut link_attrs = link_ref.attributes.borrow_mut(); + let href = link_attrs.get("href").map(|val| val.to_string()).unwrap(); + if href.starts_with("javascript:") { + let link_node = link_ref.as_node(); + if link_node.children().count() == 1 + && link_node + .first_child() + .map(|node_ref| node_ref.as_text().is_some()) + .unwrap() + { + let text_node = NodeRef::new_text(link_node.text_contents()); + link_node.insert_before(text_node); + link_node.detach(); + } else { + let container = NodeRef::new_element( + QualName::new(None, Namespace::from(HTML_NS), LocalName::from("span")), + BTreeMap::new(), + ); + let mut children = link_node.children(); + let mut child = children.next(); + while let Some(child_ref) = child { + child = children.next(); + container.append(child_ref); + } + 
link_node.insert_before(container); + link_node.detach(); + } + } else { + link_attrs.insert("href", to_absolute_uri(&href)); + } + } + let media_nodes = article_node + .select("img, picture, figure, video, audio, source") + .unwrap(); + for media_node in media_nodes { + let mut media_attrs = media_node.attributes.borrow_mut(); + if let Some(src) = media_attrs.get_mut("src") { + *src = to_absolute_uri(&src); + } + + if let Some(poster) = media_attrs.get_mut("poster") { + *poster = to_absolute_uri(&poster); + } + + if let Some(srcset) = media_attrs.get_mut("srcset") { + let new_srcset = regexes::SRCSET_CAPTURE_REGEX.replace_all( + &srcset, + |captures: &regex::Captures| { + to_absolute_uri(&captures[1]) + &captures[2] + &captures[3] + }, + ); + *srcset = new_srcset.to_string(); + } + } + } + } + + /// Removes readability attributes from DOM nodes as they are not needed in the final article + fn clean_readability_attrs(&mut self) { + if let Some(article_node) = &mut self.article_node { + for node in article_node.inclusive_descendants().elements() { + let mut node_attrs = node.attributes.borrow_mut(); + node_attrs.remove(READABILITY_SCORE); + node_attrs.remove("readability-data-table"); + } + } + } + + /// Run any post-process modifications to article content as necessary. + fn post_process_content(&mut self, url: &str) { + self.fix_relative_uris(url); + // TODO: Add flag check + self.clean_classes(); + self.clean_readability_attrs(); + } + /// Converts an inline CSS string to a [HashMap] of property and value(s) fn inline_css_str_to_map(css_str: &str) -> HashMap<&str, &str> { css_str @@ -3462,4 +3605,205 @@ characters. For that reason, this

tag could not be a byline because it's too result.title = "A Longer Title".to_string(); assert_eq!(result, doc.get_article_metadata()); } + + #[test] + fn test_fix_relative_uris() { + let html_str = r##" + + + +

Go back

+ Ex 1 + Ex 2 + Ex 3 + Ex 4 + Ex 5 +

First image

+ + + "##; + let mut doc = Readability::new(html_str); + doc.article_node = doc + .root_node + .select_first("body") + .ok() + .map(|node_ref| node_ref.as_node().clone()); + doc.fix_relative_uris("https://example.image.com/blog/"); + + let node = doc.root_node.select_first("img#ex-1").unwrap(); + let node_attrs = node.attributes.borrow(); + assert_eq!( + Some("https://example.image.com/images/1.jpg"), + node_attrs.get("src") + ); + + let node = doc.root_node.select_first("img#ex-2").unwrap(); + let node_attrs = node.attributes.borrow(); + assert_eq!( + Some("https://example.image.com/images/2.jpg"), + node_attrs.get("src") + ); + + let node = doc.root_node.select_first("img#ex-3").unwrap(); + let node_attrs = node.attributes.borrow(); + assert_eq!( + Some("https://example.image.com/images/2.jpg"), + node_attrs.get("src") + ); + + let node = doc.root_node.select_first("img#ex-4").unwrap(); + let node_attrs = node.attributes.borrow(); + assert_eq!( + Some("https://example.image.com/blog/images/1.jpg"), + node_attrs.get("src") + ); + + let node = doc.root_node.select_first("img#ex-5").unwrap(); + let node_attrs = node.attributes.borrow(); + assert_eq!( + Some("https://images.com/images/1.jpg"), + node_attrs.get("src") + ); + + let node = doc.root_node.select_first("p a").unwrap(); + let node_attrs = node.attributes.borrow(); + assert_eq!(Some("#ex-1"), node_attrs.get("href")); + + let node = doc.root_node.select_first("h1 a").unwrap(); + let node_attrs = node.attributes.borrow(); + assert_eq!( + Some("https://example.image.com/home.html"), + node_attrs.get("href") + ); + } + + #[test] + fn test_clean_classes() { + // TODO: This test will later be edited to ensure it checks to only remove certain classes + let html_str = r#" + + + +

One

+

Two

+
Three
+
Four
+ + + + "#; + let mut doc = Readability::new(html_str); + doc.article_node = doc + .root_node + .select_first("body") + .ok() + .map(|node_ref| node_ref.as_node().clone()); + doc.clean_classes(); + + assert_eq!( + true, + doc.root_node + .inclusive_descendants() + .elements() + .all(|node_elem| { + let node_attrs = node_elem.attributes.borrow(); + !node_attrs.contains("class") + }) + ); + } + + #[test] + fn test_clean_readability_attrs() { + let html_str = r#" + + + +
+

Welcome to this awesome blog post. Only good content is here. No spam.

+

Let's look at some statistics

+ + + + + + + + + + + + + + +
Monthly savings
MonthSavings
January$100
February$50
+
+ + + "#; + let mut doc = Readability::new(html_str); + doc.article_node = doc + .root_node + .select_first("body") + .ok() + .map(|node_ref| node_ref.as_node().clone()); + doc.clean_readability_attrs(); + assert_eq!( + true, + doc.root_node + .inclusive_descendants() + .elements() + .all(|node| { + let node_attrs = node.attributes.borrow(); + node_attrs.map.len() == 0 + }) + ); + } + + #[test] + fn test_post_process_content() { + let html_str = r##" + + + +

One

+

Two

+
Three
+
+ +
+ + + + "##; + let mut doc = Readability::new(html_str); + doc.article_node = doc + .root_node + .select_first("body") + .ok() + .map(|node_ref| node_ref.as_node().clone()); + doc.post_process_content("https://foo.blog/post/"); + let has_class_attr = doc + .root_node + .inclusive_descendants() + .elements() + .any(|node_ref| { + let attrs = node_ref.attributes.borrow(); + attrs.contains("class") + }); + assert_eq!(false, has_class_attr); + let a_node = doc.root_node.select_first("a").unwrap(); + let a_node_attrs = a_node.attributes.borrow(); + assert_eq!(Some("#home"), a_node_attrs.get("href")); + let img_node = doc.root_node.select_first("img").unwrap(); + let img_attrs = img_node.attributes.borrow(); + assert_eq!(Some("https://foo.blog/post/img.jpg"), img_attrs.get("src")); + } } diff --git a/src/moz_readability/regexes.rs b/src/moz_readability/regexes.rs index 1a49c77..01b7e9e 100644 --- a/src/moz_readability/regexes.rs +++ b/src/moz_readability/regexes.rs @@ -119,6 +119,8 @@ lazy_static! { r"(?i)\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|site_name)\s*" ) .unwrap(); + pub static ref SRCSET_CAPTURE_REGEX: Regex = + Regex::new(r"(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))").unwrap(); pub static ref REPLACE_WHITESPACE_REGEX: Regex = Regex::new(r"\s").unwrap(); pub static ref REPLACE_DOT_REGEX: Regex = Regex::new(r"\.").unwrap(); pub static ref REPLACE_HTML_ESCAPE_REGEX: Regex =