From b661211f0f3d4e6d84d3c34452b0f98dfffb1af4 Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Thu, 15 Oct 2020 22:21:21 +0300 Subject: [PATCH] Refactored code to use regexes from regexes module Extracted constants from the code for easier reusability in some cases. Change select queries for multiple elements to use the `,` operator instead of calling `chain`. Remove check for "null" in `fix_lazy_images`. This mitigates a JSOM issue so it doesn't affect the Rust code in any way. --- src/moz_readability/mod.rs | 299 ++++++++++++++++++------------------- 1 file changed, 148 insertions(+), 151 deletions(-) diff --git a/src/moz_readability/mod.rs b/src/moz_readability/mod.rs index 81585de..eb68219 100644 --- a/src/moz_readability/mod.rs +++ b/src/moz_readability/mod.rs @@ -8,15 +8,41 @@ use kuchiki::{ traits::*, NodeData, NodeRef, }; -use regex::Regex; +const SHARE_ELEMENT_THRESHOLD: usize = 500; +const READABILITY_SCORE: &'static str = "readability-score"; const HTML_NS: &'static str = "http://www.w3.org/1999/xhtml"; +// TODO: Change to HashSet const PHRASING_ELEMS: [&str; 39] = [ "abbr", "audio", "b", "bdo", "br", "button", "cite", "code", "data", "datalist", "dfn", "em", "embed", "i", "img", "input", "kbd", "label", "mark", "math", "meter", "noscript", "object", "output", "progress", "q", "ruby", "samp", "script", "select", "small", "span", "strong", "sub", "sup", "textarea", "time", "var", "wbr", ]; +// TODO: Change to HashSet +const DEFAULT_TAGS_TO_SCORE: [&str; 9] = + ["section", "h2", "h3", "h4", "h5", "h6", "p", "td", "pre"]; +// TODO: Change to HashSet +const ALTER_TO_DIV_EXCEPTIONS: [&str; 4] = ["div", "article", "section", "p"]; +const PRESENTATIONAL_ATTRIBUTES: [&str; 12] = [ + "align", + "background", + "bgcolor", + "border", + "cellpadding", + "cellspacing", + "frame", + "hspace", + "rules", + "style", + "valign", + "vspace", +]; + +const DATA_TABLE_DESCENDANTS: [&str; 5] = ["col", "colgroup", "tfoot", "thead", "th"]; +// TODO: Change to HashSet +const DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [&str; 5] = ["table", "th", "td", "hr", "pre"]; + mod regexes; pub struct Readability { @@ -85,15 +111,11 @@ impl Readability { imgs.filter(|img_node_ref| { let img_attrs = img_node_ref.attributes.borrow(); !img_attrs.map.iter().any(|(name, attr)| { - // TODO: Replace with regex &name.local == "src" || &name.local == "srcset" || &name.local == "data-src" || &name.local == "data-srcset" - || attr.value.ends_with(".jpg") - || attr.value.ends_with(".jpeg") - || attr.value.ends_with(".png") - || attr.value.ends_with(".webp") + || regexes::is_match_img_ext(&attr.value) }) }) .for_each(|img_ref| img_ref.as_node().detach()); @@ -130,11 +152,7 @@ impl Readability { !val.value.trim().is_empty() && (&attr.local == "src" || &attr.local == "srcset" - // TODO: Replace with regex - || val.value.ends_with(".jpg") - || val.value.ends_with(".jpeg") - || val.value.ends_with(".png") - || val.value.ends_with(".webp")) + || regexes::is_match_img_ext(&val.value)) }); for (prev_attr, prev_value) in prev_attrs { match new_img.attributes.borrow().get(&prev_attr.local) { @@ -405,19 +423,16 @@ impl Readability { let elem_attrs = elem_data.attributes.borrow(); let rel_attr = elem_attrs.get("rel"); let itemprop_attr = elem_attrs.get("itemprop"); - let byline_regex = Regex::new(r"(?i)byline|author|dateline|writtenby|p-author") - .expect("Unable to create byline_regex"); let is_byline = (if rel_attr.is_some() { rel_attr.unwrap() == "author" } else if itemprop_attr.is_some() { itemprop_attr.unwrap().contains("author") } else { - byline_regex.is_match(match_string) + regexes::is_match_byline(match_string) }) && Self::is_valid_byline(&node_ref.text_contents()); if is_byline { self.byline = Some(node_ref.text_contents().trim().to_owned()); } - dbg!(is_byline); is_byline } else { false @@ -513,9 +528,7 @@ impl Readability { } !node_ref.children().any(|node| { node.as_text().is_some() - && Regex::new(r"\S$") - .unwrap() - .is_match(&node.text_contents().trim_end()) + && regexes::is_match_has_content(&node.text_contents().trim_end()) }) } @@ -523,9 +536,8 @@ impl Readability { let will_normalize = normalize_spaces.unwrap_or(true); let text = node_ref.text_contents(); let text = text.trim(); - let normalize_regex = Regex::new(r"\s{2,}").unwrap(); if will_normalize { - return normalize_regex.replace_all(&text, " ").to_string(); + return regexes::NORMALIZE_REGEX.replace_all(&text, " ").to_string(); } text.to_owned() } @@ -603,15 +615,13 @@ impl Readability { fn get_class_weight(node_ref: &NodeRef) -> i32 { //TODO: Add check for weighing classes let mut weight = 0; - let positive_regex = Regex::new(r"(?i)article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story").unwrap(); - let negative_regex = Regex::new(r"(?i)hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget").unwrap(); let node_elem = node_ref.as_element().unwrap(); let node_attrs = node_elem.attributes.borrow(); if let Some(id) = node_attrs.get("id") { if !id.trim().is_empty() { - weight = if positive_regex.is_match(id) { + weight = if regexes::is_match_positive(id) { weight + 25 - } else if negative_regex.is_match(id) { + } else if regexes::is_match_negative(id) { weight - 25 } else { weight @@ -620,9 +630,9 @@ impl Readability { } if let Some(class) = node_attrs.get("class") { if !class.trim().is_empty() { - weight = if positive_regex.is_match(class) { + weight = if regexes::is_match_positive(class) { weight + 25 - } else if negative_regex.is_match(class) { + } else if regexes::is_match_negative(class) { weight - 25 } else { weight @@ -641,8 +651,8 @@ impl Readability { // should not also be mutably borrowed score += Self::get_class_weight(node_ref); let mut elem_attrs = element.attributes.borrow_mut(); - elem_attrs.insert("readability-score", score.to_string()); - let readability = elem_attrs.get_mut("readability-score"); + elem_attrs.insert(READABILITY_SCORE, score.to_string()); + let readability = elem_attrs.get_mut(READABILITY_SCORE); match &*element.name.local { "div" => score += 5, "pre" | "td" | "blockquote" => score += 3, @@ -717,8 +727,8 @@ impl Readability { continue; } } - let data_table_descendants = vec!["col", "colgroup", "tfoot", "thead", "th"]; - if data_table_descendants + + if DATA_TABLE_DESCENDANTS .iter() .any(|tag_name| table_node.select_first(tag_name).is_ok()) { @@ -750,35 +760,27 @@ impl Readability { /// Convert images and figures that have properties like data-src into images that can be loaded without JS fn fix_lazy_images(node_ref: &mut NodeRef) { - let imgs = node_ref.select("img").unwrap(); - let pictures = node_ref.select("picture").unwrap(); - let figures = node_ref.select("figure").unwrap(); - let regex = Regex::new(r"(?i)^data:\s*([^\s;,]+)\s*;\s*base64\s*").unwrap(); - let nodes = imgs.chain(pictures).chain(figures); + let nodes = node_ref.select("img, picture, figure").unwrap(); for node in nodes { let mut node_attr = node.attributes.borrow_mut(); if let Some(src) = node_attr.get("src") { - let src_captures = regex.captures(src); + let src_captures = regexes::B64_DATA_URL_REGEX.captures(src); if src_captures.is_some() { let svg_capture = src_captures.unwrap().get(1); if svg_capture.is_some() && svg_capture.unwrap().as_str() == "image/svg+xml" { continue; } - let svg_could_be_removed = node_attr + let src_could_be_removed = node_attr .map .iter() .filter(|(name, _)| &name.local != "src") - .filter(|(_, val)| { - let regex = Regex::new(r"(?i)\.(jpg|jpeg|png|webp)").unwrap(); - regex.is_match(&val.value) - }) + .filter(|(_, val)| regexes::is_match_img_ext(&val.value)) .count() > 0; - if svg_could_be_removed { - let base64_regex = Regex::new(r"(?i)base64\s*").unwrap(); - let b64_start = base64_regex.find(src).unwrap().start(); + if src_could_be_removed { + let b64_start = regexes::BASE64_REGEX.find(src).unwrap().start(); let b64_length = src.len() - b64_start; if b64_length < 133 { node_attr.remove("src"); @@ -789,7 +791,7 @@ impl Readability { let src = node_attr.get("src"); let srcset = node_attr.get("srcset"); let class = node_attr.get("class"); - if (src.is_some() || (srcset.is_some() && srcset.unwrap() != "null")) + if (src.is_some() || srcset.is_some()) && class.is_some() && !class.unwrap().contains("lazy") { @@ -803,11 +805,9 @@ impl Readability { .filter(|(key, _)| !(&key.local == "src" || &key.local == "srcset")) .for_each(|(_, val)| { let mut copy_to = ""; - let srcset_regex = Regex::new(r"\.(jpg|jpeg|png|webp)\s+\d").unwrap(); - let src_regex = Regex::new(r"^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$").unwrap(); - if srcset_regex.is_match(&val.value) { + if regexes::is_match_srcset(&val.value) { copy_to = "srcset"; - } else if src_regex.is_match(&val.value) { + } else if regexes::is_match_src_regex(&val.value) { copy_to = "src"; } if copy_to.len() > 0 { @@ -818,9 +818,8 @@ impl Readability { } } else if tag_name == "figure" { let node_ref = node.as_node(); - let imgs = node_ref.select("img").unwrap(); - let pictures = node_ref.select("picture").unwrap(); - if imgs.chain(pictures).count() > 0 { + let img_picture_nodes = node_ref.select("img, picture").unwrap(); + if img_picture_nodes.count() > 0 { let img = NodeRef::new_element( QualName::new( None, @@ -872,124 +871,125 @@ impl Readability { Some(-1), Some(is_data_table), ) - }) - .map(|node_data_ref|{ - let weight = Self::get_class_weight(node_data_ref.as_node()); - (node_data_ref,weight) - }) - .filter(|(_, weight)| weight < &0) - .filter(|(node_data_ref,_)| get_char_count(node_data_ref.as_node()) < 10) - .filter(|(node_data_ref,_)|{ - let embed_tags = vec!["object", "embed", "iframe"]; - let mut embeds = node_data_ref - .as_node() - .select(embed_tags.join(",").as_str()) - .unwrap(); - if embed_tags.contains(&&*node_data_ref.name.local) { - embeds.next(); - } - let videos_regex = Regex::new(r"(?i)\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)").unwrap(); - !(embeds.any(|node| &node.name.local == "object") || embeds.any(|node_data_ref| { - let attrs = node_data_ref.attributes.borrow(); - !attrs.map.iter().any(|(key,_)|videos_regex.is_match(&key.local)) - })) - }) - .for_each(|(node_data_ref, weight)| { - let node = node_data_ref.as_node(); + }); + let mut next_node = nodes.next(); + while let Some(node_data_ref) = next_node { + next_node = nodes.next(); + let node = node_data_ref.as_node(); + let weight = Self::get_class_weight(node); + // Remove all elements with negative class weights + if weight < 0 { + node.detach(); + continue; + } - let mut p_nodes = node_data_ref.as_node().select("p").unwrap().count(); - let mut img_nodes = node_data_ref.as_node().select("img").unwrap().count(); - let mut li_nodes = node_data_ref.as_node().select("li").unwrap().count(); - let mut input_nodes = node_data_ref.as_node().select("input").unwrap().count(); + if get_char_count(node) >= 10 { + continue; + } + let mut embeds = node_data_ref + .as_node() + .select("object, embed, iframe") + .unwrap(); + let can_skip_embed = embeds.any(|node_data_ref| { + &node_data_ref.name.local == "object" || { + let attrs = node_data_ref.attributes.borrow(); - match node_name.as_ref() { - "p" => p_nodes -= 1, - "img" =>img_nodes -= 1, - "li" => li_nodes -= 1, - "input" => input_nodes -= 1, - _ => () - } - - let p = p_nodes as f32; - let img = img_nodes as f32; - - let embed_count = node.select("object, embed, iframe").unwrap().count(); - let link_density = Self::get_link_density(node); - let content_length = Self::get_inner_text(node, None).len(); - let has_figure_ancestor = Self::has_ancestor_tag(node, "figure", None, None); - let have_to_remove = (img_nodes > 1 && p /img < 0.5 && !has_figure_ancestor) || - (!is_list && li_nodes > p_nodes) || (input_nodes > (p_nodes / 3)) || - (!is_list && content_length < 25 && (img_nodes == 0 || img_nodes > 2) && !has_figure_ancestor) || - (!is_list && weight < 25 && link_density > 0.2) || (weight >= 25 && link_density > 0.5) || - ((embed_count == 1 && content_length < 75) || embed_count > 1); - if have_to_remove { - node.detach(); + attrs + .map + .iter() + .any(|(_, val)| regexes::is_match_videos(&val.value)) } }); + if can_skip_embed { + continue; + } + + let p_nodes = node_data_ref.as_node().select("p").unwrap().count(); + let img_nodes = node_data_ref.as_node().select("img").unwrap().count(); + let li_nodes = node_data_ref.as_node().select("li").unwrap().count() as i32 - 100; + let input_nodes = node_data_ref.as_node().select("input").unwrap().count(); + + let p = p_nodes as f32; + let img = img_nodes as f32; + + let embed_count = node.select("object, embed, iframe").unwrap().count(); + let link_density = Self::get_link_density(node); + let content_length = Self::get_inner_text(node, None).len(); + let has_figure_ancestor = Self::has_ancestor_tag(node, "figure", None, None); + let have_to_remove = (img_nodes > 1 && p / img < 0.5 && !has_figure_ancestor) + || (!is_list && li_nodes > p_nodes as i32) + || (input_nodes > (p_nodes / 3)) + || (!is_list + && content_length < 25 + && (img_nodes == 0 || img_nodes > 2) + && !has_figure_ancestor) + || (!is_list && weight < 25 && link_density > 0.2) + || (weight >= 25 && link_density > 0.5) + || ((embed_count == 1 && content_length < 75) || embed_count > 1); + if have_to_remove { + node.detach(); + } + } } /// Clean a node of all elements of type "tag". (Unless it's a YouTube or Vimeo video) fn clean(node_ref: &mut NodeRef, tag_name: &str) { + // Can be changed to a HashSet let is_embed = vec!["object", "embed", "iframe"].contains(&tag_name); let mut nodes = node_ref.select(tag_name).unwrap(); - let videos_regex = Regex::new(r"(?i)\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)").unwrap(); - if &node_ref.as_element().unwrap().name.local == tag_name { + if node_ref.as_element().is_some() && &node_ref.as_element().unwrap().name.local == tag_name + { nodes.next(); } - nodes - .filter(|node_data_ref| { - !is_embed - || { - let attrs = node_data_ref.attributes.borrow(); - !attrs - .map - .iter() - .any(|(key, _)| videos_regex.is_match(&key.local)) - } - || &node_data_ref.name.local == "object" - }) - .for_each(|node_data_ref| node_data_ref.as_node().detach()); + let mut nodes = nodes.filter(|node_data_ref| { + !is_embed + || { + let attrs = node_data_ref.attributes.borrow(); + !attrs + .map + .iter() + .any(|(_, val)| regexes::is_match_videos(&val.value)) + } + || &node_data_ref.name.local == "object" // This currently does not check the innerHTML. + }); + let mut node = nodes.next(); + while let Some(node_data_ref) = node { + node = nodes.next(); + node_data_ref.as_node().detach() + } } /// Clean out spurious headers from an Element. Checks things like classnames and link density. fn clean_headers(node_ref: &mut NodeRef) { - let mut nodes = node_ref.select("h1,h2").unwrap(); - - if vec!["h1", "h2"].contains(&node_ref.as_element().unwrap().name.local.as_ref()) { + let mut nodes = node_ref + .select("h1, h2") + .unwrap() + .filter(|node_data_ref| Self::get_class_weight(node_data_ref.as_node()) < 0); + if node_ref.as_element().is_some() + && vec!["h1", "h2"].contains(&node_ref.as_element().unwrap().name.local.as_ref()) + { nodes.next(); } - nodes - .filter(|node_data_ref| Self::get_class_weight(node_data_ref.as_node()) < 0) - .for_each(|node_data_ref| node_data_ref.as_node().detach()); + let mut node = nodes.next(); + + while let Some(node_data_ref) = node { + node = nodes.next(); + node_data_ref.as_node().detach(); + } } /// Remove the style attribute on every element and descendants. fn clean_styles(node_ref: &mut NodeRef) { - let presentational_attributes = vec![ - "align", - "background", - "bgcolor", - "border", - "cellpadding", - "cellspacing", - "frame", - "hspace", - "rules", - "style", - "valign", - "vspace", - ]; - let deprecated_size_attribute_elems = vec!["table", "th", "td", "hr", "pre"]; node_ref .inclusive_descendants() .elements() .filter(|node| &node.name.local != "svg") .for_each(|node_data_ref| { let mut attrs = node_data_ref.attributes.borrow_mut(); - presentational_attributes.iter().for_each(|pres_attr| { + PRESENTATIONAL_ATTRIBUTES.iter().for_each(|pres_attr| { attrs.remove(*pres_attr); }); - if deprecated_size_attribute_elems.contains(&node_data_ref.name.local.as_ref()) { + if DEPRECATED_SIZE_ATTRIBUTE_ELEMS.contains(&node_data_ref.name.local.as_ref()) { attrs.remove("width"); attrs.remove("height"); } @@ -1026,13 +1026,10 @@ impl Readability { Self::clean(node_ref, "link"); Self::clean(node_ref, "aside"); - // TODO: Extract as constant - let share_element_threshold = 500; - let regex = Regex::new(r"(\b|_)(share|sharedaddy)(\b|_)").unwrap(); - node_ref.children().for_each(|mut node| { Self::clean_matched_nodes(&mut node, |node: &NodeRef, match_string| { - regex.is_match(match_string) && node.text_contents().len() < share_element_threshold + regexes::is_match_share_elems(match_string) + && node.text_contents().len() < SHARE_ELEMENT_THRESHOLD }); }); @@ -1554,7 +1551,7 @@ impl Readability { #[cfg(test)] mod test { - use super::{Readability, SizeInfo, HTML_NS}; + use super::{Readability, SizeInfo, HTML_NS, READABILITY_SCORE}; use html5ever::{LocalName, Namespace, QualName}; use kuchiki::traits::*; use kuchiki::NodeRef; @@ -2348,31 +2345,31 @@ characters. For that reason, this

tag could not be a byline because it's too let mut node = target.as_node().clone(); Readability::initialize_node(&mut node); let node_attrs = node.as_element().unwrap().attributes.borrow(); - assert_eq!(Some("55"), node_attrs.get("readability-score")); + assert_eq!(Some("55"), node_attrs.get(READABILITY_SCORE)); target = doc.root_node.select_first("h1.hidden").unwrap(); let mut node = target.as_node().clone(); Readability::initialize_node(&mut node); let node_attrs = node.as_element().unwrap().attributes.borrow(); - assert_eq!(Some("-30"), node_attrs.get("readability-score")); + assert_eq!(Some("-30"), node_attrs.get(READABILITY_SCORE)); target = doc.root_node.select_first("p#story").unwrap(); let mut node = target.as_node().clone(); Readability::initialize_node(&mut node); let node_attrs = node.as_element().unwrap().attributes.borrow(); - assert_eq!(Some("25"), node_attrs.get("readability-score")); + assert_eq!(Some("25"), node_attrs.get(READABILITY_SCORE)); target = doc.root_node.select_first("div#comments").unwrap(); let mut node = target.as_node().clone(); Readability::initialize_node(&mut node); let node_attrs = node.as_element().unwrap().attributes.borrow(); - assert_eq!(Some("-20"), node_attrs.get("readability-score")); + assert_eq!(Some("-20"), node_attrs.get(READABILITY_SCORE)); target = doc.root_node.select_first("pre.comment").unwrap(); let mut node = target.as_node().clone(); Readability::initialize_node(&mut node); let node_attrs = node.as_element().unwrap().attributes.borrow(); - assert_eq!(Some("-22"), node_attrs.get("readability-score")); + assert_eq!(Some("-22"), node_attrs.get(READABILITY_SCORE)); } #[test]