From a94798cc95dcb0f58c8232a77d1efa7e9d7cc69f Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Thu, 22 Oct 2020 08:24:46 +0300 Subject: [PATCH] Add flags for conditional cleaning and removal of nodes This also includes updating the function signatures of the affected methods --- src/moz_readability/mod.rs | 136 ++++++++++++++++++++++++++----------- 1 file changed, 98 insertions(+), 38 deletions(-) diff --git a/src/moz_readability/mod.rs b/src/moz_readability/mod.rs index 4e4486f..32d6ab0 100644 --- a/src/moz_readability/mod.rs +++ b/src/moz_readability/mod.rs @@ -10,6 +10,9 @@ use kuchiki::{ use url::Url; const DEFAULT_CHAR_THRESHOLD: usize = 500; +const FLAG_STRIP_UNLIKELYS: u32 = 0x1; +const FLAG_WEIGHT_CLASSES: u32 = 0x2; +const FLAG_CLEAN_CONDITIONALLY: u32 = 0x4; const READABILITY_SCORE: &'static str = "readability-score"; const HTML_NS: &'static str = "http://www.w3.org/1999/xhtml"; // TODO: Change to HashSet @@ -51,6 +54,7 @@ pub struct Readability { article_title: String, pub article_node: Option, article_dir: Option, + flags: u32, } #[derive(Debug, PartialEq)] @@ -67,6 +71,7 @@ impl Readability { article_title: "".into(), article_node: None, article_dir: None, + flags: FLAG_STRIP_UNLIKELYS | FLAG_WEIGHT_CLASSES | FLAG_CLEAN_CONDITIONALLY, } } pub fn parse(&mut self, url: &str) { @@ -992,8 +997,10 @@ impl Readability { /// Get an element's class/id weight using regular expressions to tell if this /// element looks good or bad. - fn get_class_weight(node_ref: &NodeRef) -> i32 { - //TODO: Add check for weighing classes + fn get_class_weight(&self, node_ref: &NodeRef) -> i32 { + if !self.flag_is_active(FLAG_WEIGHT_CLASSES) { + return 0; + } let mut weight = 0; let node_elem = node_ref.as_element().unwrap(); let node_attrs = node_elem.attributes.borrow(); @@ -1024,12 +1031,12 @@ impl Readability { /// Initialize a node with the readability attribute. Also checks the /// className/id for special names to add to its score. - fn initialize_node(node_ref: &mut NodeRef) { + fn initialize_node(&self, node_ref: &mut NodeRef) { if let Some(element) = node_ref.as_element() { let mut score = 0.0; // This must be computed first because it borrows the NodeRef which // should not also be mutably borrowed - score += Self::get_class_weight(node_ref) as f32; + score += self.get_class_weight(node_ref) as f32; let mut elem_attrs = element.attributes.borrow_mut(); elem_attrs.insert(READABILITY_SCORE, score.to_string()); let readability = elem_attrs.get_mut(READABILITY_SCORE); @@ -1222,8 +1229,10 @@ impl Readability { /// Clean an element of all tags of type "tag" if they look fishy. "Fishy" is an algorithm /// based on content length, classnames, link density, number of images & embeds, etc. - fn clean_conditionally(node_ref: &mut NodeRef, tag_name: &str) { - // TODO: Add flag check + fn clean_conditionally(&self, node_ref: &mut NodeRef, tag_name: &str) { + if !self.flag_is_active(FLAG_CLEAN_CONDITIONALLY) { + return; + } let is_list = tag_name == "ul" || tag_name == "ol"; let is_data_table = |node_ref: &NodeRef| { let node_elem = node_ref.as_element().unwrap(); @@ -1253,7 +1262,7 @@ impl Readability { while let Some(node_data_ref) = next_node { next_node = nodes.next(); let node = node_data_ref.as_node(); - let weight = Self::get_class_weight(node); + let weight = self.get_class_weight(node); // Remove all elements with negative class weights if weight < 0 { node.detach(); @@ -1336,12 +1345,12 @@ impl Readability { } /// Clean out spurious headers from an Element. Checks things like classnames and link density. - fn clean_headers(node_ref: &mut NodeRef) { + fn clean_headers(&self, node_ref: &mut NodeRef) { let mut nodes = node_ref .descendants() .select("h1, h2") .unwrap() - .filter(|node_data_ref| Self::get_class_weight(node_data_ref.as_node()) < 0); + .filter(|node_data_ref| self.get_class_weight(node_data_ref.as_node()) < 0); let mut node = nodes.next(); while let Some(node_data_ref) = node { @@ -1391,8 +1400,8 @@ impl Readability { Self::clean_styles(node_ref); self.mark_data_tables(); Self::fix_lazy_images(node_ref); - Self::clean_conditionally(node_ref, "form"); - Self::clean_conditionally(node_ref, "fieldset"); + self.clean_conditionally(node_ref, "form"); + self.clean_conditionally(node_ref, "fieldset"); Self::clean(node_ref, "object"); Self::clean(node_ref, "embed"); Self::clean(node_ref, "h1"); @@ -1430,11 +1439,11 @@ impl Readability { Self::clean(node_ref, "textarea"); Self::clean(node_ref, "select"); Self::clean(node_ref, "button"); - Self::clean_headers(node_ref); + self.clean_headers(node_ref); - Self::clean_conditionally(node_ref, "table"); - Self::clean_conditionally(node_ref, "ul"); - Self::clean_conditionally(node_ref, "div"); + self.clean_conditionally(node_ref, "table"); + self.clean_conditionally(node_ref, "ul"); + self.clean_conditionally(node_ref, "div"); let mut p_nodes = node_ref.select("p").unwrap().filter(|node_data_ref| { let p_node = node_data_ref.as_node(); @@ -1499,6 +1508,14 @@ impl Readability { } } + fn flag_is_active(&self, flag: u32) -> bool { + self.flags & flag > 0 + } + + fn remove_flag(&mut self, flag: u32) { + self.flags = self.flags & !flag; + } + /// Using a variety of metrics (content score, classname, element types), find the content that is most likely to be the stuff /// a user wants to read. Then return it wrapped up in a div. fn grab_article(&mut self) { @@ -1524,8 +1541,7 @@ impl Readability { loop { // var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS); - // TODO: Add flag for checking this - let strip_unlikely_candidates = true; + let strip_unlikely_candidates = self.flag_is_active(FLAG_STRIP_UNLIKELYS); // // First, node prepping. Trash nodes that look cruddy (like ones with the // // class name "comment", etc), and turn divs into P tags where they have been @@ -1675,7 +1691,7 @@ impl Readability { ancestor_attrs.contains(READABILITY_SCORE) }; if !has_readability { - Self::initialize_node(&mut ancestor); + self.initialize_node(&mut ancestor); candidates.push(ancestor.clone()); } @@ -1754,7 +1770,7 @@ impl Readability { top_candidate.append(child_node); }); page.as_node().append(top_candidate.clone()); - Self::initialize_node(&mut top_candidate); + self.initialize_node(&mut top_candidate); } else { let alternative_candidate_ancestors: Vec>; top_candidate = top_candidates.get(0).unwrap().clone(); @@ -1813,7 +1829,7 @@ impl Readability { }; if top_candidate_readability.is_none() { - Self::initialize_node(&mut top_candidate); + self.initialize_node(&mut top_candidate); } parent_of_top_candidate = top_candidate.parent().unwrap(); @@ -1873,7 +1889,7 @@ impl Readability { .map(|score| score.to_string()) }; if top_candidate_readability.is_none() { - Self::initialize_node(&mut top_candidate); + self.initialize_node(&mut top_candidate); } } let mut article_content = NodeRef::new_element( @@ -1985,9 +2001,16 @@ impl Readability { let text_length = Self::get_inner_text(&article_content, Some(true)).len(); let mut parse_successful = true; if text_length < DEFAULT_CHAR_THRESHOLD { - // TODO Add flag checks parse_successful = false; - println!("I haz a smol content. Plz run me again"); + if self.flag_is_active(FLAG_STRIP_UNLIKELYS) { + self.remove_flag(FLAG_STRIP_UNLIKELYS); + } else if self.flag_is_active(FLAG_WEIGHT_CLASSES) { + self.remove_flag(FLAG_WEIGHT_CLASSES); + } else if self.flag_is_active(FLAG_CLEAN_CONDITIONALLY) { + self.remove_flag(FLAG_CLEAN_CONDITIONALLY); + } else { + parse_successful = true; + } } if parse_successful { let parent_ancestors = Self::get_node_ancestors(&parent_of_top_candidate, None); @@ -2037,7 +2060,10 @@ impl MetaData { #[cfg(test)] mod test { - use super::{MetaData, Readability, SizeInfo, HTML_NS, READABILITY_SCORE}; + use super::{ + MetaData, Readability, SizeInfo, FLAG_CLEAN_CONDITIONALLY, FLAG_STRIP_UNLIKELYS, + FLAG_WEIGHT_CLASSES, HTML_NS, READABILITY_SCORE, + }; use html5ever::{LocalName, Namespace, QualName}; use kuchiki::traits::*; use kuchiki::NodeRef; @@ -2793,22 +2819,22 @@ characters. For that reason, this

tag could not be a byline because it's too "#; let doc = Readability::new(html_str); let mut target = doc.root_node.select_first("body").unwrap(); - assert_eq!(0, Readability::get_class_weight(target.as_node())); + assert_eq!(0, doc.get_class_weight(target.as_node())); target = doc.root_node.select_first("div#blog").unwrap(); - assert_eq!(50, Readability::get_class_weight(target.as_node())); + assert_eq!(50, doc.get_class_weight(target.as_node())); target = doc.root_node.select_first("h1.hidden").unwrap(); - assert_eq!(-25, Readability::get_class_weight(target.as_node())); + assert_eq!(-25, doc.get_class_weight(target.as_node())); target = doc.root_node.select_first("p#story").unwrap(); - assert_eq!(25, Readability::get_class_weight(target.as_node())); + assert_eq!(25, doc.get_class_weight(target.as_node())); target = doc.root_node.select_first("div#comments").unwrap(); - assert_eq!(-25, Readability::get_class_weight(target.as_node())); + assert_eq!(-25, doc.get_class_weight(target.as_node())); target = doc.root_node.select_first("p.comment").unwrap(); - assert_eq!(-25, Readability::get_class_weight(target.as_node())); + assert_eq!(-25, doc.get_class_weight(target.as_node())); } #[test] @@ -2831,31 +2857,31 @@ characters. For that reason, this

tag could not be a byline because it's too let doc = Readability::new(html_str); let mut target = doc.root_node.select_first("div#blog").unwrap(); let mut node = target.as_node().clone(); - Readability::initialize_node(&mut node); + doc.initialize_node(&mut node); let node_attrs = node.as_element().unwrap().attributes.borrow(); assert_eq!(Some("55"), node_attrs.get(READABILITY_SCORE)); target = doc.root_node.select_first("h1.hidden").unwrap(); let mut node = target.as_node().clone(); - Readability::initialize_node(&mut node); + doc.initialize_node(&mut node); let node_attrs = node.as_element().unwrap().attributes.borrow(); assert_eq!(Some("-30"), node_attrs.get(READABILITY_SCORE)); target = doc.root_node.select_first("p#story").unwrap(); let mut node = target.as_node().clone(); - Readability::initialize_node(&mut node); + doc.initialize_node(&mut node); let node_attrs = node.as_element().unwrap().attributes.borrow(); assert_eq!(Some("25"), node_attrs.get(READABILITY_SCORE)); target = doc.root_node.select_first("div#comments").unwrap(); let mut node = target.as_node().clone(); - Readability::initialize_node(&mut node); + doc.initialize_node(&mut node); let node_attrs = node.as_element().unwrap().attributes.borrow(); assert_eq!(Some("-20"), node_attrs.get(READABILITY_SCORE)); target = doc.root_node.select_first("pre.comment").unwrap(); let mut node = target.as_node().clone(); - Readability::initialize_node(&mut node); + doc.initialize_node(&mut node); let node_attrs = node.as_element().unwrap().attributes.borrow(); assert_eq!(Some("-22"), node_attrs.get(READABILITY_SCORE)); } @@ -3108,14 +3134,14 @@ characters. For that reason, this

tag could not be a byline because it's too let mut doc = Readability::new(html_str); let body = doc.root_node.select_first("body").unwrap(); doc.mark_data_tables(); - Readability::clean_conditionally(&mut body.as_node().clone(), "table"); + doc.clean_conditionally(&mut body.as_node().clone(), "table"); assert_eq!(true, doc.root_node.select_first("#data-table").is_ok()); assert_eq!(false, doc.root_node.select_first("#display-table").is_ok()); assert_eq!( false, doc.root_node.select_first("#display-table-removed").is_ok() ); - Readability::clean_conditionally(&mut body.as_node().clone(), "div"); + doc.clean_conditionally(&mut body.as_node().clone(), "div"); assert_eq!(false, doc.root_node.select_first("div.comment").is_ok()); assert_eq!(true, doc.root_node.select_first("div#some-content").is_ok()); assert_eq!(true, doc.root_node.select_first("div#embeds").is_ok()); @@ -3174,7 +3200,7 @@ characters. For that reason, this

tag could not be a byline because it's too let h2_count = doc.root_node.select("h2").unwrap().count(); assert_eq!(2, h1_count); assert_eq!(1, h2_count); - Readability::clean_headers(&mut body.as_node().clone()); + doc.clean_headers(&mut body.as_node().clone()); let h1_count = doc.root_node.select("h1").unwrap().count(); let h2_count = doc.root_node.select("h2").unwrap().count(); assert_eq!(0, h1_count); @@ -3810,4 +3836,38 @@ characters. For that reason, this

tag could not be a byline because it's too let img_attrs = img_node.attributes.borrow(); assert_eq!(Some("https://foo.blog/post/img.jpg"), img_attrs.get("src")); } + + #[test] + fn test_flag_is_active() { + let html_str = r" + + + + + + "; + let doc = Readability::new(html_str); + assert_eq!(true, doc.flag_is_active(FLAG_STRIP_UNLIKELYS)); + assert_eq!(true, doc.flag_is_active(FLAG_WEIGHT_CLASSES)); + assert_eq!(true, doc.flag_is_active(FLAG_CLEAN_CONDITIONALLY)); + } + + #[test] + fn test_remove_flag() { + let html_str = r" + + + + + + "; + let mut doc = Readability::new(html_str); + assert_eq!(true, doc.flag_is_active(FLAG_CLEAN_CONDITIONALLY)); + doc.remove_flag(FLAG_CLEAN_CONDITIONALLY); + assert_eq!(false, doc.flag_is_active(FLAG_CLEAN_CONDITIONALLY)); + assert_eq!(true, doc.flag_is_active(FLAG_WEIGHT_CLASSES)); + doc.remove_flag(FLAG_WEIGHT_CLASSES); + assert_eq!(false, doc.flag_is_active(FLAG_WEIGHT_CLASSES)); + assert_eq!(true, doc.flag_is_active(FLAG_STRIP_UNLIKELYS)); + } }