Add flags for conditional cleaning and removal of nodes

This also includes updating the function signatures of the affected
methods so that they take `&self` and can read the flags.
This commit is contained in:
Kenneth Gitere 2020-10-22 08:24:46 +03:00
parent f17c9bfbc9
commit a94798cc95

View file

@ -10,6 +10,9 @@ use kuchiki::{
use url::Url; use url::Url;
const DEFAULT_CHAR_THRESHOLD: usize = 500; const DEFAULT_CHAR_THRESHOLD: usize = 500;
// Bit flags stored in `Readability::flags`. All three are enabled when a
// `Readability` is constructed; `grab_article` removes them one at a time (in the
// order declared here) when the extracted text is shorter than
// `DEFAULT_CHAR_THRESHOLD`.

/// Read by `grab_article` to set `strip_unlikely_candidates`.
const FLAG_STRIP_UNLIKELYS: u32 = 0x1;
/// When inactive, `get_class_weight` returns 0 without inspecting the node.
const FLAG_WEIGHT_CLASSES: u32 = 0x2;
/// When inactive, `clean_conditionally` returns immediately without removing anything.
const FLAG_CLEAN_CONDITIONALLY: u32 = 0x4;
const READABILITY_SCORE: &'static str = "readability-score"; const READABILITY_SCORE: &'static str = "readability-score";
const HTML_NS: &'static str = "http://www.w3.org/1999/xhtml"; const HTML_NS: &'static str = "http://www.w3.org/1999/xhtml";
// TODO: Change to HashSet // TODO: Change to HashSet
@ -51,6 +54,7 @@ pub struct Readability {
article_title: String, article_title: String,
pub article_node: Option<NodeRef>, pub article_node: Option<NodeRef>,
article_dir: Option<String>, article_dir: Option<String>,
flags: u32,
} }
#[derive(Debug, PartialEq)] #[derive(Debug, PartialEq)]
@ -67,6 +71,7 @@ impl Readability {
article_title: "".into(), article_title: "".into(),
article_node: None, article_node: None,
article_dir: None, article_dir: None,
flags: FLAG_STRIP_UNLIKELYS | FLAG_WEIGHT_CLASSES | FLAG_CLEAN_CONDITIONALLY,
} }
} }
pub fn parse(&mut self, url: &str) { pub fn parse(&mut self, url: &str) {
@ -992,8 +997,10 @@ impl Readability {
/// Get an element's class/id weight using regular expressions to tell if this /// Get an element's class/id weight using regular expressions to tell if this
/// element looks good or bad. /// element looks good or bad.
fn get_class_weight(node_ref: &NodeRef) -> i32 { fn get_class_weight(&self, node_ref: &NodeRef) -> i32 {
//TODO: Add check for weighing classes if !self.flag_is_active(FLAG_WEIGHT_CLASSES) {
return 0;
}
let mut weight = 0; let mut weight = 0;
let node_elem = node_ref.as_element().unwrap(); let node_elem = node_ref.as_element().unwrap();
let node_attrs = node_elem.attributes.borrow(); let node_attrs = node_elem.attributes.borrow();
@ -1024,12 +1031,12 @@ impl Readability {
/// Initialize a node with the readability attribute. Also checks the /// Initialize a node with the readability attribute. Also checks the
/// className/id for special names to add to its score. /// className/id for special names to add to its score.
fn initialize_node(node_ref: &mut NodeRef) { fn initialize_node(&self, node_ref: &mut NodeRef) {
if let Some(element) = node_ref.as_element() { if let Some(element) = node_ref.as_element() {
let mut score = 0.0; let mut score = 0.0;
// This must be computed first because it borrows the NodeRef which // This must be computed first because it borrows the NodeRef which
// should not also be mutably borrowed // should not also be mutably borrowed
score += Self::get_class_weight(node_ref) as f32; score += self.get_class_weight(node_ref) as f32;
let mut elem_attrs = element.attributes.borrow_mut(); let mut elem_attrs = element.attributes.borrow_mut();
elem_attrs.insert(READABILITY_SCORE, score.to_string()); elem_attrs.insert(READABILITY_SCORE, score.to_string());
let readability = elem_attrs.get_mut(READABILITY_SCORE); let readability = elem_attrs.get_mut(READABILITY_SCORE);
@ -1222,8 +1229,10 @@ impl Readability {
/// Clean an element of all tags of type "tag" if they look fishy. "Fishy" is an algorithm /// Clean an element of all tags of type "tag" if they look fishy. "Fishy" is an algorithm
/// based on content length, classnames, link density, number of images & embeds, etc. /// based on content length, classnames, link density, number of images & embeds, etc.
fn clean_conditionally(node_ref: &mut NodeRef, tag_name: &str) { fn clean_conditionally(&self, node_ref: &mut NodeRef, tag_name: &str) {
// TODO: Add flag check if !self.flag_is_active(FLAG_CLEAN_CONDITIONALLY) {
return;
}
let is_list = tag_name == "ul" || tag_name == "ol"; let is_list = tag_name == "ul" || tag_name == "ol";
let is_data_table = |node_ref: &NodeRef| { let is_data_table = |node_ref: &NodeRef| {
let node_elem = node_ref.as_element().unwrap(); let node_elem = node_ref.as_element().unwrap();
@ -1253,7 +1262,7 @@ impl Readability {
while let Some(node_data_ref) = next_node { while let Some(node_data_ref) = next_node {
next_node = nodes.next(); next_node = nodes.next();
let node = node_data_ref.as_node(); let node = node_data_ref.as_node();
let weight = Self::get_class_weight(node); let weight = self.get_class_weight(node);
// Remove all elements with negative class weights // Remove all elements with negative class weights
if weight < 0 { if weight < 0 {
node.detach(); node.detach();
@ -1336,12 +1345,12 @@ impl Readability {
} }
/// Clean out spurious headers from an Element. Checks things like classnames and link density. /// Clean out spurious headers from an Element. Checks things like classnames and link density.
fn clean_headers(node_ref: &mut NodeRef) { fn clean_headers(&self, node_ref: &mut NodeRef) {
let mut nodes = node_ref let mut nodes = node_ref
.descendants() .descendants()
.select("h1, h2") .select("h1, h2")
.unwrap() .unwrap()
.filter(|node_data_ref| Self::get_class_weight(node_data_ref.as_node()) < 0); .filter(|node_data_ref| self.get_class_weight(node_data_ref.as_node()) < 0);
let mut node = nodes.next(); let mut node = nodes.next();
while let Some(node_data_ref) = node { while let Some(node_data_ref) = node {
@ -1391,8 +1400,8 @@ impl Readability {
Self::clean_styles(node_ref); Self::clean_styles(node_ref);
self.mark_data_tables(); self.mark_data_tables();
Self::fix_lazy_images(node_ref); Self::fix_lazy_images(node_ref);
Self::clean_conditionally(node_ref, "form"); self.clean_conditionally(node_ref, "form");
Self::clean_conditionally(node_ref, "fieldset"); self.clean_conditionally(node_ref, "fieldset");
Self::clean(node_ref, "object"); Self::clean(node_ref, "object");
Self::clean(node_ref, "embed"); Self::clean(node_ref, "embed");
Self::clean(node_ref, "h1"); Self::clean(node_ref, "h1");
@ -1430,11 +1439,11 @@ impl Readability {
Self::clean(node_ref, "textarea"); Self::clean(node_ref, "textarea");
Self::clean(node_ref, "select"); Self::clean(node_ref, "select");
Self::clean(node_ref, "button"); Self::clean(node_ref, "button");
Self::clean_headers(node_ref); self.clean_headers(node_ref);
Self::clean_conditionally(node_ref, "table"); self.clean_conditionally(node_ref, "table");
Self::clean_conditionally(node_ref, "ul"); self.clean_conditionally(node_ref, "ul");
Self::clean_conditionally(node_ref, "div"); self.clean_conditionally(node_ref, "div");
let mut p_nodes = node_ref.select("p").unwrap().filter(|node_data_ref| { let mut p_nodes = node_ref.select("p").unwrap().filter(|node_data_ref| {
let p_node = node_data_ref.as_node(); let p_node = node_data_ref.as_node();
@ -1499,6 +1508,14 @@ impl Readability {
} }
} }
/// Reports whether `flag` is currently enabled on this instance.
///
/// Returns `true` when at least one bit of `flag` is also set in `self.flags`.
fn flag_is_active(&self, flag: u32) -> bool {
    let masked = self.flags & flag;
    masked != 0
}
/// Disables `flag` on this instance.
///
/// Every bit set in `flag` is cleared from `self.flags`; all other bits are left
/// as they were.
fn remove_flag(&mut self, flag: u32) {
    self.flags &= !flag;
}
/// Using a variety of metrics (content score, classname, element types), find the content that is most likely to be the stuff /// Using a variety of metrics (content score, classname, element types), find the content that is most likely to be the stuff
/// a user wants to read. Then return it wrapped up in a div. /// a user wants to read. Then return it wrapped up in a div.
fn grab_article(&mut self) { fn grab_article(&mut self) {
@ -1524,8 +1541,7 @@ impl Readability {
loop { loop {
// var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS); // var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS);
// TODO: Add flag for checking this let strip_unlikely_candidates = self.flag_is_active(FLAG_STRIP_UNLIKELYS);
let strip_unlikely_candidates = true;
// // First, node prepping. Trash nodes that look cruddy (like ones with the // // First, node prepping. Trash nodes that look cruddy (like ones with the
// // class name "comment", etc), and turn divs into P tags where they have been // // class name "comment", etc), and turn divs into P tags where they have been
@ -1675,7 +1691,7 @@ impl Readability {
ancestor_attrs.contains(READABILITY_SCORE) ancestor_attrs.contains(READABILITY_SCORE)
}; };
if !has_readability { if !has_readability {
Self::initialize_node(&mut ancestor); self.initialize_node(&mut ancestor);
candidates.push(ancestor.clone()); candidates.push(ancestor.clone());
} }
@ -1754,7 +1770,7 @@ impl Readability {
top_candidate.append(child_node); top_candidate.append(child_node);
}); });
page.as_node().append(top_candidate.clone()); page.as_node().append(top_candidate.clone());
Self::initialize_node(&mut top_candidate); self.initialize_node(&mut top_candidate);
} else { } else {
let alternative_candidate_ancestors: Vec<Vec<NodeRef>>; let alternative_candidate_ancestors: Vec<Vec<NodeRef>>;
top_candidate = top_candidates.get(0).unwrap().clone(); top_candidate = top_candidates.get(0).unwrap().clone();
@ -1813,7 +1829,7 @@ impl Readability {
}; };
if top_candidate_readability.is_none() { if top_candidate_readability.is_none() {
Self::initialize_node(&mut top_candidate); self.initialize_node(&mut top_candidate);
} }
parent_of_top_candidate = top_candidate.parent().unwrap(); parent_of_top_candidate = top_candidate.parent().unwrap();
@ -1873,7 +1889,7 @@ impl Readability {
.map(|score| score.to_string()) .map(|score| score.to_string())
}; };
if top_candidate_readability.is_none() { if top_candidate_readability.is_none() {
Self::initialize_node(&mut top_candidate); self.initialize_node(&mut top_candidate);
} }
} }
let mut article_content = NodeRef::new_element( let mut article_content = NodeRef::new_element(
@ -1985,9 +2001,16 @@ impl Readability {
let text_length = Self::get_inner_text(&article_content, Some(true)).len(); let text_length = Self::get_inner_text(&article_content, Some(true)).len();
let mut parse_successful = true; let mut parse_successful = true;
if text_length < DEFAULT_CHAR_THRESHOLD { if text_length < DEFAULT_CHAR_THRESHOLD {
// TODO Add flag checks
parse_successful = false; parse_successful = false;
println!("I haz a smol content. Plz run me again"); if self.flag_is_active(FLAG_STRIP_UNLIKELYS) {
self.remove_flag(FLAG_STRIP_UNLIKELYS);
} else if self.flag_is_active(FLAG_WEIGHT_CLASSES) {
self.remove_flag(FLAG_WEIGHT_CLASSES);
} else if self.flag_is_active(FLAG_CLEAN_CONDITIONALLY) {
self.remove_flag(FLAG_CLEAN_CONDITIONALLY);
} else {
parse_successful = true;
}
} }
if parse_successful { if parse_successful {
let parent_ancestors = Self::get_node_ancestors(&parent_of_top_candidate, None); let parent_ancestors = Self::get_node_ancestors(&parent_of_top_candidate, None);
@ -2037,7 +2060,10 @@ impl MetaData {
#[cfg(test)] #[cfg(test)]
mod test { mod test {
use super::{MetaData, Readability, SizeInfo, HTML_NS, READABILITY_SCORE}; use super::{
MetaData, Readability, SizeInfo, FLAG_CLEAN_CONDITIONALLY, FLAG_STRIP_UNLIKELYS,
FLAG_WEIGHT_CLASSES, HTML_NS, READABILITY_SCORE,
};
use html5ever::{LocalName, Namespace, QualName}; use html5ever::{LocalName, Namespace, QualName};
use kuchiki::traits::*; use kuchiki::traits::*;
use kuchiki::NodeRef; use kuchiki::NodeRef;
@ -2793,22 +2819,22 @@ characters. For that reason, this <p> tag could not be a byline because it's too
"#; "#;
let doc = Readability::new(html_str); let doc = Readability::new(html_str);
let mut target = doc.root_node.select_first("body").unwrap(); let mut target = doc.root_node.select_first("body").unwrap();
assert_eq!(0, Readability::get_class_weight(target.as_node())); assert_eq!(0, doc.get_class_weight(target.as_node()));
target = doc.root_node.select_first("div#blog").unwrap(); target = doc.root_node.select_first("div#blog").unwrap();
assert_eq!(50, Readability::get_class_weight(target.as_node())); assert_eq!(50, doc.get_class_weight(target.as_node()));
target = doc.root_node.select_first("h1.hidden").unwrap(); target = doc.root_node.select_first("h1.hidden").unwrap();
assert_eq!(-25, Readability::get_class_weight(target.as_node())); assert_eq!(-25, doc.get_class_weight(target.as_node()));
target = doc.root_node.select_first("p#story").unwrap(); target = doc.root_node.select_first("p#story").unwrap();
assert_eq!(25, Readability::get_class_weight(target.as_node())); assert_eq!(25, doc.get_class_weight(target.as_node()));
target = doc.root_node.select_first("div#comments").unwrap(); target = doc.root_node.select_first("div#comments").unwrap();
assert_eq!(-25, Readability::get_class_weight(target.as_node())); assert_eq!(-25, doc.get_class_weight(target.as_node()));
target = doc.root_node.select_first("p.comment").unwrap(); target = doc.root_node.select_first("p.comment").unwrap();
assert_eq!(-25, Readability::get_class_weight(target.as_node())); assert_eq!(-25, doc.get_class_weight(target.as_node()));
} }
#[test] #[test]
@ -2831,31 +2857,31 @@ characters. For that reason, this <p> tag could not be a byline because it's too
let doc = Readability::new(html_str); let doc = Readability::new(html_str);
let mut target = doc.root_node.select_first("div#blog").unwrap(); let mut target = doc.root_node.select_first("div#blog").unwrap();
let mut node = target.as_node().clone(); let mut node = target.as_node().clone();
Readability::initialize_node(&mut node); doc.initialize_node(&mut node);
let node_attrs = node.as_element().unwrap().attributes.borrow(); let node_attrs = node.as_element().unwrap().attributes.borrow();
assert_eq!(Some("55"), node_attrs.get(READABILITY_SCORE)); assert_eq!(Some("55"), node_attrs.get(READABILITY_SCORE));
target = doc.root_node.select_first("h1.hidden").unwrap(); target = doc.root_node.select_first("h1.hidden").unwrap();
let mut node = target.as_node().clone(); let mut node = target.as_node().clone();
Readability::initialize_node(&mut node); doc.initialize_node(&mut node);
let node_attrs = node.as_element().unwrap().attributes.borrow(); let node_attrs = node.as_element().unwrap().attributes.borrow();
assert_eq!(Some("-30"), node_attrs.get(READABILITY_SCORE)); assert_eq!(Some("-30"), node_attrs.get(READABILITY_SCORE));
target = doc.root_node.select_first("p#story").unwrap(); target = doc.root_node.select_first("p#story").unwrap();
let mut node = target.as_node().clone(); let mut node = target.as_node().clone();
Readability::initialize_node(&mut node); doc.initialize_node(&mut node);
let node_attrs = node.as_element().unwrap().attributes.borrow(); let node_attrs = node.as_element().unwrap().attributes.borrow();
assert_eq!(Some("25"), node_attrs.get(READABILITY_SCORE)); assert_eq!(Some("25"), node_attrs.get(READABILITY_SCORE));
target = doc.root_node.select_first("div#comments").unwrap(); target = doc.root_node.select_first("div#comments").unwrap();
let mut node = target.as_node().clone(); let mut node = target.as_node().clone();
Readability::initialize_node(&mut node); doc.initialize_node(&mut node);
let node_attrs = node.as_element().unwrap().attributes.borrow(); let node_attrs = node.as_element().unwrap().attributes.borrow();
assert_eq!(Some("-20"), node_attrs.get(READABILITY_SCORE)); assert_eq!(Some("-20"), node_attrs.get(READABILITY_SCORE));
target = doc.root_node.select_first("pre.comment").unwrap(); target = doc.root_node.select_first("pre.comment").unwrap();
let mut node = target.as_node().clone(); let mut node = target.as_node().clone();
Readability::initialize_node(&mut node); doc.initialize_node(&mut node);
let node_attrs = node.as_element().unwrap().attributes.borrow(); let node_attrs = node.as_element().unwrap().attributes.borrow();
assert_eq!(Some("-22"), node_attrs.get(READABILITY_SCORE)); assert_eq!(Some("-22"), node_attrs.get(READABILITY_SCORE));
} }
@ -3108,14 +3134,14 @@ characters. For that reason, this <p> tag could not be a byline because it's too
let mut doc = Readability::new(html_str); let mut doc = Readability::new(html_str);
let body = doc.root_node.select_first("body").unwrap(); let body = doc.root_node.select_first("body").unwrap();
doc.mark_data_tables(); doc.mark_data_tables();
Readability::clean_conditionally(&mut body.as_node().clone(), "table"); doc.clean_conditionally(&mut body.as_node().clone(), "table");
assert_eq!(true, doc.root_node.select_first("#data-table").is_ok()); assert_eq!(true, doc.root_node.select_first("#data-table").is_ok());
assert_eq!(false, doc.root_node.select_first("#display-table").is_ok()); assert_eq!(false, doc.root_node.select_first("#display-table").is_ok());
assert_eq!( assert_eq!(
false, false,
doc.root_node.select_first("#display-table-removed").is_ok() doc.root_node.select_first("#display-table-removed").is_ok()
); );
Readability::clean_conditionally(&mut body.as_node().clone(), "div"); doc.clean_conditionally(&mut body.as_node().clone(), "div");
assert_eq!(false, doc.root_node.select_first("div.comment").is_ok()); assert_eq!(false, doc.root_node.select_first("div.comment").is_ok());
assert_eq!(true, doc.root_node.select_first("div#some-content").is_ok()); assert_eq!(true, doc.root_node.select_first("div#some-content").is_ok());
assert_eq!(true, doc.root_node.select_first("div#embeds").is_ok()); assert_eq!(true, doc.root_node.select_first("div#embeds").is_ok());
@ -3174,7 +3200,7 @@ characters. For that reason, this <p> tag could not be a byline because it's too
let h2_count = doc.root_node.select("h2").unwrap().count(); let h2_count = doc.root_node.select("h2").unwrap().count();
assert_eq!(2, h1_count); assert_eq!(2, h1_count);
assert_eq!(1, h2_count); assert_eq!(1, h2_count);
Readability::clean_headers(&mut body.as_node().clone()); doc.clean_headers(&mut body.as_node().clone());
let h1_count = doc.root_node.select("h1").unwrap().count(); let h1_count = doc.root_node.select("h1").unwrap().count();
let h2_count = doc.root_node.select("h2").unwrap().count(); let h2_count = doc.root_node.select("h2").unwrap().count();
assert_eq!(0, h1_count); assert_eq!(0, h1_count);
@ -3810,4 +3836,38 @@ characters. For that reason, this <p> tag could not be a byline because it's too
let img_attrs = img_node.attributes.borrow(); let img_attrs = img_node.attributes.borrow();
assert_eq!(Some("https://foo.blog/post/img.jpg"), img_attrs.get("src")); assert_eq!(Some("https://foo.blog/post/img.jpg"), img_attrs.get("src"));
} }
#[test]
fn test_flag_is_active() {
    // A freshly constructed `Readability` is expected to report every flag as active.
    let html_str = r"
<!DOCTYPE html>
<html>
<body>
</body>
</html>
";
    let doc = Readability::new(html_str);
    let all_flags = [
        FLAG_STRIP_UNLIKELYS,
        FLAG_WEIGHT_CLASSES,
        FLAG_CLEAN_CONDITIONALLY,
    ];
    for &flag in all_flags.iter() {
        assert!(doc.flag_is_active(flag));
    }
}
#[test]
fn test_remove_flag() {
    // Removing one flag must turn only that flag off and leave the others active.
    let html_str = r"
<!DOCTYPE html>
<html>
<body>
</body>
</html>
";
    let mut doc = Readability::new(html_str);

    // Drop the conditional-cleaning flag first.
    assert!(doc.flag_is_active(FLAG_CLEAN_CONDITIONALLY));
    doc.remove_flag(FLAG_CLEAN_CONDITIONALLY);
    assert!(!doc.flag_is_active(FLAG_CLEAN_CONDITIONALLY));

    // The class-weight flag is untouched until it is removed itself.
    assert!(doc.flag_is_active(FLAG_WEIGHT_CLASSES));
    doc.remove_flag(FLAG_WEIGHT_CLASSES);
    assert!(!doc.flag_is_active(FLAG_WEIGHT_CLASSES));

    // The remaining flag survives both removals.
    assert!(doc.flag_is_active(FLAG_STRIP_UNLIKELYS));
}
} }