Add flags for conditional cleaning and removal of nodes
This also includes updating the function signatures of the affected methods
This commit is contained in:
parent
f17c9bfbc9
commit
a94798cc95
1 changed files with 98 additions and 38 deletions
|
@ -10,6 +10,9 @@ use kuchiki::{
|
||||||
use url::Url;
|
use url::Url;
|
||||||
|
|
||||||
const DEFAULT_CHAR_THRESHOLD: usize = 500;
|
const DEFAULT_CHAR_THRESHOLD: usize = 500;
|
||||||
|
const FLAG_STRIP_UNLIKELYS: u32 = 0x1;
|
||||||
|
const FLAG_WEIGHT_CLASSES: u32 = 0x2;
|
||||||
|
const FLAG_CLEAN_CONDITIONALLY: u32 = 0x4;
|
||||||
const READABILITY_SCORE: &'static str = "readability-score";
|
const READABILITY_SCORE: &'static str = "readability-score";
|
||||||
const HTML_NS: &'static str = "http://www.w3.org/1999/xhtml";
|
const HTML_NS: &'static str = "http://www.w3.org/1999/xhtml";
|
||||||
// TODO: Change to HashSet
|
// TODO: Change to HashSet
|
||||||
|
@ -51,6 +54,7 @@ pub struct Readability {
|
||||||
article_title: String,
|
article_title: String,
|
||||||
pub article_node: Option<NodeRef>,
|
pub article_node: Option<NodeRef>,
|
||||||
article_dir: Option<String>,
|
article_dir: Option<String>,
|
||||||
|
flags: u32,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, PartialEq)]
|
#[derive(Debug, PartialEq)]
|
||||||
|
@ -67,6 +71,7 @@ impl Readability {
|
||||||
article_title: "".into(),
|
article_title: "".into(),
|
||||||
article_node: None,
|
article_node: None,
|
||||||
article_dir: None,
|
article_dir: None,
|
||||||
|
flags: FLAG_STRIP_UNLIKELYS | FLAG_WEIGHT_CLASSES | FLAG_CLEAN_CONDITIONALLY,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
pub fn parse(&mut self, url: &str) {
|
pub fn parse(&mut self, url: &str) {
|
||||||
|
@ -992,8 +997,10 @@ impl Readability {
|
||||||
|
|
||||||
/// Get an element's class/id weight using regular expressions to tell if this
|
/// Get an element's class/id weight using regular expressions to tell if this
|
||||||
/// element looks good or bad.
|
/// element looks good or bad.
|
||||||
fn get_class_weight(node_ref: &NodeRef) -> i32 {
|
fn get_class_weight(&self, node_ref: &NodeRef) -> i32 {
|
||||||
//TODO: Add check for weighing classes
|
if !self.flag_is_active(FLAG_WEIGHT_CLASSES) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
let mut weight = 0;
|
let mut weight = 0;
|
||||||
let node_elem = node_ref.as_element().unwrap();
|
let node_elem = node_ref.as_element().unwrap();
|
||||||
let node_attrs = node_elem.attributes.borrow();
|
let node_attrs = node_elem.attributes.borrow();
|
||||||
|
@ -1024,12 +1031,12 @@ impl Readability {
|
||||||
|
|
||||||
/// Initialize a node with the readability attribute. Also checks the
|
/// Initialize a node with the readability attribute. Also checks the
|
||||||
/// className/id for special names to add to its score.
|
/// className/id for special names to add to its score.
|
||||||
fn initialize_node(node_ref: &mut NodeRef) {
|
fn initialize_node(&self, node_ref: &mut NodeRef) {
|
||||||
if let Some(element) = node_ref.as_element() {
|
if let Some(element) = node_ref.as_element() {
|
||||||
let mut score = 0.0;
|
let mut score = 0.0;
|
||||||
// This must be computed first because it borrows the NodeRef which
|
// This must be computed first because it borrows the NodeRef which
|
||||||
// should not also be mutably borrowed
|
// should not also be mutably borrowed
|
||||||
score += Self::get_class_weight(node_ref) as f32;
|
score += self.get_class_weight(node_ref) as f32;
|
||||||
let mut elem_attrs = element.attributes.borrow_mut();
|
let mut elem_attrs = element.attributes.borrow_mut();
|
||||||
elem_attrs.insert(READABILITY_SCORE, score.to_string());
|
elem_attrs.insert(READABILITY_SCORE, score.to_string());
|
||||||
let readability = elem_attrs.get_mut(READABILITY_SCORE);
|
let readability = elem_attrs.get_mut(READABILITY_SCORE);
|
||||||
|
@ -1222,8 +1229,10 @@ impl Readability {
|
||||||
|
|
||||||
/// Clean an element of all tags of type "tag" if they look fishy. "Fishy" is an algorithm
|
/// Clean an element of all tags of type "tag" if they look fishy. "Fishy" is an algorithm
|
||||||
/// based on content length, classnames, link density, number of images & embeds, etc.
|
/// based on content length, classnames, link density, number of images & embeds, etc.
|
||||||
fn clean_conditionally(node_ref: &mut NodeRef, tag_name: &str) {
|
fn clean_conditionally(&self, node_ref: &mut NodeRef, tag_name: &str) {
|
||||||
// TODO: Add flag check
|
if !self.flag_is_active(FLAG_CLEAN_CONDITIONALLY) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
let is_list = tag_name == "ul" || tag_name == "ol";
|
let is_list = tag_name == "ul" || tag_name == "ol";
|
||||||
let is_data_table = |node_ref: &NodeRef| {
|
let is_data_table = |node_ref: &NodeRef| {
|
||||||
let node_elem = node_ref.as_element().unwrap();
|
let node_elem = node_ref.as_element().unwrap();
|
||||||
|
@ -1253,7 +1262,7 @@ impl Readability {
|
||||||
while let Some(node_data_ref) = next_node {
|
while let Some(node_data_ref) = next_node {
|
||||||
next_node = nodes.next();
|
next_node = nodes.next();
|
||||||
let node = node_data_ref.as_node();
|
let node = node_data_ref.as_node();
|
||||||
let weight = Self::get_class_weight(node);
|
let weight = self.get_class_weight(node);
|
||||||
// Remove all elements with negative class weights
|
// Remove all elements with negative class weights
|
||||||
if weight < 0 {
|
if weight < 0 {
|
||||||
node.detach();
|
node.detach();
|
||||||
|
@ -1336,12 +1345,12 @@ impl Readability {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Clean out spurious headers from an Element. Checks things like classnames and link density.
|
/// Clean out spurious headers from an Element. Checks things like classnames and link density.
|
||||||
fn clean_headers(node_ref: &mut NodeRef) {
|
fn clean_headers(&self, node_ref: &mut NodeRef) {
|
||||||
let mut nodes = node_ref
|
let mut nodes = node_ref
|
||||||
.descendants()
|
.descendants()
|
||||||
.select("h1, h2")
|
.select("h1, h2")
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.filter(|node_data_ref| Self::get_class_weight(node_data_ref.as_node()) < 0);
|
.filter(|node_data_ref| self.get_class_weight(node_data_ref.as_node()) < 0);
|
||||||
let mut node = nodes.next();
|
let mut node = nodes.next();
|
||||||
|
|
||||||
while let Some(node_data_ref) = node {
|
while let Some(node_data_ref) = node {
|
||||||
|
@ -1391,8 +1400,8 @@ impl Readability {
|
||||||
Self::clean_styles(node_ref);
|
Self::clean_styles(node_ref);
|
||||||
self.mark_data_tables();
|
self.mark_data_tables();
|
||||||
Self::fix_lazy_images(node_ref);
|
Self::fix_lazy_images(node_ref);
|
||||||
Self::clean_conditionally(node_ref, "form");
|
self.clean_conditionally(node_ref, "form");
|
||||||
Self::clean_conditionally(node_ref, "fieldset");
|
self.clean_conditionally(node_ref, "fieldset");
|
||||||
Self::clean(node_ref, "object");
|
Self::clean(node_ref, "object");
|
||||||
Self::clean(node_ref, "embed");
|
Self::clean(node_ref, "embed");
|
||||||
Self::clean(node_ref, "h1");
|
Self::clean(node_ref, "h1");
|
||||||
|
@ -1430,11 +1439,11 @@ impl Readability {
|
||||||
Self::clean(node_ref, "textarea");
|
Self::clean(node_ref, "textarea");
|
||||||
Self::clean(node_ref, "select");
|
Self::clean(node_ref, "select");
|
||||||
Self::clean(node_ref, "button");
|
Self::clean(node_ref, "button");
|
||||||
Self::clean_headers(node_ref);
|
self.clean_headers(node_ref);
|
||||||
|
|
||||||
Self::clean_conditionally(node_ref, "table");
|
self.clean_conditionally(node_ref, "table");
|
||||||
Self::clean_conditionally(node_ref, "ul");
|
self.clean_conditionally(node_ref, "ul");
|
||||||
Self::clean_conditionally(node_ref, "div");
|
self.clean_conditionally(node_ref, "div");
|
||||||
|
|
||||||
let mut p_nodes = node_ref.select("p").unwrap().filter(|node_data_ref| {
|
let mut p_nodes = node_ref.select("p").unwrap().filter(|node_data_ref| {
|
||||||
let p_node = node_data_ref.as_node();
|
let p_node = node_data_ref.as_node();
|
||||||
|
@ -1499,6 +1508,14 @@ impl Readability {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn flag_is_active(&self, flag: u32) -> bool {
|
||||||
|
self.flags & flag > 0
|
||||||
|
}
|
||||||
|
|
||||||
|
fn remove_flag(&mut self, flag: u32) {
|
||||||
|
self.flags = self.flags & !flag;
|
||||||
|
}
|
||||||
|
|
||||||
/// Using a variety of metrics (content score, classname, element types), find the content that is most likely to be the stuff
|
/// Using a variety of metrics (content score, classname, element types), find the content that is most likely to be the stuff
|
||||||
/// a user wants to read. Then return it wrapped up in a div.
|
/// a user wants to read. Then return it wrapped up in a div.
|
||||||
fn grab_article(&mut self) {
|
fn grab_article(&mut self) {
|
||||||
|
@ -1524,8 +1541,7 @@ impl Readability {
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
// var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS);
|
// var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS);
|
||||||
// TODO: Add flag for checking this
|
let strip_unlikely_candidates = self.flag_is_active(FLAG_STRIP_UNLIKELYS);
|
||||||
let strip_unlikely_candidates = true;
|
|
||||||
|
|
||||||
// // First, node prepping. Trash nodes that look cruddy (like ones with the
|
// // First, node prepping. Trash nodes that look cruddy (like ones with the
|
||||||
// // class name "comment", etc), and turn divs into P tags where they have been
|
// // class name "comment", etc), and turn divs into P tags where they have been
|
||||||
|
@ -1675,7 +1691,7 @@ impl Readability {
|
||||||
ancestor_attrs.contains(READABILITY_SCORE)
|
ancestor_attrs.contains(READABILITY_SCORE)
|
||||||
};
|
};
|
||||||
if !has_readability {
|
if !has_readability {
|
||||||
Self::initialize_node(&mut ancestor);
|
self.initialize_node(&mut ancestor);
|
||||||
candidates.push(ancestor.clone());
|
candidates.push(ancestor.clone());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1754,7 +1770,7 @@ impl Readability {
|
||||||
top_candidate.append(child_node);
|
top_candidate.append(child_node);
|
||||||
});
|
});
|
||||||
page.as_node().append(top_candidate.clone());
|
page.as_node().append(top_candidate.clone());
|
||||||
Self::initialize_node(&mut top_candidate);
|
self.initialize_node(&mut top_candidate);
|
||||||
} else {
|
} else {
|
||||||
let alternative_candidate_ancestors: Vec<Vec<NodeRef>>;
|
let alternative_candidate_ancestors: Vec<Vec<NodeRef>>;
|
||||||
top_candidate = top_candidates.get(0).unwrap().clone();
|
top_candidate = top_candidates.get(0).unwrap().clone();
|
||||||
|
@ -1813,7 +1829,7 @@ impl Readability {
|
||||||
};
|
};
|
||||||
|
|
||||||
if top_candidate_readability.is_none() {
|
if top_candidate_readability.is_none() {
|
||||||
Self::initialize_node(&mut top_candidate);
|
self.initialize_node(&mut top_candidate);
|
||||||
}
|
}
|
||||||
parent_of_top_candidate = top_candidate.parent().unwrap();
|
parent_of_top_candidate = top_candidate.parent().unwrap();
|
||||||
|
|
||||||
|
@ -1873,7 +1889,7 @@ impl Readability {
|
||||||
.map(|score| score.to_string())
|
.map(|score| score.to_string())
|
||||||
};
|
};
|
||||||
if top_candidate_readability.is_none() {
|
if top_candidate_readability.is_none() {
|
||||||
Self::initialize_node(&mut top_candidate);
|
self.initialize_node(&mut top_candidate);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
let mut article_content = NodeRef::new_element(
|
let mut article_content = NodeRef::new_element(
|
||||||
|
@ -1985,9 +2001,16 @@ impl Readability {
|
||||||
let text_length = Self::get_inner_text(&article_content, Some(true)).len();
|
let text_length = Self::get_inner_text(&article_content, Some(true)).len();
|
||||||
let mut parse_successful = true;
|
let mut parse_successful = true;
|
||||||
if text_length < DEFAULT_CHAR_THRESHOLD {
|
if text_length < DEFAULT_CHAR_THRESHOLD {
|
||||||
// TODO Add flag checks
|
|
||||||
parse_successful = false;
|
parse_successful = false;
|
||||||
println!("I haz a smol content. Plz run me again");
|
if self.flag_is_active(FLAG_STRIP_UNLIKELYS) {
|
||||||
|
self.remove_flag(FLAG_STRIP_UNLIKELYS);
|
||||||
|
} else if self.flag_is_active(FLAG_WEIGHT_CLASSES) {
|
||||||
|
self.remove_flag(FLAG_WEIGHT_CLASSES);
|
||||||
|
} else if self.flag_is_active(FLAG_CLEAN_CONDITIONALLY) {
|
||||||
|
self.remove_flag(FLAG_CLEAN_CONDITIONALLY);
|
||||||
|
} else {
|
||||||
|
parse_successful = true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if parse_successful {
|
if parse_successful {
|
||||||
let parent_ancestors = Self::get_node_ancestors(&parent_of_top_candidate, None);
|
let parent_ancestors = Self::get_node_ancestors(&parent_of_top_candidate, None);
|
||||||
|
@ -2037,7 +2060,10 @@ impl MetaData {
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod test {
|
mod test {
|
||||||
use super::{MetaData, Readability, SizeInfo, HTML_NS, READABILITY_SCORE};
|
use super::{
|
||||||
|
MetaData, Readability, SizeInfo, FLAG_CLEAN_CONDITIONALLY, FLAG_STRIP_UNLIKELYS,
|
||||||
|
FLAG_WEIGHT_CLASSES, HTML_NS, READABILITY_SCORE,
|
||||||
|
};
|
||||||
use html5ever::{LocalName, Namespace, QualName};
|
use html5ever::{LocalName, Namespace, QualName};
|
||||||
use kuchiki::traits::*;
|
use kuchiki::traits::*;
|
||||||
use kuchiki::NodeRef;
|
use kuchiki::NodeRef;
|
||||||
|
@ -2793,22 +2819,22 @@ characters. For that reason, this <p> tag could not be a byline because it's too
|
||||||
"#;
|
"#;
|
||||||
let doc = Readability::new(html_str);
|
let doc = Readability::new(html_str);
|
||||||
let mut target = doc.root_node.select_first("body").unwrap();
|
let mut target = doc.root_node.select_first("body").unwrap();
|
||||||
assert_eq!(0, Readability::get_class_weight(target.as_node()));
|
assert_eq!(0, doc.get_class_weight(target.as_node()));
|
||||||
|
|
||||||
target = doc.root_node.select_first("div#blog").unwrap();
|
target = doc.root_node.select_first("div#blog").unwrap();
|
||||||
assert_eq!(50, Readability::get_class_weight(target.as_node()));
|
assert_eq!(50, doc.get_class_weight(target.as_node()));
|
||||||
|
|
||||||
target = doc.root_node.select_first("h1.hidden").unwrap();
|
target = doc.root_node.select_first("h1.hidden").unwrap();
|
||||||
assert_eq!(-25, Readability::get_class_weight(target.as_node()));
|
assert_eq!(-25, doc.get_class_weight(target.as_node()));
|
||||||
|
|
||||||
target = doc.root_node.select_first("p#story").unwrap();
|
target = doc.root_node.select_first("p#story").unwrap();
|
||||||
assert_eq!(25, Readability::get_class_weight(target.as_node()));
|
assert_eq!(25, doc.get_class_weight(target.as_node()));
|
||||||
|
|
||||||
target = doc.root_node.select_first("div#comments").unwrap();
|
target = doc.root_node.select_first("div#comments").unwrap();
|
||||||
assert_eq!(-25, Readability::get_class_weight(target.as_node()));
|
assert_eq!(-25, doc.get_class_weight(target.as_node()));
|
||||||
|
|
||||||
target = doc.root_node.select_first("p.comment").unwrap();
|
target = doc.root_node.select_first("p.comment").unwrap();
|
||||||
assert_eq!(-25, Readability::get_class_weight(target.as_node()));
|
assert_eq!(-25, doc.get_class_weight(target.as_node()));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
@ -2831,31 +2857,31 @@ characters. For that reason, this <p> tag could not be a byline because it's too
|
||||||
let doc = Readability::new(html_str);
|
let doc = Readability::new(html_str);
|
||||||
let mut target = doc.root_node.select_first("div#blog").unwrap();
|
let mut target = doc.root_node.select_first("div#blog").unwrap();
|
||||||
let mut node = target.as_node().clone();
|
let mut node = target.as_node().clone();
|
||||||
Readability::initialize_node(&mut node);
|
doc.initialize_node(&mut node);
|
||||||
let node_attrs = node.as_element().unwrap().attributes.borrow();
|
let node_attrs = node.as_element().unwrap().attributes.borrow();
|
||||||
assert_eq!(Some("55"), node_attrs.get(READABILITY_SCORE));
|
assert_eq!(Some("55"), node_attrs.get(READABILITY_SCORE));
|
||||||
|
|
||||||
target = doc.root_node.select_first("h1.hidden").unwrap();
|
target = doc.root_node.select_first("h1.hidden").unwrap();
|
||||||
let mut node = target.as_node().clone();
|
let mut node = target.as_node().clone();
|
||||||
Readability::initialize_node(&mut node);
|
doc.initialize_node(&mut node);
|
||||||
let node_attrs = node.as_element().unwrap().attributes.borrow();
|
let node_attrs = node.as_element().unwrap().attributes.borrow();
|
||||||
assert_eq!(Some("-30"), node_attrs.get(READABILITY_SCORE));
|
assert_eq!(Some("-30"), node_attrs.get(READABILITY_SCORE));
|
||||||
|
|
||||||
target = doc.root_node.select_first("p#story").unwrap();
|
target = doc.root_node.select_first("p#story").unwrap();
|
||||||
let mut node = target.as_node().clone();
|
let mut node = target.as_node().clone();
|
||||||
Readability::initialize_node(&mut node);
|
doc.initialize_node(&mut node);
|
||||||
let node_attrs = node.as_element().unwrap().attributes.borrow();
|
let node_attrs = node.as_element().unwrap().attributes.borrow();
|
||||||
assert_eq!(Some("25"), node_attrs.get(READABILITY_SCORE));
|
assert_eq!(Some("25"), node_attrs.get(READABILITY_SCORE));
|
||||||
|
|
||||||
target = doc.root_node.select_first("div#comments").unwrap();
|
target = doc.root_node.select_first("div#comments").unwrap();
|
||||||
let mut node = target.as_node().clone();
|
let mut node = target.as_node().clone();
|
||||||
Readability::initialize_node(&mut node);
|
doc.initialize_node(&mut node);
|
||||||
let node_attrs = node.as_element().unwrap().attributes.borrow();
|
let node_attrs = node.as_element().unwrap().attributes.borrow();
|
||||||
assert_eq!(Some("-20"), node_attrs.get(READABILITY_SCORE));
|
assert_eq!(Some("-20"), node_attrs.get(READABILITY_SCORE));
|
||||||
|
|
||||||
target = doc.root_node.select_first("pre.comment").unwrap();
|
target = doc.root_node.select_first("pre.comment").unwrap();
|
||||||
let mut node = target.as_node().clone();
|
let mut node = target.as_node().clone();
|
||||||
Readability::initialize_node(&mut node);
|
doc.initialize_node(&mut node);
|
||||||
let node_attrs = node.as_element().unwrap().attributes.borrow();
|
let node_attrs = node.as_element().unwrap().attributes.borrow();
|
||||||
assert_eq!(Some("-22"), node_attrs.get(READABILITY_SCORE));
|
assert_eq!(Some("-22"), node_attrs.get(READABILITY_SCORE));
|
||||||
}
|
}
|
||||||
|
@ -3108,14 +3134,14 @@ characters. For that reason, this <p> tag could not be a byline because it's too
|
||||||
let mut doc = Readability::new(html_str);
|
let mut doc = Readability::new(html_str);
|
||||||
let body = doc.root_node.select_first("body").unwrap();
|
let body = doc.root_node.select_first("body").unwrap();
|
||||||
doc.mark_data_tables();
|
doc.mark_data_tables();
|
||||||
Readability::clean_conditionally(&mut body.as_node().clone(), "table");
|
doc.clean_conditionally(&mut body.as_node().clone(), "table");
|
||||||
assert_eq!(true, doc.root_node.select_first("#data-table").is_ok());
|
assert_eq!(true, doc.root_node.select_first("#data-table").is_ok());
|
||||||
assert_eq!(false, doc.root_node.select_first("#display-table").is_ok());
|
assert_eq!(false, doc.root_node.select_first("#display-table").is_ok());
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
false,
|
false,
|
||||||
doc.root_node.select_first("#display-table-removed").is_ok()
|
doc.root_node.select_first("#display-table-removed").is_ok()
|
||||||
);
|
);
|
||||||
Readability::clean_conditionally(&mut body.as_node().clone(), "div");
|
doc.clean_conditionally(&mut body.as_node().clone(), "div");
|
||||||
assert_eq!(false, doc.root_node.select_first("div.comment").is_ok());
|
assert_eq!(false, doc.root_node.select_first("div.comment").is_ok());
|
||||||
assert_eq!(true, doc.root_node.select_first("div#some-content").is_ok());
|
assert_eq!(true, doc.root_node.select_first("div#some-content").is_ok());
|
||||||
assert_eq!(true, doc.root_node.select_first("div#embeds").is_ok());
|
assert_eq!(true, doc.root_node.select_first("div#embeds").is_ok());
|
||||||
|
@ -3174,7 +3200,7 @@ characters. For that reason, this <p> tag could not be a byline because it's too
|
||||||
let h2_count = doc.root_node.select("h2").unwrap().count();
|
let h2_count = doc.root_node.select("h2").unwrap().count();
|
||||||
assert_eq!(2, h1_count);
|
assert_eq!(2, h1_count);
|
||||||
assert_eq!(1, h2_count);
|
assert_eq!(1, h2_count);
|
||||||
Readability::clean_headers(&mut body.as_node().clone());
|
doc.clean_headers(&mut body.as_node().clone());
|
||||||
let h1_count = doc.root_node.select("h1").unwrap().count();
|
let h1_count = doc.root_node.select("h1").unwrap().count();
|
||||||
let h2_count = doc.root_node.select("h2").unwrap().count();
|
let h2_count = doc.root_node.select("h2").unwrap().count();
|
||||||
assert_eq!(0, h1_count);
|
assert_eq!(0, h1_count);
|
||||||
|
@ -3810,4 +3836,38 @@ characters. For that reason, this <p> tag could not be a byline because it's too
|
||||||
let img_attrs = img_node.attributes.borrow();
|
let img_attrs = img_node.attributes.borrow();
|
||||||
assert_eq!(Some("https://foo.blog/post/img.jpg"), img_attrs.get("src"));
|
assert_eq!(Some("https://foo.blog/post/img.jpg"), img_attrs.get("src"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_flag_is_active() {
|
||||||
|
let html_str = r"
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
";
|
||||||
|
let doc = Readability::new(html_str);
|
||||||
|
assert_eq!(true, doc.flag_is_active(FLAG_STRIP_UNLIKELYS));
|
||||||
|
assert_eq!(true, doc.flag_is_active(FLAG_WEIGHT_CLASSES));
|
||||||
|
assert_eq!(true, doc.flag_is_active(FLAG_CLEAN_CONDITIONALLY));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_remove_flag() {
|
||||||
|
let html_str = r"
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
";
|
||||||
|
let mut doc = Readability::new(html_str);
|
||||||
|
assert_eq!(true, doc.flag_is_active(FLAG_CLEAN_CONDITIONALLY));
|
||||||
|
doc.remove_flag(FLAG_CLEAN_CONDITIONALLY);
|
||||||
|
assert_eq!(false, doc.flag_is_active(FLAG_CLEAN_CONDITIONALLY));
|
||||||
|
assert_eq!(true, doc.flag_is_active(FLAG_WEIGHT_CLASSES));
|
||||||
|
doc.remove_flag(FLAG_WEIGHT_CLASSES);
|
||||||
|
assert_eq!(false, doc.flag_is_active(FLAG_WEIGHT_CLASSES));
|
||||||
|
assert_eq!(true, doc.flag_is_active(FLAG_STRIP_UNLIKELYS));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue