Add bug fixes for overflows in subtraction, giving a default for

capture groups and in extracting nodes. Add fix in `is_probably_visible`
This commit is contained in:
Kenneth Gitere 2020-10-21 18:50:24 +03:00
parent 350447d1c4
commit f17c9bfbc9

View file

@ -9,7 +9,7 @@ use kuchiki::{
}; };
use url::Url; use url::Url;
const SHARE_ELEMENT_THRESHOLD: usize = 500; const DEFAULT_CHAR_THRESHOLD: usize = 500;
const READABILITY_SCORE: &'static str = "readability-score"; const READABILITY_SCORE: &'static str = "readability-score";
const HTML_NS: &'static str = "http://www.w3.org/1999/xhtml"; const HTML_NS: &'static str = "http://www.w3.org/1999/xhtml";
// TODO: Change to HashSet // TODO: Change to HashSet
@ -720,7 +720,9 @@ impl Readability {
let new_srcset = regexes::SRCSET_CAPTURE_REGEX.replace_all( let new_srcset = regexes::SRCSET_CAPTURE_REGEX.replace_all(
&srcset, &srcset,
|captures: &regex::Captures| { |captures: &regex::Captures| {
to_absolute_uri(&captures[1]) + &captures[2] + &captures[3] to_absolute_uri(&captures[1])
+ &captures.get(2).map(|cap| cap.as_str()).unwrap_or("")
+ &captures[3]
}, },
); );
*srcset = new_srcset.to_string(); *srcset = new_srcset.to_string();
@ -778,7 +780,7 @@ impl Readability {
&& (if let Some(aria_hidden_attr) = attributes.get("aria-hidden"){ && (if let Some(aria_hidden_attr) = attributes.get("aria-hidden"){
aria_hidden_attr.trim() != "true" aria_hidden_attr.trim() != "true"
} else if let Some(class_str) = attributes.get("class"){ } else if let Some(class_str) = attributes.get("class"){
!class_str.split(" ").collect::<Vec<&str>>().contains(&"fallback-image") class_str.split(" ").collect::<Vec<&str>>().contains(&"fallback-image")
} else { } else {
true true
}) })
@ -1401,15 +1403,15 @@ impl Readability {
node_ref.children().for_each(|mut node| { node_ref.children().for_each(|mut node| {
Self::clean_matched_nodes(&mut node, |node: &NodeRef, match_string| { Self::clean_matched_nodes(&mut node, |node: &NodeRef, match_string| {
regexes::is_match_share_elems(match_string) regexes::is_match_share_elems(match_string)
&& node.text_contents().len() < SHARE_ELEMENT_THRESHOLD && node.text_contents().len() < DEFAULT_CHAR_THRESHOLD
}); });
}); });
let h2_nodes = node_ref.select("h2").unwrap().take(2).collect::<Vec<_>>(); let h2_nodes = node_ref.select("h2").unwrap().take(2).collect::<Vec<_>>();
if h2_nodes.len() == 1 { if h2_nodes.len() == 1 {
let h2_node = h2_nodes[0].as_node(); let h2_node = h2_nodes[0].as_node();
let length_similar_rate = ((h2_node.text_contents().len() - self.article_title.len()) let length_similar_rate = ((h2_node.text_contents().len() as isize
as f32) - self.article_title.len() as isize) as f32)
/ self.article_title.len() as f32; / self.article_title.len() as f32;
if length_similar_rate.abs() < 0.5 { if length_similar_rate.abs() < 0.5 {
let titles_match = if length_similar_rate > 0.0 { let titles_match = if length_similar_rate > 0.0 {
@ -1586,7 +1588,7 @@ impl Readability {
} }
_ => (), _ => (),
} }
if !DEFAULT_TAGS_TO_SCORE.contains(&node_name) { if DEFAULT_TAGS_TO_SCORE.contains(&node_name) {
elements_to_score.push(node_ref.clone()); elements_to_score.push(node_ref.clone());
} }
if node_name == "div" { if node_name == "div" {
@ -1888,7 +1890,9 @@ impl Readability {
let sibling_score_threshold = (10.0_f32).max(top_candidate_score * 0.2); let sibling_score_threshold = (10.0_f32).max(top_candidate_score * 0.2);
parent_of_top_candidate = top_candidate.parent().unwrap(); parent_of_top_candidate = top_candidate.parent().unwrap();
let siblings = parent_of_top_candidate.children(); let siblings = parent_of_top_candidate
.children()
.filter(|node| node.as_element().is_some());
let (top_candidate_class, top_candidate_score) = { let (top_candidate_class, top_candidate_score) = {
let top_candidate_attrs = top_candidate.as_element().unwrap().attributes.borrow(); let top_candidate_attrs = top_candidate.as_element().unwrap().attributes.borrow();
@ -1980,7 +1984,7 @@ impl Readability {
let text_length = Self::get_inner_text(&article_content, Some(true)).len(); let text_length = Self::get_inner_text(&article_content, Some(true)).len();
let mut parse_successful = true; let mut parse_successful = true;
if text_length < 500 { if text_length < DEFAULT_CHAR_THRESHOLD {
// TODO Add flag checks // TODO Add flag checks
parse_successful = false; parse_successful = false;
println!("I haz a smol content. Plz run me again"); println!("I haz a smol content. Plz run me again");
@ -2393,7 +2397,7 @@ mod test {
true, true,
Readability::is_probably_visible(&visible_aria_div_node.as_node()) Readability::is_probably_visible(&visible_aria_div_node.as_node())
); );
assert_eq!(false, Readability::is_probably_visible(&img_node.as_node())); assert_eq!(true, Readability::is_probably_visible(&img_node.as_node()));
assert_eq!( assert_eq!(
true, true,
Readability::is_probably_visible(&visible_div_node.as_node()) Readability::is_probably_visible(&visible_div_node.as_node())