Add bug fixes for overflows in subtraction, giving a default for
capture groups and in extracting nodes. Add fix in `is_probably_visible`
This commit is contained in:
parent
350447d1c4
commit
f17c9bfbc9
1 changed files with 14 additions and 10 deletions
|
@ -9,7 +9,7 @@ use kuchiki::{
|
||||||
};
|
};
|
||||||
use url::Url;
|
use url::Url;
|
||||||
|
|
||||||
const SHARE_ELEMENT_THRESHOLD: usize = 500;
|
const DEFAULT_CHAR_THRESHOLD: usize = 500;
|
||||||
const READABILITY_SCORE: &'static str = "readability-score";
|
const READABILITY_SCORE: &'static str = "readability-score";
|
||||||
const HTML_NS: &'static str = "http://www.w3.org/1999/xhtml";
|
const HTML_NS: &'static str = "http://www.w3.org/1999/xhtml";
|
||||||
// TODO: Change to HashSet
|
// TODO: Change to HashSet
|
||||||
|
@ -720,7 +720,9 @@ impl Readability {
|
||||||
let new_srcset = regexes::SRCSET_CAPTURE_REGEX.replace_all(
|
let new_srcset = regexes::SRCSET_CAPTURE_REGEX.replace_all(
|
||||||
&srcset,
|
&srcset,
|
||||||
|captures: ®ex::Captures| {
|
|captures: ®ex::Captures| {
|
||||||
to_absolute_uri(&captures[1]) + &captures[2] + &captures[3]
|
to_absolute_uri(&captures[1])
|
||||||
|
+ &captures.get(2).map(|cap| cap.as_str()).unwrap_or("")
|
||||||
|
+ &captures[3]
|
||||||
},
|
},
|
||||||
);
|
);
|
||||||
*srcset = new_srcset.to_string();
|
*srcset = new_srcset.to_string();
|
||||||
|
@ -778,7 +780,7 @@ impl Readability {
|
||||||
&& (if let Some(aria_hidden_attr) = attributes.get("aria-hidden"){
|
&& (if let Some(aria_hidden_attr) = attributes.get("aria-hidden"){
|
||||||
aria_hidden_attr.trim() != "true"
|
aria_hidden_attr.trim() != "true"
|
||||||
} else if let Some(class_str) = attributes.get("class"){
|
} else if let Some(class_str) = attributes.get("class"){
|
||||||
!class_str.split(" ").collect::<Vec<&str>>().contains(&"fallback-image")
|
class_str.split(" ").collect::<Vec<&str>>().contains(&"fallback-image")
|
||||||
} else {
|
} else {
|
||||||
true
|
true
|
||||||
})
|
})
|
||||||
|
@ -1401,15 +1403,15 @@ impl Readability {
|
||||||
node_ref.children().for_each(|mut node| {
|
node_ref.children().for_each(|mut node| {
|
||||||
Self::clean_matched_nodes(&mut node, |node: &NodeRef, match_string| {
|
Self::clean_matched_nodes(&mut node, |node: &NodeRef, match_string| {
|
||||||
regexes::is_match_share_elems(match_string)
|
regexes::is_match_share_elems(match_string)
|
||||||
&& node.text_contents().len() < SHARE_ELEMENT_THRESHOLD
|
&& node.text_contents().len() < DEFAULT_CHAR_THRESHOLD
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
let h2_nodes = node_ref.select("h2").unwrap().take(2).collect::<Vec<_>>();
|
let h2_nodes = node_ref.select("h2").unwrap().take(2).collect::<Vec<_>>();
|
||||||
if h2_nodes.len() == 1 {
|
if h2_nodes.len() == 1 {
|
||||||
let h2_node = h2_nodes[0].as_node();
|
let h2_node = h2_nodes[0].as_node();
|
||||||
let length_similar_rate = ((h2_node.text_contents().len() - self.article_title.len())
|
let length_similar_rate = ((h2_node.text_contents().len() as isize
|
||||||
as f32)
|
- self.article_title.len() as isize) as f32)
|
||||||
/ self.article_title.len() as f32;
|
/ self.article_title.len() as f32;
|
||||||
if length_similar_rate.abs() < 0.5 {
|
if length_similar_rate.abs() < 0.5 {
|
||||||
let titles_match = if length_similar_rate > 0.0 {
|
let titles_match = if length_similar_rate > 0.0 {
|
||||||
|
@ -1586,7 +1588,7 @@ impl Readability {
|
||||||
}
|
}
|
||||||
_ => (),
|
_ => (),
|
||||||
}
|
}
|
||||||
if !DEFAULT_TAGS_TO_SCORE.contains(&node_name) {
|
if DEFAULT_TAGS_TO_SCORE.contains(&node_name) {
|
||||||
elements_to_score.push(node_ref.clone());
|
elements_to_score.push(node_ref.clone());
|
||||||
}
|
}
|
||||||
if node_name == "div" {
|
if node_name == "div" {
|
||||||
|
@ -1888,7 +1890,9 @@ impl Readability {
|
||||||
|
|
||||||
let sibling_score_threshold = (10.0_f32).max(top_candidate_score * 0.2);
|
let sibling_score_threshold = (10.0_f32).max(top_candidate_score * 0.2);
|
||||||
parent_of_top_candidate = top_candidate.parent().unwrap();
|
parent_of_top_candidate = top_candidate.parent().unwrap();
|
||||||
let siblings = parent_of_top_candidate.children();
|
let siblings = parent_of_top_candidate
|
||||||
|
.children()
|
||||||
|
.filter(|node| node.as_element().is_some());
|
||||||
|
|
||||||
let (top_candidate_class, top_candidate_score) = {
|
let (top_candidate_class, top_candidate_score) = {
|
||||||
let top_candidate_attrs = top_candidate.as_element().unwrap().attributes.borrow();
|
let top_candidate_attrs = top_candidate.as_element().unwrap().attributes.borrow();
|
||||||
|
@ -1980,7 +1984,7 @@ impl Readability {
|
||||||
|
|
||||||
let text_length = Self::get_inner_text(&article_content, Some(true)).len();
|
let text_length = Self::get_inner_text(&article_content, Some(true)).len();
|
||||||
let mut parse_successful = true;
|
let mut parse_successful = true;
|
||||||
if text_length < 500 {
|
if text_length < DEFAULT_CHAR_THRESHOLD {
|
||||||
// TODO Add flag checks
|
// TODO Add flag checks
|
||||||
parse_successful = false;
|
parse_successful = false;
|
||||||
println!("I haz a smol content. Plz run me again");
|
println!("I haz a smol content. Plz run me again");
|
||||||
|
@ -2393,7 +2397,7 @@ mod test {
|
||||||
true,
|
true,
|
||||||
Readability::is_probably_visible(&visible_aria_div_node.as_node())
|
Readability::is_probably_visible(&visible_aria_div_node.as_node())
|
||||||
);
|
);
|
||||||
assert_eq!(false, Readability::is_probably_visible(&img_node.as_node()));
|
assert_eq!(true, Readability::is_probably_visible(&img_node.as_node()));
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
true,
|
true,
|
||||||
Readability::is_probably_visible(&visible_div_node.as_node())
|
Readability::is_probably_visible(&visible_div_node.as_node())
|
||||||
|
|
Reference in a new issue