Fix bug in deletion of multiple nodes.

When calling `detach` in a for loop or `for_each` iterator consumer,
only the first node is ever deleted.

Fix replacement of table nodes in prep_article
Edit clean_conditionally to remove unnecessary assignment.
This commit is contained in:
Kenneth Gitere 2020-10-20 08:07:47 +03:00
parent ccdbbb5a16
commit 94fa8db218

View file

@ -113,7 +113,7 @@ impl Readability {
/// This improves the quality of the images we use on some sites (e.g. Medium). /// This improves the quality of the images we use on some sites (e.g. Medium).
fn unwrap_no_script_tags(&mut self) { fn unwrap_no_script_tags(&mut self) {
if let Ok(imgs) = self.root_node.select("img") { if let Ok(imgs) = self.root_node.select("img") {
imgs.filter(|img_node_ref| { let mut nodes = imgs.filter(|img_node_ref| {
let img_attrs = img_node_ref.attributes.borrow(); let img_attrs = img_node_ref.attributes.borrow();
!img_attrs.map.iter().any(|(name, attr)| { !img_attrs.map.iter().any(|(name, attr)| {
&name.local == "src" &name.local == "src"
@ -122,8 +122,12 @@ impl Readability {
|| &name.local == "data-srcset" || &name.local == "data-srcset"
|| regexes::is_match_img_ext(&attr.value) || regexes::is_match_img_ext(&attr.value)
}) })
}) });
.for_each(|img_ref| img_ref.as_node().detach()); let mut node_ref = nodes.next();
while let Some(img_ref) = node_ref {
node_ref = nodes.next();
img_ref.as_node().detach();
}
} }
if let Ok(noscripts) = self.root_node.select("noscript") { if let Ok(noscripts) = self.root_node.select("noscript") {
@ -194,11 +198,23 @@ impl Readability {
/// Removes script tags from the document. /// Removes script tags from the document.
fn remove_scripts(&mut self) { fn remove_scripts(&mut self) {
match self.root_node.select("script") { match self.root_node.select("script") {
Ok(script_elems) => script_elems.for_each(|elem| elem.as_node().detach()), Ok(mut script_elems) => {
let mut next_script = script_elems.next();
while let Some(next_script_ref) = next_script {
next_script = script_elems.next();
next_script_ref.as_node().detach();
}
}
Err(_) => (), Err(_) => (),
} }
match self.root_node.select("noscript") { match self.root_node.select("noscript") {
Ok(noscript_elems) => noscript_elems.for_each(|elem| elem.as_node().detach()), Ok(mut noscript_elems) => {
let mut next_noscript = noscript_elems.next();
while let Some(noscript_ref) = next_noscript {
next_noscript = noscript_elems.next();
noscript_ref.as_node().detach();
}
}
Err(_) => (), Err(_) => (),
} }
} }
@ -207,7 +223,13 @@ impl Readability {
/// CSS, and handling terrible markup. /// CSS, and handling terrible markup.
fn prep_document(&mut self) { fn prep_document(&mut self) {
match self.root_node.select("style") { match self.root_node.select("style") {
Ok(style_elems) => style_elems.for_each(|elem| elem.as_node().detach()), Ok(mut style_elems) => {
let mut style_elem = style_elems.next();
while let Some(style_ref) = style_elem {
style_elem = style_elems.next();
style_ref.as_node().detach();
}
}
Err(_) => (), Err(_) => (),
} }
self.replace_brs(); self.replace_brs();
@ -234,8 +256,8 @@ impl Readability {
{ {
replaced = true; replaced = true;
let br_sibling = next_elem.next_sibling(); let br_sibling = next_elem.next_sibling();
next_elem.detach();
next = Self::next_element(br_sibling, false); next = Self::next_element(br_sibling, false);
next_elem.detach();
} else { } else {
break; break;
} }
@ -861,7 +883,6 @@ impl Readability {
fn clean_conditionally(node_ref: &mut NodeRef, tag_name: &str) { fn clean_conditionally(node_ref: &mut NodeRef, tag_name: &str) {
// TODO: Add flag check // TODO: Add flag check
let is_list = tag_name == "ul" || tag_name == "ol"; let is_list = tag_name == "ul" || tag_name == "ol";
let nodes = node_ref.descendants().select(tag_name).unwrap();
let is_data_table = |node_ref: &NodeRef| { let is_data_table = |node_ref: &NodeRef| {
let node_elem = node_ref.as_element().unwrap(); let node_elem = node_ref.as_element().unwrap();
let attrs = node_elem.attributes.borrow(); let attrs = node_elem.attributes.borrow();
@ -869,7 +890,10 @@ impl Readability {
}; };
let get_char_count = |node_ref: &NodeRef| node_ref.text_contents().matches(",").count(); let get_char_count = |node_ref: &NodeRef| node_ref.text_contents().matches(",").count();
let mut nodes = nodes let mut nodes = node_ref
.descendants()
.select(tag_name)
.unwrap()
// Do not remove data tables // Do not remove data tables
.filter(|node_data_ref| { .filter(|node_data_ref| {
!(&node_data_ref.name.local == "table" && is_data_table(node_data_ref.as_node())) !(&node_data_ref.name.local == "table" && is_data_table(node_data_ref.as_node()))
@ -1070,10 +1094,7 @@ impl Readability {
Self::clean_conditionally(node_ref, "ul"); Self::clean_conditionally(node_ref, "ul");
Self::clean_conditionally(node_ref, "div"); Self::clean_conditionally(node_ref, "div");
node_ref let mut p_nodes = node_ref.select("p").unwrap().filter(|node_data_ref| {
.select("p")
.unwrap()
.filter(|node_data_ref| {
let p_node = node_data_ref.as_node(); let p_node = node_data_ref.as_node();
let img_count = p_node.select("img").unwrap().count(); let img_count = p_node.select("img").unwrap().count();
let embed_count = p_node.select("embed").unwrap().count(); let embed_count = p_node.select("embed").unwrap().count();
@ -1081,23 +1102,30 @@ impl Readability {
let iframe_count = p_node.select("iframe").unwrap().count(); let iframe_count = p_node.select("iframe").unwrap().count();
let total = img_count + embed_count + object_count + iframe_count; let total = img_count + embed_count + object_count + iframe_count;
total == 0 && Self::get_inner_text(node_data_ref.as_node(), Some(false)).is_empty() total == 0 && Self::get_inner_text(node_data_ref.as_node(), Some(false)).is_empty()
}) });
.for_each(|node_data_ref| node_data_ref.as_node().detach()); let mut p_node = p_nodes.next();
while let Some(p_node_ref) = p_node {
p_node = p_nodes.next();
p_node_ref.as_node().detach();
}
// TODO: Fix the code for deleting nodes let mut br_nodes = node_ref.select("br").unwrap().filter(|node_data_ref| {
node_ref
.select("br")
.unwrap()
.filter(|node_data_ref| {
let br_node = node_data_ref.as_node(); let br_node = node_data_ref.as_node();
// WARN: This assumes `next_element` returns an element node. // WARN: This assumes `next_element` returns an element node.
let next_node = Self::next_element(br_node.next_sibling(), true); let next_node = Self::next_element(br_node.next_sibling(), true);
next_node.is_some() && &next_node.unwrap().as_element().unwrap().name.local == "p" next_node.is_some() && &next_node.unwrap().as_element().unwrap().name.local == "p"
}) });
.for_each(|node_data_ref| node_data_ref.as_node().detach()); let mut br_node = br_nodes.next();
while let Some(br_node_ref) = br_node {
br_node = br_nodes.next();
br_node_ref.as_node().detach();
}
node_ref.select("table").unwrap().for_each(|node_data_ref| { let mut table_nodes = node_ref.select("table").unwrap();
let table_node = node_data_ref.as_node(); let mut table_node = table_nodes.next();
while let Some(table_node_ref) = table_node {
table_node = table_nodes.next();
let table_node = table_node_ref.as_node();
// WARN: This assumes `next_element` returns an element node. // WARN: This assumes `next_element` returns an element node.
let table_child = Self::next_element(table_node.first_child(), true); let table_child = Self::next_element(table_node.first_child(), true);
let tbody = if Self::has_single_tag_inside_element(&table_node, "tbody") { let tbody = if Self::has_single_tag_inside_element(&table_node, "tbody") {
@ -1110,7 +1138,7 @@ impl Readability {
if Self::has_single_tag_inside_element(&tbody, "tr") { if Self::has_single_tag_inside_element(&tbody, "tr") {
let row = Self::next_element(tbody.first_child(), true).unwrap(); let row = Self::next_element(tbody.first_child(), true).unwrap();
if Self::has_single_tag_inside_element(&row, "td") { if Self::has_single_tag_inside_element(&row, "td") {
let cell = Self::next_element(row.first_child(), true).unwrap(); let mut cell = Self::next_element(row.first_child(), true).unwrap();
let tag = if cell let tag = if cell
.children() .children()
.all(|cell_child| Self::is_phrasing_content(&cell_child)) .all(|cell_child| Self::is_phrasing_content(&cell_child))
@ -1119,10 +1147,14 @@ impl Readability {
} else { } else {
"div" "div"
}; };
Self::set_node_tag(&cell, tag); cell = Self::set_node_tag(&cell, tag);
if let Some(parent) = table_node.parent() {
parent.append(cell);
table_node.detach();
}
}
} }
} }
});
} }
/// Using a variety of metrics (content score, classname, element types), find the content that is most likely to be the stuff /// Using a variety of metrics (content score, classname, element types), find the content that is most likely to be the stuff