Fix bug in deletion of multiple nodes.
When calling `detach` in a for loop or `for_each` iterator consumer, only the first node is ever deleted. Fix replacement of table nodes in prep_article Edit clean_conditionally to remove unnecessary assignment.
This commit is contained in:
parent
ccdbbb5a16
commit
94fa8db218
1 changed files with 70 additions and 38 deletions
|
@ -113,7 +113,7 @@ impl Readability {
|
||||||
/// This improves the quality of the images we use on some sites (e.g. Medium).
|
/// This improves the quality of the images we use on some sites (e.g. Medium).
|
||||||
fn unwrap_no_script_tags(&mut self) {
|
fn unwrap_no_script_tags(&mut self) {
|
||||||
if let Ok(imgs) = self.root_node.select("img") {
|
if let Ok(imgs) = self.root_node.select("img") {
|
||||||
imgs.filter(|img_node_ref| {
|
let mut nodes = imgs.filter(|img_node_ref| {
|
||||||
let img_attrs = img_node_ref.attributes.borrow();
|
let img_attrs = img_node_ref.attributes.borrow();
|
||||||
!img_attrs.map.iter().any(|(name, attr)| {
|
!img_attrs.map.iter().any(|(name, attr)| {
|
||||||
&name.local == "src"
|
&name.local == "src"
|
||||||
|
@ -122,8 +122,12 @@ impl Readability {
|
||||||
|| &name.local == "data-srcset"
|
|| &name.local == "data-srcset"
|
||||||
|| regexes::is_match_img_ext(&attr.value)
|
|| regexes::is_match_img_ext(&attr.value)
|
||||||
})
|
})
|
||||||
})
|
});
|
||||||
.for_each(|img_ref| img_ref.as_node().detach());
|
let mut node_ref = nodes.next();
|
||||||
|
while let Some(img_ref) = node_ref {
|
||||||
|
node_ref = nodes.next();
|
||||||
|
img_ref.as_node().detach();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Ok(noscripts) = self.root_node.select("noscript") {
|
if let Ok(noscripts) = self.root_node.select("noscript") {
|
||||||
|
@ -194,11 +198,23 @@ impl Readability {
|
||||||
/// Removes script tags from the document.
|
/// Removes script tags from the document.
|
||||||
fn remove_scripts(&mut self) {
|
fn remove_scripts(&mut self) {
|
||||||
match self.root_node.select("script") {
|
match self.root_node.select("script") {
|
||||||
Ok(script_elems) => script_elems.for_each(|elem| elem.as_node().detach()),
|
Ok(mut script_elems) => {
|
||||||
|
let mut next_script = script_elems.next();
|
||||||
|
while let Some(next_script_ref) = next_script {
|
||||||
|
next_script = script_elems.next();
|
||||||
|
next_script_ref.as_node().detach();
|
||||||
|
}
|
||||||
|
}
|
||||||
Err(_) => (),
|
Err(_) => (),
|
||||||
}
|
}
|
||||||
match self.root_node.select("noscript") {
|
match self.root_node.select("noscript") {
|
||||||
Ok(noscript_elems) => noscript_elems.for_each(|elem| elem.as_node().detach()),
|
Ok(mut noscript_elems) => {
|
||||||
|
let mut next_noscript = noscript_elems.next();
|
||||||
|
while let Some(noscript_ref) = next_noscript {
|
||||||
|
next_noscript = noscript_elems.next();
|
||||||
|
noscript_ref.as_node().detach();
|
||||||
|
}
|
||||||
|
}
|
||||||
Err(_) => (),
|
Err(_) => (),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -207,7 +223,13 @@ impl Readability {
|
||||||
/// CSS, and handling terrible markup.
|
/// CSS, and handling terrible markup.
|
||||||
fn prep_document(&mut self) {
|
fn prep_document(&mut self) {
|
||||||
match self.root_node.select("style") {
|
match self.root_node.select("style") {
|
||||||
Ok(style_elems) => style_elems.for_each(|elem| elem.as_node().detach()),
|
Ok(mut style_elems) => {
|
||||||
|
let mut style_elem = style_elems.next();
|
||||||
|
while let Some(style_ref) = style_elem {
|
||||||
|
style_elem = style_elems.next();
|
||||||
|
style_ref.as_node().detach();
|
||||||
|
}
|
||||||
|
}
|
||||||
Err(_) => (),
|
Err(_) => (),
|
||||||
}
|
}
|
||||||
self.replace_brs();
|
self.replace_brs();
|
||||||
|
@ -234,8 +256,8 @@ impl Readability {
|
||||||
{
|
{
|
||||||
replaced = true;
|
replaced = true;
|
||||||
let br_sibling = next_elem.next_sibling();
|
let br_sibling = next_elem.next_sibling();
|
||||||
next_elem.detach();
|
|
||||||
next = Self::next_element(br_sibling, false);
|
next = Self::next_element(br_sibling, false);
|
||||||
|
next_elem.detach();
|
||||||
} else {
|
} else {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -861,7 +883,6 @@ impl Readability {
|
||||||
fn clean_conditionally(node_ref: &mut NodeRef, tag_name: &str) {
|
fn clean_conditionally(node_ref: &mut NodeRef, tag_name: &str) {
|
||||||
// TODO: Add flag check
|
// TODO: Add flag check
|
||||||
let is_list = tag_name == "ul" || tag_name == "ol";
|
let is_list = tag_name == "ul" || tag_name == "ol";
|
||||||
let nodes = node_ref.descendants().select(tag_name).unwrap();
|
|
||||||
let is_data_table = |node_ref: &NodeRef| {
|
let is_data_table = |node_ref: &NodeRef| {
|
||||||
let node_elem = node_ref.as_element().unwrap();
|
let node_elem = node_ref.as_element().unwrap();
|
||||||
let attrs = node_elem.attributes.borrow();
|
let attrs = node_elem.attributes.borrow();
|
||||||
|
@ -869,7 +890,10 @@ impl Readability {
|
||||||
};
|
};
|
||||||
let get_char_count = |node_ref: &NodeRef| node_ref.text_contents().matches(",").count();
|
let get_char_count = |node_ref: &NodeRef| node_ref.text_contents().matches(",").count();
|
||||||
|
|
||||||
let mut nodes = nodes
|
let mut nodes = node_ref
|
||||||
|
.descendants()
|
||||||
|
.select(tag_name)
|
||||||
|
.unwrap()
|
||||||
// Do not remove data tables
|
// Do not remove data tables
|
||||||
.filter(|node_data_ref| {
|
.filter(|node_data_ref| {
|
||||||
!(&node_data_ref.name.local == "table" && is_data_table(node_data_ref.as_node()))
|
!(&node_data_ref.name.local == "table" && is_data_table(node_data_ref.as_node()))
|
||||||
|
@ -1070,34 +1094,38 @@ impl Readability {
|
||||||
Self::clean_conditionally(node_ref, "ul");
|
Self::clean_conditionally(node_ref, "ul");
|
||||||
Self::clean_conditionally(node_ref, "div");
|
Self::clean_conditionally(node_ref, "div");
|
||||||
|
|
||||||
node_ref
|
let mut p_nodes = node_ref.select("p").unwrap().filter(|node_data_ref| {
|
||||||
.select("p")
|
let p_node = node_data_ref.as_node();
|
||||||
.unwrap()
|
let img_count = p_node.select("img").unwrap().count();
|
||||||
.filter(|node_data_ref| {
|
let embed_count = p_node.select("embed").unwrap().count();
|
||||||
let p_node = node_data_ref.as_node();
|
let object_count = p_node.select("object").unwrap().count();
|
||||||
let img_count = p_node.select("img").unwrap().count();
|
let iframe_count = p_node.select("iframe").unwrap().count();
|
||||||
let embed_count = p_node.select("embed").unwrap().count();
|
let total = img_count + embed_count + object_count + iframe_count;
|
||||||
let object_count = p_node.select("object").unwrap().count();
|
total == 0 && Self::get_inner_text(node_data_ref.as_node(), Some(false)).is_empty()
|
||||||
let iframe_count = p_node.select("iframe").unwrap().count();
|
});
|
||||||
let total = img_count + embed_count + object_count + iframe_count;
|
let mut p_node = p_nodes.next();
|
||||||
total == 0 && Self::get_inner_text(node_data_ref.as_node(), Some(false)).is_empty()
|
while let Some(p_node_ref) = p_node {
|
||||||
})
|
p_node = p_nodes.next();
|
||||||
.for_each(|node_data_ref| node_data_ref.as_node().detach());
|
p_node_ref.as_node().detach();
|
||||||
|
}
|
||||||
|
|
||||||
// TODO: Fix the code for deleting nodes
|
let mut br_nodes = node_ref.select("br").unwrap().filter(|node_data_ref| {
|
||||||
node_ref
|
let br_node = node_data_ref.as_node();
|
||||||
.select("br")
|
// WARN: This assumes `next_element` returns an element node.
|
||||||
.unwrap()
|
let next_node = Self::next_element(br_node.next_sibling(), true);
|
||||||
.filter(|node_data_ref| {
|
next_node.is_some() && &next_node.unwrap().as_element().unwrap().name.local == "p"
|
||||||
let br_node = node_data_ref.as_node();
|
});
|
||||||
// WARN: This assumes `next_element` returns an element node.
|
let mut br_node = br_nodes.next();
|
||||||
let next_node = Self::next_element(br_node.next_sibling(), true);
|
while let Some(br_node_ref) = br_node {
|
||||||
next_node.is_some() && &next_node.unwrap().as_element().unwrap().name.local == "p"
|
br_node = br_nodes.next();
|
||||||
})
|
br_node_ref.as_node().detach();
|
||||||
.for_each(|node_data_ref| node_data_ref.as_node().detach());
|
}
|
||||||
|
|
||||||
node_ref.select("table").unwrap().for_each(|node_data_ref| {
|
let mut table_nodes = node_ref.select("table").unwrap();
|
||||||
let table_node = node_data_ref.as_node();
|
let mut table_node = table_nodes.next();
|
||||||
|
while let Some(table_node_ref) = table_node {
|
||||||
|
table_node = table_nodes.next();
|
||||||
|
let table_node = table_node_ref.as_node();
|
||||||
// WARN: This assumes `next_element` returns an element node.
|
// WARN: This assumes `next_element` returns an element node.
|
||||||
let table_child = Self::next_element(table_node.first_child(), true);
|
let table_child = Self::next_element(table_node.first_child(), true);
|
||||||
let tbody = if Self::has_single_tag_inside_element(&table_node, "tbody") {
|
let tbody = if Self::has_single_tag_inside_element(&table_node, "tbody") {
|
||||||
|
@ -1110,7 +1138,7 @@ impl Readability {
|
||||||
if Self::has_single_tag_inside_element(&tbody, "tr") {
|
if Self::has_single_tag_inside_element(&tbody, "tr") {
|
||||||
let row = Self::next_element(tbody.first_child(), true).unwrap();
|
let row = Self::next_element(tbody.first_child(), true).unwrap();
|
||||||
if Self::has_single_tag_inside_element(&row, "td") {
|
if Self::has_single_tag_inside_element(&row, "td") {
|
||||||
let cell = Self::next_element(row.first_child(), true).unwrap();
|
let mut cell = Self::next_element(row.first_child(), true).unwrap();
|
||||||
let tag = if cell
|
let tag = if cell
|
||||||
.children()
|
.children()
|
||||||
.all(|cell_child| Self::is_phrasing_content(&cell_child))
|
.all(|cell_child| Self::is_phrasing_content(&cell_child))
|
||||||
|
@ -1119,10 +1147,14 @@ impl Readability {
|
||||||
} else {
|
} else {
|
||||||
"div"
|
"div"
|
||||||
};
|
};
|
||||||
Self::set_node_tag(&cell, tag);
|
cell = Self::set_node_tag(&cell, tag);
|
||||||
|
if let Some(parent) = table_node.parent() {
|
||||||
|
parent.append(cell);
|
||||||
|
table_node.detach();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Using a variety of metrics (content score, classname, element types), find the content that is most likely to be the stuff
|
/// Using a variety of metrics (content score, classname, element types), find the content that is most likely to be the stuff
|
||||||
|
|
Reference in a new issue