Fix calls to select to return an iterator excluding the original

calling node.

Edit `next_element` to either return an element node only or element/
text node
This commit is contained in:
Kenneth Gitere 2020-10-16 14:57:26 +03:00
parent 6377c01fb3
commit 3254064c0d

View file

@ -175,8 +175,9 @@ impl Readability {
.borrow_mut() .borrow_mut()
.insert(attr_name, prev_value.value.clone()); .insert(attr_name, prev_value.value.clone());
} }
// WARN: This assumes `next_element` returns an element node!!
let inner_node_child = Self::next_element(inner_node_ref.first_child()); let inner_node_child =
Self::next_element(inner_node_ref.first_child(), true);
prev_elem.insert_after(inner_node_child.unwrap()); prev_elem.insert_after(inner_node_child.unwrap());
prev_elem.detach(); prev_elem.detach();
} }
@ -218,8 +219,9 @@ impl Readability {
/// <div>foo<br>bar<p>abc</p></div> /// <div>foo<br>bar<p>abc</p></div>
fn replace_brs(&mut self) { fn replace_brs(&mut self) {
if let Ok(mut br_tags) = self.root_node.select("br") { if let Ok(mut br_tags) = self.root_node.select("br") {
// The uses of `next_element` here are safe as it explicitly ensures the next element is an element node
while let Some(br_tag) = br_tags.next() { while let Some(br_tag) = br_tags.next() {
let mut next = Self::next_element(br_tag.as_node().next_sibling()); let mut next = Self::next_element(br_tag.as_node().next_sibling(), false);
let mut replaced = false; let mut replaced = false;
while let Some(next_elem) = next { while let Some(next_elem) = next {
if next_elem.as_element().is_some() if next_elem.as_element().is_some()
@ -228,7 +230,7 @@ impl Readability {
replaced = true; replaced = true;
let br_sibling = next_elem.next_sibling(); let br_sibling = next_elem.next_sibling();
next_elem.detach(); next_elem.detach();
next = Self::next_element(br_sibling); next = Self::next_element(br_sibling, false);
} else { } else {
break; break;
} }
@ -331,7 +333,9 @@ impl Readability {
/// Finds the next element, starting from the given node, and ignoring /// Finds the next element, starting from the given node, and ignoring
/// whitespace in between. If the given node is an element, the same node is /// whitespace in between. If the given node is an element, the same node is
/// returned. /// returned.
fn next_element(node_ref: Option<NodeRef>) -> Option<NodeRef> { /// The must_be_element argument ensure the next element is actually an element node.
/// This is likely to factored out into a new function.
fn next_element(node_ref: Option<NodeRef>, must_be_element: bool) -> Option<NodeRef> {
// TODO: Could probably be refactored to use the elements method // TODO: Could probably be refactored to use the elements method
let mut node_ref = node_ref; let mut node_ref = node_ref;
while node_ref.is_some() { while node_ref.is_some() {
@ -340,6 +344,10 @@ impl Readability {
_ => { _ => {
if node_ref.as_ref().unwrap().text_contents().trim().is_empty() { if node_ref.as_ref().unwrap().text_contents().trim().is_empty() {
node_ref = node_ref.as_ref().unwrap().next_sibling(); node_ref = node_ref.as_ref().unwrap().next_sibling();
} else if must_be_element
&& !node_ref.as_ref().unwrap().text_contents().trim().is_empty()
{
node_ref = node_ref.as_ref().unwrap().next_sibling();
} else { } else {
break; break;
} }
@ -448,16 +456,17 @@ impl Readability {
/// ///
/// Calling this in a loop will traverse the DOM depth-first. /// Calling this in a loop will traverse the DOM depth-first.
fn get_next_node(node_ref: &NodeRef, ignore_self_and_kids: bool) -> Option<NodeRef> { fn get_next_node(node_ref: &NodeRef, ignore_self_and_kids: bool) -> Option<NodeRef> {
// WARN: The uses of `next_element` here assume it returns an element node.
let has_elem_children = node_ref.children().elements().count(); let has_elem_children = node_ref.children().elements().count();
if !ignore_self_and_kids && has_elem_children > 0 { if !ignore_self_and_kids && has_elem_children > 0 {
Self::next_element(node_ref.first_child()) Self::next_element(node_ref.first_child(), true)
} else if let Some(next_sibling) = Self::next_element(node_ref.next_sibling()) { } else if let Some(next_sibling) = Self::next_element(node_ref.next_sibling(), true) {
Some(next_sibling) Some(next_sibling)
} else { } else {
// Keep walking up the node hierarchy until a parent with element siblings is found // Keep walking up the node hierarchy until a parent with element siblings is found
let mut node = node_ref.parent(); let mut node = node_ref.parent();
while let Some(parent) = node { while let Some(parent) = node {
if let Some(next_sibling) = Self::next_element(parent.next_sibling()) { if let Some(next_sibling) = Self::next_element(parent.next_sibling(), true) {
return Some(next_sibling); return Some(next_sibling);
} else { } else {
node = parent.parent(); node = parent.parent();
@ -845,7 +854,7 @@ impl Readability {
fn clean_conditionally(node_ref: &mut NodeRef, tag_name: &str) { fn clean_conditionally(node_ref: &mut NodeRef, tag_name: &str) {
// TODO: Add flag check // TODO: Add flag check
let is_list = tag_name == "ul" || tag_name == "ol"; let is_list = tag_name == "ul" || tag_name == "ol";
let mut nodes = node_ref.select(tag_name).unwrap(); let mut nodes = node_ref.descendants().select(tag_name).unwrap();
let is_data_table = |node_ref: &NodeRef| { let is_data_table = |node_ref: &NodeRef| {
let node_elem = node_ref.as_element().unwrap(); let node_elem = node_ref.as_element().unwrap();
let attrs = node_elem.attributes.borrow(); let attrs = node_elem.attributes.borrow();
@ -853,10 +862,6 @@ impl Readability {
}; };
let get_char_count = |node_ref: &NodeRef| node_ref.text_contents().matches(",").count(); let get_char_count = |node_ref: &NodeRef| node_ref.text_contents().matches(",").count();
let node_name = &node_ref.as_element().unwrap().name.local; let node_name = &node_ref.as_element().unwrap().name.local;
// Because select returns an inclusive iterator, we should skip the first one.
if node_name == tag_name {
nodes.next();
}
let mut nodes = nodes let mut nodes = nodes
// Do not remove data tables // Do not remove data tables
.filter(|node_data_ref| { .filter(|node_data_ref| {
@ -935,11 +940,7 @@ impl Readability {
fn clean(node_ref: &mut NodeRef, tag_name: &str) { fn clean(node_ref: &mut NodeRef, tag_name: &str) {
// Can be changed to a HashSet // Can be changed to a HashSet
let is_embed = vec!["object", "embed", "iframe"].contains(&tag_name); let is_embed = vec!["object", "embed", "iframe"].contains(&tag_name);
let mut nodes = node_ref.select(tag_name).unwrap(); let mut nodes = node_ref.descendants().select(tag_name).unwrap();
if node_ref.as_element().is_some() && &node_ref.as_element().unwrap().name.local == tag_name
{
nodes.next();
}
let mut nodes = nodes.filter(|node_data_ref| { let mut nodes = nodes.filter(|node_data_ref| {
!is_embed !is_embed
|| { || {
@ -1077,29 +1078,33 @@ impl Readability {
}) })
.for_each(|node_data_ref| node_data_ref.as_node().detach()); .for_each(|node_data_ref| node_data_ref.as_node().detach());
// TODO: Fix the code for deleting nodes
node_ref node_ref
.select("br") .select("br")
.unwrap() .unwrap()
.filter(|node_data_ref| { .filter(|node_data_ref| {
let br_node = node_data_ref.as_node(); let br_node = node_data_ref.as_node();
let next_node = Self::next_element(br_node.next_sibling()); // WARN: This assumes `next_element` returns an element node.
let next_node = Self::next_element(br_node.next_sibling(), true);
next_node.is_some() && &next_node.unwrap().as_element().unwrap().name.local == "p" next_node.is_some() && &next_node.unwrap().as_element().unwrap().name.local == "p"
}) })
.for_each(|node_data_ref| node_data_ref.as_node().detach()); .for_each(|node_data_ref| node_data_ref.as_node().detach());
node_ref.select("table").unwrap().for_each(|node_data_ref| { node_ref.select("table").unwrap().for_each(|node_data_ref| {
let table_node = node_data_ref.as_node(); let table_node = node_data_ref.as_node();
let table_child = Self::next_element(table_node.first_child()); // WARN: This assumes `next_element` returns an element node.
let table_child = Self::next_element(table_node.first_child(), true);
let tbody = if Self::has_single_tag_inside_element(&table_node, "tbody") { let tbody = if Self::has_single_tag_inside_element(&table_node, "tbody") {
table_child.as_ref().unwrap() table_child.as_ref().unwrap()
} else { } else {
table_node table_node
}; };
// WARN: This block assumes `next_element` returns an element node
if Self::has_single_tag_inside_element(&tbody, "tr") { if Self::has_single_tag_inside_element(&tbody, "tr") {
let row = Self::next_element(tbody.first_child()).unwrap(); let row = Self::next_element(tbody.first_child(), true).unwrap();
if Self::has_single_tag_inside_element(&row, "td") { if Self::has_single_tag_inside_element(&row, "td") {
let cell = Self::next_element(row.first_child()).unwrap(); let cell = Self::next_element(row.first_child(), true).unwrap();
let tag = if cell let tag = if cell
.children() .children()
.all(|cell_child| Self::is_phrasing_content(&cell_child)) .all(|cell_child| Self::is_phrasing_content(&cell_child))
@ -1623,12 +1628,13 @@ mod test {
<!-- Commented content --> <!-- Commented content -->
<p id="b">This is another node. The next line is just whitespace</p> <p id="b">This is another node. The next line is just whitespace</p>
This is standalone text"#; This is standalone text
<p> Some <span>more</span> text</p>"#;
let doc = Readability::new(html_str); let doc = Readability::new(html_str);
let p = doc.root_node.select_first("#a").unwrap(); let p = doc.root_node.select_first("#a").unwrap();
let p = p.as_node(); let p = p.as_node();
let mut p_node_option: Option<NodeRef> = Some(p.clone()); let mut p_node_option: Option<NodeRef> = Some(p.clone());
p_node_option = Readability::next_element(p_node_option); p_node_option = Readability::next_element(p_node_option, false);
assert_eq!(Some(p.clone()), p_node_option); assert_eq!(Some(p.clone()), p_node_option);
let p_node_option = p_node_option.unwrap(); let p_node_option = p_node_option.unwrap();
@ -1636,20 +1642,20 @@ mod test {
let p_node_option_attr = p_node_option.unwrap().attributes.borrow(); let p_node_option_attr = p_node_option.unwrap().attributes.borrow();
assert_eq!("a", p_node_option_attr.get("id").unwrap()); assert_eq!("a", p_node_option_attr.get("id").unwrap());
let next = Readability::next_element(p.next_sibling()); let next = Readability::next_element(p.next_sibling(), false);
let next = next.unwrap(); let next = next.unwrap();
let next_elem = next.as_element(); let next_elem = next.as_element();
let next_attr = next_elem.unwrap().attributes.borrow(); let next_attr = next_elem.unwrap().attributes.borrow();
assert_eq!("b", next_attr.get("id").unwrap()); assert_eq!("b", next_attr.get("id").unwrap());
let next = Readability::next_element(next.next_sibling()); let next = Readability::next_element(next.next_sibling(), false);
let next = next.unwrap(); let next = next.unwrap();
assert_eq!(true, next.as_text().is_some()); assert_eq!(true, next.as_text().is_some());
assert_eq!("This is standalone text", next.text_contents().trim()); assert_eq!("This is standalone text", next.text_contents().trim());
let next = Readability::next_element(None); let next = Readability::next_element(None, false);
assert_eq!(None, next); assert_eq!(None, next);
} }