Fix calls to select
to return an iterator excluding the original
calling node. Edit `next_element` to either return an element node only or element/ text node
This commit is contained in:
parent
6377c01fb3
commit
3254064c0d
1 changed files with 33 additions and 27 deletions
|
@ -175,8 +175,9 @@ impl Readability {
|
||||||
.borrow_mut()
|
.borrow_mut()
|
||||||
.insert(attr_name, prev_value.value.clone());
|
.insert(attr_name, prev_value.value.clone());
|
||||||
}
|
}
|
||||||
|
// WARN: This assumes `next_element` returns an element node!!
|
||||||
let inner_node_child = Self::next_element(inner_node_ref.first_child());
|
let inner_node_child =
|
||||||
|
Self::next_element(inner_node_ref.first_child(), true);
|
||||||
prev_elem.insert_after(inner_node_child.unwrap());
|
prev_elem.insert_after(inner_node_child.unwrap());
|
||||||
prev_elem.detach();
|
prev_elem.detach();
|
||||||
}
|
}
|
||||||
|
@ -218,8 +219,9 @@ impl Readability {
|
||||||
/// <div>foo<br>bar<p>abc</p></div>
|
/// <div>foo<br>bar<p>abc</p></div>
|
||||||
fn replace_brs(&mut self) {
|
fn replace_brs(&mut self) {
|
||||||
if let Ok(mut br_tags) = self.root_node.select("br") {
|
if let Ok(mut br_tags) = self.root_node.select("br") {
|
||||||
|
// The uses of `next_element` here are safe as it explicitly ensures the next element is an element node
|
||||||
while let Some(br_tag) = br_tags.next() {
|
while let Some(br_tag) = br_tags.next() {
|
||||||
let mut next = Self::next_element(br_tag.as_node().next_sibling());
|
let mut next = Self::next_element(br_tag.as_node().next_sibling(), false);
|
||||||
let mut replaced = false;
|
let mut replaced = false;
|
||||||
while let Some(next_elem) = next {
|
while let Some(next_elem) = next {
|
||||||
if next_elem.as_element().is_some()
|
if next_elem.as_element().is_some()
|
||||||
|
@ -228,7 +230,7 @@ impl Readability {
|
||||||
replaced = true;
|
replaced = true;
|
||||||
let br_sibling = next_elem.next_sibling();
|
let br_sibling = next_elem.next_sibling();
|
||||||
next_elem.detach();
|
next_elem.detach();
|
||||||
next = Self::next_element(br_sibling);
|
next = Self::next_element(br_sibling, false);
|
||||||
} else {
|
} else {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -331,7 +333,9 @@ impl Readability {
|
||||||
/// Finds the next element, starting from the given node, and ignoring
|
/// Finds the next element, starting from the given node, and ignoring
|
||||||
/// whitespace in between. If the given node is an element, the same node is
|
/// whitespace in between. If the given node is an element, the same node is
|
||||||
/// returned.
|
/// returned.
|
||||||
fn next_element(node_ref: Option<NodeRef>) -> Option<NodeRef> {
|
/// The must_be_element argument ensure the next element is actually an element node.
|
||||||
|
/// This is likely to factored out into a new function.
|
||||||
|
fn next_element(node_ref: Option<NodeRef>, must_be_element: bool) -> Option<NodeRef> {
|
||||||
// TODO: Could probably be refactored to use the elements method
|
// TODO: Could probably be refactored to use the elements method
|
||||||
let mut node_ref = node_ref;
|
let mut node_ref = node_ref;
|
||||||
while node_ref.is_some() {
|
while node_ref.is_some() {
|
||||||
|
@ -340,6 +344,10 @@ impl Readability {
|
||||||
_ => {
|
_ => {
|
||||||
if node_ref.as_ref().unwrap().text_contents().trim().is_empty() {
|
if node_ref.as_ref().unwrap().text_contents().trim().is_empty() {
|
||||||
node_ref = node_ref.as_ref().unwrap().next_sibling();
|
node_ref = node_ref.as_ref().unwrap().next_sibling();
|
||||||
|
} else if must_be_element
|
||||||
|
&& !node_ref.as_ref().unwrap().text_contents().trim().is_empty()
|
||||||
|
{
|
||||||
|
node_ref = node_ref.as_ref().unwrap().next_sibling();
|
||||||
} else {
|
} else {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -448,16 +456,17 @@ impl Readability {
|
||||||
///
|
///
|
||||||
/// Calling this in a loop will traverse the DOM depth-first.
|
/// Calling this in a loop will traverse the DOM depth-first.
|
||||||
fn get_next_node(node_ref: &NodeRef, ignore_self_and_kids: bool) -> Option<NodeRef> {
|
fn get_next_node(node_ref: &NodeRef, ignore_self_and_kids: bool) -> Option<NodeRef> {
|
||||||
|
// WARN: The uses of `next_element` here assume it returns an element node.
|
||||||
let has_elem_children = node_ref.children().elements().count();
|
let has_elem_children = node_ref.children().elements().count();
|
||||||
if !ignore_self_and_kids && has_elem_children > 0 {
|
if !ignore_self_and_kids && has_elem_children > 0 {
|
||||||
Self::next_element(node_ref.first_child())
|
Self::next_element(node_ref.first_child(), true)
|
||||||
} else if let Some(next_sibling) = Self::next_element(node_ref.next_sibling()) {
|
} else if let Some(next_sibling) = Self::next_element(node_ref.next_sibling(), true) {
|
||||||
Some(next_sibling)
|
Some(next_sibling)
|
||||||
} else {
|
} else {
|
||||||
// Keep walking up the node hierarchy until a parent with element siblings is found
|
// Keep walking up the node hierarchy until a parent with element siblings is found
|
||||||
let mut node = node_ref.parent();
|
let mut node = node_ref.parent();
|
||||||
while let Some(parent) = node {
|
while let Some(parent) = node {
|
||||||
if let Some(next_sibling) = Self::next_element(parent.next_sibling()) {
|
if let Some(next_sibling) = Self::next_element(parent.next_sibling(), true) {
|
||||||
return Some(next_sibling);
|
return Some(next_sibling);
|
||||||
} else {
|
} else {
|
||||||
node = parent.parent();
|
node = parent.parent();
|
||||||
|
@ -845,7 +854,7 @@ impl Readability {
|
||||||
fn clean_conditionally(node_ref: &mut NodeRef, tag_name: &str) {
|
fn clean_conditionally(node_ref: &mut NodeRef, tag_name: &str) {
|
||||||
// TODO: Add flag check
|
// TODO: Add flag check
|
||||||
let is_list = tag_name == "ul" || tag_name == "ol";
|
let is_list = tag_name == "ul" || tag_name == "ol";
|
||||||
let mut nodes = node_ref.select(tag_name).unwrap();
|
let mut nodes = node_ref.descendants().select(tag_name).unwrap();
|
||||||
let is_data_table = |node_ref: &NodeRef| {
|
let is_data_table = |node_ref: &NodeRef| {
|
||||||
let node_elem = node_ref.as_element().unwrap();
|
let node_elem = node_ref.as_element().unwrap();
|
||||||
let attrs = node_elem.attributes.borrow();
|
let attrs = node_elem.attributes.borrow();
|
||||||
|
@ -853,10 +862,6 @@ impl Readability {
|
||||||
};
|
};
|
||||||
let get_char_count = |node_ref: &NodeRef| node_ref.text_contents().matches(",").count();
|
let get_char_count = |node_ref: &NodeRef| node_ref.text_contents().matches(",").count();
|
||||||
let node_name = &node_ref.as_element().unwrap().name.local;
|
let node_name = &node_ref.as_element().unwrap().name.local;
|
||||||
// Because select returns an inclusive iterator, we should skip the first one.
|
|
||||||
if node_name == tag_name {
|
|
||||||
nodes.next();
|
|
||||||
}
|
|
||||||
let mut nodes = nodes
|
let mut nodes = nodes
|
||||||
// Do not remove data tables
|
// Do not remove data tables
|
||||||
.filter(|node_data_ref| {
|
.filter(|node_data_ref| {
|
||||||
|
@ -935,11 +940,7 @@ impl Readability {
|
||||||
fn clean(node_ref: &mut NodeRef, tag_name: &str) {
|
fn clean(node_ref: &mut NodeRef, tag_name: &str) {
|
||||||
// Can be changed to a HashSet
|
// Can be changed to a HashSet
|
||||||
let is_embed = vec!["object", "embed", "iframe"].contains(&tag_name);
|
let is_embed = vec!["object", "embed", "iframe"].contains(&tag_name);
|
||||||
let mut nodes = node_ref.select(tag_name).unwrap();
|
let mut nodes = node_ref.descendants().select(tag_name).unwrap();
|
||||||
if node_ref.as_element().is_some() && &node_ref.as_element().unwrap().name.local == tag_name
|
|
||||||
{
|
|
||||||
nodes.next();
|
|
||||||
}
|
|
||||||
let mut nodes = nodes.filter(|node_data_ref| {
|
let mut nodes = nodes.filter(|node_data_ref| {
|
||||||
!is_embed
|
!is_embed
|
||||||
|| {
|
|| {
|
||||||
|
@ -1077,29 +1078,33 @@ impl Readability {
|
||||||
})
|
})
|
||||||
.for_each(|node_data_ref| node_data_ref.as_node().detach());
|
.for_each(|node_data_ref| node_data_ref.as_node().detach());
|
||||||
|
|
||||||
|
// TODO: Fix the code for deleting nodes
|
||||||
node_ref
|
node_ref
|
||||||
.select("br")
|
.select("br")
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.filter(|node_data_ref| {
|
.filter(|node_data_ref| {
|
||||||
let br_node = node_data_ref.as_node();
|
let br_node = node_data_ref.as_node();
|
||||||
let next_node = Self::next_element(br_node.next_sibling());
|
// WARN: This assumes `next_element` returns an element node.
|
||||||
|
let next_node = Self::next_element(br_node.next_sibling(), true);
|
||||||
next_node.is_some() && &next_node.unwrap().as_element().unwrap().name.local == "p"
|
next_node.is_some() && &next_node.unwrap().as_element().unwrap().name.local == "p"
|
||||||
})
|
})
|
||||||
.for_each(|node_data_ref| node_data_ref.as_node().detach());
|
.for_each(|node_data_ref| node_data_ref.as_node().detach());
|
||||||
|
|
||||||
node_ref.select("table").unwrap().for_each(|node_data_ref| {
|
node_ref.select("table").unwrap().for_each(|node_data_ref| {
|
||||||
let table_node = node_data_ref.as_node();
|
let table_node = node_data_ref.as_node();
|
||||||
let table_child = Self::next_element(table_node.first_child());
|
// WARN: This assumes `next_element` returns an element node.
|
||||||
|
let table_child = Self::next_element(table_node.first_child(), true);
|
||||||
let tbody = if Self::has_single_tag_inside_element(&table_node, "tbody") {
|
let tbody = if Self::has_single_tag_inside_element(&table_node, "tbody") {
|
||||||
table_child.as_ref().unwrap()
|
table_child.as_ref().unwrap()
|
||||||
} else {
|
} else {
|
||||||
table_node
|
table_node
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// WARN: This block assumes `next_element` returns an element node
|
||||||
if Self::has_single_tag_inside_element(&tbody, "tr") {
|
if Self::has_single_tag_inside_element(&tbody, "tr") {
|
||||||
let row = Self::next_element(tbody.first_child()).unwrap();
|
let row = Self::next_element(tbody.first_child(), true).unwrap();
|
||||||
if Self::has_single_tag_inside_element(&row, "td") {
|
if Self::has_single_tag_inside_element(&row, "td") {
|
||||||
let cell = Self::next_element(row.first_child()).unwrap();
|
let cell = Self::next_element(row.first_child(), true).unwrap();
|
||||||
let tag = if cell
|
let tag = if cell
|
||||||
.children()
|
.children()
|
||||||
.all(|cell_child| Self::is_phrasing_content(&cell_child))
|
.all(|cell_child| Self::is_phrasing_content(&cell_child))
|
||||||
|
@ -1623,12 +1628,13 @@ mod test {
|
||||||
<!-- Commented content -->
|
<!-- Commented content -->
|
||||||
<p id="b">This is another node. The next line is just whitespace</p>
|
<p id="b">This is another node. The next line is just whitespace</p>
|
||||||
|
|
||||||
This is standalone text"#;
|
This is standalone text
|
||||||
|
<p> Some <span>more</span> text</p>"#;
|
||||||
let doc = Readability::new(html_str);
|
let doc = Readability::new(html_str);
|
||||||
let p = doc.root_node.select_first("#a").unwrap();
|
let p = doc.root_node.select_first("#a").unwrap();
|
||||||
let p = p.as_node();
|
let p = p.as_node();
|
||||||
let mut p_node_option: Option<NodeRef> = Some(p.clone());
|
let mut p_node_option: Option<NodeRef> = Some(p.clone());
|
||||||
p_node_option = Readability::next_element(p_node_option);
|
p_node_option = Readability::next_element(p_node_option, false);
|
||||||
assert_eq!(Some(p.clone()), p_node_option);
|
assert_eq!(Some(p.clone()), p_node_option);
|
||||||
|
|
||||||
let p_node_option = p_node_option.unwrap();
|
let p_node_option = p_node_option.unwrap();
|
||||||
|
@ -1636,20 +1642,20 @@ mod test {
|
||||||
let p_node_option_attr = p_node_option.unwrap().attributes.borrow();
|
let p_node_option_attr = p_node_option.unwrap().attributes.borrow();
|
||||||
assert_eq!("a", p_node_option_attr.get("id").unwrap());
|
assert_eq!("a", p_node_option_attr.get("id").unwrap());
|
||||||
|
|
||||||
let next = Readability::next_element(p.next_sibling());
|
let next = Readability::next_element(p.next_sibling(), false);
|
||||||
|
|
||||||
let next = next.unwrap();
|
let next = next.unwrap();
|
||||||
let next_elem = next.as_element();
|
let next_elem = next.as_element();
|
||||||
let next_attr = next_elem.unwrap().attributes.borrow();
|
let next_attr = next_elem.unwrap().attributes.borrow();
|
||||||
assert_eq!("b", next_attr.get("id").unwrap());
|
assert_eq!("b", next_attr.get("id").unwrap());
|
||||||
|
|
||||||
let next = Readability::next_element(next.next_sibling());
|
let next = Readability::next_element(next.next_sibling(), false);
|
||||||
|
|
||||||
let next = next.unwrap();
|
let next = next.unwrap();
|
||||||
assert_eq!(true, next.as_text().is_some());
|
assert_eq!(true, next.as_text().is_some());
|
||||||
assert_eq!("This is standalone text", next.text_contents().trim());
|
assert_eq!("This is standalone text", next.text_contents().trim());
|
||||||
|
|
||||||
let next = Readability::next_element(None);
|
let next = Readability::next_element(None, false);
|
||||||
assert_eq!(None, next);
|
assert_eq!(None, next);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Reference in a new issue