diff --git a/src/moz_readability/mod.rs b/src/moz_readability/mod.rs
index 15b7ec8..372e0ec 100644
--- a/src/moz_readability/mod.rs
+++ b/src/moz_readability/mod.rs
@@ -811,11 +811,10 @@ impl Readability {
copy_to = "src";
}
if copy_to.len() > 0 {
+ let new_val = val.value.clone();
let tag_name = &node.name.local;
if tag_name == "img" || tag_name == "picture" {
- if let Some(attr) = node_attr.get_mut(copy_to) {
- *attr = val.value.clone();
- }
+ node_attr.insert(copy_to, new_val);
} else if tag_name == "figure" {
let node_ref = node.as_node();
let img_picture_nodes = node_ref.select("img, picture").unwrap();
@@ -831,7 +830,7 @@ impl Readability {
{
let mut img_attr =
img.as_element().unwrap().attributes.borrow_mut();
- img_attr.insert(copy_to, val.value.clone());
+ img_attr.insert(copy_to, new_val);
}
node_ref.append(img);
}
@@ -850,7 +849,7 @@ impl Readability {
let is_data_table = |node_ref: &NodeRef| {
let node_elem = node_ref.as_element().unwrap();
let attrs = node_elem.attributes.borrow();
- !(attrs.get("readability-data-table") == Some("true"))
+ attrs.get("readability-data-table") == Some("true")
};
let get_char_count = |node_ref: &NodeRef| node_ref.text_contents().matches(",").count();
let node_name = &node_ref.as_element().unwrap().name.local;
@@ -858,10 +857,10 @@ impl Readability {
if node_name == tag_name {
nodes.next();
}
- nodes
+ let mut nodes = nodes
// Do not remove data tables
.filter(|node_data_ref| {
- !(node_name == "table" && is_data_table(node_data_ref.as_node()))
+ !(&node_data_ref.name.local == "table" && is_data_table(node_data_ref.as_node()))
})
// Do not remove if it is a child of a data table
.filter(|node_data_ref| {
@@ -2518,7 +2517,124 @@ characters. For that reason, this
tag could not be a byline because it's too
}
#[test]
- fn test_fix_lazy_images() {}
+ fn test_fix_lazy_images() { // Exercises Readability::fix_lazy_images against the inline HTML fixture below.
+ let html_str = r#"
+
+
+
+
+
+
+
+
+
+
+
+ "#;
+ let doc = Readability::new(html_str); // Build the document under test from the fixture.
+ let svg_uri = doc.root_node.select_first("#svg-uri").unwrap(); // Handles captured BEFORE the call, compared again afterwards.
+ let normal_src = doc.root_node.select_first("#normal-src").unwrap();
+ let gif_uri = doc.root_node.select_first("#gif-uri").unwrap();
+ let picture = doc.root_node.select_first("picture").unwrap();
+ Readability::fix_lazy_images(&mut doc.root_node.clone()); // NOTE(review): presumably the clone is a handle to the same tree, since the checks below read doc.root_node - confirm.
+ assert_eq!(svg_uri, doc.root_node.select_first("#svg-uri").unwrap()); // NOTE(review): pre-call handle vs. fresh lookup; confirm this equality checks content and not just node identity.
+ assert_eq!(
+ normal_src,
+ doc.root_node.select_first("#normal-src").unwrap()
+ );
+ assert_eq!(gif_uri, doc.root_node.select_first("#gif-uri").unwrap());
+ assert_eq!(picture, doc.root_node.select_first("picture").unwrap());
+ // For the two elements below, `src` is expected to equal `data-src` after the call.
+ let gif_uri_remove_src = doc.root_node.select_first("#gif-uri-remove-src").unwrap();
+ let gif_uri_remove_src_attrs = gif_uri_remove_src.attributes.borrow(); // Read-only borrow of the element's attributes.
+ assert_eq!(
+ gif_uri_remove_src_attrs.get("data-src"),
+ gif_uri_remove_src_attrs.get("src")
+ );
+ let lazy_loaded = doc.root_node.select_first("#lazy-loaded").unwrap();
+ let lazy_loaded_attrs = lazy_loaded.attributes.borrow();
+ assert_eq!(
+ lazy_loaded_attrs.get("data-src"),
+ lazy_loaded_attrs.get("src")
+ );
+ }
+
+ #[test]
+ fn test_clean_conditionally() { // Exercises Readability::clean_conditionally for "table" and then "div" on one fixture.
+ let html_str = r#"
+
+
+
+
+ Monthly savings
+
+ Month |
+ Savings |
+
+
+ January |
+ $100 |
+
+
+ February |
+ $50 |
+
+
+
+
+ Left |
+ Main |
+ Right |
+
+
+
+
+
+ The days of the week: Mon, Tue, Wed, Thur, Fri, Sat, Sun.
+ The months of the year: Jan, Feb, Mar, Apr, May, Jun, Jul, Aug, Oct, Nov, Dec.
+
+
+
+
+
+
+
+ "#;
+ let mut doc = Readability::new(html_str); // Mutable because mark_data_tables is called on it below.
+ let body = doc.root_node.select_first("body").unwrap(); // Both cleaning passes are run on <body>.
+ doc.mark_data_tables(); // NOTE(review): presumably flags data tables so the cleaning pass keeps them - confirm against mark_data_tables.
+ Readability::clean_conditionally(&mut body.as_node().clone(), "table"); // First pass: tag "table".
+ assert_eq!(true, doc.root_node.select_first("#data-table").is_ok()); // #data-table must still be selectable.
+ assert_eq!(false, doc.root_node.select_first("#display-table").is_ok()); // #display-table must no longer be selectable.
+ assert_eq!(
+ false,
+ doc.root_node.select_first("#display-table-removed").is_ok()
+ );
+ Readability::clean_conditionally(&mut body.as_node().clone(), "div"); // Second pass: tag "div".
+ assert_eq!(false, doc.root_node.select_first("div.comment").is_ok()); // Expected gone: div.comment (and div#footer below).
+ assert_eq!(true, doc.root_node.select_first("div#some-content").is_ok()); // Expected kept: div#some-content and div#embeds.
+ assert_eq!(true, doc.root_node.select_first("div#embeds").is_ok());
+ assert_eq!(false, doc.root_node.select_first("div#footer").is_ok());
+ }
+
#[test]
fn test_clean() {
let html_str = r#"
The parent div will be deleted due to negative weight classes
+