diff --git a/src/moz_readability/mod.rs b/src/moz_readability/mod.rs index 15b7ec8..372e0ec 100644 --- a/src/moz_readability/mod.rs +++ b/src/moz_readability/mod.rs @@ -811,11 +811,10 @@ impl Readability { copy_to = "src"; } if copy_to.len() > 0 { + let new_val = val.value.clone(); let tag_name = &node.name.local; if tag_name == "img" || tag_name == "picture" { - if let Some(attr) = node_attr.get_mut(copy_to) { - *attr = val.value.clone(); - } + node_attr.insert(copy_to, new_val); } else if tag_name == "figure" { let node_ref = node.as_node(); let img_picture_nodes = node_ref.select("img, picture").unwrap(); @@ -831,7 +830,7 @@ impl Readability { { let mut img_attr = img.as_element().unwrap().attributes.borrow_mut(); - img_attr.insert(copy_to, val.value.clone()); + img_attr.insert(copy_to, new_val); } node_ref.append(img); } @@ -850,7 +849,7 @@ impl Readability { let is_data_table = |node_ref: &NodeRef| { let node_elem = node_ref.as_element().unwrap(); let attrs = node_elem.attributes.borrow(); - !(attrs.get("readability-data-table") == Some("true")) + attrs.get("readability-data-table") == Some("true") }; let get_char_count = |node_ref: &NodeRef| node_ref.text_contents().matches(",").count(); let node_name = &node_ref.as_element().unwrap().name.local; @@ -858,10 +857,10 @@ impl Readability { if node_name == tag_name { nodes.next(); } - nodes + let mut nodes = nodes // Do not remove data tables .filter(|node_data_ref| { - !(node_name == "table" && is_data_table(node_data_ref.as_node())) + !(&node_data_ref.name.local == "table" && is_data_table(node_data_ref.as_node())) }) // Do not remove if it is a child of a data table .filter(|node_data_ref| { @@ -2518,7 +2517,124 @@ characters. For that reason, this

tag could not be a byline because it's too } #[test] - fn test_fix_lazy_images() {} + fn test_fix_lazy_images() { + let html_str = r#" + + + + Basketball + + star + star + + + + + Flowers + + + + "#; + let doc = Readability::new(html_str); + let svg_uri = doc.root_node.select_first("#svg-uri").unwrap(); + let normal_src = doc.root_node.select_first("#normal-src").unwrap(); + let gif_uri = doc.root_node.select_first("#gif-uri").unwrap(); + let picture = doc.root_node.select_first("picture").unwrap(); + Readability::fix_lazy_images(&mut doc.root_node.clone()); + assert_eq!(svg_uri, doc.root_node.select_first("#svg-uri").unwrap()); + assert_eq!( + normal_src, + doc.root_node.select_first("#normal-src").unwrap() + ); + assert_eq!(gif_uri, doc.root_node.select_first("#gif-uri").unwrap()); + assert_eq!(picture, doc.root_node.select_first("picture").unwrap()); + + let gif_uri_remove_src = doc.root_node.select_first("#gif-uri-remove-src").unwrap(); + let gif_uri_remove_src_attrs = gif_uri_remove_src.attributes.borrow(); + assert_eq!( + gif_uri_remove_src_attrs.get("data-src"), + gif_uri_remove_src_attrs.get("src") + ); + let lazy_loaded = doc.root_node.select_first("#lazy-loaded").unwrap(); + let lazy_loaded_attrs = lazy_loaded.attributes.borrow(); + assert_eq!( + lazy_loaded_attrs.get("data-src"), + lazy_loaded_attrs.get("src") + ); + } + + #[test] + fn test_clean_conditionally() { + let html_str = r#" + + + + + + + + + + + + + + + + + +
Monthly savings
MonthSavings
January$100
February$50
+ + + + + + +
LeftMainRight
+ + + + + +
OneTwo
+

+

The parent div will be deleted due to negative weight classes

+
+
+ The days of the week: Mon, Tue, Wed, Thur, Fri, Sat, Sun. + The months of the year: Jan, Feb, Mar, Apr, May, Jun, Jul, Aug, Oct, Nov, Dec. +
+
+ +
+ + + + "#; + let mut doc = Readability::new(html_str); + let body = doc.root_node.select_first("body").unwrap(); + doc.mark_data_tables(); + Readability::clean_conditionally(&mut body.as_node().clone(), "table"); + assert_eq!(true, doc.root_node.select_first("#data-table").is_ok()); + assert_eq!(false, doc.root_node.select_first("#display-table").is_ok()); + assert_eq!( + false, + doc.root_node.select_first("#display-table-removed").is_ok() + ); + Readability::clean_conditionally(&mut body.as_node().clone(), "div"); + assert_eq!(false, doc.root_node.select_first("div.comment").is_ok()); + assert_eq!(true, doc.root_node.select_first("div#some-content").is_ok()); + assert_eq!(true, doc.root_node.select_first("div#embeds").is_ok()); + assert_eq!(false, doc.root_node.select_first("div#footer").is_ok()); + } + #[test] fn test_clean() { let html_str = r#"