From d99b1c687b7d7f654449a60f4e285faa1ecfa5ee Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Tue, 20 Oct 2020 10:13:34 +0300 Subject: [PATCH] Fix counting of h2 nodes in prep_article Add test for prep_article --- src/moz_readability/mod.rs | 193 ++++++++++++++++++++++++++++++++++++- 1 file changed, 190 insertions(+), 3 deletions(-) diff --git a/src/moz_readability/mod.rs b/src/moz_readability/mod.rs index 299ddc3..7c10d7a 100644 --- a/src/moz_readability/mod.rs +++ b/src/moz_readability/mod.rs @@ -1065,9 +1065,9 @@ impl Readability { }); }); - let mut h2 = node_ref.select("h2").unwrap(); - if h2.by_ref().count() == 1 { - let h2_node = h2.next().unwrap(); + let h2_nodes = node_ref.select("h2").unwrap().take(2).collect::<Vec<_>>(); + if h2_nodes.len() == 1 { + let h2_node = h2_nodes[0].as_node(); let length_similar_rate = ((h2_node.text_contents().len() - self.article_title.len()) as f32) / self.article_title.len() as f32; @@ -2888,4 +2888,191 @@ characters. For that reason, this

tag could not be a byline because it's too assert_eq!(2, p_count); assert_eq!(0, li_count); } + + #[test] + fn test_prep_article() { + let html_str = r#" + + + + A test HTML file + + +

A test HTML file

+ + +

A h1 tag

+

A h1 tag to be removed

+
+ + + + + + +
LeftMain Content of the systemRight
+
A red box
+
+ A blue box +
+ + + + + + + + + + + + + + + + + + + +
Monthly savings
MonthSavings
January$100
February$50
+ +
+
+
+ Personal details: + +

+ +

+
+
+
+ +
+ +
+ + + + + +
One cell table. This is going to be replaced
+ +
+ +
+
+
+ + +
+ +
+
+
+ +
+

+ Lorem ipsum dolor, sit amet consectetur adipisicing elit. Minima quia numquam aperiam dolores ipsam, eos perferendis cupiditate adipisci perspiciatis + dolore, sunt, iusto nobis? Nulla molestiae id repellat quibusdam nobis quia. Lorem ipsum dolor sit amet consectetur, adipisicing elit. Voluptas + laudantium omnis nemo qui libero? Eius suscipit veritatis, tenetur impedit et voluptatibus. Rerum repellat totam quam nobis harum fuga consequatur + corrupti? Lorem ipsum dolor sit amet consectetur, adipisicing elit. Iure excepturi accusamus nemo voluptatibus laborum minus dicta blanditiis totam + aperiam velit amet cupiditate hic a molestias odio nam, fugiat facere iusto. +

+
+ + + + +
One cell table. This is going to be replaced
+
A Paperoni test
+ + + + "#; + let mut doc = Readability::new(html_str); + doc.article_title = "A test HTML file".into(); + let body = doc.root_node.select_first("body").unwrap(); + doc.prep_article(&mut body.as_node().clone()); + + // Ensure tables were assigned their data table scores + let table_node = doc.root_node.select_first("table").unwrap(); + let node_attr = table_node.attributes.borrow(); + assert_eq!(true, node_attr.get("readability-data-table").is_some()); + + let forms_and_fieldsets = doc.root_node.select("form, fieldset").unwrap(); + assert_eq!(0, forms_and_fieldsets.count()); + + let nodes = doc + .root_node + .select("h1, object, embed, footer, link, aside") + .unwrap(); + assert_eq!(0, nodes.count()); + + assert_eq!(2, doc.root_node.select("p").unwrap().count()); + assert_eq!(true, doc.root_node.select_first("p.share").is_err()); + assert_eq!(true, doc.root_node.select_first("p#share").is_ok()); + assert_eq!(true, doc.root_node.select_first("p#td-to-p").is_ok()); + + let node = doc.root_node.select_first("h2"); + assert_eq!(true, node.is_err()); + + let nodes = doc + .root_node + .select("input, textarea, select, button") + .unwrap(); + assert_eq!(0, nodes.count()); + + let nodes = doc.root_node.select("iframe").unwrap(); + assert_eq!(1, nodes.count()); + let node = doc.root_node.select_first("iframe#yt"); + assert_eq!(true, node.is_ok()); + + let nodes = doc.root_node.select("h1").unwrap(); + assert_eq!(0, nodes.count()); + + let nodes = doc + .root_node + .select("#tbl-one, #tbl-replace-p, #tbl-replace-div") + .unwrap(); + assert_eq!(0, nodes.count()); + + let tables = doc.root_node.select("#tbl-two, #tbl-three").unwrap(); + assert_eq!(2, tables.count()); + + assert_eq!(true, doc.root_node.select_first("ul").is_ok()); + + assert_eq!(2, doc.root_node.select("div").unwrap().count()); + assert_eq!(true, doc.root_node.select_first("div#div-p").is_ok()); + assert_eq!(true, doc.root_node.select_first("div#td-to-div").is_ok()); + + assert_eq!(1, 
doc.root_node.select("br").unwrap().count()); + let node_ref = doc.root_node.select_first("br").unwrap(); + assert_eq!( + "div", + &node_ref + .as_node() + .following_siblings() + .elements() + .next() + .unwrap() + .name + .local + ); + } }