From e1debf5630df594c0e587261352591d75d3cdb2b Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Mon, 31 Aug 2020 19:30:09 +0300 Subject: [PATCH] Add moz_readability initial code and accompanying unit tests This currently contains the preprocessing code of the Readability. It is a port of Readability.js by Mozilla. --- Cargo.lock | 5 +- Cargo.toml | 3 +- src/main.rs | 1 + src/moz_readability/mod.rs | 653 +++++++++++++++++++++++++++++++++++++ test_html/simple.html | 25 ++ 5 files changed, 684 insertions(+), 3 deletions(-) create mode 100644 src/moz_readability/mod.rs create mode 100644 test_html/simple.html diff --git a/Cargo.lock b/Cargo.lock index fafdfb7..f3ad4b5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -704,9 +704,9 @@ dependencies = [ [[package]] name = "kuchiki" -version = "0.8.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1beeffc5ae5ab0def2cb85e26063a8e6b4f579b0adec3805bf87472086948956" +checksum = "1ea8e9c6e031377cff82ee3001dc8026cdf431ed4e2e6b51f98ab8c73484a358" dependencies = [ "cssparser", "html5ever 0.25.1", @@ -1014,6 +1014,7 @@ version = "0.1.0" dependencies = [ "async-std", "epub-builder", + "html5ever 0.25.1", "kuchiki", "md5", "structopt", diff --git a/Cargo.toml b/Cargo.toml index 801a3a8..64afcb0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,8 @@ license = "MIT" [dependencies] async-std = "1.5.0" epub-builder = "0.4.5" -kuchiki = "0.8.0" +html5ever = "0.25.1" +kuchiki = "0.8.1" md5 = "0.7.0" surf = "1.0.3" structopt = { version = "0.3" } diff --git a/src/main.rs b/src/main.rs index 6f15e9e..8284875 100644 --- a/src/main.rs +++ b/src/main.rs @@ -7,6 +7,7 @@ use url::Url; mod cli; mod extractor; +mod moz_readability; use extractor::Extractor; fn main() { diff --git a/src/moz_readability/mod.rs b/src/moz_readability/mod.rs new file mode 100644 index 0000000..170d7e8 --- /dev/null +++ b/src/moz_readability/mod.rs @@ -0,0 +1,653 @@ +use std::collections::BTreeMap; + +use crate::extractor::MetaAttr; + +use html5ever::{LocalName, Namespace, QualName}; +use kuchiki::{ + iter::{Descendants, Elements, Select}, + traits::*, + NodeData, NodeRef, +}; + +const HTML_NS: &'static str = "http://www.w3.org/1999/xhtml"; +const PHRASING_ELEMS: [&str; 39] = [ + "abbr", "audio", "b", "bdo", "br", "button", "cite", "code", "data", "datalist", "dfn", "em", + "embed", "i", "img", "input", "kbd", "label", "mark", "math", "meter", "noscript", "object", + "output", "progress", "q", "ruby", "samp", "script", "select", "small", "span", "strong", + "sub", "sup", "textarea", "time", "var", "wbr", +]; + +pub struct Readability { + root_node: NodeRef, +} + +impl Readability { + pub fn new(html_str: &str) -> Self { + Self { + root_node: kuchiki::parse_html().one(html_str), + } + } + pub fn parse(&mut self) { + self.unwrap_no_script_tags(); + } + /// Recursively check if node is image, or if node contains exactly only one image + /// whether as a direct child or as its descendants. + fn is_single_image(node_ref: &NodeRef) -> bool { + if let Some(element) = node_ref.as_element() { + if &element.name.local == "img" { + return true; + } + } + + if node_ref.children().filter(Self::has_content).count() != 1 + || !node_ref.text_contents().trim().is_empty() + { + return false; + } + + return Readability::is_single_image( + &node_ref + .children() + .filter(Self::has_content) + .next() + .expect("Unable to get first child which should exist"), + ); + } + + fn has_content(node_ref: &NodeRef) -> bool { + match node_ref.data() { + NodeData::Text(text) => !text.borrow().trim().is_empty(), + _ => true, + } + } + + /// Find all