Add moz_readability initial code and accompanying unit tests

This currently contains the preprocessing code of the Readability.
It is a port of Readability.js by Mozilla.
This commit is contained in:
Kenneth Gitere 2020-08-31 19:30:09 +03:00
parent a27e45b5f3
commit e1debf5630
5 changed files with 684 additions and 3 deletions

5
Cargo.lock generated
View file

@ -704,9 +704,9 @@ dependencies = [
[[package]] [[package]]
name = "kuchiki" name = "kuchiki"
version = "0.8.0" version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1beeffc5ae5ab0def2cb85e26063a8e6b4f579b0adec3805bf87472086948956" checksum = "1ea8e9c6e031377cff82ee3001dc8026cdf431ed4e2e6b51f98ab8c73484a358"
dependencies = [ dependencies = [
"cssparser", "cssparser",
"html5ever 0.25.1", "html5ever 0.25.1",
@ -1014,6 +1014,7 @@ version = "0.1.0"
dependencies = [ dependencies = [
"async-std", "async-std",
"epub-builder", "epub-builder",
"html5ever 0.25.1",
"kuchiki", "kuchiki",
"md5", "md5",
"structopt", "structopt",

View file

@ -10,7 +10,8 @@ license = "MIT"
[dependencies] [dependencies]
async-std = "1.5.0" async-std = "1.5.0"
epub-builder = "0.4.5" epub-builder = "0.4.5"
kuchiki = "0.8.0" html5ever = "0.25.1"
kuchiki = "0.8.1"
md5 = "0.7.0" md5 = "0.7.0"
surf = "1.0.3" surf = "1.0.3"
structopt = { version = "0.3" } structopt = { version = "0.3" }

View file

@ -7,6 +7,7 @@ use url::Url;
mod cli; mod cli;
mod extractor; mod extractor;
mod moz_readability;
use extractor::Extractor; use extractor::Extractor;
fn main() { fn main() {

653
src/moz_readability/mod.rs Normal file
View file

@ -0,0 +1,653 @@
use std::collections::BTreeMap;
use crate::extractor::MetaAttr;
use html5ever::{LocalName, Namespace, QualName};
use kuchiki::{
iter::{Descendants, Elements, Select},
traits::*,
NodeData, NodeRef,
};
const HTML_NS: &'static str = "http://www.w3.org/1999/xhtml";
const PHRASING_ELEMS: [&str; 39] = [
"abbr", "audio", "b", "bdo", "br", "button", "cite", "code", "data", "datalist", "dfn", "em",
"embed", "i", "img", "input", "kbd", "label", "mark", "math", "meter", "noscript", "object",
"output", "progress", "q", "ruby", "samp", "script", "select", "small", "span", "strong",
"sub", "sup", "textarea", "time", "var", "wbr",
];
pub struct Readability {
root_node: NodeRef,
}
impl Readability {
pub fn new(html_str: &str) -> Self {
Self {
root_node: kuchiki::parse_html().one(html_str),
}
}
pub fn parse(&mut self) {
self.unwrap_no_script_tags();
}
/// Recursively check if node is image, or if node contains exactly only one image
/// whether as a direct child or as its descendants.
fn is_single_image(node_ref: &NodeRef) -> bool {
if let Some(element) = node_ref.as_element() {
if &element.name.local == "img" {
return true;
}
}
if node_ref.children().filter(Self::has_content).count() != 1
|| !node_ref.text_contents().trim().is_empty()
{
return false;
}
return Readability::is_single_image(
&node_ref
.children()
.filter(Self::has_content)
.next()
.expect("Unable to get first child which should exist"),
);
}
fn has_content(node_ref: &NodeRef) -> bool {
match node_ref.data() {
NodeData::Text(text) => !text.borrow().trim().is_empty(),
_ => true,
}
}
/// Find all <noscript> that are located after <img> nodes, and which contain only one <img> element.
/// Replace the first image with the image from inside the <noscript> tag, and remove the <noscript> tag.
/// This improves the quality of the images we use on some sites (e.g. Medium).
fn unwrap_no_script_tags(&mut self) {
if let Ok(imgs) = self.root_node.select("img") {
imgs.filter(|img_node_ref| {
let img_attrs = img_node_ref.attributes.borrow();
!img_attrs.map.iter().any(|(name, attr)| {
// TODO: Replace with regex
&name.local == "src"
|| &name.local == "srcset"
|| &name.local == "data-src"
|| &name.local == "data-srcset"
|| attr.value.ends_with(".jpg")
|| attr.value.ends_with(".jpeg")
|| attr.value.ends_with(".png")
|| attr.value.ends_with(".webp")
})
})
.for_each(|img_ref| img_ref.as_node().detach());
}
if let Ok(noscripts) = self.root_node.select("noscript") {
for noscript in noscripts {
let inner_node_ref = kuchiki::parse_fragment(
QualName::new(None, Namespace::from(HTML_NS), LocalName::from("div")),
Vec::new(),
)
.one(noscript.text_contents());
if !Self::is_single_image(&inner_node_ref) {
continue;
}
if let Some(mut prev_elem) = noscript.as_node().previous_sibling() {
// TODO: Fix this to have a better way of extracting nodes that are elements
while prev_elem.as_element().is_none() {
match prev_elem.previous_sibling() {
Some(new_prev) => prev_elem = new_prev,
None => break,
};
}
if Self::is_single_image(&prev_elem) && prev_elem.as_element().is_some() {
let prev_img = if &prev_elem.as_element().unwrap().name.local != "img" {
prev_elem.select_first("img").unwrap().as_node().clone()
} else {
prev_elem.clone()
};
let new_img = inner_node_ref.select_first("img").unwrap();
let prev_attrs = prev_img.as_element().unwrap().attributes.borrow();
let prev_attrs = prev_attrs.map.iter().filter(|(attr, val)| {
!val.value.trim().is_empty()
&& (&attr.local == "src"
|| &attr.local == "srcset"
// TODO: Replace with regex
|| val.value.ends_with(".jpg")
|| val.value.ends_with(".jpeg")
|| val.value.ends_with(".png")
|| val.value.ends_with(".webp"))
});
for (prev_attr, prev_value) in prev_attrs {
match new_img.attributes.borrow().get(&prev_attr.local) {
Some(value) => {
if value == prev_value.value {
continue;
}
}
None => (),
}
let attr_name: &str = &prev_attr.local;
let mut attr_name = attr_name.to_owned();
if new_img.attributes.borrow().contains(attr_name.clone()) {
let new_name = format!("data-old-{}", &attr_name);
attr_name = new_name;
}
new_img
.attributes
.borrow_mut()
.insert(attr_name, prev_value.value.clone());
}
// TODO: Replace the code below with next_element
prev_elem.insert_after(
inner_node_ref
.first_child()
.unwrap()
.children()
.filter(Self::has_content)
.next()
.unwrap(),
);
prev_elem.detach();
}
}
}
}
}
/// Removes script tags from the document.
fn remove_scripts(&mut self) {
match self.root_node.select("script") {
Ok(script_elems) => script_elems.for_each(|elem| elem.as_node().detach()),
Err(_) => (),
}
match self.root_node.select("noscript") {
Ok(noscript_elems) => noscript_elems.for_each(|elem| elem.as_node().detach()),
Err(_) => (),
}
}
/// Prepare the HTML document for readability to scrape it. This includes things like stripping
/// CSS, and handling terrible markup.
fn prep_document(&mut self) {
match self.root_node.select("style") {
Ok(style_elems) => style_elems.for_each(|elem| elem.as_node().detach()),
Err(_) => (),
}
self.replace_brs();
match self.root_node.select("font") {
Ok(nodes_iter) => Self::replace_node_tags(nodes_iter, "span"),
Err(_) => (),
}
}
/// Replaces 2 or more successive <br> elements with a single <p>.
/// Whitespace between <br> elements are ignored. For example:
/// <div>foo<br>bar<br> <br><br>abc</div>
/// will become:
/// <div>foo<br>bar<p>abc</p></div>
fn replace_brs(&mut self) {
if let Ok(mut br_tags) = self.root_node.select("br") {
// TODO: Change to for loop
while let Some(br_tag) = br_tags.next() {
let mut next = br_tag.as_node().next_sibling();
let mut replaced = false;
Self::next_element(&mut next);
while let Some(next_elem) = next {
if next_elem.as_element().is_some()
&& &next_elem.as_element().as_ref().unwrap().name.local == "br"
{
replaced = true;
let br_sibling = next_elem.next_sibling();
next_elem.detach();
next = br_sibling;
} else {
break;
}
Self::next_element(&mut next);
}
if replaced {
let p = NodeRef::new_element(
QualName::new(None, Namespace::from(HTML_NS), LocalName::from("p")),
BTreeMap::new(),
);
br_tag.as_node().insert_before(p);
let p = br_tag.as_node().previous_sibling().unwrap();
br_tag.as_node().detach();
next = p.next_sibling();
while next.is_some() {
let next_sibling = next.unwrap();
if let Some(next_elem) = next_sibling.as_element() {
if &next_elem.name.local == "br" {
if let Some(second_sibling) = next_sibling.next_sibling() {
if second_sibling.as_element().is_some()
&& "br" == &second_sibling.as_element().unwrap().name.local
{
break;
}
}
}
}
if !Self::is_phrasing_content(&next_sibling) {
break;
}
let sibling = next_sibling.next_sibling();
p.append(next_sibling);
next = sibling;
}
while let Some(first_child) = p.first_child() {
if Self::is_whitespace(&first_child) {
first_child.detach();
} else {
break;
}
}
while let Some(last_child) = p.last_child() {
if Self::is_whitespace(&last_child) {
last_child.detach();
} else {
break;
}
}
if let Some(parent) = p.parent() {
if &parent.as_element().as_ref().unwrap().name.local == "p" {
Self::set_node_tag(&parent, "div");
}
}
}
}
}
}
/// Iterates over a Select, and calls set_node_tag for each node.
fn replace_node_tags(nodes: Select<Elements<Descendants>>, name: &str) {
for node in nodes {
Self::set_node_tag(node.as_node(), name);
}
}
/// Replaces the specified NodeRef by replacing its name. This works by copying over its
/// children and its attributes.
fn set_node_tag(node_ref: &NodeRef, name: &str) {
// TODO: Change function to own node_ref so that a user does not try to use it after dropping
match node_ref.as_element() {
Some(elem) => {
let attributes = elem.attributes.borrow().clone().map.into_iter();
let replacement = NodeRef::new_element(
QualName::new(None, Namespace::from(HTML_NS), LocalName::from(name)),
attributes,
);
for child in node_ref.children() {
replacement.append(child);
}
node_ref.insert_before(replacement);
node_ref.detach();
}
None => (),
}
}
fn is_whitespace(node_ref: &NodeRef) -> bool {
match node_ref.data() {
NodeData::Element(elem_data) => &elem_data.name.local == "br",
NodeData::Text(text_ref) => text_ref.borrow().trim().len() == 0,
_ => false,
}
}
/// Finds the next element, starting from the given node, and ignoring
/// whitespace in between. If the given node is an element, the same node is
/// returned.
fn next_element(node_ref: &mut Option<NodeRef>) {
// The signature of this method affects how it is used in loops since the code
// has to ensure that it is called before the next iteration. This probably
// makes it less obvious to understand the first time. This may change in the future.
while node_ref.is_some() {
match node_ref.as_ref().unwrap().data() {
NodeData::Element(_) => break,
_ => {
if node_ref.as_ref().unwrap().text_contents().trim().is_empty() {
*node_ref = node_ref.as_ref().unwrap().next_sibling();
} else {
break;
}
}
}
}
}
/// Determine if a node qualifies as phrasing content.
/// https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content
fn is_phrasing_content(node_ref: &NodeRef) -> bool {
node_ref.as_text().is_some()
|| match node_ref.as_element() {
Some(elem) => {
let name: &str = &elem.name.local;
PHRASING_ELEMS.contains(&name)
|| ((name == "a" || name == "del" || name == "ins")
&& node_ref
.children()
.all(|child_ref| Self::is_phrasing_content(&child_ref)))
}
None => false,
}
}
///Attempts to get excerpt and byline metadata for the article. @return Object with optional "excerpt" and "byline" properties
fn get_article_metadata(&self) -> MetaAttr {
unimplemented!()
}
/// Using a variety of metrics (content score, classname, element types), find the content that is most likely to be the stuff
/// a user wants to read. Then return it wrapped up in a div.
fn grab_article(&mut self) {}
}
#[cfg(test)]
mod test {
use super::Readability;
use super::HTML_NS;
use html5ever::{LocalName, Namespace, QualName};
use kuchiki::traits::*;
use kuchiki::NodeRef;
// TODO: Refactor not to use test file possibly
const TEST_HTML: &'static str = include_str!("../../test_html/simple.html");
#[test]
fn test_unwrap_no_script_tags() {
let mut readability = Readability::new(TEST_HTML);
let img_count = readability.root_node.select("img").unwrap().count();
assert_eq!(3, img_count);
readability.unwrap_no_script_tags();
let img_count = readability.root_node.select("img").unwrap().count();
assert_eq!(2, img_count);
// Ensure attributes were copied over
let updated_img = readability.root_node.select_first("img#lazy-load").unwrap();
let updated_img_attrs = updated_img.attributes.borrow();
assert_eq!(true, updated_img_attrs.contains("data-old-src"));
assert_eq!(Some("lazy-load.png"), updated_img_attrs.get("data-old-src"));
assert_eq!(Some("eager-load.png"), updated_img_attrs.get("src"));
}
#[test]
fn test_is_single_image() {
let readability = Readability::new(TEST_HTML);
let img_elem_ref = readability.root_node.select_first("img").unwrap();
assert_eq!(true, Readability::is_single_image(&img_elem_ref.as_node()));
let noscript_elem_ref = readability.root_node.select_first("noscript").unwrap();
assert_eq!(
false,
Readability::is_single_image(&noscript_elem_ref.as_node())
);
let div_elem_ref = readability
.root_node
.select_first("div.invalid-elems")
.unwrap();
assert_eq!(false, Readability::is_single_image(&div_elem_ref.as_node()));
let div_elem_ref = kuchiki::parse_fragment(
QualName::new(None, Namespace::from(HTML_NS), LocalName::from("div")),
Vec::new(),
)
.one(noscript_elem_ref.as_node().text_contents().trim());
assert_eq!(true, Readability::is_single_image(&div_elem_ref));
}
#[test]
fn test_remove_scripts() {
let mut readability = Readability::new(TEST_HTML);
let noscript_elems = readability.root_node.select("noscript").unwrap();
assert_eq!(1, noscript_elems.count());
readability.remove_scripts();
let noscript_elems = readability.root_node.select("noscript").unwrap();
assert_eq!(0, noscript_elems.count());
}
#[test]
fn test_next_element() {
let html_str = r#"
<p id="a">This is a node</p>
<!-- Commented content -->
<p id="b">This is another node. The next line is just whitespace</p>
This is standalone text"#;
let doc = Readability::new(html_str);
let p = doc.root_node.select_first("#a").unwrap();
let p = p.as_node();
let mut p_node_option: Option<NodeRef> = Some(p.clone());
Readability::next_element(&mut p_node_option);
assert_eq!(Some(p.clone()), p_node_option);
let p_node_option = p_node_option.unwrap();
let p_node_option = p_node_option.as_element();
let p_node_option_attr = p_node_option.unwrap().attributes.borrow();
assert_eq!("a", p_node_option_attr.get("id").unwrap());
let mut next = p.next_sibling();
Readability::next_element(&mut next);
let next = next.unwrap();
let next_elem = next.as_element();
let next_attr = next_elem.unwrap().attributes.borrow();
assert_eq!("b", next_attr.get("id").unwrap());
let mut next = next.next_sibling();
Readability::next_element(&mut next);
let next = next.unwrap();
assert_eq!(true, next.as_text().is_some());
assert_eq!("This is standalone text", next.text_contents().trim());
let mut next = None;
Readability::next_element(&mut next);
assert_eq!(None, next);
}
#[test]
fn test_is_phrasing_content() {
let html_str = r#"
Some text node
<b>This is a phrasing content node</b>
<p>This is not a phrasing content node</p>
<a href="\#"><i>This is also a phrasing content</i></a>
<a href="\#"><p>This is not a phrasing content</p></a>
"#;
let doc = Readability::new(html_str);
let body = doc.root_node.select_first("body").unwrap();
let body = body.as_node();
let mut body_children = body.children();
let mut node = body_children.next().unwrap();
assert_eq!(true, node.as_text().is_some());
assert_eq!(true, Readability::is_phrasing_content(&node));
node = node.next_sibling().unwrap();
assert_eq!("b", &node.as_element().unwrap().name.local);
assert_eq!(true, Readability::is_phrasing_content(&node));
node = node.next_sibling().unwrap(); // Skips the text node from the new line character
node = node.next_sibling().unwrap();
assert_eq!("p", &node.as_element().unwrap().name.local);
assert_eq!(false, Readability::is_phrasing_content(&node));
node = node.next_sibling().unwrap(); // Skips the text node from the new line character
node = node.next_sibling().unwrap();
assert_eq!("a", &node.as_element().unwrap().name.local);
assert_eq!(true, Readability::is_phrasing_content(&node));
node = node.next_sibling().unwrap(); // Skips the text node from the new line character
node = node.next_sibling().unwrap();
assert_eq!("a", &node.as_element().unwrap().name.local);
assert_eq!(false, Readability::is_phrasing_content(&node));
}
#[test]
fn test_is_whitespace() {
let html_str = r#"
<p>Definitely not whitespace</p>
I am also not whitespace
<p> </p>
<br>
"#;
let doc = Readability::new(html_str);
let body = doc.root_node.select_first("body").unwrap();
let mut node = body.as_node().first_child().unwrap();
assert_eq!("p", &node.as_element().unwrap().name.local);
assert_eq!(false, Readability::is_whitespace(&node));
node = node.next_sibling().unwrap();
assert_eq!(true, node.as_text().is_some());
assert_eq!(false, Readability::is_whitespace(&node));
node = node.next_sibling().unwrap();
assert_eq!("p", &node.as_element().unwrap().name.local);
assert_eq!(
true,
Readability::is_whitespace(&node.first_child().unwrap())
);
// This is testing the new line character in between the <p> and <br> tags
node = node.next_sibling().unwrap();
assert_eq!(true, node.as_text().is_some());
assert_eq!(true, Readability::is_whitespace(&node));
node = node.next_sibling().unwrap();
assert_eq!("br", &node.as_element().unwrap().name.local);
assert_eq!(true, Readability::is_whitespace(&node));
}
#[test]
fn test_set_node_tag() {
let html_str = r#"
<div id="target" class="some random class" tabindex="0"><p>Child 1</p><p>Child 2</p></div>
<div id="not-the-target">The div above is being replaced</div>
"#;
let doc = Readability::new(html_str);
let target = doc.root_node.select_first("#target").unwrap();
let children_count = doc.root_node.children().count();
let target_children_count = target.as_node().children().count();
assert_eq!("div", &target.name.local);
Readability::set_node_tag(&target.as_node(), "section");
assert_eq!(children_count, doc.root_node.children().count());
let target = doc.root_node.select_first("#target").unwrap();
assert_eq!("section", &target.name.local);
assert_eq!(target_children_count, target.as_node().children().count());
let target_attrs = target.as_node().as_element().unwrap().attributes.borrow();
assert_eq!(3, target_attrs.map.len());
let old_div = doc.root_node.select_first("div#target");
assert_eq!(true, old_div.is_err());
}
#[test]
fn test_replace_node_tags() {
let html_str = r#"
<div id="replace-p">
<p>Tag 1</p><p>Tag 2</p><p>Tag 3</p>
</div>
"#;
let doc = Readability::new(html_str);
let target_parent = doc.root_node.select_first("div#replace-p").unwrap();
let target_parent_child_count = target_parent.as_node().children().count();
let nodes = target_parent.as_node().select("p").unwrap();
Readability::replace_node_tags(nodes, "span");
assert_eq!(
target_parent_child_count,
target_parent.as_node().children().count()
);
let nodes = target_parent.as_node().select("p").unwrap();
assert_eq!(0, nodes.count());
let nodes = target_parent.as_node().select("span").unwrap();
assert_eq!(3, nodes.count());
}
#[test]
fn test_replace_brs() {
let html_str = r#"
<div>foo<br>bar<br> <br><br>abc</div>
"#;
let mut doc = Readability::new(html_str);
let div = doc.root_node.select_first("div").unwrap();
let br_count = div.as_node().select("br").unwrap().count();
let p_count = div.as_node().select("p").unwrap().count();
assert_eq!(4, br_count);
assert_eq!(0, p_count);
doc.replace_brs();
let br_count = div.as_node().select("br").unwrap().count();
let p_count = div.as_node().select("p").unwrap().count();
assert_eq!(1, br_count);
assert_eq!(1, p_count);
let p_node = div.as_node().select_first("p").unwrap();
assert_eq!("abc", p_node.as_node().text_contents());
let html_str = r#"
<p>foo<br>bar<br> <br><br>abc</p>
"#;
doc = Readability::new(html_str);
let p = doc.root_node.select_first("p").unwrap();
let div_count = doc.root_node.select("div").unwrap().count();
let br_count = p.as_node().select("br").unwrap().count();
assert_eq!(4, br_count);
assert_eq!(0, div_count);
doc.replace_brs();
let br_count = doc.root_node.select("br").unwrap().count();
let div_count = doc.root_node.select("div").unwrap().count();
let p_count = doc.root_node.select("p").unwrap().count();
assert_eq!(1, br_count);
assert_eq!(1, div_count);
assert_eq!(1, p_count);
let p_node = doc.root_node.select_first("p").unwrap();
assert_eq!("abc", p_node.as_node().text_contents());
}
#[test]
fn test_prep_document() {
let html_str = r#"
<!DOCTYPE html>
<html>
<head>
<style>div {padding: 20px; border-bottom: 2px solid black; }</style>
</head>
<body>
<font face="Times New Roman" size="10">Times New Roman</font>
<div>foo<br>bar<br> <br><br>abc</div>
</body>
</html>
"#;
let mut doc = Readability::new(html_str);
doc.prep_document();
let style_nodes = doc.root_node.select("style").unwrap();
let font_nodes = doc.root_node.select("font").unwrap();
let p_nodes = doc.root_node.select("p").unwrap();
let br_nodes = doc.root_node.select("br").unwrap();
assert_eq!(0, style_nodes.count());
assert_eq!(0, font_nodes.count());
assert_eq!(1, p_nodes.count());
assert_eq!(1, br_nodes.count());
}
}

25
test_html/simple.html Normal file
View file

@ -0,0 +1,25 @@
<!DOCTYPE html>
<html lang="en">
<head>
<title>Sample Document</title>
</head>
<body>
<h1>Some text in h1</h1>
<img src="inexistent.png">
<div class="invalid-elems">
<!-- This div contains invalid elements -->
<h1>Imagine some lorem ipsum</h1>
<img>
</div>
<!-- Test that the no-script content is copied over -->
<img src="lazy-load.png">
<noscript>
<div class="parent">
<img src="eager-load.png" id="lazy-load">
</div>
</noscript>
</body>
</html>