paperoni/src/moz_readability/mod.rs
Kenneth Gitere 679bf3cb04 Add logic for attempting different rounds for content extraction
with different flags set

Add additional test in `fix_relative_uris`
2020-10-22 11:50:34 +03:00

3902 lines
156 KiB
Rust

use std::collections::{BTreeMap, HashMap, HashSet};
use std::str::FromStr;
use html5ever::{LocalName, Namespace, QualName};
use kuchiki::{
iter::{Descendants, Elements, Select},
traits::*,
NodeData, NodeRef,
};
use url::Url;
// Default character threshold (its consumer is outside this section of the file).
const DEFAULT_CHAR_THRESHOLD: usize = 500;
// Bit flags stored in `Readability::flags`; `Readability::new` switches all three on.
const FLAG_STRIP_UNLIKELYS: u32 = 1 << 0;
const FLAG_WEIGHT_CLASSES: u32 = 1 << 1;
const FLAG_CLEAN_CONDITIONALLY: u32 = 1 << 2;
// Name of the attribute in which `initialize_node` stores an element's score.
const READABILITY_SCORE: &str = "readability-score";
// Namespace given to every element this module creates.
const HTML_NS: &str = "http://www.w3.org/1999/xhtml";
// Tag names that `is_phrasing_content` accepts as phrasing (inline) content without
// inspecting their children.
// TODO: Change to HashSet
const PHRASING_ELEMS: [&str; 39] = [
"abbr", "audio", "b", "bdo", "br", "button", "cite", "code", "data", "datalist", "dfn", "em",
"embed", "i", "img", "input", "kbd", "label", "mark", "math", "meter", "noscript", "object",
"output", "progress", "q", "ruby", "samp", "script", "select", "small", "span", "strong",
"sub", "sup", "textarea", "time", "var", "wbr",
];
// NOTE(review): presumably the tags considered when scoring candidates; the consumer is
// not in this section of the file.
// TODO: Change to HashSet
const DEFAULT_TAGS_TO_SCORE: [&str; 9] =
["section", "h2", "h3", "h4", "h5", "h6", "p", "td", "pre"];
// NOTE(review): presumably tags exempt from being rewritten to <div>; the consumer is
// not in this section of the file.
// TODO: Change to HashSet
const ALTER_TO_DIV_EXCEPTIONS: [&str; 4] = ["div", "article", "section", "p"];
// Attribute names that only affect presentation.
// NOTE(review): presumably stripped during clean-up; the consumer is not in this section.
const PRESENTATIONAL_ATTRIBUTES: [&str; 12] = [
"align",
"background",
"bgcolor",
"border",
"cellpadding",
"cellspacing",
"frame",
"hspace",
"rules",
"style",
"valign",
"vspace",
];
// A table containing any of these descendants is marked as a data table by
// `mark_data_tables`.
const DATA_TABLE_DESCENDANTS: [&str; 5] = ["col", "colgroup", "tfoot", "thead", "th"];
// NOTE(review): presumably the elements whose deprecated width/height attributes get
// removed; the consumer is not in this section of the file.
// TODO: Change to HashSet
const DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [&str; 5] = ["table", "th", "td", "hr", "pre"];
mod regexes;
/// State of one readability extraction over a parsed HTML document.
pub struct Readability {
// Root of the DOM produced by `kuchiki::parse_html` in `new`.
root_node: NodeRef,
// Author line; filled in by `check_byline` the first time a node qualifies.
byline: Option<String>,
// Title taken from the document metadata during `parse`.
article_title: String,
// Extracted article subtree; `None` until extraction has produced one.
pub article_node: Option<NodeRef>,
// NOTE(review): presumably the article's text direction; it is only initialised to
// `None` in the code visible here.
article_dir: Option<String>,
// Bit set of the `FLAG_*` constants that are currently active.
flags: u32,
}
/// Row and column totals of a table, as computed by `get_row_and_column_count`.
#[derive(Debug, PartialEq)]
struct SizeInfo {
// Sum of the `rowspan` values (default 1) of every <tr> found.
rows: usize,
// Largest per-row sum of <td> `colspan` values (default 1).
columns: usize,
}
impl Readability {
/// Parses `html_str` into a DOM and returns an extractor that has no article yet and has
/// all three `FLAG_*` bits enabled.
pub fn new(html_str: &str) -> Self {
    let all_flags = FLAG_STRIP_UNLIKELYS | FLAG_WEIGHT_CLASSES | FLAG_CLEAN_CONDITIONALLY;
    let root_node = kuchiki::parse_html().one(html_str);
    Self {
        root_node,
        byline: None,
        article_title: String::new(),
        article_node: None,
        article_dir: None,
        flags: all_flags,
    }
}
/// Runs the whole extraction pipeline on the parsed document.
///
/// `url` is the address the document was fetched from; it is handed to
/// `post_process_content`, which uses it to resolve relative links.
pub fn parse(&mut self, url: &str) {
    // Document clean-up that has to happen before anything is read from the DOM.
    self.unwrap_no_script_tags();
    self.remove_scripts();
    self.prep_document();

    // Only the title of the collected metadata is stored on `self`.
    let title = self.get_article_metadata().title.clone();
    self.article_title = title;

    self.grab_article();
    self.post_process_content(url);
}
/// Reports whether `node_ref` is an `<img>`, or wraps exactly one `<img>` through a chain
/// of single-child containers that carry no text of their own.
fn is_single_image(node_ref: &NodeRef) -> bool {
    let mut current = node_ref.clone();
    loop {
        if let Some(element) = current.as_element() {
            if &element.name.local == "img" {
                return true;
            }
        }
        // Whitespace-only text nodes are not counted as children.
        let only_child = {
            let mut content_children = current.children().filter(Self::has_content);
            match (content_children.next(), content_children.next()) {
                (Some(child), None) => child,
                _ => return false,
            }
        };
        if !current.text_contents().trim().is_empty() {
            return false;
        }
        // Descend into the lone child and repeat the same checks there.
        current = only_child;
    }
}
fn has_content(node_ref: &NodeRef) -> bool {
match node_ref.data() {
NodeData::Text(text) => !text.borrow().trim().is_empty(),
_ => true,
}
}
/// Find all <noscript> that are located after <img> nodes, and which contain only one <img> element.
/// Replace the first image with the image from inside the <noscript> tag, and remove the <noscript> tag.
/// This improves the quality of the images we use on some sites (e.g. Medium).
///
/// Before that, every <img> that has no `src`, `srcset`, `data-src` or `data-srcset`
/// attribute and no attribute value accepted by `regexes::is_match_img_ext` is removed
/// from the document.
fn unwrap_no_script_tags(&mut self) {
// Step 1: drop placeholder images that carry no usable source at all.
if let Ok(imgs) = self.root_node.select("img") {
let mut nodes = imgs.filter(|img_node_ref| {
let img_attrs = img_node_ref.attributes.borrow();
!img_attrs.map.iter().any(|(name, attr)| {
&name.local == "src"
|| &name.local == "srcset"
|| &name.local == "data-src"
|| &name.local == "data-srcset"
|| regexes::is_match_img_ext(&attr.value)
})
});
// The next match is fetched before the current one is detached.
let mut node_ref = nodes.next();
while let Some(img_ref) = node_ref {
node_ref = nodes.next();
img_ref.as_node().detach();
}
}
// Step 2: replace an image that precedes a <noscript> with the image inside it.
if let Ok(noscripts) = self.root_node.select("noscript") {
for noscript in noscripts {
// The <noscript> text is re-parsed as an HTML fragment in a <div> context.
let inner_node_ref = kuchiki::parse_fragment(
QualName::new(None, Namespace::from(HTML_NS), LocalName::from("div")),
Vec::new(),
)
.one(noscript.text_contents());
if !Self::is_single_image(&inner_node_ref) {
continue;
}
if let Some(mut prev_elem) = noscript.as_node().previous_sibling() {
// TODO: Fix this to have a better way of extracting nodes that are elements
// Walk backwards until an element sibling is found or the siblings run out.
while prev_elem.as_element().is_none() {
match prev_elem.previous_sibling() {
Some(new_prev) => prev_elem = new_prev,
None => break,
};
}
if Self::is_single_image(&prev_elem) && prev_elem.as_element().is_some() {
// `prev_img` is the <img> itself, whether it is `prev_elem` or nested inside it.
let prev_img = if &prev_elem.as_element().unwrap().name.local != "img" {
prev_elem.select_first("img").unwrap().as_node().clone()
} else {
prev_elem.clone()
};
let new_img = inner_node_ref.select_first("img").unwrap();
let prev_attrs = prev_img.as_element().unwrap().attributes.borrow();
// Only non-empty source-like attributes of the old image are carried over.
let prev_attrs = prev_attrs.map.iter().filter(|(attr, val)| {
!val.value.trim().is_empty()
&& (&attr.local == "src"
|| &attr.local == "srcset"
|| regexes::is_match_img_ext(&val.value))
});
for (prev_attr, prev_value) in prev_attrs {
// Nothing to do when the new image already has the same value.
match new_img.attributes.borrow().get(&prev_attr.local) {
Some(value) => {
if value == prev_value.value {
continue;
}
}
None => (),
}
let attr_name: &str = &prev_attr.local;
let mut attr_name = attr_name.to_owned();
// A conflicting value is kept under a `data-old-` prefixed name instead of
// overwriting the new image's attribute.
if new_img.attributes.borrow().contains(attr_name.clone()) {
let new_name = format!("data-old-{}", &attr_name);
attr_name = new_name;
}
new_img
.attributes
.borrow_mut()
.insert(attr_name, prev_value.value.clone());
}
// WARN: This assumes `next_element` returns an element node!!
// The first element of the parsed fragment takes the old element's place.
let inner_node_child =
Self::next_element(inner_node_ref.first_child(), true);
prev_elem.insert_after(inner_node_child.unwrap());
prev_elem.detach();
}
}
}
}
}
/// Detaches every `<script>` element from the document, then every `<noscript>` element.
fn remove_scripts(&mut self) {
    for &selector in ["script", "noscript"].iter() {
        if let Ok(mut matches) = self.root_node.select(selector) {
            let mut pending = matches.next();
            while let Some(element) = pending {
                // Advance first so the traversal never continues from a removed node.
                pending = matches.next();
                element.as_node().detach();
            }
        }
    }
}
/// Gets the document ready for scraping: removes all `<style>` elements, collapses runs
/// of `<br>` into paragraphs and renames every `<font>` element to `<span>`.
fn prep_document(&mut self) {
    if let Ok(mut styles) = self.root_node.select("style") {
        let mut pending = styles.next();
        while let Some(style) = pending {
            // Fetch the next match before the current one leaves the tree.
            pending = styles.next();
            style.as_node().detach();
        }
    }

    self.replace_brs();

    if let Ok(font_elems) = self.root_node.select("font") {
        Self::replace_node_tags(font_elems, "span");
    }
}
/// Replaces 2 or more successive <br> elements with a single <p>.
/// Whitespace between <br> elements are ignored. For example:
/// <div>foo<br>bar<br> <br><br>abc</div>
/// will become:
/// <div>foo<br>bar<p>abc</p></div>
fn replace_brs(&mut self) {
if let Ok(mut br_tags) = self.root_node.select("br") {
// The uses of `next_element` here are safe as it explicitly ensures the next element is an element node
while let Some(br_tag) = br_tags.next() {
let mut next = Self::next_element(br_tag.as_node().next_sibling(), false);
// Becomes true once at least one further <br> of the chain has been removed.
let mut replaced = false;
// Remove every <br> that follows the current one, skipping whitespace in between.
while let Some(next_elem) = next {
if next_elem.as_element().is_some()
&& &next_elem.as_element().as_ref().unwrap().name.local == "br"
{
replaced = true;
let br_sibling = next_elem.next_sibling();
next = Self::next_element(br_sibling, false);
next_elem.detach();
} else {
break;
}
}
if replaced {
// Swap the remaining <br> for an empty <p> at the same position.
let p = NodeRef::new_element(
QualName::new(None, Namespace::from(HTML_NS), LocalName::from("p")),
BTreeMap::new(),
);
br_tag.as_node().insert_before(p);
let p = br_tag.as_node().previous_sibling().unwrap();
br_tag.as_node().detach();
// Move the following phrasing content into the new <p>.
next = p.next_sibling();
while next.is_some() {
let next_sibling = next.unwrap();
// Stop at a <br> that is immediately followed by another <br> node.
if let Some(next_elem) = next_sibling.as_element() {
if &next_elem.name.local == "br" {
if let Some(second_sibling) = next_sibling.next_sibling() {
if second_sibling.as_element().is_some()
&& "br" == &second_sibling.as_element().unwrap().name.local
{
break;
}
}
}
}
if !Self::is_phrasing_content(&next_sibling) {
break;
}
// Remember the following sibling before this node is re-parented.
let sibling = next_sibling.next_sibling();
p.append(next_sibling);
next = sibling;
}
// Trim whitespace nodes (blank text and <br>) from both ends of the paragraph.
while let Some(first_child) = p.first_child() {
if Self::is_whitespace(&first_child) {
first_child.detach();
} else {
break;
}
}
while let Some(last_child) = p.last_child() {
if Self::is_whitespace(&last_child) {
last_child.detach();
} else {
break;
}
}
// A <p> must not end up nested in another <p>, so such a parent becomes a <div>.
// NOTE(review): the `unwrap` assumes the parent is an element node; confirm that a
// <br> can never sit directly under a non-element parent here.
if let Some(parent) = p.parent() {
if &parent.as_element().as_ref().unwrap().name.local == "p" {
Self::set_node_tag(&parent, "div");
}
}
}
}
}
}
/// Renames every element yielded by `nodes` to `name` through `set_node_tag`.
fn replace_node_tags(nodes: Select<Elements<Descendants>>, name: &str) {
    nodes.for_each(|element| {
        Self::set_node_tag(element.as_node(), name);
    });
}
/// Swaps `node_ref` for a new element called `name` that receives a copy of its
/// attributes and takes over its children. Returns the new element, or a clone of
/// `node_ref` when it is not an element node.
fn set_node_tag(node_ref: &NodeRef, name: &str) -> NodeRef {
    let elem = match node_ref.as_element() {
        Some(elem) => elem,
        None => return node_ref.clone(),
    };
    let copied_attrs = elem.attributes.borrow().clone().map.into_iter();
    let renamed = NodeRef::new_element(
        QualName::new(None, Namespace::from(HTML_NS), LocalName::from(name)),
        copied_attrs,
    );
    for child in node_ref.children() {
        renamed.append(child);
    }
    // The new element is placed right before the old one, which is then removed.
    node_ref.insert_before(renamed);
    let inserted = node_ref.previous_sibling().unwrap();
    node_ref.detach();
    inserted
}
fn is_whitespace(node_ref: &NodeRef) -> bool {
match node_ref.data() {
NodeData::Element(elem_data) => &elem_data.name.local == "br",
NodeData::Text(text_ref) => text_ref.borrow().trim().len() == 0,
_ => false,
}
}
/// Finds the next element, starting from the given node, and ignoring
/// whitespace in between. If the given node is an element, the same node is
/// returned.
/// The must_be_element argument ensure the next element is actually an element node.
/// This is likely to factored out into a new function.
fn next_element(node_ref: Option<NodeRef>, must_be_element: bool) -> Option<NodeRef> {
// TODO: Could probably be refactored to use the elements method
let mut node_ref = node_ref;
while node_ref.is_some() {
match node_ref.as_ref().unwrap().data() {
NodeData::Element(_) => break,
_ => {
if node_ref.as_ref().unwrap().text_contents().trim().is_empty() {
node_ref = node_ref.as_ref().unwrap().next_sibling();
} else if must_be_element
&& !node_ref.as_ref().unwrap().text_contents().trim().is_empty()
{
node_ref = node_ref.as_ref().unwrap().next_sibling();
} else {
break;
}
}
}
}
node_ref
}
/// Tells whether a node is phrasing content: any text node, any element listed in
/// `PHRASING_ELEMS`, or an `<a>`, `<del>` or `<ins>` whose children are all phrasing content.
/// https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content
fn is_phrasing_content(node_ref: &NodeRef) -> bool {
    if node_ref.as_text().is_some() {
        return true;
    }
    let tag_name: &str = match node_ref.as_element() {
        Some(elem) => &*elem.name.local,
        None => return false,
    };
    if PHRASING_ELEMS.contains(&tag_name) {
        return true;
    }
    match tag_name {
        "a" | "del" | "ins" => node_ref
            .children()
            .all(|child| Self::is_phrasing_content(&child)),
        _ => false,
    }
}
/// Collects title, byline, excerpt and site name from the document's <meta> elements.
///
/// Returns a `MetaData` whose `title` falls back to `get_article_title` when no meta
/// title is present; `byline`, `excerpt` and `site_name` stay `None` when absent.
/// HTML entities in all four values are unescaped before returning.
fn get_article_metadata(&self) -> MetaData {
// Normalised meta key -> trimmed `content` value.
let mut values: HashMap<String, String> = HashMap::new();
let mut meta_data = MetaData::new();
if let Ok(meta_elems) = self.root_node.select("meta") {
meta_elems
// Only <meta> elements that carry a `content` attribute are of interest.
.filter(|node_ref| {
let node_attr = node_ref.attributes.borrow();
node_attr.get("content").is_some()
})
.for_each(|node_ref| {
let node_attr = node_ref.attributes.borrow();
let content = node_attr.get("content").unwrap();
let name_attr = node_attr.get("name");
let mut matches = None;
// A matching `property` attribute takes precedence over `name`.
if let Some(property) = node_attr.get("property") {
matches = regexes::PROPERTY_REGEX.captures(property);
if matches.is_some() {
let captures = matches.as_ref().unwrap();
// NOTE(review): `captures.iter()` also yields group 0 (the whole match), and
// `capture.unwrap()` panics for a group that did not participate; confirm this
// against the definition of PROPERTY_REGEX.
for capture in captures.iter() {
// Keys are lowercased and have the whitespace pattern removed.
let mut name = capture.unwrap().as_str().to_lowercase();
name = regexes::REPLACE_WHITESPACE_REGEX
.replace_all(&name, "")
.to_string();
values.insert(name, content.trim().to_string());
}
}
}
// `name` is consulted when there is no `property` or it did not match.
if matches.is_none() && name_attr.is_some() {
let name_val = name_attr.unwrap();
if regexes::is_match_name_pattern(name_val) {
// Same normalisation as above; REPLACE_DOT_REGEX matches are turned into ':'.
let name = name_val.to_lowercase();
let name = regexes::REPLACE_WHITESPACE_REGEX.replace_all(&name, "");
let name = regexes::REPLACE_DOT_REGEX.replace_all(&name, ":");
values.insert(name.to_string(), content.trim().to_string());
}
}
});
}
// Candidate keys are listed in priority order; the first one present wins.
let meta_title_keys = [
"dc:title",
"dcterm:title",
"og:title",
"weibo:article:title",
"weibo:webpage:title",
"title",
"twitter:title",
];
meta_data.title = if let Some(key) = meta_title_keys
.iter()
.find(|key| values.contains_key(**key))
{
values.get(*key).map(|title| title.to_owned()).unwrap()
} else {
// No meta title: derive one from the <title> element instead.
self.get_article_title()
};
let meta_byline_keys = ["dc:creator", "dcterm:creator", "author"];
meta_data.byline = {
let possible_key = meta_byline_keys
.iter()
.find(|key| values.contains_key(**key));
if let Some(actual_key) = possible_key {
values.get(*actual_key).map(|byline| byline.to_owned())
} else {
None
}
};
let meta_excerpt_keys = [
"dc:description",
"dcterm:description",
"og:description",
"weibo:article:description",
"weibo:webpage:description",
"description",
"twitter:description",
];
meta_data.excerpt = {
let possible_key = meta_excerpt_keys
.iter()
.find(|key| values.contains_key(**key));
if let Some(actual_key) = possible_key {
values.get(*actual_key).map(|excerpt| excerpt.to_owned())
} else {
None
}
};
meta_data.site_name = values
.get("og:site_name")
.map(|site_name| site_name.to_owned());
// Entities are unescaped in place on every value that is present.
Self::unescape_html_entities(&mut meta_data.title);
if meta_data.byline.is_some() {
Self::unescape_html_entities(&mut meta_data.byline.as_mut().unwrap());
}
if meta_data.excerpt.is_some() {
Self::unescape_html_entities(&mut meta_data.excerpt.as_mut().unwrap());
}
if meta_data.site_name.is_some() {
Self::unescape_html_entities(&mut meta_data.site_name.as_mut().unwrap());
}
meta_data
}
/// Converts some of the common HTML entities in string to their corresponding characters.
fn unescape_html_entities(value: &mut String) {
if !value.is_empty() {
// TODO: Extract this
let mut html_escape_map: HashMap<&str, &str> = HashMap::new();
html_escape_map.insert("lt", "<");
html_escape_map.insert("gt", ">");
html_escape_map.insert("amp", "&");
html_escape_map.insert("quot", "\"");
html_escape_map.insert("apos", "'");
let mut new_value = regexes::REPLACE_HTML_ESCAPE_REGEX
.replace_all(&value, |captures: &regex::Captures| {
html_escape_map[&captures[1]].to_string()
})
.to_string();
new_value = regexes::REPLACE_HEX_REGEX
.replace_all(&new_value, |captures: &regex::Captures| {
let num = if let Some(hex_capture) = captures.get(1) {
u16::from_str_radix(hex_capture.as_str(), 16)
} else if let Some(dec_capture) = captures.get(2) {
u16::from_str(dec_capture.as_str())
} else {
unreachable!("Unable to match any of the captures");
};
String::from_utf16_lossy(&[num.unwrap()])
})
.to_string();
*value = new_value;
}
}
/// Derives the article title from the document's `<title>` element.
///
/// The raw title is trimmed and then refined:
/// * when it contains a separator (per `regexes::is_match_title_separator`), the part
///   selected by the start-separator regex is kept, or the end part when that leaves
///   fewer than three words;
/// * otherwise, when it contains `": "` and no `<h1>`/`<h2>` has exactly the same text,
///   the part after the last colon is kept (after the first colon when that is fewer
///   than three words, or the whole title when more than five words precede the first
///   colon);
/// * otherwise, when it is longer than 150 or shorter than 15 bytes and the document has
///   exactly one `<h1>`, that heading's text is used.
///
/// If the refined title has four words or fewer, the original title is restored unless
/// it had hierarchical separators and exactly one word was dropped.
///
/// A document without a `<title>` element yields an empty starting title rather than a
/// panic.
fn get_article_title(&self) -> String {
    let mut cur_title = self
        .root_node
        .select_first("title")
        .map(|title| title.text_contents().trim().to_string())
        .unwrap_or_default();
    let orig_title = cur_title.clone();
    let mut title_had_hierarchical_separators = false;
    let word_count = |s: &str| -> usize { s.split_whitespace().count() };
    if regexes::is_match_title_separator(&cur_title) {
        title_had_hierarchical_separators = regexes::is_match_has_title_separator(&cur_title);
        cur_title = regexes::REPLACE_START_SEPARATOR_REGEX
            .replace_all(&orig_title, "$start")
            .to_string();
        if word_count(&cur_title) < 3 {
            cur_title = regexes::REPLACE_END_SEPARATOR_REGEX
                .replace_all(&orig_title, "$end")
                .to_string();
        }
    } else if cur_title.contains(": ") {
        let trimmed_title = cur_title.trim();
        let is_match_heading = self
            .root_node
            .select("h1, h2")
            .unwrap()
            .any(|heading| heading.text_contents().trim() == trimmed_title);
        if !is_match_heading {
            // `orig_title` equals `cur_title` here, so a ':' is guaranteed to exist.
            let first_colon = orig_title.find(':').unwrap();
            let last_colon = orig_title.rfind(':').unwrap();
            let mut new_title = &orig_title[last_colon + 1..];
            if word_count(new_title) < 3 {
                new_title = &orig_title[first_colon + 1..];
            } else if word_count(&orig_title[..first_colon]) > 5 {
                // Too many words before the colon: keep the title as it is.
                new_title = &orig_title;
            }
            cur_title = new_title.to_string();
        }
    } else if cur_title.len() > 150 || cur_title.len() < 15 {
        // `size_hint()` does not report the number of matches, so the headings are
        // counted by pulling at most two of them from the iterator.
        let mut h1_nodes = self.root_node.select("h1").unwrap();
        if let (Some(only_h1), None) = (h1_nodes.next(), h1_nodes.next()) {
            cur_title = Self::get_inner_text(only_h1.as_node(), None);
        }
    }
    cur_title = regexes::NORMALIZE_REGEX
        .replace_all(cur_title.trim(), " ")
        .to_string();
    let cur_word_count = word_count(&cur_title);
    let orig_word_count_without_separators =
        word_count(&regexes::REPLACE_MULTI_SEPARATOR_REGEX.replace_all(&orig_title, ""));
    // Written as `a + 1 != b` instead of `a != b - 1` so a zero word count cannot
    // underflow.
    if cur_word_count <= 4
        && (!title_had_hierarchical_separators
            || cur_word_count + 1 != orig_word_count_without_separators)
    {
        cur_title = orig_title;
    }
    cur_title
}
/// Strips `class` attributes throughout the extracted article.
///
/// Class names contained in the preserve set are kept, joined by single spaces; when
/// none survive, the attribute is removed entirely. The preserve set is currently always
/// empty, so every `class` attribute is removed. Does nothing when there is no article
/// node.
fn clean_classes(&mut self) {
    // TODO: This should accessed from Self
    let classes_to_preserve: HashSet<&str> = HashSet::new();
    if let Some(article_node) = &mut self.article_node {
        for elem in article_node.inclusive_descendants().elements() {
            let mut elem_attrs = elem.attributes.borrow_mut();
            if let Some(class_list) = elem_attrs.get_mut("class") {
                // `join` puts separators only between names, so the result has no
                // leading space.
                let filtered_class = class_list
                    .split_whitespace()
                    .filter(|class| classes_to_preserve.contains(class))
                    .collect::<Vec<&str>>()
                    .join(" ");
                if filtered_class.is_empty() {
                    elem_attrs.remove("class");
                } else {
                    *class_list = filtered_class;
                }
            }
        }
    }
}
/// Converts each <a> and media URI in the extracted article to an absolute URI,
/// ignoring #ref URIs.
///
/// `document_uri` is the address of the document and must be an absolute URL. The base
/// for resolution is the first `<base href>` in the document that resolves against
/// `document_uri` (so relative values such as `/` or `../` are accepted), or
/// `document_uri` itself when there is none.
///
/// Links whose `href` starts with `javascript:` are unwrapped: a link holding a single
/// text node becomes plain text, any other link becomes a `<span>` holding its children.
/// For `img`, `picture`, `figure`, `video`, `audio` and `source` elements the `src`,
/// `poster` and `srcset` attributes are rewritten. Does nothing when there is no article
/// node.
///
/// # Panics
/// Panics when `document_uri` cannot be parsed as a URL.
fn fix_relative_uris(&mut self, document_uri: &str) {
    if let Some(article_node) = &mut self.article_node {
        let document_uri =
            Url::parse(document_uri).expect("Unable to parse the document's URI");
        // `join` handles absolute and relative hrefs alike; hrefs that cannot be
        // resolved are skipped instead of aborting the whole extraction.
        let base_uri = self
            .root_node
            .select("base")
            .unwrap()
            .filter_map(|node_ref| {
                let node_attrs = node_ref.attributes.borrow();
                let resolved = node_attrs
                    .get("href")
                    .and_then(|href| document_uri.join(href).ok());
                resolved
            })
            .next()
            .unwrap_or_else(|| document_uri.clone());
        let to_absolute_uri = |uri_str: &str| -> String {
            // In-page anchors stay relative when the base is the document itself.
            if base_uri == document_uri && uri_str.starts_with("#") {
                return uri_str.to_string();
            }
            if let Ok(new_uri) = Url::parse(uri_str) {
                if new_uri.has_host() {
                    return new_uri.to_string();
                }
            } else if let Ok(joined_uri) = base_uri.join(uri_str) {
                return joined_uri.to_string();
            }
            uri_str.to_string()
        };
        let mut links = article_node.select("a").unwrap().filter(|a_ref| {
            let link_attrs = a_ref.attributes.borrow();
            link_attrs.contains("href")
        });
        let mut link = links.next();
        while let Some(link_ref) = link {
            // The next link is fetched before the current one may be detached.
            link = links.next();
            let mut link_attrs = link_ref.attributes.borrow_mut();
            let href = link_attrs.get("href").map(|val| val.to_string()).unwrap();
            if href.starts_with("javascript:") {
                let link_node = link_ref.as_node();
                if link_node.children().count() == 1
                    && link_node
                        .first_child()
                        .map(|node_ref| node_ref.as_text().is_some())
                        .unwrap()
                {
                    // A link that only wraps text is replaced by that text.
                    let text_node = NodeRef::new_text(link_node.text_contents());
                    link_node.insert_before(text_node);
                    link_node.detach();
                } else {
                    // Anything else keeps its children, re-homed in a <span>.
                    let container = NodeRef::new_element(
                        QualName::new(None, Namespace::from(HTML_NS), LocalName::from("span")),
                        BTreeMap::new(),
                    );
                    let mut children = link_node.children();
                    let mut child = children.next();
                    while let Some(child_ref) = child {
                        child = children.next();
                        container.append(child_ref);
                    }
                    link_node.insert_before(container);
                    link_node.detach();
                }
            } else {
                link_attrs.insert("href", to_absolute_uri(&href));
            }
        }
        let media_nodes = article_node
            .select("img, picture, figure, video, audio, source")
            .unwrap();
        for media_node in media_nodes {
            let mut media_attrs = media_node.attributes.borrow_mut();
            if let Some(src) = media_attrs.get_mut("src") {
                *src = to_absolute_uri(&src);
            }
            if let Some(poster) = media_attrs.get_mut("poster") {
                *poster = to_absolute_uri(&poster);
            }
            if let Some(srcset) = media_attrs.get_mut("srcset") {
                // Capture 1 is passed through `to_absolute_uri`; capture 2 (optional)
                // and capture 3 are re-appended unchanged.
                let new_srcset = regexes::SRCSET_CAPTURE_REGEX.replace_all(
                    &srcset,
                    |captures: &regex::Captures| {
                        to_absolute_uri(&captures[1])
                            + &captures.get(2).map(|cap| cap.as_str()).unwrap_or("")
                            + &captures[3]
                    },
                );
                *srcset = new_srcset.to_string();
            }
        }
    }
}
/// Strips the bookkeeping attributes (`readability-score` and `readability-data-table`)
/// from every element of the extracted article. Does nothing when there is no article node.
fn clean_readability_attrs(&mut self) {
    let article_node = match &mut self.article_node {
        Some(node) => node,
        None => return,
    };
    article_node
        .inclusive_descendants()
        .elements()
        .for_each(|element| {
            let mut attrs = element.attributes.borrow_mut();
            attrs.remove(READABILITY_SCORE);
            attrs.remove("readability-data-table");
        });
}
/// Run any post-process modifications to article content as necessary.
///
/// In order: makes link and media URIs absolute relative to `url`, strips `class`
/// attributes, and removes the readability bookkeeping attributes.
fn post_process_content(&mut self, url: &str) {
self.fix_relative_uris(url);
// TODO: Add flag check
self.clean_classes();
self.clean_readability_attrs();
}
/// Converts an inline CSS string to a [HashMap] of property and value(s).
///
/// Declarations are separated by `;`, and each one is split at its first `:` only, so
/// values that themselves contain a colon (for example `url(http://...)`) stay intact.
/// Property names and values are trimmed. Declarations without a `:` or with an empty
/// property name are skipped. When a property occurs more than once, the last
/// occurrence wins.
fn inline_css_str_to_map(css_str: &str) -> HashMap<&str, &str> {
    css_str
        .split(';')
        .filter_map(|declaration| {
            let mut parts = declaration.splitn(2, ':');
            let property = parts.next()?.trim();
            let value = parts.next()?.trim();
            if property.is_empty() {
                None
            } else {
                Some((property, value))
            }
        })
        .collect()
}
/// Heuristically decides whether a node would be rendered.
///
/// An element counts as hidden when its inline style sets `display` to `none`, when it
/// has a `hidden` attribute, or when it has `aria-hidden="true"` without the
/// `fallback-image` class. Non-element nodes are reported as visible.
fn is_probably_visible(node_ref: &NodeRef) -> bool {
    let elem_data = match node_ref.as_element() {
        Some(elem_data) => elem_data,
        // Technically, it should not matter what value is returned here
        None => return true,
    };
    let attributes = elem_data.attributes.borrow();

    let display_is_none = attributes.get("style").map_or(false, |css_str| {
        Self::inline_css_str_to_map(css_str).get("display") == Some(&"none")
    });
    if display_is_none || attributes.contains("hidden") {
        return false;
    }

    let aria_hidden = attributes.get("aria-hidden") == Some("true");
    // check for "fallback-image" so that wikimedia math images are displayed
    let is_fallback_image = attributes
        .get("class")
        .map_or(false, |class_list| {
            class_list.split(" ").any(|class| class == "fallback-image")
        });
    !aria_hidden || is_fallback_image
}
/// Decides whether `input` could be a byline: once trimmed it must be non-empty and
/// shorter than 100 bytes.
fn is_valid_byline(input: &str) -> bool {
    let trimmed_len = input.trim().len();
    (1..100).contains(&trimmed_len)
}
/// Records the text of `node_ref` as the byline when the node looks like an author
/// credit, and reports whether it did so.
///
/// Nothing happens (and `false` is returned) when a byline is already stored or the node
/// is not an element. A `rel` attribute, when present, decides alone (`rel="author"`);
/// otherwise an `itemprop` containing `author` decides; otherwise `match_string` is
/// tested with `regexes::is_match_byline`. The node text must also pass `is_valid_byline`.
fn check_byline(&mut self, node_ref: &NodeRef, match_string: &str) -> bool {
    if self.byline.is_some() {
        return false;
    }
    let elem_data = match node_ref.as_element() {
        Some(elem_data) => elem_data,
        None => return false,
    };
    let elem_attrs = elem_data.attributes.borrow();
    let looks_like_author = match (elem_attrs.get("rel"), elem_attrs.get("itemprop")) {
        (Some(rel), _) => rel == "author",
        (None, Some(itemprop)) => itemprop.contains("author"),
        (None, None) => regexes::is_match_byline(match_string),
    };
    let text = node_ref.text_contents();
    let is_byline = looks_like_author && Self::is_valid_byline(&text);
    if is_byline {
        self.byline = Some(text.trim().to_owned());
    }
    is_byline
}
/// Returns the element that follows `node_ref` in a depth-first walk of the DOM.
///
/// With `ignore_self_and_kids` set, the node's own children are skipped, which is what a
/// caller wants when the node is about to be removed. The search order is: first element
/// child, then next element sibling, then the next element sibling of the closest
/// ancestor that has one. `None` means the walk is finished.
fn get_next_node(node_ref: &NodeRef, ignore_self_and_kids: bool) -> Option<NodeRef> {
    // WARN: The uses of `next_element` here assume it returns an element node.
    let has_element_children = node_ref.children().elements().next().is_some();
    if !ignore_self_and_kids && has_element_children {
        return Self::next_element(node_ref.first_child(), true);
    }
    if let Some(sibling) = Self::next_element(node_ref.next_sibling(), true) {
        return Some(sibling);
    }
    // Climb until some ancestor has an element sibling after it.
    let mut ancestor = node_ref.parent();
    while let Some(current) = ancestor {
        if let Some(sibling) = Self::next_element(current.next_sibling(), true) {
            return Some(sibling);
        }
        ancestor = current.parent();
    }
    None
}
/// Detaches `node_ref` from the tree and returns the node a depth-first walk should
/// visit next, as determined by [get_next_node] with the node's subtree skipped.
fn remove_and_get_next(node_ref: NodeRef) -> Option<NodeRef> {
    // The successor has to be located while the node is still attached to its siblings.
    let successor = Self::get_next_node(&node_ref, true);
    node_ref.detach();
    successor
}
/// Looks for an ancestor element named `tag_name` that also satisfies `filter_fn` (when
/// one is given).
///
/// `max_depth` defaults to 3; a value of zero or less disables the depth limit. The
/// search stops, returning `false`, as soon as a non-element ancestor (such as the
/// document node) is reached.
fn has_ancestor_tag(
    node_ref: &NodeRef,
    tag_name: &str,
    max_depth: Option<i32>,
    filter_fn: Option<fn(&NodeRef) -> bool>,
) -> bool {
    let depth_limit = max_depth.unwrap_or(3);
    let mut depth = 0;
    let mut current = node_ref.parent();
    while let Some(ancestor) = current {
        let ancestor_elem = match ancestor.as_element() {
            Some(elem) => elem,
            // The climb may reach a document node, at which point it must stop
            None => return false,
        };
        if depth_limit > 0 && depth > depth_limit {
            return false;
        }
        let name_matches = &ancestor_elem.name.local == tag_name;
        if name_matches && filter_fn.map_or(true, |accept| accept(&ancestor)) {
            return true;
        }
        current = ancestor.parent();
        depth += 1;
    }
    false
}
/// True for an element that has no text and either no children at all or as many
/// children as there are `<br>` and `<hr>` elements selected within it.
fn is_element_without_content(node_ref: &NodeRef) -> bool {
    if node_ref.as_element().is_none() || !node_ref.text_contents().trim().is_empty() {
        return false;
    }
    let child_count = node_ref.children().count();
    if child_count == 0 {
        return true;
    }
    let line_breaks = node_ref.select("br").unwrap().count();
    let rules = node_ref.select("hr").unwrap().count();
    child_count == line_breaks + rules
}
/// Checks that `node_ref` has exactly one element child, that this child is named
/// `tag_name`, and that none of the node's direct text children is accepted by
/// `regexes::is_match_has_content`.
fn has_single_tag_inside_element(node_ref: &NodeRef, tag_name: &str) -> bool {
    let mut element_children = node_ref.children().elements();
    let only_child = match (element_children.next(), element_children.next()) {
        (Some(child), None) => child,
        _ => return false,
    };
    if &only_child.name.local != tag_name {
        return false;
    }
    let has_text_content = node_ref.children().any(|node| {
        node.as_text().is_some()
            && regexes::is_match_has_content(&node.text_contents().trim_end())
    });
    !has_text_content
}
/// Returns the trimmed text content of `node_ref`. Unless `normalize_spaces` is
/// `Some(false)`, every match of `regexes::NORMALIZE_REGEX` is replaced by one space.
fn get_inner_text(node_ref: &NodeRef, normalize_spaces: Option<bool>) -> String {
    let raw = node_ref.text_contents();
    let trimmed = raw.trim();
    match normalize_spaces {
        Some(false) => trimmed.to_owned(),
        _ => regexes::NORMALIZE_REGEX
            .replace_all(trimmed, " ")
            .to_string(),
    }
}
/// Computes the share of a node's text that sits inside `<a>` elements: the summed
/// length of all link text divided by the length of the node's text. A node without
/// text has a density of zero.
fn get_link_density(node_ref: &NodeRef) -> f32 {
    let total_len = Self::get_inner_text(node_ref, None).len() as f32;
    if total_len == 0_f32 {
        return 0_f32;
    }
    let anchors = node_ref.select("a").unwrap();
    let link_len = anchors
        .map(|anchor| Self::get_inner_text(anchor.as_node(), None).len() as f32)
        .sum::<f32>();
    link_len / total_len
}
/// Reports whether any element below `node_ref`, at any depth, is a block-level element.
fn has_child_block_element(node_ref: &NodeRef) -> bool {
    // TODO: Refer to a static HashSet
    const BLOCK_LEVEL_ELEMS: [&str; 32] = [
        "address", "article", "aside", "blockquote", "details", "dialog", "dd", "div", "dl",
        "dt", "fieldset", "figcaption", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6",
        "header", "hgroup", "hr", "li", "main", "nav", "ol", "p", "pre", "section", "table",
        "ul",
    ];
    for child in node_ref.children() {
        if let Some(child_elem) = child.as_element() {
            let tag: &str = &child_elem.name.local;
            if BLOCK_LEVEL_ELEMS.contains(&tag) || Self::has_child_block_element(&child) {
                return true;
            }
        }
    }
    false
}
/// Collects up to `max_depth` ancestors of `node_ref` into a [Vec], in the order the
/// `ancestors()` iterator yields them. Without a `max_depth`, one ancestor is returned.
fn get_node_ancestors(node_ref: &NodeRef, max_depth: Option<usize>) -> Vec<NodeRef> {
    let limit = max_depth.unwrap_or(1);
    let limited_ancestors = node_ref.ancestors().take(limit);
    limited_ancestors.collect()
}
/// Scores an element by its `id` and `class` attributes.
///
/// Each non-blank attribute contributes +25 when it matches the positive pattern, or -25
/// when it matches the negative pattern. The result is always 0 while
/// `FLAG_WEIGHT_CLASSES` is inactive.
///
/// Panics when `node_ref` is not an element node.
fn get_class_weight(&self, node_ref: &NodeRef) -> i32 {
    if !self.flag_is_active(FLAG_WEIGHT_CLASSES) {
        return 0;
    }
    // NOTE(review): the positive pattern is tested first, so a value matching both
    // patterns scores +25; confirm this precedence is intended.
    let weigh = |value: Option<&str>| -> i32 {
        match value {
            Some(text) if !text.trim().is_empty() => {
                if regexes::is_match_positive(text) {
                    25
                } else if regexes::is_match_negative(text) {
                    -25
                } else {
                    0
                }
            }
            _ => 0,
        }
    };
    let node_attrs = node_ref.as_element().unwrap().attributes.borrow();
    let id_weight = weigh(node_attrs.get("id"));
    let class_weight = weigh(node_attrs.get("class"));
    id_weight + class_weight
}
/// Gives an element its initial `readability-score` attribute: the class/id weight plus
/// a bonus or penalty that depends on the tag name. Non-element nodes are left untouched.
fn initialize_node(&self, node_ref: &mut NodeRef) {
    let element = match node_ref.as_element() {
        Some(element) => element,
        None => return,
    };
    // The class weight reads the attributes, so it has to be computed before they are
    // borrowed mutably below.
    let class_weight = self.get_class_weight(node_ref) as f32;
    let tag_adjustment: f32 = match &*element.name.local {
        "div" => 5.0,
        "pre" | "td" | "blockquote" => 3.0,
        "address" | "ol" | "ul" | "dl" | "dd" | "dt" | "li" | "form" => -3.0,
        "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "th" => -5.0,
        _ => 0.0,
    };
    let score = class_weight + tag_adjustment;
    element
        .attributes
        .borrow_mut()
        .insert(READABILITY_SCORE, score.to_string());
}
/// Counts the rows and columns of a table.
///
/// `rows` is the sum of the `rowspan` values of every `<tr>` selected under `node_ref`.
/// `columns` is the largest per-row sum of `<td>` `colspan` values. A missing span
/// attribute counts as 1, and so does one that cannot be parsed as an unsigned integer
/// (for example `colspan="abc"`), so malformed markup cannot cause a panic. Sums
/// saturate instead of overflowing.
fn get_row_and_column_count(node_ref: &NodeRef) -> SizeInfo {
    // Reads a span attribute value, falling back to 1 when absent or malformed.
    let span_of = |raw: Option<&str>| -> usize {
        raw.and_then(|value| value.trim().parse::<usize>().ok())
            .unwrap_or(1)
    };
    let mut rows: usize = 0;
    let mut columns: usize = 0;
    if let Ok(trs) = node_ref.select("tr") {
        for tr in trs {
            let rowspan = span_of(tr.attributes.borrow().get("rowspan"));
            rows = rows.saturating_add(rowspan);
            let mut columns_in_row: usize = 0;
            if let Ok(cells) = tr.as_node().select("td") {
                for cell in cells {
                    let colspan = span_of(cell.attributes.borrow().get("colspan"));
                    columns_in_row = columns_in_row.saturating_add(colspan);
                }
            }
            columns = columns.max(columns_in_row);
        }
    }
    SizeInfo { rows, columns }
}
/// Tags every `<table>` in the document with a `readability-data-table` attribute set to
/// `"true"` for data tables and `"false"` for layout tables, using checks similar to
/// https://dxr.mozilla.org/mozilla-central/rev/71224049c0b52ab190564d3ea0eab089a159a4cf/accessible/html/HTMLTableAccessible.cpp#920
///
/// The checks run in this order and the first one that applies decides:
/// `role="presentation"` or `datatable="0"` → layout; a `summary` attribute, a non-empty
/// `<caption>`, or any of `DATA_TABLE_DESCENDANTS` → data; a nested table → layout;
/// finally the size: at least 10 rows, more than 4 columns, or more than 10 cells → data.
fn mark_data_tables(&mut self) {
    let tables = match self.root_node.select("table") {
        Ok(tables) => tables,
        Err(_) => return,
    };
    for table in tables {
        let mut table_attr = table.attributes.borrow_mut();
        let table_node = table.as_node();
        let is_data_table = if table_attr.get("role") == Some("presentation")
            || table_attr.get("datatable") == Some("0")
        {
            false
        } else if table_attr.contains("summary") {
            true
        } else if table_node
            .select_first("caption")
            .map(|caption| caption.as_node().children().count() > 0)
            .unwrap_or(false)
        {
            true
        } else if DATA_TABLE_DESCENDANTS
            .iter()
            .any(|tag_name| table_node.select_first(tag_name).is_ok())
        {
            true
        } else if table_node.select("table").unwrap().count() > 1 {
            // The selection includes the table itself, so more than one match means
            // another table is nested inside.
            false
        } else {
            let size_info = Self::get_row_and_column_count(table_node);
            size_info.rows >= 10
                || size_info.columns > 4
                || (size_info.rows * size_info.columns) > 10
        };
        table_attr.insert("readability-data-table", is_data_table.to_string());
    }
}
/// Convert images and figures that have properties like data-src into images that can be loaded without JS
///
/// For every `<img>`, `<picture>` and `<figure>` under `node_ref`:
/// 1. A tiny base64 placeholder `src` is dropped when another attribute already holds a
///    real image URL (SVG data URLs are always left alone).
/// 2. Elements that already have a `src`/`srcset` and whose class does not mention
///    "lazy" are left untouched.
/// 3. Otherwise every other attribute whose value looks like an image URL (or srcset) is
///    copied into `src`/`srcset`. For a `<figure>` that contains no `<img>`/`<picture>`,
///    a new `<img>` child carrying that attribute is appended instead.
fn fix_lazy_images(node_ref: &mut NodeRef) {
    let nodes = node_ref.select("img, picture, figure").unwrap();
    for node in nodes {
        let mut node_attr = node.attributes.borrow_mut();

        if let Some(src) = node_attr.get("src") {
            // `Some(mime)` when `src` is a base64 data URL; the inner option is the
            // captured MIME type.
            let data_url_mime = regexes::B64_DATA_URL_REGEX
                .captures(src)
                .map(|caps| caps.get(1).map(|mime| mime.as_str()));
            if let Some(mime) = data_url_mime {
                // SVGs can be meaningful even when very small, so never touch them.
                if mime == Some("image/svg+xml") {
                    continue;
                }
                // Only drop the placeholder if some other attribute references an image.
                let src_could_be_removed = node_attr
                    .map
                    .iter()
                    .filter(|(name, _)| &name.local != "src")
                    .any(|(_, val)| regexes::is_match_img_ext(&val.value));
                if src_could_be_removed {
                    // NOTE(review): the length is measured from the start of the base64
                    // marker, so it presumably includes the marker itself — confirm
                    // against `regexes::BASE64_REGEX` and the 133 threshold.
                    let b64_length = regexes::BASE64_REGEX
                        .find(src)
                        .map(|b64_match| src.len() - b64_match.start());
                    if b64_length.map_or(false, |length| length < 133) {
                        node_attr.remove("src");
                    }
                }
            }
        }

        // Leave the element alone when it already has a source and is not flagged as
        // lazy. A missing `class` attribute counts as "not lazy".
        let has_source = node_attr.get("src").is_some() || node_attr.get("srcset").is_some();
        let is_lazy = node_attr
            .get("class")
            .map_or(false, |class| class.to_lowercase().contains("lazy"));
        if has_source && !is_lazy {
            continue;
        }

        // Snapshot the attribute values to copy before mutating the attribute map.
        let pending: Vec<(&str, String)> = node_attr
            .map
            .iter()
            .filter(|(key, _)| !(&key.local == "src" || &key.local == "srcset"))
            .filter_map(|(_, val)| {
                let copy_to = if regexes::is_match_srcset(&val.value) {
                    "srcset"
                } else if regexes::is_match_src_regex(&val.value) {
                    "src"
                } else {
                    return None;
                };
                Some((copy_to, val.value.clone()))
            })
            .collect();

        let tag_name = &node.name.local;
        for (copy_to, new_val) in pending {
            if tag_name == "img" || tag_name == "picture" {
                node_attr.insert(copy_to, new_val);
            } else if tag_name == "figure" {
                let figure = node.as_node();
                // Only synthesise an image when the figure does not already hold one.
                let lacks_image = figure.select("img, picture").unwrap().next().is_none();
                if lacks_image {
                    let img = NodeRef::new_element(
                        QualName::new(None, Namespace::from(HTML_NS), LocalName::from("img")),
                        BTreeMap::new(),
                    );
                    {
                        let mut img_attr = img.as_element().unwrap().attributes.borrow_mut();
                        img_attr.insert(copy_to, new_val);
                    }
                    figure.append(img);
                }
            }
        }
    }
}
/// Clean an element of all tags of type "tag" if they look fishy. "Fishy" is an algorithm
/// based on content length, classnames, link density, number of images & embeds, etc.
///
/// Does nothing unless `FLAG_CLEAN_CONDITIONALLY` is active. Tables marked as data tables
/// (`readability-data-table="true"`) and anything nested inside such a table are never
/// removed.
fn clean_conditionally(&self, node_ref: &mut NodeRef, tag_name: &str) {
    if !self.flag_is_active(FLAG_CLEAN_CONDITIONALLY) {
        return;
    }
    let is_list = tag_name == "ul" || tag_name == "ol";
    let is_data_table = |node_ref: &NodeRef| {
        let node_elem = node_ref.as_element().unwrap();
        let attrs = node_elem.attributes.borrow();
        attrs.get("readability-data-table") == Some("true")
    };
    // Number of commas in the node's text.
    let get_char_count = |node_ref: &NodeRef| node_ref.text_contents().matches(",").count();
    let mut nodes = node_ref
        .descendants()
        .select(tag_name)
        .unwrap()
        // Do not remove data tables
        .filter(|node_data_ref| {
            !(&node_data_ref.name.local == "table" && is_data_table(node_data_ref.as_node()))
        })
        // Do not remove if it is a child of a data table. The ancestor searched for is
        // always a `table`, whatever `tag_name` is being cleaned.
        .filter(|node_data_ref| {
            !Self::has_ancestor_tag(
                node_data_ref.as_node(),
                "table",
                Some(-1),
                Some(is_data_table),
            )
        });
    // Fetch the following match before a node is possibly detached.
    let mut next_node = nodes.next();
    while let Some(node_data_ref) = next_node {
        next_node = nodes.next();
        let node = node_data_ref.as_node();
        let weight = self.get_class_weight(node);
        // Remove all elements with negative class weights
        if weight < 0 {
            node.detach();
            continue;
        }
        // Plenty of commas: treat it as real content and keep it.
        if get_char_count(node) >= 10 {
            continue;
        }
        let mut embeds = node_data_ref
            .as_node()
            .select("object, embed, iframe")
            .unwrap();
        // Keep the node if it holds an <object>, or an embed/iframe with an attribute
        // matching the video regex.
        let can_skip_embed = embeds.any(|node_data_ref| {
            &node_data_ref.name.local == "object" || {
                let attrs = node_data_ref.attributes.borrow();
                attrs
                    .map
                    .iter()
                    .any(|(_, val)| regexes::is_match_videos(&val.value))
            }
        });
        if can_skip_embed {
            continue;
        }
        let p_nodes = node_data_ref.as_node().select("p").unwrap().count();
        let img_nodes = node_data_ref.as_node().select("img").unwrap().count();
        // The -100 offset means only very long lists trip the `li > p` rule below.
        let li_nodes = node_data_ref.as_node().select("li").unwrap().count() as i32 - 100;
        let input_nodes = node_data_ref.as_node().select("input").unwrap().count();
        let p = p_nodes as f32;
        let img = img_nodes as f32;
        let embed_count = node.select("object, embed, iframe").unwrap().count();
        let link_density = Self::get_link_density(node);
        let content_length = Self::get_inner_text(node, None).len();
        let has_figure_ancestor = Self::has_ancestor_tag(node, "figure", None, None);
        let have_to_remove = (img_nodes > 1 && p / img < 0.5 && !has_figure_ancestor)
            || (!is_list && li_nodes > p_nodes as i32)
            || (input_nodes > (p_nodes / 3))
            || (!is_list
                && content_length < 25
                && (img_nodes == 0 || img_nodes > 2)
                && !has_figure_ancestor)
            || (!is_list && weight < 25 && link_density > 0.2)
            || (weight >= 25 && link_density > 0.5)
            || ((embed_count == 1 && content_length < 75) || embed_count > 1);
        if have_to_remove {
            node.detach();
        }
    }
}
/// Clean a node of all elements of type "tag". (Unless it's a YouTube or Vimeo video)
///
/// For `object`, `embed` and `iframe`, an element is kept when any of its attribute
/// values matches the video regex; every other matching element is detached.
fn clean(node_ref: &mut NodeRef, tag_name: &str) {
    let is_embed = ["object", "embed", "iframe"].contains(&tag_name);
    let mut nodes = node_ref
        .descendants()
        .select(tag_name)
        .unwrap()
        .filter(|node_data_ref| {
            if !is_embed {
                return true;
            }
            // Allow video embeds through as people usually want to see those.
            // This currently does not check the innerHTML of <object> elements.
            let attrs = node_data_ref.attributes.borrow();
            !attrs
                .map
                .iter()
                .any(|(_, val)| regexes::is_match_videos(&val.value))
        });
    // Fetch the following match before detaching the current one.
    let mut node = nodes.next();
    while let Some(node_data_ref) = node {
        node = nodes.next();
        node_data_ref.as_node().detach()
    }
}
/// Clean out spurious headers from an Element.
///
/// Every `<h1>`/`<h2>` descendant of `node_ref` whose class weight is negative is
/// detached from the tree.
fn clean_headers(&self, node_ref: &mut NodeRef) {
    let mut spurious_headers = node_ref
        .descendants()
        .select("h1, h2")
        .unwrap()
        .filter(|header| self.get_class_weight(header.as_node()) < 0);
    // Always look up the following header before detaching the current one.
    let mut pending = spurious_headers.next();
    loop {
        let header = match pending {
            Some(header) => header,
            None => break,
        };
        pending = spurious_headers.next();
        header.as_node().detach();
    }
}
/// Remove the style attribute on every element and descendants.
///
/// Strips every attribute listed in `PRESENTATIONAL_ATTRIBUTES`, plus `width`/`height` on
/// the elements listed in `DEPRECATED_SIZE_ATTRIBUTE_ELEMS`. An `<svg>` element and its
/// whole subtree are left untouched, since SVG content relies on those attributes.
fn clean_styles(node_ref: &mut NodeRef) {
    // Explicit work list so that an <svg> subtree can be skipped as a whole.
    let mut pending = vec![node_ref.clone()];
    while let Some(node) = pending.pop() {
        if let Some(elem) = node.as_element() {
            if &elem.name.local == "svg" {
                continue;
            }
            let mut attrs = elem.attributes.borrow_mut();
            for pres_attr in PRESENTATIONAL_ATTRIBUTES.iter() {
                attrs.remove(*pres_attr);
            }
            if DEPRECATED_SIZE_ATTRIBUTE_ELEMS.contains(&elem.name.local.as_ref()) {
                attrs.remove("width");
                attrs.remove("height");
            }
        }
        pending.extend(node.children());
    }
}
/// Clean out elements that match the specified conditions
///
/// Walks the nodes following `node_ref` (as produced by `get_next_node`) up to, but not
/// including, the node that follows `node_ref` when its subtree is skipped. `filter_fn`
/// receives each node together with its `"<class> <id>"` string; nodes for which it
/// returns `true` are removed.
fn clean_matched_nodes(node_ref: &mut NodeRef, filter_fn: impl Fn(&NodeRef, &str) -> bool) {
    let stop_at = Self::get_next_node(node_ref, true);
    let mut cursor = Self::get_next_node(node_ref, false);
    while let Some(current) = cursor {
        if Some(&current) == stop_at.as_ref() {
            break;
        }
        let attrs = current.as_element().unwrap().attributes.borrow();
        let match_string = format!(
            "{} {}",
            attrs.get("class").unwrap_or(""),
            attrs.get("id").unwrap_or("")
        );
        cursor = if filter_fn(&current, &match_string) {
            Self::remove_and_get_next(current.clone())
        } else {
            Self::get_next_node(&current, false)
        };
    }
}
/// Prepare the article node for display. Clean out any inline styles, iframes,
/// forms, strip extraneous <p> tags, etc.
///
/// The steps run in a fixed order: presentational attributes, data-table marking, lazy
/// images, conditional/unconditional removal of unwanted tags, share widgets, a lone
/// `<h2>` duplicating the title, empty paragraphs, `<br>` directly before a `<p>`, and
/// finally single-cell tables, which are replaced in place by their cell content.
fn prep_article(&mut self, node_ref: &mut NodeRef) {
    Self::clean_styles(node_ref);
    // NOTE(review): tables are marked by searching `self.root_node`, not `node_ref`. If
    // `node_ref` has been detached from the document, its tables would not be marked
    // before `clean_conditionally(.., "table")` runs below — confirm.
    self.mark_data_tables();
    Self::fix_lazy_images(node_ref);
    self.clean_conditionally(node_ref, "form");
    self.clean_conditionally(node_ref, "fieldset");
    Self::clean(node_ref, "object");
    Self::clean(node_ref, "embed");
    Self::clean(node_ref, "h1");
    Self::clean(node_ref, "footer");
    Self::clean(node_ref, "link");
    Self::clean(node_ref, "aside");

    // Remove short share widgets found under each top-level child.
    node_ref.children().for_each(|mut node| {
        Self::clean_matched_nodes(&mut node, |node: &NodeRef, match_string| {
            regexes::is_match_share_elems(match_string)
                && node.text_contents().len() < DEFAULT_CHAR_THRESHOLD
        });
    });

    // If there is exactly one <h2> and it roughly repeats the article title, drop it.
    let h2_nodes = node_ref.select("h2").unwrap().take(2).collect::<Vec<_>>();
    if h2_nodes.len() == 1 {
        let h2_node = h2_nodes[0].as_node();
        let length_similar_rate = ((h2_node.text_contents().len() as isize
            - self.article_title.len() as isize) as f32)
            / self.article_title.len() as f32;
        if length_similar_rate.abs() < 0.5 {
            let titles_match = if length_similar_rate > 0.0 {
                h2_node.text_contents().contains(&self.article_title)
            } else {
                self.article_title.contains(&h2_node.text_contents())
            };
            if titles_match {
                Self::clean(node_ref, "h2");
            }
        }
    }

    Self::clean(node_ref, "iframe");
    Self::clean(node_ref, "input");
    Self::clean(node_ref, "textarea");
    Self::clean(node_ref, "select");
    Self::clean(node_ref, "button");
    self.clean_headers(node_ref);
    self.clean_conditionally(node_ref, "table");
    self.clean_conditionally(node_ref, "ul");
    self.clean_conditionally(node_ref, "div");

    // Remove paragraphs that hold neither media nor text.
    let mut p_nodes = node_ref.select("p").unwrap().filter(|node_data_ref| {
        let p_node = node_data_ref.as_node();
        let img_count = p_node.select("img").unwrap().count();
        let embed_count = p_node.select("embed").unwrap().count();
        let object_count = p_node.select("object").unwrap().count();
        let iframe_count = p_node.select("iframe").unwrap().count();
        let total = img_count + embed_count + object_count + iframe_count;
        total == 0 && Self::get_inner_text(node_data_ref.as_node(), Some(false)).is_empty()
    });
    let mut p_node = p_nodes.next();
    while let Some(p_node_ref) = p_node {
        p_node = p_nodes.next();
        p_node_ref.as_node().detach();
    }

    // Remove <br> elements whose next element is a <p>.
    let mut br_nodes = node_ref.select("br").unwrap().filter(|node_data_ref| {
        let br_node = node_data_ref.as_node();
        // WARN: This assumes `next_element` returns an element node.
        let next_node = Self::next_element(br_node.next_sibling(), true);
        next_node.is_some() && &next_node.unwrap().as_element().unwrap().name.local == "p"
    });
    let mut br_node = br_nodes.next();
    while let Some(br_node_ref) = br_node {
        br_node = br_nodes.next();
        br_node_ref.as_node().detach();
    }

    // Replace single-cell tables with their content.
    let mut table_nodes = node_ref.select("table").unwrap();
    let mut table_node = table_nodes.next();
    while let Some(table_node_ref) = table_node {
        table_node = table_nodes.next();
        let table_node = table_node_ref.as_node();
        // WARN: This assumes `next_element` returns an element node.
        let table_child = Self::next_element(table_node.first_child(), true);
        let tbody = if Self::has_single_tag_inside_element(&table_node, "tbody") {
            table_child.as_ref().unwrap()
        } else {
            table_node
        };
        // WARN: This block assumes `next_element` returns an element node
        if Self::has_single_tag_inside_element(tbody, "tr") {
            let row = Self::next_element(tbody.first_child(), true).unwrap();
            if Self::has_single_tag_inside_element(&row, "td") {
                let mut cell = Self::next_element(row.first_child(), true).unwrap();
                let tag = if cell
                    .children()
                    .all(|cell_child| Self::is_phrasing_content(&cell_child))
                {
                    "p"
                } else {
                    "div"
                };
                cell = Self::set_node_tag(&cell, tag);
                if table_node.parent().is_some() {
                    // Put the cell where the table was so document order is preserved,
                    // then drop the now redundant table.
                    table_node.insert_before(cell);
                    table_node.detach();
                }
            }
        }
    }
}
fn flag_is_active(&self, flag: u32) -> bool {
    // True when at least one bit of `flag` is set in the current flag set.
    (self.flags & flag) != 0
}
fn remove_flag(&mut self, flag: u32) {
    // Clear every bit of `flag`, leaving the other flags untouched.
    self.flags &= !flag;
}
/// Using a variety of metrics (content score, classname, element types), find the content that is most likely to be the stuff
/// a user wants to read. Then return it wrapped up in a div.
///
/// On success the extracted content is stored in `self.article_node` and, if an ancestor
/// of the chosen candidate declares one, the text direction in `self.article_dir`.
/// Nothing is stored when the document has no `<body>` or when no attempt yields text.
///
/// When the result is shorter than `DEFAULT_CHAR_THRESHOLD` the extraction is retried
/// with `FLAG_STRIP_UNLIKELYS`, then `FLAG_WEIGHT_CLASSES`, then
/// `FLAG_CLEAN_CONDITIONALLY` removed; after the last round the longest attempt wins.
fn grab_article(&mut self) {
    println!("Grabbing article");
    let page = match self.root_node.select_first("body") {
        Ok(page) => page,
        Err(_) => {
            // TODO:Have error logging for this
            println!("Document has no <body>");
            return;
        }
    };
    let mut attempts: Vec<ExtractAttempt> = Vec::new();
    // TODO: Add page cache
    // NOTE(review): without a cached copy of the page, each retry round works on the
    // tree as mutated by the previous round — confirm this is acceptable.
    loop {
        let strip_unlikely_candidates = self.flag_is_active(FLAG_STRIP_UNLIKELYS);

        // First, node prepping. Trash nodes that look cruddy (like ones with the
        // class name "comment", etc), and turn divs into P tags where they have been
        // used inappropriately (as in, where they contain no other block level elements.)
        let mut elements_to_score: Vec<NodeRef> = Vec::new();
        let mut node = Some(
            self.root_node
                .select_first("html")
                .unwrap()
                .as_node()
                .clone(),
        );
        while let Some(node_ref) = node {
            let node_elem = node_ref.as_element().unwrap();
            let node_name: &str = node_elem.name.local.as_ref();
            let match_string = {
                let node_attrs = node_elem.attributes.borrow();
                node_attrs.get("class").unwrap_or("").to_string()
                    + " "
                    + node_attrs.get("id").unwrap_or("")
            };
            if !Self::is_probably_visible(&node_ref) {
                node = Self::remove_and_get_next(node_ref);
                continue;
            }
            if self.check_byline(&node_ref, &match_string) {
                node = Self::remove_and_get_next(node_ref);
                continue;
            }
            if strip_unlikely_candidates {
                if regexes::is_match_unlikely(&match_string)
                    && !regexes::is_match_ok_maybe(&match_string)
                    && !Self::has_ancestor_tag(&node_ref, "table", None, None)
                    && node_name != "body"
                    && node_name != "a"
                {
                    node = Self::remove_and_get_next(node_ref);
                    continue;
                }
                let is_complementary = {
                    let node_attrs = node_elem.attributes.borrow();
                    node_attrs.get("role") == Some("complementary")
                };
                if is_complementary {
                    node = Self::remove_and_get_next(node_ref);
                    continue;
                }
            }
            // Remove container/heading elements that carry no content.
            match node_name {
                "div" | "section" | "header" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
                    if Self::is_element_without_content(&node_ref) {
                        node = Self::remove_and_get_next(node_ref);
                        continue;
                    }
                }
                _ => (),
            }
            if DEFAULT_TAGS_TO_SCORE.contains(&node_name) {
                elements_to_score.push(node_ref.clone());
            }

            // The node the traversal continues from. It differs from `node_ref` when the
            // div below is replaced by another element, because the replaced div is
            // detached and can no longer lead to the rest of the document.
            let mut resume_from = node_ref.clone();
            if node_name == "div" {
                // Put runs of phrasing content into paragraphs.
                let mut p: Option<NodeRef> = None;
                let mut child_node = node_ref.first_child();
                while let Some(child_node_ref) = child_node {
                    let next_sibling = child_node_ref.next_sibling();
                    if Self::is_phrasing_content(&child_node_ref) {
                        if let Some(ref p_node) = p {
                            p_node.append(child_node_ref);
                        } else if !Self::is_whitespace(&child_node_ref) {
                            let new_p_node = NodeRef::new_element(
                                QualName::new(
                                    None,
                                    Namespace::from(HTML_NS),
                                    LocalName::from("p"),
                                ),
                                BTreeMap::new(),
                            );
                            child_node_ref.insert_before(new_p_node);
                            p = child_node_ref.previous_sibling();
                            // Append will implicitly detach the child_node_ref
                            p.as_mut().unwrap().append(child_node_ref);
                        }
                    } else if let Some(ref p_node) = p {
                        // A block element ends the run: trim trailing whitespace.
                        while let Some(last_child) = p_node.last_child() {
                            if Self::is_whitespace(&last_child) {
                                last_child.detach();
                            } else {
                                break;
                            }
                        }
                        p = None;
                    }
                    child_node = next_sibling;
                }

                if Self::has_single_tag_inside_element(&node_ref, "p")
                    && Self::get_link_density(&node_ref) < 0.25
                {
                    // The div only wraps a single <p>: replace the div with that <p>.
                    // WARN: This assumes `next_element` returns an element node.
                    let new_node = Self::next_element(node_ref.first_child(), true).unwrap();
                    node_ref.insert_before(new_node.clone());
                    node_ref.detach();
                    elements_to_score.push(new_node.clone());
                    resume_from = new_node;
                } else if !Self::has_child_block_element(&node_ref) {
                    resume_from = Self::set_node_tag(&node_ref, "p");
                    elements_to_score.push(resume_from.clone());
                }
            }
            node = Self::get_next_node(&resume_from, false);
        }

        // Loop through all paragraphs and assign a score to them based on how
        // content-y they look, then add that score to their ancestors.
        let mut candidates: Vec<NodeRef> = Vec::new();
        elements_to_score
            .iter()
            .filter(|node_ref| {
                let parent = node_ref.parent();
                parent.is_some() && parent.unwrap().as_element().is_some()
            })
            .map(|node_ref| (node_ref, Self::get_inner_text(&node_ref, None)))
            .filter(|(_, inner_text)| inner_text.len() >= 25)
            .map(|(node_ref, inner_text)| {
                (inner_text, Self::get_node_ancestors(&node_ref, Some(3)))
            })
            .filter(|(_, ancestors)| ancestors.len() != 0)
            .for_each(|(inner_text, ancestors)| {
                // Base point, plus comma-separated segments, plus up to 3 points for
                // every 100 bytes of text.
                let mut content_score = 0;
                content_score += 1;
                content_score += inner_text.split(",").count();
                content_score += (3).min(inner_text.len() / 100);
                ancestors
                    .into_iter()
                    .enumerate()
                    .filter(|(_, node)| {
                        node.parent().is_some() && node.parent().unwrap().as_element().is_some()
                    })
                    .for_each(|(level, mut ancestor)| {
                        let has_readability = {
                            let ancestor_attrs =
                                ancestor.as_element().unwrap().attributes.borrow();
                            ancestor_attrs.contains(READABILITY_SCORE)
                        };
                        if !has_readability {
                            self.initialize_node(&mut ancestor);
                            candidates.push(ancestor.clone());
                        }
                        // Parent gets the full score, grandparent half, further
                        // ancestors level * 3.
                        let score_divider = if level == 0 {
                            1.0
                        } else if level == 1 {
                            2.0
                        } else {
                            level as f32 * 3.0
                        };
                        let mut ancestor_attrs =
                            ancestor.as_element().unwrap().attributes.borrow_mut();
                        if let Some(readability_score) =
                            ancestor_attrs.get_mut(READABILITY_SCORE)
                        {
                            *readability_score = (readability_score.parse::<f32>().unwrap()
                                + (content_score as f32 / score_divider))
                                .to_string();
                        }
                    });
            });

        // Scale each candidate by its link density and keep the best five, sorted by
        // descending score.
        let mut top_candidates: Vec<NodeRef> = Vec::new();
        for candidate in candidates {
            let mut candidate_score = 0.0;
            {
                let mut candidate_attr =
                    candidate.as_element().unwrap().attributes.borrow_mut();
                if let Some(readability_score) = candidate_attr.get_mut(READABILITY_SCORE) {
                    candidate_score = readability_score.parse::<f32>().unwrap()
                        * (1.0 - Self::get_link_density(&candidate));
                    *readability_score = candidate_score.to_string();
                }
            }
            let nb_top_candidates = 5;
            for i in 0..nb_top_candidates {
                let top_candidate = top_candidates.get(i);
                let top_candidate_score = top_candidate
                    .as_ref()
                    .map(|node_ref| node_ref.as_element().unwrap().attributes.borrow())
                    .map(|attrs| {
                        attrs
                            .get(READABILITY_SCORE)
                            .unwrap_or("0")
                            .parse::<f32>()
                            .unwrap()
                    });
                if top_candidate.is_none() || candidate_score > top_candidate_score.unwrap() {
                    top_candidates.splice(i..i, vec![candidate].into_iter());
                    if top_candidates.len() > nb_top_candidates {
                        top_candidates.pop();
                    }
                    break;
                }
            }
        }

        let possible_top_candidate = top_candidates.get(0);
        let mut top_candidate;
        let mut needed_to_create_top_candidate = false;
        let mut parent_of_top_candidate: NodeRef;
        if possible_top_candidate.is_none()
            || possible_top_candidate
                .map(|node| &node.as_element().unwrap().name.local)
                .as_ref()
                .unwrap()
                == &"body"
        {
            // No usable candidate: move everything in <body> into a new div and use
            // that div as the candidate.
            top_candidate = NodeRef::new_element(
                QualName::new(None, Namespace::from(HTML_NS), LocalName::from("div")),
                BTreeMap::new(),
            );
            needed_to_create_top_candidate = true;
            let mut page_children = page.as_node().children();
            let mut page_child = page_children.next();
            while let Some(child_node) = page_child {
                page_child = page_children.next();
                top_candidate.append(child_node);
            }
            page.as_node().append(top_candidate.clone());
            self.initialize_node(&mut top_candidate);
        } else {
            let alternative_candidate_ancestors: Vec<Vec<NodeRef>>;
            top_candidate = top_candidates.get(0).unwrap().clone();
            let top_candidate_score = {
                let top_candidate_node_attrs =
                    top_candidate.as_element().unwrap().attributes.borrow();
                top_candidate_node_attrs
                    .get(READABILITY_SCORE)
                    .unwrap()
                    .parse::<f32>()
                    .unwrap()
            };
            // Ancestor chains of the runner-up candidates whose score is at least 75%
            // of the top score.
            alternative_candidate_ancestors = top_candidates
                .iter()
                .skip(1)
                .filter(|top_candidate_node| {
                    let candidate_node_score = {
                        let top_candidate_node_attrs =
                            top_candidate_node.as_element().unwrap().attributes.borrow();
                        top_candidate_node_attrs
                            .get(READABILITY_SCORE)
                            .unwrap()
                            .parse::<f32>()
                            .unwrap()
                    };
                    (candidate_node_score / top_candidate_score) >= 0.75
                })
                .map(|node| Self::get_node_ancestors(&node, None))
                .collect();
            let minimum_top_candidates = 3;
            if alternative_candidate_ancestors.len() >= minimum_top_candidates {
                // Prefer an ancestor shared by enough of the close runner-ups.
                parent_of_top_candidate = top_candidate.parent().unwrap();
                while &parent_of_top_candidate.as_element().unwrap().name.local != "body" {
                    let mut lists_containing_this_ancestor = alternative_candidate_ancestors
                        .iter()
                        .filter(|node_vec| node_vec.contains(&parent_of_top_candidate))
                        .count();
                    lists_containing_this_ancestor =
                        lists_containing_this_ancestor.min(minimum_top_candidates);
                    if lists_containing_this_ancestor >= minimum_top_candidates {
                        top_candidate = parent_of_top_candidate;
                        break;
                    }
                    parent_of_top_candidate = parent_of_top_candidate.parent().unwrap();
                }
            }
            let top_candidate_readability = {
                let top_candidate_attrs =
                    top_candidate.as_element().unwrap().attributes.borrow();
                top_candidate_attrs
                    .get(READABILITY_SCORE)
                    .map(|x| x.to_owned())
            };
            if top_candidate_readability.is_none() {
                self.initialize_node(&mut top_candidate);
            }

            // Walk up the tree: if a scored ancestor has a higher score than the
            // previous scored node, use it instead. Stop once scores drop below a third
            // of the candidate's own score.
            parent_of_top_candidate = top_candidate.parent().unwrap();
            let mut last_score = {
                let top_candidate_node_attrs =
                    top_candidate.as_element().unwrap().attributes.borrow();
                top_candidate_node_attrs
                    .get(READABILITY_SCORE)
                    .unwrap()
                    .parse::<f32>()
                    .unwrap()
            };
            let score_threshold = last_score / 3.0;
            while parent_of_top_candidate
                .as_element()
                .map(|elem| elem.name.local.as_ref())
                .unwrap()
                != "body"
            {
                let parent_readability = {
                    let parent_attrs = parent_of_top_candidate
                        .as_element()
                        .unwrap()
                        .attributes
                        .borrow();
                    parent_attrs
                        .get(READABILITY_SCORE)
                        .map(|score| score.parse::<f32>().unwrap())
                };
                if parent_readability.is_none() {
                    parent_of_top_candidate = parent_of_top_candidate.parent().unwrap();
                    continue;
                }
                if parent_readability.as_ref().unwrap() < &score_threshold {
                    break;
                }
                if parent_readability.as_ref().unwrap() > &last_score {
                    top_candidate = parent_of_top_candidate;
                    break;
                }
                last_score = parent_readability.unwrap();
                parent_of_top_candidate = parent_of_top_candidate.parent().unwrap();
            }

            // If the candidate is an only child, use its parent instead.
            parent_of_top_candidate = top_candidate.parent().unwrap();
            while &parent_of_top_candidate.as_element().unwrap().name.local != "body"
                && parent_of_top_candidate.children().count() == 1
            {
                top_candidate = parent_of_top_candidate;
                parent_of_top_candidate = top_candidate.parent().unwrap();
            }
            let top_candidate_readability = {
                let top_candidate_attrs =
                    top_candidate.as_element().unwrap().attributes.borrow();
                top_candidate_attrs
                    .get(READABILITY_SCORE)
                    .map(|score| score.to_string())
            };
            if top_candidate_readability.is_none() {
                self.initialize_node(&mut top_candidate);
            }
        }

        // Collect the top candidate and any sibling that looks like related content
        // into a fresh container.
        let mut article_content = NodeRef::new_element(
            QualName::new(None, Namespace::from(HTML_NS), LocalName::from("div")),
            BTreeMap::new(),
        );
        let top_candidate_score = {
            let top_candidate_attrs = top_candidate.as_element().unwrap().attributes.borrow();
            top_candidate_attrs
                .get(READABILITY_SCORE)
                .map(|score| score.parse::<f32>().unwrap())
                .unwrap()
        };
        let sibling_score_threshold = (10.0_f32).max(top_candidate_score * 0.2);
        parent_of_top_candidate = top_candidate.parent().unwrap();
        let mut siblings = parent_of_top_candidate
            .children()
            .filter(|node| node.as_element().is_some());
        let (top_candidate_class, top_candidate_score) = {
            let top_candidate_attrs = top_candidate.as_element().unwrap().attributes.borrow();
            let class = top_candidate_attrs
                .get("class")
                .map(|class| class.to_string())
                .unwrap_or("".to_string());
            let score = top_candidate_attrs
                .get(READABILITY_SCORE)
                .map(|score| score.parse::<f32>().unwrap())
                .unwrap();
            (class, score)
        };
        // Fetch the following sibling before the current one is moved.
        let mut next_sibling = siblings.next();
        while let Some(sibling) = next_sibling {
            next_sibling = siblings.next();
            let mut append = false;
            if sibling == top_candidate {
                append = true;
            } else {
                let mut content_bonus = 0.0;
                let sibling_attrs = sibling.as_element().unwrap().attributes.borrow();
                let sibling_class = sibling_attrs
                    .get("class")
                    .map(|class| class.to_string())
                    .unwrap_or("".to_string());
                let sibling_score = sibling_attrs
                    .get(READABILITY_SCORE)
                    .map(|score| score.parse::<f32>().unwrap());
                // Give a bonus if the sibling shares the candidate's class name.
                if sibling_class == top_candidate_class && !top_candidate_class.is_empty() {
                    content_bonus += top_candidate_score * 0.2;
                }
                if sibling_score.is_some()
                    && (sibling_score.unwrap() + content_bonus) >= sibling_score_threshold
                {
                    append = true;
                } else if sibling.as_element().map(|elem| elem.name.local.as_ref()) == Some("p")
                {
                    let link_density = Self::get_link_density(&sibling);
                    let node_content = Self::get_inner_text(&sibling, None);
                    let node_length = node_content.len();
                    // NOTE(review): short paragraphs are appended when
                    // `is_match_node_content` does NOT match — confirm the negation
                    // against the regex in the `regexes` module.
                    if node_length > 80 && link_density < 0.25 {
                        append = true;
                    } else if node_length < 80
                        && node_length > 0
                        && link_density == 0.0
                        && !regexes::is_match_node_content(&node_content)
                    {
                        append = true;
                    }
                }
            }
            if append {
                let new_article_child = if !ALTER_TO_DIV_EXCEPTIONS.contains(
                    &sibling
                        .as_element()
                        .map(|elem| elem.name.local.as_ref())
                        .unwrap(),
                ) {
                    Self::set_node_tag(&sibling, "div")
                } else {
                    sibling
                };
                article_content.append(new_article_child);
            }
        }

        self.prep_article(&mut article_content);

        if needed_to_create_top_candidate {
            // The div created above already wraps everything: just label it.
            let mut top_candidate_attrs =
                top_candidate.as_element().unwrap().attributes.borrow_mut();
            top_candidate_attrs.insert("id", "readability-page-1".to_string());
            top_candidate_attrs.insert("class", "page".to_string());
        } else {
            let div = NodeRef::new_element(
                QualName::new(None, Namespace::from(HTML_NS), LocalName::from("div")),
                BTreeMap::new(),
            );
            {
                let mut div_attrs = div.as_element().unwrap().attributes.borrow_mut();
                div_attrs.insert("id", "readability-page-1".to_string());
                div_attrs.insert("class", "page".to_string());
            }
            for child in article_content.children() {
                div.append(child);
            }
            article_content.append(div);
        }

        // Too little text: relax one flag and retry, or settle for the longest attempt
        // once every flag has been removed.
        let text_length = Self::get_inner_text(&article_content, Some(true)).len();
        let mut parse_successful = true;
        if text_length < DEFAULT_CHAR_THRESHOLD {
            parse_successful = false;
            if self.flag_is_active(FLAG_STRIP_UNLIKELYS) {
                self.remove_flag(FLAG_STRIP_UNLIKELYS);
                attempts.push(ExtractAttempt::new(article_content.clone(), text_length));
            } else if self.flag_is_active(FLAG_WEIGHT_CLASSES) {
                self.remove_flag(FLAG_WEIGHT_CLASSES);
                attempts.push(ExtractAttempt::new(article_content.clone(), text_length));
            } else if self.flag_is_active(FLAG_CLEAN_CONDITIONALLY) {
                self.remove_flag(FLAG_CLEAN_CONDITIONALLY);
                attempts.push(ExtractAttempt::new(article_content.clone(), text_length));
            } else {
                attempts.push(ExtractAttempt::new(article_content.clone(), text_length));
                attempts.sort_by(|a, b| b.length.partial_cmp(&a.length).unwrap());
                if attempts.first().as_ref().unwrap().length == 0 {
                    println!("Unable to extract content");
                    break;
                }
                article_content = attempts[0].article.clone();
                parse_successful = true;
            }
        }

        if parse_successful {
            // Use the first `dir` attribute found on the candidate's parent, the
            // candidate itself, or the parent's ancestors, in that order.
            let parent_ancestors = Self::get_node_ancestors(&parent_of_top_candidate, None);
            let ancestors = vec![
                vec![parent_of_top_candidate, top_candidate],
                parent_ancestors,
            ]
            .concat();
            let article_dir = ancestors.iter().find_map(|node| {
                let node_attrs = node.as_element()?.attributes.borrow();
                let dir = node_attrs.get("dir").map(|dir_attr| dir_attr.to_string());
                dir
            });
            if article_dir.is_some() {
                self.article_dir = article_dir;
            }
            self.article_node = Some(article_content);
            return;
        }
    }
}
}
/// One extraction round of `grab_article`: the article node it produced together with
/// the length of the text found in it.
#[derive(Debug)]
struct ExtractAttempt {
    // Root node of the content extracted in this round.
    article: NodeRef,
    // Text length measured for `article`; used to pick the best round.
    length: usize,
}

impl ExtractAttempt {
    /// Bundle an extracted article node with its measured text length.
    pub fn new(article: NodeRef, length: usize) -> Self {
        Self { article, length }
    }
}
/// Metadata collected for an article. Only the title is always present.
#[derive(Debug, PartialEq)]
pub struct MetaData {
    byline: Option<String>,
    excerpt: Option<String>,
    site_name: Option<String>,
    title: String,
}

impl MetaData {
    /// Create metadata with every optional field unset and an empty title.
    pub fn new() -> Self {
        Self {
            title: String::new(),
            byline: None,
            excerpt: None,
            site_name: None,
        }
    }
}
#[cfg(test)]
mod test {
use super::{
MetaData, Readability, SizeInfo, FLAG_CLEAN_CONDITIONALLY, FLAG_STRIP_UNLIKELYS,
FLAG_WEIGHT_CLASSES, HTML_NS, READABILITY_SCORE,
};
use html5ever::{LocalName, Namespace, QualName};
use kuchiki::traits::*;
use kuchiki::NodeRef;
// TODO: Refactor not to use test file possibly
const TEST_HTML: &'static str = include_str!("../../test_html/simple.html");
#[test]
fn test_unwrap_no_script_tags() {
let mut readability = Readability::new(TEST_HTML);
let img_count = readability.root_node.select("img").unwrap().count();
assert_eq!(3, img_count);
readability.unwrap_no_script_tags();
let img_count = readability.root_node.select("img").unwrap().count();
assert_eq!(2, img_count);
// Ensure attributes were copied over
let updated_img = readability.root_node.select_first("img#lazy-load").unwrap();
let updated_img_attrs = updated_img.attributes.borrow();
assert_eq!(true, updated_img_attrs.contains("data-old-src"));
assert_eq!(Some("lazy-load.png"), updated_img_attrs.get("data-old-src"));
assert_eq!(Some("eager-load.png"), updated_img_attrs.get("src"));
}
#[test]
fn test_is_single_image() {
let readability = Readability::new(TEST_HTML);
let img_elem_ref = readability.root_node.select_first("img").unwrap();
assert_eq!(true, Readability::is_single_image(&img_elem_ref.as_node()));
let noscript_elem_ref = readability.root_node.select_first("noscript").unwrap();
assert_eq!(
false,
Readability::is_single_image(&noscript_elem_ref.as_node())
);
let div_elem_ref = readability
.root_node
.select_first("div.invalid-elems")
.unwrap();
assert_eq!(false, Readability::is_single_image(&div_elem_ref.as_node()));
let div_elem_ref = kuchiki::parse_fragment(
QualName::new(None, Namespace::from(HTML_NS), LocalName::from("div")),
Vec::new(),
)
.one(noscript_elem_ref.as_node().text_contents().trim());
assert_eq!(true, Readability::is_single_image(&div_elem_ref));
}
#[test]
fn test_remove_scripts() {
let mut readability = Readability::new(TEST_HTML);
let noscript_elems = readability.root_node.select("noscript").unwrap();
assert_eq!(1, noscript_elems.count());
readability.remove_scripts();
let noscript_elems = readability.root_node.select("noscript").unwrap();
assert_eq!(0, noscript_elems.count());
}
#[test]
fn test_next_element() {
let html_str = r#"
<p id="a">This is a node</p>
<!-- Commented content -->
<p id="b">This is another node. The next line is just whitespace</p>
This is standalone text
<p> Some <span>more</span> text</p>"#;
let doc = Readability::new(html_str);
let p = doc.root_node.select_first("#a").unwrap();
let p = p.as_node();
let mut p_node_option: Option<NodeRef> = Some(p.clone());
p_node_option = Readability::next_element(p_node_option, false);
assert_eq!(Some(p.clone()), p_node_option);
let p_node_option = p_node_option.unwrap();
let p_node_option = p_node_option.as_element();
let p_node_option_attr = p_node_option.unwrap().attributes.borrow();
assert_eq!("a", p_node_option_attr.get("id").unwrap());
let next = Readability::next_element(p.next_sibling(), false);
let next = next.unwrap();
let next_elem = next.as_element();
let next_attr = next_elem.unwrap().attributes.borrow();
assert_eq!("b", next_attr.get("id").unwrap());
let next = Readability::next_element(next.next_sibling(), false);
let next = next.unwrap();
assert_eq!(true, next.as_text().is_some());
assert_eq!("This is standalone text", next.text_contents().trim());
let next = Readability::next_element(None, false);
assert_eq!(None, next);
}
#[test]
fn test_is_phrasing_content() {
let html_str = r#"
Some text node
<b>This is a phrasing content node</b>
<p>This is not a phrasing content node</p>
<a href="\#"><i>This is also a phrasing content</i></a>
<a href="\#"><p>This is not a phrasing content</p></a>
"#;
let doc = Readability::new(html_str);
let body = doc.root_node.select_first("body").unwrap();
let body = body.as_node();
let mut body_children = body.children();
let mut node = body_children.next().unwrap();
assert_eq!(true, node.as_text().is_some());
assert_eq!(true, Readability::is_phrasing_content(&node));
node = node.next_sibling().unwrap();
assert_eq!("b", &node.as_element().unwrap().name.local);
assert_eq!(true, Readability::is_phrasing_content(&node));
node = node.next_sibling().unwrap(); // Skips the text node from the new line character
node = node.next_sibling().unwrap();
assert_eq!("p", &node.as_element().unwrap().name.local);
assert_eq!(false, Readability::is_phrasing_content(&node));
node = node.next_sibling().unwrap(); // Skips the text node from the new line character
node = node.next_sibling().unwrap();
assert_eq!("a", &node.as_element().unwrap().name.local);
assert_eq!(true, Readability::is_phrasing_content(&node));
node = node.next_sibling().unwrap(); // Skips the text node from the new line character
node = node.next_sibling().unwrap();
assert_eq!("a", &node.as_element().unwrap().name.local);
assert_eq!(false, Readability::is_phrasing_content(&node));
}
// Checks `Readability::is_whitespace` on elements with text, a plain text node,
// a blank text node, the new line between two tags and a `<br>`.
#[test]
fn test_is_whitespace() {
    let html_str = r#"
<p>Definitely not whitespace</p>
I am also not whitespace
<p> </p>
<br>
"#;
    let doc = Readability::new(html_str);
    let body = doc.root_node.select_first("body").unwrap();

    let first_p = body.as_node().first_child().unwrap();
    assert_eq!("p", &first_p.as_element().unwrap().name.local);
    assert!(!Readability::is_whitespace(&first_p));

    let loose_text = first_p.next_sibling().unwrap();
    assert!(loose_text.as_text().is_some());
    assert!(!Readability::is_whitespace(&loose_text));

    // The second <p> only holds a blank text node; that child is what gets checked.
    let blank_p = loose_text.next_sibling().unwrap();
    assert_eq!("p", &blank_p.as_element().unwrap().name.local);
    assert!(Readability::is_whitespace(&blank_p.first_child().unwrap()));

    // This is testing the new line character in between the <p> and <br> tags
    let newline = blank_p.next_sibling().unwrap();
    assert!(newline.as_text().is_some());
    assert!(Readability::is_whitespace(&newline));

    let line_break = newline.next_sibling().unwrap();
    assert_eq!("br", &line_break.as_element().unwrap().name.local);
    assert!(Readability::is_whitespace(&line_break));
}
// Checks that `Readability::set_node_tag` swaps a `<div>` for a `<section>` in place,
// keeping its children and its three attributes.
#[test]
fn test_set_node_tag() {
    let html_str = r#"
<div id="target" class="some random class" tabindex="0"><p>Child 1</p><p>Child 2</p></div>
<div id="not-the-target">The div above is being replaced</div>
"#;
    let doc = Readability::new(html_str);
    let original = doc.root_node.select_first("#target").unwrap();
    assert_eq!("div", &original.name.local);

    // Snapshot the counts before the tag is changed.
    let root_child_count = doc.root_node.children().count();
    let original_child_count = original.as_node().children().count();

    let replacement = Readability::set_node_tag(original.as_node(), "section");
    assert_eq!(root_child_count, doc.root_node.children().count());

    // The same id must now resolve to the returned node, which is a <section>.
    let renamed = doc.root_node.select_first("#target").unwrap();
    assert_eq!(&replacement, renamed.as_node());
    assert_eq!("section", &renamed.name.local);
    assert_eq!(original_child_count, renamed.as_node().children().count());
    assert_eq!(3, renamed.attributes.borrow().map.len());

    // No <div> with that id is left in the document.
    assert!(doc.root_node.select_first("div#target").is_err());
}
// Checks that `Readability::replace_node_tags` turns every selected `<p>` into a
// `<span>` without changing the parent's child count.
#[test]
fn test_replace_node_tags() {
    let html_str = r#"
<div id="replace-p">
<p>Tag 1</p><p>Tag 2</p><p>Tag 3</p>
</div>
"#;
    let doc = Readability::new(html_str);
    let parent = doc.root_node.select_first("div#replace-p").unwrap();
    let child_count_before = parent.as_node().children().count();

    let paragraphs = parent.as_node().select("p").unwrap();
    Readability::replace_node_tags(paragraphs, "span");

    assert_eq!(child_count_before, parent.as_node().children().count());
    assert_eq!(0, parent.as_node().select("p").unwrap().count());
    assert_eq!(3, parent.as_node().select("span").unwrap().count());
}
// Checks `replace_brs` on a chain of `<br>` tags, first inside a `<div>` and then
// inside a `<p>`.
#[test]
fn test_replace_brs() {
    // Case 1: the <br> chain lives inside a <div>.
    let html_str = r#"
<div>foo<br>bar<br> <br><br>abc</div>
"#;
    let mut div_doc = Readability::new(html_str);
    let div = div_doc.root_node.select_first("div").unwrap();
    assert_eq!(4, div.as_node().select("br").unwrap().count());
    assert_eq!(0, div.as_node().select("p").unwrap().count());

    div_doc.replace_brs();

    assert_eq!(1, div.as_node().select("br").unwrap().count());
    assert_eq!(1, div.as_node().select("p").unwrap().count());
    let new_p = div.as_node().select_first("p").unwrap();
    assert_eq!("abc", new_p.as_node().text_contents());

    // Case 2: the <br> chain lives inside a <p>.
    let html_str = r#"
<p>foo<br>bar<br> <br><br>abc</p>
"#;
    let mut p_doc = Readability::new(html_str);
    let p = p_doc.root_node.select_first("p").unwrap();
    assert_eq!(4, p.as_node().select("br").unwrap().count());
    assert_eq!(0, p_doc.root_node.select("div").unwrap().count());

    p_doc.replace_brs();

    assert_eq!(1, p_doc.root_node.select("br").unwrap().count());
    assert_eq!(1, p_doc.root_node.select("div").unwrap().count());
    assert_eq!(1, p_doc.root_node.select("p").unwrap().count());
    let remaining_p = p_doc.root_node.select_first("p").unwrap();
    assert_eq!("abc", remaining_p.as_node().text_contents());
}
// Checks that `prep_document` leaves no `<style>` or `<font>` elements and that the
// `<br>` chain in the fixture ends up as one `<p>` and one `<br>`.
#[test]
fn test_prep_document() {
    let html_str = r#"
<!DOCTYPE html>
<html>
<head>
<style>div {padding: 20px; border-bottom: 2px solid black; }</style>
</head>
<body>
<font face="Times New Roman" size="10">Times New Roman</font>
<div>foo<br>bar<br> <br><br>abc</div>
</body>
</html>
"#;
    let mut doc = Readability::new(html_str);
    doc.prep_document();

    let root = &doc.root_node;
    assert_eq!(0, root.select("style").unwrap().count());
    assert_eq!(0, root.select("font").unwrap().count());
    assert_eq!(1, root.select("p").unwrap().count());
    assert_eq!(1, root.select("br").unwrap().count());
}
// Checks that `Readability::inline_css_str_to_map` splits an inline style string into
// property/value pairs, with and without a trailing semicolon.
#[test]
fn test_inline_css_str_to_map() {
    use std::collections::HashMap;

    let css_str = "display: flex; height: 200px; width: 250px; justify-content: center; align-items: center; border: 2px solid black";
    let expected: HashMap<_, _> = [
        ("display", "flex"),
        ("height", "200px"),
        ("width", "250px"),
        ("justify-content", "center"),
        ("align-items", "center"),
        ("border", "2px solid black"),
    ]
    .iter()
    .cloned()
    .collect();
    let parsed = Readability::inline_css_str_to_map(css_str);
    assert_eq!(expected, parsed);

    // A single declaration that ends with a semicolon.
    let expected: HashMap<_, _> = [("color", "red")].iter().cloned().collect();
    assert_eq!(expected, Readability::inline_css_str_to_map("color: red;"));
}
// Checks `Readability::is_probably_visible` against `display: none`, the `hidden`
// attribute and `aria-hidden` in both states, plus plainly visible elements.
#[test]
fn test_is_probably_visible() {
    let html_str = r#"
<!DOCTYPE html>
<html>
<body>
<p id="visible">Lorem ipsum dolores</p>
<div id="hidden-div" style="display: none">
<p>This is hidden and so is the parent</p>
</div>
<input value="Some good CSRF token" hidden>
<div id="hidden-aria" style="display: flex;" aria-hidden="true">
<p>This is not considered visible</p>
</div>
<div id="visible-aria" style="display: flex;" aria-hidden="false">
<p>This is considered visible</p>
</div>
<img src="./some-img.png" class="fallback-image">
<div id="visible-div" style="display: block" class="visible" aria-hidden="false">
<p>This is fully visible</p>
</div>
</body>
</html>
"#;
    let doc = Readability::new(html_str);

    // (selector, expected visibility)
    let cases = [
        ("p#visible", true),
        ("div#hidden-div", false),
        ("input", false),
        ("div#hidden-aria", false),
        ("div#visible-aria", true),
        ("img", true),
        ("div#visible-div", true),
    ];
    for &(selector, expected) in cases.iter() {
        let element = doc.root_node.select_first(selector).unwrap();
        assert_eq!(
            expected,
            Readability::is_probably_visible(element.as_node()),
            "selector: {}",
            selector
        );
    }
}
// Checks `check_byline`: an over-long paragraph and an empty one are rejected, a short
// `author` paragraph is accepted and stored, and later candidates are rejected once a
// byline has been stored.
#[test]
fn test_check_byline() {
    // The tag name in the first paragraph's prose is written as `&lt;p&gt;`. A literal
    // `<p>` there would be parsed as a new element and cut the paragraph in two.
    let html_str = r#"
<!DOCTYPE html>
<html>
<body>
<p class="byline description" id="author">
This test is used to find out whether a given node is a byline. This works by checking whether
a node has a rel attribute with "author" as its value, or if "author"
is part of its value in the itemprop attribute. If neither is the case then it checks whether the classes and id
of the node match a regex of a potential byline. If any condition is met, then the content must be less than 100
characters. For that reason, this &lt;p&gt; tag could not be a byline because it's too long.
</p>
<p class="author">A Paperoni maintainer</p>
<p class="authors not-byline"></p>
<p rel="author">Maintainer of Paperoni</p>
</body>
</html>
"#;
    let mut doc = Readability::new(html_str);
    assert!(doc.byline.is_none());

    let p1_node = doc.root_node.select_first("p.byline").unwrap();
    let p2_node = doc.root_node.select_first("p.author").unwrap();
    let p3_node = doc.root_node.select_first("p.not-byline").unwrap();
    // The attribute selector is closed with `]`; it no longer relies on the selector
    // parser tolerating an unterminated block.
    let p4_node = doc.root_node.select_first(r#"p[rel="author"]"#).unwrap();

    assert!(!doc.check_byline(p1_node.as_node(), "byline description author"));
    assert!(doc.check_byline(p2_node.as_node(), "author"));
    assert!(!doc.check_byline(p3_node.as_node(), "authors not-byline"));
    assert_eq!(Some("A Paperoni maintainer".into()), doc.byline);
    // The test below is false because there is already an existing byline.
    assert!(!doc.check_byline(p4_node.as_node(), ""));
}
// Walks the fixture with `Readability::get_next_node`, first descending into children
// (`false`) and then skipping a node together with its children (`true`), until the
// traversal runs out of nodes.
#[test]
fn test_get_next_node() {
    // The <span> in #body-child-2 is closed with </span>. A mismatched </p> there makes
    // the HTML parser insert an extra empty <p> that the fixture never intended.
    let html_str = r#"
<!DOCTYPE html>
<html>
<body>
<div id="body-child-1">
<p id="start">Foobar content</p>
<div id="start-sib">
<span>First child</span>
</div>
</div>
<div id="body-child-2"><span>This will not be reached</span></div>
<p id="body-child-last">Last element</p>
</body>
</html>
"#;
    let doc = Readability::new(html_str);
    let start = doc.root_node.select_first("p#start").unwrap();

    // p#start -> its sibling div#start-sib
    let next_node = Readability::get_next_node(start.as_node(), false);
    assert!(next_node.is_some());
    let next_node = next_node.unwrap();
    let next_node_attr = next_node.as_element().unwrap().attributes.borrow();
    assert_eq!(Some("start-sib"), next_node_attr.get("id"));

    // div#start-sib -> its child <span>
    let next_node = Readability::get_next_node(&next_node, false);
    assert!(next_node.is_some());
    let next_node = next_node.unwrap();
    assert_eq!("span", &next_node.as_element().unwrap().name.local);

    // <span> -> div#body-child-2
    let next_node = Readability::get_next_node(&next_node, false);
    assert!(next_node.is_some());
    let next_node = next_node.unwrap();
    let next_node_attr = next_node.as_element().unwrap().attributes.borrow();
    assert_eq!(Some("body-child-2"), next_node_attr.get("id"));

    // Passing `true` skips the children of div#body-child-2.
    let next_node = Readability::get_next_node(&next_node, true);
    assert!(next_node.is_some());
    let next_node = next_node.unwrap();
    let next_node_attr = next_node.as_element().unwrap().attributes.borrow();
    assert_eq!(Some("body-child-last"), next_node_attr.get("id"));

    // Nothing follows the last element.
    let next_node = Readability::get_next_node(&next_node, true);
    assert!(next_node.is_none());
}
// Checks that `Readability::remove_and_get_next` detaches the given node and returns
// the node that follows it.
#[test]
fn test_remove_and_get_next() {
    // The <span> in #body-child-2 is closed with </span>. A mismatched </p> there makes
    // the HTML parser insert an extra empty <p> that the fixture never intended.
    let html_str = r#"
<!DOCTYPE html>
<html>
<body>
<div id="body-child-1">
<p id="start">Foobar content</p>
<div id="start-sib">
<span>First child</span>
</div>
</div>
<div id="body-child-2"><span>This will not be reached</span></div>
<p id="body-child-last">Last element</p>
</body>
</html>
"#;
    let doc = Readability::new(html_str);
    let container = doc.root_node.select_first("div#body-child-1").unwrap();
    // The node after the container is p#start; it is the one removed below.
    let p_node = Readability::get_next_node(container.as_node(), false).unwrap();

    let next_node = Readability::remove_and_get_next(p_node);
    assert!(next_node.is_some());
    let next_node = next_node.unwrap();
    let next_node_attr = next_node.as_element().unwrap().attributes.borrow();
    assert_eq!(Some("start-sib"), next_node_attr.get("id"));

    // Confirm the p node no longer exists
    assert!(doc.root_node.select_first("p#start").is_err());
}
// Checks `Readability::has_ancestor_tag` with the default arguments, with a depth
// limit, and with a filter callback.
#[test]
fn test_has_ancestor_tag() {
    let html_str = r#"
<!DOCTYPE html>
<html>
<body>
<div>
<main>
<p>
<span>Target node</span>
</p>
</main>
</div>
</body>
</html>
"#;
    let doc = Readability::new(html_str);
    let span = doc.root_node.select_first("span").unwrap();

    // No depth and no filter given: the <div> ancestor is found.
    assert!(Readability::has_ancestor_tag(
        span.as_node(),
        "div",
        None,
        None
    ));

    // With a depth of 1 the <div> is not found.
    assert!(!Readability::has_ancestor_tag(
        span.as_node(),
        "div",
        Some(1),
        None
    ));

    // The filter requires a `class` attribute, which the <div> does not have.
    assert!(!Readability::has_ancestor_tag(
        span.as_node(),
        "div",
        Some(5),
        Some(|node_ref| {
            let node_attrs = node_ref.as_element().unwrap().attributes.borrow();
            node_attrs.contains("class")
        })
    ));
}
// Checks `Readability::is_element_without_content` on a paragraph with text, a comment
// node, an empty paragraph, a div with nested content and a div holding only
// `<br>`/`<hr>` tags.
#[test]
fn test_is_element_without_content() {
    let html_str = r#"
<!DOCTYPE html>
<html>
<body>
<p>Node with content</p><!-- A comment node which is regarded as not having content -->
<p id="empty"></p>
<div id="contentful">
<p>
<span>Target node</span>
</p>
</div>
<div id="no-content"><br><br><br><br><br><br><hr><hr><br></div>
</body>
</html>
"#;
    let doc = Readability::new(html_str);

    let first_p = doc.root_node.select_first("p").unwrap();
    assert!(!Readability::is_element_without_content(first_p.as_node()));

    // The comment directly follows the first paragraph.
    let comment = first_p.as_node().next_sibling().unwrap();
    assert!(comment.as_comment().is_some());
    assert!(!Readability::is_element_without_content(&comment));

    // (selector, expected result)
    let cases = [
        ("p#empty", true),
        ("div#contentful", false),
        ("div#no-content", true),
    ];
    for &(selector, expected) in cases.iter() {
        let element = doc.root_node.select_first(selector).unwrap();
        assert_eq!(
            expected,
            Readability::is_element_without_content(element.as_node()),
            "selector: {}",
            selector
        );
    }
}
// Checks `Readability::has_single_tag_inside_element` for paragraphs with no tag, one
// `<span>`, two `<span>`s, and a `<span>` next to loose text.
#[test]
fn test_has_single_tag_inside_element() {
    let html_str = r#"
<!DOCTYPE html>
<html>
<body>
<p id="one">No element tags here</p>
<p id="two"><span>The p tag has only one tag</span></p>
<p id="three">
<span>Target node</span>
<span>
The parent has multiple children
</span>
</p>
<p id="four">
The text here means this div doesn't have a single tag
<span>Target node</span>
</p>
</body>
</html>
"#;
    let doc = Readability::new(html_str);

    // (selector, expected result when looking for a single <span>)
    let cases = [
        ("p#one", false),
        ("p#two", true),
        ("p#three", false),
        ("p#four", false),
    ];
    for &(selector, expected) in cases.iter() {
        let paragraph = doc.root_node.select_first(selector).unwrap();
        assert_eq!(
            expected,
            Readability::has_single_tag_inside_element(paragraph.as_node(), "span"),
            "selector: {}",
            selector
        );
    }
}
// Checks the length of `Readability::get_inner_text` with the second argument set to
// `Some(false)` and with it left as `None`.
#[test]
fn test_get_inner_text() {
    // The sentence has a run of seven spaces between "jumps" and "over". With single
    // spaces it is 43 characters long, so the two expected lengths below (49 and 43)
    // can only differ if the fixture carries six extra whitespace characters.
    // NOTE(review): assumes the `None` call collapses that run to one space — confirm
    // against `get_inner_text`.
    let html_str = r#"
<!DOCTYPE html>
<html>
<body>
<p>The quick brown fox jumps       over the lazy dog</p>
</body>
</html>
"#;
    let doc = Readability::new(html_str);
    let target = doc.root_node.select_first("p").unwrap();

    let raw_text = Readability::get_inner_text(target.as_node(), Some(false));
    assert_eq!(49, raw_text.len());

    let default_text = Readability::get_inner_text(target.as_node(), None);
    assert_eq!(43, default_text.len());
}
// Checks `Readability::get_link_density` for a paragraph without links, one with a
// link, and one holding only empty links.
#[test]
fn test_get_link_density() {
    let html_str = r#"
<!DOCTYPE html>
<html>
<body>
<p id="one">Zero link density</p>
<p id="two">Link density > 0 <a href="https://www.rust-lang.org/">The Rust home page</a></p>
<p id="three"><a></a><a></a></p>
</body>
</html>
"#;
    let doc = Readability::new(html_str);

    let no_links = doc.root_node.select_first("p#one").unwrap();
    assert_eq!(0_f32, Readability::get_link_density(no_links.as_node()));

    // The link text is 18 characters out of the paragraph's 35.
    let one_link = doc.root_node.select_first("p#two").unwrap();
    assert_eq!(
        18_f32 / 35_f32,
        Readability::get_link_density(one_link.as_node())
    );

    let empty_links = doc.root_node.select_first("p#three").unwrap();
    assert_eq!(0_f32, Readability::get_link_density(empty_links.as_node()));
}
// Checks `Readability::has_child_block_element` for paragraphs containing only a
// `<span>` or an `<a>`, and for a `<div>` containing a `<p>`.
#[test]
fn test_has_child_block_element() {
    let html_str = r#"
<!DOCTYPE html>
<html>
<body>
<p id="one">Has no <span>block level</span> elements</p>
<p id="two">Link density > 0 <a href="https://www.rust-lang.org/">The Rust home page</a></p>
<div id="three">
<p>This is a block level element</p>
</div>
</body>
</html>
"#;
    let doc = Readability::new(html_str);

    // (selector, expected result)
    let cases = [("p#one", false), ("p#two", false), ("div#three", true)];
    for &(selector, expected) in cases.iter() {
        let element = doc.root_node.select_first(selector).unwrap();
        assert_eq!(
            expected,
            Readability::has_child_block_element(element.as_node()),
            "selector: {}",
            selector
        );
    }
}
// Checks how many ancestors `Readability::get_node_ancestors` returns for different
// depth arguments, starting from a deeply nested `<span>` and from `<html>`.
#[test]
fn test_get_node_ancestors() {
    let html_str = r#"
<!DOCTYPE html>
<html>
<body>
<section>
<div>
<p><span></span></p>
</div>
</section>
</body>
</html>
"#;
    let doc = Readability::new(html_str);

    // (depth argument, expected number of ancestors) for the <span>
    let span = doc.root_node.select_first("span").unwrap();
    let span_cases = [(None, 1), (Some(3), 3), (Some(5), 5), (Some(200), 6)];
    for &(max_depth, expected_len) in span_cases.iter() {
        assert_eq!(
            expected_len,
            Readability::get_node_ancestors(span.as_node(), max_depth).len()
        );
    }

    // For <html> a single ancestor is returned even though a depth of 4 is requested.
    let html = doc.root_node.select_first("html").unwrap();
    assert_eq!(
        1,
        Readability::get_node_ancestors(html.as_node(), Some(4)).len()
    );
}
// Checks the weight `get_class_weight` gives to elements depending on their `class`
// and `id` values.
#[test]
fn test_get_class_weight() {
    let html_str = r#"
<!DOCTYPE html>
<html>
<body>
<div id="blog" class="main">
<h1 class="hidden">Up next...</h1>
<p id="story">A story is told...</p>
</div>
<div id="comments">
Tell us what you think
<p class="comment">Great read...</p>
</div>
</body>
</html>
"#;
    let doc = Readability::new(html_str);

    // (selector, expected weight)
    let cases = [
        ("body", 0),
        ("div#blog", 50),
        ("h1.hidden", -25),
        ("p#story", 25),
        ("div#comments", -25),
        ("p.comment", -25),
    ];
    for &(selector, expected_weight) in cases.iter() {
        let element = doc.root_node.select_first(selector).unwrap();
        assert_eq!(
            expected_weight,
            doc.get_class_weight(element.as_node()),
            "selector: {}",
            selector
        );
    }
}
// Checks the value `initialize_node` stores in the `READABILITY_SCORE` attribute for
// several elements of the fixture.
#[test]
fn test_initialize_node() {
    let html_str = r#"
<!DOCTYPE html>
<html>
<body>
<div id="blog" class="main">
<h1 class="hidden">Up next...</h1>
<p id="story">A story is told...</p>
</div>
<div id="comments">
Tell us what you think
<pre class="comment">Great read...</pre>
</div>
</body>
</html>
"#;
    let doc = Readability::new(html_str);

    // (selector, expected score attribute)
    let cases = [
        ("div#blog", "55"),
        ("h1.hidden", "-30"),
        ("p#story", "25"),
        ("div#comments", "-20"),
        ("pre.comment", "-22"),
    ];
    for &(selector, expected_score) in cases.iter() {
        let mut node = doc
            .root_node
            .select_first(selector)
            .unwrap()
            .as_node()
            .clone();
        doc.initialize_node(&mut node);
        let attrs = node.as_element().unwrap().attributes.borrow();
        assert_eq!(
            Some(expected_score),
            attrs.get(READABILITY_SCORE),
            "selector: {}",
            selector
        );
    }
}
// Checks that `Readability::get_row_and_column_count` reports 6 rows and 4 columns for
// a table whose cells use `rowspan` and `colspan`.
#[test]
fn test_get_row_and_column_count() {
    let html_str = r#"
<!DOCTYPE html>
<html>
<body>
<table>
<tbody>
<tr>
<td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td>
</tr>
<tr>
<td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td rowspan="2">&nbsp;</td>
</tr>
<tr>
<td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td>
</tr>
<tr>
<td>&nbsp;</td><td colspan="2">&nbsp;</td><td>&nbsp;</td>
</tr>
<tr>
<td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td>
</tr>
<tr>
<td colspan="4">&nbsp;</td>
</tr>
</tbody>
</table>
</body>
</html>
"#;
    let doc = Readability::new(html_str);
    let table = doc.root_node.select_first("table").unwrap();

    let expected = SizeInfo {
        rows: 6,
        columns: 4,
    };
    let actual = Readability::get_row_and_column_count(table.as_node());
    assert_eq!(expected, actual);
}
// Checks the `readability-data-table` attribute that `mark_data_tables` writes on each
// of the five tables in the fixture.
#[test]
fn test_mark_data_tables() {
    // NOTE(review): an HTML parser will probably not keep the second <table> nested
    // directly inside table#five, so #five may end up as an empty table — confirm that
    // this fixture really exercises a nested table.
    let html_str = r#"
<!DOCTYPE html>
<html>
<body>
<table id="one"></table>
<table width="100%" border="0" id="two">
<tr valign="top">
<td width="20%">Left</td>
<td height="200" width="60%">Main</td>
<td width="20%">Right</td>
</tr>
</table>
<table id="three">
<caption>Monthly savings</caption>
<tr>
<th>Month</th>
<th>Savings</th>
</tr>
<tr>
<td>January</td>
<td>$100</td>
</tr>
<tr>
<td>February</td>
<td>$50</td>
</tr>
</table>
<table id="four">
<tbody>
<tr>
<td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td>
</tr>
<tr>
<td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td rowspan="2">&nbsp;</td>
</tr>
<tr>
<td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td>
</tr>
<tr>
<td>&nbsp;</td><td colspan="2">&nbsp;</td><td>&nbsp;</td>
</tr>
<tr>
<td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td>
</tr>
<tr>
<td colspan="4">&nbsp;</td>
</tr>
</tbody>
</table>
<table id="five">
<table>
<tbody>
<tr>
<td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td>
</tr>
<tr>
<td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td rowspan="2">&nbsp;</td>
</tr>
<tr>
<td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td>
</tr>
<tr>
<td>&nbsp;</td><td colspan="2">&nbsp;</td><td>&nbsp;</td>
</tr>
<tr>
<td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td>
</tr>
<tr>
<td colspan="4">&nbsp;</td>
</tr>
</tbody>
</table>
</table>
</body>
</html>
"#;
    let mut doc = Readability::new(html_str);
    doc.mark_data_tables();

    // (selector, expected value of the marker attribute)
    let cases = [
        ("table#one", "false"),
        ("table#two", "false"),
        ("table#three", "true"),
        ("table#four", "true"),
        ("table#five", "false"),
    ];
    for &(selector, expected_mark) in cases.iter() {
        let table = doc.root_node.select_first(selector).unwrap();
        let table_attrs = table.attributes.borrow();
        assert_eq!(
            Some(expected_mark),
            table_attrs.get("readability-data-table"),
            "selector: {}",
            selector
        );
    }
}
// Checks that `Readability::fix_lazy_images` leaves some images and the `<picture>`
// untouched, and that the two images carrying a `data-src` end up with `src` equal to it.
#[test]
fn test_fix_lazy_images() {
    let html_str = r#"
<!DOCTYPE html>
<html>
<body>
<img id="svg-uri" alt="Basketball" src="" />
<img id="normal-src" src="./foo.jpg">
<img id="gif-uri" src="" alt="star" width="16" height="16">
<img id="gif-uri-remove-src" data-src="./not-real-gif.png" src="" alt="star" width="16" height="16">
<img id="lazy-loaded" class="lazy" src="placeholder.jpg" data-src="./720x640.jpg">
<picture>
<source media="(min-width:650px)" srcset="img_pink_flowers.jpg">
<source media="(min-width:465px)" srcset="img_white_flower.jpg">
<img src="img_orange_flowers.jpg" alt="Flowers" style="width:auto;">
</picture>
</body>
</html>
"#;
    let doc = Readability::new(html_str);
    let root = &doc.root_node;

    // Grab the nodes that are expected to compare equal after the call.
    let svg_before = root.select_first("#svg-uri").unwrap();
    let normal_before = root.select_first("#normal-src").unwrap();
    let gif_before = root.select_first("#gif-uri").unwrap();
    let picture_before = root.select_first("picture").unwrap();

    Readability::fix_lazy_images(&mut doc.root_node.clone());

    assert_eq!(svg_before, root.select_first("#svg-uri").unwrap());
    assert_eq!(normal_before, root.select_first("#normal-src").unwrap());
    assert_eq!(gif_before, root.select_first("#gif-uri").unwrap());
    assert_eq!(picture_before, root.select_first("picture").unwrap());

    // For these two images `src` must now match `data-src`.
    let replaced_gif = root.select_first("#gif-uri-remove-src").unwrap();
    let replaced_gif_attrs = replaced_gif.attributes.borrow();
    assert_eq!(
        replaced_gif_attrs.get("data-src"),
        replaced_gif_attrs.get("src")
    );

    let lazy_img = root.select_first("#lazy-loaded").unwrap();
    let lazy_img_attrs = lazy_img.attributes.borrow();
    assert_eq!(lazy_img_attrs.get("data-src"), lazy_img_attrs.get("src"));
}
// Runs `clean_conditionally` over the body, first for `table` and then for `div`, and
// checks which elements of the fixture are still in the document afterwards.
#[test]
fn test_clean_conditionally() {
    let html_str = r#"
<!DOCTYPE html>
<html>
<body>
<table id="data-table">
<caption>Monthly savings</caption>
<tr>
<th>Month</th>
<th>Savings</th>
</tr>
<tr>
<td>January</td>
<td>$100</td>
</tr>
<tr>
<td>February</td>
<td>$50</td>
</tr>
</table>
<table width="100%" border="0" id="display-table">
<tr valign="top">
<td width="20%">Left</td>
<td height="200" width="60%">Main</td>
<td width="20%">Right</td>
</tr>
</table>
<table width="100%" border="0" id="display-table-removed" class="comment">
<tr valign="top">
<td width="40%">One</td>
<td width="60%">Two</td>
</tr>
</table>
<div class="comment">
<p>The parent div will be deleted due to negative weight classes</p>
</div>
<div id="some-content">
The days of the week: Mon, Tue, Wed, Thur, Fri, Sat, Sun.
The months of the year: Jan, Feb, Mar, Apr, May, Jun, Jul, Aug, Oct, Nov, Dec.
</div>
<div id="embeds">
<iframe width="420" height="345" src="https://www.youtube.com/embed/dQw4w9WgXcQ"></iframe>
</div>
<div id="footer">
<p>Check out more articles</p>
<ul>
<li><img src="article.jpg"><p>Article 1</p></li>
<li><img src="article.jpg"><p>Article 2</p></li>
<li><img src="article.jpg"><p>Article 3</p></li>
</ul>
</div>
</body>
</html>
"#;
    let mut doc = Readability::new(html_str);
    let body = doc.root_node.select_first("body").unwrap();
    // Tables are marked first, before any cleaning is done.
    doc.mark_data_tables();

    // Pass 1: tables. Only #data-table is expected to remain.
    doc.clean_conditionally(&mut body.as_node().clone(), "table");
    assert!(doc.root_node.select_first("#data-table").is_ok());
    assert!(doc.root_node.select_first("#display-table").is_err());
    assert!(doc
        .root_node
        .select_first("#display-table-removed")
        .is_err());

    // Pass 2: divs. The comment and footer divs are expected to be gone.
    doc.clean_conditionally(&mut body.as_node().clone(), "div");
    assert!(doc.root_node.select_first("div.comment").is_err());
    assert!(doc.root_node.select_first("div#some-content").is_ok());
    assert!(doc.root_node.select_first("div#embeds").is_ok());
    assert!(doc.root_node.select_first("div#footer").is_err());
}
// Checks that `Readability::clean` removes every `<pre>` and leaves a single
// `<iframe>`, the YouTube embed.
#[test]
fn test_clean() {
    let html_str = r#"
<!DOCTYPE html>
<html>
<body>
<pre>A Paperoni test</pre>
<iframe width="420" height="345" src="https://www.youtube.com/embed/dQw4w9WgXcQ">
</iframe>
<iframe src="https://www.rust-lang.org/" name="rust_iframe" height="300px" width="100%" title="Rustlang Homepage">
</iframe>
<iframe src="https://crates.io/" name="crates_iframe" height="300px" width="100%" title="Crates.io Homepage">
</iframe>
<pre></pre>
</body>
</html>
"#;
    let doc = Readability::new(html_str);

    Readability::clean(&mut doc.root_node.clone(), "pre");
    assert_eq!(0, doc.root_node.select("pre").unwrap().count());

    Readability::clean(&mut doc.root_node.clone(), "iframe");
    assert_eq!(1, doc.root_node.select("iframe").unwrap().count());

    // The remaining iframe is the YouTube one.
    let remaining_iframe = doc.root_node.select_first("iframe").unwrap();
    let remaining_attrs = remaining_iframe.attributes.borrow();
    assert_eq!(
        Some("https://www.youtube.com/embed/dQw4w9WgXcQ"),
        remaining_attrs.get("src")
    );
}
// Checks that `clean_headers` removes both `<h1>` elements of the fixture and keeps
// the `<h2>`.
#[test]
fn test_clean_headers() {
    let html_str = r#"
<!DOCTYPE html>
<html>
<body>
<h1 class="tags">#blog, #rust</h1>
<h2>A blog in Rust</h2>
<p>Foo bar baz quux</p>
<h1 class="footer">Copyright info</h1>
</body>
</html>
"#;
    let doc = Readability::new(html_str);
    let body = doc.root_node.select_first("body").unwrap();

    // Before cleaning: two <h1> and one <h2>.
    assert_eq!(2, doc.root_node.select("h1").unwrap().count());
    assert_eq!(1, doc.root_node.select("h2").unwrap().count());

    doc.clean_headers(&mut body.as_node().clone());

    // After cleaning: only the <h2> is left.
    assert_eq!(0, doc.root_node.select("h1").unwrap().count());
    assert_eq!(1, doc.root_node.select("h2").unwrap().count());
}
// Checks which attributes are left on two `<div>`s, an `<svg>` and a `<table>` after
// `Readability::clean_styles` has run.
#[test]
fn test_clean_styles() {
    let html_str = r#"
<!DOCTYPE html>
<html>
<body>
<div style="color:red; padding: 10px" id="red">A red box</div>
<div height="100px" style="color:blue; padding: 10px" id="blue">
A blue box
</div>
<svg width="100" height="100">
<circle cx="50" cy="50" r="40" fill="green" />
</svg>
<table width="100%" bgcolor="yellow">
<tr>
<th>Col 1</th>
<th>Col 2</th>
</tr>
</table>
</body>
</html>
"#;
    let doc = Readability::new(html_str);
    Readability::clean_styles(&mut doc.root_node.clone());

    // The red div only keeps its id.
    let red = doc.root_node.select_first("#red").unwrap();
    let red_attrs = red.attributes.borrow();
    assert_eq!(1, red_attrs.map.len());
    assert!(!red_attrs.contains("style"));

    // The blue div loses `style` but keeps `height` next to its id.
    let blue = doc.root_node.select_first("#blue").unwrap();
    let blue_attrs = blue.attributes.borrow();
    assert_eq!(2, blue_attrs.map.len());
    assert!(!blue_attrs.contains("style"));
    assert!(blue_attrs.contains("height"));

    // The <svg> keeps both of its attributes.
    let svg = doc.root_node.select_first("svg").unwrap();
    assert_eq!(2, svg.attributes.borrow().map.len());

    // The <table> ends up with no attributes at all.
    let table = doc.root_node.select_first("table").unwrap();
    assert_eq!(0, table.attributes.borrow().map.len());
}
// Checks that `Readability::clean_matched_nodes` removes exactly the nodes accepted by
// the filter: here the `<li>` elements whose match string contains "example".
#[test]
fn test_clean_matched_nodes() {
    let html_str = r#"
<!DOCTYPE html>
<html>
<body>
<p class="example">In Rust you can have 3 kinds of variables</p>
<ul>
<li class="example">Immutable</li>
<li class="example">Mutable</li>
<li class="example">Constant</li>
</ul>
<p>Onto more tests</p>
</body>
</html>
"#;
    let doc = Readability::new(html_str);
    let body = doc.root_node.select_first("body").unwrap();

    Readability::clean_matched_nodes(&mut body.as_node().clone(), |node_ref, match_str| {
        let is_list_item = &node_ref.as_element().unwrap().name.local == "li";
        is_list_item && match_str.contains("example")
    });

    // Both paragraphs survive (one of them has the class too); all list items are gone.
    assert_eq!(2, doc.root_node.select("p").unwrap().count());
    assert_eq!(0, doc.root_node.select("li").unwrap().count());
}
#[test]
fn test_prep_article() {
let html_str = r#"
<!DOCTYPE html>
<html>
<head>
<title>A test HTML file</title>
</head>
<body>
<h2>A test HTML file</h2>
<div class="search">
Search for other posts
<input type="search" placeholder="Type here...">
<button id="search-btn">Search</button>
</div>
<aside>Some content aside</aside>
<h1>A h1 tag</h1>
<h1 class="banner">A h1 tag to be removed</h1>
<table id="tbl-one"></table>
<table width="100%" border="0" id="tbl-two">
<tr valign="top">
<td width="20%">Left</td>
<td height="200" width="60%">Main Content of the system</td>
<td width="20%">Right</td>
</tr>
</table>
<div style="color:red; padding: 10px" id="red">A red box</div>
<div height="100px" style="color:blue; padding: 10px" id="blue">
A blue box
</div>
<svg width="100" height="100">
<circle cx="50" cy="50" r="40" fill="green" />
</svg>
<ul>
<li>one</li>
<li>two</li>
<li>three</li>
</ul>
<object data="obj.html" width="500" height="200"></object>
<table id="tbl-three">
<caption>Monthly savings</caption>
<tr>
<th>Month</th>
<th>Savings</th>
</tr>
<tr>
<td>January</td>
<td>$100</td>
</tr>
<tr>
<td>February</td>
<td>$50</td>
</tr>
</table>
<iframe id="yt" width="420" height="345" src="https://www.youtube.com/embed/dQw4w9WgXcQ">
</iframe>
<div id="foo">
<form action="">
<fieldset>
<legend>Personal details:</legend>
<label for="fname">First name:</label>
<input type="text" id="fname" name="fname"><br><br>
<label for="lname">Last name:</label>
<input type="text" id="lname" name="lname"><br><br>
</fieldset>
</form>
<br>
<p id="p-link">
omnis nemo qui libero? Eius suscipit veritatis, tenetur impedit et voluptatibus.
<a href="\#">Rerum repellat totam quam nobis harum fuga consequatur</a>
corrupti?
</p>
<br>
<iframe src="https://www.rust-lang.org/" name="rust_iframe" height="300px" width="100%" title="Rustlang Homepage">
</iframe>
</div>
<iframe src="https://crates.io/" name="crates_iframe" height="300px" width="100%" title="Crates.io Homepage">
</iframe>
<table id="tbl-replace-p">
<tr valign="top">
<td width="20%" id="td-to-p"><span>One cell table. This is going to be replaced</span></td>
</tr>
</table>
<embed type="video/webm" src="video.mp4" width="400" height="300">
<br>
<embed type="image/jpg" src="foo.jpg" width="300" height="200">
<div>
<form action="">
<div>
<label>Join our newsletter</label>
<input type="email" placeholder="Your email address">
</div>
<button>Sign up</button>
</form>
</div>
<div id="div-p">
<p class="share">Share this as a <a href="\#">Tweet</a></p>
<br>
<p id="share">
Lorem ipsum dolor, sit amet consectetur adipisicing elit. Minima quia numquam aperiam dolores ipsam, eos perferendis cupiditate adipisci perspiciatis
dolore, sunt, iusto nobis? Nulla molestiae id repellat quibusdam nobis quia. Lorem ipsum dolor sit amet consectetur, adipisicing elit. Voluptas
laudantium omnis nemo qui libero? Eius suscipit veritatis, tenetur impedit et voluptatibus. Rerum repellat totam quam nobis harum fuga consequatur
corrupti? Lorem ipsum dolor sit amet consectetur, adipisicing elit. Iure excepturi accusamus nemo voluptatibus laborum minus dicta blanditiis totam
aperiam velit amet cupiditate hic a molestias odio nam, fugiat facere iusto.
</p>
</div>
<table id="tbl-replace-div">
<tr>
<td id="td-to-div"><pre>One cell table. This is going to be replaced</pre></td>
</tr>
</table>
<footer>A Paperoni test</footer>
<footer>Copyright 2020</footer>
</body>
</html>
"#;
let mut doc = Readability::new(html_str);
doc.article_title = "A test HTML file".into();
let body = doc.root_node.select_first("body").unwrap();
doc.prep_article(&mut body.as_node().clone());
// Ensure tables were assigned their data table scores
let table_node = doc.root_node.select_first("table").unwrap();
let node_attr = table_node.attributes.borrow();
assert_eq!(true, node_attr.get("readability-data-table").is_some());
let forms_and_fieldsets = doc.root_node.select("form, fieldset").unwrap();
assert_eq!(0, forms_and_fieldsets.count());
let nodes = doc
.root_node
.select("h1, object, embed, footer, link, aside")
.unwrap();
assert_eq!(0, nodes.count());
assert_eq!(2, doc.root_node.select("p").unwrap().count());
assert_eq!(true, doc.root_node.select_first("p.share").is_err());
assert_eq!(true, doc.root_node.select_first("p#share").is_ok());
assert_eq!(true, doc.root_node.select_first("p#td-to-p").is_ok());
let node = doc.root_node.select_first("h2");
assert_eq!(true, node.is_err());
let nodes = doc
.root_node
.select("input, textarea, select, button")
.unwrap();
assert_eq!(0, nodes.count());
let nodes = doc.root_node.select("iframe").unwrap();
assert_eq!(1, nodes.count());
let node = doc.root_node.select_first("iframe#yt");
assert_eq!(true, node.is_ok());
let nodes = doc.root_node.select("h1").unwrap();
assert_eq!(0, nodes.count());
let nodes = doc
.root_node
.select("#tbl-one, #tbl-replace-p, #tbl-replace-div")
.unwrap();
assert_eq!(0, nodes.count());
let tables = doc.root_node.select("#tbl-two, #tbl-three").unwrap();
assert_eq!(2, tables.count());
assert_eq!(true, doc.root_node.select_first("ul").is_ok());
assert_eq!(2, doc.root_node.select("div").unwrap().count());
assert_eq!(true, doc.root_node.select_first("div#div-p").is_ok());
assert_eq!(true, doc.root_node.select_first("div#td-to-div").is_ok());
assert_eq!(1, doc.root_node.select("br").unwrap().count());
let node_ref = doc.root_node.select_first("br").unwrap();
assert_eq!(
"div",
&node_ref
.as_node()
.following_siblings()
.elements()
.next()
.unwrap()
.name
.local
);
}
#[test]
fn test_get_article_title() {
    // Each entry pairs an HTML document with the title that
    // `get_article_title` is expected to produce for it.
    let cases = [
        // Plain <title> without any separator: expected unchanged.
        (
            r#"
<!DOCTYPE html>
<html>
<head>
<title>Porting Readability to Rust</title>
</head>
<body>
<p></p>
</body>
</html>
"#,
            "Porting Readability to Rust",
        ),
        // <title> containing a colon, no heading in the body: the whole title is expected.
        (
            r#"
<!DOCTYPE html>
<html>
<head>
<title>Crates.io: The Rust package repository</title>
</head>
<body>
<p></p>
</body>
</html>
"#,
            "Crates.io: The Rust package repository",
        ),
        // <title> containing a colon with an <h1> carrying the same text.
        (
            r#"
<!DOCTYPE html>
<html>
<head>
<title>Crates.io: The Rust package repository</title>
</head>
<body>
<h1>Crates.io: The Rust package repository</h1>
</body>
</html>
"#,
            "Crates.io: The Rust package repository",
        ),
        // <title> containing a colon with an <h1> whose text differs from it.
        (
            r#"
<!DOCTYPE html>
<html>
<head>
<title>Crates.io: A package repository</title>
</head>
<body>
<h1>Crates.io: A Rust package repository</h1>
</body>
</html>
"#,
            "Crates.io: A package repository",
        ),
        // A backslash inside the <title>: the title is expected back untouched.
        (
            r#"
<!DOCTYPE html>
<html>
<head>
<title>Foo developer \ Blog</title>
</head>
<body>
<p></p>
</body>
</html>
"#,
            "Foo developer \\ Blog",
        ),
        // A `»` inside the <title>: only the text after it is expected.
        (
            r#"
<!DOCTYPE html>
<html>
<head>
<title>Foo developer » Blog Post on Foo bar stuff</title>
</head>
<body>
<p></p>
</body>
</html>
"#,
            "Blog Post on Foo bar stuff",
        ),
        // One-word <title> next to a longer <h1>: the <title> text is still expected.
        (
            r#"
<!DOCTYPE html>
<html>
<head>
<title>Blog</title>
</head>
<body>
<h1>Getting started with Rust</h1>
</body>
</html>
"#,
            "Blog",
        ),
    ];

    for &(html_str, expected_title) in cases.iter() {
        let doc = Readability::new(html_str);
        assert_eq!(expected_title, doc.get_article_title());
    }
}
#[test]
fn test_unescape_html_entities() {
    // (escaped input, text expected after in-place unescaping): covers named
    // entities, decimal character references and hexadecimal character references.
    let cases = [
        ("Therefore, 5 &gt; 3", "Therefore, 5 > 3"),
        ("Logical AND (&amp;&amp;)", "Logical AND (&&)"),
        ("&#117; &#43; &#101; = &#252;", "u + e = ü"),
        ("&#x0158;&#x016d;&#x0161;&#x0163;", "Řŭšţ"),
    ];

    for &(escaped, expected) in cases.iter() {
        let mut text = String::from(escaped);
        Readability::unescape_html_entities(&mut text);
        assert_eq!(expected, text);
    }
}
#[test]
fn test_get_article_metadata() {
    // Scenario 1: only `name="author"`, `name="description"` and a <title>
    // with a `/` in it are available as metadata sources.
    let basic_html = r#"
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<meta name="description" content="A post on how hard it is to work with text."/>
<meta name="viewport" content="width=device-width"/>
<title>Foo Coder / Blog on the difficulty of using utf-8</title>
<meta name="author" content="Foo Coder"/>
</head>
<body></body>
</html>
"#;
    let basic_doc = Readability::new(basic_html);
    let mut basic_expected = MetaData::new();
    basic_expected.title = String::from("Blog on the difficulty of using utf-8");
    basic_expected.excerpt = Some(String::from(
        "A post on how hard it is to work with text.",
    ));
    basic_expected.byline = Some(String::from("Foo Coder"));
    assert_eq!(basic_expected, basic_doc.get_article_metadata());

    // Scenario 2: Open Graph tags sit alongside plain meta tags, and several
    // values contain numeric character references that are expected decoded.
    let open_graph_html = r#"
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" user-scalable="no" />
<meta name="title" content="A Long Title" />
<meta name="description" content="Foo bar baz bo&#223;" />
<meta property="og:site_name" content="Blog Place" />
<meta property="og:title" content="A Longer Title" />
<meta property="og:description" content="Foo bar baz bo&#223;" />
<meta name="author" content="F&#x00f6;o Coder" />
<meta name="dc:creator" content="F&#x00f6;o Coder" />
<meta name="twitter:card" content="summary_large_image" />
<title>The Longest Title</title>
</head>
</html>
"#;
    let open_graph_doc = Readability::new(open_graph_html);
    let mut open_graph_expected = MetaData::new();
    open_graph_expected.title = String::from("A Longer Title");
    open_graph_expected.site_name = Some(String::from("Blog Place"));
    open_graph_expected.excerpt = Some(String::from("Foo bar baz boß"));
    open_graph_expected.byline = Some(String::from("Föo Coder"));
    assert_eq!(open_graph_expected, open_graph_doc.get_article_metadata());
}
#[test]
fn test_fix_relative_uris() {
    let html_str = r##"
<!DOCTYPE html>
<html>
<body>
<h1><a href="../home.html">Go back</a></h1>
<img id="ex-1" src="https://example.image.com/images/1.jpg" alt="Ex 1">
<img id="ex-2" src="https://example.image.com/images/2.jpg" alt="Ex 2">
<img id="ex-3" src="../images/2.jpg" alt="Ex 3">
<img id="ex-4" src="./images/1.jpg" alt="Ex 4">
<img id="ex-5" src="https://images.com/images/1.jpg" alt="Ex 5">
<img id="ex-6" src="/images/1.jpg" alt="Ex 6">
<p><a href="#ex-1">First image</a></p>
</body>
</html>
"##;
    let mut doc = Readability::new(html_str);
    // Use <body> as the article node so `fix_relative_uris` has something to work on.
    let body = doc.root_node.select_first("body").ok();
    doc.article_node = body.map(|body_ref| body_ref.as_node().clone());
    doc.fix_relative_uris("https://example.image.com/blog/");

    // (CSS selector, attribute name, expected attribute value after the fix).
    let expectations = [
        // Already-absolute URLs are expected to stay as they are.
        ("img#ex-1", "src", "https://example.image.com/images/1.jpg"),
        ("img#ex-2", "src", "https://example.image.com/images/2.jpg"),
        // `../` is expected to resolve against the parent of `/blog/`.
        ("img#ex-3", "src", "https://example.image.com/images/2.jpg"),
        // `./` is expected to resolve inside `/blog/`.
        ("img#ex-4", "src", "https://example.image.com/blog/images/1.jpg"),
        // Absolute URL on a different host.
        ("img#ex-5", "src", "https://images.com/images/1.jpg"),
        // Root-relative path is expected to resolve against the host.
        ("img#ex-6", "src", "https://example.image.com/images/1.jpg"),
        // Fragment-only links are expected to be left alone.
        ("p a", "href", "#ex-1"),
        ("h1 a", "href", "https://example.image.com/home.html"),
    ];

    for &(selector, attribute, expected) in expectations.iter() {
        let element = doc.root_node.select_first(selector).unwrap();
        let attrs = element.attributes.borrow();
        assert_eq!(Some(expected), attrs.get(attribute));
    }
}
#[test]
fn test_clean_classes() {
    // TODO: Later this test should verify that only certain classes are removed.
    let html_str = r#"
<!DOCTYPE html>
<html>
<body>
<p class="a b c d">One</p>
<p class="b c d e">Two</p>
<div class="a b c div">Three</div>
<div class="b c d e">Four</div>
<ul class="a b c d">
<li class="a b c d">One</li>
<li class="b c d e">Two</li>
<li class="b c d e">Three</li>
</ul>
</body>
</html>
"#;
    let mut doc = Readability::new(html_str);
    // Treat <body> as the extracted article before cleaning.
    let body = doc.root_node.select_first("body").ok();
    doc.article_node = body.map(|body_ref| body_ref.as_node().clone());
    doc.clean_classes();

    // No element anywhere in the document may still carry a `class` attribute.
    let class_remains = doc
        .root_node
        .inclusive_descendants()
        .elements()
        .any(|element| element.attributes.borrow().contains("class"));
    assert!(!class_remains);
}
#[test]
fn test_clean_readability_attrs() {
    // Every attribute in this fixture is a `readability-*` one, so after
    // cleaning each element is expected to have no attributes at all.
    let html_str = r#"
<!DOCTYPE html>
<html>
<body>
<div readability-score="0.921487">
<p readability-score="0.8102">Welcome to this awesome blog post. Only good content is here. No spam.</p>
<p readability-score="0.6004">Let's look at some statistics</p>
<table readability-score="0.719275" readability-data-table="true">
<caption>Monthly savings</caption>
<tr>
<th>Month</th>
<th>Savings</th>
</tr>
<tr>
<td>January</td>
<td>$100</td>
</tr>
<tr>
<td>February</td>
<td>$50</td>
</tr>
</table>
</div>
</body>
</html>
"#;
    let mut doc = Readability::new(html_str);
    // Treat <body> as the extracted article before cleaning.
    let body = doc.root_node.select_first("body").ok();
    doc.article_node = body.map(|body_ref| body_ref.as_node().clone());
    doc.clean_readability_attrs();

    let attribute_remains = doc
        .root_node
        .inclusive_descendants()
        .elements()
        .any(|element| !element.attributes.borrow().map.is_empty());
    assert!(!attribute_remains);
}
#[test]
fn test_post_process_content() {
    let html_str = r##"
<!DOCTYPE html>
<html>
<body>
<p class="a b c d">One</p>
<p class="b c d e">Two</p>
<div class="a b c div">Three</div>
<div class="b c d e">
<img src="./img.jpg" class="lazy">
</div>
<ul class="a b c d">
<li class="a b c d"><a href="#home">One</a></li>
<li class="b c d e">Two</li>
<li class="b c d e">Three</li>
</ul>
</body>
</html>
"##;
    let mut doc = Readability::new(html_str);
    // Treat <body> as the extracted article before post-processing.
    let body = doc.root_node.select_first("body").ok();
    doc.article_node = body.map(|body_ref| body_ref.as_node().clone());
    doc.post_process_content("https://foo.blog/post/");

    // Post-processing is expected to strip every `class` attribute...
    let classes_stripped = doc
        .root_node
        .inclusive_descendants()
        .elements()
        .all(|element| !element.attributes.borrow().contains("class"));
    assert!(classes_stripped);

    // ...leave fragment-only links as they were...
    let link = doc.root_node.select_first("a").unwrap();
    assert_eq!(Some("#home"), link.attributes.borrow().get("href"));

    // ...and resolve the relative image source against the given URL.
    let image = doc.root_node.select_first("img").unwrap();
    assert_eq!(
        Some("https://foo.blog/post/img.jpg"),
        image.attributes.borrow().get("src")
    );
}
#[test]
fn test_flag_is_active() {
    let html_str = r"
<!DOCTYPE html>
<html>
<body>
</body>
</html>
";
    let doc = Readability::new(html_str);
    // A freshly constructed `Readability` is expected to report all three flags as active.
    let all_flags = [
        FLAG_STRIP_UNLIKELYS,
        FLAG_WEIGHT_CLASSES,
        FLAG_CLEAN_CONDITIONALLY,
    ];
    for &flag in all_flags.iter() {
        assert!(doc.flag_is_active(flag));
    }
}
#[test]
fn test_remove_flag() {
    let html_str = r"
<!DOCTYPE html>
<html>
<body>
</body>
</html>
";
    let mut doc = Readability::new(html_str);

    // Removing one flag must deactivate that flag...
    assert!(doc.flag_is_active(FLAG_CLEAN_CONDITIONALLY));
    doc.remove_flag(FLAG_CLEAN_CONDITIONALLY);
    assert!(!doc.flag_is_active(FLAG_CLEAN_CONDITIONALLY));

    // ...while the remaining flags stay active until they are removed themselves.
    assert!(doc.flag_is_active(FLAG_WEIGHT_CLASSES));
    doc.remove_flag(FLAG_WEIGHT_CLASSES);
    assert!(!doc.flag_is_active(FLAG_WEIGHT_CLASSES));

    assert!(doc.flag_is_active(FLAG_STRIP_UNLIKELYS));
}
}