Change calls on replacing regexes to replace_all
Add `fix_relative_uris`, `clean_classes`, `clean_readability_attrs` and `post_process_content`
This commit is contained in:
parent
aacb442b7a
commit
350447d1c4
2 changed files with 353 additions and 7 deletions
|
@ -1,4 +1,4 @@
|
||||||
use std::collections::{BTreeMap, HashMap};
|
use std::collections::{BTreeMap, HashMap, HashSet};
|
||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
|
|
||||||
use html5ever::{LocalName, Namespace, QualName};
|
use html5ever::{LocalName, Namespace, QualName};
|
||||||
|
@ -7,6 +7,7 @@ use kuchiki::{
|
||||||
traits::*,
|
traits::*,
|
||||||
NodeData, NodeRef,
|
NodeData, NodeRef,
|
||||||
};
|
};
|
||||||
|
use url::Url;
|
||||||
|
|
||||||
const SHARE_ELEMENT_THRESHOLD: usize = 500;
|
const SHARE_ELEMENT_THRESHOLD: usize = 500;
|
||||||
const READABILITY_SCORE: &'static str = "readability-score";
|
const READABILITY_SCORE: &'static str = "readability-score";
|
||||||
|
@ -68,13 +69,14 @@ impl Readability {
|
||||||
article_dir: None,
|
article_dir: None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
pub fn parse(&mut self) {
|
pub fn parse(&mut self, url: &str) {
|
||||||
self.unwrap_no_script_tags();
|
self.unwrap_no_script_tags();
|
||||||
self.remove_scripts();
|
self.remove_scripts();
|
||||||
self.prep_document();
|
self.prep_document();
|
||||||
let meta_data = self.get_article_metadata();
|
let meta_data = self.get_article_metadata();
|
||||||
self.article_title = meta_data.title.clone();
|
self.article_title = meta_data.title.clone();
|
||||||
self.grab_article();
|
self.grab_article();
|
||||||
|
self.post_process_content(url);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Recursively check if node is image, or if node contains exactly only one image
|
/// Recursively check if node is image, or if node contains exactly only one image
|
||||||
|
@ -435,8 +437,8 @@ impl Readability {
|
||||||
let name_val = name_attr.unwrap();
|
let name_val = name_attr.unwrap();
|
||||||
if regexes::is_match_name_pattern(name_val) {
|
if regexes::is_match_name_pattern(name_val) {
|
||||||
let name = name_val.to_lowercase();
|
let name = name_val.to_lowercase();
|
||||||
let name = regexes::REPLACE_WHITESPACE_REGEX.replace(&name, "");
|
let name = regexes::REPLACE_WHITESPACE_REGEX.replace_all(&name, "");
|
||||||
let name = regexes::REPLACE_DOT_REGEX.replace(&name, ":");
|
let name = regexes::REPLACE_DOT_REGEX.replace_all(&name, ":");
|
||||||
values.insert(name.to_string(), content.trim().to_string());
|
values.insert(name.to_string(), content.trim().to_string());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -590,21 +592,162 @@ impl Readability {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
cur_title = regexes::NORMALIZE_REGEX
|
cur_title = regexes::NORMALIZE_REGEX
|
||||||
.replace(cur_title.trim(), " ")
|
.replace_all(cur_title.trim(), " ")
|
||||||
.to_string();
|
.to_string();
|
||||||
let cur_word_count = word_count(&cur_title);
|
let cur_word_count = word_count(&cur_title);
|
||||||
|
|
||||||
if cur_word_count <= 4
|
if cur_word_count <= 4
|
||||||
&& (!title_had_hierarchical_separators
|
&& (!title_had_hierarchical_separators
|
||||||
|| cur_word_count
|
|| cur_word_count
|
||||||
!= word_count(®exes::REPLACE_MULTI_SEPARATOR_REGEX.replace(&orig_title, ""))
|
!= word_count(
|
||||||
- 1)
|
®exes::REPLACE_MULTI_SEPARATOR_REGEX.replace_all(&orig_title, ""),
|
||||||
|
) - 1)
|
||||||
{
|
{
|
||||||
cur_title = orig_title;
|
cur_title = orig_title;
|
||||||
}
|
}
|
||||||
cur_title
|
cur_title
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Removes the class="" attribute from every element in the given subtree, except those that
|
||||||
|
/// match CLASSES_TO_PRESERVE and the classesToPreserve array from the options object.
|
||||||
|
fn clean_classes(&mut self) {
|
||||||
|
// TODO: This should accessed from Self
|
||||||
|
let classes_to_preserve: HashSet<&str> = HashSet::new();
|
||||||
|
if let Some(article_node) = &mut self.article_node {
|
||||||
|
for elem in article_node.inclusive_descendants().elements() {
|
||||||
|
let mut elem_attrs = elem.attributes.borrow_mut();
|
||||||
|
if let Some(class_list) = elem_attrs.get_mut("class") {
|
||||||
|
let filtered_class: String = class_list
|
||||||
|
.split_whitespace()
|
||||||
|
.filter(|class| classes_to_preserve.contains(class))
|
||||||
|
.fold("".to_string(), |acc, x| acc + " " + x);
|
||||||
|
if filtered_class.is_empty() {
|
||||||
|
elem_attrs.remove("class");
|
||||||
|
} else {
|
||||||
|
*class_list = filtered_class;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Converts each <a> and <img> uri in the given element to an absolute URI, ignoring #ref URIs.
|
||||||
|
fn fix_relative_uris(&mut self, document_uri: &str) {
|
||||||
|
if let Some(article_node) = &mut self.article_node {
|
||||||
|
let document_uri =
|
||||||
|
Url::parse(document_uri).expect("Unable to parse the document's URI");
|
||||||
|
let base_uri = self
|
||||||
|
.root_node
|
||||||
|
.select("base")
|
||||||
|
.unwrap()
|
||||||
|
.filter(|node_ref| {
|
||||||
|
let node_attrs = node_ref.attributes.borrow();
|
||||||
|
node_attrs.contains("href")
|
||||||
|
})
|
||||||
|
.map(|node_ref| {
|
||||||
|
let node_attrs = node_ref.attributes.borrow();
|
||||||
|
Url::parse(node_attrs.get("href").unwrap()).unwrap()
|
||||||
|
})
|
||||||
|
.next()
|
||||||
|
.unwrap_or(document_uri.clone());
|
||||||
|
let to_absolute_uri = |uri_str: &str| -> String {
|
||||||
|
if base_uri == document_uri && uri_str.starts_with("#") {
|
||||||
|
return uri_str.to_string();
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Ok(new_uri) = Url::parse(uri_str) {
|
||||||
|
if new_uri.has_host() {
|
||||||
|
return new_uri.to_string();
|
||||||
|
}
|
||||||
|
} else if let Ok(joined_uri) = base_uri.join(uri_str) {
|
||||||
|
return joined_uri.to_string();
|
||||||
|
}
|
||||||
|
|
||||||
|
uri_str.to_string()
|
||||||
|
};
|
||||||
|
let mut links = article_node.select("a").unwrap().filter(|a_ref| {
|
||||||
|
let link_attrs = a_ref.attributes.borrow();
|
||||||
|
link_attrs.contains("href")
|
||||||
|
});
|
||||||
|
let mut link = links.next();
|
||||||
|
while let Some(link_ref) = link {
|
||||||
|
link = links.next();
|
||||||
|
let mut link_attrs = link_ref.attributes.borrow_mut();
|
||||||
|
let href = link_attrs.get("href").map(|val| val.to_string()).unwrap();
|
||||||
|
if href.starts_with("javascript:") {
|
||||||
|
let link_node = link_ref.as_node();
|
||||||
|
if link_node.children().count() == 1
|
||||||
|
&& link_node
|
||||||
|
.first_child()
|
||||||
|
.map(|node_ref| node_ref.as_text().is_some())
|
||||||
|
.unwrap()
|
||||||
|
{
|
||||||
|
let text_node = NodeRef::new_text(link_node.text_contents());
|
||||||
|
link_node.insert_before(text_node);
|
||||||
|
link_node.detach();
|
||||||
|
} else {
|
||||||
|
let container = NodeRef::new_element(
|
||||||
|
QualName::new(None, Namespace::from(HTML_NS), LocalName::from("span")),
|
||||||
|
BTreeMap::new(),
|
||||||
|
);
|
||||||
|
let mut children = link_node.children();
|
||||||
|
let mut child = children.next();
|
||||||
|
while let Some(child_ref) = child {
|
||||||
|
child = children.next();
|
||||||
|
container.append(child_ref);
|
||||||
|
}
|
||||||
|
link_node.insert_before(container);
|
||||||
|
link_node.detach();
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
link_attrs.insert("href", to_absolute_uri(&href));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let media_nodes = article_node
|
||||||
|
.select("img, picture, figure, video, audio, source")
|
||||||
|
.unwrap();
|
||||||
|
for media_node in media_nodes {
|
||||||
|
let mut media_attrs = media_node.attributes.borrow_mut();
|
||||||
|
if let Some(src) = media_attrs.get_mut("src") {
|
||||||
|
*src = to_absolute_uri(&src);
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(poster) = media_attrs.get_mut("poster") {
|
||||||
|
*poster = to_absolute_uri(&poster);
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(srcset) = media_attrs.get_mut("srcset") {
|
||||||
|
let new_srcset = regexes::SRCSET_CAPTURE_REGEX.replace_all(
|
||||||
|
&srcset,
|
||||||
|
|captures: ®ex::Captures| {
|
||||||
|
to_absolute_uri(&captures[1]) + &captures[2] + &captures[3]
|
||||||
|
},
|
||||||
|
);
|
||||||
|
*srcset = new_srcset.to_string();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Removes readability attributes from DOM nodes as they are not needed in the final article
|
||||||
|
fn clean_readability_attrs(&mut self) {
|
||||||
|
if let Some(article_node) = &mut self.article_node {
|
||||||
|
for node in article_node.inclusive_descendants().elements() {
|
||||||
|
let mut node_attrs = node.attributes.borrow_mut();
|
||||||
|
node_attrs.remove(READABILITY_SCORE);
|
||||||
|
node_attrs.remove("readability-data-table");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Run any post-process modifications to article content as necessary.
|
||||||
|
fn post_process_content(&mut self, url: &str) {
|
||||||
|
self.fix_relative_uris(url);
|
||||||
|
// TODO: Add flag check
|
||||||
|
self.clean_classes();
|
||||||
|
self.clean_readability_attrs();
|
||||||
|
}
|
||||||
|
|
||||||
/// Converts an inline CSS string to a [HashMap] of property and value(s)
|
/// Converts an inline CSS string to a [HashMap] of property and value(s)
|
||||||
fn inline_css_str_to_map(css_str: &str) -> HashMap<&str, &str> {
|
fn inline_css_str_to_map(css_str: &str) -> HashMap<&str, &str> {
|
||||||
css_str
|
css_str
|
||||||
|
@ -3462,4 +3605,205 @@ characters. For that reason, this <p> tag could not be a byline because it's too
|
||||||
result.title = "A Longer Title".to_string();
|
result.title = "A Longer Title".to_string();
|
||||||
assert_eq!(result, doc.get_article_metadata());
|
assert_eq!(result, doc.get_article_metadata());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_fix_relative_uris() {
|
||||||
|
let html_str = r##"
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
<h1><a href="../home.html">Go back</a></h1>
|
||||||
|
<img id="ex-1" src="https://example.image.com/images/1.jpg" alt="Ex 1">
|
||||||
|
<img id="ex-2" src="https://example.image.com/images/2.jpg" alt="Ex 2">
|
||||||
|
<img id="ex-3" src="../images/2.jpg" alt="Ex 3">
|
||||||
|
<img id="ex-4" src="./images/1.jpg" alt="Ex 4">
|
||||||
|
<img id="ex-5" src="https://images.com/images/1.jpg" alt="Ex 5">
|
||||||
|
<p><a href="#ex-1">First image</a></p>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"##;
|
||||||
|
let mut doc = Readability::new(html_str);
|
||||||
|
doc.article_node = doc
|
||||||
|
.root_node
|
||||||
|
.select_first("body")
|
||||||
|
.ok()
|
||||||
|
.map(|node_ref| node_ref.as_node().clone());
|
||||||
|
doc.fix_relative_uris("https://example.image.com/blog/");
|
||||||
|
|
||||||
|
let node = doc.root_node.select_first("img#ex-1").unwrap();
|
||||||
|
let node_attrs = node.attributes.borrow();
|
||||||
|
assert_eq!(
|
||||||
|
Some("https://example.image.com/images/1.jpg"),
|
||||||
|
node_attrs.get("src")
|
||||||
|
);
|
||||||
|
|
||||||
|
let node = doc.root_node.select_first("img#ex-2").unwrap();
|
||||||
|
let node_attrs = node.attributes.borrow();
|
||||||
|
assert_eq!(
|
||||||
|
Some("https://example.image.com/images/2.jpg"),
|
||||||
|
node_attrs.get("src")
|
||||||
|
);
|
||||||
|
|
||||||
|
let node = doc.root_node.select_first("img#ex-3").unwrap();
|
||||||
|
let node_attrs = node.attributes.borrow();
|
||||||
|
assert_eq!(
|
||||||
|
Some("https://example.image.com/images/2.jpg"),
|
||||||
|
node_attrs.get("src")
|
||||||
|
);
|
||||||
|
|
||||||
|
let node = doc.root_node.select_first("img#ex-4").unwrap();
|
||||||
|
let node_attrs = node.attributes.borrow();
|
||||||
|
assert_eq!(
|
||||||
|
Some("https://example.image.com/blog/images/1.jpg"),
|
||||||
|
node_attrs.get("src")
|
||||||
|
);
|
||||||
|
|
||||||
|
let node = doc.root_node.select_first("img#ex-5").unwrap();
|
||||||
|
let node_attrs = node.attributes.borrow();
|
||||||
|
assert_eq!(
|
||||||
|
Some("https://images.com/images/1.jpg"),
|
||||||
|
node_attrs.get("src")
|
||||||
|
);
|
||||||
|
|
||||||
|
let node = doc.root_node.select_first("p a").unwrap();
|
||||||
|
let node_attrs = node.attributes.borrow();
|
||||||
|
assert_eq!(Some("#ex-1"), node_attrs.get("href"));
|
||||||
|
|
||||||
|
let node = doc.root_node.select_first("h1 a").unwrap();
|
||||||
|
let node_attrs = node.attributes.borrow();
|
||||||
|
assert_eq!(
|
||||||
|
Some("https://example.image.com/home.html"),
|
||||||
|
node_attrs.get("href")
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_clean_classes() {
|
||||||
|
// TODO: This test will later be edited to ensure it checks to only remove certain classes
|
||||||
|
let html_str = r#"
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
<p class="a b c d">One</p>
|
||||||
|
<p class="b c d e">Two</p>
|
||||||
|
<div class="a b c div">Three</div>
|
||||||
|
<div class="b c d e">Four</div>
|
||||||
|
<ul class="a b c d">
|
||||||
|
<li class="a b c d">One</li>
|
||||||
|
<li class="b c d e">Two</li>
|
||||||
|
<li class="b c d e">Three</li>
|
||||||
|
</ul>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"#;
|
||||||
|
let mut doc = Readability::new(html_str);
|
||||||
|
doc.article_node = doc
|
||||||
|
.root_node
|
||||||
|
.select_first("body")
|
||||||
|
.ok()
|
||||||
|
.map(|node_ref| node_ref.as_node().clone());
|
||||||
|
doc.clean_classes();
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
true,
|
||||||
|
doc.root_node
|
||||||
|
.inclusive_descendants()
|
||||||
|
.elements()
|
||||||
|
.all(|node_elem| {
|
||||||
|
let node_attrs = node_elem.attributes.borrow();
|
||||||
|
!node_attrs.contains("class")
|
||||||
|
})
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_clean_readability_attrs() {
|
||||||
|
let html_str = r#"
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
<div readability-score="0.921487">
|
||||||
|
<p readability-score="0.8102">Welcome to this awesome blog post. Only good content is here. No spam.</p>
|
||||||
|
<p readability-score="0.6004">Let's look at some statistics</p>
|
||||||
|
<table readability-score="0.719275" readability-data-table="true">
|
||||||
|
<caption>Monthly savings</caption>
|
||||||
|
<tr>
|
||||||
|
<th>Month</th>
|
||||||
|
<th>Savings</th>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>January</td>
|
||||||
|
<td>$100</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>February</td>
|
||||||
|
<td>$50</td>
|
||||||
|
</tr>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"#;
|
||||||
|
let mut doc = Readability::new(html_str);
|
||||||
|
doc.article_node = doc
|
||||||
|
.root_node
|
||||||
|
.select_first("body")
|
||||||
|
.ok()
|
||||||
|
.map(|node_ref| node_ref.as_node().clone());
|
||||||
|
doc.clean_readability_attrs();
|
||||||
|
assert_eq!(
|
||||||
|
true,
|
||||||
|
doc.root_node
|
||||||
|
.inclusive_descendants()
|
||||||
|
.elements()
|
||||||
|
.all(|node| {
|
||||||
|
let node_attrs = node.attributes.borrow();
|
||||||
|
node_attrs.map.len() == 0
|
||||||
|
})
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_post_process_content() {
|
||||||
|
let html_str = r##"
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
<p class="a b c d">One</p>
|
||||||
|
<p class="b c d e">Two</p>
|
||||||
|
<div class="a b c div">Three</div>
|
||||||
|
<div class="b c d e">
|
||||||
|
<img src="./img.jpg" class="lazy">
|
||||||
|
</div>
|
||||||
|
<ul class="a b c d">
|
||||||
|
<li class="a b c d"><a href="#home">One</a></li>
|
||||||
|
<li class="b c d e">Two</li>
|
||||||
|
<li class="b c d e">Three</li>
|
||||||
|
</ul>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"##;
|
||||||
|
let mut doc = Readability::new(html_str);
|
||||||
|
doc.article_node = doc
|
||||||
|
.root_node
|
||||||
|
.select_first("body")
|
||||||
|
.ok()
|
||||||
|
.map(|node_ref| node_ref.as_node().clone());
|
||||||
|
doc.post_process_content("https://foo.blog/post/");
|
||||||
|
let has_class_attr = doc
|
||||||
|
.root_node
|
||||||
|
.inclusive_descendants()
|
||||||
|
.elements()
|
||||||
|
.any(|node_ref| {
|
||||||
|
let attrs = node_ref.attributes.borrow();
|
||||||
|
attrs.contains("class")
|
||||||
|
});
|
||||||
|
assert_eq!(false, has_class_attr);
|
||||||
|
let a_node = doc.root_node.select_first("a").unwrap();
|
||||||
|
let a_node_attrs = a_node.attributes.borrow();
|
||||||
|
assert_eq!(Some("#home"), a_node_attrs.get("href"));
|
||||||
|
let img_node = doc.root_node.select_first("img").unwrap();
|
||||||
|
let img_attrs = img_node.attributes.borrow();
|
||||||
|
assert_eq!(Some("https://foo.blog/post/img.jpg"), img_attrs.get("src"));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -119,6 +119,8 @@ lazy_static! {
|
||||||
r"(?i)\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|site_name)\s*"
|
r"(?i)\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|site_name)\s*"
|
||||||
)
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
pub static ref SRCSET_CAPTURE_REGEX: Regex =
|
||||||
|
Regex::new(r"(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))").unwrap();
|
||||||
pub static ref REPLACE_WHITESPACE_REGEX: Regex = Regex::new(r"\s").unwrap();
|
pub static ref REPLACE_WHITESPACE_REGEX: Regex = Regex::new(r"\s").unwrap();
|
||||||
pub static ref REPLACE_DOT_REGEX: Regex = Regex::new(r"\.").unwrap();
|
pub static ref REPLACE_DOT_REGEX: Regex = Regex::new(r"\.").unwrap();
|
||||||
pub static ref REPLACE_HTML_ESCAPE_REGEX: Regex =
|
pub static ref REPLACE_HTML_ESCAPE_REGEX: Regex =
|
||||||
|
|
Reference in a new issue