Add logic for attempting different rounds for content extraction
with different flags set Add additional test in `fix_relative_uris`
This commit is contained in:
parent
a0f69ccf80
commit
679bf3cb04
1 changed files with 34 additions and 8 deletions
|
@ -1527,14 +1527,10 @@ impl Readability {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
let page = page.unwrap();
|
let page = page.unwrap();
|
||||||
|
let mut attempts: Vec<ExtractAttempt> = Vec::new();
|
||||||
// // We can't grab an article if we don't have a page!
|
|
||||||
// if (!page) {
|
|
||||||
// this.log("No body found in document. Abort.");
|
|
||||||
// return null;
|
|
||||||
// }
|
|
||||||
|
|
||||||
// var pageCacheHtml = page.innerHTML;
|
// var pageCacheHtml = page.innerHTML;
|
||||||
|
//TODO: Add page cache
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
// var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS);
|
// var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS);
|
||||||
|
@ -2007,11 +2003,21 @@ impl Readability {
|
||||||
parse_successful = false;
|
parse_successful = false;
|
||||||
if self.flag_is_active(FLAG_STRIP_UNLIKELYS) {
|
if self.flag_is_active(FLAG_STRIP_UNLIKELYS) {
|
||||||
self.remove_flag(FLAG_STRIP_UNLIKELYS);
|
self.remove_flag(FLAG_STRIP_UNLIKELYS);
|
||||||
|
attempts.push(ExtractAttempt::new(article_content.clone(), text_length));
|
||||||
} else if self.flag_is_active(FLAG_WEIGHT_CLASSES) {
|
} else if self.flag_is_active(FLAG_WEIGHT_CLASSES) {
|
||||||
self.remove_flag(FLAG_WEIGHT_CLASSES);
|
self.remove_flag(FLAG_WEIGHT_CLASSES);
|
||||||
|
attempts.push(ExtractAttempt::new(article_content.clone(), text_length));
|
||||||
} else if self.flag_is_active(FLAG_CLEAN_CONDITIONALLY) {
|
} else if self.flag_is_active(FLAG_CLEAN_CONDITIONALLY) {
|
||||||
self.remove_flag(FLAG_CLEAN_CONDITIONALLY);
|
self.remove_flag(FLAG_CLEAN_CONDITIONALLY);
|
||||||
|
attempts.push(ExtractAttempt::new(article_content.clone(), text_length));
|
||||||
} else {
|
} else {
|
||||||
|
attempts.push(ExtractAttempt::new(article_content.clone(), text_length));
|
||||||
|
attempts.sort_by(|a, b| b.length.partial_cmp(&a.length).unwrap());
|
||||||
|
if attempts.first().as_ref().unwrap().length == 0 {
|
||||||
|
println!("Unable to extract content");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
article_content = attempts[0].article.clone();
|
||||||
parse_successful = true;
|
parse_successful = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2037,11 +2043,23 @@ impl Readability {
|
||||||
self.article_node = Some(article_content);
|
self.article_node = Some(article_content);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
// TODO: Remove this
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// This represents the article node extracted after running the grab_article method
|
||||||
|
#[derive(Debug)]
|
||||||
|
struct ExtractAttempt {
|
||||||
|
article: NodeRef,
|
||||||
|
length: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ExtractAttempt {
|
||||||
|
pub fn new(article: NodeRef, length: usize) -> Self {
|
||||||
|
ExtractAttempt { article, length }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, PartialEq)]
|
#[derive(Debug, PartialEq)]
|
||||||
pub struct MetaData {
|
pub struct MetaData {
|
||||||
byline: Option<String>,
|
byline: Option<String>,
|
||||||
|
@ -3651,6 +3669,7 @@ characters. For that reason, this <p> tag could not be a byline because it's too
|
||||||
<img id="ex-3" src="../images/2.jpg" alt="Ex 3">
|
<img id="ex-3" src="../images/2.jpg" alt="Ex 3">
|
||||||
<img id="ex-4" src="./images/1.jpg" alt="Ex 4">
|
<img id="ex-4" src="./images/1.jpg" alt="Ex 4">
|
||||||
<img id="ex-5" src="https://images.com/images/1.jpg" alt="Ex 5">
|
<img id="ex-5" src="https://images.com/images/1.jpg" alt="Ex 5">
|
||||||
|
<img id="ex-6" src="/images/1.jpg" alt="Ex 6">
|
||||||
<p><a href="#ex-1">First image</a></p>
|
<p><a href="#ex-1">First image</a></p>
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
|
@ -3698,6 +3717,13 @@ characters. For that reason, this <p> tag could not be a byline because it's too
|
||||||
node_attrs.get("src")
|
node_attrs.get("src")
|
||||||
);
|
);
|
||||||
|
|
||||||
|
let node = doc.root_node.select_first("img#ex-6").unwrap();
|
||||||
|
let node_attrs = node.attributes.borrow();
|
||||||
|
assert_eq!(
|
||||||
|
Some("https://example.image.com/images/1.jpg"),
|
||||||
|
node_attrs.get("src")
|
||||||
|
);
|
||||||
|
|
||||||
let node = doc.root_node.select_first("p a").unwrap();
|
let node = doc.root_node.select_first("p a").unwrap();
|
||||||
let node_attrs = node.attributes.borrow();
|
let node_attrs = node.attributes.borrow();
|
||||||
assert_eq!(Some("#ex-1"), node_attrs.get("href"));
|
assert_eq!(Some("#ex-1"), node_attrs.get("href"));
|
||||||
|
|
Reference in a new issue