Refactor grab_article to return a Result

- Add a ReadabilityError variant to ErrorKind
- Refactor the `article` getter in Extractor to return a &NodeRef. This relies on the assumption that the article has already been parsed and should otherwise panic.
parent ae1ddb9386
commit dbac7c3b69
6 changed files with 46 additions and 31 deletions
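Note: taken together, the changes below replace silent failure (a println! and an early return) with typed errors. A minimal sketch of the calling contract after this commit; the module paths and surrounding setup are assumed, not part of the diff:

```rust
use crate::errors::PaperoniError;
use crate::extractor::Extractor;

// Sketch only: the intended call order after this commit.
fn extract(html: &str, url: &str) -> Result<(), PaperoniError> {
    let mut extractor = Extractor::from_html(html, url);
    // extract_content now surfaces readability failures as Err(..)
    // instead of leaving callers to check article().is_some().
    extractor.extract_content()?;
    // Safe only after extract_content() returned Ok; panics otherwise.
    let _article_node = extractor.article();
    Ok(())
}
```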
@@ -61,7 +61,7 @@ pub fn generate_epubs(
         .fold(&mut epub, |epub, (idx, article)| {
             let mut article_result = || -> Result<(), PaperoniError> {
                 let mut html_buf = Vec::new();
-                extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)?;
+                extractor::serialize_to_xhtml(article.article(), &mut html_buf)?;
                 let html_str = std::str::from_utf8(&html_buf)?;
                 epub.metadata("title", replace_metadata_value(name))?;
                 let section_name = article.metadata().title();
@@ -129,7 +129,7 @@ pub fn generate_epubs(
             );
             let mut out_file = File::create(&file_name).unwrap();
             let mut html_buf = Vec::new();
-            extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)
+            extractor::serialize_to_xhtml(article.article(), &mut html_buf)
                 .expect("Unable to serialize to xhtml");
             let html_str = std::str::from_utf8(&html_buf).unwrap();
             if let Some(author) = article.metadata().byline() {
@@ -10,6 +10,8 @@ pub enum ErrorKind {
     IOError(String),
     #[error("[UTF8Error]: {0}")]
     UTF8Error(String),
+    #[error("[ReadabilityError]: {0}")]
+    ReadabilityError(String),
 }

 #[derive(Error, Debug)]
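Note: the `ErrorKind::ReadabilityError(...).into()` calls further down imply a `From<ErrorKind>` conversion for PaperoniError that this diff does not show. A sketch of what that conversion plausibly looks like; the struct fields and the shape of set_article_source are assumptions inferred from usage in src/main.rs:

```rust
use thiserror::Error;

// Assumed shape of PaperoniError (not shown in this diff).
#[derive(Error, Debug)]
#[error("{kind}")]
pub struct PaperoniError {
    kind: ErrorKind,
    article_source: Option<String>,
}

impl PaperoniError {
    // Inferred from `e.set_article_source(&url)` in src/main.rs.
    pub fn set_article_source(&mut self, url: &str) {
        self.article_source = Some(url.to_string());
    }
}

// This is what lets `ErrorKind::ReadabilityError(..).into()` produce a PaperoniError.
impl From<ErrorKind> for PaperoniError {
    fn from(kind: ErrorKind) -> Self {
        PaperoniError { kind, article_source: None }
    }
}
```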
@@ -31,8 +31,8 @@ impl Extractor {

     /// Locates and extracts the HTML in a document which is determined to be
     /// the source of the content
-    pub fn extract_content(&mut self) {
-        self.readability.parse(&self.url);
+    pub fn extract_content(&mut self) -> Result<(), PaperoniError> {
+        self.readability.parse(&self.url)?;
         if let Some(article_node_ref) = &self.readability.article_node {
             let template = r#"
             <html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
@@ -47,6 +47,7 @@ impl Extractor {
             body.as_node().append(article_node_ref.clone());
             self.article = Some(doc);
         }
+        Ok(())
     }

     /// Traverses the DOM tree of the content and retrieves the IMG URLs
@@ -64,8 +65,11 @@ impl Extractor {
         }
     }

-    pub fn article(&self) -> Option<&NodeRef> {
-        self.article.as_ref()
+    /// Returns the extracted article [NodeRef]. It should only be called *AFTER* calling parse
+    pub fn article(&self) -> &NodeRef {
+        self.article.as_ref().expect(
+            "Article node doesn't exist. This may be because the document has not been parsed",
+        )
     }

     pub fn metadata(&self) -> &MetaData {
@@ -160,7 +164,9 @@ mod test {
     #[test]
     fn test_extract_img_urls() {
         let mut extractor = Extractor::from_html(TEST_HTML, "http://example.com/");
-        extractor.extract_content();
+        extractor
+            .extract_content()
+            .expect("Article extraction failed unexpectedly");
         extractor.extract_img_urls();

         assert!(extractor.img_urls.len() > 0);
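Note: the getter now trades Option<&NodeRef> for a panicking &NodeRef, moving the "has the document been parsed?" check out of every call site and into a single expect. A sketch of the resulting contract, reusing the TEST_HTML fixture from the test above:

```rust
let mut extractor = Extractor::from_html(TEST_HTML, "http://example.com/");
// Calling extractor.article() at this point would panic with:
// "Article node doesn't exist. This may be because the document has not been parsed"
extractor.extract_content().expect("Article extraction failed unexpectedly");
let article = extractor.article(); // guaranteed &NodeRef from here on
```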
@@ -141,8 +141,6 @@ pub async fn download_images(
         let (img_url, img_path, img_mime) = img_item;
         let img_ref = extractor
             .article()
-            .as_mut()
-            .expect("Unable to get mutable ref")
             .select_first(&format!("img[src='{}']", img_url))
             .expect("Image node does not exist");
         let mut img_node = img_ref.attributes.borrow_mut();
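Note: dropping `.as_mut()` and its expect works because attribute mutation in kuchiki goes through a RefCell, so a shared &NodeRef is enough. A sketch of the pattern; select_first and attributes.borrow_mut() appear in this diff, while Attributes::insert and the URLs are assumptions for illustration:

```rust
// Selection takes &self; the element's attributes sit behind a RefCell.
let img_ref = extractor
    .article()
    .select_first("img[src='http://example.com/old.png']") // hypothetical URL
    .expect("Image node does not exist");
// Interior mutability: no &mut access to the article node required.
img_ref.attributes.borrow_mut().insert("src", "images/old.png".to_string());
```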
src/main.rs

@@ -49,21 +49,26 @@ fn download(app_config: AppConfig) {
                     // println!("Extracting");
                     let mut extractor = Extractor::from_html(&html, &url);
                     bar.set_message("Extracting...");
-                    extractor.extract_content();
-                    if extractor.article().is_some() {
-                        extractor.extract_img_urls();
-                        if let Err(img_errors) =
-                            download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar).await
-                        {
-                            eprintln!(
-                                "{} image{} failed to download for {}",
-                                img_errors.len(),
-                                if img_errors.len() > 1 { "s" } else { "" },
-                                url
-                            );
-                        }
-                        articles.push(extractor);
-                    }
+                    match extractor.extract_content() {
+                        Ok(_) => {
+                            extractor.extract_img_urls();
+                            if let Err(img_errors) =
+                                download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar)
+                                    .await
+                            {
+                                eprintln!(
+                                    "{} image{} failed to download for {}",
+                                    img_errors.len(),
+                                    if img_errors.len() > 1 { "s" } else { "" },
+                                    url
+                                );
+                            }
+                            articles.push(extractor);
+                        }
+                        Err(mut e) => {
+                            e.set_article_source(&url);
+                            errors.push(e);
+                        }
+                    }
                 }
                 Err(e) => errors.push(e),

@@ -9,6 +9,8 @@ use kuchiki::{
 };
 use url::Url;

+use crate::errors::{ErrorKind, PaperoniError};
+
 const DEFAULT_CHAR_THRESHOLD: usize = 500;
 const FLAG_STRIP_UNLIKELYS: u32 = 0x1;
 const FLAG_WEIGHT_CLASSES: u32 = 0x2;
@@ -76,14 +78,15 @@ impl Readability {
             metadata: MetaData::new(),
         }
     }
-    pub fn parse(&mut self, url: &str) {
+    pub fn parse(&mut self, url: &str) -> Result<(), PaperoniError> {
         self.unwrap_no_script_tags();
         self.remove_scripts();
         self.prep_document();
         self.metadata = self.get_article_metadata();
         self.article_title = self.metadata.title.clone();
-        self.grab_article();
+        self.grab_article()?;
         self.post_process_content(url);
+        Ok(())
     }

     /// Recursively check if node is image, or if node contains exactly only one image
@@ -1584,7 +1587,7 @@ impl Readability {

     /// Using a variety of metrics (content score, classname, element types), find the content that is most likely to be the stuff
     /// a user wants to read. Then return it wrapped up in a div.
-    fn grab_article(&mut self) {
+    fn grab_article(&mut self) -> Result<(), PaperoniError> {
         // TODO: Add logging for this
         // println!("Grabbing article");
         // var doc = this._doc;
@@ -1593,8 +1596,7 @@ impl Readability {
         let page = self.root_node.select_first("body");
         if page.is_err() {
             // TODO:Have error logging for this
-            println!("Document has no <body>");
-            return;
+            return Err(ErrorKind::ReadabilityError("Document has no <body>".into()).into());
         }
         let page = page.unwrap();
         let mut attempts: Vec<ExtractAttempt> = Vec::new();
@@ -2084,8 +2086,10 @@ impl Readability {
                 attempts.push(ExtractAttempt::new(article_content.clone(), text_length));
                 attempts.sort_by(|a, b| b.length.partial_cmp(&a.length).unwrap());
                 if attempts.first().as_ref().unwrap().length == 0 {
-                    println!("Unable to extract content");
-                    break;
+                    return Err(ErrorKind::ReadabilityError(
+                        "Unable to extract content".into(),
+                    )
+                    .into());
                 }
                 article_content = attempts[0].article.clone();
                 parse_successful = true;
@@ -2111,7 +2115,7 @@ impl Readability {
                 false
             });
             self.article_node = Some(article_content);
-            return;
+            return Ok(());
         }
     }
 }
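Note: with grab_article, parse, and extract_content all returning Result, a readability failure now reaches main.rs as a typed error instead of a println!. A sketch of how the two new failure modes surface, assuming PaperoniError's Display delegates to ErrorKind as the #[error] attributes suggest:

```rust
match extractor.extract_content() {
    // e.g. "[ReadabilityError]: Document has no <body>"
    //  or  "[ReadabilityError]: Unable to extract content"
    Err(e) => eprintln!("{}", e),
    Ok(()) => { /* extractor.article() is safe to call from here */ }
}
```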