Refactor grab_article to return a Result

- Add a `ReadabilityError` variant to `ErrorKind`
- Refactor the `article` getter in `Extractor` to return a `&NodeRef`.
  This relies on the assumption that the article has already been
  parsed; otherwise it panics.
Kenneth Gitere 2021-04-21 19:07:08 +03:00
parent ae1ddb9386
commit dbac7c3b69
6 changed files with 46 additions and 31 deletions
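
Taken together, the changes thread a Result from `grab_article` up through `Readability::parse` and `Extractor::extract_content`, so callers check one Result instead of probing an Option. A minimal sketch of the new call pattern, assuming the Extractor API exactly as changed in this diff (the `html` and `url` bindings are illustrative):

    // Hedged sketch, not part of this commit.
    let mut extractor = Extractor::from_html(&html, &url);
    // extract_content now surfaces readability failures as Err(...)
    // instead of silently leaving the article as None.
    extractor.extract_content()?;
    // article() returns &NodeRef directly; per its new doc comment it
    // panics if extraction has not run successfully.
    let article_node = extractor.article();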

View file

@@ -61,7 +61,7 @@ pub fn generate_epubs(
         .fold(&mut epub, |epub, (idx, article)| {
             let mut article_result = || -> Result<(), PaperoniError> {
                 let mut html_buf = Vec::new();
-                extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)?;
+                extractor::serialize_to_xhtml(article.article(), &mut html_buf)?;
                 let html_str = std::str::from_utf8(&html_buf)?;
                 epub.metadata("title", replace_metadata_value(name))?;
                 let section_name = article.metadata().title();
@@ -129,7 +129,7 @@ pub fn generate_epubs(
             );
             let mut out_file = File::create(&file_name).unwrap();
             let mut html_buf = Vec::new();
-            extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)
+            extractor::serialize_to_xhtml(article.article(), &mut html_buf)
                 .expect("Unable to serialize to xhtml");
             let html_str = std::str::from_utf8(&html_buf).unwrap();
             if let Some(author) = article.metadata().byline() {

View file

@@ -10,6 +10,8 @@ pub enum ErrorKind {
     IOError(String),
     #[error("[UTF8Error]: {0}")]
     UTF8Error(String),
+    #[error("[ReadabilityError]: {0}")]
+    ReadabilityError(String),
 }

 #[derive(Error, Debug)]
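
The new variant slots into the existing thiserror-derived `ErrorKind`. The `.into()` calls later in this diff imply a `From<ErrorKind>` conversion for `PaperoniError`; assuming that conversion, raising the error looks like this (illustrative function, not part of the commit):

    fn no_body_error() -> Result<(), PaperoniError> {
        // Build the new variant, then widen it to PaperoniError with
        // the same `.into()` used in grab_article below.
        Err(ErrorKind::ReadabilityError("Document has no <body>".into()).into())
    }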

View file

@@ -31,8 +31,8 @@ impl Extractor {
     /// Locates and extracts the HTML in a document which is determined to be
     /// the source of the content
-    pub fn extract_content(&mut self) {
-        self.readability.parse(&self.url);
+    pub fn extract_content(&mut self) -> Result<(), PaperoniError> {
+        self.readability.parse(&self.url)?;
         if let Some(article_node_ref) = &self.readability.article_node {
             let template = r#"
             <html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
@@ -47,6 +47,7 @@ impl Extractor {
             body.as_node().append(article_node_ref.clone());
             self.article = Some(doc);
         }
+        Ok(())
     }

     /// Traverses the DOM tree of the content and retrieves the IMG URLs
@@ -64,8 +65,11 @@ impl Extractor {
         }
     }

-    pub fn article(&self) -> Option<&NodeRef> {
-        self.article.as_ref()
+    /// Returns the extracted article [NodeRef]. It should only be called *AFTER* calling parse
+    pub fn article(&self) -> &NodeRef {
+        self.article.as_ref().expect(
+            "Article node doesn't exist. This may be because the document has not been parsed",
+        )
     }

     pub fn metadata(&self) -> &MetaData {
@@ -160,7 +164,9 @@ mod test {
     #[test]
     fn test_extract_img_urls() {
         let mut extractor = Extractor::from_html(TEST_HTML, "http://example.com/");
-        extractor.extract_content();
+        extractor
+            .extract_content()
+            .expect("Article extraction failed unexpectedly");
         extractor.extract_img_urls();

         assert!(extractor.img_urls.len() > 0);
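
For call sites, the migration is mechanical: handle `extract_content`'s Result once, then use `article()` directly. A hypothetical before/after mirroring the `generate_epubs` change above:

    // Before: Option plumbing at every use site.
    // extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)?;

    // After: the Result is checked up front and the getter is direct.
    extractor.extract_content()?;
    extractor::serialize_to_xhtml(extractor.article(), &mut html_buf)?;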

View file

@@ -141,8 +141,6 @@ pub async fn download_images(
         let (img_url, img_path, img_mime) = img_item;
         let img_ref = extractor
             .article()
-            .as_mut()
-            .expect("Unable to get mutable ref")
             .select_first(&format!("img[src='{}']", img_url))
             .expect("Image node does not exist");
         let mut img_node = img_ref.attributes.borrow_mut();

View file

@@ -49,21 +49,26 @@ fn download(app_config: AppConfig) {
                     // println!("Extracting");
                     let mut extractor = Extractor::from_html(&html, &url);
                     bar.set_message("Extracting...");
-                    extractor.extract_content();
-
-                    if extractor.article().is_some() {
-                        extractor.extract_img_urls();
-                        if let Err(img_errors) =
-                            download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar).await
-                        {
-                            eprintln!(
-                                "{} image{} failed to download for {}",
-                                img_errors.len(),
-                                if img_errors.len() > 1 { "s" } else { "" },
-                                url
-                            );
-                        }
-                        articles.push(extractor);
-                    }
+                    match extractor.extract_content() {
+                        Ok(_) => {
+                            extractor.extract_img_urls();
+                            if let Err(img_errors) =
+                                download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar)
+                                    .await
+                            {
+                                eprintln!(
+                                    "{} image{} failed to download for {}",
+                                    img_errors.len(),
+                                    if img_errors.len() > 1 { "s" } else { "" },
+                                    url
+                                );
+                            }
+                            articles.push(extractor);
+                        }
+                        Err(mut e) => {
+                            e.set_article_source(&url);
+                            errors.push(e);
+                        }
+                    }
                 }
                 Err(e) => errors.push(e),

View file

@@ -9,6 +9,8 @@ use kuchiki::{
 };
 use url::Url;

+use crate::errors::{ErrorKind, PaperoniError};
+
 const DEFAULT_CHAR_THRESHOLD: usize = 500;
 const FLAG_STRIP_UNLIKELYS: u32 = 0x1;
 const FLAG_WEIGHT_CLASSES: u32 = 0x2;
@@ -76,14 +78,15 @@ impl Readability {
             metadata: MetaData::new(),
         }
     }

-    pub fn parse(&mut self, url: &str) {
+    pub fn parse(&mut self, url: &str) -> Result<(), PaperoniError> {
         self.unwrap_no_script_tags();
         self.remove_scripts();
         self.prep_document();
         self.metadata = self.get_article_metadata();
         self.article_title = self.metadata.title.clone();
-        self.grab_article();
+        self.grab_article()?;
         self.post_process_content(url);
+        Ok(())
     }

     /// Recursively check if node is image, or if node contains exactly only one image
@@ -1584,7 +1587,7 @@ impl Readability {
     /// Using a variety of metrics (content score, classname, element types), find the content that is most likely to be the stuff
     /// a user wants to read. Then return it wrapped up in a div.
-    fn grab_article(&mut self) {
+    fn grab_article(&mut self) -> Result<(), PaperoniError> {
         // TODO: Add logging for this
         // println!("Grabbing article");
         // var doc = this._doc;
@@ -1593,8 +1596,7 @@ impl Readability {
         let page = self.root_node.select_first("body");
         if page.is_err() {
             // TODO:Have error logging for this
-            println!("Document has no <body>");
-            return;
+            return Err(ErrorKind::ReadabilityError("Document has no <body>".into()).into());
         }
         let page = page.unwrap();
         let mut attempts: Vec<ExtractAttempt> = Vec::new();
@@ -2084,8 +2086,10 @@ impl Readability {
                     attempts.push(ExtractAttempt::new(article_content.clone(), text_length));
                     attempts.sort_by(|a, b| b.length.partial_cmp(&a.length).unwrap());
                     if attempts.first().as_ref().unwrap().length == 0 {
-                        println!("Unable to extract content");
-                        break;
+                        return Err(ErrorKind::ReadabilityError(
+                            "Unable to extract content".into(),
+                        )
+                        .into());
                     }
                     article_content = attempts[0].article.clone();
                     parse_successful = true;
@@ -2111,7 +2115,7 @@ impl Readability {
                 false
             });
             self.article_node = Some(article_content);
-            return;
+            return Ok(());
         }
     }
 }