From dbac7c3b69379a31e321b8d622aab3f32366c979 Mon Sep 17 00:00:00 2001
From: Kenneth Gitere
Date: Wed, 21 Apr 2021 19:07:08 +0300
Subject: [PATCH] Refactor `grab_article` to return a Result

- Add a `ReadabilityError` variant to `ErrorKind`
- Refactor the `article` getter in `Extractor` to return a `&NodeRef`.
  This relies on the assumption that the article has already been
  parsed and will otherwise panic.
---
 src/epub.rs                |  4 ++--
 src/errors.rs              |  2 ++
 src/extractor.rs           | 16 +++++++++++-----
 src/http.rs                |  2 --
 src/main.rs                | 33 +++++++++++++++++++--------------
 src/moz_readability/mod.rs | 20 ++++++++++++--------
 6 files changed, 46 insertions(+), 31 deletions(-)

diff --git a/src/epub.rs b/src/epub.rs
index 83173ab..ac1a934 100644
--- a/src/epub.rs
+++ b/src/epub.rs
@@ -61,7 +61,7 @@ pub fn generate_epubs(
             .fold(&mut epub, |epub, (idx, article)| {
                 let mut article_result = || -> Result<(), PaperoniError> {
                     let mut html_buf = Vec::new();
-                    extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)?;
+                    extractor::serialize_to_xhtml(article.article(), &mut html_buf)?;
                     let html_str = std::str::from_utf8(&html_buf)?;
                     epub.metadata("title", replace_metadata_value(name))?;
                     let section_name = article.metadata().title();
@@ -129,7 +129,7 @@ pub fn generate_epubs(
             );
             let mut out_file = File::create(&file_name).unwrap();
             let mut html_buf = Vec::new();
-            extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)
+            extractor::serialize_to_xhtml(article.article(), &mut html_buf)
                 .expect("Unable to serialize to xhtml");
             let html_str = std::str::from_utf8(&html_buf).unwrap();
             if let Some(author) = article.metadata().byline() {
diff --git a/src/errors.rs b/src/errors.rs
index 70a522a..c37ff8e 100644
--- a/src/errors.rs
+++ b/src/errors.rs
@@ -10,6 +10,8 @@ pub enum ErrorKind {
     IOError(String),
     #[error("[UTF8Error]: {0}")]
     UTF8Error(String),
+    #[error("[ReadabilityError]: {0}")]
+    ReadabilityError(String),
 }
 
 #[derive(Error, Debug)]
diff --git a/src/extractor.rs b/src/extractor.rs
index 507ff6a..1f4140f 100644
--- a/src/extractor.rs
+++ b/src/extractor.rs
@@ -31,8 +31,8 @@ impl Extractor {
 
     /// Locates and extracts the HTML in a document which is determined to be
     /// the source of the content
-    pub fn extract_content(&mut self) {
-        self.readability.parse(&self.url);
+    pub fn extract_content(&mut self) -> Result<(), PaperoniError> {
+        self.readability.parse(&self.url)?;
         if let Some(article_node_ref) = &self.readability.article_node {
             let template = r#"
             <!DOCTYPE html>
@@ -47,6 +47,7 @@ impl Extractor {
             body.as_node().append(article_node_ref.clone());
             self.article = Some(doc);
         }
+        Ok(())
     }
 
     /// Traverses the DOM tree of the content and retrieves the IMG URLs
@@ -64,8 +65,11 @@ impl Extractor {
         }
     }
 
-    pub fn article(&self) -> Option<&NodeRef> {
-        self.article.as_ref()
+    /// Returns the extracted article [NodeRef]. It should only be called *AFTER* calling parse
+    pub fn article(&self) -> &NodeRef {
+        self.article.as_ref().expect(
+            "Article node doesn't exist. This may be because the document has not been parsed",
+        )
     }
 
     pub fn metadata(&self) -> &MetaData {
@@ -160,7 +164,9 @@ mod test {
     #[test]
     fn test_extract_img_urls() {
         let mut extractor = Extractor::from_html(TEST_HTML, "http://example.com/");
-        extractor.extract_content();
+        extractor
+            .extract_content()
+            .expect("Article extraction failed unexpectedly");
         extractor.extract_img_urls();
 
         assert!(extractor.img_urls.len() > 0);
diff --git a/src/http.rs b/src/http.rs
index 6c7f801..bd87213 100644
--- a/src/http.rs
+++ b/src/http.rs
@@ -141,8 +141,6 @@ pub async fn download_images(
         let (img_url, img_path, img_mime) = img_item;
         let img_ref = extractor
             .article()
-            .as_mut()
-            .expect("Unable to get mutable ref")
             .select_first(&format!("img[src='{}']", img_url))
             .expect("Image node does not exist");
         let mut img_node = img_ref.attributes.borrow_mut();
diff --git a/src/main.rs b/src/main.rs
index 7ac578a..3b1ad47 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -49,21 +49,26 @@ fn download(app_config: AppConfig) {
                     // println!("Extracting");
                     let mut extractor = Extractor::from_html(&html, &url);
                     bar.set_message("Extracting...");
-                    extractor.extract_content();
-
-                    if extractor.article().is_some() {
-                        extractor.extract_img_urls();
-                        if let Err(img_errors) =
-                            download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar).await
-                        {
-                            eprintln!(
-                                "{} image{} failed to download for {}",
-                                img_errors.len(),
-                                if img_errors.len() > 1 { "s" } else { "" },
-                                url
-                            );
+                    match extractor.extract_content() {
+                        Ok(_) => {
+                            extractor.extract_img_urls();
+                            if let Err(img_errors) =
+                                download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar)
+                                    .await
+                            {
+                                eprintln!(
+                                    "{} image{} failed to download for {}",
+                                    img_errors.len(),
+                                    if img_errors.len() > 1 { "s" } else { "" },
+                                    url
+                                );
+                            }
+                            articles.push(extractor);
+                        }
+                        Err(mut e) => {
+                            e.set_article_source(&url);
+                            errors.push(e);
                         }
-                        articles.push(extractor);
                     }
                 }
                 Err(e) => errors.push(e),
diff --git a/src/moz_readability/mod.rs b/src/moz_readability/mod.rs
index fd65620..dc8df9f 100644
--- a/src/moz_readability/mod.rs
+++ b/src/moz_readability/mod.rs
@@ -9,6 +9,8 @@ use kuchiki::{
 };
 use url::Url;
 
+use crate::errors::{ErrorKind, PaperoniError};
+
 const DEFAULT_CHAR_THRESHOLD: usize = 500;
 const FLAG_STRIP_UNLIKELYS: u32 = 0x1;
 const FLAG_WEIGHT_CLASSES: u32 = 0x2;
@@ -76,14 +78,15 @@ impl Readability {
             metadata: MetaData::new(),
         }
     }
-    pub fn parse(&mut self, url: &str) {
+    pub fn parse(&mut self, url: &str) -> Result<(), PaperoniError> {
         self.unwrap_no_script_tags();
         self.remove_scripts();
         self.prep_document();
         self.metadata = self.get_article_metadata();
         self.article_title = self.metadata.title.clone();
-        self.grab_article();
+        self.grab_article()?;
         self.post_process_content(url);
+        Ok(())
     }
 
     /// Recursively check if node is image, or if node contains exactly only one image
@@ -1584,7 +1587,7 @@ impl Readability {
 
     /// Using a variety of metrics (content score, classname, element types), find the content that is most likely to be the stuff
     /// a user wants to read. Then return it wrapped up in a div.
-    fn grab_article(&mut self) {
+    fn grab_article(&mut self) -> Result<(), PaperoniError> {
         // TODO: Add logging for this
         // println!("Grabbing article");
         // var doc = this._doc;
@@ -1593,8 +1596,7 @@ impl Readability {
         let page = self.root_node.select_first("body");
         if page.is_err() {
             // TODO:Have error logging for this
-            println!("Document has no <body>");
-            return;
+            return Err(ErrorKind::ReadabilityError("Document has no <body>".into()).into());
         }
         let page = page.unwrap();
         let mut attempts: Vec<ExtractAttempt> = Vec::new();
@@ -2084,8 +2086,10 @@ impl Readability {
                     attempts.push(ExtractAttempt::new(article_content.clone(), text_length));
                     attempts.sort_by(|a, b| b.length.partial_cmp(&a.length).unwrap());
                     if attempts.first().as_ref().unwrap().length == 0 {
-                        println!("Unable to extract content");
-                        break;
+                        return Err(ErrorKind::ReadabilityError(
+                            "Unable to extract content".into(),
+                        )
+                        .into());
                     }
                     article_content = attempts[0].article.clone();
                     parse_successful = true;
@@ -2111,7 +2115,7 @@ impl Readability {
                         false
                     });
                     self.article_node = Some(article_content);
-                    return;
+                    return Ok(());
                 }
             }
         }
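
Reviewer note (a minimal sketch, not part of the commit): the caller-side flow
this patch enables, assuming the `Extractor` and `PaperoniError` types from
src/extractor.rs and src/errors.rs are in scope. `try_extract` is a
hypothetical helper used only for illustration; src/main.rs above does the
same thing with an explicit `match` so it can attach the article source to
the error via `set_article_source`.

    fn try_extract(html: &str, url: &str) -> Result<(), PaperoniError> {
        let mut extractor = Extractor::from_html(html, url);
        // extract_content now returns Result<(), PaperoniError>, so a failed
        // readability pass surfaces as an Err instead of a printed message.
        extractor.extract_content()?;
        // article() now returns &NodeRef and panics if no article was parsed,
        // so it must only be called after extract_content has returned Ok.
        let _article = extractor.article();
        Ok(())
    }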