Refactor grab_article to return a Result

- Add ReadabilityError field - Refactor `article` getter in Extractor to return a &NodeRef. This relies on the assumption that the article has already been parsed and should otherwise panic.
2021-04-21 19:07:08 +03:00 · 2021-04-21 19:07:08 +03:00 · dbac7c3b69
commit dbac7c3b69
parent ae1ddb9386
6 changed files with 46 additions and 31 deletions
--- a/src/epub.rs
+++ b/src/epub.rs
@ -61,7 +61,7 @@ pub fn generate_epubs(
                .fold(&mut epub, |epub, (idx, article)| {
                    let mut article_result = || -> Result<(), PaperoniError> {
                        let mut html_buf = Vec::new();
-                        extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)?;
+                        extractor::serialize_to_xhtml(article.article(), &mut html_buf)?;
                        let html_str = std::str::from_utf8(&html_buf)?;
                        epub.metadata("title", replace_metadata_value(name))?;
                        let section_name = article.metadata().title();
@ -129,7 +129,7 @@ pub fn generate_epubs(
                    );
                    let mut out_file = File::create(&file_name).unwrap();
                    let mut html_buf = Vec::new();
-                    extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)
+                    extractor::serialize_to_xhtml(article.article(), &mut html_buf)
                        .expect("Unable to serialize to xhtml");
                    let html_str = std::str::from_utf8(&html_buf).unwrap();
                    if let Some(author) = article.metadata().byline() {
--- a/src/errors.rs
+++ b/src/errors.rs
@ -10,6 +10,8 @@ pub enum ErrorKind {
    IOError(String),
    #[error("[UTF8Error]: {0}")]
    UTF8Error(String),
+    #[error("[ReadabilityError]: {0}")]
+    ReadabilityError(String),
 }

 #[derive(Error, Debug)]
--- a/src/extractor.rs
+++ b/src/extractor.rs
@ -31,8 +31,8 @@ impl Extractor {

    /// Locates and extracts the HTML in a document which is determined to be
    /// the source of the content
-    pub fn extract_content(&mut self) {
-        self.readability.parse(&self.url);
+    pub fn extract_content(&mut self) -> Result<(), PaperoniError> {
+        self.readability.parse(&self.url)?;
        if let Some(article_node_ref) = &self.readability.article_node {
            let template = r#"
            <html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
@ -47,6 +47,7 @@ impl Extractor {
            body.as_node().append(article_node_ref.clone());
            self.article = Some(doc);
        }
+        Ok(())
    }

    /// Traverses the DOM tree of the content and retrieves the IMG URLs
@ -64,8 +65,11 @@ impl Extractor {
        }
    }

-    pub fn article(&self) -> Option<&NodeRef> {
-        self.article.as_ref()
+    /// Returns the extracted article [NodeRef]. It should only be called *AFTER* calling parse
+    pub fn article(&self) -> &NodeRef {
+        self.article.as_ref().expect(
+            "Article node doesn't exist. This may be because the document has not been parsed",
+        )
    }

    pub fn metadata(&self) -> &MetaData {
@ -160,7 +164,9 @@ mod test {
    #[test]
    fn test_extract_img_urls() {
        let mut extractor = Extractor::from_html(TEST_HTML, "http://example.com/");
-        extractor.extract_content();
+        extractor
+            .extract_content()
+            .expect("Article extraction failed unexpectedly");
        extractor.extract_img_urls();

        assert!(extractor.img_urls.len() > 0);
--- a/src/http.rs
+++ b/src/http.rs
@ -141,8 +141,6 @@ pub async fn download_images(
            let (img_url, img_path, img_mime) = img_item;
            let img_ref = extractor
                .article()
-                .as_mut()
-                .expect("Unable to get mutable ref")
                .select_first(&format!("img[src='{}']", img_url))
                .expect("Image node does not exist");
            let mut img_node = img_ref.attributes.borrow_mut();
--- a/src/main.rs
+++ b/src/main.rs
@ -49,12 +49,12 @@ fn download(app_config: AppConfig) {
                    // println!("Extracting");
                    let mut extractor = Extractor::from_html(&html, &url);
                    bar.set_message("Extracting...");
-                    extractor.extract_content();
-
-                    if extractor.article().is_some() {
+                    match extractor.extract_content() {
+                        Ok(_) => {
                            extractor.extract_img_urls();
                            if let Err(img_errors) =
-                            download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar).await
+                                download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar)
+                                    .await
                            {
                                eprintln!(
                                    "{} image{} failed to download for {}",
@ -65,6 +65,11 @@ fn download(app_config: AppConfig) {
                            }
                            articles.push(extractor);
                        }
+                        Err(mut e) => {
+                            e.set_article_source(&url);
+                            errors.push(e);
+                        }
+                    }
                }
                Err(e) => errors.push(e),
            }
--- a/src/moz_readability/mod.rs
+++ b/src/moz_readability/mod.rs
@ -9,6 +9,8 @@ use kuchiki::{
 };
 use url::Url;

+use crate::errors::{ErrorKind, PaperoniError};
+
 const DEFAULT_CHAR_THRESHOLD: usize = 500;
 const FLAG_STRIP_UNLIKELYS: u32 = 0x1;
 const FLAG_WEIGHT_CLASSES: u32 = 0x2;
@ -76,14 +78,15 @@ impl Readability {
            metadata: MetaData::new(),
        }
    }
-    pub fn parse(&mut self, url: &str) {
+    pub fn parse(&mut self, url: &str) -> Result<(), PaperoniError> {
        self.unwrap_no_script_tags();
        self.remove_scripts();
        self.prep_document();
        self.metadata = self.get_article_metadata();
        self.article_title = self.metadata.title.clone();
-        self.grab_article();
+        self.grab_article()?;
        self.post_process_content(url);
+        Ok(())
    }

    /// Recursively check if node is image, or if node contains exactly only one image
@ -1584,7 +1587,7 @@ impl Readability {

    /// Using a variety of metrics (content score, classname, element types), find the content that is most likely to be the stuff
    /// a user wants to read. Then return it wrapped up in a div.
-    fn grab_article(&mut self) {
+    fn grab_article(&mut self) -> Result<(), PaperoniError> {
        // TODO: Add logging for this
        // println!("Grabbing article");
        // var doc = this._doc;
@ -1593,8 +1596,7 @@ impl Readability {
        let page = self.root_node.select_first("body");
        if page.is_err() {
            // TODO:Have error logging for this
-            println!("Document has no <body>");
-            return;
+            return Err(ErrorKind::ReadabilityError("Document has no <body>".into()).into());
        }
        let page = page.unwrap();
        let mut attempts: Vec<ExtractAttempt> = Vec::new();
@ -2084,8 +2086,10 @@ impl Readability {
                    attempts.push(ExtractAttempt::new(article_content.clone(), text_length));
                    attempts.sort_by(|a, b| b.length.partial_cmp(&a.length).unwrap());
                    if attempts.first().as_ref().unwrap().length == 0 {
-                        println!("Unable to extract content");
-                        break;
+                        return Err(ErrorKind::ReadabilityError(
+                            "Unable to extract content".into(),
+                        )
+                        .into());
                    }
                    article_content = attempts[0].article.clone();
                    parse_successful = true;
@ -2111,7 +2115,7 @@ impl Readability {
                    false
                });
                self.article_node = Some(article_content);
-                return;
+                return Ok(());
            }
        }
    }