paperoni/src/extractor.rs

use itertools::Itertools;
use kuchiki::{traits::*, NodeRef};

use crate::errors::PaperoniError;
use crate::moz_readability::{MetaData, Readability};

/// A resource referenced by an article, as a `(source, extra)` pair.
///
/// Within this file the first element is an image `src` value and the second
/// element is always `None`.
/// NOTE(review): the second element is presumably filled in later (e.g. with
/// the location of a downloaded copy) — confirm against the callers.
pub type ResourceInfo = (String, Option<String>);

/// An article extracted from a single HTML page.
///
/// Instances are created with `Article::from_html`; the content document and
/// the image list are only populated once `extract_content` and
/// `extract_img_urls` have been called.
pub struct Article {
    /// Standalone HTML document wrapping the extracted article. Stays `None`
    /// until `extract_content` finds an article node.
    node_ref_opt: Option<NodeRef>,
    /// Image sources collected by `extract_img_urls`, each stored as
    /// `(src, None)`.
    pub img_urls: Vec<ResourceInfo>,
    /// Readability instance built from the page HTML; it supplies the article
    /// node and the metadata.
    readability: Readability,
    /// URL given to `from_html`; it is passed to `Readability::parse` during
    /// content extraction.
    pub url: String,
}

impl Article {
    /// Create a new instance of an HTML extractor given an HTML string
    pub fn from_html(html_str: &str, url: &str) -> Self {
        Self {
            node_ref_opt: None,
            img_urls: Vec::new(),
            readability: Readability::new(html_str),
            url: url.to_string(),
        }
    }

    /// Locates and extracts the HTML in a document which is determined to be
    /// the source of the content
    pub fn extract_content(&mut self) -> Result<(), PaperoniError> {
        self.readability.parse(&self.url)?;
        if let Some(article_node_ref) = &self.readability.article_node {
            let template = r#"
            <!DOCTYPE html>
            <html>
                <head>
                    <link rel="stylesheet" href="stylesheet.css" type="text/css"></link>
                </head>
                <body>
                </body>
            </html>
            "#;
            let doc = kuchiki::parse_html().one(template);
            let body = doc.select_first("body").unwrap();
            body.as_node().append(article_node_ref.clone());
            self.node_ref_opt = Some(doc);
        }
        Ok(())
    }

    /// Traverses the DOM tree of the content and retrieves the IMG URLs
    pub fn extract_img_urls(&mut self) {
        if let Some(content_ref) = &self.node_ref_opt {
            self.img_urls = content_ref
                .select("img")
                .unwrap()
                .filter_map(|img_ref| {
                    let attrs = img_ref.attributes.borrow();
                    attrs
                        .get("src")
                        .filter(|val| !(val.is_empty() || val.starts_with("data:image")))
                        .map(ToString::to_string)
                })
                .unique()
                .map(|val| (val, None))
                .collect();
        }
    }

    /// Returns the extracted article [NodeRef]. It should only be called *AFTER* calling parse
    pub fn node_ref(&self) -> &NodeRef {
        self.node_ref_opt.as_ref().expect(
            "Article node doesn't exist. This may be because the document has not been parsed",
        )
    }

    pub fn metadata(&self) -> &MetaData {
        &self.readability.metadata
    }
}

#[cfg(test)]
mod test {
    use super::*;

    /// Sample page with a header, a footer and an article that holds one
    /// relative image and one inline `data:` image.
    const TEST_HTML: &str = r#"
        <!doctype html>
        <html lang="en">
            <head>
                <meta charset="utf-8">
                <meta name="description" content="A sample document">
                <meta name="keywords" content="test,Rust">
                <meta name="author" content="Paperoni">                
                <title>Testing Paperoni</title>
            </head>
            <body>
                <header>
                <!-- Unimportant information -->
                    <h1>Testing Paperoni</h1>
                </header>
                <article>
                    <h1>Starting out</h1>
                    <p>Some Lorem Ipsum text here</p>
                    <p>Observe this picture</p>
                    <img src="./img.jpg" alt="Random image">
                    <img src="data:image/png;base64,lJGWEIUQOIQWIDYVIVEDYFOUYQFWD">
                </article>
                <footer>
                    <p>Made in HTML</p>
                </footer>
            </body>
        </html>
        "#;

    /// Only the regular image may be reported: the `data:` image has to be
    /// skipped and the relative `src` is expected back as an absolute URL.
    #[test]
    fn test_extract_img_urls() {
        let mut article = Article::from_html(TEST_HTML, "http://example.com/");
        article
            .extract_content()
            .expect("Article extraction failed unexpectedly");
        article.extract_img_urls();

        assert!(!article.img_urls.is_empty());
        assert_eq!(
            vec![("http://example.com/img.jpg".to_string(), None)],
            article.img_urls
        );
    }
}
fix: fix ordering issue with merged articles This commit adds the itertools crate which is used to dedup the Vec when downloading urls fix: fix error message feat: change the serif and mono fonts declarations 2021-06-10 18:16:31 +01:00			`use itertools::Itertools;`
Add template for epub output Change output format to name file with the title name Add getters in MetaData 2020-10-22 11:55:02 +01:00			`use kuchiki::{traits::*, NodeRef};`
Factor out text extraction into extractor module 2020-05-01 14:17:59 +01:00
Add custom error types and ignore failed image downloads Using this custom error type, many instances of unwrap are replaced with mapping to errors that are then logged in main.rs. This allows paperoni to stop crashing when downloading articles when the errors are possibly recoverable or should not affect other downloads. This subsequently introduces ignoring the failed image downloads and instead leaving the original URLs intact. 2021-04-17 10:04:06 +01:00			`use crate::errors::PaperoniError;`
Add template for epub output Change output format to name file with the title name Add getters in MetaData 2020-10-22 11:55:02 +01:00			`use crate::moz_readability::{MetaData, Readability};`
Merge the readability module with the rest of the extractor 2020-10-22 10:12:30 +01:00
Change download code to save images to a folder Add downloaded images to the output epub file 2020-05-05 10:24:11 +01:00			`pub type ResourceInfo = (String, Option<String>);`

refactor: rename `Extractor` to `Article` 2021-07-24 10:43:40 +01:00			`pub struct Article {`
			`node_ref_opt: Option<NodeRef>,`
Change download code to save images to a folder Add downloaded images to the output epub file 2020-05-05 10:24:11 +01:00			`pub img_urls: Vec<ResourceInfo>,`
Merge the readability module with the rest of the extractor 2020-10-22 10:12:30 +01:00			`readability: Readability,`
Add url field in Extractor struct 2021-04-20 19:06:54 +01:00			`pub url: String,`
Factor out text extraction into extractor module 2020-05-01 14:17:59 +01:00			`}`

refactor: rename `Extractor` to `Article` 2021-07-24 10:43:40 +01:00			`impl Article {`
Factor out text extraction into extractor module 2020-05-01 14:17:59 +01:00			`/// Create a new instance of an HTML extractor given an HTML string`
Add url field in Extractor struct 2021-04-20 19:06:54 +01:00			`pub fn from_html(html_str: &str, url: &str) -> Self {`
refactor: rename `Extractor` to `Article` 2021-07-24 10:43:40 +01:00			`Self {`
			`node_ref_opt: None,`
Add image download functionality 2020-05-02 16:33:45 +01:00			`img_urls: Vec::new(),`
Merge the readability module with the rest of the extractor 2020-10-22 10:12:30 +01:00			`readability: Readability::new(html_str),`
Add url field in Extractor struct 2021-04-20 19:06:54 +01:00			`url: url.to_string(),`
Factor out text extraction into extractor module 2020-05-01 14:17:59 +01:00			`}`
			`}`

Change signature of `extract_content` to copy the reference to article DOM node instead of writing to file 2020-05-02 12:51:53 +01:00			`/// Locates and extracts the HTML in a document which is determined to be`
			`/// the source of the content`
Refactor `grab_article` to return a Result - Add ReadabilityError field - Refactor `article` getter in Extractor to return a &NodeRef. This relies on the assumption that the article has already been parsed and should otherwise panic. 2021-04-21 17:07:08 +01:00			`pub fn extract_content(&mut self) -> Result<(), PaperoniError> {`
			`self.readability.parse(&self.url)?;`
Add template for epub output Change output format to name file with the title name Add getters in MetaData 2020-10-22 11:55:02 +01:00			`if let Some(article_node_ref) = &self.readability.article_node {`
			`let template = r#"`
feat: add HTML exports with inlining of images fix: typo fix refactor: refactor `add_stylesheets` function 2021-07-24 10:03:36 +01:00			`<!DOCTYPE html>`
			`<html>`
Add template for epub output Change output format to name file with the title name Add getters in MetaData 2020-10-22 11:55:02 +01:00			`<head>`
feat: add css library for EPUB exports 2021-06-09 06:04:50 +01:00			`<link rel="stylesheet" href="stylesheet.css" type="text/css"></link>`
Add template for epub output Change output format to name file with the title name Add getters in MetaData 2020-10-22 11:55:02 +01:00			`</head>`
			`<body>`
			`</body>`
			`</html>`
			`"#;`
			`let doc = kuchiki::parse_html().one(template);`
			`let body = doc.select_first("body").unwrap();`
			`body.as_node().append(article_node_ref.clone());`
refactor: rename `Extractor` to `Article` 2021-07-24 10:43:40 +01:00			`self.node_ref_opt = Some(doc);`
Add template for epub output Change output format to name file with the title name Add getters in MetaData 2020-10-22 11:55:02 +01:00			`}`
Refactor `grab_article` to return a Result - Add ReadabilityError field - Refactor `article` getter in Extractor to return a &NodeRef. This relies on the assumption that the article has already been parsed and should otherwise panic. 2021-04-21 17:07:08 +01:00			`Ok(())`
Change signature of `extract_content` to copy the reference to article DOM node instead of writing to file 2020-05-02 12:51:53 +01:00			`}`
Factor out text extraction into extractor module 2020-05-01 14:17:59 +01:00
Add image download functionality 2020-05-02 16:33:45 +01:00			`/// Traverses the DOM tree of the content and retrieves the IMG URLs`
Add http and epub modules 2021-02-06 09:59:03 +00:00			`pub fn extract_img_urls(&mut self) {`
refactor: rename `Extractor` to `Article` 2021-07-24 10:43:40 +01:00			`if let Some(content_ref) = &self.node_ref_opt {`
fix: fix ordering issue with merged articles This commit adds the itertools crate which is used to dedup the Vec when downloading urls fix: fix error message feat: change the serif and mono fonts declarations 2021-06-10 18:16:31 +01:00			`self.img_urls = content_ref`
			`.select("img")`
			`.unwrap()`
			`.filter_map(\|img_ref\| {`
			`let attrs = img_ref.attributes.borrow();`
			`attrs`
			`.get("src")`
			`.filter(\|val\| !(val.is_empty() \|\| val.starts_with("data:image")))`
			`.map(ToString::to_string)`
			`})`
			`.unique()`
			`.map(\|val\| (val, None))`
			`.collect();`
Factor out text extraction into extractor module 2020-05-01 14:17:59 +01:00			`}`
			`}`
Add image download functionality 2020-05-02 16:33:45 +01:00
Refactor `grab_article` to return a Result - Add ReadabilityError field - Refactor `article` getter in Extractor to return a &NodeRef. This relies on the assumption that the article has already been parsed and should otherwise panic. 2021-04-21 17:07:08 +01:00			`/// Returns the extracted article [NodeRef]. It should only be called AFTER calling parse`
refactor: rename `Extractor` to `Article` 2021-07-24 10:43:40 +01:00			`pub fn node_ref(&self) -> &NodeRef {`
			`self.node_ref_opt.as_ref().expect(`
Refactor `grab_article` to return a Result - Add ReadabilityError field - Refactor `article` getter in Extractor to return a &NodeRef. This relies on the assumption that the article has already been parsed and should otherwise panic. 2021-04-21 17:07:08 +01:00			`"Article node doesn't exist. This may be because the document has not been parsed",`
			`)`
Add template for epub output Change output format to name file with the title name Add getters in MetaData 2020-10-22 11:55:02 +01:00			`}`

			`pub fn metadata(&self) -> &MetaData {`
			`&self.readability.metadata`
Merge the readability module with the rest of the extractor 2020-10-22 10:12:30 +01:00			`}`
Factor out text extraction into extractor module 2020-05-01 14:17:59 +01:00			`}`

			`#[cfg(test)]`
			`mod test {`
			`use super::*;`
			`const TEST_HTML: &'static str = r#"`
			`<!doctype html>`
			`<html lang="en">`
			`<head>`
			`<meta charset="utf-8">`
			`<meta name="description" content="A sample document">`
			`<meta name="keywords" content="test,Rust">`
			`<meta name="author" content="Paperoni">`
			`<title>Testing Paperoni</title>`
			`</head>`
			`<body>`
			`<header>`
			`<!-- Unimportant information -->`
			`<h1>Testing Paperoni</h1>`
			`</header>`
			`<article>`
			`<h1>Starting out</h1>`
			`<p>Some Lorem Ipsum text here</p>`
			`<p>Observe this picture</p>`
Fix test data 2020-05-05 10:29:08 +01:00			`<img src="./img.jpg" alt="Random image">`
Bug fixes - Prevent downloading images with base64 strings as the source - Add escaping of quotation characters in the serializer - Disable redirects when downloading images which fails on multiple sites - Remove invalid characters for making the epub export file name - Fix version number in release 2020-12-24 09:16:30 +00:00			`<img src="data:image/png;base64,lJGWEIUQOIQWIDYVIVEDYFOUYQFWD">`
Factor out text extraction into extractor module 2020-05-01 14:17:59 +01:00			`</article>`
			`<footer>`
			`<p>Made in HTML</p>`
			`</footer>`
			`</body>`
			`</html>`
			`"#;`

Add image download functionality 2020-05-02 16:33:45 +01:00			`#[test]`
			`fn test_extract_img_urls() {`
refactor: rename `Extractor` to `Article` 2021-07-24 10:43:40 +01:00			`let mut article = Article::from_html(TEST_HTML, "http://example.com/");`
			`article`
Refactor `grab_article` to return a Result - Add ReadabilityError field - Refactor `article` getter in Extractor to return a &NodeRef. This relies on the assumption that the article has already been parsed and should otherwise panic. 2021-04-21 17:07:08 +01:00			`.extract_content()`
			`.expect("Article extraction failed unexpectedly");`
refactor: rename `Extractor` to `Article` 2021-07-24 10:43:40 +01:00			`article.extract_img_urls();`
Add image download functionality 2020-05-02 16:33:45 +01:00
refactor: rename `Extractor` to `Article` 2021-07-24 10:43:40 +01:00			`assert!(article.img_urls.len() > 0);`
Merge the readability module with the rest of the extractor 2020-10-22 10:12:30 +01:00			`assert_eq!(`
			`vec![("http://example.com/img.jpg".to_string(), None)],`
refactor: rename `Extractor` to `Article` 2021-07-24 10:43:40 +01:00			`article.img_urls`
Merge the readability module with the rest of the extractor 2020-10-22 10:12:30 +01:00			`);`
Add image download functionality 2020-05-02 16:33:45 +01:00			`}`
Factor out text extraction into extractor module 2020-05-01 14:17:59 +01:00			`}`