paperoni/src/extractor.rs

use std::collections::HashMap;

use kuchiki::{traits::*, NodeRef};

use crate::errors::PaperoniError;
use crate::moz_readability::{MetaData, Readability};

pub type ResourceInfo = (String, Option<String>);

lazy_static! {
    static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r#"(&|<|>|'|")"#).unwrap();
}

pub struct Extractor {
    article: Option<NodeRef>,
    pub img_urls: Vec<ResourceInfo>,
    readability: Readability,
    pub url: String,
}

impl Extractor {
    /// Create a new instance of an HTML extractor given an HTML string
    pub fn from_html(html_str: &str, url: &str) -> Self {
        Extractor {
            article: None,
            img_urls: Vec::new(),
            readability: Readability::new(html_str),
            url: url.to_string(),
        }
    }

    /// Locates and extracts the HTML in a document which is determined to be
    /// the source of the content
    pub fn extract_content(&mut self) -> Result<(), PaperoniError> {
        self.readability.parse(&self.url)?;
        if let Some(article_node_ref) = &self.readability.article_node {
            let template = r#"
            <html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
                <head>
                </head>
                <body>
                </body>
            </html>
            "#;
            let doc = kuchiki::parse_html().one(template);
            let body = doc.select_first("body").unwrap();
            body.as_node().append(article_node_ref.clone());
            self.article = Some(doc);
        }
        Ok(())
    }

    /// Traverses the DOM tree of the content and retrieves the IMG URLs
    pub fn extract_img_urls(&mut self) {
        if let Some(content_ref) = &self.article {
            for img_ref in content_ref.select("img").unwrap() {
                img_ref.as_node().as_element().map(|img_elem| {
                    img_elem.attributes.borrow().get("src").map(|img_url| {
                        if !(img_url.is_empty() || img_url.starts_with("data:image")) {
                            self.img_urls.push((img_url.to_string(), None))
                        }
                    })
                });
            }
        }
    }

    /// Returns the extracted article [NodeRef]. It should only be called *AFTER* calling parse
    pub fn article(&self) -> &NodeRef {
        self.article.as_ref().expect(
            "Article node doesn't exist. This may be because the document has not been parsed",
        )
    }

    pub fn metadata(&self) -> &MetaData {
        &self.readability.metadata
    }
}

/// Serializes a NodeRef to a string that is XHTML compatible
/// The only DOM nodes serialized are Text and Element nodes
pub fn serialize_to_xhtml<W: std::io::Write>(
    node_ref: &NodeRef,
    mut w: &mut W,
) -> Result<(), PaperoniError> {
    let mut escape_map = HashMap::new();
    escape_map.insert("<", "&lt;");
    escape_map.insert(">", "&gt;");
    escape_map.insert("&", "&amp;");
    escape_map.insert("\"", "&quot;");
    escape_map.insert("'", "&apos;");
    for edge in node_ref.traverse_inclusive() {
        match edge {
            kuchiki::iter::NodeEdge::Start(n) => match n.data() {
                kuchiki::NodeData::Text(rc_text) => {
                    let text = rc_text.borrow();
                    let esc_text = ESC_SEQ_REGEX
                        .replace_all(&text, |captures: &regex::Captures| escape_map[&captures[1]]);
                    write!(&mut w, "{}", esc_text)?;
                }
                kuchiki::NodeData::Element(elem_data) => {
                    let attrs = elem_data.attributes.borrow();
                    let attrs_str = attrs
                        .map
                        .iter()
                        .filter(|(k, _)| !k.local.contains("\""))
                        .map(|(k, v)| {
                            format!(
                                "{}=\"{}\"",
                                k.local,
                                ESC_SEQ_REGEX
                                    .replace_all(&v.value, |captures: &regex::Captures| {
                                        escape_map[&captures[1]]
                                    })
                            )
                        })
                        .fold("".to_string(), |acc, val| acc + " " + &val);
                    write!(&mut w, "<{}{}>", &elem_data.name.local, attrs_str)?;
                }
                _ => (),
            },
            kuchiki::iter::NodeEdge::End(n) => match n.data() {
                kuchiki::NodeData::Element(elem_data) => {
                    write!(&mut w, "</{}>", &elem_data.name.local)?;
                }
                _ => (),
            },
        }
    }
    Ok(())
}

#[cfg(test)]
mod test {
    use super::*;
    const TEST_HTML: &'static str = r#"
        <!doctype html>
        <html lang="en">
            <head>
                <meta charset="utf-8">
                <meta name="description" content="A sample document">
                <meta name="keywords" content="test,Rust">
                <meta name="author" content="Paperoni">                
                <title>Testing Paperoni</title>
            </head>
            <body>
                <header>
                <!-- Unimportant information -->
                    <h1>Testing Paperoni</h1>
                </header>
                <article>
                    <h1>Starting out</h1>
                    <p>Some Lorem Ipsum text here</p>
                    <p>Observe this picture</p>
                    <img src="./img.jpg" alt="Random image">
                    <img src="data:image/png;base64,lJGWEIUQOIQWIDYVIVEDYFOUYQFWD">
                </article>
                <footer>
                    <p>Made in HTML</p>
                </footer>
            </body>
        </html>
        "#;

    #[test]
    fn test_extract_img_urls() {
        let mut extractor = Extractor::from_html(TEST_HTML, "http://example.com/");
        extractor
            .extract_content()
            .expect("Article extraction failed unexpectedly");
        extractor.extract_img_urls();

        assert!(extractor.img_urls.len() > 0);
        assert_eq!(
            vec![("http://example.com/img.jpg".to_string(), None)],
            extractor.img_urls
        );
    }
}
Add custom serializer for XHTML 2020-11-24 11:54:23 +00:00			`use std::collections::HashMap;`

Add template for epub output Change output format to name file with the title name Add getters in MetaData 2020-10-22 11:55:02 +01:00			`use kuchiki::{traits::*, NodeRef};`
Factor out text extraction into extractor module 2020-05-01 14:17:59 +01:00
Add custom error types and ignore failed image downloads Using this custom error type, many instances of unwrap are replaced with mapping to errors that are then logged in main.rs. This allows paperoni to stop crashing when downloading articles when the errors are possibly recoverable or should not affect other downloads. This subsequently introduces ignoring the failed image downloads and instead leaving the original URLs intact. 2021-04-17 10:04:06 +01:00			`use crate::errors::PaperoniError;`
Add template for epub output Change output format to name file with the title name Add getters in MetaData 2020-10-22 11:55:02 +01:00			`use crate::moz_readability::{MetaData, Readability};`
Merge the readability module with the rest of the extractor 2020-10-22 10:12:30 +01:00
Change download code to save images to a folder Add downloaded images to the output epub file 2020-05-05 10:24:11 +01:00			`pub type ResourceInfo = (String, Option<String>);`

Add custom serializer for XHTML 2020-11-24 11:54:23 +00:00			`lazy_static! {`
Bug fixes - Prevent downloading images with base64 strings as the source - Add escaping of quotation characters in the serializer - Disable redirects when downloading images which fails on multiple sites - Remove invalid characters for making the epub export file name - Fix version number in release 2020-12-24 09:16:30 +00:00			`static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r#"(&\|<\|>\|'\|")"#).unwrap();`
Add custom serializer for XHTML 2020-11-24 11:54:23 +00:00			`}`

Factor out text extraction into extractor module 2020-05-01 14:17:59 +01:00			`pub struct Extractor {`
Add template for epub output Change output format to name file with the title name Add getters in MetaData 2020-10-22 11:55:02 +01:00			`article: Option<NodeRef>,`
Change download code to save images to a folder Add downloaded images to the output epub file 2020-05-05 10:24:11 +01:00			`pub img_urls: Vec<ResourceInfo>,`
Merge the readability module with the rest of the extractor 2020-10-22 10:12:30 +01:00			`readability: Readability,`
Add url field in Extractor struct 2021-04-20 19:06:54 +01:00			`pub url: String,`
Factor out text extraction into extractor module 2020-05-01 14:17:59 +01:00			`}`

			`impl Extractor {`
			`/// Create a new instance of an HTML extractor given an HTML string`
Add url field in Extractor struct 2021-04-20 19:06:54 +01:00			`pub fn from_html(html_str: &str, url: &str) -> Self {`
Factor out text extraction into extractor module 2020-05-01 14:17:59 +01:00			`Extractor {`
Add template for epub output Change output format to name file with the title name Add getters in MetaData 2020-10-22 11:55:02 +01:00			`article: None,`
Add image download functionality 2020-05-02 16:33:45 +01:00			`img_urls: Vec::new(),`
Merge the readability module with the rest of the extractor 2020-10-22 10:12:30 +01:00			`readability: Readability::new(html_str),`
Add url field in Extractor struct 2021-04-20 19:06:54 +01:00			`url: url.to_string(),`
Factor out text extraction into extractor module 2020-05-01 14:17:59 +01:00			`}`
			`}`

Change signature of `extract_content` to copy the reference to article DOM node instead of writing to file 2020-05-02 12:51:53 +01:00			`/// Locates and extracts the HTML in a document which is determined to be`
			`/// the source of the content`
Refactor `grab_article` to return a Result - Add ReadabilityError field - Refactor `article` getter in Extractor to return a &NodeRef. This relies on the assumption that the article has already been parsed and should otherwise panic. 2021-04-21 17:07:08 +01:00			`pub fn extract_content(&mut self) -> Result<(), PaperoniError> {`
			`self.readability.parse(&self.url)?;`
Add template for epub output Change output format to name file with the title name Add getters in MetaData 2020-10-22 11:55:02 +01:00			`if let Some(article_node_ref) = &self.readability.article_node {`
			`let template = r#"`
			`<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">`
			`<head>`
			`</head>`
			`<body>`
			`</body>`
			`</html>`
			`"#;`
			`let doc = kuchiki::parse_html().one(template);`
			`let body = doc.select_first("body").unwrap();`
			`body.as_node().append(article_node_ref.clone());`
			`self.article = Some(doc);`
			`}`
Refactor `grab_article` to return a Result - Add ReadabilityError field - Refactor `article` getter in Extractor to return a &NodeRef. This relies on the assumption that the article has already been parsed and should otherwise panic. 2021-04-21 17:07:08 +01:00			`Ok(())`
Change signature of `extract_content` to copy the reference to article DOM node instead of writing to file 2020-05-02 12:51:53 +01:00			`}`
Factor out text extraction into extractor module 2020-05-01 14:17:59 +01:00
Add image download functionality 2020-05-02 16:33:45 +01:00			`/// Traverses the DOM tree of the content and retrieves the IMG URLs`
Add http and epub modules 2021-02-06 09:59:03 +00:00			`pub fn extract_img_urls(&mut self) {`
			`if let Some(content_ref) = &self.article {`
Merge the readability module with the rest of the extractor 2020-10-22 10:12:30 +01:00			`for img_ref in content_ref.select("img").unwrap() {`
Add image download functionality 2020-05-02 16:33:45 +01:00			`img_ref.as_node().as_element().map(\|img_elem\| {`
			`img_elem.attributes.borrow().get("src").map(\|img_url\| {`
Bug fixes - Prevent downloading images with base64 strings as the source - Add escaping of quotation characters in the serializer - Disable redirects when downloading images which fails on multiple sites - Remove invalid characters for making the epub export file name - Fix version number in release 2020-12-24 09:16:30 +00:00			`if !(img_url.is_empty() \|\| img_url.starts_with("data:image")) {`
Change download code to save images to a folder Add downloaded images to the output epub file 2020-05-05 10:24:11 +01:00			`self.img_urls.push((img_url.to_string(), None))`
Add image download functionality 2020-05-02 16:33:45 +01:00			`}`
			`})`
			`});`
Factor out text extraction into extractor module 2020-05-01 14:17:59 +01:00			`}`
			`}`
			`}`
Add image download functionality 2020-05-02 16:33:45 +01:00
Refactor `grab_article` to return a Result - Add ReadabilityError field - Refactor `article` getter in Extractor to return a &NodeRef. This relies on the assumption that the article has already been parsed and should otherwise panic. 2021-04-21 17:07:08 +01:00			`/// Returns the extracted article [NodeRef]. It should only be called AFTER calling parse`
			`pub fn article(&self) -> &NodeRef {`
			`self.article.as_ref().expect(`
			`"Article node doesn't exist. This may be because the document has not been parsed",`
			`)`
Add template for epub output Change output format to name file with the title name Add getters in MetaData 2020-10-22 11:55:02 +01:00			`}`

			`pub fn metadata(&self) -> &MetaData {`
			`&self.readability.metadata`
Merge the readability module with the rest of the extractor 2020-10-22 10:12:30 +01:00			`}`
Factor out text extraction into extractor module 2020-05-01 14:17:59 +01:00			`}`

Add custom serializer for XHTML 2020-11-24 11:54:23 +00:00			`/// Serializes a NodeRef to a string that is XHTML compatible`
			`/// The only DOM nodes serialized are Text and Element nodes`
			`pub fn serialize_to_xhtml<W: std::io::Write>(`
			`node_ref: &NodeRef,`
			`mut w: &mut W,`
Add custom error types and ignore failed image downloads Using this custom error type, many instances of unwrap are replaced with mapping to errors that are then logged in main.rs. This allows paperoni to stop crashing when downloading articles when the errors are possibly recoverable or should not affect other downloads. This subsequently introduces ignoring the failed image downloads and instead leaving the original URLs intact. 2021-04-17 10:04:06 +01:00			`) -> Result<(), PaperoniError> {`
Add custom serializer for XHTML 2020-11-24 11:54:23 +00:00			`let mut escape_map = HashMap::new();`
			`escape_map.insert("<", "<");`
			`escape_map.insert(">", ">");`
			`escape_map.insert("&", "&");`
Bug fixes - Prevent downloading images with base64 strings as the source - Add escaping of quotation characters in the serializer - Disable redirects when downloading images which fails on multiple sites - Remove invalid characters for making the epub export file name - Fix version number in release 2020-12-24 09:16:30 +00:00			`escape_map.insert("\"", """);`
			`escape_map.insert("'", "'");`
Add custom serializer for XHTML 2020-11-24 11:54:23 +00:00			`for edge in node_ref.traverse_inclusive() {`
			`match edge {`
			`kuchiki::iter::NodeEdge::Start(n) => match n.data() {`
			`kuchiki::NodeData::Text(rc_text) => {`
			`let text = rc_text.borrow();`
			`let esc_text = ESC_SEQ_REGEX`
			`.replace_all(&text, \|captures: &regex::Captures\| escape_map[&captures[1]]);`
			`write!(&mut w, "{}", esc_text)?;`
			`}`
			`kuchiki::NodeData::Element(elem_data) => {`
			`let attrs = elem_data.attributes.borrow();`
			`let attrs_str = attrs`
			`.map`
			`.iter()`
fix: minor fixes - restore default debug level when logging to file - return early from generating epubs if there are no articles - fix serialization bug in creating attributes 2021-06-09 05:26:52 +01:00			`.filter(\|(k, _)\| !k.local.contains("\""))`
Add custom serializer for XHTML 2020-11-24 11:54:23 +00:00			`.map(\|(k, v)\| {`
			`format!(`
			`"{}=\"{}\"",`
			`k.local,`
			`ESC_SEQ_REGEX`
			`.replace_all(&v.value, \|captures: &regex::Captures\| {`
			`escape_map[&captures[1]]`
			`})`
			`)`
			`})`
			`.fold("".to_string(), \|acc, val\| acc + " " + &val);`
			`write!(&mut w, "<{}{}>", &elem_data.name.local, attrs_str)?;`
			`}`
			`_ => (),`
			`},`
			`kuchiki::iter::NodeEdge::End(n) => match n.data() {`
			`kuchiki::NodeData::Element(elem_data) => {`
			`write!(&mut w, "</{}>", &elem_data.name.local)?;`
			`}`
			`_ => (),`
			`},`
			`}`
			`}`
			`Ok(())`
			`}`

Factor out text extraction into extractor module 2020-05-01 14:17:59 +01:00			`#[cfg(test)]`
			`mod test {`
			`use super::*;`
			`const TEST_HTML: &'static str = r#"`
			`<!doctype html>`
			`<html lang="en">`
			`<head>`
			`<meta charset="utf-8">`
			`<meta name="description" content="A sample document">`
			`<meta name="keywords" content="test,Rust">`
			`<meta name="author" content="Paperoni">`
			`<title>Testing Paperoni</title>`
			`</head>`
			`<body>`
			`<header>`
			`<!-- Unimportant information -->`
			`<h1>Testing Paperoni</h1>`
			`</header>`
			`<article>`
			`<h1>Starting out</h1>`
			`<p>Some Lorem Ipsum text here</p>`
			`<p>Observe this picture</p>`
Fix test data 2020-05-05 10:29:08 +01:00			`<img src="./img.jpg" alt="Random image">`
Bug fixes - Prevent downloading images with base64 strings as the source - Add escaping of quotation characters in the serializer - Disable redirects when downloading images which fails on multiple sites - Remove invalid characters for making the epub export file name - Fix version number in release 2020-12-24 09:16:30 +00:00			`<img src="data:image/png;base64,lJGWEIUQOIQWIDYVIVEDYFOUYQFWD">`
Factor out text extraction into extractor module 2020-05-01 14:17:59 +01:00			`</article>`
			`<footer>`
			`<p>Made in HTML</p>`
			`</footer>`
			`</body>`
			`</html>`
			`"#;`

Add image download functionality 2020-05-02 16:33:45 +01:00			`#[test]`
			`fn test_extract_img_urls() {`
Add url field in Extractor struct 2021-04-20 19:06:54 +01:00			`let mut extractor = Extractor::from_html(TEST_HTML, "http://example.com/");`
Refactor `grab_article` to return a Result - Add ReadabilityError field - Refactor `article` getter in Extractor to return a &NodeRef. This relies on the assumption that the article has already been parsed and should otherwise panic. 2021-04-21 17:07:08 +01:00			`extractor`
			`.extract_content()`
			`.expect("Article extraction failed unexpectedly");`
Add image download functionality 2020-05-02 16:33:45 +01:00			`extractor.extract_img_urls();`

			`assert!(extractor.img_urls.len() > 0);`
Merge the readability module with the rest of the extractor 2020-10-22 10:12:30 +01:00			`assert_eq!(`
			`vec![("http://example.com/img.jpg".to_string(), None)],`
			`extractor.img_urls`
			`);`
Add image download functionality 2020-05-02 16:33:45 +01:00			`}`
Factor out text extraction into extractor module 2020-05-01 14:17:59 +01:00			`}`