paperoni/src/epub.rs

use std::collections::HashMap;
use std::fs::File;

use comfy_table::{Attribute, Cell, CellAlignment, Color, ContentArrangement, Table};
use epub_builder::{EpubBuilder, EpubContent, TocElement, ZipLibrary};
use html5ever::tendril::fmt::Slice;
use indicatif::{ProgressBar, ProgressStyle};
use kuchiki::NodeRef;
use log::{debug, error, info};

use crate::{cli::AppConfig, errors::PaperoniError, extractor::Article};

lazy_static! {
    static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r#"(&|<|>|'|")"#).unwrap();
    static ref VALID_ATTR_CHARS_REGEX: regex::Regex = regex::Regex::new(r#"[a-z0-9\-_:]"#).unwrap();
}

pub fn generate_epubs(
    articles: Vec<Article>,
    app_config: &AppConfig,
    successful_articles_table: &mut Table,
) -> Result<(), Vec<PaperoniError>> {
    if articles.is_empty() {
        return Ok(());
    }

    let bar = if app_config.can_disable_progress_bar {
        ProgressBar::hidden()
    } else {
        let enabled_bar = ProgressBar::new(articles.len() as u64);
        let style = ProgressStyle::default_bar().template(
            "{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} epub {pos}/{len:7} {msg:.green}",
        );
        enabled_bar.set_style(style);
        if !articles.is_empty() {
            enabled_bar.set_message("Generating epubs");
        }
        enabled_bar
    };

    let mut errors: Vec<PaperoniError> = Vec::new();

    match app_config.merged {
        Some(ref name) => {
            successful_articles_table.set_header(vec![Cell::new("Table of Contents")
                .add_attribute(Attribute::Bold)
                .set_alignment(CellAlignment::Center)
                .fg(Color::Green)]);

            let mut epub = match EpubBuilder::new(match ZipLibrary::new() {
                Ok(zip_library) => zip_library,
                Err(err) => {
                    let mut paperoni_err: PaperoniError = err.into();
                    paperoni_err.set_article_source(name);
                    errors.push(paperoni_err);
                    return Err(errors);
                }
            }) {
                Ok(epub) => epub,
                Err(err) => {
                    let mut paperoni_err: PaperoniError = err.into();
                    paperoni_err.set_article_source(name);
                    errors.push(paperoni_err);
                    return Err(errors);
                }
            };
            debug!("Creating {:?}", name);

            if app_config.inline_toc {
                epub.inline_toc();
            }

            match add_stylesheets(&mut epub, app_config) {
                Ok(_) => (),
                Err(e) => {
                    error!("Unable to add stylesheets to epub file");
                    let mut paperoni_err: PaperoniError = e.into();
                    paperoni_err.set_article_source(name);
                    errors.push(paperoni_err);
                    return Err(errors);
                }
            }
            articles
                .iter()
                .enumerate()
                .fold(&mut epub, |epub, (idx, article)| {
                    let mut article_result = || -> Result<(), PaperoniError> {
                        let content_url = format!("article_{}.xhtml", idx);
                        let mut xhtml_buf = Vec::new();
                        let header_level_tocs =
                            get_header_level_toc_vec(&content_url, article.node_ref());

                        serialize_to_xhtml(article.node_ref(), &mut xhtml_buf)?;
                        let xhtml_str = std::str::from_utf8(&xhtml_buf)?;
                        let section_name = article.metadata().title();
                        let mut content = EpubContent::new(&content_url, xhtml_str.as_bytes())
                            .title(replace_escaped_characters(section_name));

                        for toc_element in header_level_tocs {
                            content = content.child(toc_element);
                        }

                        epub.metadata("title", replace_escaped_characters(name))?;
                        epub.add_content(content)?;
                        info!("Adding images for {:?}", name);
                        article.img_urls.iter().for_each(|img| {
                            // TODO: Add error handling and return errors as a vec
                            let mut file_path = std::env::temp_dir();
                            file_path.push(&img.0);

                            let img_buf = File::open(&file_path).expect("Can't read file");
                            epub.add_resource(
                                file_path.file_name().unwrap(),
                                img_buf,
                                img.1.as_ref().unwrap(),
                            )
                            .unwrap();
                        });
                        info!("Added images for {:?}", name);
                        Ok(())
                    };
                    if let Err(mut error) = article_result() {
                        error.set_article_source(&article.url);
                        errors.push(error);
                    }
                    bar.inc(1);
                    successful_articles_table.add_row(vec![article.metadata().title()]);
                    epub
                });
            let appendix = generate_appendix(articles.iter().collect());
            if let Err(err) = epub.add_content(
                EpubContent::new("appendix.xhtml", appendix.as_bytes())
                    .title(replace_escaped_characters("Article Sources")),
            ) {
                let mut paperoni_err: PaperoniError = err.into();
                paperoni_err.set_article_source(&name);
                errors.push(paperoni_err);
                return Err(errors);
            }

            let mut out_file = File::create(&name).unwrap();
            match epub.generate(&mut out_file) {
                Ok(_) => (),
                Err(err) => {
                    let mut paperoni_err: PaperoniError = err.into();
                    paperoni_err.set_article_source(&name);
                    errors.push(paperoni_err);
                    error!("Failed to generate epub: {}", name);
                    bar.finish_with_message("epub generation failed\n");
                    return Err(errors);
                }
            }

            bar.finish_with_message("Generated epub\n");
            debug!("Created {:?}", name);
            println!("Created {:?}", name);
        }
        None => {
            successful_articles_table
                .set_header(vec![Cell::new("Downloaded articles")
                    .add_attribute(Attribute::Bold)
                    .set_alignment(CellAlignment::Center)
                    .fg(Color::Green)])
                .set_content_arrangement(ContentArrangement::Dynamic);

            for article in &articles {
                let mut result = || -> Result<(), PaperoniError> {
                    let mut epub = EpubBuilder::new(ZipLibrary::new()?)?;
                    let file_name = format!(
                        "{}/{}.epub",
                        app_config.output_directory.as_deref().unwrap_or("."),
                        article
                            .metadata()
                            .title()
                            .replace("/", " ")
                            .replace("\\", " ")
                    );
                    debug!("Creating {:?}", file_name);
                    let mut out_file = File::create(&file_name).unwrap();
                    let mut xhtml_buf = Vec::new();
                    let header_level_tocs =
                        get_header_level_toc_vec("index.xhtml", article.node_ref());
                    serialize_to_xhtml(article.node_ref(), &mut xhtml_buf)
                        .expect("Unable to serialize to xhtml");
                    let xhtml_str = std::str::from_utf8(&xhtml_buf).unwrap();

                    if let Some(author) = article.metadata().byline() {
                        epub.metadata("author", replace_escaped_characters(author))?;
                    }

                    add_stylesheets(&mut epub, app_config)?;
                    let title = replace_escaped_characters(article.metadata().title());
                    epub.metadata("title", &title)?;

                    let mut content =
                        EpubContent::new("index.xhtml", xhtml_str.as_bytes()).title(title);

                    for toc_element in header_level_tocs {
                        content = content.child(toc_element);
                    }

                    epub.add_content(content)?;

                    for img in &article.img_urls {
                        let mut file_path = std::env::temp_dir();
                        file_path.push(&img.0);

                        let img_buf = File::open(&file_path).expect("Can't read image file");
                        epub.add_resource(
                            file_path.file_name().unwrap(),
                            img_buf,
                            img.1.as_ref().unwrap(),
                        )?;
                    }
                    let appendix = generate_appendix(vec![&article]);
                    epub.add_content(
                        EpubContent::new("appendix.xhtml", appendix.as_bytes())
                            .title(replace_escaped_characters("Article Source")),
                    )?;
                    epub.generate(&mut out_file)?;
                    bar.inc(1);

                    successful_articles_table.add_row(vec![article.metadata().title()]);

                    debug!("Created {:?}", file_name);
                    Ok(())
                };
                if let Err(mut error) = result() {
                    error.set_article_source(&article.url);
                    errors.push(error);
                }
            }
            bar.finish_with_message("Generated epubs\n");
        }
    }

    if errors.is_empty() {
        Ok(())
    } else {
        Err(errors)
    }
}

/// Replaces characters that have to be escaped before adding to the epub's metadata
fn replace_escaped_characters(value: &str) -> String {
    value
        .replace("&", "&amp;")
        .replace("<", "&lt;")
        .replace(">", "&gt;")
}

fn add_stylesheets<T: epub_builder::Zip>(
    epub: &mut EpubBuilder<T>,
    app_config: &AppConfig,
) -> Result<(), epub_builder::Error> {
    let body_stylesheet: &[u8] = include_bytes!("./assets/body.min.css");
    let header_stylesheet: &[u8] = include_bytes!("./assets/headers.min.css");
    match app_config.css_config {
        crate::cli::CSSConfig::All => {
            epub.stylesheet([header_stylesheet, body_stylesheet].concat().as_bytes())?;
            Ok(())
        }
        crate::cli::CSSConfig::NoHeaders => {
            epub.stylesheet(body_stylesheet.as_bytes())?;
            Ok(())
        }
        _ => Ok(()),
    }
}

//TODO: The type signature of the argument should change as it requires that merged articles create an entirely new Vec of references
fn generate_appendix(articles: Vec<&Article>) -> String {
    let link_tags: String = articles
        .iter()
        .map(|article| {
            let article_name = if !article.metadata().title().is_empty() {
                article.metadata().title()
            } else {
                &article.url
            };
            format!(
                "<a href=\"{}\">{}</a><br></br>",
                replace_escaped_characters(&article.url),
                replace_escaped_characters(article_name)
            )
        })
        .collect();
    let template = format!(
        r#"<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
    <head>
        <link rel="stylesheet" href="stylesheet.css" type="text/css"></link>
    </head>
    <body>
        <h2>Appendix</h2><h3>Article sources</h3>
        {}
    </body>
</html>"#,
        link_tags
    );
    template
}

/// Adds an id attribute to header elements and assigns a value based on
/// the hash of the text content. Headers with id attributes are not modified.
/// The headers here are known to have text because the grabbed article from
/// readability removes headers with no text.
fn generate_header_ids(root_node: &NodeRef) {
    let headers = root_node
        .select("h1, h2, h3, h4")
        .expect("Unable to create selector for headings");
    let headers_no_id = headers.filter(|node_data_ref| {
        let attrs = node_data_ref.attributes.borrow();
        !attrs.contains("id")
            || attrs
                .get("id")
                .map(|val| !VALID_ATTR_CHARS_REGEX.is_match(&val))
                .unwrap()
    });
    for header in headers_no_id {
        let mut attrs = header.attributes.borrow_mut();
        let text = header.text_contents();
        // The value of the id begins with an underscore because the hexadecimal
        // digest might start with a number which would make it an invalid id
        // when querying with selectors
        let value = format!("_{:x}", md5::compute(text));
        attrs.insert("id", value);
    }
}

/// Returns a vector of `TocElement` from a NodeRef used for adding to the Table of Contents for navigation
fn get_header_level_toc_vec(content_url: &str, article: &NodeRef) -> Vec<TocElement> {
    // Depth starts from 1
    const HEADER_LEVEL_MAX_DEPTH: usize = 4;
    let mut headers_vec: Vec<TocElement> = Vec::new();

    let mut header_levels = HashMap::with_capacity(HEADER_LEVEL_MAX_DEPTH);
    header_levels.insert("h1", 1);
    header_levels.insert("h2", 2);
    header_levels.insert("h3", 3);
    header_levels.insert("h4", 4);

    generate_header_ids(article);

    let headings = article
        .select("h1, h2, h3, h4")
        .expect("Unable to create selector for headings");

    // The header list will be generated using some sort of backtracking algorithm
    // There will be a stack of maximum size 4 (since it only goes to h4 now)
    let mut stack: Vec<Option<TocElement>> = std::iter::repeat(None)
        .take(HEADER_LEVEL_MAX_DEPTH)
        .collect::<_>();

    for heading in headings {
        let elem_name: &str = &heading.name.local;
        let attrs = heading.attributes.borrow();
        let id = attrs
            .get("id")
            .map(ToOwned::to_owned)
            .expect("Unable to get id value in get_header_level_toc_vec");
        let url = format!("{}#{}", content_url, id);

        let level = header_levels[elem_name];
        let index = level - 1;

        if let Some(mut existing_toc) = stack.get_mut(index).take().cloned().flatten() {
            // If a toc element already exists at that header level, consume all the toc elements
            // of a lower hierarchy e.g if the existing toc is a h2, then the h3 and h4 in the stack
            // will be consumed.
            // We collapse the children by folding from the right to the left of the stack.
            let descendants_levels = HEADER_LEVEL_MAX_DEPTH - level;
            let folded_descendants = stack
                .iter_mut()
                .rev()
                .take(descendants_levels)
                .map(|toc_elem| toc_elem.take())
                .filter(|toc_elem| toc_elem.is_some())
                .map(|toc_elem| toc_elem.unwrap())
                .reduce(|child, parent| parent.child(child));

            if let Some(child) = folded_descendants {
                existing_toc = existing_toc.child(child);
            };

            // Find the nearest ancestor to embed into.
            // If this toc_elem was a h1, then just add it to the headers_vec
            if index == 0 {
                headers_vec.push(existing_toc);
            } else {
                // Otherwise, find the nearest ancestor to add it to. If none exists, add it to the headers_vec
                let first_ancestor = stack
                    .iter_mut()
                    .take(level - 1)
                    .map(|toc_elem| toc_elem.as_mut())
                    .rfind(|toc_elem| toc_elem.is_some())
                    .flatten();

                match first_ancestor {
                    Some(ancestor_toc_elem) => {
                        *ancestor_toc_elem = ancestor_toc_elem.clone().child(existing_toc);
                    }
                    None => {
                        headers_vec.push(existing_toc);
                    }
                }
            }
        }

        if let Some(toc_elem) = stack.get_mut(index) {
            *toc_elem = Some(TocElement::new(
                url,
                replace_escaped_characters(&heading.text_contents()),
            ));
        }
    }

    let folded_stack = stack
        .into_iter()
        .rev()
        .filter(|toc_elem| toc_elem.is_some())
        .map(|opt_toc_elem| opt_toc_elem.unwrap())
        .reduce(|child, parent| parent.child(child));
    if let Some(toc_elem) = folded_stack {
        headers_vec.push(toc_elem)
    }

    headers_vec
}

/// Serializes a NodeRef to a string that is XHTML compatible
/// The only DOM nodes serialized are Text and Element nodes
fn serialize_to_xhtml<W: std::io::Write>(
    node_ref: &NodeRef,
    mut w: &mut W,
) -> Result<(), PaperoniError> {
    {
        // Add XHTML attributes
        let html_elem = node_ref
            .select_first("html")
            .expect("Unable to get <html> element in article");
        let mut html_attrs = html_elem.attributes.borrow_mut();
        html_attrs.insert("xmlns", "http://www.w3.org/1999/xhtml".into());
        html_attrs.insert("xmlns:epub", "http://www.idpf.org/2007/ops".into());
    }
    let mut escape_map = HashMap::new();
    escape_map.insert("<", "&lt;");
    escape_map.insert(">", "&gt;");
    escape_map.insert("&", "&amp;");
    escape_map.insert("\"", "&quot;");
    escape_map.insert("'", "&apos;");
    for edge in node_ref.traverse_inclusive() {
        match edge {
            kuchiki::iter::NodeEdge::Start(n) => match n.data() {
                kuchiki::NodeData::Text(rc_text) => {
                    let text = rc_text.borrow();
                    let esc_text = ESC_SEQ_REGEX
                        .replace_all(&text, |captures: &regex::Captures| escape_map[&captures[1]]);
                    write!(&mut w, "{}", esc_text)?;
                }
                kuchiki::NodeData::Element(elem_data) => {
                    let attrs = elem_data.attributes.borrow();
                    let attrs_str = attrs
                        .map
                        .iter()
                        .filter(|(k, _)| {
                            let attr_key: &str = &k.local;
                            attr_key.is_ascii() && VALID_ATTR_CHARS_REGEX.is_match(attr_key)
                        })
                        .map(|(k, v)| {
                            format!(
                                "{}=\"{}\"",
                                k.local,
                                ESC_SEQ_REGEX
                                    .replace_all(&v.value, |captures: &regex::Captures| {
                                        escape_map[&captures[1]]
                                    })
                            )
                        })
                        .fold("".to_string(), |acc, val| acc + " " + &val);
                    write!(&mut w, "<{}{}>", &elem_data.name.local, attrs_str)?;
                }
                _ => (),
            },
            kuchiki::iter::NodeEdge::End(n) => match n.data() {
                kuchiki::NodeData::Element(elem_data) => {
                    write!(&mut w, "</{}>", &elem_data.name.local)?;
                }
                _ => (),
            },
        }
    }
    Ok(())
}

#[cfg(test)]
mod test {
    use kuchiki::traits::*;

    use super::{generate_header_ids, get_header_level_toc_vec, replace_escaped_characters};

    #[test]
    fn test_replace_escaped_characters() {
        let mut value = "Lorem ipsum";
        assert_eq!(replace_escaped_characters(value), "Lorem ipsum");
        value = "Memory safe > memory unsafe";
        assert_eq!(
            replace_escaped_characters(value),
            "Memory safe &gt; memory unsafe"
        );
        value = "Author Name <author@mail.example>";
        assert_eq!(
            replace_escaped_characters(value),
            "Author Name &lt;author@mail.example&gt;"
        );
    }

    #[test]
    fn test_generate_header_ids() {
        let html_str = r#"
<!DOCTYPE html>
<html>
    <body>
        <h1>Heading 1</h1>
        <h2 id="heading-2">Heading 2</h2>
        <h2 id="heading-2-again">Heading 2 again</h2>
        <h4>Heading 4</h4>
        <h1>Heading 1 again</h1>
        <h3 class="heading">Heading 3</h3>
    </body>
</html>
        "#;
        let doc = kuchiki::parse_html().one(html_str);
        generate_header_ids(&doc);

        let mut headers = doc.select("h1, h2, h3, h4").unwrap();
        let all_headers_have_ids = headers.all(|node_data_ref| {
            let attrs = node_data_ref.attributes.borrow();
            if let Some(id) = attrs.get("id") {
                !id.trim().is_empty()
            } else {
                false
            }
        });
        assert_eq!(true, all_headers_have_ids);

        let selector = format!("h1#_{:x}", md5::compute("Heading 1"));
        assert_eq!(true, doc.select_first(&selector).is_ok());

        let selector = format!("h1#_{:x}", md5::compute("Heading 1 again"));
        assert_eq!(true, doc.select_first(&selector).is_ok());

        let selector = "h2#heading-2-again";
        assert_eq!(true, doc.select_first(selector).is_ok());
    }

    #[test]
    fn test_get_header_level_toc_vec() {
        // NOTE: Due to `TocElement` not implementing PartialEq, the tests here
        // will need to be manually written to cover for this
        let html_str = r#"
        <!DOCTYPE html>
        <html>
            <body>
                <p>Lorem ipsum</p>
            </body>
        </html>
        "#;
        let doc = kuchiki::parse_html().one(html_str);

        let toc_vec = get_header_level_toc_vec("index.xhtml", &doc);
        assert_eq!(0, toc_vec.len());

        let html_str = r#"
        <!DOCTYPE html>
        <html>
            <body>
                <h1 id="heading-1">Heading 1</h1>
                <p>Lorem ipsum</p>
                <div>
                    <h2 id="heading-2">Heading 2</h2>
                    <p>Lorem ipsum</p>
                    <p>Lorem ipsum</p>
                </div>
                <h3 id="subheading-3">Subheading 3</h2>
                <p>Lorem ipsum</p>
                <h1 id="heading-2">Second Heading 1</h2>
                <p>Lorem ipsum</p>
            </body>
        </html>
        "#;
        let doc = kuchiki::parse_html().one(html_str);

        let toc_vec = get_header_level_toc_vec("index.xhtml", &doc);
        assert_eq!(2, toc_vec.len());

        let first_h1_toc = toc_vec.first().unwrap();
        assert_eq!("Heading 1", first_h1_toc.title);
        assert_eq!(1, first_h1_toc.children.len());

        let h2_toc = first_h1_toc.children.first().unwrap();
        assert_eq!("Heading 2", h2_toc.title);
        assert_eq!(1, h2_toc.children.len());

        let h3_toc = h2_toc.children.first().unwrap();
        assert_eq!("Subheading 3", h3_toc.title);
        assert_eq!(0, h3_toc.children.len());

        let last_h1_toc = toc_vec.last().unwrap();
        assert_eq!("Second Heading 1", last_h1_toc.title);
        assert_eq!(0, last_h1_toc.children.len());

        let html_str = r#"
        <!DOCTYPE html>
        <html>
            <body>
                <h1 id="heading-1">Heading 1</h1>
                <p>Lorem ipsum</p>
                <div>
                    <h2 id="heading-2">Heading 2</h2>
                    <p>Lorem ipsum</p>
                    <p>Lorem ipsum</p>
                    <h3 id="subheading-3">Subheading 3</h2>
                    <p>Lorem ipsum</p>
                </div>
                <h2 id="heading-2">Heading 2</h2>
                <p>Lorem ipsum</p>
                <h4 id="subheading-4">Subheading 4</h4>
                <h2 id="conclusion">Conclusion</h2>
            </body>
        </html>
        "#;
        let doc = kuchiki::parse_html().one(html_str);

        let toc_vec = get_header_level_toc_vec("index.xhtml", &doc);
        assert_eq!(1, toc_vec.len());

        let h1_toc = toc_vec.first().unwrap();
        assert_eq!("Heading 1", h1_toc.title);
        assert_eq!(3, h1_toc.children.len());

        let first_h2_toc = h1_toc.children.first().unwrap();
        assert_eq!("Heading 2", first_h2_toc.title);
        assert_eq!(1, first_h2_toc.children.len());

        let h3_toc = first_h2_toc.children.first().unwrap();
        assert_eq!("Subheading 3", h3_toc.title);
        assert_eq!(0, h3_toc.children.len());
    }
}