Merge pull request #3 from hipstermojo/dev

0.2.0 update
2020-11-24 18:42:29 +03:00 · 2020-11-24 18:42:29 +03:00 · 3c7dc9a416
commit 3c7dc9a416
parent fbf2f0b3d8 3bfa82ba60
8 changed files with 1360 additions and 950 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@ -3,21 +3,22 @@ description = "A web article downloader"
 homepage = "https://github.com/hipstermojo/paperoni"
 repository = "https://github.com/hipstermojo/paperoni"
 name = "paperoni"
-version = "0.1.0-alpha1"
+version = "0.2.0-alpha1"
 authors = ["Kenneth Gitere <gitere81@gmail.com>"]
 edition = "2018"
 license = "MIT"
+readme = "README.md"

 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

 [dependencies]
-async-std = "1.5.0"
-epub-builder = "0.4.5"
+async-std = "1.7.0"
+clap = "2.33.3"
+epub-builder = "0.4.8"
 html5ever = "0.25.1"
 kuchiki = "0.8.1"
-lazy_static = "1.3.9"
+lazy_static = "1.4.0"
 md5 = "0.7.0"
-regex = "1.3.9"
-surf = "1.0.3"
-structopt = { version = "0.3" }
-url = "2.1.1"
+regex = "1.4.2"
+surf = "2.1.0"
+url = "2.2.0"
--- a/README.md
+++ b/README.md
@ -1,4 +1,4 @@
-<p align="center"><img src="./paperoni-dark.png" width="400"></p>
+<p align="center"><img src="./paperoni-dark.png"></p>

 <p align="center"><i>Salami not included</i></p>

@ -29,7 +29,7 @@ This extractor retrieves a possible article using a port of the [Mozilla Readabi

 This program is still in alpha so a number of things currently break:

- Links with redirects will crash the program as it has no redirect logic.
+- Certain links with redirects can't be extracted. Such links include URLs that proxy Medium.
 - Websites that only run with JavaScript cannot be extracted.
 - Website articles that cannot be extracted by Readability cannot be extracted by Paperoni either.

--- a/src/cli.rs
+++ b/src/cli.rs
@ -1,13 +1,21 @@
-use structopt::StructOpt;
+use clap::{App, AppSettings, Arg};

-#[derive(Debug, StructOpt)]
-#[structopt(name = "paperoni")]
-/// Paperoni is an article downloader.
-///
-/// It takes a url and downloads the article content from it and
-/// saves it to an epub.
-pub struct Opts {
-    // #[structopt(conflicts_with("links"))]
-    /// Url of a web article
-    pub urls: Vec<String>,
+/// Builds the `clap` application that describes paperoni's command line interface.
+///
+/// The app is configured with `ArgRequiredElseHelp` and `UnifiedHelpMessage` and
+/// accepts any number of positional `urls` values.
+pub fn cli_init() -> App<'static, 'static> {
+    App::new("paperoni")
+        .settings(&[
+            AppSettings::ArgRequiredElseHelp,
+            AppSettings::UnifiedHelpMessage,
+        ])
+        // Take the version from Cargo.toml at compile time so the reported
+        // version cannot drift from the crate version.
+        .version(env!("CARGO_PKG_VERSION"))
+        .about(
+            "
+Paperoni is an article downloader.
+It takes a url and downloads the article content from it and saves it to an epub.
+        ",
+        )
+        .arg(
+            Arg::with_name("urls")
+                .help("Urls of web articles")
+                .multiple(true),
+        )
 }
--- a/src/extractor.rs
+++ b/src/extractor.rs
@ -1,3 +1,5 @@
+use std::collections::HashMap;
+
 use async_std::fs::File;
 use async_std::io::prelude::*;
 use async_std::task;
@ -8,6 +10,10 @@ use crate::moz_readability::{MetaData, Readability};

 pub type ResourceInfo = (String, Option<String>);

+lazy_static! {
+    /// Matches a single `&`, `<` or `>` character and captures it in group 1.
+    static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r"(&|<|>)").unwrap();
+}
+
 pub struct Extractor {
    article: Option<NodeRef>,
    pub img_urls: Vec<ResourceInfo>,
@ -62,22 +68,27 @@ impl Extractor {
    pub async fn download_images(&mut self, article_origin: &Url) -> async_std::io::Result<()> {
        let mut async_download_tasks = Vec::with_capacity(self.img_urls.len());
        self.extract_img_urls();
-        println!("Downloading images to res/");
+        println!("Downloading images...");
        for img_url in &self.img_urls {
            let img_url = img_url.0.clone();
            let abs_url = get_absolute_url(&img_url, article_origin);
+
            async_download_tasks.push(task::spawn(async move {
-                let mut img_response = surf::get(&abs_url).await.expect("Unable to retrieve file");
+                let mut img_response = surf::Client::new()
+                    .with(surf::middleware::Redirect::default())
+                    .get(&abs_url)
+                    .await
+                    .expect("Unable to retrieve file");
                let img_content: Vec<u8> = img_response.body_bytes().await.unwrap();
                let img_mime = img_response
-                    .header("Content-Type")
-                    .map(|header| header.to_string());
+                    .content_type()
+                    .map(|mime| mime.essence().to_string());
                let img_ext = img_response
-                    .header("Content-Type")
-                    .and_then(map_mime_type_to_ext)
+                    .content_type()
+                    .map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
                    .unwrap();
-
-                let img_path = format!("res/{}{}", hash_url(&abs_url), &img_ext);
+                let mut img_path = std::env::temp_dir();
+                img_path.push(format!("{}.{}", hash_url(&abs_url), &img_ext));
                let mut img_file = File::create(&img_path)
                    .await
                    .expect("Unable to create file");
@ -86,7 +97,19 @@ impl Extractor {
                    .await
                    .expect("Unable to save to file");

-                (img_url, img_path, img_mime)
+                (
+                    img_url,
+                    img_path
+                        .file_name()
+                        .map(|os_str_name| {
+                            os_str_name
+                                .to_str()
+                                .expect("Unable to get image file name")
+                                .to_string()
+                        })
+                        .unwrap(),
+                    img_mime,
+                )
            }));
        }

@ -123,21 +146,15 @@ fn hash_url(url: &str) -> String {
    format!("{:x}", md5::compute(url.as_bytes()))
 }

-/// Handles getting the extension from a given MIME type. The extension starts with a dot
-fn map_mime_type_to_ext(mime_type: &str) -> Option<String> {
-    mime_type
-        .split("/")
-        .last()
-        .map(|format| {
-            if format == ("svg+xml") {
-                return "svg";
-            } else if format == "x-icon" {
-                "ico"
-            } else {
-                format
-            }
-        })
-        .map(|format| String::from(".") + format)
+/// Maps a MIME subtype to the file extension used for it.
+///
+/// `svg+xml` becomes `svg` and `x-icon` becomes `ico`; every other subtype is
+/// returned unchanged. The returned extension has no leading dot.
+fn map_mime_subtype_to_ext(subtype: &str) -> &str {
+    match subtype {
+        "svg+xml" => "svg",
+        "x-icon" => "ico",
+        other => other,
+    }
 }

 fn get_absolute_url(url: &str, request_url: &Url) -> String {
@ -158,6 +175,56 @@ fn get_absolute_url(url: &str, request_url: &Url) -> String {
    }
 }

+/// Serializes a NodeRef to a string that is XHTML compatible
+/// The only DOM nodes serialized are Text and Element nodes
+///
+/// `&`, `<` and `>` are escaped in text and in attribute values. Attribute
+/// values also have `"` escaped because they are written inside double quotes.
+/// Every element is written with an explicit start and end tag.
+///
+/// # Errors
+/// Returns the error produced by a failed write to `w`.
+pub fn serialize_to_xhtml<W: std::io::Write>(
+    node_ref: &NodeRef,
+    mut w: &mut W,
+) -> Result<(), Box<dyn std::error::Error>> {
+    let mut escape_map = HashMap::new();
+    escape_map.insert("<", "&lt;");
+    escape_map.insert(">", "&gt;");
+    escape_map.insert("&", "&amp;");
+    for edge in node_ref.traverse_inclusive() {
+        match edge {
+            kuchiki::iter::NodeEdge::Start(n) => match n.data() {
+                kuchiki::NodeData::Text(rc_text) => {
+                    let text = rc_text.borrow();
+                    let esc_text = ESC_SEQ_REGEX
+                        .replace_all(&text, |captures: &regex::Captures| escape_map[&captures[1]]);
+                    write!(&mut w, "{}", esc_text)?;
+                }
+                kuchiki::NodeData::Element(elem_data) => {
+                    let attrs = elem_data.attributes.borrow();
+                    let mut attrs_str = String::new();
+                    for (k, v) in attrs.map.iter() {
+                        let esc_value = ESC_SEQ_REGEX
+                            .replace_all(&v.value, |captures: &regex::Captures| {
+                                escape_map[&captures[1]]
+                            })
+                            // `&` is already escaped above, so this cannot double-escape.
+                            // A raw `"` would end the quoted attribute value early.
+                            .replace('"', "&quot;");
+                        attrs_str.push_str(&format!(" {}=\"{}\"", k.local, esc_value));
+                    }
+                    write!(&mut w, "<{}{}>", &elem_data.name.local, attrs_str)?;
+                }
+                _ => (),
+            },
+            kuchiki::iter::NodeEdge::End(n) => match n.data() {
+                kuchiki::NodeData::Element(elem_data) => {
+                    write!(&mut w, "</{}>", &elem_data.name.local)?;
+                }
+                _ => (),
+            },
+        }
+    }
+    Ok(())
+}
+
 #[cfg(test)]
 mod test {
    use super::*;
@ -204,23 +271,15 @@ mod test {

    #[test]
    fn test_map_mime_type_to_ext() {
-        let mime_types = vec![
-            "image/apng",
-            "image/bmp",
-            "image/gif",
-            "image/x-icon",
-            "image/jpeg",
-            "image/png",
-            "image/svg+xml",
-            "image/tiff",
-            "image/webp",
+        // Each subtype lines up positionally with the expected extension below.
+        let subtypes = vec![
+            "apng", "bmp", "gif", "x-icon", "jpeg", "png", "svg+xml", "tiff", "webp",
        ];
-        let exts = mime_types
+        let exts = subtypes
            .into_iter()
-            .map(|mime_type| map_mime_type_to_ext(mime_type).unwrap())
+            .map(map_mime_subtype_to_ext)
            .collect::<Vec<_>>();
        assert_eq!(
-            vec![".apng", ".bmp", ".gif", ".ico", ".jpeg", ".png", ".svg", ".tiff", ".webp"],
+            vec!["apng", "bmp", "gif", "ico", "jpeg", "png", "svg", "tiff", "webp"],
            exts
        );
    }
--- a/src/main.rs
+++ b/src/main.rs
@ -3,9 +3,8 @@ extern crate lazy_static;

 use std::fs::File;

-use async_std::{fs::create_dir, fs::remove_dir_all, task};
+use async_std::task;
 use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
-use structopt::StructOpt;
 use url::Url;

 mod cli;
@ -14,33 +13,36 @@ mod moz_readability;

 use extractor::Extractor;
 fn main() {
-    let opt = cli::Opts::from_args();
-    if !opt.urls.is_empty() {
-        println!("Downloading single article");
-        download(opt.urls);
+    // Parse the command line and, when any `urls` values were given, hand them
+    // to `download` as owned strings.
+    let arg_matches = cli::cli_init().get_matches();
+    if let Some(url_args) = arg_matches.values_of("urls") {
+        download(url_args.map(String::from).collect());
    }
 }

 type HTMLResource = (String, String);

-async fn fetch_url(url: &str) -> HTMLResource {
+/// Fetches `url`, following redirects, and returns the url together with the
+/// response body as a string.
+///
+/// # Errors
+/// Returns an error when the request cannot be sent, when the final response
+/// status is not 200, or when the body cannot be read as a string.
+async fn fetch_url(url: &str) -> Result<HTMLResource, Box<dyn std::error::Error>> {
    let client = surf::Client::new();
    println!("Fetching...");
-    // TODO: Add middleware for following redirects
-    (
-        url.to_string(),
-        client
-            .get(url)
-            .recv_string()
-            .await
-            .expect("Unable to fetch URL"),
-    )
+    let mut res = client
+        .with(surf::middleware::Redirect::default())
+        .get(url)
+        .send()
+        .await
+        // Report a failed request through the `Result` instead of panicking here;
+        // the message is only built when the request actually fails.
+        .map_err(|err| format!("Unable to fetch {}: {}", url, err))?;
+    if res.status() == 200 {
+        Ok((url.to_string(), res.body_string().await?))
+    } else {
+        Err("Request failed to return HTTP 200".into())
+    }
 }

 fn download(urls: Vec<String>) {
    let mut async_url_tasks = Vec::with_capacity(urls.len());
    for url in urls {
-        async_url_tasks.push(task::spawn(async move { fetch_url(&url).await }));
+        async_url_tasks.push(task::spawn(async move { fetch_url(&url).await.unwrap() }));
    }
    task::block_on(async {
        for url_task in async_url_tasks {
@ -49,41 +51,35 @@ fn download(urls: Vec<String>) {
            let mut extractor = Extractor::from_html(&html);
            extractor.extract_content(&url);
            if extractor.article().is_some() {
-                create_dir("res/")
-                    .await
-                    .expect("Unable to create res/ output folder");
                extractor
                    .download_images(&Url::parse(&url).unwrap())
                    .await
                    .expect("Unable to download images");
-                let mut out_file =
-                    File::create(format!("{}.epub", extractor.metadata().title())).unwrap();
+                let file_name = format!("{}.epub", extractor.metadata().title());
+                let mut out_file = File::create(&file_name).unwrap();
                let mut html_buf = Vec::new();
-                extractor
-                    .article()
-                    .unwrap()
-                    .serialize(&mut html_buf)
-                    .expect("Unable to serialize");
+                extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf)
+                    .expect("Unable to serialize to xhtml");
                let html_buf = std::str::from_utf8(&html_buf).unwrap();
-                let html_buf = moz_readability::regexes::REPLACE_SELF_CLOSING_REGEX
-                    .replace_all(html_buf, "$tag/>");
                let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
                if let Some(author) = extractor.metadata().byline() {
-                    epub.metadata("author", author).unwrap();
+                    epub.metadata("author", author.replace("&", "&amp;"))
+                        .unwrap();
                }
-                epub.metadata("title", extractor.metadata().title())
+                epub.metadata("title", extractor.metadata().title().replace("&", "&amp;"))
                    .unwrap();
                epub.add_content(EpubContent::new("code.xhtml", html_buf.as_bytes()))
                    .unwrap();
                for img in extractor.img_urls {
-                    let file_path = format!("{}", &img.0);
+                    let mut file_path = std::env::temp_dir();
+                    file_path.push(&img.0);

-                    let img_buf = File::open(file_path).expect("Can't read file");
-                    epub.add_resource(img.0, img_buf, img.1.unwrap()).unwrap();
+                    let img_buf = File::open(&file_path).expect("Can't read file");
+                    epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap())
+                        .unwrap();
                }
                epub.generate(&mut out_file).unwrap();
-                println!("Cleaning up");
-                remove_dir_all("res/").await.unwrap();
+                println!("Created {:?}", file_name);
            }
        }
    })
--- a/src/moz_readability/mod.rs
+++ b/src/moz_readability/mod.rs
@ -653,7 +653,12 @@ impl Readability {
                })
                .map(|node_ref| {
                    let node_attrs = node_ref.attributes.borrow();
-                    Url::parse(node_attrs.get("href").unwrap()).unwrap()
+                    let href = node_attrs.get("href").unwrap();
+                    if href.trim() == "/" {
+                        document_uri.join("/").unwrap()
+                    } else {
+                        Url::parse(href).unwrap()
+                    }
                })
                .next()
                .unwrap_or(document_uri.clone());
@ -758,14 +763,66 @@ impl Readability {
    }

    /// Converts an inline CSS string to a [HashMap] of property and value(s)
+    ///
+    /// Declarations are separated by `;` and split into property and value at the
+    /// first `:`. A `;` inside a single- or double-quoted part of a value does not
+    /// end the declaration. Property names and values are trimmed, and a property
+    /// that appears more than once keeps its last value. Text that reaches a `;`
+    /// or the end of the input without containing a `:` is ignored.
-    fn inline_css_str_to_map(css_str: &str) -> HashMap<&str, &str> {
-        css_str
-            .split(";")
-            .filter(|split_str| !split_str.trim().is_empty())
-            .map(|str_pair| {
-                let mut vals = str_pair.split(":");
-                (vals.next().unwrap().trim(), vals.next().unwrap().trim())
-            })
-            .collect()
+    fn inline_css_str_to_map(css_str: &str) -> HashMap<String, String> {
+        enum State {
+            ReadProp,
+            ReadVal,
+            ReadQuot,
+            ReadDquot,
+        }
+        let mut state = State::ReadProp;
+        let mut prop = String::new();
+        let mut token = String::new();
+        let mut decls = HashMap::new();
+        for c in css_str.chars() {
+            match state {
+                State::ReadProp => match c {
+                    ':' => {
+                        prop = token.trim().to_string();
+                        token.clear();
+                        state = State::ReadVal;
+                    }
+                    // A `;` before any `:` ends a malformed declaration; drop it so
+                    // it does not leak into the next property name.
+                    ';' => token.clear(),
+                    _ => token.push(c),
+                },
+                State::ReadVal => match c {
+                    ';' => {
+                        decls.insert(std::mem::take(&mut prop), token.trim().to_string());
+                        token.clear();
+                        state = State::ReadProp;
+                    }
+                    '\'' => {
+                        token.push(c);
+                        state = State::ReadQuot;
+                    }
+                    '"' => {
+                        token.push(c);
+                        state = State::ReadDquot;
+                    }
+                    _ => token.push(c),
+                },
+                State::ReadQuot => {
+                    token.push(c);
+                    if c == '\'' {
+                        state = State::ReadVal;
+                    }
+                }
+                State::ReadDquot => {
+                    token.push(c);
+                    if c == '"' {
+                        state = State::ReadVal;
+                    }
+                }
+            }
+        }
+        // Flush a final declaration that has no terminating `;`. Text left over
+        // while still reading a property (e.g. trailing whitespace) has no value
+        // and must not overwrite the previous declaration.
+        match state {
+            State::ReadProp => {}
+            State::ReadVal | State::ReadQuot | State::ReadDquot => {
+                if !token.is_empty() {
+                    decls.insert(prop, token.trim().to_string());
+                }
+            }
+        }
+        decls
    }

@ -2394,18 +2451,19 @@ mod test {
        use std::collections::HashMap;
        let css_str = "display: flex; height: 200px; width: 250px; justify-content: center; align-items: center; border: 2px solid black";
        let mut css_map = HashMap::new();
-        css_map.insert("display", "flex");
-        css_map.insert("height", "200px");
-        css_map.insert("width", "250px");
-        css_map.insert("justify-content", "center");
-        css_map.insert("align-items", "center");
-        css_map.insert("border", "2px solid black");
+        css_map.insert("display".to_string(), "flex".to_string());
+        css_map.insert("height".to_string(), "200px".to_string());
+        css_map.insert("width".to_string(), "250px".to_string());
+        css_map.insert("justify-content".to_string(), "center".to_string());
+        css_map.insert("align-items".to_string(), "center".to_string());
+        css_map.insert("border".to_string(), "2px solid black".to_string());

        let css_str_to_vec = Readability::inline_css_str_to_map(css_str);
        assert_eq!(css_map, css_str_to_vec);
        let mut css_map = HashMap::new();
-        css_map.insert("color", "red");
-        assert_eq!(css_map, Readability::inline_css_str_to_map("color: red;"));
+        css_map.insert("color".to_string(), "red".to_string());
+        css_map.insert("background-image".to_string(), "url('data:image/jpeg;base64,/wgARCAALABQDASIAAhEBAxEB/8QAFwABAQEBAAAAAAAAAAAAAAAAAgADBP/')".to_string());
+        assert_eq!(css_map, Readability::inline_css_str_to_map("color: red;background-image: url('data:image/jpeg;base64,/wgARCAALABQDASIAAhEBAxEB/8QAFwABAQEBAAAAAAAAAAAAAAAAAgADBP/')"));
    }

    #[test]
--- a/src/moz_readability/regexes.rs
+++ b/src/moz_readability/regexes.rs
@ -132,8 +132,4 @@ lazy_static! {
    pub static ref REPLACE_END_SEPARATOR_REGEX: Regex =
        Regex::new(r"(?i)[^\|\-\\/>»]*[\|\-\\/>»](?P<end>.*)").unwrap();
    pub static ref REPLACE_MULTI_SEPARATOR_REGEX: Regex = Regex::new(r"[\|\-\\/>»]+").unwrap();
-    pub static ref REPLACE_SELF_CLOSING_REGEX: Regex = Regex::new(
-        r#"(?P<tag><(?:area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr)(?: [a-z\-]+=["'][\sa-zA-Z0-9\./\-_#]+["']|[a-z\-]+)*)>"#
-    )
-    .unwrap();
 }