Merge pull request #3 from hipstermojo/dev

0.2.0 update
2020-11-24 18:42:29 +03:00 · 2020-11-24 18:42:29 +03:00 · 3c7dc9a416
commit 3c7dc9a416
parent fbf2f0b3d8 3bfa82ba60
8 changed files with 1360 additions and 950 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@ -3,21 +3,22 @@ description = "A web article downloader"
 homepage = "https://github.com/hipstermojo/paperoni"
 repository = "https://github.com/hipstermojo/paperoni"
 name = "paperoni"
-version = "0.1.0-alpha1"
+version = "0.2.0-alpha1"
 authors = ["Kenneth Gitere <gitere81@gmail.com>"]
 edition = "2018"
 license = "MIT"
 readme = "README.md"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 [dependencies]
-async-std = "1.5.0"
+async-std = "1.7.0"
-epub-builder = "0.4.5"
+clap = "2.33.3"
 epub-builder = "0.4.8"
 html5ever = "0.25.1"
 kuchiki = "0.8.1"
-lazy_static = "1.3.9"
+lazy_static = "1.4.0"
 md5 = "0.7.0"
-regex = "1.3.9"
+regex = "1.4.2"
-surf = "1.0.3"
+surf = "2.1.0"
-structopt = { version = "0.3" }
+url = "2.2.0"
 url = "2.1.1"
--- a/README.md
+++ b/README.md
@ -1,4 +1,4 @@
-<p align="center"><img src="./paperoni-dark.png" width="400"></p>
+<p align="center"><img src="./paperoni-dark.png"></p>
 <p align="center"><i>Salami not included</i></p>
@ -29,7 +29,7 @@ This extractor retrieves a possible article using a port of the [Mozilla Readabi
 This program is still in alpha so a number of things currently break:
- Links with redirects will crash the program as it has no redirect logic.
+- Certain links with redirects can't be extracted. Such links include URLs that proxy Medium.
 - Websites that only run with JavaScript cannot be extracted.
 - Website articles that cannot be extracted by Readability cannot be extracted by Paperoni either.
--- a/src/cli.rs
+++ b/src/cli.rs
@ -1,13 +1,21 @@
-use structopt::StructOpt;
+use clap::{App, AppSettings, Arg};
-#[derive(Debug, StructOpt)]
+pub fn cli_init() -> App<'static, 'static> {
-#[structopt(name = "paperoni")]
+    App::new("paperoni")
-/// Paperoni is an article downloader.
+        .settings(&[
-///
+            AppSettings::ArgRequiredElseHelp,
-/// It takes a url and downloads the article content from it and
+            AppSettings::UnifiedHelpMessage,
-/// saves it to an epub.
+        ])
-pub struct Opts {
+        .version("0.1.0-alpha1")
-    // #[structopt(conflicts_with("links"))]
+        .about(
-    /// Url of a web article
+            "
-    pub urls: Vec<String>,
+Paperoni is an article downloader.
 It takes a url and downloads the article content from it and saves it to an epub.
        ",
        )
        .arg(
            Arg::with_name("urls")
                .help("Urls of web articles")
                .multiple(true),
        )
 }
--- a/src/extractor.rs
+++ b/src/extractor.rs
@ -1,3 +1,5 @@
 use std::collections::HashMap;
 use async_std::fs::File;
 use async_std::io::prelude::*;
 use async_std::task;
@ -8,6 +10,10 @@ use crate::moz_readability::{MetaData, Readability};
 pub type ResourceInfo = (String, Option<String>);
 lazy_static! {
    static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r"(&|<|>)").unwrap();
 }
 pub struct Extractor {
    article: Option<NodeRef>,
    pub img_urls: Vec<ResourceInfo>,
@ -62,22 +68,27 @@ impl Extractor {
    pub async fn download_images(&mut self, article_origin: &Url) -> async_std::io::Result<()> {
        let mut async_download_tasks = Vec::with_capacity(self.img_urls.len());
        self.extract_img_urls();
-        println!("Downloading images to res/");
+        println!("Downloading images...");
        for img_url in &self.img_urls {
            let img_url = img_url.0.clone();
            let abs_url = get_absolute_url(&img_url, article_origin);
            async_download_tasks.push(task::spawn(async move {
-                let mut img_response = surf::get(&abs_url).await.expect("Unable to retrieve file");
+                let mut img_response = surf::Client::new()
                    .with(surf::middleware::Redirect::default())
                    .get(&abs_url)
                    .await
                    .expect("Unable to retrieve file");
                let img_content: Vec<u8> = img_response.body_bytes().await.unwrap();
                let img_mime = img_response
-                    .header("Content-Type")
+                    .content_type()
-                    .map(|header| header.to_string());
+                    .map(|mime| mime.essence().to_string());
                let img_ext = img_response
-                    .header("Content-Type")
+                    .content_type()
-                    .and_then(map_mime_type_to_ext)
+                    .map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
                    .unwrap();
-
+                let mut img_path = std::env::temp_dir();
-                let img_path = format!("res/{}{}", hash_url(&abs_url), &img_ext);
+                img_path.push(format!("{}.{}", hash_url(&abs_url), &img_ext));
                let mut img_file = File::create(&img_path)
                    .await
                    .expect("Unable to create file");
@ -86,7 +97,19 @@ impl Extractor {
                    .await
                    .expect("Unable to save to file");
-                (img_url, img_path, img_mime)
+                (
                    img_url,
                    img_path
                        .file_name()
                        .map(|os_str_name| {
                            os_str_name
                                .to_str()
                                .expect("Unable to get image file name")
                                .to_string()
                        })
                        .unwrap(),
                    img_mime,
                )
            }));
        }
@ -123,21 +146,15 @@ fn hash_url(url: &str) -> String {
    format!("{:x}", md5::compute(url.as_bytes()))
 }
-/// Handles getting the extension from a given MIME type. The extension starts with a dot
+/// Handles getting the extension from a given MIME subtype.
-fn map_mime_type_to_ext(mime_type: &str) -> Option<String> {
+fn map_mime_subtype_to_ext(subtype: &str) -> &str {
-    mime_type
+    if subtype == ("svg+xml") {
        .split("/")
        .last()
        .map(|format| {
            if format == ("svg+xml") {
        return "svg";
-            } else if format == "x-icon" {
+    } else if subtype == "x-icon" {
        "ico"
    } else {
-                format
+        subtype
    }
        })
        .map(|format| String::from(".") + format)
 }
 fn get_absolute_url(url: &str, request_url: &Url) -> String {
@ -158,6 +175,56 @@ fn get_absolute_url(url: &str, request_url: &Url) -> String {
    }
 }
 /// Serializes a `NodeRef` subtree to markup that is XHTML compatible.
 ///
 /// The only DOM nodes serialized are Text and Element nodes; every other
 /// node kind encountered during the traversal is skipped.
 ///
 /// Text content has `&`, `<` and `>` replaced with their entities. Attribute
 /// values get the same treatment and additionally have `"` replaced with
 /// `&quot;`: every value is emitted inside double quotes, so an unescaped
 /// quote would terminate the attribute early and yield malformed markup.
 ///
 /// Every element is written as an explicit start tag / end tag pair; nothing
 /// is emitted in self-closing form by this function.
 ///
 /// # Errors
 /// Returns the error of the first write to `w` that fails.
 pub fn serialize_to_xhtml<W: std::io::Write>(
    node_ref: &NodeRef,
    mut w: &mut W,
 ) -> Result<(), Box<dyn std::error::Error>> {
    // Replacement table for the characters matched by ESC_SEQ_REGEX.
    let mut escape_map = HashMap::new();
    escape_map.insert("<", "&lt;");
    escape_map.insert(">", "&gt;");
    escape_map.insert("&", "&amp;");
    for edge in node_ref.traverse_inclusive() {
        match edge {
            kuchiki::iter::NodeEdge::Start(n) => match n.data() {
                kuchiki::NodeData::Text(rc_text) => {
                    let text = rc_text.borrow();
                    let esc_text = ESC_SEQ_REGEX
                        .replace_all(&text, |captures: &regex::Captures| escape_map[&captures[1]]);
                    write!(&mut w, "{}", esc_text)?;
                }
                kuchiki::NodeData::Element(elem_data) => {
                    let attrs = elem_data.attributes.borrow();
                    // Emit the tag piecewise so no intermediate attribute
                    // string has to be built up by repeated concatenation.
                    write!(&mut w, "<{}", &elem_data.name.local)?;
                    for (k, v) in attrs.map.iter() {
                        // `"` is replaced after the regex pass so the `&` of
                        // `&quot;` is not escaped a second time.
                        let esc_val = ESC_SEQ_REGEX
                            .replace_all(&v.value, |captures: &regex::Captures| {
                                escape_map[&captures[1]]
                            })
                            .replace('"', "&quot;");
                        write!(&mut w, " {}=\"{}\"", k.local, esc_val)?;
                    }
                    write!(&mut w, ">")?;
                }
                _ => (),
            },
            kuchiki::iter::NodeEdge::End(n) => match n.data() {
                kuchiki::NodeData::Element(elem_data) => {
                    write!(&mut w, "</{}>", &elem_data.name.local)?;
                }
                _ => (),
            },
        }
    }
    Ok(())
 }
 #[cfg(test)]
 mod test {
    use super::*;
@ -204,23 +271,15 @@ mod test {
    #[test]
    fn test_map_mime_type_to_ext() {
-        let mime_types = vec![
+        let mime_subtypes = vec![
-            "image/apng",
+            "apng", "bmp", "gif", "x-icon", "jpeg", "png", "svg+xml", "tiff", "webp",
            "image/bmp",
            "image/gif",
            "image/x-icon",
            "image/jpeg",
            "image/png",
            "image/svg+xml",
            "image/tiff",
            "image/webp",
        ];
-        let exts = mime_types
+        let exts = mime_subtypes
            .into_iter()
-            .map(|mime_type| map_mime_type_to_ext(mime_type).unwrap())
+            .map(|mime_type| map_mime_subtype_to_ext(mime_type))
            .collect::<Vec<_>>();
        assert_eq!(
-            vec![".apng", ".bmp", ".gif", ".ico", ".jpeg", ".png", ".svg", ".tiff", ".webp"],
+            vec!["apng", "bmp", "gif", "ico", "jpeg", "png", "svg", "tiff", "webp"],
            exts
        );
    }
--- a/src/main.rs
+++ b/src/main.rs
@ -3,9 +3,8 @@ extern crate lazy_static;
 use std::fs::File;
-use async_std::{fs::create_dir, fs::remove_dir_all, task};
+use async_std::task;
 use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
 use structopt::StructOpt;
 use url::Url;
 mod cli;
@ -14,33 +13,36 @@ mod moz_readability;
 use extractor::Extractor;
 fn main() {
-    let opt = cli::Opts::from_args();
+    let app = cli::cli_init();
-    if !opt.urls.is_empty() {
+    let arg_matches = app.get_matches();
-        println!("Downloading single article");
+    if let Some(vals) = arg_matches.values_of("urls") {
-        download(opt.urls);
+        let urls = vals.map(|val| val.to_string()).collect::<Vec<_>>();
        download(urls);
    }
 }
 type HTMLResource = (String, String);
-async fn fetch_url(url: &str) -> HTMLResource {
+async fn fetch_url(url: &str) -> Result<HTMLResource, Box<dyn std::error::Error>> {
    let client = surf::Client::new();
    println!("Fetching...");
-    // TODO: Add middleware for following redirects
+    let mut res = client
-    (
+        .with(surf::middleware::Redirect::default())
        url.to_string(),
        client
        .get(url)
-            .recv_string()
+        .send()
        .await
-            .expect("Unable to fetch URL"),
+        .expect(&format!("Unable to fetch {}", url));
-    )
+    if res.status() == 200 {
        Ok((url.to_string(), res.body_string().await?))
    } else {
        Err("Request failed to return HTTP 200".into())
    }
 }
 fn download(urls: Vec<String>) {
    let mut async_url_tasks = Vec::with_capacity(urls.len());
    for url in urls {
-        async_url_tasks.push(task::spawn(async move { fetch_url(&url).await }));
+        async_url_tasks.push(task::spawn(async move { fetch_url(&url).await.unwrap() }));
    }
    task::block_on(async {
        for url_task in async_url_tasks {
@ -49,41 +51,35 @@ fn download(urls: Vec<String>) {
            let mut extractor = Extractor::from_html(&html);
            extractor.extract_content(&url);
            if extractor.article().is_some() {
                create_dir("res/")
                    .await
                    .expect("Unable to create res/ output folder");
                extractor
                    .download_images(&Url::parse(&url).unwrap())
                    .await
                    .expect("Unable to download images");
-                let mut out_file =
+                let file_name = format!("{}.epub", extractor.metadata().title());
-                    File::create(format!("{}.epub", extractor.metadata().title())).unwrap();
+                let mut out_file = File::create(&file_name).unwrap();
                let mut html_buf = Vec::new();
-                extractor
+                extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf)
-                    .article()
+                    .expect("Unable to serialize to xhtml");
                    .unwrap()
                    .serialize(&mut html_buf)
                    .expect("Unable to serialize");
                let html_buf = std::str::from_utf8(&html_buf).unwrap();
                let html_buf = moz_readability::regexes::REPLACE_SELF_CLOSING_REGEX
                    .replace_all(html_buf, "$tag/>");
                let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
                if let Some(author) = extractor.metadata().byline() {
-                    epub.metadata("author", author).unwrap();
+                    epub.metadata("author", author.replace("&", "&amp;"))
                        .unwrap();
                }
-                epub.metadata("title", extractor.metadata().title())
+                epub.metadata("title", extractor.metadata().title().replace("&", "&amp;"))
                    .unwrap();
                epub.add_content(EpubContent::new("code.xhtml", html_buf.as_bytes()))
                    .unwrap();
                for img in extractor.img_urls {
-                    let file_path = format!("{}", &img.0);
+                    let mut file_path = std::env::temp_dir();
                    file_path.push(&img.0);
-                    let img_buf = File::open(file_path).expect("Can't read file");
+                    let img_buf = File::open(&file_path).expect("Can't read file");
-                    epub.add_resource(img.0, img_buf, img.1.unwrap()).unwrap();
+                    epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap())
                        .unwrap();
                }
                epub.generate(&mut out_file).unwrap();
-                println!("Cleaning up");
+                println!("Created {:?}", file_name);
                remove_dir_all("res/").await.unwrap();
            }
        }
    })
--- a/src/moz_readability/mod.rs
+++ b/src/moz_readability/mod.rs
@ -653,7 +653,12 @@ impl Readability {
                })
                .map(|node_ref| {
                    let node_attrs = node_ref.attributes.borrow();
-                    Url::parse(node_attrs.get("href").unwrap()).unwrap()
+                    let href = node_attrs.get("href").unwrap();
                    if href.trim() == "/" {
                        document_uri.join("/").unwrap()
                    } else {
                        Url::parse(href).unwrap()
                    }
                })
                .next()
                .unwrap_or(document_uri.clone());
@ -758,14 +763,66 @@ impl Readability {
    }
    /// Converts an inline CSS string to a [HashMap] of property and value(s)
-    fn inline_css_str_to_map(css_str: &str) -> HashMap<&str, &str> {
+    fn inline_css_str_to_map(css_str: &str) -> HashMap<String, String> {
-        css_str
+        enum State {
-            .split(";")
+            ReadProp,
-            .filter(|split_str| !split_str.trim().is_empty())
+            ReadVal,
-            .map(|str_pair| {
+            ReadQuot,
-                let mut vals = str_pair.split(":");
+            ReadDquot,
-                (vals.next().unwrap().trim(), vals.next().unwrap().trim())
+        }
-            })
+        let mut decl: (Option<String>, Option<String>) = (None, None);
        let mut chars = css_str.chars();
        let mut state = State::ReadProp;
        let mut token = String::new();
        let mut tokens = vec![];
        while let Some(c) = chars.next() {
            match state {
                State::ReadProp => {
                    if c != ':' {
                        token.push(c);
                    } else {
                        state = State::ReadVal;
                        decl.0 = Some(token.trim().to_string());
                        token.clear();
                    }
                }
                State::ReadVal => {
                    if c == '\'' {
                        state = State::ReadQuot;
                        token.push(c);
                    } else if c == '"' {
                        state = State::ReadDquot;
                        token.push(c);
                    } else if c == ';' {
                        state = State::ReadProp;
                        decl.1 = Some(token.trim().to_string());
                        tokens.push(decl.clone());
                        token.clear();
                    } else {
                        token.push(c);
                    }
                }
                State::ReadQuot => {
                    token.push(c);
                    if c == '\'' {
                        state = State::ReadVal;
                    }
                }
                State::ReadDquot => {
                    token.push(c);
                    if c == '"' {
                        state = State::ReadVal;
                    }
                }
            }
        }
        if !token.is_empty() {
            decl.1 = Some(token.trim().to_string());
            tokens.push(decl);
        }
        tokens
            .into_iter()
            .map(|tok_pair| (tok_pair.0.unwrap(), tok_pair.1.unwrap()))
            .collect()
    }
@ -2394,18 +2451,19 @@ mod test {
        use std::collections::HashMap;
        let css_str = "display: flex; height: 200px; width: 250px; justify-content: center; align-items: center; border: 2px solid black";
        let mut css_map = HashMap::new();
-        css_map.insert("display", "flex");
+        css_map.insert("display".to_string(), "flex".to_string());
-        css_map.insert("height", "200px");
+        css_map.insert("height".to_string(), "200px".to_string());
-        css_map.insert("width", "250px");
+        css_map.insert("width".to_string(), "250px".to_string());
-        css_map.insert("justify-content", "center");
+        css_map.insert("justify-content".to_string(), "center".to_string());
-        css_map.insert("align-items", "center");
+        css_map.insert("align-items".to_string(), "center".to_string());
-        css_map.insert("border", "2px solid black");
+        css_map.insert("border".to_string(), "2px solid black".to_string());
        let css_str_to_vec = Readability::inline_css_str_to_map(css_str);
        assert_eq!(css_map, css_str_to_vec);
        let mut css_map = HashMap::new();
-        css_map.insert("color", "red");
+        css_map.insert("color".to_string(), "red".to_string());
-        assert_eq!(css_map, Readability::inline_css_str_to_map("color: red;"));
+        css_map.insert("background-image".to_string(), "url('data:image/jpeg;base64,/wgARCAALABQDASIAAhEBAxEB/8QAFwABAQEBAAAAAAAAAAAAAAAAAgADBP/')".to_string());
        assert_eq!(css_map, Readability::inline_css_str_to_map("color: red;background-image: url('data:image/jpeg;base64,/wgARCAALABQDASIAAhEBAxEB/8QAFwABAQEBAAAAAAAAAAAAAAAAAgADBP/')"));
    }
    #[test]
--- a/src/moz_readability/regexes.rs
+++ b/src/moz_readability/regexes.rs
@ -132,8 +132,4 @@ lazy_static! {
    pub static ref REPLACE_END_SEPARATOR_REGEX: Regex =
        Regex::new(r"(?i)[^\|\-\\/>»]*[\|\-\\/>»](?P<end>.*)").unwrap();
    pub static ref REPLACE_MULTI_SEPARATOR_REGEX: Regex = Regex::new(r"[\|\-\\/>»]+").unwrap();
    pub static ref REPLACE_SELF_CLOSING_REGEX: Regex = Regex::new(
        r#"(?P<tag><(?:area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr)(?: [a-z\-]+=["'][\sa-zA-Z0-9\./\-_#]+["']|[a-z\-]+)*)>"#
    )
    .unwrap();
 }