Merge pull request #6 from hipstermojo/dev

Update to 0.3.0
2021-02-24 13:13:36 +03:00 · 2021-02-24 13:13:36 +03:00 · e9f96d2970
commit e9f96d2970
parent c82071a871 165b2187be
9 changed files with 520 additions and 283 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -187,7 +187,7 @@ dependencies = [
 "memchr",
 "num_cpus",
 "once_cell",
- "pin-project-lite",
+ "pin-project-lite 0.1.11",
 "pin-utils",
 "slab",
 "wasm-bindgen-futures",
@ -684,25 +684,52 @@ dependencies = [
 ]

 [[package]]
-name = "futures-channel"
-version = "0.3.8"
+name = "futures"
+version = "0.3.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4b7109687aa4e177ef6fe84553af6280ef2778bdb7783ba44c9dc3399110fe64"
+checksum = "da9052a1a50244d8d5aa9bf55cbc2fb6f357c86cc52e46c62ed390a7180cf150"
+dependencies = [
+ "futures-channel",
+ "futures-core",
+ "futures-executor",
+ "futures-io",
+ "futures-sink",
+ "futures-task",
+ "futures-util",
+]
+
+[[package]]
+name = "futures-channel"
+version = "0.3.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f2d31b7ec7efab6eefc7c57233bb10b847986139d88cc2f5a02a1ae6871a1846"
 dependencies = [
 "futures-core",
+ "futures-sink",
 ]

 [[package]]
 name = "futures-core"
-version = "0.3.8"
+version = "0.3.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "847ce131b72ffb13b6109a221da9ad97a64cbe48feb1028356b836b47b8f1748"
+checksum = "79e5145dde8da7d1b3892dad07a9c98fc04bc39892b1ecc9692cf53e2b780a65"
+
+[[package]]
+name = "futures-executor"
+version = "0.3.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e9e59fdc009a4b3096bf94f740a0f2424c082521f20a9b08c5c07c48d90fd9b9"
+dependencies = [
+ "futures-core",
+ "futures-task",
+ "futures-util",
+]

 [[package]]
 name = "futures-io"
-version = "0.3.8"
+version = "0.3.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "611834ce18aaa1bd13c4b374f5d653e1027cf99b6b502584ff8c9a64413b30bb"
+checksum = "28be053525281ad8259d47e4de5de657b25e7bac113458555bb4b70bc6870500"

 [[package]]
 name = "futures-lite"
@ -715,15 +742,15 @@ dependencies = [
 "futures-io",
 "memchr",
 "parking",
- "pin-project-lite",
+ "pin-project-lite 0.1.11",
 "waker-fn",
 ]

 [[package]]
 name = "futures-macro"
-version = "0.3.8"
+version = "0.3.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "77408a692f1f97bcc61dc001d752e00643408fbc922e4d634c655df50d595556"
+checksum = "c287d25add322d9f9abdcdc5927ca398917996600182178774032e9f8258fedd"
 dependencies = [
 "proc-macro-hack",
 "proc-macro2",
@ -733,31 +760,33 @@ dependencies = [

 [[package]]
 name = "futures-sink"
-version = "0.3.8"
+version = "0.3.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f878195a49cee50e006b02b93cf7e0a95a38ac7b776b4c4d9cc1207cd20fcb3d"
+checksum = "caf5c69029bda2e743fddd0582d1083951d65cc9539aebf8812f36c3491342d6"

 [[package]]
 name = "futures-task"
-version = "0.3.8"
+version = "0.3.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7c554eb5bf48b2426c4771ab68c6b14468b6e76cc90996f528c3338d761a4d0d"
+checksum = "13de07eb8ea81ae445aca7b69f5f7bf15d7bf4912d8ca37d6645c77ae8a58d86"
 dependencies = [
 "once_cell",
 ]

 [[package]]
 name = "futures-util"
-version = "0.3.8"
+version = "0.3.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d304cff4a7b99cfb7986f7d43fbe93d175e72e704a8860787cc95e9ffd85cbd2"
+checksum = "632a8cd0f2a4b3fdea1657f08bde063848c3bd00f9bbf6e256b8be78802e624b"
 dependencies = [
+ "futures-channel",
 "futures-core",
 "futures-io",
 "futures-macro",
+ "futures-sink",
 "futures-task",
 "memchr",
- "pin-project 1.0.2",
+ "pin-project-lite 0.2.4",
 "pin-utils",
 "proc-macro-hack",
 "proc-macro-nested",
@ -911,7 +940,7 @@ dependencies = [
 "cookie",
 "futures-lite",
 "infer",
- "pin-project-lite",
+ "pin-project-lite 0.1.11",
 "rand 0.7.3",
 "serde",
 "serde_json",
@ -1242,11 +1271,12 @@ dependencies = [

 [[package]]
 name = "paperoni"
-version = "0.2.2-alpha1"
+version = "0.3.0-alpha1"
 dependencies = [
 "async-std",
 "clap",
 "epub-builder",
+ "futures",
 "html5ever",
 "kuchiki",
 "lazy_static",
@ -1328,16 +1358,7 @@ version = "0.4.27"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2ffbc8e94b38ea3d2d8ba92aea2983b503cd75d0888d75b86bb37970b5698e15"
 dependencies = [
- "pin-project-internal 0.4.27",
-]
-
-[[package]]
-name = "pin-project"
-version = "1.0.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9ccc2237c2c489783abd8c4c80e5450fc0e98644555b1364da68cc29aa151ca7"
-dependencies = [
- "pin-project-internal 1.0.2",
+ "pin-project-internal",
 ]

 [[package]]
@ -1351,23 +1372,18 @@ dependencies = [
 "syn",
 ]

-[[package]]
-name = "pin-project-internal"
-version = "1.0.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f8e8d2bf0b23038a4424865103a4df472855692821aab4e4f5c3312d461d9e5f"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn",
-]
-
 [[package]]
 name = "pin-project-lite"
 version = "0.1.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c917123afa01924fc84bb20c4c03f004d9c38e5127e3c039bbf7f4b9c76a2f6b"

+[[package]]
+name = "pin-project-lite"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "439697af366c49a6d0a010c56a0d97685bc140ce0d377b13a2ea2aa42d64a827"
+
 [[package]]
 name = "pin-utils"
 version = "0.1.0"
@ -1889,7 +1905,7 @@ dependencies = [
 "log 0.4.11",
 "mime_guess",
 "once_cell",
- "pin-project-lite",
+ "pin-project-lite 0.1.11",
 "serde",
 "serde_json",
 "web-sys",
@ -2043,7 +2059,7 @@ checksum = "b0987850db3733619253fe60e17cb59b82d37c7e6c0236bb81e4d6b87c879f27"
 dependencies = [
 "cfg-if 0.1.10",
 "log 0.4.11",
- "pin-project-lite",
+ "pin-project-lite 0.1.11",
 "tracing-attributes",
 "tracing-core",
 ]
@ -2074,7 +2090,7 @@ version = "0.2.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ab7bb6f14721aa00656086e9335d363c5c8747bae02ebe32ea2c7dece5689b4c"
 dependencies = [
- "pin-project 0.4.27",
+ "pin-project",
 "tracing",
 ]

--- a/Cargo.toml
+++ b/Cargo.toml
@ -3,7 +3,7 @@ description = "A web article downloader"
 homepage = "https://github.com/hipstermojo/paperoni"
 repository = "https://github.com/hipstermojo/paperoni"
 name = "paperoni"
-version = "0.2.2-alpha1"
+version = "0.3.0-alpha1"
 authors = ["Kenneth Gitere <gitere81@gmail.com>"]
 edition = "2018"
 license = "MIT"
@ -15,6 +15,7 @@ readme = "README.md"
 async-std = "1.7.0"
 clap = "2.33.3"
 epub-builder = "0.4.8"
+futures = "0.3.12"
 html5ever = "0.25.1"
 kuchiki = "0.8.1"
 lazy_static = "1.4.0"
--- a/README.md
+++ b/README.md
@ -12,12 +12,24 @@ Paperoni is a web article downloader written in Rust. The downloaded articles ar
 paperoni https://en.wikipedia.org/wiki/Pepperoni
 ```

-Paperoni also supports passing multiple links as arguments. If you are on a Unix-like OS, you can simply do something like this:
+Paperoni also supports passing multiple links as arguments.
+
+```sh
+paperoni https://en.wikipedia.org/wiki/Pepperoni https://en.wikipedia.org/wiki/Salami
+```
+
+Alternatively, if you are on a Unix-like OS, you can simply do something like this:

 ```sh
 cat links.txt | xargs paperoni
 ```

+Links can also be read from a file, one link per line, using the `-f` flag.
+
+```sh
+paperoni -f links.txt
+```
+
 ## How it works

 The URL passed to Paperoni is fetched and the returned HTML response is passed to the extractor.
@ -27,11 +39,11 @@ This extractor retrieves a possible article using a port of the [Mozilla Readabi

 ## How it (currently) doesn't work

-This program is still in alpha so a number of things currently break:
+This program is still in alpha so a number of things won't work:

- Certain links with redirects can't be extracted. Such links include urls that are proxying Medium.
 - Websites that only run with JavaScript cannot be extracted.
 - Website articles that cannot be extracted by Readability cannot be extracted by Paperoni either.
+- Code snippets on Medium articles that are lazy loaded will not appear in the EPUB.

 ## Running locally

--- a/src/cli.rs
+++ b/src/cli.rs
@ -1,12 +1,14 @@
+use std::{fs::File, io::Read};
+
 use clap::{App, AppSettings, Arg};

-pub fn cli_init() -> App<'static, 'static> {
-    App::new("paperoni")
+pub fn cli_init() -> AppConfig {
+    let app = App::new("paperoni")
        .settings(&[
            AppSettings::ArgRequiredElseHelp,
            AppSettings::UnifiedHelpMessage,
        ])
-        .version("0.2.2-alpha1")
+        .version("0.3.0-alpha1")
        .about(
            "
 Paperoni is an article downloader.
@ -18,4 +20,104 @@ It takes a url and downloads the article content from it and saves it to an epub
                .help("Urls of web articles")
                .multiple(true),
        )
+        .arg(
+            Arg::with_name("file")
+                .short("f")
+                .long("file")
+                .help("Input file containing links")
+                .takes_value(true),
+        )
+        .arg(
+            Arg::with_name("output_name")
+                .long("merge")
+                .help("Merge multiple articles into a single epub")
+                .long_help("Merge multiple articles into a single epub that will be given the name provided")
+                .takes_value(true),
+        ).arg(
+            Arg::with_name("max_conn")
+                .long("max_conn")
+                .help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8")
+                .long_help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8.\nNOTE: It is advised to use as few connections as needed i.e between 1 and 50. Using more connections can end up overloading your network card with too many concurrent requests.")
+                .takes_value(true));
+    let arg_matches = app.get_matches();
+    let mut urls: Vec<String> = match arg_matches.value_of("file") {
+        Some(file_name) => {
+            if let Ok(mut file) = File::open(file_name) {
+                let mut content = String::new();
+                match file.read_to_string(&mut content) {
+                    Ok(_) => content
+                        .lines()
+                        .filter(|line| !line.is_empty())
+                        .map(|line| line.to_owned())
+                        .collect(),
+                    Err(_) => vec![],
+                }
+            } else {
+                println!("Unable to open file: {}", file_name);
+                vec![]
+            }
+        }
+        None => vec![],
+    };
+
+    if let Some(vals) = arg_matches.values_of("urls") {
+        urls.extend(
+            vals.filter(|val| !val.is_empty())
+                .map(|val| val.to_string()),
+        );
+    }
+
+    let max_conn = arg_matches
+        .value_of("max_conn")
+        .map(|conn_str| conn_str.parse::<usize>().ok())
+        .flatten()
+        .map(|max| if max > 0 { max } else { 1 })
+        .unwrap_or(8);
+
+    let mut app_config = AppConfig::new(max_conn);
+    app_config.set_urls(urls);
+    if let Some(name) = arg_matches.value_of("output_name") {
+        let file_name = if name.ends_with(".epub") && name.len() > 5 {
+            name.to_owned()
+        } else {
+            name.to_owned() + ".epub"
+        };
+        app_config.set_merged(file_name);
+    }
+    app_config
+}
+
+/// Runtime settings assembled by `cli_init` from the parsed command-line arguments.
+pub struct AppConfig {
+    /// Article links collected from the input file and the positional arguments.
+    urls: Vec<String>,
+    /// Limit on how many article downloads are in flight at once.
+    max_conn: usize,
+    /// Output file name of the merged epub, if merging was requested.
+    merged: Option<String>,
+}
+
+impl AppConfig {
+    /// Creates a configuration with the given connection limit, no links and no merged output.
+    fn new(max_conn: usize) -> Self {
+        let urls = Vec::new();
+        let merged = None;
+        Self {
+            urls,
+            max_conn,
+            merged,
+        }
+    }
+
+    /// Adds `urls` to the end of the stored links; links already present are kept.
+    fn set_urls(&mut self, mut urls: Vec<String>) {
+        self.urls.append(&mut urls);
+    }
+
+    /// Records the file name that the merged epub should be written to.
+    fn set_merged(&mut self, name: String) {
+        self.merged.replace(name);
+    }
+
+    /// The links of the articles to download.
+    pub fn urls(&self) -> &Vec<String> {
+        &self.urls
+    }
+
+    /// The maximum number of concurrent connections.
+    pub fn max_conn(&self) -> usize {
+        self.max_conn
+    }
+
+    /// The merged epub's file name, or `None` when articles are exported separately.
+    pub fn merged(&self) -> Option<&String> {
+        self.merged.as_ref()
+    }
 }
--- a/src/epub.rs
+++ b/src/epub.rs
@ -0,0 +1,113 @@
+use std::fs::File;
+
+use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
+
+use crate::extractor::{self, Extractor};
+
+/// Writes the extracted articles to disk as EPUB files.
+///
+/// When `merged` holds a file name, every article becomes one chapter
+/// (`article_<idx>.xhtml`) of a single EPUB written to that name. Otherwise each article is
+/// written to its own `<title>.epub`, with `/` and `\` in the title replaced by spaces.
+/// Images are read from the system temp directory using the file names stored in each
+/// article's `img_urls`.
+///
+/// # Panics
+///
+/// Panics if an article has no extracted content, if an image has no MIME type or its file
+/// cannot be opened, or if building or writing an EPUB fails.
+pub fn generate_epubs(articles: Vec<Extractor>, merged: Option<&String>) {
+    match merged {
+        Some(name) => {
+            let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
+            epub.inline_toc();
+            // The book title is the same for every chapter, so it is set once here instead
+            // of being overwritten on each iteration.
+            epub.metadata("title", replace_metadata_value(name))
+                .unwrap();
+
+            for (idx, article) in articles.iter().enumerate() {
+                let mut html_buf = Vec::new();
+                extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)
+                    .expect("Unable to serialize to xhtml");
+                let html_str = std::str::from_utf8(&html_buf).unwrap();
+                let section_name = article.metadata().title();
+                epub.add_content(
+                    EpubContent::new(format!("article_{}.xhtml", idx), html_str.as_bytes())
+                        .title(replace_metadata_value(section_name)),
+                )
+                .unwrap();
+
+                for img in &article.img_urls {
+                    let mut file_path = std::env::temp_dir();
+                    file_path.push(&img.0);
+
+                    let img_buf = File::open(&file_path).expect("Can't read file");
+                    epub.add_resource(
+                        file_path.file_name().unwrap(),
+                        img_buf,
+                        img.1.as_ref().unwrap(),
+                    )
+                    .unwrap();
+                }
+            }
+
+            let mut out_file = File::create(name).unwrap();
+            epub.generate(&mut out_file).unwrap();
+            println!("Created {:?}", name);
+        }
+        None => {
+            for article in articles {
+                let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
+                let file_name = format!(
+                    "{}.epub",
+                    article
+                        .metadata()
+                        .title()
+                        .replace("/", " ")
+                        .replace("\\", " ")
+                );
+                let mut out_file = File::create(&file_name).unwrap();
+                let mut html_buf = Vec::new();
+                extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)
+                    .expect("Unable to serialize to xhtml");
+                let html_str = std::str::from_utf8(&html_buf).unwrap();
+                if let Some(author) = article.metadata().byline() {
+                    epub.metadata("author", replace_metadata_value(author))
+                        .unwrap();
+                }
+                epub.metadata("title", replace_metadata_value(article.metadata().title()))
+                    .unwrap();
+                epub.add_content(EpubContent::new("index.xhtml", html_str.as_bytes()))
+                    .unwrap();
+                // Consumes the image list; `article` is not used after this loop.
+                for img in article.img_urls {
+                    let mut file_path = std::env::temp_dir();
+                    file_path.push(&img.0);
+
+                    let img_buf = File::open(&file_path).expect("Can't read file");
+                    epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap())
+                        .unwrap();
+                }
+                epub.generate(&mut out_file).unwrap();
+                println!("Created {:?}", file_name);
+            }
+        }
+    }
+}
+
+/// Escapes the characters `&`, `<` and `>` so the value can be stored in the epub's metadata.
+///
+/// All other characters are copied through unchanged.
+fn replace_metadata_value(value: &str) -> String {
+    let mut escaped = String::with_capacity(value.len());
+    for ch in value.chars() {
+        match ch {
+            '&' => escaped.push_str("&amp;"),
+            '<' => escaped.push_str("&lt;"),
+            '>' => escaped.push_str("&gt;"),
+            other => escaped.push(other),
+        }
+    }
+    escaped
+}
+
+#[cfg(test)]
+mod test {
+    use super::replace_metadata_value;
+
+    /// Covers plain text and each of the three escaped characters.
+    #[test]
+    fn test_replace_metadata_value() {
+        let mut value = "Lorem ipsum";
+        assert_eq!(replace_metadata_value(value), "Lorem ipsum");
+        value = "Memory safe > memory unsafe";
+        assert_eq!(
+            replace_metadata_value(value),
+            "Memory safe &gt; memory unsafe"
+        );
+        value = "Author Name <author@mail.example>";
+        assert_eq!(
+            replace_metadata_value(value),
+            "Author Name &lt;author@mail.example&gt;"
+        );
+        // Ampersands must be escaped exactly once, including ones that already look like
+        // the start of an entity.
+        value = "Questions & Answers";
+        assert_eq!(replace_metadata_value(value), "Questions &amp; Answers");
+        value = "a &lt; b";
+        assert_eq!(replace_metadata_value(value), "a &amp;lt; b");
+    }
+}
--- a/src/extractor.rs
+++ b/src/extractor.rs
@ -1,10 +1,6 @@
 use std::collections::HashMap;

-use async_std::fs::File;
-use async_std::io::prelude::*;
-use async_std::task;
 use kuchiki::{traits::*, NodeRef};
-use url::Url;

 use crate::moz_readability::{MetaData, Readability};

@ -51,8 +47,8 @@ impl Extractor {
    }

    /// Traverses the DOM tree of the content and retrieves the IMG URLs
-    fn extract_img_urls(&mut self) {
-        if let Some(content_ref) = &self.readability.article_node {
+    pub fn extract_img_urls(&mut self) {
+        if let Some(content_ref) = &self.article {
            for img_ref in content_ref.select("img").unwrap() {
                img_ref.as_node().as_element().map(|img_elem| {
                    img_elem.attributes.borrow().get("src").map(|img_url| {
@ -65,80 +61,6 @@ impl Extractor {
        }
    }

-    pub async fn download_images(&mut self, article_origin: &Url) -> async_std::io::Result<()> {
-        let mut async_download_tasks = Vec::with_capacity(self.img_urls.len());
-        self.extract_img_urls();
-        if self.img_urls.len() > 0 {
-            println!("Downloading images...");
-        }
-        for img_url in &self.img_urls {
-            let img_url = img_url.0.clone();
-            let abs_url = get_absolute_url(&img_url, article_origin);
-
-            async_download_tasks.push(task::spawn(async move {
-                let mut img_response = surf::Client::new()
-                    // The middleware has been temporarily commented out because it happens
-                    // to affect downloading images when there is no redirecting
-                    // .with(surf::middleware::Redirect::default())
-                    .get(&abs_url)
-                    .await
-                    .expect("Unable to retrieve file");
-                let img_content: Vec<u8> = img_response.body_bytes().await.unwrap();
-                let img_mime = img_response
-                    .content_type()
-                    .map(|mime| mime.essence().to_string());
-                let img_ext = img_response
-                    .content_type()
-                    .map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
-                    .unwrap();
-                let mut img_path = std::env::temp_dir();
-                img_path.push(format!("{}.{}", hash_url(&abs_url), &img_ext));
-                let mut img_file = File::create(&img_path)
-                    .await
-                    .expect("Unable to create file");
-                img_file
-                    .write_all(&img_content)
-                    .await
-                    .expect("Unable to save to file");
-
-                (
-                    img_url,
-                    img_path
-                        .file_name()
-                        .map(|os_str_name| {
-                            os_str_name
-                                .to_str()
-                                .expect("Unable to get image file name")
-                                .to_string()
-                        })
-                        .unwrap(),
-                    img_mime,
-                )
-            }));
-        }
-
-        self.img_urls.clear();
-
-        for async_task in async_download_tasks {
-            let (img_url, img_path, img_mime) = async_task.await;
-            // Update the image sources
-            let img_ref = self
-                .readability
-                .article_node
-                .as_mut()
-                .expect("Unable to get mutable ref")
-                .select_first(&format!("img[src='{}']", img_url))
-                .expect("Image node does not exist");
-            let mut img_node = img_ref.attributes.borrow_mut();
-            *img_node.get_mut("src").unwrap() = img_path.clone();
-            // srcset is removed because readers such as Foliate then fail to display
-            // the image already downloaded and stored in src
-            img_node.remove("srcset");
-            self.img_urls.push((img_path, img_mime));
-        }
-        Ok(())
-    }
-
    pub fn article(&self) -> Option<&NodeRef> {
        self.article.as_ref()
    }
@ -148,40 +70,6 @@ impl Extractor {
    }
 }

-/// Utility for hashing URLs. This is used to help store files locally with unique values
-fn hash_url(url: &str) -> String {
-    format!("{:x}", md5::compute(url.as_bytes()))
-}
-
-/// Handles getting the extension from a given MIME subtype.
-fn map_mime_subtype_to_ext(subtype: &str) -> &str {
-    if subtype == ("svg+xml") {
-        return "svg";
-    } else if subtype == "x-icon" {
-        "ico"
-    } else {
-        subtype
-    }
-}
-
-fn get_absolute_url(url: &str, request_url: &Url) -> String {
-    if Url::parse(url).is_ok() {
-        url.to_owned()
-    } else if url.starts_with("/") {
-        Url::parse(&format!(
-            "{}://{}",
-            request_url.scheme(),
-            request_url.host_str().unwrap()
-        ))
-        .unwrap()
-        .join(url)
-        .unwrap()
-        .into_string()
-    } else {
-        request_url.join(url).unwrap().into_string()
-    }
-}
-
 /// Serializes a NodeRef to a string that is XHTML compatible
 /// The only DOM nodes serialized are Text and Element nodes
 pub fn serialize_to_xhtml<W: std::io::Write>(
@ -278,19 +166,4 @@ mod test {
            extractor.img_urls
        );
    }
-
-    #[test]
-    fn test_map_mime_type_to_ext() {
-        let mime_subtypes = vec![
-            "apng", "bmp", "gif", "x-icon", "jpeg", "png", "svg+xml", "tiff", "webp",
-        ];
-        let exts = mime_subtypes
-            .into_iter()
-            .map(|mime_type| map_mime_subtype_to_ext(mime_type))
-            .collect::<Vec<_>>();
-        assert_eq!(
-            vec!["apng", "bmp", "gif", "ico", "jpeg", "png", "svg", "tiff", "webp"],
-            exts
-        );
-    }
 }
--- a/src/http.rs
+++ b/src/http.rs
@ -0,0 +1,188 @@
+use async_std::io::prelude::*;
+use async_std::{fs::File, stream};
+use futures::StreamExt;
+use url::Url;
+
+use crate::extractor::Extractor;
+
+type HTMLResource = (String, String);
+
+/// Fetches the HTML document at `url`, following redirects manually.
+///
+/// At most five requests are made. On success the returned pair is the final URL (after any
+/// redirects) and the response body.
+///
+/// # Errors
+///
+/// Returns an error if `url` or a `Location` header cannot be parsed, if a request fails,
+/// if the final status is neither a redirect nor a success, if the response has no content
+/// type or one other than `text/html`, or if the request limit is reached.
+pub async fn fetch_url(
+    url: &str,
+) -> Result<HTMLResource, Box<dyn std::error::Error + Send + Sync>> {
+    // Upper bound on requests so a redirect loop cannot run forever.
+    const MAX_REQUESTS: u8 = 5;
+
+    let client = surf::Client::new();
+    println!("Fetching...");
+
+    let mut url = Url::parse(url)?;
+    for _ in 0..MAX_REQUESTS {
+        let req = surf::get(&url);
+        let mut res = client.send(req).await?;
+        let status = res.status();
+        if status.is_redirection() {
+            if let Some(location) = res.header(surf::http::headers::LOCATION) {
+                // Resolve against the URL that issued this redirect, not the original one:
+                // a relative Location after a cross-host redirect belongs to the new host.
+                // `join` also accepts absolute URLs, which simply replace the base.
+                url = url.join(location.last().as_str())?;
+            }
+        } else if status.is_success() {
+            return match res.content_type() {
+                Some(mime) if mime.essence() == "text/html" => {
+                    Ok((url.to_string(), res.body_string().await?))
+                }
+                Some(mime) => Err(format!(
+                    "Invalid HTTP response. Received {} instead of text/html",
+                    mime.essence()
+                )
+                .into()),
+                None => Err("Unknown HTTP response".into()),
+            };
+        } else {
+            return Err(format!("Request failed: HTTP {}", status).into());
+        }
+    }
+    Err("Unable to fetch HTML".into())
+}
+
+/// Downloads every image referenced by the extracted article and rewrites the article to
+/// point at the local copies.
+///
+/// Each entry of `extractor.img_urls` is resolved against `article_origin`, fetched (up to
+/// ten requests at a time) and saved in the system temp directory as
+/// `<md5 of absolute url>.<extension>`. The matching `<img>` element then gets its `src`
+/// set to that file name and its `srcset` removed. Finally `extractor.img_urls` is replaced
+/// with `(file name, MIME type)` pairs.
+///
+/// # Panics
+///
+/// Panics if an image cannot be retrieved, has no content type, cannot be written to disk,
+/// or if the `<img>` element for a URL can no longer be found in the article.
+pub async fn download_images(
+    extractor: &mut Extractor,
+    article_origin: &Url,
+) -> async_std::io::Result<()> {
+    if !extractor.img_urls.is_empty() {
+        println!("Downloading images...");
+    }
+
+    let imgs_req_iter = extractor
+        .img_urls
+        .iter()
+        .map(|(url, _)| {
+            let abs_url = get_absolute_url(url, article_origin);
+            let req = surf::Client::new().get(&abs_url);
+            (url, abs_url, req)
+        })
+        .map(|(url, abs_url, req)| async move {
+            let mut img_response = req.await.expect("Unable to retrieve image");
+            let img_content: Vec<u8> = img_response.body_bytes().await.unwrap();
+            let img_mime = img_response
+                .content_type()
+                .map(|mime| mime.essence().to_string());
+            let img_ext = img_response
+                .content_type()
+                .map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
+                .unwrap();
+
+            // Hash the absolute URL rather than the raw `src` value: relative paths such as
+            // `/logo.png` are shared by unrelated sites and would otherwise map to the same
+            // temp file.
+            let mut img_path = std::env::temp_dir();
+            img_path.push(format!("{}.{}", hash_url(&abs_url), &img_ext));
+            let mut img_file = File::create(&img_path)
+                .await
+                .expect("Unable to create file");
+            img_file
+                .write_all(&img_content)
+                .await
+                .expect("Unable to save to file");
+
+            (
+                url,
+                img_path
+                    .file_name()
+                    .map(|os_str_name| {
+                        os_str_name
+                            .to_str()
+                            .expect("Unable to get image file name")
+                            .to_string()
+                    })
+                    .unwrap(),
+                img_mime,
+            )
+        });
+
+    // A utility closure used to update the value of an image source once its download has
+    // succeeded
+    let replace_existing_img_src =
+        |img_item: (&String, String, Option<String>)| -> (String, Option<String>) {
+            let (img_url, img_path, img_mime) = img_item;
+            let img_ref = extractor
+                .article()
+                .expect("Unable to get mutable ref")
+                .select_first(&format!("img[src='{}']", img_url))
+                .expect("Image node does not exist");
+            let mut img_node = img_ref.attributes.borrow_mut();
+            *img_node.get_mut("src").unwrap() = img_path.clone();
+            // srcset is removed because readers such as Foliate then fail to display
+            // the image already downloaded and stored in src
+            img_node.remove("srcset");
+            (img_path, img_mime)
+        };
+
+    extractor.img_urls = stream::from_iter(imgs_req_iter)
+        .buffered(10)
+        .collect::<Vec<_>>()
+        .await
+        .into_iter()
+        .map(replace_existing_img_src)
+        .collect();
+    Ok(())
+}
+
+/// Maps a MIME subtype to the file extension used when saving the image.
+///
+/// `svg+xml` becomes `svg` and `x-icon` becomes `ico`; any other subtype is returned as is.
+fn map_mime_subtype_to_ext(subtype: &str) -> &str {
+    match subtype {
+        "svg+xml" => "svg",
+        "x-icon" => "ico",
+        other => other,
+    }
+}
+
+/// Returns the MD5 digest of `url` as a lowercase hexadecimal string.
+///
+/// The digest is used as a file name when storing downloaded files locally.
+fn hash_url(url: &str) -> String {
+    let digest = md5::compute(url.as_bytes());
+    format!("{:x}", digest)
+}
+
+/// Resolves an image `url` found in an article into an absolute URL.
+///
+/// A `url` that already parses as an absolute URL is returned unchanged. Anything else
+/// (root-relative, protocol-relative or path-relative) is resolved against `request_url`,
+/// which keeps the scheme, host and port of the page the article was fetched from.
+///
+/// # Panics
+///
+/// Panics if `url` cannot be resolved against `request_url`.
+fn get_absolute_url(url: &str, request_url: &Url) -> String {
+    if Url::parse(url).is_ok() {
+        return url.to_owned();
+    }
+    // `join` already handles a leading `/` by replacing the whole path, so the base does
+    // not need to be rebuilt by hand (doing so dropped the port and required a host).
+    request_url
+        .join(url)
+        .expect("Unable to resolve image URL against the article URL")
+        .into_string()
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    /// Checks the extension chosen for each supported image MIME subtype.
+    #[test]
+    fn test_map_mime_type_to_ext() {
+        let expected_exts = vec![
+            "apng", "bmp", "gif", "ico", "jpeg", "png", "svg", "tiff", "webp",
+        ];
+        let mut actual_exts = Vec::new();
+        for mime_subtype in vec![
+            "apng", "bmp", "gif", "x-icon", "jpeg", "png", "svg+xml", "tiff", "webp",
+        ] {
+            actual_exts.push(map_mime_subtype_to_ext(mime_subtype));
+        }
+        assert_eq!(expected_exts, actual_exts);
+    }
+}
--- a/src/main.rs
+++ b/src/main.rs
@ -1,129 +1,56 @@
 #[macro_use]
 extern crate lazy_static;

-use std::fs::File;
-
+use async_std::stream;
 use async_std::task;
-use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
+use futures::stream::StreamExt;
 use url::Url;

 mod cli;
+mod epub;
 mod extractor;
+/// This module is responsible for async HTTP calls for downloading
+/// the HTML content and images
+mod http;
 mod moz_readability;

+use cli::AppConfig;
+use epub::generate_epubs;
 use extractor::Extractor;
+use http::{download_images, fetch_url};
+
 fn main() {
-    let app = cli::cli_init();
-    let arg_matches = app.get_matches();
-    if let Some(vals) = arg_matches.values_of("urls") {
-        let urls = vals.map(|val| val.to_string()).collect::<Vec<_>>();
-        download(urls);
+    let app_config = cli::cli_init();
+
+    if !app_config.urls().is_empty() {
+        download(app_config);
    }
 }

-type HTMLResource = (String, String);
-
-async fn fetch_url(url: &str) -> Result<HTMLResource, Box<dyn std::error::Error + Send + Sync>> {
-    let client = surf::Client::new();
-    println!("Fetching...");
-
-    let mut redirect_count: u8 = 0;
-    let base_url = Url::parse(&url)?;
-    let mut url = base_url.clone();
-    while redirect_count < 5 {
-        redirect_count += 1;
-        let req = surf::get(&url);
-        let mut res = client.send(req).await?;
-        if res.status().is_redirection() {
-            if let Some(location) = res.header(surf::http::headers::LOCATION) {
-                match Url::parse(location.last().as_str()) {
-                    Ok(valid_url) => url = valid_url,
-                    Err(e) => match e {
-                        url::ParseError::RelativeUrlWithoutBase => {
-                            url = base_url.join(location.last().as_str())?
-                        }
-                        e => return Err(e.into()),
-                    },
-                };
-            }
-        } else if res.status().is_success() {
-            if let Some(mime) = res.content_type() {
-                if mime.essence() == "text/html" {
-                    return Ok((url.to_string(), res.body_string().await?));
-                } else {
-                    return Err(format!(
-                        "Invalid HTTP response. Received {} instead of text/html",
-                        mime.essence()
-                    )
-                    .into());
-                }
-            } else {
-                return Err("Unknown HTTP response".into());
-            }
-        } else {
-            return Err(format!("Request failed: HTTP {}", res.status()).into());
-        }
-    }
-    Err("Unable to fetch HTML".into())
-}
-
-fn download(urls: Vec<String>) {
-    let mut async_url_tasks = Vec::with_capacity(urls.len());
-    for url in urls {
-        async_url_tasks.push(task::spawn(async move { fetch_url(&url).await }));
-    }
-    task::block_on(async {
-        for url_task in async_url_tasks {
-            match url_task.await {
+fn download(app_config: AppConfig) {
+    let articles = task::block_on(async {
+        let urls_iter = app_config.urls().iter().map(|url| fetch_url(url));
+        let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn());
+        let mut articles = Vec::new();
+        while let Some(fetch_result) = responses.next().await {
+            match fetch_result {
                Ok((url, html)) => {
                    println!("Extracting");
                    let mut extractor = Extractor::from_html(&html);
                    extractor.extract_content(&url);
+
                    if extractor.article().is_some() {
-                        extractor
-                            .download_images(&Url::parse(&url).unwrap())
+                        extractor.extract_img_urls();
+                        download_images(&mut extractor, &Url::parse(&url).unwrap())
                            .await
                            .expect("Unable to download images");
-                        let file_name = format!(
-                            "{}.epub",
-                            extractor
-                                .metadata()
-                                .title()
-                                .replace("/", " ")
-                                .replace("\\", " ")
-                        );
-                        let mut out_file = File::create(&file_name).unwrap();
-                        let mut html_buf = Vec::new();
-                        extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf)
-                            .expect("Unable to serialize to xhtml");
-                        let html_buf = std::str::from_utf8(&html_buf).unwrap();
-                        let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
-                        if let Some(author) = extractor.metadata().byline() {
-                            epub.metadata("author", author.replace("&", "&amp;"))
-                                .unwrap();
-                        }
-                        epub.metadata("title", extractor.metadata().title().replace("&", "&amp;"))
-                            .unwrap();
-                        epub.add_content(EpubContent::new("index.xhtml", html_buf.as_bytes()))
-                            .unwrap();
-                        for img in extractor.img_urls {
-                            let mut file_path = std::env::temp_dir();
-                            file_path.push(&img.0);
-
-                            let img_buf = File::open(&file_path).expect("Can't read file");
-                            epub.add_resource(
-                                file_path.file_name().unwrap(),
-                                img_buf,
-                                img.1.unwrap(),
-                            )
-                            .unwrap();
-                        }
-                        epub.generate(&mut out_file).unwrap();
-                        println!("Created {:?}", file_name);
+                        articles.push(extractor);
                    }
                }
-                Err(e) => println!("{}", e),
+                Err(e) => eprintln!("{}", e),
            }
        }
-    })
+        articles
+    });
+    generate_epubs(articles, app_config.merged());
 }
--- a/src/moz_readability/mod.rs
+++ b/src/moz_readability/mod.rs
@ -462,7 +462,12 @@ impl Readability {
            .iter()
            .find(|key| values.contains_key(**key))
        {
-            values.get(*key).map(|title| title.to_owned()).unwrap()
+            let title = values.get(*key).map(|title| title.to_owned()).unwrap();
+            if title.is_empty() {
+                self.get_article_title()
+            } else {
+                title
+            }
        } else {
            self.get_article_title()
        };