Add http and epub modules

2021-02-06 12:59:03 +03:00 · 2021-02-06 12:59:03 +03:00 · b402472ba6
commit b402472ba6
parent 08f847531f
5 changed files with 293 additions and 249 deletions
--- a/src/cli.rs
+++ b/src/cli.rs
@ -1,7 +1,9 @@
 use std::{fs::File, io::Read};
 use clap::{App, AppSettings, Arg};
-pub fn cli_init() -> App<'static, 'static> {
+pub fn cli_init() -> AppConfig {
-    App::new("paperoni")
+    let app = App::new("paperoni")
        .settings(&[
            AppSettings::ArgRequiredElseHelp,
            AppSettings::UnifiedHelpMessage,
@ -24,5 +26,54 @@ It takes a url and downloads the article content from it and saves it to an epub
                .long("file")
                .help("Input file containing links")
                .takes_value(true),
-        )
+        );
    let arg_matches = app.get_matches();
    let mut urls: Vec<String> = match arg_matches.value_of("file") {
        Some(file_name) => {
            if let Ok(mut file) = File::open(file_name) {
                let mut content = String::new();
                match file.read_to_string(&mut content) {
                    Ok(_) => content
                        .lines()
                        .filter(|line| !line.is_empty())
                        .map(|line| line.to_owned())
                        .collect(),
                    Err(_) => vec![],
                }
            } else {
                println!("Unable to open file: {}", file_name);
                vec![]
            }
        }
        None => vec![],
    };
    if let Some(vals) = arg_matches.values_of("urls") {
        urls.extend(
            vals.filter(|val| !val.is_empty())
                .map(|val| val.to_string()),
        );
    }
    let mut app_config = AppConfig::new();
    app_config.set_urls(urls);
    app_config
 }
 /// Runtime configuration assembled from the command line arguments.
 pub struct AppConfig {
    /// URLs collected from the command line (input file and/or positional values).
    urls: Vec<String>,
 }
 impl AppConfig {
    /// Creates a configuration that holds no URLs.
    fn new() -> Self {
        Self { urls: Vec::new() }
    }
    /// Replaces the stored URLs with `urls`.
    ///
    /// A setter is expected to overwrite, not accumulate: calling it twice leaves only
    /// the URLs of the second call.
    fn set_urls(&mut self, urls: Vec<String>) {
        self.urls = urls;
    }
    /// Returns the URLs held by this configuration.
    pub fn urls(&self) -> &Vec<String> {
        &self.urls
    }
 }
--- a/src/epub.rs
+++ b/src/epub.rs
@ -0,0 +1,40 @@
 use std::fs::File;
 use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
 use crate::extractor::{self, Extractor};
 /// Builds an EPUB from an extracted article and writes it to `<title>.epub`.
 ///
 /// The file name is the article title with `/` and `\` replaced by spaces. The book
 /// contains the article serialized as `index.xhtml` plus every image listed in
 /// `extractor.img_urls`, each read from the system temporary directory.
 ///
 /// # Panics
 ///
 /// Panics if the output file cannot be created, the article is missing or cannot be
 /// serialized, an image file cannot be opened, an image has no MIME type, or any
 /// `epub_builder` call fails.
 pub fn generate_epub(extractor: Extractor) {
    // Path separators in the title must not end up in the file name.
    let safe_title = extractor
        .metadata()
        .title()
        .replace("/", " ")
        .replace("\\", " ");
    let file_name = format!("{}.epub", safe_title);
    let mut out_file = File::create(&file_name).unwrap();

    // Serialize the article body into an in-memory XHTML document.
    let mut xhtml_bytes = Vec::new();
    extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut xhtml_bytes)
        .expect("Unable to serialize to xhtml");
    let xhtml = std::str::from_utf8(&xhtml_bytes).unwrap();

    let mut builder = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();

    // Metadata values are written with `&` escaped as `&amp;`.
    if let Some(author) = extractor.metadata().byline() {
        let escaped_author = author.replace("&", "&amp;");
        builder.metadata("author", escaped_author).unwrap();
    }
    let escaped_title = extractor.metadata().title().replace("&", "&amp;");
    builder.metadata("title", escaped_title).unwrap();

    builder
        .add_content(EpubContent::new("index.xhtml", xhtml.as_bytes()))
        .unwrap();

    // Each entry is (file name inside the temp directory, MIME type).
    for (img_name, img_mime) in extractor.img_urls {
        let img_path = std::env::temp_dir().join(&img_name);
        let img_file = File::open(&img_path).expect("Can't read file");
        builder
            .add_resource(img_path.file_name().unwrap(), img_file, img_mime.unwrap())
            .unwrap();
    }

    builder.generate(&mut out_file).unwrap();
    println!("Created {:?}", file_name);
 }
--- a/src/extractor.rs
+++ b/src/extractor.rs
@ -1,10 +1,6 @@
 use std::collections::HashMap;
 use async_std::fs::File;
 use async_std::io::prelude::*;
 use async_std::task;
 use kuchiki::{traits::*, NodeRef};
 use url::Url;
 use crate::moz_readability::{MetaData, Readability};
@ -51,8 +47,8 @@ impl Extractor {
    }
    /// Traverses the DOM tree of the content and retrieves the IMG URLs
-    fn extract_img_urls(&mut self) {
+    pub fn extract_img_urls(&mut self) {
-        if let Some(content_ref) = &self.readability.article_node {
+        if let Some(content_ref) = &self.article {
            for img_ref in content_ref.select("img").unwrap() {
                img_ref.as_node().as_element().map(|img_elem| {
                    img_elem.attributes.borrow().get("src").map(|img_url| {
@ -65,80 +61,6 @@ impl Extractor {
        }
    }
    /// Downloads every image referenced by the article and rewrites the article's
    /// `img` elements to point at the downloaded files.
    ///
    /// One task is spawned per entry of `self.img_urls`. Each task fetches the image,
    /// stores it in the system temporary directory as `<md5 of absolute URL>.<ext>` and
    /// yields `(original src, stored file name, MIME essence)`. Afterwards
    /// `self.img_urls` is cleared and refilled with `(stored file name, MIME essence)`.
    ///
    /// `article_origin` is the URL that relative image sources are resolved against.
    ///
    /// As written, the function only ever returns `Ok(())`; every failure path panics.
    ///
    /// # Panics
    ///
    /// Panics if a request fails, a body cannot be read, a response carries no content
    /// type, the file cannot be created or written, or no `img` element with the
    /// original `src` is found in the article node.
    pub async fn download_images(&mut self, article_origin: &Url) -> async_std::io::Result<()> {
        // NOTE(review): the capacity is taken from `img_urls` *before*
        // `extract_img_urls()` runs, so it reflects the list's length prior to extraction.
        let mut async_download_tasks = Vec::with_capacity(self.img_urls.len());
        self.extract_img_urls();
        if self.img_urls.len() > 0 {
            println!("Downloading images...");
        }
        for img_url in &self.img_urls {
            // Owned copies are needed because the spawned task takes them by move.
            let img_url = img_url.0.clone();
            let abs_url = get_absolute_url(&img_url, article_origin);
            async_download_tasks.push(task::spawn(async move {
                let mut img_response = surf::Client::new()
                    // The middleware has been temporarily commented out because it happens
                    // to affect downloading images when there is no redirecting
                    // .with(surf::middleware::Redirect::default())
                    .get(&abs_url)
                    .await
                    .expect("Unable to retrieve file");
                let img_content: Vec<u8> = img_response.body_bytes().await.unwrap();
                // Full MIME essence, kept so it can be stored next to the file name.
                let img_mime = img_response
                    .content_type()
                    .map(|mime| mime.essence().to_string());
                // File extension derived from the MIME subtype; panics without a content type.
                let img_ext = img_response
                    .content_type()
                    .map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
                    .unwrap();
                // The file is named after the hash of the absolute URL.
                let mut img_path = std::env::temp_dir();
                img_path.push(format!("{}.{}", hash_url(&abs_url), &img_ext));
                let mut img_file = File::create(&img_path)
                    .await
                    .expect("Unable to create file");
                img_file
                    .write_all(&img_content)
                    .await
                    .expect("Unable to save to file");
                // Task result: (original src value, stored file name, MIME essence).
                (
                    img_url,
                    img_path
                        .file_name()
                        .map(|os_str_name| {
                            os_str_name
                                .to_str()
                                .expect("Unable to get image file name")
                                .to_string()
                        })
                        .unwrap(),
                    img_mime,
                )
            }));
        }
        // The original URLs are dropped; the list is rebuilt from the task results below.
        self.img_urls.clear();
        for async_task in async_download_tasks {
            let (img_url, img_path, img_mime) = async_task.await;
            // Update the image sources
            let img_ref = self
                .readability
                .article_node
                .as_mut()
                .expect("Unable to get mutable ref")
                .select_first(&format!("img[src='{}']", img_url))
                .expect("Image node does not exist");
            let mut img_node = img_ref.attributes.borrow_mut();
            *img_node.get_mut("src").unwrap() = img_path.clone();
            // srcset is removed because readers such as Foliate then fail to display
            // the image already downloaded and stored in src
            img_node.remove("srcset");
            self.img_urls.push((img_path, img_mime));
        }
        Ok(())
    }
    /// Returns a reference to the stored article node, or `None` when no article is set.
    pub fn article(&self) -> Option<&NodeRef> {
        self.article.as_ref()
    }
@ -148,40 +70,6 @@ impl Extractor {
    }
 }
 /// Returns the MD5 digest of `url` formatted as lowercase hexadecimal.
 ///
 /// The digest is used to give locally stored files a name derived from their URL.
 fn hash_url(url: &str) -> String {
    let digest = md5::compute(url.as_bytes());
    format!("{:x}", digest)
 }
 /// Maps a MIME subtype to the file extension used when storing a file of that type.
 ///
 /// `svg+xml` becomes `svg` and `x-icon` becomes `ico`; every other subtype is
 /// returned unchanged.
 fn map_mime_subtype_to_ext(subtype: &str) -> &str {
    match subtype {
        "svg+xml" => "svg",
        "x-icon" => "ico",
        other => other,
    }
 }
 /// Resolves `url`, as found in a document, against the URL the document was fetched from.
 ///
 /// * A `url` that already parses as an absolute URL is returned unchanged.
 /// * Anything else (root-relative, path-relative or scheme-relative) is resolved with
 ///   [`Url::join`] on `request_url`, which keeps the request's scheme, host and port.
 ///
 /// Rebuilding the base as `scheme://host` by hand would drop a non-default port and
 /// panic for URLs without a host, so the request URL itself is always the base.
 ///
 /// # Panics
 ///
 /// Panics if `url` cannot be resolved against `request_url`.
 fn get_absolute_url(url: &str, request_url: &Url) -> String {
    if Url::parse(url).is_ok() {
        url.to_owned()
    } else {
        request_url
            .join(url)
            .expect("Unable to resolve URL against the request URL")
            .into_string()
    }
 }
 /// Serializes a NodeRef to a string that is XHTML compatible
 /// The only DOM nodes serialized are Text and Element nodes
 pub fn serialize_to_xhtml<W: std::io::Write>(
@ -278,19 +166,4 @@ mod test {
            extractor.img_urls
        );
    }
    /// Checks the extension chosen for each supported image MIME subtype.
    #[test]
    fn test_map_mime_type_to_ext() {
        // (MIME subtype, expected extension)
        let cases = [
            ("apng", "apng"),
            ("bmp", "bmp"),
            ("gif", "gif"),
            ("x-icon", "ico"),
            ("jpeg", "jpeg"),
            ("png", "png"),
            ("svg+xml", "svg"),
            ("tiff", "tiff"),
            ("webp", "webp"),
        ];
        let (subtypes, expected): (Vec<&str>, Vec<&str>) = cases.iter().cloned().unzip();
        let actual: Vec<&str> = subtypes
            .into_iter()
            .map(|subtype| map_mime_subtype_to_ext(subtype))
            .collect();
        assert_eq!(expected, actual);
    }
 }
--- a/src/http.rs
+++ b/src/http.rs
@ -0,0 +1,183 @@
 use async_std::fs::File;
 use async_std::io::prelude::*;
 use async_std::task;
 use url::Url;
 use crate::extractor::Extractor;
 type HTMLResource = (String, String);
 /// Fetches the HTML document at `url`, following redirects manually.
 ///
 /// At most five requests are made. On success the returned pair is the URL that
 /// finally answered (after any redirects) and the response body.
 ///
 /// # Errors
 ///
 /// Returns an error if `url` or a redirect target cannot be parsed, the request
 /// fails, the final status is neither a redirect nor a success, the response has no
 /// content type or one other than `text/html`, or no document was obtained within
 /// five requests.
 pub async fn fetch_url(
    url: &str,
 ) -> Result<HTMLResource, Box<dyn std::error::Error + Send + Sync>> {
    let client = surf::Client::new();
    println!("Fetching...");
    let mut redirect_count: u8 = 0;
    let mut url = Url::parse(url)?;
    while redirect_count < 5 {
        redirect_count += 1;
        let req = surf::get(&url);
        let mut res = client.send(req).await?;
        if res.status().is_redirection() {
            // A redirect without a `Location` header leaves `url` untouched, so the
            // same URL is requested again on the next iteration.
            if let Some(location) = res.header(surf::http::headers::LOCATION) {
                let target = location.last().as_str();
                url = match Url::parse(target) {
                    Ok(valid_url) => valid_url,
                    // A relative `Location` is resolved against the URL that was just
                    // requested, not the URL the caller started with; the two differ
                    // once an earlier redirect has changed the host or path.
                    Err(url::ParseError::RelativeUrlWithoutBase) => url.join(target)?,
                    Err(e) => return Err(e.into()),
                };
            }
        } else if res.status().is_success() {
            if let Some(mime) = res.content_type() {
                if mime.essence() == "text/html" {
                    return Ok((url.to_string(), res.body_string().await?));
                } else {
                    return Err(format!(
                        "Invalid HTTP response. Received {} instead of text/html",
                        mime.essence()
                    )
                    .into());
                }
            } else {
                return Err("Unknown HTTP response".into());
            }
        } else {
            return Err(format!("Request failed: HTTP {}", res.status()).into());
        }
    }
    Err("Unable to fetch HTML".into())
 }
 /// Downloads every image referenced by the extractor's article and rewrites the
 /// article's `img` elements to point at the downloaded files.
 ///
 /// `extractor.extract_img_urls()` is called first; one task is then spawned per entry
 /// of `extractor.img_urls`. Each task fetches the image, stores it in the system
 /// temporary directory as `<md5 of absolute URL>.<ext>` and yields
 /// `(original src, stored file name, MIME essence)`. Afterwards `extractor.img_urls`
 /// is cleared and refilled with `(stored file name, MIME essence)`.
 ///
 /// `article_origin` is the URL that relative image sources are resolved against.
 ///
 /// As written, the function only ever returns `Ok(())`; every failure path panics.
 ///
 /// # Panics
 ///
 /// Panics if a request fails, a body cannot be read, a response carries no content
 /// type, the file cannot be created or written, the extractor has no article, or no
 /// `img` element with the original `src` is found in it.
 pub async fn download_images(
    extractor: &mut Extractor,
    article_origin: &Url,
 ) -> async_std::io::Result<()> {
    // Presumably fills `extractor.img_urls` from the article; confirm against Extractor.
    extractor.extract_img_urls();
    let mut async_download_tasks = Vec::with_capacity(extractor.img_urls.len());
    if extractor.img_urls.len() > 0 {
        println!("Downloading images...");
    }
    for img_url in &extractor.img_urls {
        // Owned copies are needed because the spawned task takes them by move.
        let img_url = img_url.0.clone();
        let abs_url = get_absolute_url(&img_url, article_origin);
        async_download_tasks.push(task::spawn(async move {
            let mut img_response = surf::Client::new()
                // The middleware has been temporarily commented out because it happens
                // to affect downloading images when there is no redirecting
                // .with(surf::middleware::Redirect::default())
                .get(&abs_url)
                .await
                .expect("Unable to retrieve file");
            let img_content: Vec<u8> = img_response.body_bytes().await.unwrap();
            // Full MIME essence, kept so it can be stored next to the file name.
            let img_mime = img_response
                .content_type()
                .map(|mime| mime.essence().to_string());
            // File extension derived from the MIME subtype; panics without a content type.
            let img_ext = img_response
                .content_type()
                .map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
                .unwrap();
            // The file is named after the hash of the absolute URL.
            let mut img_path = std::env::temp_dir();
            img_path.push(format!("{}.{}", hash_url(&abs_url), &img_ext));
            let mut img_file = File::create(&img_path)
                .await
                .expect("Unable to create file");
            img_file
                .write_all(&img_content)
                .await
                .expect("Unable to save to file");
            // Task result: (original src value, stored file name, MIME essence).
            (
                img_url,
                img_path
                    .file_name()
                    .map(|os_str_name| {
                        os_str_name
                            .to_str()
                            .expect("Unable to get image file name")
                            .to_string()
                    })
                    .unwrap(),
                img_mime,
            )
        }));
    }
    // The original URLs are dropped; the list is rebuilt from the task results below.
    extractor.img_urls.clear();
    for async_task in async_download_tasks {
        let (img_url, img_path, img_mime) = async_task.await;
        // Update the image sources
        let img_ref = extractor
            .article()
            .as_mut()
            .expect("Unable to get mutable ref")
            .select_first(&format!("img[src='{}']", img_url))
            .expect("Image node does not exist");
        let mut img_node = img_ref.attributes.borrow_mut();
        *img_node.get_mut("src").unwrap() = img_path.clone();
        // srcset is removed because readers such as Foliate then fail to display
        // the image already downloaded and stored in src
        img_node.remove("srcset");
        extractor.img_urls.push((img_path, img_mime));
    }
    Ok(())
 }
 /// Maps a MIME subtype to the file extension used when storing a file of that type.
 ///
 /// `svg+xml` becomes `svg` and `x-icon` becomes `ico`; every other subtype is
 /// returned unchanged.
 fn map_mime_subtype_to_ext(subtype: &str) -> &str {
    match subtype {
        "svg+xml" => "svg",
        "x-icon" => "ico",
        other => other,
    }
 }
 /// Returns the MD5 digest of `url` formatted as lowercase hexadecimal.
 ///
 /// The digest is used to give locally stored files a name derived from their URL.
 fn hash_url(url: &str) -> String {
    let digest = md5::compute(url.as_bytes());
    format!("{:x}", digest)
 }
 /// Resolves `url`, as found in a document, against the URL the document was fetched from.
 ///
 /// * A `url` that already parses as an absolute URL is returned unchanged.
 /// * Anything else (root-relative, path-relative or scheme-relative) is resolved with
 ///   [`Url::join`] on `request_url`, which keeps the request's scheme, host and port.
 ///
 /// Rebuilding the base as `scheme://host` by hand would drop a non-default port and
 /// panic for URLs without a host, so the request URL itself is always the base.
 ///
 /// # Panics
 ///
 /// Panics if `url` cannot be resolved against `request_url`.
 fn get_absolute_url(url: &str, request_url: &Url) -> String {
    if Url::parse(url).is_ok() {
        url.to_owned()
    } else {
        request_url
            .join(url)
            .expect("Unable to resolve URL against the request URL")
            .into_string()
    }
 }
 #[cfg(test)]
 mod test {
    use super::*;

    /// Checks the extension chosen for each supported image MIME subtype.
    #[test]
    fn test_map_mime_type_to_ext() {
        // (MIME subtype, expected extension)
        let cases = [
            ("apng", "apng"),
            ("bmp", "bmp"),
            ("gif", "gif"),
            ("x-icon", "ico"),
            ("jpeg", "jpeg"),
            ("png", "png"),
            ("svg+xml", "svg"),
            ("tiff", "tiff"),
            ("webp", "webp"),
        ];
        let (subtypes, expected): (Vec<&str>, Vec<&str>) = cases.iter().cloned().unzip();
        let actual: Vec<&str> = subtypes
            .into_iter()
            .map(|subtype| map_mime_subtype_to_ext(subtype))
            .collect();
        assert_eq!(expected, actual);
    }
 }
--- a/src/main.rs
+++ b/src/main.rs
@ -1,96 +1,27 @@
 #[macro_use]
 extern crate lazy_static;
 use std::{fs::File, io::Read};
 use async_std::task;
 use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
 use url::Url;
 mod cli;
 mod epub;
 mod extractor;
 /// This module is responsible for async HTTP calls for downloading
 /// the HTML content and images
 mod http;
 mod moz_readability;
 use epub::generate_epub;
 use http::{download_images, fetch_url};
 use extractor::Extractor;
 fn main() {
-    let app = cli::cli_init();
+    let app_config = cli::cli_init();
    let arg_matches = app.get_matches();
    let mut urls: Vec<String> = match arg_matches.value_of("file") {
        Some(file_name) => {
            if let Ok(mut file) = File::open(file_name) {
                let mut content = String::new();
                match file.read_to_string(&mut content) {
                    Ok(_) => content
                        .lines()
                        .filter(|line| !line.is_empty())
                        .map(|line| line.to_owned())
                        .collect(),
                    Err(_) => vec![],
                }
            } else {
                println!("Unable to open file: {}", file_name);
                vec![]
            }
        }
        None => vec![],
    };
-    if let Some(vals) = arg_matches.values_of("urls") {
+    if !app_config.urls().is_empty() {
-        urls.extend(
+        download(app_config.urls().clone());
            vals.filter(|val| !val.is_empty())
                .map(|val| val.to_string()),
        );
    }
    if !urls.is_empty() {
        download(urls);
    }
 }
 type HTMLResource = (String, String);
 /// Fetches the HTML document at `url`, following redirects manually.
 ///
 /// At most five requests are made. On success the returned pair is the URL that
 /// finally answered (after any redirects) and the response body.
 ///
 /// # Errors
 ///
 /// Returns an error if `url` or a redirect target cannot be parsed, the request
 /// fails, the final status is neither a redirect nor a success, the response has no
 /// content type or one other than `text/html`, or no document was obtained within
 /// five requests.
 async fn fetch_url(url: &str) -> Result<HTMLResource, Box<dyn std::error::Error + Send + Sync>> {
    let client = surf::Client::new();
    println!("Fetching...");
    let mut redirect_count: u8 = 0;
    let mut url = Url::parse(url)?;
    while redirect_count < 5 {
        redirect_count += 1;
        let req = surf::get(&url);
        let mut res = client.send(req).await?;
        if res.status().is_redirection() {
            // A redirect without a `Location` header leaves `url` untouched, so the
            // same URL is requested again on the next iteration.
            if let Some(location) = res.header(surf::http::headers::LOCATION) {
                let target = location.last().as_str();
                url = match Url::parse(target) {
                    Ok(valid_url) => valid_url,
                    // A relative `Location` is resolved against the URL that was just
                    // requested, not the URL the caller started with; the two differ
                    // once an earlier redirect has changed the host or path.
                    Err(url::ParseError::RelativeUrlWithoutBase) => url.join(target)?,
                    Err(e) => return Err(e.into()),
                };
            }
        } else if res.status().is_success() {
            if let Some(mime) = res.content_type() {
                if mime.essence() == "text/html" {
                    return Ok((url.to_string(), res.body_string().await?));
                } else {
                    return Err(format!(
                        "Invalid HTTP response. Received {} instead of text/html",
                        mime.essence()
                    )
                    .into());
                }
            } else {
                return Err("Unknown HTTP response".into());
            }
        } else {
            return Err(format!("Request failed: HTTP {}", res.status()).into());
        }
    }
    Err("Unable to fetch HTML".into())
 }
 fn download(urls: Vec<String>) {
@ -98,6 +29,7 @@ fn download(urls: Vec<String>) {
    for url in urls {
        async_url_tasks.push(task::spawn(async move { fetch_url(&url).await }));
    }
    task::block_on(async {
        for url_task in async_url_tasks {
            match url_task.await {
@ -105,47 +37,12 @@ fn download(urls: Vec<String>) {
                    println!("Extracting");
                    let mut extractor = Extractor::from_html(&html);
                    extractor.extract_content(&url);
                    if extractor.article().is_some() {
-                        extractor
+                        download_images(&mut extractor, &Url::parse(&url).unwrap())
                            .download_images(&Url::parse(&url).unwrap())
                            .await
                            .expect("Unable to download images");
-                        let file_name = format!(
+                        generate_epub(extractor);
                            "{}.epub",
                            extractor
                                .metadata()
                                .title()
                                .replace("/", " ")
                                .replace("\\", " ")
                        );
                        let mut out_file = File::create(&file_name).unwrap();
                        let mut html_buf = Vec::new();
                        extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf)
                            .expect("Unable to serialize to xhtml");
                        let html_buf = std::str::from_utf8(&html_buf).unwrap();
                        let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
                        if let Some(author) = extractor.metadata().byline() {
                            epub.metadata("author", author.replace("&", "&amp;"))
                                .unwrap();
                        }
                        epub.metadata("title", extractor.metadata().title().replace("&", "&amp;"))
                            .unwrap();
                        epub.add_content(EpubContent::new("index.xhtml", html_buf.as_bytes()))
                            .unwrap();
                        for img in extractor.img_urls {
                            let mut file_path = std::env::temp_dir();
                            file_path.push(&img.0);
                            let img_buf = File::open(&file_path).expect("Can't read file");
                            epub.add_resource(
                                file_path.file_name().unwrap(),
                                img_buf,
                                img.1.unwrap(),
                            )
                            .unwrap();
                        }
                        epub.generate(&mut out_file).unwrap();
                        println!("Created {:?}", file_name);
                    }
                }
                Err(e) => println!("{}", e),