diff --git a/Cargo.lock b/Cargo.lock
index 3b7d384..8466dbf 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -187,7 +187,7 @@ dependencies = [
  "memchr",
  "num_cpus",
  "once_cell",
- "pin-project-lite",
+ "pin-project-lite 0.1.11",
  "pin-utils",
  "slab",
  "wasm-bindgen-futures",
@@ -684,25 +684,52 @@ dependencies = [
 ]
 
 [[package]]
-name = "futures-channel"
-version = "0.3.8"
+name = "futures"
+version = "0.3.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4b7109687aa4e177ef6fe84553af6280ef2778bdb7783ba44c9dc3399110fe64"
+checksum = "da9052a1a50244d8d5aa9bf55cbc2fb6f357c86cc52e46c62ed390a7180cf150"
+dependencies = [
+ "futures-channel",
+ "futures-core",
+ "futures-executor",
+ "futures-io",
+ "futures-sink",
+ "futures-task",
+ "futures-util",
+]
+
+[[package]]
+name = "futures-channel"
+version = "0.3.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f2d31b7ec7efab6eefc7c57233bb10b847986139d88cc2f5a02a1ae6871a1846"
 dependencies = [
  "futures-core",
+ "futures-sink",
 ]
 
 [[package]]
 name = "futures-core"
-version = "0.3.8"
+version = "0.3.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "847ce131b72ffb13b6109a221da9ad97a64cbe48feb1028356b836b47b8f1748"
+checksum = "79e5145dde8da7d1b3892dad07a9c98fc04bc39892b1ecc9692cf53e2b780a65"
+
+[[package]]
+name = "futures-executor"
+version = "0.3.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e9e59fdc009a4b3096bf94f740a0f2424c082521f20a9b08c5c07c48d90fd9b9"
+dependencies = [
+ "futures-core",
+ "futures-task",
+ "futures-util",
+]
 
 [[package]]
 name = "futures-io"
-version = "0.3.8"
+version = "0.3.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "611834ce18aaa1bd13c4b374f5d653e1027cf99b6b502584ff8c9a64413b30bb"
+checksum = "28be053525281ad8259d47e4de5de657b25e7bac113458555bb4b70bc6870500"
 
 [[package]]
 name = "futures-lite"
@@ -715,15 +742,15 @@ dependencies = [
  "futures-io",
  "memchr",
  "parking",
- "pin-project-lite",
+ "pin-project-lite 0.1.11",
  "waker-fn",
 ]
 
 [[package]]
 name = "futures-macro"
-version = "0.3.8"
+version = "0.3.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "77408a692f1f97bcc61dc001d752e00643408fbc922e4d634c655df50d595556"
+checksum = "c287d25add322d9f9abdcdc5927ca398917996600182178774032e9f8258fedd"
 dependencies = [
  "proc-macro-hack",
  "proc-macro2",
@@ -733,31 +760,33 @@
 [[package]]
 name = "futures-sink"
-version = "0.3.8"
+version = "0.3.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f878195a49cee50e006b02b93cf7e0a95a38ac7b776b4c4d9cc1207cd20fcb3d"
+checksum = "caf5c69029bda2e743fddd0582d1083951d65cc9539aebf8812f36c3491342d6"
 
 [[package]]
 name = "futures-task"
-version = "0.3.8"
+version = "0.3.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7c554eb5bf48b2426c4771ab68c6b14468b6e76cc90996f528c3338d761a4d0d"
+checksum = "13de07eb8ea81ae445aca7b69f5f7bf15d7bf4912d8ca37d6645c77ae8a58d86"
 dependencies = [
  "once_cell",
 ]
 
 [[package]]
 name = "futures-util"
-version = "0.3.8"
+version = "0.3.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d304cff4a7b99cfb7986f7d43fbe93d175e72e704a8860787cc95e9ffd85cbd2"
+checksum = "632a8cd0f2a4b3fdea1657f08bde063848c3bd00f9bbf6e256b8be78802e624b"
 dependencies = [
+ "futures-channel",
  "futures-core",
  "futures-io",
  "futures-macro",
+ "futures-sink",
  "futures-task",
  "memchr",
- "pin-project 1.0.2",
+ "pin-project-lite 0.2.4",
  "pin-utils",
  "proc-macro-hack",
  "proc-macro-nested",
@@ -911,7 +940,7 @@ dependencies = [
  "cookie",
  "futures-lite",
  "infer",
- "pin-project-lite",
+ "pin-project-lite 0.1.11",
  "rand 0.7.3",
  "serde",
  "serde_json",
@@ -1242,11 +1271,12 @@ dependencies = [
 [[package]]
 name = "paperoni"
-version = "0.2.2-alpha1"
+version = "0.3.0-alpha1"
 dependencies = [
  "async-std",
  "clap",
  "epub-builder",
+ "futures",
  "html5ever",
  "kuchiki",
  "lazy_static",
@@ -1328,16 +1358,7 @@ version = "0.4.27"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2ffbc8e94b38ea3d2d8ba92aea2983b503cd75d0888d75b86bb37970b5698e15"
 dependencies = [
- "pin-project-internal 0.4.27",
-]
-
-[[package]]
-name = "pin-project"
-version = "1.0.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9ccc2237c2c489783abd8c4c80e5450fc0e98644555b1364da68cc29aa151ca7"
-dependencies = [
- "pin-project-internal 1.0.2",
+ "pin-project-internal",
 ]
 
 [[package]]
@@ -1351,23 +1372,18 @@ dependencies = [
  "syn",
 ]
 
-[[package]]
-name = "pin-project-internal"
-version = "1.0.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f8e8d2bf0b23038a4424865103a4df472855692821aab4e4f5c3312d461d9e5f"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn",
-]
-
 [[package]]
 name = "pin-project-lite"
 version = "0.1.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c917123afa01924fc84bb20c4c03f004d9c38e5127e3c039bbf7f4b9c76a2f6b"
 
+[[package]]
+name = "pin-project-lite"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "439697af366c49a6d0a010c56a0d97685bc140ce0d377b13a2ea2aa42d64a827"
+
 [[package]]
 name = "pin-utils"
 version = "0.1.0"
@@ -1889,7 +1905,7 @@ dependencies = [
  "log 0.4.11",
  "mime_guess",
  "once_cell",
- "pin-project-lite",
+ "pin-project-lite 0.1.11",
  "serde",
  "serde_json",
  "web-sys",
@@ -2043,7 +2059,7 @@ checksum = "b0987850db3733619253fe60e17cb59b82d37c7e6c0236bb81e4d6b87c879f27"
 dependencies = [
  "cfg-if 0.1.10",
  "log 0.4.11",
- "pin-project-lite",
+ "pin-project-lite 0.1.11",
  "tracing-attributes",
  "tracing-core",
 ]
@@ -2074,7 +2090,7 @@ version = "0.2.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ab7bb6f14721aa00656086e9335d363c5c8747bae02ebe32ea2c7dece5689b4c"
 dependencies = [
- "pin-project 0.4.27",
+ "pin-project",
  "tracing",
 ]
diff --git a/Cargo.toml b/Cargo.toml
index 01bbe6b..05660ed 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,7 +3,7 @@ description = "A web article downloader"
 homepage = "https://github.com/hipstermojo/paperoni"
 repository = "https://github.com/hipstermojo/paperoni"
 name = "paperoni"
-version = "0.2.2-alpha1"
+version = "0.3.0-alpha1"
 authors = ["Kenneth Gitere "]
 edition = "2018"
 license = "MIT"
@@ -15,6 +15,7 @@ readme = "README.md"
 async-std = "1.7.0"
 clap = "2.33.3"
 epub-builder = "0.4.8"
+futures = "0.3.12"
 html5ever = "0.25.1"
 kuchiki = "0.8.1"
 lazy_static = "1.4.0"
diff --git a/README.md b/README.md
index 8a4d989..a6b4eb5 100644
--- a/README.md
+++ b/README.md
@@ -12,12 +12,24 @@ Paperoni is a web article downloader written in Rust. The downloaded articles ar
 paperoni https://en.wikipedia.org/wiki/Pepperoni
 ```
 
-Paperoni also supports passing multiple links as arguments. If you are on a Unix-like OS, you can simply do something like this:
+Paperoni also supports passing multiple links as arguments.
+
+```sh
+paperoni https://en.wikipedia.org/wiki/Pepperoni https://en.wikipedia.org/wiki/Salami
+```
+
+Alternatively, if you are on a Unix-like OS, you can simply do something like this:
 
 ```sh
 cat links.txt | xargs paperoni
 ```
 
+Links can also be read from a file using the `-f` flag.
+
+```sh
+paperoni -f links.txt
+```
+
 ## How it works
 
 The URL passed to Paperoni is fetched and the returned HTML response is passed to the extractor.
@@ -27,11 +39,11 @@ This extractor retrieves a possible article using a port of the [Mozilla Readabi
 
 ## How it (currently) doesn't work
 
-This program is still in alpha so a number of things currently break:
+This program is still in alpha so a number of things won't work:
 
-- Certain links with redirects can't be extracted. Such links include urls that are proxying Medium.
 - Websites that only run with JavaScript cannot be extracted.
 - Website articles that cannot be extracted by Readability cannot be extracted by Paperoni either.
+- Code snippets on Medium articles that are lazy loaded will not appear in the EPUB.
 
 ## Running locally
diff --git a/src/cli.rs b/src/cli.rs
index 474223b..9815e08 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -1,12 +1,14 @@
+use std::{fs::File, io::Read};
+
 use clap::{App, AppSettings, Arg};
 
-pub fn cli_init() -> App<'static, 'static> {
-    App::new("paperoni")
+pub fn cli_init() -> AppConfig {
+    let app = App::new("paperoni")
         .settings(&[
             AppSettings::ArgRequiredElseHelp,
             AppSettings::UnifiedHelpMessage,
         ])
-        .version("0.2.2-alpha1")
+        .version("0.3.0-alpha1")
         .about(
             "
Paperoni is an article downloader.
@@ -18,4 +20,104 @@ It takes a url and downloads the article content from it and saves it to an epub
                 .help("Urls of web articles")
                 .multiple(true),
         )
+        .arg(
+            Arg::with_name("file")
+                .short("f")
+                .long("file")
+                .help("Input file containing links")
+                .takes_value(true),
+        )
+        .arg(
+            Arg::with_name("output_name")
+                .long("merge")
+                .help("Merge multiple articles into a single epub")
+                .long_help("Merge multiple articles into a single epub that will be given the name provided")
+                .takes_value(true),
+        ).arg(
+            Arg::with_name("max_conn")
+                .long("max_conn")
+                .help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8")
+                .long_help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8.\nNOTE: It is advised to use as few connections as needed, i.e. between 1 and 50. Using more connections can end up overloading your network card with too many concurrent requests.")
+                .takes_value(true));
+    let arg_matches = app.get_matches();
+    let mut urls: Vec<String> = match arg_matches.value_of("file") {
+        Some(file_name) => {
+            if let Ok(mut file) = File::open(file_name) {
+                let mut content = String::new();
+                match file.read_to_string(&mut content) {
+                    Ok(_) => content
+                        .lines()
+                        .filter(|line| !line.is_empty())
+                        .map(|line| line.to_owned())
+                        .collect(),
+                    Err(_) => vec![],
+                }
+            } else {
+                println!("Unable to open file: {}", file_name);
+                vec![]
+            }
+        }
+        None => vec![],
+    };
+
+    if let Some(vals) = arg_matches.values_of("urls") {
+        urls.extend(
+            vals.filter(|val| !val.is_empty())
+                .map(|val| val.to_string()),
+        );
+    }
+
+    let max_conn = arg_matches
+        .value_of("max_conn")
+        .map(|conn_str| conn_str.parse::<usize>().ok())
+        .flatten()
+        .map(|max| if max > 0 { max } else { 1 })
+        .unwrap_or(8);
+
+    let mut app_config = AppConfig::new(max_conn);
+    app_config.set_urls(urls);
+    if let Some(name) = arg_matches.value_of("output_name") {
+        let file_name = if name.ends_with(".epub") && name.len() > 5 {
+            name.to_owned()
+        } else {
+            name.to_owned() + ".epub"
+        };
+        app_config.set_merged(file_name);
+    }
+    app_config
+}
+
+pub struct AppConfig {
+    urls: Vec<String>,
+    max_conn: usize,
+    merged: Option<String>,
+}
+
+impl AppConfig {
+    fn new(max_conn: usize) -> Self {
+        Self {
+            urls: vec![],
+            max_conn,
+            merged: None,
+        }
+    }
+
+    fn set_urls(&mut self, urls: Vec<String>) {
+        self.urls.extend(urls);
+    }
+
+    fn set_merged(&mut self, name: String) {
+        self.merged = Some(name);
+    }
+
+    pub fn urls(&self) -> &Vec<String> {
+        &self.urls
+    }
+    pub fn max_conn(&self) -> usize {
+        self.max_conn
+    }
+
+    pub fn merged(&self) -> Option<&String> {
+        self.merged.as_ref()
+    }
 }
diff --git a/src/epub.rs b/src/epub.rs
new file mode 100644
index 0000000..e6e0376
--- /dev/null
+++ b/src/epub.rs
@@ -0,0 +1,113 @@
+use std::fs::File;
+
+use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
+
+use crate::extractor::{self, Extractor};
+
+pub fn generate_epubs(articles: Vec<Extractor>, merged: Option<&String>) {
+    match merged {
+        Some(name) => {
+            let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
+            epub.inline_toc();
+            epub = articles
+                .iter()
+                .enumerate()
+                .fold(epub, |mut epub, (idx, article)| {
+                    let mut html_buf = Vec::new();
+                    extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)
+                        .expect("Unable to serialize to xhtml");
+                    let html_str = std::str::from_utf8(&html_buf).unwrap();
+                    epub.metadata("title", replace_metadata_value(name))
+                        .unwrap();
+                    let section_name = article.metadata().title();
+                    epub.add_content(
+                        EpubContent::new(format!("article_{}.xhtml", idx), html_str.as_bytes())
+                            .title(replace_metadata_value(section_name)),
+                    )
+                    .unwrap();
+
+                    article.img_urls.iter().for_each(|img| {
+                        let mut file_path = std::env::temp_dir();
+                        file_path.push(&img.0);
+
+                        let img_buf = File::open(&file_path).expect("Can't read file");
+                        epub.add_resource(
+                            file_path.file_name().unwrap(),
+                            img_buf,
+                            img.1.as_ref().unwrap(),
+                        )
+                        .unwrap();
+                    });
+                    epub
+                });
+            let mut out_file = File::create(&name).unwrap();
+            epub.generate(&mut out_file).unwrap();
+            println!("Created {:?}", name);
+        }
+        None => {
+            for article in articles {
+                let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
+                let file_name = format!(
+                    "{}.epub",
+                    article
+                        .metadata()
+                        .title()
+                        .replace("/", " ")
+                        .replace("\\", " ")
+                );
+                let mut out_file = File::create(&file_name).unwrap();
+                let mut html_buf = Vec::new();
+                extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)
+                    .expect("Unable to serialize to xhtml");
+                let html_str = std::str::from_utf8(&html_buf).unwrap();
+                if let Some(author) = article.metadata().byline() {
+                    epub.metadata("author", replace_metadata_value(author))
+                        .unwrap();
+                }
+                epub.metadata("title", replace_metadata_value(article.metadata().title()))
+                    .unwrap();
+                epub.add_content(EpubContent::new("index.xhtml", html_str.as_bytes()))
+                    .unwrap();
+                for img in article.img_urls {
+                    let mut file_path = std::env::temp_dir();
+                    file_path.push(&img.0);
+
+                    let img_buf = File::open(&file_path).expect("Can't read file");
+                    epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap())
+                        .unwrap();
+                }
+                epub.generate(&mut out_file).unwrap();
+                println!("Created {:?}", file_name);
+            }
+        }
+    }
+}
+
+/// Replaces characters that have to be escaped before adding to the epub's metadata
+fn replace_metadata_value(value: &str) -> String {
+    value
+        .replace("&", "&amp;")
+        .replace("<", "&lt;")
+        .replace(">", "&gt;")
+}
+
+#[cfg(test)]
+mod test {
+    use super::replace_metadata_value;
+
+    #[test]
+    fn test_replace_metadata_value() {
+        let mut value = "Lorem ipsum";
+        assert_eq!(replace_metadata_value(value), "Lorem ipsum");
+        value = "Memory safe > memory unsafe";
+        assert_eq!(
+            replace_metadata_value(value),
+            "Memory safe &gt; memory unsafe"
+        );
+        value = "Author Name <author@mail.example>";
+        assert_eq!(
+            replace_metadata_value(value),
+            "Author Name &lt;author@mail.example&gt;"
+        );
+    }
+}
diff --git a/src/extractor.rs b/src/extractor.rs
index 2b90e3b..0fcc5e8 100644
--- a/src/extractor.rs
+++ b/src/extractor.rs
@@ -1,10 +1,6 @@
 use std::collections::HashMap;
 
-use async_std::fs::File;
-use async_std::io::prelude::*;
-use async_std::task;
 use kuchiki::{traits::*, NodeRef};
-use url::Url;
 
 use crate::moz_readability::{MetaData, Readability};
 
@@ -51,8 +47,8 @@ impl Extractor {
     }
 
     /// Traverses the DOM tree of the content and retrieves the IMG URLs
-    fn extract_img_urls(&mut self) {
-        if let Some(content_ref) = &self.readability.article_node {
+    pub fn extract_img_urls(&mut self) {
+        if let Some(content_ref) = &self.article {
             for img_ref in content_ref.select("img").unwrap() {
                 img_ref.as_node().as_element().map(|img_elem| {
                     img_elem.attributes.borrow().get("src").map(|img_url| {
@@ -65,80 +61,6 @@ impl Extractor {
         }
     }
 
-    pub async fn download_images(&mut self, article_origin: &Url) -> async_std::io::Result<()> {
-        let mut async_download_tasks = Vec::with_capacity(self.img_urls.len());
-        self.extract_img_urls();
-        if self.img_urls.len() > 0 {
-            println!("Downloading images...");
-        }
-        for img_url in &self.img_urls {
-            let img_url = img_url.0.clone();
-            let abs_url = get_absolute_url(&img_url, article_origin);
-
-            async_download_tasks.push(task::spawn(async move {
-                let mut img_response = surf::Client::new()
-                    // The middleware has been temporarily commented out because it happens
-                    // to affect downloading images when there is no redirecting
-                    // .with(surf::middleware::Redirect::default())
-                    .get(&abs_url)
-                    .await
-                    .expect("Unable to retrieve file");
-                let img_content: Vec<u8> = img_response.body_bytes().await.unwrap();
-                let img_mime = img_response
-                    .content_type()
-                    .map(|mime| mime.essence().to_string());
-                let img_ext = img_response
-                    .content_type()
-                    .map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
-                    .unwrap();
-                let mut img_path = std::env::temp_dir();
-                img_path.push(format!("{}.{}", hash_url(&abs_url), &img_ext));
-                let mut img_file = File::create(&img_path)
-                    .await
- .expect("Unable to create file"); - img_file - .write_all(&img_content) - .await - .expect("Unable to save to file"); - - ( - img_url, - img_path - .file_name() - .map(|os_str_name| { - os_str_name - .to_str() - .expect("Unable to get image file name") - .to_string() - }) - .unwrap(), - img_mime, - ) - })); - } - - self.img_urls.clear(); - - for async_task in async_download_tasks { - let (img_url, img_path, img_mime) = async_task.await; - // Update the image sources - let img_ref = self - .readability - .article_node - .as_mut() - .expect("Unable to get mutable ref") - .select_first(&format!("img[src='{}']", img_url)) - .expect("Image node does not exist"); - let mut img_node = img_ref.attributes.borrow_mut(); - *img_node.get_mut("src").unwrap() = img_path.clone(); - // srcset is removed because readers such as Foliate then fail to display - // the image already downloaded and stored in src - img_node.remove("srcset"); - self.img_urls.push((img_path, img_mime)); - } - Ok(()) - } - pub fn article(&self) -> Option<&NodeRef> { self.article.as_ref() } @@ -148,40 +70,6 @@ impl Extractor { } } -/// Utility for hashing URLs. This is used to help store files locally with unique values -fn hash_url(url: &str) -> String { - format!("{:x}", md5::compute(url.as_bytes())) -} - -/// Handles getting the extension from a given MIME subtype. -fn map_mime_subtype_to_ext(subtype: &str) -> &str { - if subtype == ("svg+xml") { - return "svg"; - } else if subtype == "x-icon" { - "ico" - } else { - subtype - } -} - -fn get_absolute_url(url: &str, request_url: &Url) -> String { - if Url::parse(url).is_ok() { - url.to_owned() - } else if url.starts_with("/") { - Url::parse(&format!( - "{}://{}", - request_url.scheme(), - request_url.host_str().unwrap() - )) - .unwrap() - .join(url) - .unwrap() - .into_string() - } else { - request_url.join(url).unwrap().into_string() - } -} - /// Serializes a NodeRef to a string that is XHTML compatible /// The only DOM nodes serialized are Text and Element nodes pub fn serialize_to_xhtml( @@ -278,19 +166,4 @@ mod test { extractor.img_urls ); } - - #[test] - fn test_map_mime_type_to_ext() { - let mime_subtypes = vec![ - "apng", "bmp", "gif", "x-icon", "jpeg", "png", "svg+xml", "tiff", "webp", - ]; - let exts = mime_subtypes - .into_iter() - .map(|mime_type| map_mime_subtype_to_ext(mime_type)) - .collect::>(); - assert_eq!( - vec!["apng", "bmp", "gif", "ico", "jpeg", "png", "svg", "tiff", "webp"], - exts - ); - } } diff --git a/src/http.rs b/src/http.rs new file mode 100644 index 0000000..faf9428 --- /dev/null +++ b/src/http.rs @@ -0,0 +1,188 @@ +use async_std::io::prelude::*; +use async_std::{fs::File, stream}; +use futures::StreamExt; +use url::Url; + +use crate::extractor::Extractor; + +type HTMLResource = (String, String); + +pub async fn fetch_url( + url: &str, +) -> Result> { + let client = surf::Client::new(); + println!("Fetching..."); + + let mut redirect_count: u8 = 0; + let base_url = Url::parse(&url)?; + let mut url = base_url.clone(); + while redirect_count < 5 { + redirect_count += 1; + let req = surf::get(&url); + let mut res = client.send(req).await?; + if res.status().is_redirection() { + if let Some(location) = res.header(surf::http::headers::LOCATION) { + match Url::parse(location.last().as_str()) { + Ok(valid_url) => url = valid_url, + Err(e) => match e { + url::ParseError::RelativeUrlWithoutBase => { + url = base_url.join(location.last().as_str())? 
+                        }
+                        e => return Err(e.into()),
+                    },
+                };
+            }
+        } else if res.status().is_success() {
+            if let Some(mime) = res.content_type() {
+                if mime.essence() == "text/html" {
+                    return Ok((url.to_string(), res.body_string().await?));
+                } else {
+                    return Err(format!(
+                        "Invalid HTTP response. Received {} instead of text/html",
+                        mime.essence()
+                    )
+                    .into());
+                }
+            } else {
+                return Err("Unknown HTTP response".into());
+            }
+        } else {
+            return Err(format!("Request failed: HTTP {}", res.status()).into());
+        }
+    }
+    Err("Unable to fetch HTML".into())
+}
+
+pub async fn download_images(
+    extractor: &mut Extractor,
+    article_origin: &Url,
+) -> async_std::io::Result<()> {
+    if extractor.img_urls.len() > 0 {
+        println!("Downloading images...");
+    }
+
+    let imgs_req_iter = extractor
+        .img_urls
+        .iter()
+        .map(|(url, _)| {
+            (
+                url,
+                surf::Client::new().get(get_absolute_url(&url, article_origin)),
+            )
+        })
+        .map(|(url, req)| async move {
+            let mut img_response = req.await.expect("Unable to retrieve image");
+            let img_content: Vec<u8> = img_response.body_bytes().await.unwrap();
+            let img_mime = img_response
+                .content_type()
+                .map(|mime| mime.essence().to_string());
+            let img_ext = img_response
+                .content_type()
+                .map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
+                .unwrap();
+
+            let mut img_path = std::env::temp_dir();
+            img_path.push(format!("{}.{}", hash_url(&url), &img_ext));
+            let mut img_file = File::create(&img_path)
+                .await
+                .expect("Unable to create file");
+            img_file
+                .write_all(&img_content)
+                .await
+                .expect("Unable to save to file");
+
+            (
+                url,
+                img_path
+                    .file_name()
+                    .map(|os_str_name| {
+                        os_str_name
+                            .to_str()
+                            .expect("Unable to get image file name")
+                            .to_string()
+                    })
+                    .unwrap(),
+                img_mime,
+            )
+        });
+
+    // A utility closure used when updating the value of an image source after downloading is successful
+    let replace_existing_img_src =
+        |img_item: (&String, String, Option<String>)| -> (String, Option<String>) {
+            let (img_url, img_path, img_mime) = img_item;
+            let img_ref = extractor
+                .article()
+                .as_mut()
+                .expect("Unable to get mutable ref")
+                .select_first(&format!("img[src='{}']", img_url))
+                .expect("Image node does not exist");
+            let mut img_node = img_ref.attributes.borrow_mut();
+            *img_node.get_mut("src").unwrap() = img_path.clone();
+            // srcset is removed because readers such as Foliate then fail to display
+            // the image already downloaded and stored in src
+            img_node.remove("srcset");
+            (img_path, img_mime)
+        };
+
+    extractor.img_urls = stream::from_iter(imgs_req_iter)
+        .buffered(10)
+        .collect::<Vec<_>>()
+        .await
+        .into_iter()
+        .map(replace_existing_img_src)
+        .collect();
+    Ok(())
+}
+
+/// Handles getting the extension from a given MIME subtype.
+fn map_mime_subtype_to_ext(subtype: &str) -> &str {
+    if subtype == ("svg+xml") {
+        return "svg";
+    } else if subtype == "x-icon" {
+        "ico"
+    } else {
+        subtype
+    }
+}
+
+/// Utility for hashing URLs. This is used to help store files locally with unique values
+fn hash_url(url: &str) -> String {
+    format!("{:x}", md5::compute(url.as_bytes()))
+}
+
+fn get_absolute_url(url: &str, request_url: &Url) -> String {
+    if Url::parse(url).is_ok() {
+        url.to_owned()
+    } else if url.starts_with("/") {
+        Url::parse(&format!(
+            "{}://{}",
+            request_url.scheme(),
+            request_url.host_str().unwrap()
+        ))
+        .unwrap()
+        .join(url)
+        .unwrap()
+        .into_string()
+    } else {
+        request_url.join(url).unwrap().into_string()
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    #[test]
+    fn test_map_mime_type_to_ext() {
+        let mime_subtypes = vec![
+            "apng", "bmp", "gif", "x-icon", "jpeg", "png", "svg+xml", "tiff", "webp",
+        ];
+        let exts = mime_subtypes
+            .into_iter()
+            .map(|mime_type| map_mime_subtype_to_ext(mime_type))
+            .collect::<Vec<&str>>();
+        assert_eq!(
+            vec!["apng", "bmp", "gif", "ico", "jpeg", "png", "svg", "tiff", "webp"],
+            exts
+        );
+    }
+}
diff --git a/src/main.rs b/src/main.rs
index 4e403b6..0467712 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,129 +1,56 @@
 #[macro_use]
 extern crate lazy_static;
 
-use std::fs::File;
-
+use async_std::stream;
 use async_std::task;
-use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
+use futures::stream::StreamExt;
 use url::Url;
 
 mod cli;
+mod epub;
 mod extractor;
+/// This module is responsible for async HTTP calls for downloading
+/// the HTML content and images
+mod http;
 mod moz_readability;
 
+use cli::AppConfig;
+use epub::generate_epubs;
 use extractor::Extractor;
+use http::{download_images, fetch_url};
+
 fn main() {
-    let app = cli::cli_init();
-    let arg_matches = app.get_matches();
-    if let Some(vals) = arg_matches.values_of("urls") {
-        let urls = vals.map(|val| val.to_string()).collect::<Vec<_>>();
-        download(urls);
+    let app_config = cli::cli_init();
+
+    if !app_config.urls().is_empty() {
+        download(app_config);
     }
 }
 
-type HTMLResource = (String, String);
-
-async fn fetch_url(url: &str) -> Result<HTMLResource, Box<dyn std::error::Error + Send + Sync>> {
-    let client = surf::Client::new();
-    println!("Fetching...");
-
-    let mut redirect_count: u8 = 0;
-    let base_url = Url::parse(&url)?;
-    let mut url = base_url.clone();
-    while redirect_count < 5 {
-        redirect_count += 1;
-        let req = surf::get(&url);
-        let mut res = client.send(req).await?;
-        if res.status().is_redirection() {
-            if let Some(location) = res.header(surf::http::headers::LOCATION) {
-                match Url::parse(location.last().as_str()) {
-                    Ok(valid_url) => url = valid_url,
-                    Err(e) => match e {
-                        url::ParseError::RelativeUrlWithoutBase => {
-                            url = base_url.join(location.last().as_str())?
-                        }
-                        e => return Err(e.into()),
-                    },
-                };
-            }
-        } else if res.status().is_success() {
-            if let Some(mime) = res.content_type() {
-                if mime.essence() == "text/html" {
-                    return Ok((url.to_string(), res.body_string().await?));
-                } else {
-                    return Err(format!(
-                        "Invalid HTTP response. Received {} instead of text/html",
-                        mime.essence()
-                    )
-                    .into());
-                }
-            } else {
-                return Err("Unknown HTTP response".into());
-            }
-        } else {
-            return Err(format!("Request failed: HTTP {}", res.status()).into());
-        }
-    }
-    Err("Unable to fetch HTML".into())
-}
-
-fn download(urls: Vec<String>) {
-    let mut async_url_tasks = Vec::with_capacity(urls.len());
-    for url in urls {
-        async_url_tasks.push(task::spawn(async move { fetch_url(&url).await }));
-    }
-    task::block_on(async {
-        for url_task in async_url_tasks {
-            match url_task.await {
+fn download(app_config: AppConfig) {
+    let articles = task::block_on(async {
+        let urls_iter = app_config.urls().iter().map(|url| fetch_url(url));
+        let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn());
+        let mut articles = Vec::new();
+        while let Some(fetch_result) = responses.next().await {
+            match fetch_result {
                 Ok((url, html)) => {
                     println!("Extracting");
                     let mut extractor = Extractor::from_html(&html);
                     extractor.extract_content(&url);
+
                     if extractor.article().is_some() {
-                        extractor
-                            .download_images(&Url::parse(&url).unwrap())
+                        extractor.extract_img_urls();
+                        download_images(&mut extractor, &Url::parse(&url).unwrap())
                             .await
                             .expect("Unable to download images");
-                        let file_name = format!(
-                            "{}.epub",
-                            extractor
-                                .metadata()
-                                .title()
-                                .replace("/", " ")
-                                .replace("\\", " ")
-                        );
-                        let mut out_file = File::create(&file_name).unwrap();
-                        let mut html_buf = Vec::new();
-                        extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf)
-                            .expect("Unable to serialize to xhtml");
-                        let html_buf = std::str::from_utf8(&html_buf).unwrap();
-                        let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
-                        if let Some(author) = extractor.metadata().byline() {
-                            epub.metadata("author", author.replace("&", "&amp;"))
-                                .unwrap();
-                        }
-                        epub.metadata("title", extractor.metadata().title().replace("&", "&amp;"))
-                            .unwrap();
-                        epub.add_content(EpubContent::new("index.xhtml", html_buf.as_bytes()))
-                            .unwrap();
-                        for img in extractor.img_urls {
-                            let mut file_path = std::env::temp_dir();
-                            file_path.push(&img.0);
-
-                            let img_buf = File::open(&file_path).expect("Can't read file");
-                            epub.add_resource(
-                                file_path.file_name().unwrap(),
-                                img_buf,
-                                img.1.unwrap(),
-                            )
-                            .unwrap();
-                        }
-                        epub.generate(&mut out_file).unwrap();
-                        println!("Created {:?}", file_name);
+                        articles.push(extractor);
                     }
                 }
-                Err(e) => println!("{}", e),
+                Err(e) => eprintln!("{}", e),
             }
         }
-    })
+        articles
+    });
+    generate_epubs(articles, app_config.merged());
 }
diff --git a/src/moz_readability/mod.rs b/src/moz_readability/mod.rs
index 7986c2b..9b25b79 100644
--- a/src/moz_readability/mod.rs
+++ b/src/moz_readability/mod.rs
@@ -462,7 +462,12 @@ impl Readability {
             .iter()
             .find(|key| values.contains_key(**key))
         {
-            values.get(*key).map(|title| title.to_owned()).unwrap()
+            let title = values.get(*key).map(|title| title.to_owned()).unwrap();
+            if title.is_empty() {
+                self.get_article_title()
+            } else {
+                title
+            }
         } else {
             self.get_article_title()
         };
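Taken together, the new CLI options in `src/cli.rs` compose: `-f`/`--file` reads links from a file, `--merge` collects everything into one EPUB (appending `.epub` to the name if it is missing), and `--max_conn` caps concurrent downloads. A hypothetical invocation combining them, where the file and output names are placeholders:

```sh
# Read links from links.txt, fetch at most 20 articles at a time,
# and merge the results into a single collection.epub
paperoni -f links.txt --max_conn 20 --merge collection
```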
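Both the new `download` in `src/main.rs` and `download_images` in `src/http.rs` lean on the same pattern from the newly added `futures` dependency: build an iterator of futures, wrap it with `stream::from_iter`, and let `StreamExt::buffered` poll at most N of them concurrently. Below is a minimal, self-contained sketch of that pattern under the same dependencies (`async-std`, `futures`, `surf`); the URLs and the connection limit are illustrative only, not part of the patch.

```rust
use async_std::{stream, task};
use futures::stream::StreamExt;

fn main() {
    let urls: Vec<String> = vec![
        "https://en.wikipedia.org/wiki/Pepperoni".into(),
        "https://en.wikipedia.org/wiki/Salami".into(),
    ];
    // Plays the role of the --max_conn flag (cli.rs defaults it to 8).
    let max_conn = 8;

    task::block_on(async {
        // Lazily build one future per URL...
        let fetches = urls.iter().map(|url| async move {
            let mut res = surf::get(url).await?;
            res.body_string().await
        });
        // ...then drive at most `max_conn` of them at a time, yielding
        // results in the same order as the input URLs.
        let mut responses = stream::from_iter(fetches).buffered(max_conn);
        while let Some(result) = responses.next().await {
            match result {
                Ok(html) => println!("Fetched {} bytes of HTML", html.len()),
                Err(e) => eprintln!("{}", e),
            }
        }
    });
}
```

The design mirrors the patch: `buffered` gives bounded concurrency without spawning a task per URL, which is what the removed `task::spawn` loop in the old `main.rs` did.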
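For context on the three branches of `get_absolute_url` in `src/http.rs`, here is a small sketch using the `url` crate directly; the article and image paths are made-up examples, not values from the patch.

```rust
use url::Url;

fn main() {
    let article = Url::parse("https://example.com/posts/rust-article").unwrap();

    // 1. Already absolute: returned unchanged.
    assert!(Url::parse("https://cdn.example.com/img/cover.png").is_ok());

    // 2. Root-relative ("/img/cover.png"): joined onto scheme + host only.
    let root_relative = Url::parse(&format!(
        "{}://{}",
        article.scheme(),
        article.host_str().unwrap()
    ))
    .unwrap()
    .join("/img/cover.png")
    .unwrap();
    assert_eq!(root_relative.as_str(), "https://example.com/img/cover.png");

    // 3. Relative ("cover.png"): joined onto the full article URL.
    let relative = article.join("cover.png").unwrap();
    assert_eq!(relative.as_str(), "https://example.com/posts/cover.png");
}
```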