diff --git a/src/cli.rs b/src/cli.rs
index a456f29..1284a15 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -1,7 +1,9 @@
+use std::{fs::File, io::Read};
+
 use clap::{App, AppSettings, Arg};
 
-pub fn cli_init() -> App<'static, 'static> {
-    App::new("paperoni")
+pub fn cli_init() -> AppConfig {
+    let app = App::new("paperoni")
         .settings(&[
             AppSettings::ArgRequiredElseHelp,
             AppSettings::UnifiedHelpMessage,
@@ -24,5 +26,54 @@ It takes a url and downloads the article content from it and saves it to an epub
                 .long("file")
                 .help("Input file containing links")
                 .takes_value(true),
-        )
+        );
+    let arg_matches = app.get_matches();
+    let mut urls: Vec<String> = match arg_matches.value_of("file") {
+        Some(file_name) => {
+            if let Ok(mut file) = File::open(file_name) {
+                let mut content = String::new();
+                match file.read_to_string(&mut content) {
+                    Ok(_) => content
+                        .lines()
+                        .filter(|line| !line.is_empty())
+                        .map(|line| line.to_owned())
+                        .collect(),
+                    Err(_) => vec![],
+                }
+            } else {
+                println!("Unable to open file: {}", file_name);
+                vec![]
+            }
+        }
+        None => vec![],
+    };
+
+    if let Some(vals) = arg_matches.values_of("urls") {
+        urls.extend(
+            vals.filter(|val| !val.is_empty())
+                .map(|val| val.to_string()),
+        );
+    }
+
+    let mut app_config = AppConfig::new();
+    app_config.set_urls(urls);
+    app_config
+}
+
+pub struct AppConfig {
+    urls: Vec<String>,
+}
+
+impl AppConfig {
+    fn new() -> Self {
+        Self { urls: vec![] }
+    }
+
+    fn set_urls(&mut self, urls: Vec<String>) {
+        self.urls.extend(urls);
+    }
+
+    pub fn urls(&self) -> &Vec<String> {
+        &self.urls
+    }
 }
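With this change `cli_init` consumes the argument matches itself and hands back an `AppConfig`, so callers only ever see the merged URL list. A minimal sketch of the intended call site (the `links.txt` file name is illustrative; the parser expects one URL per line and skips blank lines):

```rust
// Hypothetical caller, mirroring what main() does after this refactor.
// Invoked as: paperoni --file links.txt
let app_config = cli::cli_init();
for url in app_config.urls() {
    // URLs from --file and from positional arguments, merged in that order
    println!("queued: {}", url);
}
```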
diff --git a/src/epub.rs b/src/epub.rs
new file mode 100644
index 0000000..7ffcc9c
--- /dev/null
+++ b/src/epub.rs
@@ -0,0 +1,40 @@
+use std::fs::File;
+
+use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
+
+use crate::extractor::{self, Extractor};
+
+pub fn generate_epub(extractor: Extractor) {
+    let file_name = format!(
+        "{}.epub",
+        extractor
+            .metadata()
+            .title()
+            .replace("/", " ")
+            .replace("\\", " ")
+    );
+    let mut out_file = File::create(&file_name).unwrap();
+    let mut html_buf = Vec::new();
+    extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf)
+        .expect("Unable to serialize to xhtml");
+    let html_buf = std::str::from_utf8(&html_buf).unwrap();
+    let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
+    if let Some(author) = extractor.metadata().byline() {
+        epub.metadata("author", author.replace("&", "&amp;"))
+            .unwrap();
+    }
+    epub.metadata("title", extractor.metadata().title().replace("&", "&amp;"))
+        .unwrap();
+    epub.add_content(EpubContent::new("index.xhtml", html_buf.as_bytes()))
+        .unwrap();
+    for img in extractor.img_urls {
+        let mut file_path = std::env::temp_dir();
+        file_path.push(&img.0);
+
+        let img_buf = File::open(&file_path).expect("Can't read file");
+        epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap())
+            .unwrap();
+    }
+    epub.generate(&mut out_file).unwrap();
+    println!("Created {:?}", file_name);
+}
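A note on the `replace("&", "&amp;")` calls: EPUB metadata ends up inside XML, where a bare ampersand is invalid, so titles and author names are escaped before being handed to `epub_builder`. A tiny illustration with a made-up title:

```rust
// Illustrative values only; this mirrors the escaping done in generate_epub.
let title = "Crime & Punishment";
let escaped = title.replace("&", "&amp;");
assert_eq!(escaped, "Crime &amp; Punishment"); // now safe to embed in the OPF metadata
```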
diff --git a/src/extractor.rs b/src/extractor.rs
index 2b90e3b..0fcc5e8 100644
--- a/src/extractor.rs
+++ b/src/extractor.rs
@@ -1,10 +1,6 @@
 use std::collections::HashMap;
 
-use async_std::fs::File;
-use async_std::io::prelude::*;
-use async_std::task;
 use kuchiki::{traits::*, NodeRef};
-use url::Url;
 
 use crate::moz_readability::{MetaData, Readability};
 
@@ -51,8 +47,8 @@ impl Extractor {
     }
 
     /// Traverses the DOM tree of the content and retrieves the IMG URLs
-    fn extract_img_urls(&mut self) {
-        if let Some(content_ref) = &self.readability.article_node {
+    pub fn extract_img_urls(&mut self) {
+        if let Some(content_ref) = &self.article {
             for img_ref in content_ref.select("img").unwrap() {
                 img_ref.as_node().as_element().map(|img_elem| {
                     img_elem.attributes.borrow().get("src").map(|img_url| {
@@ -65,80 +61,6 @@ impl Extractor {
         }
     }
 
-    pub async fn download_images(&mut self, article_origin: &Url) -> async_std::io::Result<()> {
-        let mut async_download_tasks = Vec::with_capacity(self.img_urls.len());
-        self.extract_img_urls();
-        if self.img_urls.len() > 0 {
-            println!("Downloading images...");
-        }
-        for img_url in &self.img_urls {
-            let img_url = img_url.0.clone();
-            let abs_url = get_absolute_url(&img_url, article_origin);
-
-            async_download_tasks.push(task::spawn(async move {
-                let mut img_response = surf::Client::new()
-                    // The middleware has been temporarily commented out because it happens
-                    // to affect downloading images when there is no redirecting
-                    // .with(surf::middleware::Redirect::default())
-                    .get(&abs_url)
-                    .await
-                    .expect("Unable to retrieve file");
-                let img_content: Vec<u8> = img_response.body_bytes().await.unwrap();
-                let img_mime = img_response
-                    .content_type()
-                    .map(|mime| mime.essence().to_string());
-                let img_ext = img_response
-                    .content_type()
-                    .map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
-                    .unwrap();
-                let mut img_path = std::env::temp_dir();
-                img_path.push(format!("{}.{}", hash_url(&abs_url), &img_ext));
-                let mut img_file = File::create(&img_path)
-                    .await
-                    .expect("Unable to create file");
-                img_file
-                    .write_all(&img_content)
-                    .await
-                    .expect("Unable to save to file");
-
-                (
-                    img_url,
-                    img_path
-                        .file_name()
-                        .map(|os_str_name| {
-                            os_str_name
-                                .to_str()
-                                .expect("Unable to get image file name")
-                                .to_string()
-                        })
-                        .unwrap(),
-                    img_mime,
-                )
-            }));
-        }
-
-        self.img_urls.clear();
-
-        for async_task in async_download_tasks {
-            let (img_url, img_path, img_mime) = async_task.await;
-            // Update the image sources
-            let img_ref = self
-                .readability
-                .article_node
-                .as_mut()
-                .expect("Unable to get mutable ref")
-                .select_first(&format!("img[src='{}']", img_url))
-                .expect("Image node does not exist");
-            let mut img_node = img_ref.attributes.borrow_mut();
-            *img_node.get_mut("src").unwrap() = img_path.clone();
-            // srcset is removed because readers such as Foliate then fail to display
-            // the image already downloaded and stored in src
-            img_node.remove("srcset");
-            self.img_urls.push((img_path, img_mime));
-        }
-        Ok(())
-    }
-
     pub fn article(&self) -> Option<&NodeRef> {
         self.article.as_ref()
     }
@@ -148,40 +70,6 @@ impl Extractor {
     }
 }
 
-/// Utility for hashing URLs. This is used to help store files locally with unique values
-fn hash_url(url: &str) -> String {
-    format!("{:x}", md5::compute(url.as_bytes()))
-}
-
-/// Handles getting the extension from a given MIME subtype.
-fn map_mime_subtype_to_ext(subtype: &str) -> &str {
-    if subtype == ("svg+xml") {
-        return "svg";
-    } else if subtype == "x-icon" {
-        "ico"
-    } else {
-        subtype
-    }
-}
-
-fn get_absolute_url(url: &str, request_url: &Url) -> String {
-    if Url::parse(url).is_ok() {
-        url.to_owned()
-    } else if url.starts_with("/") {
-        Url::parse(&format!(
-            "{}://{}",
-            request_url.scheme(),
-            request_url.host_str().unwrap()
-        ))
-        .unwrap()
-        .join(url)
-        .unwrap()
-        .into_string()
-    } else {
-        request_url.join(url).unwrap().into_string()
-    }
-}
-
 /// Serializes a NodeRef to a string that is XHTML compatible
 /// The only DOM nodes serialized are Text and Element nodes
 pub fn serialize_to_xhtml(
@@ -278,19 +166,4 @@ mod test {
             extractor.img_urls
         );
     }
-
-    #[test]
-    fn test_map_mime_type_to_ext() {
-        let mime_subtypes = vec![
-            "apng", "bmp", "gif", "x-icon", "jpeg", "png", "svg+xml", "tiff", "webp",
-        ];
-        let exts = mime_subtypes
-            .into_iter()
-            .map(|mime_type| map_mime_subtype_to_ext(mime_type))
-            .collect::<Vec<&str>>();
-        assert_eq!(
-            vec!["apng", "bmp", "gif", "ico", "jpeg", "png", "svg", "tiff", "webp"],
-            exts
-        );
-    }
 }
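Since image downloading now lives in the `http` module, `extract_img_urls` becomes public and reads from `self.article` rather than reaching into the readability internals. A rough sketch of the call order this split assumes (error handling elided):

```rust
// Sketch of the expected sequence; `html` and `url` come from http::fetch_url.
let mut extractor = Extractor::from_html(&html);
extractor.extract_content(&url); // populates extractor.article
extractor.extract_img_urls();    // now public, called by http::download_images
// extractor.img_urls holds (name, mime) pairs later consumed by epub::generate_epub
```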
diff --git a/src/http.rs b/src/http.rs
new file mode 100644
index 0000000..b3d01f7
--- /dev/null
+++ b/src/http.rs
@@ -0,0 +1,183 @@
+use async_std::fs::File;
+use async_std::io::prelude::*;
+use async_std::task;
+use url::Url;
+
+use crate::extractor::Extractor;
+
+type HTMLResource = (String, String);
+
+pub async fn fetch_url(
+    url: &str,
+) -> Result<HTMLResource, Box<dyn std::error::Error + Send + Sync>> {
+    let client = surf::Client::new();
+    println!("Fetching...");
+
+    let mut redirect_count: u8 = 0;
+    let base_url = Url::parse(&url)?;
+    let mut url = base_url.clone();
+    while redirect_count < 5 {
+        redirect_count += 1;
+        let req = surf::get(&url);
+        let mut res = client.send(req).await?;
+        if res.status().is_redirection() {
+            if let Some(location) = res.header(surf::http::headers::LOCATION) {
+                match Url::parse(location.last().as_str()) {
+                    Ok(valid_url) => url = valid_url,
+                    Err(e) => match e {
+                        url::ParseError::RelativeUrlWithoutBase => {
+                            url = base_url.join(location.last().as_str())?
+                        }
+                        e => return Err(e.into()),
+                    },
+                };
+            }
+        } else if res.status().is_success() {
+            if let Some(mime) = res.content_type() {
+                if mime.essence() == "text/html" {
+                    return Ok((url.to_string(), res.body_string().await?));
+                } else {
+                    return Err(format!(
+                        "Invalid HTTP response. Received {} instead of text/html",
+                        mime.essence()
+                    )
+                    .into());
+                }
+            } else {
+                return Err("Unknown HTTP response".into());
+            }
+        } else {
+            return Err(format!("Request failed: HTTP {}", res.status()).into());
+        }
+    }
+    Err("Unable to fetch HTML".into())
+}
+
+pub async fn download_images(
+    extractor: &mut Extractor,
+    article_origin: &Url,
+) -> async_std::io::Result<()> {
+    extractor.extract_img_urls();
+    let mut async_download_tasks = Vec::with_capacity(extractor.img_urls.len());
+    if extractor.img_urls.len() > 0 {
+        println!("Downloading images...");
+    }
+    for img_url in &extractor.img_urls {
+        let img_url = img_url.0.clone();
+        let abs_url = get_absolute_url(&img_url, article_origin);
+
+        async_download_tasks.push(task::spawn(async move {
+            let mut img_response = surf::Client::new()
+                // The middleware has been temporarily commented out because it happens
+                // to affect downloading images when there is no redirecting
+                // .with(surf::middleware::Redirect::default())
+                .get(&abs_url)
+                .await
+                .expect("Unable to retrieve file");
+            let img_content: Vec<u8> = img_response.body_bytes().await.unwrap();
+            let img_mime = img_response
+                .content_type()
+                .map(|mime| mime.essence().to_string());
+            let img_ext = img_response
+                .content_type()
+                .map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
+                .unwrap();
+            let mut img_path = std::env::temp_dir();
+            img_path.push(format!("{}.{}", hash_url(&abs_url), &img_ext));
+            let mut img_file = File::create(&img_path)
+                .await
+                .expect("Unable to create file");
+            img_file
+                .write_all(&img_content)
+                .await
+                .expect("Unable to save to file");
+
+            (
+                img_url,
+                img_path
+                    .file_name()
+                    .map(|os_str_name| {
+                        os_str_name
+                            .to_str()
+                            .expect("Unable to get image file name")
+                            .to_string()
+                    })
+                    .unwrap(),
+                img_mime,
+            )
+        }));
+    }
+
+    extractor.img_urls.clear();
+
+    for async_task in async_download_tasks {
+        let (img_url, img_path, img_mime) = async_task.await;
+        // Update the image sources
+        let img_ref = extractor
+            .article()
+            .as_mut()
+            .expect("Unable to get mutable ref")
+            .select_first(&format!("img[src='{}']", img_url))
+            .expect("Image node does not exist");
+        let mut img_node = img_ref.attributes.borrow_mut();
+        *img_node.get_mut("src").unwrap() = img_path.clone();
+        // srcset is removed because readers such as Foliate then fail to display
+        // the image already downloaded and stored in src
+        img_node.remove("srcset");
+        extractor.img_urls.push((img_path, img_mime));
+    }
+    Ok(())
+}
+
+/// Handles getting the extension from a given MIME subtype.
+fn map_mime_subtype_to_ext(subtype: &str) -> &str {
+    if subtype == ("svg+xml") {
+        return "svg";
+    } else if subtype == "x-icon" {
+        "ico"
+    } else {
+        subtype
+    }
+}
+
+/// Utility for hashing URLs. This is used to help store files locally with unique values
+fn hash_url(url: &str) -> String {
+    format!("{:x}", md5::compute(url.as_bytes()))
+}
+
+fn get_absolute_url(url: &str, request_url: &Url) -> String {
+    if Url::parse(url).is_ok() {
+        url.to_owned()
+    } else if url.starts_with("/") {
+        Url::parse(&format!(
+            "{}://{}",
+            request_url.scheme(),
+            request_url.host_str().unwrap()
+        ))
+        .unwrap()
+        .join(url)
+        .unwrap()
+        .into_string()
+    } else {
+        request_url.join(url).unwrap().into_string()
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    #[test]
+    fn test_map_mime_type_to_ext() {
+        let mime_subtypes = vec![
+            "apng", "bmp", "gif", "x-icon", "jpeg", "png", "svg+xml", "tiff", "webp",
+        ];
+        let exts = mime_subtypes
+            .into_iter()
+            .map(|mime_type| map_mime_subtype_to_ext(mime_type))
+            .collect::<Vec<&str>>();
+        assert_eq!(
+            vec!["apng", "bmp", "gif", "ico", "jpeg", "png", "svg", "tiff", "webp"],
+            exts
+        );
+    }
+}
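`get_absolute_url` covers the three shapes an `img src` can take. A quick illustration of each branch (the URLs are made up):

```rust
use url::Url;

let origin = Url::parse("https://example.com/blog/post").unwrap();
// Already absolute: returned unchanged.
assert_eq!(
    get_absolute_url("https://cdn.example.com/a.png", &origin),
    "https://cdn.example.com/a.png"
);
// Host-relative: joined against the origin's scheme and host.
assert_eq!(get_absolute_url("/img/a.png", &origin), "https://example.com/img/a.png");
// Document-relative: joined against the full article URL.
assert_eq!(get_absolute_url("a.png", &origin), "https://example.com/blog/a.png");
```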
diff --git a/src/main.rs b/src/main.rs
index e843aea..b74b217 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,96 +1,27 @@
 #[macro_use]
 extern crate lazy_static;
 
-use std::{fs::File, io::Read};
-
 use async_std::task;
-use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
 use url::Url;
 
 mod cli;
+mod epub;
 mod extractor;
+/// This module is responsible for async HTTP calls for downloading
+/// the HTML content and images
+mod http;
 mod moz_readability;
 
+use epub::generate_epub;
+use http::{download_images, fetch_url};
+
 use extractor::Extractor;
 
 fn main() {
-    let app = cli::cli_init();
-    let arg_matches = app.get_matches();
-    let mut urls: Vec<String> = match arg_matches.value_of("file") {
-        Some(file_name) => {
-            if let Ok(mut file) = File::open(file_name) {
-                let mut content = String::new();
-                match file.read_to_string(&mut content) {
-                    Ok(_) => content
-                        .lines()
-                        .filter(|line| !line.is_empty())
-                        .map(|line| line.to_owned())
-                        .collect(),
-                    Err(_) => vec![],
-                }
-            } else {
-                println!("Unable to open file: {}", file_name);
-                vec![]
-            }
-        }
-        None => vec![],
-    };
+    let app_config = cli::cli_init();
 
-    if let Some(vals) = arg_matches.values_of("urls") {
-        urls.extend(
-            vals.filter(|val| !val.is_empty())
-                .map(|val| val.to_string()),
-        );
+    if !app_config.urls().is_empty() {
+        download(app_config.urls().clone());
     }
-
-    if !urls.is_empty() {
-        download(urls);
-    }
-}
-
-type HTMLResource = (String, String);
-
-async fn fetch_url(url: &str) -> Result<HTMLResource, Box<dyn std::error::Error + Send + Sync>> {
-    let client = surf::Client::new();
-    println!("Fetching...");
-
-    let mut redirect_count: u8 = 0;
-    let base_url = Url::parse(&url)?;
-    let mut url = base_url.clone();
-    while redirect_count < 5 {
-        redirect_count += 1;
-        let req = surf::get(&url);
-        let mut res = client.send(req).await?;
-        if res.status().is_redirection() {
-            if let Some(location) = res.header(surf::http::headers::LOCATION) {
-                match Url::parse(location.last().as_str()) {
-                    Ok(valid_url) => url = valid_url,
-                    Err(e) => match e {
-                        url::ParseError::RelativeUrlWithoutBase => {
-                            url = base_url.join(location.last().as_str())?
-                        }
-                        e => return Err(e.into()),
-                    },
-                };
-            }
-        } else if res.status().is_success() {
-            if let Some(mime) = res.content_type() {
-                if mime.essence() == "text/html" {
-                    return Ok((url.to_string(), res.body_string().await?));
-                } else {
-                    return Err(format!(
-                        "Invalid HTTP response. Received {} instead of text/html",
-                        mime.essence()
-                    )
-                    .into());
-                }
-            } else {
-                return Err("Unknown HTTP response".into());
-            }
-        } else {
-            return Err(format!("Request failed: HTTP {}", res.status()).into());
-        }
-    }
-    Err("Unable to fetch HTML".into())
-}
 
 fn download(urls: Vec<String>) {
@@ -98,6 +29,7 @@ fn download(urls: Vec<String>) {
     for url in urls {
         async_url_tasks.push(task::spawn(async move { fetch_url(&url).await }));
     }
+
     task::block_on(async {
         for url_task in async_url_tasks {
             match url_task.await {
@@ -105,47 +37,12 @@
                 Ok((url, html)) => {
                     println!("Extracting");
                     let mut extractor = Extractor::from_html(&html);
                    extractor.extract_content(&url);
+
                     if extractor.article().is_some() {
-                        extractor
-                            .download_images(&Url::parse(&url).unwrap())
+                        download_images(&mut extractor, &Url::parse(&url).unwrap())
                             .await
                             .expect("Unable to download images");
-                        let file_name = format!(
-                            "{}.epub",
-                            extractor
-                                .metadata()
-                                .title()
-                                .replace("/", " ")
-                                .replace("\\", " ")
-                        );
-                        let mut out_file = File::create(&file_name).unwrap();
-                        let mut html_buf = Vec::new();
-                        extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf)
-                            .expect("Unable to serialize to xhtml");
-                        let html_buf = std::str::from_utf8(&html_buf).unwrap();
-                        let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
-                        if let Some(author) = extractor.metadata().byline() {
-                            epub.metadata("author", author.replace("&", "&amp;"))
-                                .unwrap();
-                        }
-                        epub.metadata("title", extractor.metadata().title().replace("&", "&amp;"))
-                            .unwrap();
-                        epub.add_content(EpubContent::new("index.xhtml", html_buf.as_bytes()))
-                            .unwrap();
-                        for img in extractor.img_urls {
-                            let mut file_path = std::env::temp_dir();
-                            file_path.push(&img.0);
-
-                            let img_buf = File::open(&file_path).expect("Can't read file");
-                            epub.add_resource(
-                                file_path.file_name().unwrap(),
-                                img_buf,
-                                img.1.unwrap(),
-                            )
-                            .unwrap();
-                        }
-                        epub.generate(&mut out_file).unwrap();
-                        println!("Created {:?}", file_name);
+                        generate_epub(extractor);
                     }
                 }
                 Err(e) => println!("{}", e),