From b0c4c47413f3b75ef477ad9f5df3667cdcc7227c Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Thu, 11 Feb 2021 13:51:21 +0300 Subject: [PATCH] Add support for merging articles into a single epub This is still experimental as it lacks validation of the target file name --- src/cli.rs | 25 ++++++++++++ src/epub.rs | 109 ++++++++++++++++++++++++++++++++++++---------------- src/main.rs | 11 ++++-- 3 files changed, 107 insertions(+), 38 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index 1971fac..33b5072 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -26,6 +26,13 @@ It takes a url and downloads the article content from it and saves it to an epub .long("file") .help("Input file containing links") .takes_value(true), + ) + .arg( + Arg::with_name("output_name") + .long("merge") + .help("Merge multiple articles into a single epub") + .long_help("Merge multiple articles into a single epub that will be given the name provided") + .takes_value(true), ); let arg_matches = app.get_matches(); let mut urls: Vec = match arg_matches.value_of("file") { @@ -57,12 +64,21 @@ It takes a url and downloads the article content from it and saves it to an epub let mut app_config = AppConfig::new(); app_config.set_urls(urls); + if let Some(name) = arg_matches.value_of("output_name") { + let file_name = if name.ends_with(".epub") && name.len() > 5 { + name.to_owned() + } else { + name.to_owned() + ".epub" + }; + app_config.set_merged(file_name); + } app_config } pub struct AppConfig { urls: Vec, max_conn: usize, + merged: Option, } impl AppConfig { @@ -70,6 +86,7 @@ impl AppConfig { Self { urls: vec![], max_conn: 8, + merged: None, } } @@ -77,10 +94,18 @@ impl AppConfig { self.urls.extend(urls); } + fn set_merged(&mut self, name: String) { + self.merged = Some(name); + } + pub fn urls(&self) -> &Vec { &self.urls } pub fn max_conn(&self) -> usize { self.max_conn } + + pub fn merged(&self) -> Option<&String> { + self.merged.as_ref() + } } diff --git a/src/epub.rs b/src/epub.rs index 714bd73..e6e0376 100644 --- a/src/epub.rs +++ b/src/epub.rs @@ -4,42 +4,83 @@ use epub_builder::{EpubBuilder, EpubContent, ZipLibrary}; use crate::extractor::{self, Extractor}; -pub fn generate_epub(extractor: Extractor) { - let file_name = format!( - "{}.epub", - extractor - .metadata() - .title() - .replace("/", " ") - .replace("\\", " ") - ); - let mut out_file = File::create(&file_name).unwrap(); - let mut html_buf = Vec::new(); - extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf) - .expect("Unable to serialize to xhtml"); - let html_buf = std::str::from_utf8(&html_buf).unwrap(); - let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap(); - if let Some(author) = extractor.metadata().byline() { - epub.metadata("author", replace_metadata_value(author)) - .unwrap(); - } - epub.metadata( - "title", - replace_metadata_value(extractor.metadata().title()), - ) - .unwrap(); - epub.add_content(EpubContent::new("index.xhtml", html_buf.as_bytes())) - .unwrap(); - for img in extractor.img_urls { - let mut file_path = std::env::temp_dir(); - file_path.push(&img.0); +pub fn generate_epubs(articles: Vec, merged: Option<&String>) { + match merged { + Some(name) => { + let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap(); + epub.inline_toc(); + epub = articles + .iter() + .enumerate() + .fold(epub, |mut epub, (idx, article)| { + let mut html_buf = Vec::new(); + extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf) + .expect("Unable to serialize to xhtml"); + let html_str = std::str::from_utf8(&html_buf).unwrap(); + epub.metadata("title", replace_metadata_value(name)) + .unwrap(); + let section_name = article.metadata().title(); + epub.add_content( + EpubContent::new(format!("article_{}.xhtml", idx), html_str.as_bytes()) + .title(replace_metadata_value(section_name)), + ) + .unwrap(); - let img_buf = File::open(&file_path).expect("Can't read file"); - epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap()) - .unwrap(); + article.img_urls.iter().for_each(|img| { + let mut file_path = std::env::temp_dir(); + file_path.push(&img.0); + + let img_buf = File::open(&file_path).expect("Can't read file"); + epub.add_resource( + file_path.file_name().unwrap(), + img_buf, + img.1.as_ref().unwrap(), + ) + .unwrap(); + }); + epub + }); + let mut out_file = File::create(&name).unwrap(); + epub.generate(&mut out_file).unwrap(); + println!("Created {:?}", name); + } + None => { + for article in articles { + let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap(); + let file_name = format!( + "{}.epub", + article + .metadata() + .title() + .replace("/", " ") + .replace("\\", " ") + ); + let mut out_file = File::create(&file_name).unwrap(); + let mut html_buf = Vec::new(); + extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf) + .expect("Unable to serialize to xhtml"); + let html_str = std::str::from_utf8(&html_buf).unwrap(); + if let Some(author) = article.metadata().byline() { + epub.metadata("author", replace_metadata_value(author)) + .unwrap(); + } + epub.metadata("title", replace_metadata_value(article.metadata().title())) + .unwrap(); + epub.add_content(EpubContent::new("index.xhtml", html_str.as_bytes())) + .unwrap(); + for img in article.img_urls { + let mut file_path = std::env::temp_dir(); + file_path.push(&img.0); + + let img_buf = File::open(&file_path).expect("Can't read file"); + epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap()) + .unwrap(); + } + epub.generate(&mut out_file).unwrap(); + println!("Created {:?}", file_name); + } + } } - epub.generate(&mut out_file).unwrap(); - println!("Created {:?}", file_name); } /// Replaces characters that have to be escaped before adding to the epub's metadata diff --git a/src/main.rs b/src/main.rs index ec983ab..5ba4ad9 100644 --- a/src/main.rs +++ b/src/main.rs @@ -15,7 +15,7 @@ mod http; mod moz_readability; use cli::AppConfig; -use epub::generate_epub; +use epub::generate_epubs; use extractor::Extractor; use http::{download_images, fetch_url}; @@ -28,9 +28,10 @@ fn main() { } fn download(app_config: AppConfig) { - task::block_on(async { + let articles = task::block_on(async { let urls_iter = app_config.urls().iter().map(|url| fetch_url(url)); let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn()); + let mut articles = Vec::new(); while let Some(fetch_result) = responses.next().await { match fetch_result { Ok((url, html)) => { @@ -43,11 +44,13 @@ fn download(app_config: AppConfig) { download_images(&mut extractor, &Url::parse(&url).unwrap()) .await .expect("Unable to download images"); - generate_epub(extractor); + articles.push(extractor); } } Err(e) => println!("{}", e), } } - }) + articles + }); + generate_epubs(articles, app_config.merged()); }