Add support for merging articles into a single epub

This is still experimental, as it lacks validation of the target file name.
This commit is contained in:
Kenneth Gitere 2021-02-11 13:51:21 +03:00
parent f0a610c2ac
commit b0c4c47413
3 changed files with 107 additions and 38 deletions

View file

@ -26,6 +26,13 @@ It takes a url and downloads the article content from it and saves it to an epub
.long("file") .long("file")
.help("Input file containing links") .help("Input file containing links")
.takes_value(true), .takes_value(true),
)
.arg(
Arg::with_name("output_name")
.long("merge")
.help("Merge multiple articles into a single epub")
.long_help("Merge multiple articles into a single epub that will be given the name provided")
.takes_value(true),
); );
let arg_matches = app.get_matches(); let arg_matches = app.get_matches();
let mut urls: Vec<String> = match arg_matches.value_of("file") { let mut urls: Vec<String> = match arg_matches.value_of("file") {
@ -57,12 +64,21 @@ It takes a url and downloads the article content from it and saves it to an epub
let mut app_config = AppConfig::new(); let mut app_config = AppConfig::new();
app_config.set_urls(urls); app_config.set_urls(urls);
if let Some(name) = arg_matches.value_of("output_name") {
let file_name = if name.ends_with(".epub") && name.len() > 5 {
name.to_owned()
} else {
name.to_owned() + ".epub"
};
app_config.set_merged(file_name);
}
app_config app_config
} }
pub struct AppConfig { pub struct AppConfig {
urls: Vec<String>, urls: Vec<String>,
max_conn: usize, max_conn: usize,
merged: Option<String>,
} }
impl AppConfig { impl AppConfig {
@ -70,6 +86,7 @@ impl AppConfig {
Self { Self {
urls: vec![], urls: vec![],
max_conn: 8, max_conn: 8,
merged: None,
} }
} }
@ -77,10 +94,18 @@ impl AppConfig {
self.urls.extend(urls); self.urls.extend(urls);
} }
/// Stores `name` in the `merged` field, replacing any previously stored value.
fn set_merged(&mut self, name: String) {
    let merged_name = Some(name);
    self.merged = merged_name;
}
pub fn urls(&self) -> &Vec<String> { pub fn urls(&self) -> &Vec<String> {
&self.urls &self.urls
} }
pub fn max_conn(&self) -> usize { pub fn max_conn(&self) -> usize {
self.max_conn self.max_conn
} }
/// Returns a reference to the stored merged-epub file name, or `None` if
/// the `merged` field is empty.
pub fn merged(&self) -> Option<&String> {
    match &self.merged {
        Some(file_name) => Some(file_name),
        None => None,
    }
}
} }

View file

@ -4,10 +4,52 @@ use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
use crate::extractor::{self, Extractor}; use crate::extractor::{self, Extractor};
pub fn generate_epub(extractor: Extractor) { pub fn generate_epubs(articles: Vec<Extractor>, merged: Option<&String>) {
match merged {
Some(name) => {
let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
epub.inline_toc();
epub = articles
.iter()
.enumerate()
.fold(epub, |mut epub, (idx, article)| {
let mut html_buf = Vec::new();
extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)
.expect("Unable to serialize to xhtml");
let html_str = std::str::from_utf8(&html_buf).unwrap();
epub.metadata("title", replace_metadata_value(name))
.unwrap();
let section_name = article.metadata().title();
epub.add_content(
EpubContent::new(format!("article_{}.xhtml", idx), html_str.as_bytes())
.title(replace_metadata_value(section_name)),
)
.unwrap();
article.img_urls.iter().for_each(|img| {
let mut file_path = std::env::temp_dir();
file_path.push(&img.0);
let img_buf = File::open(&file_path).expect("Can't read file");
epub.add_resource(
file_path.file_name().unwrap(),
img_buf,
img.1.as_ref().unwrap(),
)
.unwrap();
});
epub
});
let mut out_file = File::create(&name).unwrap();
epub.generate(&mut out_file).unwrap();
println!("Created {:?}", name);
}
None => {
for article in articles {
let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
let file_name = format!( let file_name = format!(
"{}.epub", "{}.epub",
extractor article
.metadata() .metadata()
.title() .title()
.replace("/", " ") .replace("/", " ")
@ -15,22 +57,18 @@ pub fn generate_epub(extractor: Extractor) {
); );
let mut out_file = File::create(&file_name).unwrap(); let mut out_file = File::create(&file_name).unwrap();
let mut html_buf = Vec::new(); let mut html_buf = Vec::new();
extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf) extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)
.expect("Unable to serialize to xhtml"); .expect("Unable to serialize to xhtml");
let html_buf = std::str::from_utf8(&html_buf).unwrap(); let html_str = std::str::from_utf8(&html_buf).unwrap();
let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap(); if let Some(author) = article.metadata().byline() {
if let Some(author) = extractor.metadata().byline() {
epub.metadata("author", replace_metadata_value(author)) epub.metadata("author", replace_metadata_value(author))
.unwrap(); .unwrap();
} }
epub.metadata( epub.metadata("title", replace_metadata_value(article.metadata().title()))
"title",
replace_metadata_value(extractor.metadata().title()),
)
.unwrap(); .unwrap();
epub.add_content(EpubContent::new("index.xhtml", html_buf.as_bytes())) epub.add_content(EpubContent::new("index.xhtml", html_str.as_bytes()))
.unwrap(); .unwrap();
for img in extractor.img_urls { for img in article.img_urls {
let mut file_path = std::env::temp_dir(); let mut file_path = std::env::temp_dir();
file_path.push(&img.0); file_path.push(&img.0);
@ -41,6 +79,9 @@ pub fn generate_epub(extractor: Extractor) {
epub.generate(&mut out_file).unwrap(); epub.generate(&mut out_file).unwrap();
println!("Created {:?}", file_name); println!("Created {:?}", file_name);
} }
}
}
}
/// Replaces characters that have to be escaped before adding to the epub's metadata /// Replaces characters that have to be escaped before adding to the epub's metadata
fn replace_metadata_value(value: &str) -> String { fn replace_metadata_value(value: &str) -> String {

View file

@ -15,7 +15,7 @@ mod http;
mod moz_readability; mod moz_readability;
use cli::AppConfig; use cli::AppConfig;
use epub::generate_epub; use epub::generate_epubs;
use extractor::Extractor; use extractor::Extractor;
use http::{download_images, fetch_url}; use http::{download_images, fetch_url};
@ -28,9 +28,10 @@ fn main() {
} }
fn download(app_config: AppConfig) { fn download(app_config: AppConfig) {
task::block_on(async { let articles = task::block_on(async {
let urls_iter = app_config.urls().iter().map(|url| fetch_url(url)); let urls_iter = app_config.urls().iter().map(|url| fetch_url(url));
let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn()); let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn());
let mut articles = Vec::new();
while let Some(fetch_result) = responses.next().await { while let Some(fetch_result) = responses.next().await {
match fetch_result { match fetch_result {
Ok((url, html)) => { Ok((url, html)) => {
@ -43,11 +44,13 @@ fn download(app_config: AppConfig) {
download_images(&mut extractor, &Url::parse(&url).unwrap()) download_images(&mut extractor, &Url::parse(&url).unwrap())
.await .await
.expect("Unable to download images"); .expect("Unable to download images");
generate_epub(extractor); articles.push(extractor);
} }
} }
Err(e) => println!("{}", e), Err(e) => println!("{}", e),
} }
} }
}) articles
});
generate_epubs(articles, app_config.merged());
} }