Add support for merging articles into a single epub

This is still experimental, as it lacks validation of the target file name.
This commit is contained in:
Kenneth Gitere 2021-02-11 13:51:21 +03:00
parent f0a610c2ac
commit b0c4c47413
3 changed files with 107 additions and 38 deletions

View file

@ -26,6 +26,13 @@ It takes a url and downloads the article content from it and saves it to an epub
.long("file") .long("file")
.help("Input file containing links") .help("Input file containing links")
.takes_value(true), .takes_value(true),
)
.arg(
Arg::with_name("output_name")
.long("merge")
.help("Merge multiple articles into a single epub")
.long_help("Merge multiple articles into a single epub that will be given the name provided")
.takes_value(true),
); );
let arg_matches = app.get_matches(); let arg_matches = app.get_matches();
let mut urls: Vec<String> = match arg_matches.value_of("file") { let mut urls: Vec<String> = match arg_matches.value_of("file") {
@ -57,12 +64,21 @@ It takes a url and downloads the article content from it and saves it to an epub
let mut app_config = AppConfig::new(); let mut app_config = AppConfig::new();
app_config.set_urls(urls); app_config.set_urls(urls);
if let Some(name) = arg_matches.value_of("output_name") {
let file_name = if name.ends_with(".epub") && name.len() > 5 {
name.to_owned()
} else {
name.to_owned() + ".epub"
};
app_config.set_merged(file_name);
}
app_config app_config
} }
pub struct AppConfig { pub struct AppConfig {
urls: Vec<String>, urls: Vec<String>,
max_conn: usize, max_conn: usize,
merged: Option<String>,
} }
impl AppConfig { impl AppConfig {
@ -70,6 +86,7 @@ impl AppConfig {
Self { Self {
urls: vec![], urls: vec![],
max_conn: 8, max_conn: 8,
merged: None,
} }
} }
@ -77,10 +94,18 @@ impl AppConfig {
self.urls.extend(urls); self.urls.extend(urls);
} }
/// Stores `name` as the output file name used when articles are merged
/// into a single epub.
///
/// Any name stored earlier is overwritten.
fn set_merged(&mut self, name: String) {
    // `replace` hands back the old value; it is intentionally dropped here.
    let _previous = self.merged.replace(name);
}
pub fn urls(&self) -> &Vec<String> { pub fn urls(&self) -> &Vec<String> {
&self.urls &self.urls
} }
pub fn max_conn(&self) -> usize { pub fn max_conn(&self) -> usize {
self.max_conn self.max_conn
} }
/// Returns a reference to the stored merged-epub file name, or `None`
/// when no name has been set.
pub fn merged(&self) -> Option<&String> {
    match &self.merged {
        Some(name) => Some(name),
        None => None,
    }
}
} }

View file

@ -4,42 +4,83 @@ use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
use crate::extractor::{self, Extractor}; use crate::extractor::{self, Extractor};
pub fn generate_epub(extractor: Extractor) { pub fn generate_epubs(articles: Vec<Extractor>, merged: Option<&String>) {
let file_name = format!( match merged {
"{}.epub", Some(name) => {
extractor let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
.metadata() epub.inline_toc();
.title() epub = articles
.replace("/", " ") .iter()
.replace("\\", " ") .enumerate()
); .fold(epub, |mut epub, (idx, article)| {
let mut out_file = File::create(&file_name).unwrap(); let mut html_buf = Vec::new();
let mut html_buf = Vec::new(); extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)
extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf) .expect("Unable to serialize to xhtml");
.expect("Unable to serialize to xhtml"); let html_str = std::str::from_utf8(&html_buf).unwrap();
let html_buf = std::str::from_utf8(&html_buf).unwrap(); epub.metadata("title", replace_metadata_value(name))
let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap(); .unwrap();
if let Some(author) = extractor.metadata().byline() { let section_name = article.metadata().title();
epub.metadata("author", replace_metadata_value(author)) epub.add_content(
.unwrap(); EpubContent::new(format!("article_{}.xhtml", idx), html_str.as_bytes())
} .title(replace_metadata_value(section_name)),
epub.metadata( )
"title", .unwrap();
replace_metadata_value(extractor.metadata().title()),
)
.unwrap();
epub.add_content(EpubContent::new("index.xhtml", html_buf.as_bytes()))
.unwrap();
for img in extractor.img_urls {
let mut file_path = std::env::temp_dir();
file_path.push(&img.0);
let img_buf = File::open(&file_path).expect("Can't read file"); article.img_urls.iter().for_each(|img| {
epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap()) let mut file_path = std::env::temp_dir();
.unwrap(); file_path.push(&img.0);
let img_buf = File::open(&file_path).expect("Can't read file");
epub.add_resource(
file_path.file_name().unwrap(),
img_buf,
img.1.as_ref().unwrap(),
)
.unwrap();
});
epub
});
let mut out_file = File::create(&name).unwrap();
epub.generate(&mut out_file).unwrap();
println!("Created {:?}", name);
}
None => {
for article in articles {
let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
let file_name = format!(
"{}.epub",
article
.metadata()
.title()
.replace("/", " ")
.replace("\\", " ")
);
let mut out_file = File::create(&file_name).unwrap();
let mut html_buf = Vec::new();
extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)
.expect("Unable to serialize to xhtml");
let html_str = std::str::from_utf8(&html_buf).unwrap();
if let Some(author) = article.metadata().byline() {
epub.metadata("author", replace_metadata_value(author))
.unwrap();
}
epub.metadata("title", replace_metadata_value(article.metadata().title()))
.unwrap();
epub.add_content(EpubContent::new("index.xhtml", html_str.as_bytes()))
.unwrap();
for img in article.img_urls {
let mut file_path = std::env::temp_dir();
file_path.push(&img.0);
let img_buf = File::open(&file_path).expect("Can't read file");
epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap())
.unwrap();
}
epub.generate(&mut out_file).unwrap();
println!("Created {:?}", file_name);
}
}
} }
epub.generate(&mut out_file).unwrap();
println!("Created {:?}", file_name);
} }
/// Replaces characters that have to be escaped before adding to the epub's metadata /// Replaces characters that have to be escaped before adding to the epub's metadata

View file

@ -15,7 +15,7 @@ mod http;
mod moz_readability; mod moz_readability;
use cli::AppConfig; use cli::AppConfig;
use epub::generate_epub; use epub::generate_epubs;
use extractor::Extractor; use extractor::Extractor;
use http::{download_images, fetch_url}; use http::{download_images, fetch_url};
@ -28,9 +28,10 @@ fn main() {
} }
fn download(app_config: AppConfig) { fn download(app_config: AppConfig) {
task::block_on(async { let articles = task::block_on(async {
let urls_iter = app_config.urls().iter().map(|url| fetch_url(url)); let urls_iter = app_config.urls().iter().map(|url| fetch_url(url));
let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn()); let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn());
let mut articles = Vec::new();
while let Some(fetch_result) = responses.next().await { while let Some(fetch_result) = responses.next().await {
match fetch_result { match fetch_result {
Ok((url, html)) => { Ok((url, html)) => {
@ -43,11 +44,13 @@ fn download(app_config: AppConfig) {
download_images(&mut extractor, &Url::parse(&url).unwrap()) download_images(&mut extractor, &Url::parse(&url).unwrap())
.await .await
.expect("Unable to download images"); .expect("Unable to download images");
generate_epub(extractor); articles.push(extractor);
} }
} }
Err(e) => println!("{}", e), Err(e) => println!("{}", e),
} }
} }
}) articles
});
generate_epubs(articles, app_config.merged());
} }