Add support for merging articles into a single epub

This is still experimental, as it lacks validation of the target file name.
Kenneth Gitere 2021-02-11 13:51:21 +03:00
parent f0a610c2ac
commit b0c4c47413
3 changed files with 107 additions and 38 deletions
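
A note on the limitation called out in the commit message: the only check applied to the merge target is the .epub suffix handling in the first hunk below. Purely as an illustration, and not code from this commit, a minimal validation helper (the name is_valid_merge_target is invented here) could look like this:

    // Hypothetical helper, not part of this commit: reject empty names and
    // path separators before the ".epub" suffix handling runs.
    fn is_valid_merge_target(name: &str) -> bool {
        let stem = name.strip_suffix(".epub").unwrap_or(name);
        !stem.is_empty() && !stem.contains('/') && !stem.contains('\\')
    }

    fn main() {
        assert!(is_valid_merge_target("collection.epub"));
        assert!(!is_valid_merge_target(".epub"));
        assert!(!is_valid_merge_target("../escape.epub"));
    }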

@@ -26,6 +26,13 @@ It takes a url and downloads the article content from it and saves it to an epub
.long("file")
.help("Input file containing links")
.takes_value(true),
)
.arg(
Arg::with_name("output_name")
.long("merge")
.help("Merge multiple articles into a single epub")
.long_help("Merge multiple articles into a single epub that will be given the name provided")
.takes_value(true),
);
let arg_matches = app.get_matches();
let mut urls: Vec<String> = match arg_matches.value_of("file") {
@@ -57,12 +64,21 @@ It takes a url and downloads the article content from it and saves it to an epub
let mut app_config = AppConfig::new();
app_config.set_urls(urls);
if let Some(name) = arg_matches.value_of("output_name") {
let file_name = if name.ends_with(".epub") && name.len() > 5 {
name.to_owned()
} else {
name.to_owned() + ".epub"
};
app_config.set_merged(file_name);
}
app_config
}
pub struct AppConfig {
urls: Vec<String>,
max_conn: usize,
merged: Option<String>,
}
impl AppConfig {
@@ -70,6 +86,7 @@ impl AppConfig {
Self {
urls: vec![],
max_conn: 8,
merged: None,
}
}
@@ -77,10 +94,18 @@ impl AppConfig {
self.urls.extend(urls);
}
fn set_merged(&mut self, name: String) {
self.merged = Some(name);
}
pub fn urls(&self) -> &Vec<String> {
&self.urls
}
pub fn max_conn(&self) -> usize {
self.max_conn
}
pub fn merged(&self) -> Option<&String> {
self.merged.as_ref()
}
}
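
For reference, the output_name branch above appends the .epub suffix unless the name already ends with it and is longer than the bare extension. A standalone sketch of that behaviour, with the normalize helper name invented here for illustration:

    // Sketch only: mirrors the suffix handling in the output_name branch above.
    fn normalize(name: &str) -> String {
        if name.ends_with(".epub") && name.len() > 5 {
            name.to_owned()
        } else {
            name.to_owned() + ".epub"
        }
    }

    fn main() {
        assert_eq!(normalize("articles"), "articles.epub");
        assert_eq!(normalize("articles.epub"), "articles.epub");
        // Edge case: a bare ".epub" fails the length check and becomes ".epub.epub".
        assert_eq!(normalize(".epub"), ".epub.epub");
    }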

@@ -4,42 +4,83 @@ use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
use crate::extractor::{self, Extractor};
pub fn generate_epub(extractor: Extractor) {
    let file_name = format!(
        "{}.epub",
        extractor
            .metadata()
            .title()
            .replace("/", " ")
            .replace("\\", " ")
    );
    let mut out_file = File::create(&file_name).unwrap();
    let mut html_buf = Vec::new();
    extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf)
        .expect("Unable to serialize to xhtml");
    let html_buf = std::str::from_utf8(&html_buf).unwrap();
    let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
    if let Some(author) = extractor.metadata().byline() {
        epub.metadata("author", replace_metadata_value(author))
            .unwrap();
    }
    epub.metadata(
        "title",
        replace_metadata_value(extractor.metadata().title()),
    )
    .unwrap();
    epub.add_content(EpubContent::new("index.xhtml", html_buf.as_bytes()))
        .unwrap();
    for img in extractor.img_urls {
        let mut file_path = std::env::temp_dir();
        file_path.push(&img.0);
        let img_buf = File::open(&file_path).expect("Can't read file");
        epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap())
            .unwrap();
    }
    epub.generate(&mut out_file).unwrap();
    println!("Created {:?}", file_name);
}
pub fn generate_epubs(articles: Vec<Extractor>, merged: Option<&String>) {
    match merged {
        Some(name) => {
            let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
            epub.inline_toc();
            epub = articles
                .iter()
                .enumerate()
                .fold(epub, |mut epub, (idx, article)| {
                    let mut html_buf = Vec::new();
                    extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)
                        .expect("Unable to serialize to xhtml");
                    let html_str = std::str::from_utf8(&html_buf).unwrap();
                    epub.metadata("title", replace_metadata_value(name))
                        .unwrap();
                    let section_name = article.metadata().title();
                    epub.add_content(
                        EpubContent::new(format!("article_{}.xhtml", idx), html_str.as_bytes())
                            .title(replace_metadata_value(section_name)),
                    )
                    .unwrap();
                    article.img_urls.iter().for_each(|img| {
                        let mut file_path = std::env::temp_dir();
                        file_path.push(&img.0);
                        let img_buf = File::open(&file_path).expect("Can't read file");
                        epub.add_resource(
                            file_path.file_name().unwrap(),
                            img_buf,
                            img.1.as_ref().unwrap(),
                        )
                        .unwrap();
                    });
                    epub
                });
            let mut out_file = File::create(&name).unwrap();
            epub.generate(&mut out_file).unwrap();
            println!("Created {:?}", name);
        }
        None => {
            for article in articles {
                let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
                let file_name = format!(
                    "{}.epub",
                    article
                        .metadata()
                        .title()
                        .replace("/", " ")
                        .replace("\\", " ")
                );
                let mut out_file = File::create(&file_name).unwrap();
                let mut html_buf = Vec::new();
                extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)
                    .expect("Unable to serialize to xhtml");
                let html_str = std::str::from_utf8(&html_buf).unwrap();
                if let Some(author) = article.metadata().byline() {
                    epub.metadata("author", replace_metadata_value(author))
                        .unwrap();
                }
                epub.metadata("title", replace_metadata_value(article.metadata().title()))
                    .unwrap();
                epub.add_content(EpubContent::new("index.xhtml", html_str.as_bytes()))
                    .unwrap();
                for img in article.img_urls {
                    let mut file_path = std::env::temp_dir();
                    file_path.push(&img.0);
                    let img_buf = File::open(&file_path).expect("Can't read file");
                    epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap())
                        .unwrap();
                }
                epub.generate(&mut out_file).unwrap();
                println!("Created {:?}", file_name);
            }
        }
    }
}
/// Replaces characters that have to be escaped before adding to the epub's metadata
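
The merge branch above threads one EpubBuilder through all articles with fold, adding each as a titled article_{}.xhtml section behind the inline table of contents. The same accumulator pattern in isolation, as a standalone sketch rather than code from the commit:

    // Standalone sketch of the fold-as-builder pattern: each step takes the
    // accumulator by value, mutates it, and returns it for the next article.
    fn main() {
        let chapters = vec!["intro", "body", "outro"];
        let toc = chapters
            .iter()
            .enumerate()
            .fold(String::new(), |mut acc, (idx, chapter)| {
                acc.push_str(&format!("article_{}.xhtml: {}\n", idx, chapter));
                acc
            });
        print!("{}", toc);
    }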

@@ -15,7 +15,7 @@ mod http;
mod moz_readability;
use cli::AppConfig;
use epub::generate_epub;
use epub::generate_epubs;
use extractor::Extractor;
use http::{download_images, fetch_url};
@@ -28,9 +28,10 @@ fn main() {
}
fn download(app_config: AppConfig) {
task::block_on(async {
let articles = task::block_on(async {
let urls_iter = app_config.urls().iter().map(|url| fetch_url(url));
let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn());
let mut articles = Vec::new();
while let Some(fetch_result) = responses.next().await {
match fetch_result {
Ok((url, html)) => {
@@ -43,11 +44,13 @@ fn download(app_config: AppConfig) {
download_images(&mut extractor, &Url::parse(&url).unwrap())
.await
.expect("Unable to download images");
generate_epub(extractor);
articles.push(extractor);
}
}
Err(e) => println!("{}", e),
}
}
})
articles
});
generate_epubs(articles, app_config.merged());
}