Add support for merging articles into a single epub
This is still experimental, as the target file name is not yet validated.
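For example, assuming the binary is invoked as paperoni, merging every article listed in an input file into one book would look roughly like:

    paperoni --file links.txt --merge collection.epub

If the value passed to --merge does not end in ".epub", the extension is appended automatically (see the change in src/cli.rs below).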
parent f0a610c2ac
commit b0c4c47413
3 changed files with 107 additions and 38 deletions
25  src/cli.rs
@@ -26,6 +26,13 @@ It takes a url and downloads the article content from it and saves it to an epub
                 .long("file")
                 .help("Input file containing links")
                 .takes_value(true),
+        )
+        .arg(
+            Arg::with_name("output_name")
+                .long("merge")
+                .help("Merge multiple articles into a single epub")
+                .long_help("Merge multiple articles into a single epub that will be given the name provided")
+                .takes_value(true),
         );
     let arg_matches = app.get_matches();
     let mut urls: Vec<String> = match arg_matches.value_of("file") {
@@ -57,12 +64,21 @@ It takes a url and downloads the article content from it and saves it to an epub
 
     let mut app_config = AppConfig::new();
     app_config.set_urls(urls);
+    if let Some(name) = arg_matches.value_of("output_name") {
+        let file_name = if name.ends_with(".epub") && name.len() > 5 {
+            name.to_owned()
+        } else {
+            name.to_owned() + ".epub"
+        };
+        app_config.set_merged(file_name);
+    }
     app_config
 }
 
 pub struct AppConfig {
     urls: Vec<String>,
     max_conn: usize,
+    merged: Option<String>,
 }
 
 impl AppConfig {
@@ -70,6 +86,7 @@ impl AppConfig {
         Self {
             urls: vec![],
             max_conn: 8,
+            merged: None,
         }
     }
 
@@ -77,10 +94,18 @@ impl AppConfig {
         self.urls.extend(urls);
     }
 
+    fn set_merged(&mut self, name: String) {
+        self.merged = Some(name);
+    }
+
     pub fn urls(&self) -> &Vec<String> {
         &self.urls
     }
     pub fn max_conn(&self) -> usize {
         self.max_conn
     }
+
+    pub fn merged(&self) -> Option<&String> {
+        self.merged.as_ref()
+    }
 }
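Since the missing piece called out in the commit message is validation of the target file name, here is a minimal sketch of one possible check. This is a hypothetical helper, not part of this commit; it could run on the value of output_name before set_merged is called:

    // Hypothetical helper, not part of this commit: one possible shape for the
    // validation that the commit message notes is still missing.
    fn validate_output_name(name: &str) -> Result<(), String> {
        // Strip a trailing ".epub" so only the stem is checked.
        let stem = name.strip_suffix(".epub").unwrap_or(name);
        if stem.is_empty() {
            return Err("output file name must not be empty".into());
        }
        if stem.contains('/') || stem.contains('\\') {
            return Err("output file name must not contain path separators".into());
        }
        Ok(())
    }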
109  src/epub.rs
@@ -4,42 +4,83 @@ use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
 
 use crate::extractor::{self, Extractor};
 
-pub fn generate_epub(extractor: Extractor) {
-    let file_name = format!(
-        "{}.epub",
-        extractor
-            .metadata()
-            .title()
-            .replace("/", " ")
-            .replace("\\", " ")
-    );
-    let mut out_file = File::create(&file_name).unwrap();
-    let mut html_buf = Vec::new();
-    extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf)
-        .expect("Unable to serialize to xhtml");
-    let html_buf = std::str::from_utf8(&html_buf).unwrap();
-    let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
-    if let Some(author) = extractor.metadata().byline() {
-        epub.metadata("author", replace_metadata_value(author))
-            .unwrap();
-    }
-    epub.metadata(
-        "title",
-        replace_metadata_value(extractor.metadata().title()),
-    )
-    .unwrap();
-    epub.add_content(EpubContent::new("index.xhtml", html_buf.as_bytes()))
-        .unwrap();
-    for img in extractor.img_urls {
-        let mut file_path = std::env::temp_dir();
-        file_path.push(&img.0);
+pub fn generate_epubs(articles: Vec<Extractor>, merged: Option<&String>) {
+    match merged {
+        Some(name) => {
+            let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
+            epub.inline_toc();
+            epub = articles
+                .iter()
+                .enumerate()
+                .fold(epub, |mut epub, (idx, article)| {
+                    let mut html_buf = Vec::new();
+                    extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)
+                        .expect("Unable to serialize to xhtml");
+                    let html_str = std::str::from_utf8(&html_buf).unwrap();
+                    epub.metadata("title", replace_metadata_value(name))
+                        .unwrap();
+                    let section_name = article.metadata().title();
+                    epub.add_content(
+                        EpubContent::new(format!("article_{}.xhtml", idx), html_str.as_bytes())
+                            .title(replace_metadata_value(section_name)),
+                    )
+                    .unwrap();
 
-        let img_buf = File::open(&file_path).expect("Can't read file");
-        epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap())
-            .unwrap();
+                    article.img_urls.iter().for_each(|img| {
+                        let mut file_path = std::env::temp_dir();
+                        file_path.push(&img.0);
+
+                        let img_buf = File::open(&file_path).expect("Can't read file");
+                        epub.add_resource(
+                            file_path.file_name().unwrap(),
+                            img_buf,
+                            img.1.as_ref().unwrap(),
+                        )
+                        .unwrap();
+                    });
+                    epub
+                });
+            let mut out_file = File::create(&name).unwrap();
+            epub.generate(&mut out_file).unwrap();
+            println!("Created {:?}", name);
+        }
+        None => {
+            for article in articles {
+                let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
+                let file_name = format!(
+                    "{}.epub",
+                    article
+                        .metadata()
+                        .title()
+                        .replace("/", " ")
+                        .replace("\\", " ")
+                );
+                let mut out_file = File::create(&file_name).unwrap();
+                let mut html_buf = Vec::new();
+                extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)
+                    .expect("Unable to serialize to xhtml");
+                let html_str = std::str::from_utf8(&html_buf).unwrap();
+                if let Some(author) = article.metadata().byline() {
+                    epub.metadata("author", replace_metadata_value(author))
+                        .unwrap();
+                }
+                epub.metadata("title", replace_metadata_value(article.metadata().title()))
+                    .unwrap();
+                epub.add_content(EpubContent::new("index.xhtml", html_str.as_bytes()))
+                    .unwrap();
+                for img in article.img_urls {
+                    let mut file_path = std::env::temp_dir();
+                    file_path.push(&img.0);
+
+                    let img_buf = File::open(&file_path).expect("Can't read file");
+                    epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap())
+                        .unwrap();
+                }
+                epub.generate(&mut out_file).unwrap();
+                println!("Created {:?}", file_name);
+            }
+        }
     }
-    epub.generate(&mut out_file).unwrap();
-    println!("Created {:?}", file_name);
 }
 
 /// Replaces characters that have to be escaped before adding to the epub's metadata
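Stripped of image handling, the merged branch above boils down to one EpubBuilder that gets an inline table of contents and one EpubContent section per article. A distilled, self-contained sketch of that pattern (the titles, bodies, and output path below are made up for illustration):

    use std::fs::File;

    use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};

    fn main() {
        // Stand-ins for the extracted articles: (section title, xhtml body) pairs.
        let articles = vec![
            ("First article", "<p>first body</p>"),
            ("Second article", "<p>second body</p>"),
        ];

        let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
        // The generated table of contents is written into the book itself.
        epub.inline_toc();
        epub.metadata("title", "collection").unwrap();
        for (idx, (title, body)) in articles.iter().enumerate() {
            // Each article becomes its own xhtml section, titled so it shows up in the TOC.
            epub.add_content(
                EpubContent::new(format!("article_{}.xhtml", idx), body.as_bytes())
                    .title(*title),
            )
            .unwrap();
        }
        let mut out_file = File::create("collection.epub").unwrap();
        epub.generate(&mut out_file).unwrap();
    }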
11  src/main.rs
@@ -15,7 +15,7 @@ mod http;
 mod moz_readability;
 
 use cli::AppConfig;
-use epub::generate_epub;
+use epub::generate_epubs;
 use extractor::Extractor;
 use http::{download_images, fetch_url};
 
@@ -28,9 +28,10 @@ fn main() {
 }
 
 fn download(app_config: AppConfig) {
-    task::block_on(async {
+    let articles = task::block_on(async {
         let urls_iter = app_config.urls().iter().map(|url| fetch_url(url));
         let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn());
+        let mut articles = Vec::new();
         while let Some(fetch_result) = responses.next().await {
             match fetch_result {
                 Ok((url, html)) => {
@@ -43,11 +44,13 @@ fn download(app_config: AppConfig) {
                     download_images(&mut extractor, &Url::parse(&url).unwrap())
                         .await
                         .expect("Unable to download images");
-                    generate_epub(extractor);
+                    articles.push(extractor);
                     }
                 }
                 Err(e) => println!("{}", e),
             }
         }
-    })
+        articles
+    });
+    generate_epubs(articles, app_config.merged());
 }