Add support for merging articles into a single epub
This is still experimental, as it lacks validation of the target file name.
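With this change, passing --merge <name> together with --file <links file> bundles all fetched articles into a single epub named <name>; if the supplied name does not already end in ".epub", the extension is appended. An illustrative invocation (the binary name and link file are placeholders): paperoni --file links.txt --merge collection.epub

A minimal sketch of the target file name validation that is still missing; purely illustrative, not part of this commit:

// Hypothetical helper, not included in this commit: rejects empty names
// and names containing path separators before ".epub" is appended.
fn is_valid_output_name(name: &str) -> bool {
    !name.is_empty() && !name.contains('/') && !name.contains('\\')
}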
parent f0a610c2ac
commit b0c4c47413
3 changed files with 107 additions and 38 deletions
src/cli.rs (25 changes)
@@ -26,6 +26,13 @@ It takes a url and downloads the article content from it and saves it to an epub
                 .long("file")
                 .help("Input file containing links")
                 .takes_value(true),
+        )
+        .arg(
+            Arg::with_name("output_name")
+                .long("merge")
+                .help("Merge multiple articles into a single epub")
+                .long_help("Merge multiple articles into a single epub that will be given the name provided")
+                .takes_value(true),
         );
     let arg_matches = app.get_matches();
     let mut urls: Vec<String> = match arg_matches.value_of("file") {
@@ -57,12 +64,21 @@ It takes a url and downloads the article content from it and saves it to an epub
 
     let mut app_config = AppConfig::new();
     app_config.set_urls(urls);
+    if let Some(name) = arg_matches.value_of("output_name") {
+        let file_name = if name.ends_with(".epub") && name.len() > 5 {
+            name.to_owned()
+        } else {
+            name.to_owned() + ".epub"
+        };
+        app_config.set_merged(file_name);
+    }
     app_config
 }
 
 pub struct AppConfig {
     urls: Vec<String>,
     max_conn: usize,
+    merged: Option<String>,
 }
 
 impl AppConfig {
@@ -70,6 +86,7 @@ impl AppConfig {
         Self {
             urls: vec![],
             max_conn: 8,
+            merged: None,
         }
     }
 
@@ -77,10 +94,18 @@ impl AppConfig {
         self.urls.extend(urls);
     }
 
+    fn set_merged(&mut self, name: String) {
+        self.merged = Some(name);
+    }
+
     pub fn urls(&self) -> &Vec<String> {
         &self.urls
     }
     pub fn max_conn(&self) -> usize {
         self.max_conn
     }
+
+    pub fn merged(&self) -> Option<&String> {
+        self.merged.as_ref()
+    }
 }
src/epub.rs (65 changes)
@@ -4,10 +4,52 @@ use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
 
 use crate::extractor::{self, Extractor};
 
-pub fn generate_epub(extractor: Extractor) {
+pub fn generate_epubs(articles: Vec<Extractor>, merged: Option<&String>) {
+    match merged {
+        Some(name) => {
+            let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
+            epub.inline_toc();
+            epub = articles
+                .iter()
+                .enumerate()
+                .fold(epub, |mut epub, (idx, article)| {
+                    let mut html_buf = Vec::new();
+                    extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)
+                        .expect("Unable to serialize to xhtml");
+                    let html_str = std::str::from_utf8(&html_buf).unwrap();
+                    epub.metadata("title", replace_metadata_value(name))
+                        .unwrap();
+                    let section_name = article.metadata().title();
+                    epub.add_content(
+                        EpubContent::new(format!("article_{}.xhtml", idx), html_str.as_bytes())
+                            .title(replace_metadata_value(section_name)),
+                    )
+                    .unwrap();
+
+                    article.img_urls.iter().for_each(|img| {
+                        let mut file_path = std::env::temp_dir();
+                        file_path.push(&img.0);
+
+                        let img_buf = File::open(&file_path).expect("Can't read file");
+                        epub.add_resource(
+                            file_path.file_name().unwrap(),
+                            img_buf,
+                            img.1.as_ref().unwrap(),
+                        )
+                        .unwrap();
+                    });
+                    epub
+                });
+            let mut out_file = File::create(&name).unwrap();
+            epub.generate(&mut out_file).unwrap();
+            println!("Created {:?}", name);
+        }
+        None => {
+            for article in articles {
+                let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
                 let file_name = format!(
                     "{}.epub",
-        extractor
+                    article
                         .metadata()
                         .title()
                         .replace("/", " ")
@@ -15,22 +57,18 @@ pub fn generate_epub(extractor: Extractor) {
                 );
                 let mut out_file = File::create(&file_name).unwrap();
                 let mut html_buf = Vec::new();
-    extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf)
+                extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)
                     .expect("Unable to serialize to xhtml");
-    let html_buf = std::str::from_utf8(&html_buf).unwrap();
-    let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
-    if let Some(author) = extractor.metadata().byline() {
+                let html_str = std::str::from_utf8(&html_buf).unwrap();
+                if let Some(author) = article.metadata().byline() {
                     epub.metadata("author", replace_metadata_value(author))
                         .unwrap();
                 }
-    epub.metadata(
-        "title",
-        replace_metadata_value(extractor.metadata().title()),
-    )
+                epub.metadata("title", replace_metadata_value(article.metadata().title()))
                     .unwrap();
-    epub.add_content(EpubContent::new("index.xhtml", html_buf.as_bytes()))
+                epub.add_content(EpubContent::new("index.xhtml", html_str.as_bytes()))
                     .unwrap();
-    for img in extractor.img_urls {
+                for img in article.img_urls {
                     let mut file_path = std::env::temp_dir();
                     file_path.push(&img.0);
 
@@ -40,6 +78,9 @@ pub fn generate_epub(extractor: Extractor) {
                 }
                 epub.generate(&mut out_file).unwrap();
                 println!("Created {:?}", file_name);
+            }
+        }
+    }
 }
 
 /// Replaces characters that have to be escaped before adding to the epub's metadata
src/main.rs (11 changes)
@@ -15,7 +15,7 @@ mod http;
 mod moz_readability;
 
 use cli::AppConfig;
-use epub::generate_epub;
+use epub::generate_epubs;
 use extractor::Extractor;
 use http::{download_images, fetch_url};
 
@@ -28,9 +28,10 @@ fn main() {
 }
 
 fn download(app_config: AppConfig) {
-    task::block_on(async {
+    let articles = task::block_on(async {
         let urls_iter = app_config.urls().iter().map(|url| fetch_url(url));
         let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn());
+        let mut articles = Vec::new();
         while let Some(fetch_result) = responses.next().await {
             match fetch_result {
                 Ok((url, html)) => {
@@ -43,11 +44,13 @@ fn download(app_config: AppConfig) {
                         download_images(&mut extractor, &Url::parse(&url).unwrap())
                             .await
                             .expect("Unable to download images");
-                        generate_epub(extractor);
+                        articles.push(extractor);
                     }
                 }
                 Err(e) => println!("{}", e),
             }
         }
-    })
+        articles
+    });
+    generate_epubs(articles, app_config.merged());
 }