2020-10-12 19:33:01 +01:00
|
|
|
#[macro_use]
|
|
|
|
extern crate lazy_static;
|
|
|
|
|
2020-05-01 18:42:41 +01:00
|
|
|
use std::fs::File;
|
2020-11-23 06:01:05 +00:00
|
|
|
use std::path::Path;
|
2020-05-01 18:42:41 +01:00
|
|
|
|
2020-05-05 10:24:11 +01:00
|
|
|
use async_std::{fs::create_dir, fs::remove_dir_all, task};
|
2020-05-02 17:25:31 +01:00
|
|
|
use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
|
2020-05-16 08:09:44 +01:00
|
|
|
use structopt::StructOpt;
|
2020-05-02 16:33:45 +01:00
|
|
|
use url::Url;
|
2020-04-30 09:05:53 +01:00
|
|
|
|
2020-05-16 08:09:44 +01:00
|
|
|
mod cli;
|
2020-05-01 14:17:59 +01:00
|
|
|
mod extractor;
|
2020-08-31 17:30:09 +01:00
|
|
|
mod moz_readability;
|
2020-05-01 14:17:59 +01:00
|
|
|
|
|
|
|
use extractor::Extractor;
|
2020-04-30 09:05:53 +01:00
|
|
|
fn main() {
|
2020-05-16 08:09:44 +01:00
|
|
|
let opt = cli::Opts::from_args();
|
2020-10-22 13:22:56 +01:00
|
|
|
if !opt.urls.is_empty() {
|
2020-05-16 08:09:44 +01:00
|
|
|
println!("Downloading single article");
|
2020-10-22 13:22:56 +01:00
|
|
|
download(opt.urls);
|
2020-05-16 08:09:44 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-10-22 13:22:56 +01:00
|
|
|
/// A fetched page as produced by `fetch_url`: the first element is the URL
/// that was requested, the second is the response body received for it.
type HTMLResource = (String, String);
|
|
|
|
|
|
|
|
async fn fetch_url(url: &str) -> HTMLResource {
|
2020-05-16 08:09:44 +01:00
|
|
|
let client = surf::Client::new();
|
|
|
|
println!("Fetching...");
|
|
|
|
// TODO: Add middleware for following redirects
|
2020-10-22 13:22:56 +01:00
|
|
|
(
|
|
|
|
url.to_string(),
|
|
|
|
client
|
|
|
|
.get(url)
|
|
|
|
.recv_string()
|
|
|
|
.await
|
|
|
|
.expect("Unable to fetch URL"),
|
|
|
|
)
|
2020-05-16 08:09:44 +01:00
|
|
|
}
|
|
|
|
|
2020-10-22 13:22:56 +01:00
|
|
|
fn download(urls: Vec<String>) {
|
|
|
|
let mut async_url_tasks = Vec::with_capacity(urls.len());
|
|
|
|
for url in urls {
|
|
|
|
async_url_tasks.push(task::spawn(async move { fetch_url(&url).await }));
|
|
|
|
}
|
2020-04-30 09:05:53 +01:00
|
|
|
task::block_on(async {
|
2020-10-22 13:22:56 +01:00
|
|
|
for url_task in async_url_tasks {
|
|
|
|
let (url, html) = url_task.await;
|
|
|
|
println!("Extracting");
|
|
|
|
let mut extractor = Extractor::from_html(&html);
|
|
|
|
extractor.extract_content(&url);
|
|
|
|
if extractor.article().is_some() {
|
2020-11-23 06:01:05 +00:00
|
|
|
if !Path::new("res/").exists() {
|
|
|
|
create_dir("res/")
|
|
|
|
.await
|
|
|
|
.expect("Unable to create res/ output folder");
|
|
|
|
}
|
2020-10-22 13:22:56 +01:00
|
|
|
extractor
|
|
|
|
.download_images(&Url::parse(&url).unwrap())
|
|
|
|
.await
|
|
|
|
.expect("Unable to download images");
|
2020-11-23 06:01:05 +00:00
|
|
|
let file_name = format!("{}.epub", extractor.metadata().title());
|
|
|
|
let mut out_file = File::create(&file_name).unwrap();
|
2020-10-22 13:22:56 +01:00
|
|
|
let mut html_buf = Vec::new();
|
|
|
|
extractor
|
|
|
|
.article()
|
|
|
|
.unwrap()
|
|
|
|
.serialize(&mut html_buf)
|
|
|
|
.expect("Unable to serialize");
|
|
|
|
let html_buf = std::str::from_utf8(&html_buf).unwrap();
|
2020-10-22 17:10:11 +01:00
|
|
|
let html_buf = moz_readability::regexes::REPLACE_SELF_CLOSING_REGEX
|
|
|
|
.replace_all(html_buf, "$tag/>");
|
2020-10-22 13:22:56 +01:00
|
|
|
let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
|
|
|
|
if let Some(author) = extractor.metadata().byline() {
|
|
|
|
epub.metadata("author", author).unwrap();
|
|
|
|
}
|
|
|
|
epub.metadata("title", extractor.metadata().title())
|
|
|
|
.unwrap();
|
|
|
|
epub.add_content(EpubContent::new("code.xhtml", html_buf.as_bytes()))
|
|
|
|
.unwrap();
|
|
|
|
for img in extractor.img_urls {
|
|
|
|
let file_path = format!("{}", &img.0);
|
2020-05-05 10:24:11 +01:00
|
|
|
|
2020-10-22 13:22:56 +01:00
|
|
|
let img_buf = File::open(file_path).expect("Can't read file");
|
|
|
|
epub.add_resource(img.0, img_buf, img.1.unwrap()).unwrap();
|
|
|
|
}
|
|
|
|
epub.generate(&mut out_file).unwrap();
|
|
|
|
println!("Cleaning up");
|
|
|
|
remove_dir_all("res/").await.unwrap();
|
2020-11-23 06:01:05 +00:00
|
|
|
println!("Created {:?}", file_name);
|
2020-10-22 10:12:30 +01:00
|
|
|
}
|
2020-05-05 10:24:11 +01:00
|
|
|
}
|
2020-05-02 17:25:31 +01:00
|
|
|
})
|
2020-04-30 09:05:53 +01:00
|
|
|
}
|