paperoni/src/main.rs

68 lines
2 KiB
Rust
Raw Normal View History

2020-05-01 18:42:41 +01:00
use std::fs::File;
use async_std::{fs::create_dir, fs::remove_dir_all, task};
2020-05-02 17:25:31 +01:00
use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
2020-05-16 08:09:44 +01:00
use structopt::StructOpt;
2020-05-02 16:33:45 +01:00
use url::Url;
2020-05-16 08:09:44 +01:00
mod cli;
mod extractor;
mod moz_readability;
use extractor::Extractor;
fn main() {
2020-05-16 08:09:44 +01:00
let opt = cli::Opts::from_args();
if let Some(url) = opt.url {
println!("Downloading single article");
download(url)
}
}
/// Issues a GET request for `url` and returns the response body as a `String`.
///
/// # Panics
///
/// Panics with "Unable to fetch URL" when the request or reading the body
/// as a string fails.
async fn fetch_url(url: &str) -> String {
    let client = surf::Client::new();
    println!("Fetching...");
    // TODO: Add middleware for following redirects
    let body = client.get(url).recv_string().await;
    body.expect("Unable to fetch URL")
}
fn download(url: String) {
task::block_on(async {
2020-05-16 08:09:44 +01:00
let html = fetch_url(&url).await;
let mut extractor = Extractor::from_html(&html);
2020-05-01 18:42:41 +01:00
println!("Extracting");
extractor.extract_content();
create_dir("res/")
.await
.expect("Unable to create res/ output folder");
2020-05-02 16:33:45 +01:00
extractor
2020-05-16 08:09:44 +01:00
.download_images(&Url::parse(&url).unwrap())
2020-05-02 16:33:45 +01:00
.await
.expect("Unable to download images");
2020-05-02 17:25:31 +01:00
let mut out_file = File::create("out.epub").unwrap();
let mut html_buf = Vec::new();
extractor
.content
.unwrap()
.as_node()
2020-05-02 17:25:31 +01:00
.serialize(&mut html_buf)
.expect("Unable to serialize");
2020-05-02 17:25:31 +01:00
let html_buf = std::str::from_utf8(&html_buf).unwrap();
let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
epub.add_content(EpubContent::new("code.xhtml", html_buf.as_bytes()))
2020-05-02 17:25:31 +01:00
.unwrap();
for img in extractor.img_urls {
let file_path = format!("{}", &img.0);
let img_buf = File::open(file_path).expect("Can't read file");
epub.add_resource(img.0, img_buf, img.1.unwrap()).unwrap();
}
epub.generate(&mut out_file).unwrap();
println!("Cleaning up");
remove_dir_all("res/").await.unwrap();
2020-05-02 17:25:31 +01:00
})
}