paperoni/src/main.rs

#[macro_use]
extern crate lazy_static;

use std::fs::File;
use std::path::Path;

use async_std::{fs::create_dir, fs::remove_dir_all, task};
use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
use structopt::StructOpt;
use url::Url;

mod cli;
mod extractor;
mod moz_readability;

use extractor::Extractor;
/// Program entry point.
///
/// Parses the command-line options and, when at least one URL was supplied, passes the
/// URLs to `download`. Does nothing when no URL was given.
fn main() {
    let opt = cli::Opts::from_args();
    if opt.urls.is_empty() {
        return;
    }
    // Several URLs may be passed at once, so only claim a "single article" when that is true.
    if opt.urls.len() == 1 {
        println!("Downloading single article");
    } else {
        println!("Downloading {} articles", opt.urls.len());
    }
    download(opt.urls);
}

/// A fetched page as a `(url, body)` pair: the URL that was requested and the response
/// body received for it, as a string.
type HTMLResource = (String, String);

/// Requests `url` and returns it paired with the response body read as a string.
///
/// # Panics
///
/// Panics with "Unable to fetch URL" when the request or reading the body fails.
async fn fetch_url(url: &str) -> HTMLResource {
    let http = surf::Client::new();
    println!("Fetching...");
    // TODO: Add middleware for following redirects
    let requested = url.to_string();
    let body = http
        .get(url)
        .recv_string()
        .await
        .expect("Unable to fetch URL");
    (requested, body)
}

/// Fetches every URL in `urls` and writes one EPUB file per extracted article.
///
/// All fetches are spawned as tasks first; their results are then awaited and processed one
/// at a time, in the order the URLs were given. Pages for which the extractor finds no
/// article are skipped. For every other page the images are downloaded, the article is
/// serialized and packed together with its images into `<title>.epub` in the current
/// directory, and the `res/` folder is removed again.
///
/// Path separators in the title are replaced by spaces so the title always names a single
/// file in the current directory.
///
/// # Panics
///
/// Panics when a fetch, the image download, any file-system operation, the serialization or
/// the EPUB generation fails, or when an image has no MIME type recorded.
fn download(urls: Vec<String>) {
    // Spawn every fetch before awaiting any of them.
    let async_url_tasks: Vec<_> = urls
        .into_iter()
        .map(|url| task::spawn(async move { fetch_url(&url).await }))
        .collect();

    task::block_on(async {
        for url_task in async_url_tasks {
            let (url, html) = url_task.await;
            println!("Extracting");
            let mut extractor = Extractor::from_html(&html);
            extractor.extract_content(&url);
            if extractor.article().is_none() {
                // Nothing could be extracted from this page; move on to the next one.
                continue;
            }

            // NOTE(review): `res/` is presumably where `download_images` stores the image
            // files — confirm against the extractor module.
            if !Path::new("res/").exists() {
                create_dir("res/")
                    .await
                    .expect("Unable to create res/ output folder");
            }
            extractor
                .download_images(&Url::parse(&url).unwrap())
                .await
                .expect("Unable to download images");

            // A title such as "A/B testing" would otherwise be interpreted as a path into a
            // (usually missing) directory, so separators are replaced before building the name.
            let file_name = format!(
                "{}.epub",
                extractor
                    .metadata()
                    .title()
                    .replace(std::path::is_separator, " ")
            );

            let mut html_buf = Vec::new();
            extractor
                .article()
                .expect("article presence was checked above")
                .serialize(&mut html_buf)
                .expect("Unable to serialize");
            let html_buf = std::str::from_utf8(&html_buf).unwrap();
            // Rewrite the matched tags into self-closing form for the XHTML content.
            let html_buf = moz_readability::regexes::REPLACE_SELF_CLOSING_REGEX
                .replace_all(html_buf, "$tag/>");

            let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
            if let Some(author) = extractor.metadata().byline() {
                epub.metadata("author", author).unwrap();
            }
            epub.metadata("title", extractor.metadata().title())
                .unwrap();
            epub.add_content(EpubContent::new("code.xhtml", html_buf.as_bytes()))
                .unwrap();
            for img in extractor.img_urls {
                let img_buf = File::open(&img.0).expect("Can't read file");
                // NOTE(review): panics when no MIME type was recorded for the image — confirm
                // whether `download_images` always fills it in.
                epub.add_resource(img.0, img_buf, img.1.unwrap()).unwrap();
            }

            // Create the output file only once the book is fully assembled, so that a failure
            // in any earlier step does not leave an empty .epub file behind.
            let mut out_file = File::create(&file_name).unwrap();
            epub.generate(&mut out_file).unwrap();
            println!("Cleaning up");
            remove_dir_all("res/").await.unwrap();
            println!("Created {:?}", file_name);
        }
    })
}
Add regexes module in moz_readability that contains the regular expressions used. For optimal performance, the regular expressions are compiled to static values to prevent recompiling in loops 2020-10-12 19:33:01 +01:00			`#[macro_use]`
			`extern crate lazy_static;`

Add test for extract content 2020-05-01 18:42:41 +01:00			`use std::fs::File;`
Bug fix and add printing of the name of the extracted EPUB The fix prevents creating the res directory if it already exists 2020-11-23 06:01:05 +00:00			`use std::path::Path;`
Add test for extract content 2020-05-01 18:42:41 +01:00
Change download code to save images to a folder Add downloaded images to the output epub file 2020-05-05 10:24:11 +01:00			`use async_std::{fs::create_dir, fs::remove_dir_all, task};`
Add first attempt to save an epub file 2020-05-02 17:25:31 +01:00			`use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};`
Add simple CLI wrapper 2020-05-16 08:09:44 +01:00			`use structopt::StructOpt;`
Add image download functionality 2020-05-02 16:33:45 +01:00			`use url::Url;`
Initial extraction code to get meta information on a blog 2020-04-30 09:05:53 +01:00
Add simple CLI wrapper 2020-05-16 08:09:44 +01:00			`mod cli;`
Factor out text extraction into extractor module 2020-05-01 14:17:59 +01:00			`mod extractor;`
Add moz_readability initial code and accompanying unit tests This currently contains the preprocessing code of the Readability. It is a port of Readability.js by Mozilla. 2020-08-31 17:30:09 +01:00			`mod moz_readability;`
Factor out text extraction into extractor module 2020-05-01 14:17:59 +01:00
			`use extractor::Extractor;`
Initial extraction code to get meta information on a blog 2020-04-30 09:05:53 +01:00			`fn main() {`
Add simple CLI wrapper 2020-05-16 08:09:44 +01:00			`let opt = cli::Opts::from_args();`
Change CLI option to allow for multiple arguments Add basic looping in async runtime 2020-10-22 13:22:56 +01:00			`if !opt.urls.is_empty() {`
Add simple CLI wrapper 2020-05-16 08:09:44 +01:00			`println!("Downloading single article");`
Change CLI option to allow for multiple arguments Add basic looping in async runtime 2020-10-22 13:22:56 +01:00			`download(opt.urls);`
Add simple CLI wrapper 2020-05-16 08:09:44 +01:00			`}`
			`}`

Change CLI option to allow for multiple arguments Add basic looping in async runtime 2020-10-22 13:22:56 +01:00			`type HTMLResource = (String, String);`

			`async fn fetch_url(url: &str) -> HTMLResource {`
Add simple CLI wrapper 2020-05-16 08:09:44 +01:00			`let client = surf::Client::new();`
			`println!("Fetching...");`
			`// TODO: Add middleware for following redirects`
Change CLI option to allow for multiple arguments Add basic looping in async runtime 2020-10-22 13:22:56 +01:00			`(`
			`url.to_string(),`
			`client`
			`.get(url)`
			`.recv_string()`
			`.await`
			`.expect("Unable to fetch URL"),`
			`)`
Add simple CLI wrapper 2020-05-16 08:09:44 +01:00			`}`

Change CLI option to allow for multiple arguments Add basic looping in async runtime 2020-10-22 13:22:56 +01:00			`fn download(urls: Vec<String>) {`
			`let mut async_url_tasks = Vec::with_capacity(urls.len());`
			`for url in urls {`
			`async_url_tasks.push(task::spawn(async move { fetch_url(&url).await }));`
			`}`
Initial extraction code to get meta information on a blog 2020-04-30 09:05:53 +01:00			`task::block_on(async {`
Change CLI option to allow for multiple arguments Add basic looping in async runtime 2020-10-22 13:22:56 +01:00			`for url_task in async_url_tasks {`
			`let (url, html) = url_task.await;`
			`println!("Extracting");`
			`let mut extractor = Extractor::from_html(&html);`
			`extractor.extract_content(&url);`
			`if extractor.article().is_some() {`
Bug fix and add printing of the name of the extracted EPUB The fix prevents creating the res directory if it already exists 2020-11-23 06:01:05 +00:00			`if !Path::new("res/").exists() {`
			`create_dir("res/")`
			`.await`
			`.expect("Unable to create res/ output folder");`
			`}`
Change CLI option to allow for multiple arguments Add basic looping in async runtime 2020-10-22 13:22:56 +01:00			`extractor`
			`.download_images(&Url::parse(&url).unwrap())`
			`.await`
			`.expect("Unable to download images");`
Bug fix and add printing of the name of the extracted EPUB The fix prevents creating the res directory if it already exists 2020-11-23 06:01:05 +00:00			`let file_name = format!("{}.epub", extractor.metadata().title());`
			`let mut out_file = File::create(&file_name).unwrap();`
Change CLI option to allow for multiple arguments Add basic looping in async runtime 2020-10-22 13:22:56 +01:00			`let mut html_buf = Vec::new();`
			`extractor`
			`.article()`
			`.unwrap()`
			`.serialize(&mut html_buf)`
			`.expect("Unable to serialize");`
			`let html_buf = std::str::from_utf8(&html_buf).unwrap();`
Fix alignment in README Update manifest file Add fix in serialized file to have self closing tags which is invalid xhtml 2020-10-22 17:10:11 +01:00			`let html_buf = moz_readability::regexes::REPLACE_SELF_CLOSING_REGEX`
			`.replace_all(html_buf, "$tag/>");`
Change CLI option to allow for multiple arguments Add basic looping in async runtime 2020-10-22 13:22:56 +01:00			`let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();`
			`if let Some(author) = extractor.metadata().byline() {`
			`epub.metadata("author", author).unwrap();`
			`}`
			`epub.metadata("title", extractor.metadata().title())`
			`.unwrap();`
			`epub.add_content(EpubContent::new("code.xhtml", html_buf.as_bytes()))`
			`.unwrap();`
			`for img in extractor.img_urls {`
			`let file_path = format!("{}", &img.0);`
Change download code to save images to a folder Add downloaded images to the output epub file 2020-05-05 10:24:11 +01:00
Change CLI option to allow for multiple arguments Add basic looping in async runtime 2020-10-22 13:22:56 +01:00			`let img_buf = File::open(file_path).expect("Can't read file");`
			`epub.add_resource(img.0, img_buf, img.1.unwrap()).unwrap();`
			`}`
			`epub.generate(&mut out_file).unwrap();`
			`println!("Cleaning up");`
			`remove_dir_all("res/").await.unwrap();`
Bug fix and add printing of the name of the extracted EPUB The fix prevents creating the res directory if it already exists 2020-11-23 06:01:05 +00:00			`println!("Created {:?}", file_name);`
Merge the readability module with the rest of the extractor 2020-10-22 10:12:30 +01:00			`}`
Change download code to save images to a folder Add downloaded images to the output epub file 2020-05-05 10:24:11 +01:00			`}`
Add first attempt to save an epub file 2020-05-02 17:25:31 +01:00			`})`
Initial extraction code to get meta information on a blog 2020-04-30 09:05:53 +01:00			`}`