paperoni/src/main.rs

#[macro_use]
extern crate lazy_static;

use std::fs::File;

use async_std::task;
use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
use url::Url;

mod cli;
mod extractor;
mod moz_readability;

use extractor::Extractor;
/// Program entry point.
///
/// Builds the command-line interface, reads every value supplied for the
/// `urls` argument and hands the collected list to [`download`]. When no
/// `urls` values are present the program exits without doing anything.
fn main() {
    let matches = cli::cli_init().get_matches();
    let urls: Vec<String> = match matches.values_of("urls") {
        Some(values) => values.map(String::from).collect(),
        None => return,
    };
    download(urls);
}

/// A fetched page as produced by `fetch_url`: `(url, html)`, where `url` is the
/// URL the HTML was finally served from (after redirects) and `html` is the
/// response body as a string.
type HTMLResource = (String, String);

/// Fetches `url` and returns the URL the page was finally served from together
/// with its HTML body.
///
/// Redirects are followed by hand, issuing at most `MAX_REDIRECTS` requests in
/// total. Each `Location` header is resolved against the URL of the response
/// that carried it, so both absolute and relative redirect targets work, even
/// after a hop to a different host.
///
/// # Errors
///
/// Returns an error when:
/// - `url` or a redirect target cannot be parsed as a URL,
/// - sending a request or reading the body fails,
/// - a redirect response carries no `Location` header,
/// - the final response is not successful, has no content type, or has a
///   content type other than `text/html`,
/// - the request limit is reached while still being redirected.
async fn fetch_url(url: &str) -> Result<HTMLResource, Box<dyn std::error::Error + Send + Sync>> {
    // Upper bound on the number of requests issued while following redirects.
    const MAX_REDIRECTS: u8 = 5;

    let client = surf::Client::new();
    println!("Fetching...");

    let mut url = Url::parse(url)?;
    for _ in 0..MAX_REDIRECTS {
        let req = surf::get(&url);
        let mut res = client.send(req).await?;
        let status = res.status();
        if status.is_redirection() {
            let location = match res.header(surf::http::headers::LOCATION) {
                Some(values) => values.last().as_str(),
                // Without a target there is nothing to follow; retrying the
                // same URL would only repeat the same response.
                None => {
                    return Err(format!(
                        "Request failed: HTTP {} without a Location header",
                        status
                    )
                    .into())
                }
            };
            // `join` accepts absolute targets as-is and resolves relative ones
            // against the URL that produced this redirect.
            url = url.join(location)?;
        } else if status.is_success() {
            return match res.content_type() {
                Some(mime) if mime.essence() == "text/html" => {
                    Ok((url.to_string(), res.body_string().await?))
                }
                Some(mime) => Err(format!(
                    "Invalid HTTP response. Received {} instead of text/html",
                    mime.essence()
                )
                .into()),
                None => Err("Unknown HTTP response".into()),
            };
        } else {
            return Err(format!("Request failed: HTTP {}", status).into());
        }
    }
    Err("Unable to fetch HTML".into())
}

/// Fetches every URL in `urls` and writes one EPUB per extracted article into
/// the current working directory.
///
/// All fetch tasks are spawned before any result is awaited; the results are
/// then processed one at a time in the order the URLs were given. A failure
/// while fetching or exporting one article is printed and does not prevent the
/// remaining articles from being processed.
fn download(urls: Vec<String>) {
    let async_url_tasks: Vec<_> = urls
        .into_iter()
        .map(|url| task::spawn(async move { fetch_url(&url).await }))
        .collect();
    task::block_on(async {
        for url_task in async_url_tasks {
            match url_task.await {
                Ok((url, html)) => match export_article(url, html).await {
                    Ok(Some(file_name)) => println!("Created {:?}", file_name),
                    // The extractor found no article; there is nothing to write.
                    Ok(None) => {}
                    Err(e) => println!("{}", e),
                },
                Err(e) => println!("{}", e),
            }
        }
    })
}

/// Extracts the article contained in `html` (fetched from `url`) and writes it,
/// together with its downloaded images, to `<title>.epub`.
///
/// Returns `Ok(Some(file_name))` when an EPUB was written, `Ok(None)` when the
/// extractor produced no article, and `Err(message)` naming the first step
/// that failed.
async fn export_article(url: String, html: String) -> Result<Option<String>, String> {
    println!("Extracting");
    let mut extractor = Extractor::from_html(&html);
    extractor.extract_content(&url);
    if extractor.article().is_none() {
        return Ok(None);
    }

    let article_url =
        Url::parse(&url).map_err(|e| format!("Invalid article URL {}: {}", url, e))?;
    extractor
        .download_images(&article_url)
        .await
        .map_err(|e| format!("Unable to download images: {:?}", e))?;

    // Path separators in the title would otherwise be treated as directories.
    let file_name = format!(
        "{}.epub",
        extractor
            .metadata()
            .title()
            .replace("/", " ")
            .replace("\\", " ")
    );

    let mut html_buf = Vec::new();
    let article = extractor
        .article()
        .ok_or_else(|| String::from("Article content is no longer available"))?;
    extractor::serialize_to_xhtml(article, &mut html_buf)
        .map_err(|e| format!("Unable to serialize to xhtml: {:?}", e))?;
    let xhtml = std::str::from_utf8(&html_buf)
        .map_err(|e| format!("Serialized article is not valid UTF-8: {}", e))?;

    let zip = ZipLibrary::new().map_err(|e| format!("Unable to set up zip writer: {:?}", e))?;
    let mut epub =
        EpubBuilder::new(zip).map_err(|e| format!("Unable to set up epub builder: {:?}", e))?;
    if let Some(author) = extractor.metadata().byline() {
        epub.metadata("author", author.replace("&", "&amp;"))
            .map_err(|e| format!("Unable to set epub author: {:?}", e))?;
    }
    epub.metadata("title", extractor.metadata().title().replace("&", "&amp;"))
        .map_err(|e| format!("Unable to set epub title: {:?}", e))?;
    epub.add_content(EpubContent::new("index.xhtml", xhtml.as_bytes()))
        .map_err(|e| format!("Unable to add article content: {:?}", e))?;

    for img in extractor.img_urls {
        let mut file_path = std::env::temp_dir();
        file_path.push(&img.0);

        let img_buf = File::open(&file_path)
            .map_err(|e| format!("Can't read file {}: {}", file_path.display(), e))?;
        let resource_name = match file_path.file_name() {
            Some(name) => name,
            None => return Err(format!("Image path {} has no file name", file_path.display())),
        };
        let mime_type = match img.1 {
            Some(mime_type) => mime_type,
            None => return Err(format!("Unknown MIME type for {}", file_path.display())),
        };
        epub.add_resource(resource_name, img_buf, mime_type)
            .map_err(|e| format!("Unable to add {}: {:?}", file_path.display(), e))?;
    }

    // The output file is created only once the book is fully assembled, so a
    // failure above does not leave an empty .epub behind.
    let mut out_file = File::create(&file_name)
        .map_err(|e| format!("Unable to create {}: {}", file_name, e))?;
    epub.generate(&mut out_file)
        .map_err(|e| format!("Unable to write {}: {:?}", file_name, e))?;
    Ok(Some(file_name))
}
Add regexes module in moz_readability that contains the regular expressions used. For optimal performance, the regular expressions are compiled to static values to prevent recompiling in loops 2020-10-12 19:33:01 +01:00			`#[macro_use]`
			`extern crate lazy_static;`

Add test for extract content 2020-05-01 18:42:41 +01:00			`use std::fs::File;`

Refactor to use temp directory and update surf Change from using res directory for image downloads to using temp directories. Update surf to v2 which required changing the way Content-Type headers are read from. 2020-11-23 06:39:56 +00:00			`use async_std::task;`
Add first attempt to save an epub file 2020-05-02 17:25:31 +01:00			`use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};`
Add image download functionality 2020-05-02 16:33:45 +01:00			`use url::Url;`
Initial extraction code to get meta information on a blog 2020-04-30 09:05:53 +01:00
Add simple CLI wrapper 2020-05-16 08:09:44 +01:00			`mod cli;`
Factor out text extraction into extractor module 2020-05-01 14:17:59 +01:00			`mod extractor;`
Add moz_readability initial code and accompanying unit tests This currently contains the preprocessing code of the Readability. It is a port of Readability.js by Mozilla. 2020-08-31 17:30:09 +01:00			`mod moz_readability;`
Factor out text extraction into extractor module 2020-05-01 14:17:59 +01:00
			`use extractor::Extractor;`
Initial extraction code to get meta information on a blog 2020-04-30 09:05:53 +01:00			`fn main() {`
Change from structopt to clap This allows printing the help message if no args are passed 2020-11-24 06:58:50 +00:00			`let app = cli::cli_init();`
			`let arg_matches = app.get_matches();`
			`if let Some(vals) = arg_matches.values_of("urls") {`
			`let urls = vals.map(|val| val.to_string()).collect::<Vec<_>>();`
			`download(urls);`
Add simple CLI wrapper 2020-05-16 08:09:44 +01:00			`}`
			`}`

Change CLI option to allow for multiple arguments Add basic looping in async runtime 2020-10-22 13:22:56 +01:00			`type HTMLResource = (String, String);`

Refactor fetch_url This adds: - More validation of responses to ensure the HTML response is valid. - Better handling of redirecting URLs which allows for fetching of links proxied to Medium. 2021-01-24 14:49:42 +00:00			`async fn fetch_url(url: &str) -> Result<HTMLResource, Box<dyn std::error::Error + Send + Sync>> {`
Add simple CLI wrapper 2020-05-16 08:09:44 +01:00			`let client = surf::Client::new();`
			`println!("Fetching...");`
Refactor fetch_url This adds: - More validation of responses to ensure the HTML response is valid. - Better handling of redirecting URLs which allows for fetching of links proxied to Medium. 2021-01-24 14:49:42 +00:00
			`let mut redirect_count: u8 = 0;`
			`let base_url = Url::parse(&url)?;`
			`let mut url = base_url.clone();`
			`while redirect_count < 5 {`
			`redirect_count += 1;`
			`let req = surf::get(&url);`
			`let mut res = client.send(req).await?;`
			`if res.status().is_redirection() {`
			`if let Some(location) = res.header(surf::http::headers::LOCATION) {`
			`match Url::parse(location.last().as_str()) {`
			`Ok(valid_url) => url = valid_url,`
			`Err(e) => match e {`
			`url::ParseError::RelativeUrlWithoutBase => {`
			`url = base_url.join(location.last().as_str())?`
			`}`
			`e => return Err(e.into()),`
			`},`
			`};`
			`}`
			`} else if res.status().is_success() {`
			`if let Some(mime) = res.content_type() {`
			`if mime.essence() == "text/html" {`
			`return Ok((url.to_string(), res.body_string().await?));`
			`} else {`
			`return Err(format!(`
			`"Invalid HTTP response. Received {} instead of text/html",`
			`mime.essence()`
			`)`
			`.into());`
			`}`
			`} else {`
			`return Err("Unknown HTTP response".into());`
			`}`
			`} else {`
			`return Err(format!("Request failed: HTTP {}", res.status()).into());`
			`}`
Add basic redirect provided by surf and early exit of the program if the response is not a 200 2020-11-24 14:44:31 +00:00			`}`
Refactor fetch_url This adds: - More validation of responses to ensure the HTML response is valid. - Better handling of redirecting URLs which allows for fetching of links proxied to Medium. 2021-01-24 14:49:42 +00:00			`Err("Unable to fetch HTML".into())`
Add simple CLI wrapper 2020-05-16 08:09:44 +01:00			`}`

Change CLI option to allow for multiple arguments Add basic looping in async runtime 2020-10-22 13:22:56 +01:00			`fn download(urls: Vec<String>) {`
			`let mut async_url_tasks = Vec::with_capacity(urls.len());`
			`for url in urls {`
Refactor fetch_url This adds: - More validation of responses to ensure the HTML response is valid. - Better handling of redirecting URLs which allows for fetching of links proxied to Medium. 2021-01-24 14:49:42 +00:00			`async_url_tasks.push(task::spawn(async move { fetch_url(&url).await }));`
Change CLI option to allow for multiple arguments Add basic looping in async runtime 2020-10-22 13:22:56 +01:00			`}`
Initial extraction code to get meta information on a blog 2020-04-30 09:05:53 +01:00			`task::block_on(async {`
Change CLI option to allow for multiple arguments Add basic looping in async runtime 2020-10-22 13:22:56 +01:00			`for url_task in async_url_tasks {`
Refactor fetch_url This adds: - More validation of responses to ensure the HTML response is valid. - Better handling of redirecting URLs which allows for fetching of links proxied to Medium. 2021-01-24 14:49:42 +00:00			`match url_task.await {`
			`Ok((url, html)) => {`
			`println!("Extracting");`
			`let mut extractor = Extractor::from_html(&html);`
			`extractor.extract_content(&url);`
			`if extractor.article().is_some() {`
			`extractor`
			`.download_images(&Url::parse(&url).unwrap())`
			`.await`
			`.expect("Unable to download images");`
			`let file_name = format!(`
			`"{}.epub",`
			`extractor`
			`.metadata()`
			`.title()`
			`.replace("/", " ")`
			`.replace("\\", " ")`
			`);`
			`let mut out_file = File::create(&file_name).unwrap();`
			`let mut html_buf = Vec::new();`
			`extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf)`
			`.expect("Unable to serialize to xhtml");`
			`let html_buf = std::str::from_utf8(&html_buf).unwrap();`
			`let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();`
			`if let Some(author) = extractor.metadata().byline() {`
			`epub.metadata("author", author.replace("&", "&amp;"))`
			`.unwrap();`
			`}`
			`epub.metadata("title", extractor.metadata().title().replace("&", "&amp;"))`
			`.unwrap();`
			`epub.add_content(EpubContent::new("index.xhtml", html_buf.as_bytes()))`
			`.unwrap();`
			`for img in extractor.img_urls {`
			`let mut file_path = std::env::temp_dir();`
			`file_path.push(&img.0);`
Change download code to save images to a folder Add downloaded images to the output epub file 2020-05-05 10:24:11 +01:00
Refactor fetch_url This adds: - More validation of responses to ensure the HTML response is valid. - Better handling of redirecting URLs which allows for fetching of links proxied to Medium. 2021-01-24 14:49:42 +00:00			`let img_buf = File::open(&file_path).expect("Can't read file");`
			`epub.add_resource(`
			`file_path.file_name().unwrap(),`
			`img_buf,`
			`img.1.unwrap(),`
			`)`
			`.unwrap();`
			`}`
			`epub.generate(&mut out_file).unwrap();`
			`println!("Created {:?}", file_name);`
			`}`
Change CLI option to allow for multiple arguments Add basic looping in async runtime 2020-10-22 13:22:56 +01:00			`}`
Refactor fetch_url This adds: - More validation of responses to ensure the HTML response is valid. - Better handling of redirecting URLs which allows for fetching of links proxied to Medium. 2021-01-24 14:49:42 +00:00			`Err(e) => println!("{}", e),`
Merge the readability module with the rest of the extractor 2020-10-22 10:12:30 +01:00			`}`
Change download code to save images to a folder Add downloaded images to the output epub file 2020-05-05 10:24:11 +01:00			`}`
Add first attempt to save an epub file 2020-05-02 17:25:31 +01:00			`})`
Initial extraction code to get meta information on a blog 2020-04-30 09:05:53 +01:00			`}`