paperoni/src/main.rs

#[macro_use]
extern crate lazy_static;

use async_std::task;
use url::Url;

mod cli;
mod epub;
mod extractor;
/// This module is responsible for async HTTP calls for downloading
/// the HTML content and images
mod http;
mod moz_readability;

use epub::generate_epub;
use http::{download_images, fetch_url};

use extractor::Extractor;
/// Program entry point.
///
/// Builds the application configuration via `cli::cli_init` and hands the
/// configured URLs to `download`. Does nothing when the URL list is empty.
fn main() {
    let app_config = cli::cli_init();

    // Guard clause: without URLs there is no work to schedule.
    if app_config.urls().is_empty() {
        return;
    }

    // `download` takes ownership of the list, so pass it a copy.
    download(app_config.urls().clone());
}

/// Fetches every URL in `urls`, extracts the article from each page and
/// generates an EPUB for every page that yields an article.
///
/// One task per URL is spawned up front, before any of them is awaited; the
/// results are then awaited and processed one at a time in input order.
///
/// A failure affecting a single URL (fetch error, a URL that cannot be
/// parsed, or an image download error) is printed and that URL is skipped,
/// so the remaining URLs are still processed instead of the whole run
/// aborting with a panic.
fn download(urls: Vec<String>) {
    // Spawn all fetches eagerly (`collect` drives the iterator to completion)
    // before blocking on any individual result.
    let async_url_tasks: Vec<_> = urls
        .into_iter()
        .map(|url| task::spawn(async move { fetch_url(&url).await }))
        .collect();

    task::block_on(async {
        for url_task in async_url_tasks {
            let (url, html) = match url_task.await {
                Ok(fetched) => fetched,
                Err(e) => {
                    println!("{}", e);
                    continue;
                }
            };

            println!("Extracting");
            let mut extractor = Extractor::from_html(&html);
            extractor.extract_content(&url);

            // Pages without an extractable article produce no output.
            if extractor.article().is_none() {
                continue;
            }

            // The parsed URL is handed to `download_images` alongside the
            // extractor; a URL that does not parse only invalidates this
            // one article.
            let article_url = match Url::parse(&url) {
                Ok(parsed) => parsed,
                Err(e) => {
                    println!("Skipping {}: invalid URL: {}", url, e);
                    continue;
                }
            };

            // Report image download failures and move on rather than
            // panicking and losing every article still waiting in the queue.
            if let Err(e) = download_images(&mut extractor, &article_url).await {
                println!("Skipping {}: unable to download images: {:?}", url, e);
                continue;
            }

            generate_epub(extractor);
        }
    })
}
Add regexes module in moz_readability that contains the regular expressions used. For optimal performance, the regular expressions are compiled to static values to prevent recompiling in loops 2020-10-12 19:33:01 +01:00			`#[macro_use]`
			`extern crate lazy_static;`

Refactor to use temp directory and update surf Change from using res directory for image downloads to using temp directories. Update surf to v2 which required changing the way Content-Type headers are read from. 2020-11-23 06:39:56 +00:00			`use async_std::task;`
Add image download functionality 2020-05-02 16:33:45 +01:00			`use url::Url;`
Initial extraction code to get meta information on a blog 2020-04-30 09:05:53 +01:00
Add simple CLI wrapper 2020-05-16 08:09:44 +01:00			`mod cli;`
Add http and epub modules 2021-02-06 09:59:03 +00:00			`mod epub;`
Factor out text extraction into extractor module 2020-05-01 14:17:59 +01:00			`mod extractor;`
Add http and epub modules 2021-02-06 09:59:03 +00:00			`/// This module is responsible for async HTTP calls for downloading`
			`/// the HTML content and images`
			`mod http;`
Add moz_readability initial code and accompanying unit tests This currently contains the preprocessing code of the Readability. It is a port of Readability.js by Mozilla. 2020-08-31 17:30:09 +01:00			`mod moz_readability;`
Factor out text extraction into extractor module 2020-05-01 14:17:59 +01:00
Add http and epub modules 2021-02-06 09:59:03 +00:00			`use epub::generate_epub;`
			`use http::{download_images, fetch_url};`

Factor out text extraction into extractor module 2020-05-01 14:17:59 +01:00			`use extractor::Extractor;`
Initial extraction code to get meta information on a blog 2020-04-30 09:05:53 +01:00			`fn main() {`
Add http and epub modules 2021-02-06 09:59:03 +00:00			`let app_config = cli::cli_init();`
Add -f flag for adding links from a file instead of needing to use cat 2021-02-01 08:28:07 +00:00
Add http and epub modules 2021-02-06 09:59:03 +00:00			`if !app_config.urls().is_empty() {`
			`download(app_config.urls().clone());`
Add simple CLI wrapper 2020-05-16 08:09:44 +01:00			`}`
			`}`

Change CLI option to allow for multiple arguments Add basic looping in async runtime 2020-10-22 13:22:56 +01:00			`fn download(urls: Vec<String>) {`
			`let mut async_url_tasks = Vec::with_capacity(urls.len());`
			`for url in urls {`
Refactor fetch_url This adds: - More validation of responses to ensure the HTML response is valid. - Better handling of redirecting URLs which allows for fetching of links proxied to Medium. 2021-01-24 14:49:42 +00:00			`async_url_tasks.push(task::spawn(async move { fetch_url(&url).await }));`
Change CLI option to allow for multiple arguments Add basic looping in async runtime 2020-10-22 13:22:56 +01:00			`}`
Add http and epub modules 2021-02-06 09:59:03 +00:00
Initial extraction code to get meta information on a blog 2020-04-30 09:05:53 +01:00			`task::block_on(async {`
Change CLI option to allow for multiple arguments Add basic looping in async runtime 2020-10-22 13:22:56 +01:00			`for url_task in async_url_tasks {`
Refactor fetch_url This adds: - More validation of responses to ensure the HTML response is valid. - Better handling of redirecting URLs which allows for fetching of links proxied to Medium. 2021-01-24 14:49:42 +00:00			`match url_task.await {`
			`Ok((url, html)) => {`
			`println!("Extracting");`
			`let mut extractor = Extractor::from_html(&html);`
			`extractor.extract_content(&url);`
Add http and epub modules 2021-02-06 09:59:03 +00:00
Refactor fetch_url This adds: - More validation of responses to ensure the HTML response is valid. - Better handling of redirecting URLs which allows for fetching of links proxied to Medium. 2021-01-24 14:49:42 +00:00			`if extractor.article().is_some() {`
Add http and epub modules 2021-02-06 09:59:03 +00:00			`download_images(&mut extractor, &Url::parse(&url).unwrap())`
Refactor fetch_url This adds: - More validation of responses to ensure the HTML response is valid. - Better handling of redirecting URLs which allows for fetching of links proxied to Medium. 2021-01-24 14:49:42 +00:00			`.await`
			`.expect("Unable to download images");`
Add http and epub modules 2021-02-06 09:59:03 +00:00			`generate_epub(extractor);`
Refactor fetch_url This adds: - More validation of responses to ensure the HTML response is valid. - Better handling of redirecting URLs which allows for fetching of links proxied to Medium. 2021-01-24 14:49:42 +00:00			`}`
Change CLI option to allow for multiple arguments Add basic looping in async runtime 2020-10-22 13:22:56 +01:00			`}`
Refactor fetch_url This adds: - More validation of responses to ensure the HTML response is valid. - Better handling of redirecting URLs which allows for fetching of links proxied to Medium. 2021-01-24 14:49:42 +00:00			`Err(e) => println!("{}", e),`
Merge the readability module with the rest of the extractor 2020-10-22 10:12:30 +01:00			`}`
Change download code to save images to a folder Add downloaded images to the output epub file 2020-05-05 10:24:11 +01:00			`}`
Add first attempt to save an epub file 2020-05-02 17:25:31 +01:00			`})`
Initial extraction code to get meta information on a blog 2020-04-30 09:05:53 +01:00			`}`