From 003953332f0387dfba7ff233279db280beb764b5 Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Sat, 6 Feb 2021 17:03:02 +0300 Subject: [PATCH] Refactor downloading of HTML pages This change allows HTML pages to be downloaded in parallel, up to a maximum number of concurrent HTTP requests. This is more efficient than before, when all HTTP requests were likely to begin at the same time. --- Cargo.lock | 102 ++++++++++++++++++++++++++++++---------------------- Cargo.toml | 1 + src/cli.rs | 9 ++++- src/http.rs | 1 - src/main.rs | 21 +++++------ 5 files changed, 79 insertions(+), 55 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3b7d384..4707b91 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -187,7 +187,7 @@ dependencies = [ "memchr", "num_cpus", "once_cell", - "pin-project-lite", + "pin-project-lite 0.1.11", "pin-utils", "slab", "wasm-bindgen-futures", @@ -684,25 +684,52 @@ dependencies = [ ] [[package]] -name = "futures-channel" -version = "0.3.8" +name = "futures" +version = "0.3.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b7109687aa4e177ef6fe84553af6280ef2778bdb7783ba44c9dc3399110fe64" +checksum = "da9052a1a50244d8d5aa9bf55cbc2fb6f357c86cc52e46c62ed390a7180cf150" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2d31b7ec7efab6eefc7c57233bb10b847986139d88cc2f5a02a1ae6871a1846" dependencies = [ "futures-core", + "futures-sink", ] [[package]] name = "futures-core" -version = "0.3.8" +version = "0.3.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "847ce131b72ffb13b6109a221da9ad97a64cbe48feb1028356b836b47b8f1748" +checksum = "79e5145dde8da7d1b3892dad07a9c98fc04bc39892b1ecc9692cf53e2b780a65" + +[[package]] +name = "futures-executor" +version = "0.3.12" +source
= "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9e59fdc009a4b3096bf94f740a0f2424c082521f20a9b08c5c07c48d90fd9b9" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] [[package]] name = "futures-io" -version = "0.3.8" +version = "0.3.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "611834ce18aaa1bd13c4b374f5d653e1027cf99b6b502584ff8c9a64413b30bb" +checksum = "28be053525281ad8259d47e4de5de657b25e7bac113458555bb4b70bc6870500" [[package]] name = "futures-lite" @@ -715,15 +742,15 @@ dependencies = [ "futures-io", "memchr", "parking", - "pin-project-lite", + "pin-project-lite 0.1.11", "waker-fn", ] [[package]] name = "futures-macro" -version = "0.3.8" +version = "0.3.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77408a692f1f97bcc61dc001d752e00643408fbc922e4d634c655df50d595556" +checksum = "c287d25add322d9f9abdcdc5927ca398917996600182178774032e9f8258fedd" dependencies = [ "proc-macro-hack", "proc-macro2", @@ -733,31 +760,33 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.8" +version = "0.3.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f878195a49cee50e006b02b93cf7e0a95a38ac7b776b4c4d9cc1207cd20fcb3d" +checksum = "caf5c69029bda2e743fddd0582d1083951d65cc9539aebf8812f36c3491342d6" [[package]] name = "futures-task" -version = "0.3.8" +version = "0.3.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c554eb5bf48b2426c4771ab68c6b14468b6e76cc90996f528c3338d761a4d0d" +checksum = "13de07eb8ea81ae445aca7b69f5f7bf15d7bf4912d8ca37d6645c77ae8a58d86" dependencies = [ "once_cell", ] [[package]] name = "futures-util" -version = "0.3.8" +version = "0.3.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d304cff4a7b99cfb7986f7d43fbe93d175e72e704a8860787cc95e9ffd85cbd2" +checksum = "632a8cd0f2a4b3fdea1657f08bde063848c3bd00f9bbf6e256b8be78802e624b" dependencies 
= [ + "futures-channel", "futures-core", "futures-io", "futures-macro", + "futures-sink", "futures-task", "memchr", - "pin-project 1.0.2", + "pin-project-lite 0.2.4", "pin-utils", "proc-macro-hack", "proc-macro-nested", @@ -911,7 +940,7 @@ dependencies = [ "cookie", "futures-lite", "infer", - "pin-project-lite", + "pin-project-lite 0.1.11", "rand 0.7.3", "serde", "serde_json", @@ -1247,6 +1276,7 @@ dependencies = [ "async-std", "clap", "epub-builder", + "futures", "html5ever", "kuchiki", "lazy_static", @@ -1328,16 +1358,7 @@ version = "0.4.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2ffbc8e94b38ea3d2d8ba92aea2983b503cd75d0888d75b86bb37970b5698e15" dependencies = [ - "pin-project-internal 0.4.27", -] - -[[package]] -name = "pin-project" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ccc2237c2c489783abd8c4c80e5450fc0e98644555b1364da68cc29aa151ca7" -dependencies = [ - "pin-project-internal 1.0.2", + "pin-project-internal", ] [[package]] @@ -1351,23 +1372,18 @@ dependencies = [ "syn", ] -[[package]] -name = "pin-project-internal" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8e8d2bf0b23038a4424865103a4df472855692821aab4e4f5c3312d461d9e5f" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "pin-project-lite" version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c917123afa01924fc84bb20c4c03f004d9c38e5127e3c039bbf7f4b9c76a2f6b" +[[package]] +name = "pin-project-lite" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439697af366c49a6d0a010c56a0d97685bc140ce0d377b13a2ea2aa42d64a827" + [[package]] name = "pin-utils" version = "0.1.0" @@ -1889,7 +1905,7 @@ dependencies = [ "log 0.4.11", "mime_guess", "once_cell", - "pin-project-lite", + "pin-project-lite 0.1.11", "serde", "serde_json", "web-sys", @@ -2043,7 +2059,7 @@ 
checksum = "b0987850db3733619253fe60e17cb59b82d37c7e6c0236bb81e4d6b87c879f27" dependencies = [ "cfg-if 0.1.10", "log 0.4.11", - "pin-project-lite", + "pin-project-lite 0.1.11", "tracing-attributes", "tracing-core", ] @@ -2074,7 +2090,7 @@ version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab7bb6f14721aa00656086e9335d363c5c8747bae02ebe32ea2c7dece5689b4c" dependencies = [ - "pin-project 0.4.27", + "pin-project", "tracing", ] diff --git a/Cargo.toml b/Cargo.toml index 01bbe6b..26f2e99 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,6 +15,7 @@ readme = "README.md" async-std = "1.7.0" clap = "2.33.3" epub-builder = "0.4.8" +futures = "0.3.12" html5ever = "0.25.1" kuchiki = "0.8.1" lazy_static = "1.4.0" diff --git a/src/cli.rs b/src/cli.rs index 1284a15..1971fac 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -62,11 +62,15 @@ It takes a url and downloads the article content from it and saves it to an epub pub struct AppConfig { urls: Vec, + max_conn: usize, } impl AppConfig { fn new() -> Self { - Self { urls: vec![] } + Self { + urls: vec![], + max_conn: 8, + } } fn set_urls(&mut self, urls: Vec) { @@ -76,4 +80,7 @@ impl AppConfig { pub fn urls(&self) -> &Vec { &self.urls } + pub fn max_conn(&self) -> usize { + self.max_conn + } } diff --git a/src/http.rs b/src/http.rs index b3d01f7..b5fe140 100644 --- a/src/http.rs +++ b/src/http.rs @@ -57,7 +57,6 @@ pub async fn download_images( extractor: &mut Extractor, article_origin: &Url, ) -> async_std::io::Result<()> { - extractor.extract_img_urls(); let mut async_download_tasks = Vec::with_capacity(extractor.img_urls.len()); if extractor.img_urls.len() > 0 { println!("Downloading images..."); diff --git a/src/main.rs b/src/main.rs index b74b217..ec983ab 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,7 +1,9 @@ #[macro_use] extern crate lazy_static; +use async_std::stream; use async_std::task; +use futures::stream::StreamExt; use url::Url; mod cli; @@ -12,33 +14,32 @@ mod extractor; mod 
http; mod moz_readability; +use cli::AppConfig; use epub::generate_epub; +use extractor::Extractor; use http::{download_images, fetch_url}; -use extractor::Extractor; fn main() { let app_config = cli::cli_init(); if !app_config.urls().is_empty() { - download(app_config.urls().clone()); + download(app_config); } } -fn download(urls: Vec) { - let mut async_url_tasks = Vec::with_capacity(urls.len()); - for url in urls { - async_url_tasks.push(task::spawn(async move { fetch_url(&url).await })); - } - +fn download(app_config: AppConfig) { task::block_on(async { - for url_task in async_url_tasks { - match url_task.await { + let urls_iter = app_config.urls().iter().map(|url| fetch_url(url)); + let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn()); + while let Some(fetch_result) = responses.next().await { + match fetch_result { Ok((url, html)) => { println!("Extracting"); let mut extractor = Extractor::from_html(&html); extractor.extract_content(&url); if extractor.article().is_some() { + extractor.extract_img_urls(); download_images(&mut extractor, &Url::parse(&url).unwrap()) .await .expect("Unable to download images");