Refactor downloading of HTML pages
This change allows for parallel downloads of HTML pages up to a maximum number of concurrent HTTP requests, which is more efficient than before, when all HTTP requests were likely to begin at the same time.
This commit is contained in:
parent
6b62051942
commit
003953332f
5 changed files with 79 additions and 55 deletions
102
Cargo.lock
generated
102
Cargo.lock
generated
|
@ -187,7 +187,7 @@ dependencies = [
|
|||
"memchr",
|
||||
"num_cpus",
|
||||
"once_cell",
|
||||
"pin-project-lite",
|
||||
"pin-project-lite 0.1.11",
|
||||
"pin-utils",
|
||||
"slab",
|
||||
"wasm-bindgen-futures",
|
||||
|
@ -684,25 +684,52 @@ dependencies = [
|
|||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures-channel"
|
||||
version = "0.3.8"
|
||||
name = "futures"
|
||||
version = "0.3.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4b7109687aa4e177ef6fe84553af6280ef2778bdb7783ba44c9dc3399110fe64"
|
||||
checksum = "da9052a1a50244d8d5aa9bf55cbc2fb6f357c86cc52e46c62ed390a7180cf150"
|
||||
dependencies = [
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
"futures-executor",
|
||||
"futures-io",
|
||||
"futures-sink",
|
||||
"futures-task",
|
||||
"futures-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures-channel"
|
||||
version = "0.3.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f2d31b7ec7efab6eefc7c57233bb10b847986139d88cc2f5a02a1ae6871a1846"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-sink",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures-core"
|
||||
version = "0.3.8"
|
||||
version = "0.3.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "847ce131b72ffb13b6109a221da9ad97a64cbe48feb1028356b836b47b8f1748"
|
||||
checksum = "79e5145dde8da7d1b3892dad07a9c98fc04bc39892b1ecc9692cf53e2b780a65"
|
||||
|
||||
[[package]]
|
||||
name = "futures-executor"
|
||||
version = "0.3.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e9e59fdc009a4b3096bf94f740a0f2424c082521f20a9b08c5c07c48d90fd9b9"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-task",
|
||||
"futures-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures-io"
|
||||
version = "0.3.8"
|
||||
version = "0.3.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "611834ce18aaa1bd13c4b374f5d653e1027cf99b6b502584ff8c9a64413b30bb"
|
||||
checksum = "28be053525281ad8259d47e4de5de657b25e7bac113458555bb4b70bc6870500"
|
||||
|
||||
[[package]]
|
||||
name = "futures-lite"
|
||||
|
@ -715,15 +742,15 @@ dependencies = [
|
|||
"futures-io",
|
||||
"memchr",
|
||||
"parking",
|
||||
"pin-project-lite",
|
||||
"pin-project-lite 0.1.11",
|
||||
"waker-fn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures-macro"
|
||||
version = "0.3.8"
|
||||
version = "0.3.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "77408a692f1f97bcc61dc001d752e00643408fbc922e4d634c655df50d595556"
|
||||
checksum = "c287d25add322d9f9abdcdc5927ca398917996600182178774032e9f8258fedd"
|
||||
dependencies = [
|
||||
"proc-macro-hack",
|
||||
"proc-macro2",
|
||||
|
@ -733,31 +760,33 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "futures-sink"
|
||||
version = "0.3.8"
|
||||
version = "0.3.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f878195a49cee50e006b02b93cf7e0a95a38ac7b776b4c4d9cc1207cd20fcb3d"
|
||||
checksum = "caf5c69029bda2e743fddd0582d1083951d65cc9539aebf8812f36c3491342d6"
|
||||
|
||||
[[package]]
|
||||
name = "futures-task"
|
||||
version = "0.3.8"
|
||||
version = "0.3.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7c554eb5bf48b2426c4771ab68c6b14468b6e76cc90996f528c3338d761a4d0d"
|
||||
checksum = "13de07eb8ea81ae445aca7b69f5f7bf15d7bf4912d8ca37d6645c77ae8a58d86"
|
||||
dependencies = [
|
||||
"once_cell",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures-util"
|
||||
version = "0.3.8"
|
||||
version = "0.3.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d304cff4a7b99cfb7986f7d43fbe93d175e72e704a8860787cc95e9ffd85cbd2"
|
||||
checksum = "632a8cd0f2a4b3fdea1657f08bde063848c3bd00f9bbf6e256b8be78802e624b"
|
||||
dependencies = [
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
"futures-io",
|
||||
"futures-macro",
|
||||
"futures-sink",
|
||||
"futures-task",
|
||||
"memchr",
|
||||
"pin-project 1.0.2",
|
||||
"pin-project-lite 0.2.4",
|
||||
"pin-utils",
|
||||
"proc-macro-hack",
|
||||
"proc-macro-nested",
|
||||
|
@ -911,7 +940,7 @@ dependencies = [
|
|||
"cookie",
|
||||
"futures-lite",
|
||||
"infer",
|
||||
"pin-project-lite",
|
||||
"pin-project-lite 0.1.11",
|
||||
"rand 0.7.3",
|
||||
"serde",
|
||||
"serde_json",
|
||||
|
@ -1247,6 +1276,7 @@ dependencies = [
|
|||
"async-std",
|
||||
"clap",
|
||||
"epub-builder",
|
||||
"futures",
|
||||
"html5ever",
|
||||
"kuchiki",
|
||||
"lazy_static",
|
||||
|
@ -1328,16 +1358,7 @@ version = "0.4.27"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2ffbc8e94b38ea3d2d8ba92aea2983b503cd75d0888d75b86bb37970b5698e15"
|
||||
dependencies = [
|
||||
"pin-project-internal 0.4.27",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pin-project"
|
||||
version = "1.0.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9ccc2237c2c489783abd8c4c80e5450fc0e98644555b1364da68cc29aa151ca7"
|
||||
dependencies = [
|
||||
"pin-project-internal 1.0.2",
|
||||
"pin-project-internal",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -1351,23 +1372,18 @@ dependencies = [
|
|||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pin-project-internal"
|
||||
version = "1.0.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f8e8d2bf0b23038a4424865103a4df472855692821aab4e4f5c3312d461d9e5f"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pin-project-lite"
|
||||
version = "0.1.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c917123afa01924fc84bb20c4c03f004d9c38e5127e3c039bbf7f4b9c76a2f6b"
|
||||
|
||||
[[package]]
|
||||
name = "pin-project-lite"
|
||||
version = "0.2.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "439697af366c49a6d0a010c56a0d97685bc140ce0d377b13a2ea2aa42d64a827"
|
||||
|
||||
[[package]]
|
||||
name = "pin-utils"
|
||||
version = "0.1.0"
|
||||
|
@ -1889,7 +1905,7 @@ dependencies = [
|
|||
"log 0.4.11",
|
||||
"mime_guess",
|
||||
"once_cell",
|
||||
"pin-project-lite",
|
||||
"pin-project-lite 0.1.11",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"web-sys",
|
||||
|
@ -2043,7 +2059,7 @@ checksum = "b0987850db3733619253fe60e17cb59b82d37c7e6c0236bb81e4d6b87c879f27"
|
|||
dependencies = [
|
||||
"cfg-if 0.1.10",
|
||||
"log 0.4.11",
|
||||
"pin-project-lite",
|
||||
"pin-project-lite 0.1.11",
|
||||
"tracing-attributes",
|
||||
"tracing-core",
|
||||
]
|
||||
|
@ -2074,7 +2090,7 @@ version = "0.2.4"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ab7bb6f14721aa00656086e9335d363c5c8747bae02ebe32ea2c7dece5689b4c"
|
||||
dependencies = [
|
||||
"pin-project 0.4.27",
|
||||
"pin-project",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
|
|
|
@ -15,6 +15,7 @@ readme = "README.md"
|
|||
async-std = "1.7.0"
|
||||
clap = "2.33.3"
|
||||
epub-builder = "0.4.8"
|
||||
futures = "0.3.12"
|
||||
html5ever = "0.25.1"
|
||||
kuchiki = "0.8.1"
|
||||
lazy_static = "1.4.0"
|
||||
|
|
|
@ -62,11 +62,15 @@ It takes a url and downloads the article content from it and saves it to an epub
|
|||
|
||||
pub struct AppConfig {
|
||||
urls: Vec<String>,
|
||||
max_conn: usize,
|
||||
}
|
||||
|
||||
impl AppConfig {
|
||||
fn new() -> Self {
|
||||
Self { urls: vec![] }
|
||||
Self {
|
||||
urls: vec![],
|
||||
max_conn: 8,
|
||||
}
|
||||
}
|
||||
|
||||
fn set_urls(&mut self, urls: Vec<String>) {
|
||||
|
@ -76,4 +80,7 @@ impl AppConfig {
|
|||
pub fn urls(&self) -> &Vec<String> {
|
||||
&self.urls
|
||||
}
|
||||
pub fn max_conn(&self) -> usize {
|
||||
self.max_conn
|
||||
}
|
||||
}
|
||||
|
|
|
@ -57,7 +57,6 @@ pub async fn download_images(
|
|||
extractor: &mut Extractor,
|
||||
article_origin: &Url,
|
||||
) -> async_std::io::Result<()> {
|
||||
extractor.extract_img_urls();
|
||||
let mut async_download_tasks = Vec::with_capacity(extractor.img_urls.len());
|
||||
if extractor.img_urls.len() > 0 {
|
||||
println!("Downloading images...");
|
||||
|
|
21
src/main.rs
21
src/main.rs
|
@ -1,7 +1,9 @@
|
|||
#[macro_use]
|
||||
extern crate lazy_static;
|
||||
|
||||
use async_std::stream;
|
||||
use async_std::task;
|
||||
use futures::stream::StreamExt;
|
||||
use url::Url;
|
||||
|
||||
mod cli;
|
||||
|
@ -12,33 +14,32 @@ mod extractor;
|
|||
mod http;
|
||||
mod moz_readability;
|
||||
|
||||
use cli::AppConfig;
|
||||
use epub::generate_epub;
|
||||
use extractor::Extractor;
|
||||
use http::{download_images, fetch_url};
|
||||
|
||||
use extractor::Extractor;
|
||||
fn main() {
|
||||
let app_config = cli::cli_init();
|
||||
|
||||
if !app_config.urls().is_empty() {
|
||||
download(app_config.urls().clone());
|
||||
download(app_config);
|
||||
}
|
||||
}
|
||||
|
||||
fn download(urls: Vec<String>) {
|
||||
let mut async_url_tasks = Vec::with_capacity(urls.len());
|
||||
for url in urls {
|
||||
async_url_tasks.push(task::spawn(async move { fetch_url(&url).await }));
|
||||
}
|
||||
|
||||
fn download(app_config: AppConfig) {
|
||||
task::block_on(async {
|
||||
for url_task in async_url_tasks {
|
||||
match url_task.await {
|
||||
let urls_iter = app_config.urls().iter().map(|url| fetch_url(url));
|
||||
let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn());
|
||||
while let Some(fetch_result) = responses.next().await {
|
||||
match fetch_result {
|
||||
Ok((url, html)) => {
|
||||
println!("Extracting");
|
||||
let mut extractor = Extractor::from_html(&html);
|
||||
extractor.extract_content(&url);
|
||||
|
||||
if extractor.article().is_some() {
|
||||
extractor.extract_img_urls();
|
||||
download_images(&mut extractor, &Url::parse(&url).unwrap())
|
||||
.await
|
||||
.expect("Unable to download images");
|
||||
|
|
Reference in a new issue