From 65fdd967c1222199aefd23b0c50bb62e3bd5c4b8 Mon Sep 17 00:00:00 2001
From: Kenneth Gitere
Date: Tue, 9 Feb 2021 10:33:02 +0300
Subject: [PATCH] Refactor image downloading and update README

Image downloading now uses streams instead of spawned tasks to ensure that
it does not start an unbounded number of tasks
---
 README.md   | 14 +++++++---
 src/http.rs | 76 +++++++++++++++++++++++++++++------------------
 2 files changed, 51 insertions(+), 39 deletions(-)

diff --git a/README.md b/README.md
index e3995fc..a6b4eb5 100644
--- a/README.md
+++ b/README.md
@@ -12,10 +12,10 @@ Paperoni is a web article downloader written in Rust. The downloaded articles ar
 paperoni https://en.wikipedia.org/wiki/Pepperoni
 ```
 
-Paperoni also supports passing multiple links as arguments. These can be read from a file using the `-f` flag.
+Paperoni also supports passing multiple links as arguments.
 
 ```sh
-paperoni -f links.txt
+paperoni https://en.wikipedia.org/wiki/Pepperoni https://en.wikipedia.org/wiki/Salami
 ```
 
 Alternatively, if you are on a Unix-like OS, you can simply do something like this:
@@ -24,6 +24,12 @@ Alternatively, if you are on a Unix-like OS, you can simply do something like th
 cat links.txt | xargs paperoni
 ```
 
+Links can also be read from a file using the `-f` flag.
+
+```sh
+paperoni -f links.txt
+```
+
 ## How it works
 
 The URL passed to Paperoni is fetched and the returned HTML response is passed to the extractor.
@@ -33,11 +39,11 @@ This extractor retrieves a possible article using a port of the [Mozilla Readabi
 
 ## How it (currently) doesn't work
 
-This program is still in alpha so a number of things currently break:
+This program is still in alpha so a number of things won't work:
 
-- Certain links with redirects can't be extracted. Such links include urls that are proxying Medium.
 - Websites that only run with JavaScript cannot be extracted.
 - Website articles that cannot be extracted by Readability cannot be extracted by Paperoni either.
+- Code snippets on Medium articles that are lazy loaded will not appear in the EPUB.
 
 ## Running locally
 
diff --git a/src/http.rs b/src/http.rs
index b5fe140..faf9428 100644
--- a/src/http.rs
+++ b/src/http.rs
@@ -1,6 +1,6 @@
-use async_std::fs::File;
 use async_std::io::prelude::*;
-use async_std::task;
+use async_std::{fs::File, stream};
+use futures::StreamExt;
 use url::Url;
 
 use crate::extractor::Extractor;
@@ -57,22 +57,21 @@ pub async fn download_images(
     extractor: &mut Extractor,
     article_origin: &Url,
 ) -> async_std::io::Result<()> {
-    let mut async_download_tasks = Vec::with_capacity(extractor.img_urls.len());
     if extractor.img_urls.len() > 0 {
         println!("Downloading images...");
     }
 
-    for img_url in &extractor.img_urls {
-        let img_url = img_url.0.clone();
-        let abs_url = get_absolute_url(&img_url, article_origin);
-        async_download_tasks.push(task::spawn(async move {
-            let mut img_response = surf::Client::new()
-                // The middleware has been temporarily commented out because it happens
-                // to affect downloading images when there is no redirecting
-                // .with(surf::middleware::Redirect::default())
-                .get(&abs_url)
-                .await
-                .expect("Unable to retrieve file");
+    let imgs_req_iter = extractor
+        .img_urls
+        .iter()
+        .map(|(url, _)| {
+            (
+                url,
+                surf::Client::new().get(get_absolute_url(&url, article_origin)),
+            )
+        })
+        .map(|(url, req)| async move {
+            let mut img_response = req.await.expect("Unable to retrieve image");
             let img_content: Vec<u8> = img_response.body_bytes().await.unwrap();
             let img_mime = img_response
                 .content_type()
@@ -81,8 +80,9 @@ pub async fn download_images(
                 .content_type()
                 .map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
                 .unwrap();
+
             let mut img_path = std::env::temp_dir();
-            img_path.push(format!("{}.{}", hash_url(&abs_url), &img_ext));
+            img_path.push(format!("{}.{}", hash_url(&url), &img_ext));
             let mut img_file = File::create(&img_path)
                 .await
                 .expect("Unable to create file");
@@ -92,7 +92,7 @@ pub async fn download_images(
                 .expect("Unable to save to file");
 
             (
-                img_url,
+                url,
                 img_path
                     .file_name()
                     .map(|os_str_name| {
@@ -104,27 +104,33 @@ pub async fn download_images(
                     .unwrap(),
                 img_mime,
             )
-        }));
-    }
+        });
 
-    extractor.img_urls.clear();
+    // A utility closure used to update the value of an image source after downloading is successful
+    let replace_existing_img_src =
+        |img_item: (&String, String, Option<String>)| -> (String, Option<String>) {
+            let (img_url, img_path, img_mime) = img_item;
+            let img_ref = extractor
+                .article()
+                .as_mut()
+                .expect("Unable to get mutable ref")
+                .select_first(&format!("img[src='{}']", img_url))
+                .expect("Image node does not exist");
+            let mut img_node = img_ref.attributes.borrow_mut();
+            *img_node.get_mut("src").unwrap() = img_path.clone();
+            // srcset is removed because readers such as Foliate then fail to display
+            // the image already downloaded and stored in src
+            img_node.remove("srcset");
+            (img_path, img_mime)
+        };
 
-    for async_task in async_download_tasks {
-        let (img_url, img_path, img_mime) = async_task.await;
-        // Update the image sources
-        let img_ref = extractor
-            .article()
-            .as_mut()
-            .expect("Unable to get mutable ref")
-            .select_first(&format!("img[src='{}']", img_url))
-            .expect("Image node does not exist");
-        let mut img_node = img_ref.attributes.borrow_mut();
-        *img_node.get_mut("src").unwrap() = img_path.clone();
-        // srcset is removed because readers such as Foliate then fail to display
-        // the image already downloaded and stored in src
-        img_node.remove("srcset");
-        extractor.img_urls.push((img_path, img_mime));
-    }
+    extractor.img_urls = stream::from_iter(imgs_req_iter)
+        .buffered(10)
+        .collect::<Vec<_>>()
+        .await
+        .into_iter()
+        .map(replace_existing_img_src)
+        .collect();
 
     Ok(())
 }
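
Note on the approach: the key change above is bounding concurrency with `StreamExt::buffered` instead of spawning one task per image. The snippet below is a minimal, self-contained sketch of that pattern using only the `futures` crate; `fake_download` and the example URLs are placeholders for illustration and are not part of Paperoni, which uses `async-std` and `surf` as shown in the diff.

```rust
use futures::{stream, StreamExt};

// Stand-in for an image download; Paperoni issues a surf request here instead.
async fn fake_download(url: String) -> (String, usize) {
    let size = url.len();
    (url, size)
}

fn main() {
    let urls: Vec<String> = vec![
        "https://example.com/a.png".into(),
        "https://example.com/b.png".into(),
    ];

    let results: Vec<(String, usize)> = futures::executor::block_on(
        stream::iter(urls)
            // Turn each URL into a future that resolves to (url, size)...
            .map(fake_download)
            // ...and poll at most 10 of those futures at once, yielding results
            // in the original order, mirroring the patch's `.buffered(10)`.
            .buffered(10)
            .collect(),
    );

    for (url, size) in results {
        println!("{} -> {} bytes", url, size);
    }
}
```

Because `buffered` caps the number of in-flight downloads (ten in the patch), the task count no longer grows with the number of images in an article.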