Refactor image downloading and update README

Image downloads uses streams instead of spawned tasks to ensure that it does not start an unbounded number of spawned tasks
2021-02-09 10:33:02 +03:00 · 2021-02-09 10:33:02 +03:00 · 65fdd967c1
commit 65fdd967c1
parent 003953332f
2 changed files with 51 additions and 39 deletions
--- a/README.md
+++ b/README.md
@ -12,10 +12,10 @@ Paperoni is a web article downloader written in Rust. The downloaded articles ar
 paperoni https://en.wikipedia.org/wiki/Pepperoni
 ```
-Paperoni also supports passing multiple links as arguments. These can be read from a file using the `-f` flag.
+Paperoni also supports passing multiple links as arguments.
 ```sh
-paperoni -f links.txt
+paperoni https://en.wikipedia.org/wiki/Pepperoni https://en.wikipedia.org/wiki/Salami
 ```
 Alternatively, if you are on a Unix-like OS, you can simply do something like this:
@ -24,6 +24,12 @@ Alternatively, if you are on a Unix-like OS, you can simply do something like th
 cat links.txt | xargs paperoni
 ```
 These can also be read from a file using the `-f` flag.
 ```sh
 paperoni -f links.txt
 ```
 ## How it works
 The URL passed to Paperoni is fetched and the returned HTML response is passed to the extractor.
@ -33,11 +39,11 @@ This extractor retrieves a possible article using a port of the [Mozilla Readabi
 ## How it (currently) doesn't work
-This program is still in alpha so a number of things currently break:
+This program is still in alpha so a number of things won't work:
 - Certain links with redirects can't be extracted. Such links include urls that are proxying Medium.
 - Websites that only run with JavaScript cannot be extracted.
 - Website articles that cannot be extracted by Readability cannot be extracted by Paperoni either.
 - Code snippets on Medium articles that are lazy loaded will not appear in the EPUB.
 ## Running locally
--- a/src/http.rs
+++ b/src/http.rs
@ -1,6 +1,6 @@
 use async_std::fs::File;
 use async_std::io::prelude::*;
-use async_std::task;
+use async_std::{fs::File, stream};
 use futures::StreamExt;
 use url::Url;
 use crate::extractor::Extractor;
@ -57,22 +57,21 @@ pub async fn download_images(
    extractor: &mut Extractor,
    article_origin: &Url,
 ) -> async_std::io::Result<()> {
    let mut async_download_tasks = Vec::with_capacity(extractor.img_urls.len());
    if extractor.img_urls.len() > 0 {
        println!("Downloading images...");
    }
    for img_url in &extractor.img_urls {
        let img_url = img_url.0.clone();
        let abs_url = get_absolute_url(&img_url, article_origin);
-        async_download_tasks.push(task::spawn(async move {
+    let imgs_req_iter = extractor
-            let mut img_response = surf::Client::new()
+        .img_urls
-                // The middleware has been temporarily commented out because it happens
+        .iter()
-                // to affect downloading images when there is no redirecting
+        .map(|(url, _)| {
-                // .with(surf::middleware::Redirect::default())
+            (
-                .get(&abs_url)
+                url,
-                .await
+                surf::Client::new().get(get_absolute_url(&url, article_origin)),
-                .expect("Unable to retrieve file");
+            )
        })
        .map(|(url, req)| async move {
            let mut img_response = req.await.expect("Unable to retrieve image");
            let img_content: Vec<u8> = img_response.body_bytes().await.unwrap();
            let img_mime = img_response
                .content_type()
@ -81,8 +80,9 @@ pub async fn download_images(
                .content_type()
                .map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
                .unwrap();
            let mut img_path = std::env::temp_dir();
-            img_path.push(format!("{}.{}", hash_url(&abs_url), &img_ext));
+            img_path.push(format!("{}.{}", hash_url(&url), &img_ext));
            let mut img_file = File::create(&img_path)
                .await
                .expect("Unable to create file");
@ -92,7 +92,7 @@ pub async fn download_images(
                .expect("Unable to save to file");
            (
-                img_url,
+                url,
                img_path
                    .file_name()
                    .map(|os_str_name| {
@ -104,27 +104,33 @@ pub async fn download_images(
                    .unwrap(),
                img_mime,
            )
-        }));
+        });
    }
-    extractor.img_urls.clear();
+    // A utility closure used when update the value of an image source after downloading is successful
    let replace_existing_img_src =
        |img_item: (&String, String, Option<String>)| -> (String, Option<String>) {
            let (img_url, img_path, img_mime) = img_item;
            let img_ref = extractor
                .article()
                .as_mut()
                .expect("Unable to get mutable ref")
                .select_first(&format!("img[src='{}']", img_url))
                .expect("Image node does not exist");
            let mut img_node = img_ref.attributes.borrow_mut();
            *img_node.get_mut("src").unwrap() = img_path.clone();
            // srcset is removed because readers such as Foliate then fail to display
            // the image already downloaded and stored in src
            img_node.remove("srcset");
            (img_path, img_mime)
        };
-    for async_task in async_download_tasks {
+    extractor.img_urls = stream::from_iter(imgs_req_iter)
-        let (img_url, img_path, img_mime) = async_task.await;
+        .buffered(10)
-        // Update the image sources
+        .collect::<Vec<_>>()
-        let img_ref = extractor
+        .await
-            .article()
+        .into_iter()
-            .as_mut()
+        .map(replace_existing_img_src)
-            .expect("Unable to get mutable ref")
+        .collect();
            .select_first(&format!("img[src='{}']", img_url))
            .expect("Image node does not exist");
        let mut img_node = img_ref.attributes.borrow_mut();
        *img_node.get_mut("src").unwrap() = img_path.clone();
        // srcset is removed because readers such as Foliate then fail to display
        // the image already downloaded and stored in src
        img_node.remove("srcset");
        extractor.img_urls.push((img_path, img_mime));
    }
    Ok(())
 }