From 65fdd967c1222199aefd23b0c50bb62e3bd5c4b8 Mon Sep 17 00:00:00 2001
From: Kenneth Gitere
Date: Tue, 9 Feb 2021 10:33:02 +0300
Subject: [PATCH] Refactor image downloading and update README

Image downloading now uses streams instead of spawned tasks to ensure that
it does not start an unbounded number of tasks
---
 README.md   | 14 +++++++---
 src/http.rs | 76 +++++++++++++++++++++++++++++------------------
 2 files changed, 51 insertions(+), 39 deletions(-)

diff --git a/README.md b/README.md
index e3995fc..a6b4eb5 100644
--- a/README.md
+++ b/README.md
@@ -12,10 +12,10 @@ Paperoni is a web article downloader written in Rust. The downloaded articles ar
 paperoni https://en.wikipedia.org/wiki/Pepperoni
 ```
 
-Paperoni also supports passing multiple links as arguments. These can be read from a file using the `-f` flag.
+Paperoni also supports passing multiple links as arguments.
 
 ```sh
-paperoni -f links.txt
+paperoni https://en.wikipedia.org/wiki/Pepperoni https://en.wikipedia.org/wiki/Salami
 ```
 
 Alternatively, if you are on a Unix-like OS, you can simply do something like this:
@@ -24,6 +24,12 @@ Alternatively, if you are on a Unix-like OS, you can simply do something like th
 cat links.txt | xargs paperoni
 ```
 
+Links can also be read from a file using the `-f` flag.
+
+```sh
+paperoni -f links.txt
+```
+
 ## How it works
 
 The URL passed to Paperoni is fetched and the returned HTML response is passed to the extractor.
@@ -33,11 +39,11 @@ This extractor retrieves a possible article using a port of the [Mozilla Readabi
 
 ## How it (currently) doesn't work
 
-This program is still in alpha so a number of things currently break:
+This program is still in alpha so a number of things won't work:
 
-- Certain links with redirects can't be extracted. Such links include urls that are proxying Medium.
 - Websites that only run with JavaScript cannot be extracted.
 - Website articles that cannot be extracted by Readability cannot be extracted by Paperoni either.
+- Code snippets on Medium articles that are lazy loaded will not appear in the EPUB.
 
 ## Running locally
 
diff --git a/src/http.rs b/src/http.rs
index b5fe140..faf9428 100644
--- a/src/http.rs
+++ b/src/http.rs
@@ -1,6 +1,6 @@
-use async_std::fs::File;
 use async_std::io::prelude::*;
-use async_std::task;
+use async_std::{fs::File, stream};
+use futures::StreamExt;
 use url::Url;
 
 use crate::extractor::Extractor;
@@ -57,22 +57,21 @@ pub async fn download_images(
     extractor: &mut Extractor,
     article_origin: &Url,
 ) -> async_std::io::Result<()> {
-    let mut async_download_tasks = Vec::with_capacity(extractor.img_urls.len());
     if extractor.img_urls.len() > 0 {
         println!("Downloading images...");
     }
 
-    for img_url in &extractor.img_urls {
-        let img_url = img_url.0.clone();
-        let abs_url = get_absolute_url(&img_url, article_origin);
-        async_download_tasks.push(task::spawn(async move {
-            let mut img_response = surf::Client::new()
-                // The middleware has been temporarily commented out because it happens
-                // to affect downloading images when there is no redirecting
-                // .with(surf::middleware::Redirect::default())
-                .get(&abs_url)
-                .await
-                .expect("Unable to retrieve file");
+    let imgs_req_iter = extractor
+        .img_urls
+        .iter()
+        .map(|(url, _)| {
+            (
+                url,
+                surf::Client::new().get(get_absolute_url(&url, article_origin)),
+            )
+        })
+        .map(|(url, req)| async move {
+            let mut img_response = req.await.expect("Unable to retrieve image");
             let img_content: Vec<u8> = img_response.body_bytes().await.unwrap();
             let img_mime = img_response
                 .content_type()
@@ -81,8 +80,9 @@ pub async fn download_images(
                 .content_type()
                 .map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
                 .unwrap();
+
             let mut img_path = std::env::temp_dir();
-            img_path.push(format!("{}.{}", hash_url(&abs_url), &img_ext));
+            img_path.push(format!("{}.{}", hash_url(&url), &img_ext));
             let mut img_file = File::create(&img_path)
                 .await
                 .expect("Unable to create file");
@@ -92,7 +92,7 @@ pub async fn download_images(
                 .expect("Unable to save to file");
 
             (
-                img_url,
+                url,
                 img_path
                     .file_name()
                     .map(|os_str_name| {
@@ -104,27 +104,33 @@ pub async fn download_images(
                     .unwrap(),
                 img_mime,
             )
-        }));
-    }
+        });
 
-    extractor.img_urls.clear();
+    // A utility closure used to update the value of an image source after downloading is successful
+    let replace_existing_img_src =
+        |img_item: (&String, String, Option<String>)| -> (String, Option<String>) {
+            let (img_url, img_path, img_mime) = img_item;
+            let img_ref = extractor
+                .article()
+                .as_mut()
+                .expect("Unable to get mutable ref")
+                .select_first(&format!("img[src='{}']", img_url))
+                .expect("Image node does not exist");
+            let mut img_node = img_ref.attributes.borrow_mut();
+            *img_node.get_mut("src").unwrap() = img_path.clone();
+            // srcset is removed because readers such as Foliate then fail to display
+            // the image already downloaded and stored in src
+            img_node.remove("srcset");
+            (img_path, img_mime)
+        };
 
-    for async_task in async_download_tasks {
-        let (img_url, img_path, img_mime) = async_task.await;
-        // Update the image sources
-        let img_ref = extractor
-            .article()
-            .as_mut()
-            .expect("Unable to get mutable ref")
-            .select_first(&format!("img[src='{}']", img_url))
-            .expect("Image node does not exist");
-        let mut img_node = img_ref.attributes.borrow_mut();
-        *img_node.get_mut("src").unwrap() = img_path.clone();
-        // srcset is removed because readers such as Foliate then fail to display
-        // the image already downloaded and stored in src
-        img_node.remove("srcset");
-        extractor.img_urls.push((img_path, img_mime));
-    }
+    extractor.img_urls = stream::from_iter(imgs_req_iter)
+        .buffered(10)
+        .collect::<Vec<_>>()
+        .await
+        .into_iter()
+        .map(replace_existing_img_src)
+        .collect();
 
     Ok(())
 }
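
Note on the approach: the key change above is bounding concurrency with `StreamExt::buffered` instead of spawning one task per image. The snippet below is a minimal, self-contained sketch of that pattern using only the `futures` crate; `fake_download` and the example URLs are placeholders for illustration and are not part of Paperoni, which uses `async-std` and `surf` as shown in the diff.

```rust
use futures::{stream, StreamExt};

// Stand-in for an image download; Paperoni issues a surf request here instead.
async fn fake_download(url: String) -> (String, usize) {
    let size = url.len();
    (url, size)
}

fn main() {
    let urls: Vec<String> = vec![
        "https://example.com/a.png".into(),
        "https://example.com/b.png".into(),
    ];

    let results: Vec<(String, usize)> = futures::executor::block_on(
        stream::iter(urls)
            // Turn each URL into a future that resolves to (url, size)...
            .map(fake_download)
            // ...and poll at most 10 of those futures at once, yielding results
            // in the original order, mirroring the patch's `.buffered(10)`.
            .buffered(10)
            .collect(),
    );

    for (url, size) in results {
        println!("{} -> {} bytes", url, size);
    }
}
```

Because `buffered` caps the number of in-flight downloads (ten in the patch), the task count no longer grows with the number of images in an article.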