Refactor image downloading and update README

Image downloading uses streams instead of spawned tasks to ensure that
it does not start an unbounded number of spawned tasks.
This commit is contained in:
Kenneth Gitere 2021-02-09 10:33:02 +03:00
parent 003953332f
commit 65fdd967c1
2 changed files with 51 additions and 39 deletions

View file

@ -12,10 +12,10 @@ Paperoni is a web article downloader written in Rust. The downloaded articles ar
paperoni https://en.wikipedia.org/wiki/Pepperoni paperoni https://en.wikipedia.org/wiki/Pepperoni
``` ```
Paperoni also supports passing multiple links as arguments. These can be read from a file using the `-f` flag. Paperoni also supports passing multiple links as arguments.
```sh ```sh
paperoni -f links.txt paperoni https://en.wikipedia.org/wiki/Pepperoni https://en.wikipedia.org/wiki/Salami
``` ```
Alternatively, if you are on a Unix-like OS, you can simply do something like this: Alternatively, if you are on a Unix-like OS, you can simply do something like this:
@ -24,6 +24,12 @@ Alternatively, if you are on a Unix-like OS, you can simply do something like th
cat links.txt | xargs paperoni cat links.txt | xargs paperoni
``` ```
These can also be read from a file using the `-f` flag.
```sh
paperoni -f links.txt
```
## How it works ## How it works
The URL passed to Paperoni is fetched and the returned HTML response is passed to the extractor. The URL passed to Paperoni is fetched and the returned HTML response is passed to the extractor.
@ -33,11 +39,11 @@ This extractor retrieves a possible article using a port of the [Mozilla Readabi
## How it (currently) doesn't work ## How it (currently) doesn't work
This program is still in alpha so a number of things currently break: This program is still in alpha so a number of things won't work:
- Certain links with redirects can't be extracted. Such links include URLs that are proxying Medium.
- Websites that only run with JavaScript cannot be extracted. - Websites that only run with JavaScript cannot be extracted.
- Website articles that cannot be extracted by Readability cannot be extracted by Paperoni either. - Website articles that cannot be extracted by Readability cannot be extracted by Paperoni either.
- Code snippets on Medium articles that are lazy loaded will not appear in the EPUB.
## Running locally ## Running locally

View file

@ -1,6 +1,6 @@
use async_std::fs::File;
use async_std::io::prelude::*; use async_std::io::prelude::*;
use async_std::task; use async_std::{fs::File, stream};
use futures::StreamExt;
use url::Url; use url::Url;
use crate::extractor::Extractor; use crate::extractor::Extractor;
@ -57,22 +57,21 @@ pub async fn download_images(
extractor: &mut Extractor, extractor: &mut Extractor,
article_origin: &Url, article_origin: &Url,
) -> async_std::io::Result<()> { ) -> async_std::io::Result<()> {
let mut async_download_tasks = Vec::with_capacity(extractor.img_urls.len());
if extractor.img_urls.len() > 0 { if extractor.img_urls.len() > 0 {
println!("Downloading images..."); println!("Downloading images...");
} }
for img_url in &extractor.img_urls {
let img_url = img_url.0.clone();
let abs_url = get_absolute_url(&img_url, article_origin);
async_download_tasks.push(task::spawn(async move { let imgs_req_iter = extractor
let mut img_response = surf::Client::new() .img_urls
// The middleware has been temporarily commented out because it happens .iter()
// to affect downloading images when there is no redirecting .map(|(url, _)| {
// .with(surf::middleware::Redirect::default()) (
.get(&abs_url) url,
.await surf::Client::new().get(get_absolute_url(&url, article_origin)),
.expect("Unable to retrieve file"); )
})
.map(|(url, req)| async move {
let mut img_response = req.await.expect("Unable to retrieve image");
let img_content: Vec<u8> = img_response.body_bytes().await.unwrap(); let img_content: Vec<u8> = img_response.body_bytes().await.unwrap();
let img_mime = img_response let img_mime = img_response
.content_type() .content_type()
@ -81,8 +80,9 @@ pub async fn download_images(
.content_type() .content_type()
.map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string()) .map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
.unwrap(); .unwrap();
let mut img_path = std::env::temp_dir(); let mut img_path = std::env::temp_dir();
img_path.push(format!("{}.{}", hash_url(&abs_url), &img_ext)); img_path.push(format!("{}.{}", hash_url(&url), &img_ext));
let mut img_file = File::create(&img_path) let mut img_file = File::create(&img_path)
.await .await
.expect("Unable to create file"); .expect("Unable to create file");
@ -92,7 +92,7 @@ pub async fn download_images(
.expect("Unable to save to file"); .expect("Unable to save to file");
( (
img_url, url,
img_path img_path
.file_name() .file_name()
.map(|os_str_name| { .map(|os_str_name| {
@ -104,27 +104,33 @@ pub async fn download_images(
.unwrap(), .unwrap(),
img_mime, img_mime,
) )
})); });
}
extractor.img_urls.clear(); // A utility closure used to update the value of an image source after downloading is successful
let replace_existing_img_src =
|img_item: (&String, String, Option<String>)| -> (String, Option<String>) {
let (img_url, img_path, img_mime) = img_item;
let img_ref = extractor
.article()
.as_mut()
.expect("Unable to get mutable ref")
.select_first(&format!("img[src='{}']", img_url))
.expect("Image node does not exist");
let mut img_node = img_ref.attributes.borrow_mut();
*img_node.get_mut("src").unwrap() = img_path.clone();
// srcset is removed because readers such as Foliate then fail to display
// the image already downloaded and stored in src
img_node.remove("srcset");
(img_path, img_mime)
};
for async_task in async_download_tasks { extractor.img_urls = stream::from_iter(imgs_req_iter)
let (img_url, img_path, img_mime) = async_task.await; .buffered(10)
// Update the image sources .collect::<Vec<_>>()
let img_ref = extractor .await
.article() .into_iter()
.as_mut() .map(replace_existing_img_src)
.expect("Unable to get mutable ref") .collect();
.select_first(&format!("img[src='{}']", img_url))
.expect("Image node does not exist");
let mut img_node = img_ref.attributes.borrow_mut();
*img_node.get_mut("src").unwrap() = img_path.clone();
// srcset is removed because readers such as Foliate then fail to display
// the image already downloaded and stored in src
img_node.remove("srcset");
extractor.img_urls.push((img_path, img_mime));
}
Ok(()) Ok(())
} }