Refactor downloading code to download images in parallel

This commit is contained in:
Kenneth Gitere 2020-05-05 09:40:44 +03:00
parent 4e8812c1ee
commit f02973157d

View file

@@ -1,5 +1,6 @@
use async_std::fs::File; use async_std::fs::File;
use async_std::io::prelude::*; use async_std::io::prelude::*;
use async_std::task;
use kuchiki::{traits::*, ElementData, NodeDataRef, NodeRef}; use kuchiki::{traits::*, ElementData, NodeDataRef, NodeRef};
use url::Url; use url::Url;
@@ -93,26 +94,41 @@ impl Extractor {
} }
} }
pub async fn download_images(&mut self, article_origin: &Url) -> async_std::io::Result<()> { pub async fn download_images<'a>(
&'a mut self,
article_origin: &'a Url,
) -> async_std::io::Result<()> {
let mut async_download_tasks = Vec::with_capacity(self.img_urls.len());
self.extract_img_urls(); self.extract_img_urls();
for img_url in &self.img_urls { for img_url in &self.img_urls {
let mut img_url = img_url.clone(); let mut img_url = img_url.clone();
get_absolute_url(&mut img_url, article_origin); get_absolute_url(&mut img_url, article_origin);
async_download_tasks.push(task::spawn(async {
println!("Fetching {}", img_url);
let mut img_response = surf::get(&img_url).await.expect("Unable to retrieve file");
let img_content: Vec<u8> = img_response.body_bytes().await.unwrap();
let img_ext = img_response
.header("Content-Type")
.and_then(map_mime_type_to_ext)
.unwrap();
println!("Fetching {}", img_url); let img_path = format!("{}{}", hash_url(&img_url), &img_ext);
let mut img_response = surf::get(&img_url).await.expect("Unable to retrieve file"); let mut img_file = File::create(&img_path)
let img_content: Vec<u8> = img_response.body_bytes().await.unwrap(); .await
let img_ext = img_response .expect("Unable to create file");
.header("Content-Type") img_file
.and_then(map_mime_type_to_ext) .write_all(&img_content)
.unwrap(); .await
let img_path = format!("{}{}", hash_url(&img_url), &img_ext); .expect("Unable to save to file");
println!("Image file downloaded successfully");
let mut img_file = File::create(&img_path).await?; (img_url, img_path)
img_file.write_all(&img_content).await?; }));
println!("Image file downloaded successfully"); }
for async_task in async_download_tasks {
let (img_url, img_path) = async_task.await;
// Update the image sources
let img_ref = self let img_ref = self
.content .content
.as_mut() .as_mut()
@@ -126,6 +142,7 @@ impl Extractor {
Ok(()) Ok(())
} }
} }
fn extract_text_from_node(node: &NodeRef) -> Option<String> { fn extract_text_from_node(node: &NodeRef) -> Option<String> {
node.first_child() node.first_child()
.map(|child_ref| child_ref.text_contents()) .map(|child_ref| child_ref.text_contents())