From 271d3c89513448e6a1f4e7ff023d14312b53fda0 Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Tue, 5 May 2020 12:24:11 +0300 Subject: [PATCH] Change download code to save images to a folder Add downloaded images to the output epub file --- src/extractor.rs | 29 ++++++++++++++++++----------- src/main.rs | 23 ++++++++++++++++------- 2 files changed, 34 insertions(+), 18 deletions(-) diff --git a/src/extractor.rs b/src/extractor.rs index 898f6c1..93548be 100644 --- a/src/extractor.rs +++ b/src/extractor.rs @@ -4,10 +4,12 @@ use async_std::task; use kuchiki::{traits::*, ElementData, NodeDataRef, NodeRef}; use url::Url; +pub type ResourceInfo = (String, Option); + pub struct Extractor { pub root_node: NodeRef, pub content: Option>, - img_urls: Vec, + pub img_urls: Vec, } impl Extractor { @@ -86,7 +88,7 @@ impl Extractor { img_ref.as_node().as_element().map(|img_elem| { img_elem.attributes.borrow().get("src").map(|img_url| { if !img_url.is_empty() { - self.img_urls.push(img_url.to_string()) + self.img_urls.push((img_url.to_string(), None)) } }) }); @@ -100,20 +102,22 @@ impl Extractor { ) -> async_std::io::Result<()> { let mut async_download_tasks = Vec::with_capacity(self.img_urls.len()); self.extract_img_urls(); - + println!("Downloading images to res/"); for img_url in &self.img_urls { - let mut img_url = img_url.clone(); + let mut img_url = img_url.0.clone(); get_absolute_url(&mut img_url, article_origin); async_download_tasks.push(task::spawn(async { - println!("Fetching {}", img_url); let mut img_response = surf::get(&img_url).await.expect("Unable to retrieve file"); let img_content: Vec = img_response.body_bytes().await.unwrap(); + let img_mime = img_response + .header("Content-Type") + .map(|header| header.to_string()); let img_ext = img_response .header("Content-Type") .and_then(map_mime_type_to_ext) .unwrap(); - let img_path = format!("{}{}", hash_url(&img_url), &img_ext); + let img_path = format!("res/{}{}", hash_url(&img_url), &img_ext); let mut img_file = File::create(&img_path) .await .expect("Unable to create file"); @@ -121,13 +125,15 @@ impl Extractor { .write_all(&img_content) .await .expect("Unable to save to file"); - println!("Image file downloaded successfully"); - (img_url, img_path) + + (img_url, img_path, img_mime) })); } + self.img_urls.clear(); + for async_task in async_download_tasks { - let (img_url, img_path) = async_task.await; + let (img_url, img_path, img_mime) = async_task.await; // Update the image sources let img_ref = self .content @@ -137,7 +143,8 @@ impl Extractor { .select_first(&format!("img[src='{}']", img_url)) .expect("Image node does not exist"); let mut img_node = img_ref.attributes.borrow_mut(); - *img_node.get_mut("src").unwrap() = img_path; + *img_node.get_mut("src").unwrap() = img_path.clone(); + self.img_urls.push((img_path, img_mime)); } Ok(()) } @@ -324,7 +331,7 @@ mod test { extractor.extract_img_urls(); assert!(extractor.img_urls.len() > 0); - assert_eq!(vec!["/img.jpg"], extractor.img_urls); + assert_eq!(vec![("/img.jpg".to_string(), None)], extractor.img_urls); } #[test] diff --git a/src/main.rs b/src/main.rs index af5b718..d790f9b 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,6 +1,6 @@ use std::fs::File; -use async_std::task; +use async_std::{fs::create_dir, fs::remove_dir_all, task}; use epub_builder::{EpubBuilder, EpubContent, ZipLibrary}; use url::Url; @@ -17,10 +17,13 @@ fn main() { "https://medium.com/typeforms-engineering-blog/the-beginners-guide-to-oauth-dancing-4b8f3666de10", "https://dev.to/steelwolf180/full-stack-development-in-django-3768" ]; - let html = fetch_url(urls[5]).await; + let html = fetch_url(urls[4]).await; let mut extractor = Extractor::from_html(&html); println!("Extracting"); extractor.extract_content(); + create_dir("res/") + .await + .expect("Unable to create res/ output folder"); extractor .download_images(&Url::parse(urls[5]).unwrap()) .await @@ -34,12 +37,18 @@ fn main() { .serialize(&mut html_buf) .expect("Unable to serialize"); let html_buf = std::str::from_utf8(&html_buf).unwrap(); - EpubBuilder::new(ZipLibrary::new().unwrap()) - .unwrap() - .add_content(EpubContent::new("code.xhtml", html_buf.as_bytes())) - .unwrap() - .generate(&mut out_file) + let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap(); + epub.add_content(EpubContent::new("code.xhtml", html_buf.as_bytes())) .unwrap(); + for img in extractor.img_urls { + let file_path = format!("{}", &img.0); + + let img_buf = File::open(file_path).expect("Can't read file"); + epub.add_resource(img.0, img_buf, img.1.unwrap()).unwrap(); + } + epub.generate(&mut out_file).unwrap(); + println!("Cleaning up"); + remove_dir_all("res/").await.unwrap(); }) }