Change download code to save images to a temporary res/ folder

Add the downloaded images to the output epub file as resources, then remove the res/ folder
2020-05-05 12:24:11 +03:00 · 2020-05-05 12:24:11 +03:00 · 271d3c8951
commit 271d3c8951
parent f02973157d
2 changed files with 34 additions and 18 deletions
--- a/src/extractor.rs
+++ b/src/extractor.rs
@ -4,10 +4,12 @@ use async_std::task;
 use kuchiki::{traits::*, ElementData, NodeDataRef, NodeRef};
 use url::Url;

+pub type ResourceInfo = (String, Option<String>);
+
 pub struct Extractor {
    pub root_node: NodeRef,
    pub content: Option<NodeDataRef<ElementData>>,
-    img_urls: Vec<String>,
+    pub img_urls: Vec<ResourceInfo>,
 }

 impl Extractor {
@ -86,7 +88,7 @@ impl Extractor {
                img_ref.as_node().as_element().map(|img_elem| {
                    img_elem.attributes.borrow().get("src").map(|img_url| {
                        if !img_url.is_empty() {
-                            self.img_urls.push(img_url.to_string())
+                            self.img_urls.push((img_url.to_string(), None))
                        }
                    })
                });
@ -100,20 +102,22 @@ impl Extractor {
    ) -> async_std::io::Result<()> {
        let mut async_download_tasks = Vec::with_capacity(self.img_urls.len());
        self.extract_img_urls();
-
+        println!("Downloading images to res/");
        for img_url in &self.img_urls {
-            let mut img_url = img_url.clone();
+            let mut img_url = img_url.0.clone();
            get_absolute_url(&mut img_url, article_origin);
            async_download_tasks.push(task::spawn(async {
-                println!("Fetching {}", img_url);
                let mut img_response = surf::get(&img_url).await.expect("Unable to retrieve file");
                let img_content: Vec<u8> = img_response.body_bytes().await.unwrap();
+                let img_mime = img_response
+                    .header("Content-Type")
+                    .map(|header| header.to_string());
                let img_ext = img_response
                    .header("Content-Type")
                    .and_then(map_mime_type_to_ext)
                    .unwrap();

-                let img_path = format!("{}{}", hash_url(&img_url), &img_ext);
+                let img_path = format!("res/{}{}", hash_url(&img_url), &img_ext);
                let mut img_file = File::create(&img_path)
                    .await
                    .expect("Unable to create file");
@ -121,13 +125,15 @@ impl Extractor {
                    .write_all(&img_content)
                    .await
                    .expect("Unable to save to file");
-                println!("Image file downloaded successfully");
-                (img_url, img_path)
+
+                (img_url, img_path, img_mime)
            }));
        }

+        self.img_urls.clear();
+
        for async_task in async_download_tasks {
-            let (img_url, img_path) = async_task.await;
+            let (img_url, img_path, img_mime) = async_task.await;
            // Update the image sources
            let img_ref = self
                .content
@ -137,7 +143,8 @@ impl Extractor {
                .select_first(&format!("img[src='{}']", img_url))
                .expect("Image node does not exist");
            let mut img_node = img_ref.attributes.borrow_mut();
-            *img_node.get_mut("src").unwrap() = img_path;
+            *img_node.get_mut("src").unwrap() = img_path.clone();
+            self.img_urls.push((img_path, img_mime));
        }
        Ok(())
    }
@ -324,7 +331,7 @@ mod test {
        extractor.extract_img_urls();

        assert!(extractor.img_urls.len() > 0);
-        assert_eq!(vec!["/img.jpg"], extractor.img_urls);
+        assert_eq!(vec![("/img.jpg".to_string(), None)], extractor.img_urls);
    }

    #[test]
--- a/src/main.rs
+++ b/src/main.rs
@ -1,6 +1,6 @@
 use std::fs::File;

-use async_std::task;
+use async_std::{fs::create_dir, fs::remove_dir_all, task};
 use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
 use url::Url;

@ -17,10 +17,13 @@ fn main() {
            "https://medium.com/typeforms-engineering-blog/the-beginners-guide-to-oauth-dancing-4b8f3666de10",
            "https://dev.to/steelwolf180/full-stack-development-in-django-3768"
        ];
-        let html = fetch_url(urls[5]).await;
+        let html = fetch_url(urls[4]).await;
        let mut extractor = Extractor::from_html(&html);
        println!("Extracting");
        extractor.extract_content();
+        create_dir("res/")
+            .await
+            .expect("Unable to create res/ output folder");
        extractor
            .download_images(&Url::parse(urls[5]).unwrap())
            .await
@ -34,12 +37,18 @@ fn main() {
            .serialize(&mut html_buf)
            .expect("Unable to serialize");
        let html_buf = std::str::from_utf8(&html_buf).unwrap();
-        EpubBuilder::new(ZipLibrary::new().unwrap())
-            .unwrap()
-            .add_content(EpubContent::new("code.xhtml", html_buf.as_bytes()))
-            .unwrap()
-            .generate(&mut out_file)
+        let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
+        epub.add_content(EpubContent::new("code.xhtml", html_buf.as_bytes()))
            .unwrap();
+        for img in extractor.img_urls {
+            let file_path = format!("{}", &img.0);
+
+            let img_buf = File::open(file_path).expect("Can't read file");
+            epub.add_resource(img.0, img_buf, img.1.unwrap()).unwrap();
+        }
+        epub.generate(&mut out_file).unwrap();
+        println!("Cleaning up");
+        remove_dir_all("res/").await.unwrap();
    })
 }