From 78ba40f57a1354ddb3b747a622d2f1c104d6a799 Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Sat, 2 May 2020 18:33:45 +0300 Subject: [PATCH] Add image download functionality --- Cargo.lock | 8 +++ Cargo.toml | 4 +- src/extractor.rs | 140 ++++++++++++++++++++++++++++++++++++++++++++++- src/main.rs | 8 ++- 4 files changed, 155 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 020193d..1f7f343 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -575,6 +575,12 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00" +[[package]] +name = "md5" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" + [[package]] name = "memchr" version = "2.3.3" @@ -712,7 +718,9 @@ version = "0.1.0" dependencies = [ "async-std", "kuchiki", + "md5", "surf", + "url", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 20b30c1..1228e69 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,4 +10,6 @@ license = "MIT" [dependencies] async-std = "1.5.0" kuchiki = "0.8.0" -surf = "1.0.3" \ No newline at end of file +md5 = "0.7.0" +surf = "1.0.3" +url = "2.1.1" \ No newline at end of file diff --git a/src/extractor.rs b/src/extractor.rs index 8b1971d..c034ae4 100644 --- a/src/extractor.rs +++ b/src/extractor.rs @@ -1,8 +1,12 @@ +use async_std::fs::File; +use async_std::io::prelude::*; use kuchiki::{traits::*, ElementData, NodeDataRef, NodeRef}; +use url::Url; pub struct Extractor { pub root_node: NodeRef, content: Option>, + img_urls: Vec, } impl Extractor { @@ -10,6 +14,7 @@ impl Extractor { pub fn from_html(html_str: &str) -> Self { Extractor { content: None, + img_urls: Vec::new(), root_node: kuchiki::parse_html().one(html_str), } } @@ -59,7 +64,6 @@ impl Extractor { .unwrap_or("en".to_string()); let meta_attrs = MetaAttr::new(author, description, lang, tags, title); - dbg!(meta_attrs); // Extract the article @@ -74,16 +78,91 @@ impl Extractor { self.content = Some(article_ref); } - _ => (), + /// Traverses the DOM tree of the content and retrieves the IMG URLs + fn extract_img_urls(&mut self) { + if let Some(content_ref) = &self.content { + for img_ref in content_ref.as_node().select("img").unwrap() { + img_ref.as_node().as_element().map(|img_elem| { + img_elem.attributes.borrow().get("src").map(|img_url| { + if !img_url.is_empty() { + self.img_urls.push(img_url.to_string()) + } + }) + }); } } } + + pub async fn download_images(&mut self, article_origin: &Url) -> async_std::io::Result<()> { + self.extract_img_urls(); + for img_url in &self.img_urls { + dbg!(&self.img_urls); + let mut img_url = img_url.clone(); + + get_absolute_url(&mut img_url, article_origin); + + println!("Fetching {}", img_url); + let mut img_response = surf::get(&img_url).await.expect("Unable to retrieve file"); + let img_content: Vec = img_response.body_bytes().await.unwrap(); + let img_ext = img_response + .header("Content-Type") + .and_then(map_mime_type_to_ext) + .unwrap(); + + let mut img_file = File::create(format!("{}{}", hash_url(&img_url), &img_ext)).await?; + img_file.write_all(&img_content).await?; + println!("Image file downloaded successfully"); + + // Update img URLs + // self.content.as_ref().map(|content_ref| {}); + } + Ok(()) + } } fn extract_text_from_node(node: &NodeRef) -> Option { node.first_child() .map(|child_ref| child_ref.text_contents()) } +/// Utility for hashing URLs. This is used to help store files locally with unique values +fn hash_url(url: &str) -> String { + format!("{:x}", md5::compute(url.as_bytes())) +} + +/// Handles getting the extension from a given MIME type. The extension starts with a dot +fn map_mime_type_to_ext(mime_type: &str) -> Option { + mime_type + .split("/") + .last() + .map(|format| { + if format == ("svg+xml") { + return "svg"; + } else if format == "x-icon" { + "ico" + } else { + format + } + }) + .map(|format| String::from(".") + format) +} + +fn get_absolute_url(url: &mut String, request_url: &Url) { + if Url::parse(url).is_ok() { + } else if url.starts_with("/") { + *url = Url::parse(&format!( + "{}://{}", + request_url.scheme(), + request_url.host_str().unwrap() + )) + .unwrap() + .join(url) + .unwrap() + .into_string(); + } else { + *url = request_url.join(url).unwrap().into_string(); + } +} + #[derive(Debug)] pub struct MetaAttr { author: Option, @@ -133,7 +212,7 @@ mod test {

Starting out

Some Lorem Ipsum text here

Observe this picture

- Random image + Random image