Refactor to use temp directory and update surf

Change from using the res/ directory for image downloads to using the system temp directory.
Update surf to v2, which required changing the way Content-Type headers are
read.
This commit is contained in:
Kenneth Gitere 2020-11-23 09:39:56 +03:00
parent ab800d0174
commit ef3efdba81
4 changed files with 1186 additions and 838 deletions

1933
Cargo.lock generated

File diff suppressed because it is too large Load diff

View file

@ -19,6 +19,6 @@ kuchiki = "0.8.1"
lazy_static = "1.3.9" lazy_static = "1.3.9"
md5 = "0.7.0" md5 = "0.7.0"
regex = "1.3.9" regex = "1.3.9"
surf = "1.0.3" surf = "2.1.0"
structopt = { version = "0.3" } structopt = { version = "0.3" }
url = "2.1.1" url = "2.1.1"

View file

@ -62,22 +62,23 @@ impl Extractor {
pub async fn download_images(&mut self, article_origin: &Url) -> async_std::io::Result<()> { pub async fn download_images(&mut self, article_origin: &Url) -> async_std::io::Result<()> {
let mut async_download_tasks = Vec::with_capacity(self.img_urls.len()); let mut async_download_tasks = Vec::with_capacity(self.img_urls.len());
self.extract_img_urls(); self.extract_img_urls();
println!("Downloading images to res/"); println!("Downloading images...");
for img_url in &self.img_urls { for img_url in &self.img_urls {
let img_url = img_url.0.clone(); let img_url = img_url.0.clone();
let abs_url = get_absolute_url(&img_url, article_origin); let abs_url = get_absolute_url(&img_url, article_origin);
async_download_tasks.push(task::spawn(async move { async_download_tasks.push(task::spawn(async move {
let mut img_response = surf::get(&abs_url).await.expect("Unable to retrieve file"); let mut img_response = surf::get(&abs_url).await.expect("Unable to retrieve file");
let img_content: Vec<u8> = img_response.body_bytes().await.unwrap(); let img_content: Vec<u8> = img_response.body_bytes().await.unwrap();
let img_mime = img_response let img_mime = img_response
.header("Content-Type") .content_type()
.map(|header| header.to_string()); .map(|mime| mime.essence().to_string());
let img_ext = img_response let img_ext = img_response
.header("Content-Type") .content_type()
.and_then(map_mime_type_to_ext) .map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
.unwrap(); .unwrap();
let mut img_path = std::env::temp_dir();
let img_path = format!("res/{}{}", hash_url(&abs_url), &img_ext); img_path.push(format!("{}.{}", hash_url(&abs_url), &img_ext));
let mut img_file = File::create(&img_path) let mut img_file = File::create(&img_path)
.await .await
.expect("Unable to create file"); .expect("Unable to create file");
@ -86,7 +87,19 @@ impl Extractor {
.await .await
.expect("Unable to save to file"); .expect("Unable to save to file");
(img_url, img_path, img_mime) (
img_url,
img_path
.file_name()
.map(|os_str_name| {
os_str_name
.to_str()
.expect("Unable to get image file name")
.to_string()
})
.unwrap(),
img_mime,
)
})); }));
} }
@ -123,21 +136,15 @@ fn hash_url(url: &str) -> String {
format!("{:x}", md5::compute(url.as_bytes())) format!("{:x}", md5::compute(url.as_bytes()))
} }
/// Handles getting the extension from a given MIME type. The extension starts with a dot /// Handles getting the extension from a given MIME subtype.
fn map_mime_type_to_ext(mime_type: &str) -> Option<String> { fn map_mime_subtype_to_ext(subtype: &str) -> &str {
mime_type if subtype == ("svg+xml") {
.split("/")
.last()
.map(|format| {
if format == ("svg+xml") {
return "svg"; return "svg";
} else if format == "x-icon" { } else if subtype == "x-icon" {
"ico" "ico"
} else { } else {
format subtype
} }
})
.map(|format| String::from(".") + format)
} }
fn get_absolute_url(url: &str, request_url: &Url) -> String { fn get_absolute_url(url: &str, request_url: &Url) -> String {
@ -204,23 +211,15 @@ mod test {
#[test] #[test]
fn test_map_mime_type_to_ext() { fn test_map_mime_type_to_ext() {
let mime_types = vec![ let mime_subtypes = vec![
"image/apng", "apng", "bmp", "gif", "x-icon", "jpeg", "png", "svg+xml", "tiff", "webp",
"image/bmp",
"image/gif",
"image/x-icon",
"image/jpeg",
"image/png",
"image/svg+xml",
"image/tiff",
"image/webp",
]; ];
let exts = mime_types let exts = mime_subtypes
.into_iter() .into_iter()
.map(|mime_type| map_mime_type_to_ext(mime_type).unwrap()) .map(|mime_type| map_mime_subtype_to_ext(mime_type))
.collect::<Vec<_>>(); .collect::<Vec<_>>();
assert_eq!( assert_eq!(
vec![".apng", ".bmp", ".gif", ".ico", ".jpeg", ".png", ".svg", ".tiff", ".webp"], vec!["apng", "bmp", "gif", "ico", "jpeg", "png", "svg", "tiff", "webp"],
exts exts
); );
} }

View file

@ -2,9 +2,8 @@
extern crate lazy_static; extern crate lazy_static;
use std::fs::File; use std::fs::File;
use std::path::Path;
use async_std::{fs::create_dir, fs::remove_dir_all, task}; use async_std::task;
use epub_builder::{EpubBuilder, EpubContent, ZipLibrary}; use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
use structopt::StructOpt; use structopt::StructOpt;
use url::Url; use url::Url;
@ -50,11 +49,6 @@ fn download(urls: Vec<String>) {
let mut extractor = Extractor::from_html(&html); let mut extractor = Extractor::from_html(&html);
extractor.extract_content(&url); extractor.extract_content(&url);
if extractor.article().is_some() { if extractor.article().is_some() {
if !Path::new("res/").exists() {
create_dir("res/")
.await
.expect("Unable to create res/ output folder");
}
extractor extractor
.download_images(&Url::parse(&url).unwrap()) .download_images(&Url::parse(&url).unwrap())
.await .await
@ -79,14 +73,14 @@ fn download(urls: Vec<String>) {
epub.add_content(EpubContent::new("code.xhtml", html_buf.as_bytes())) epub.add_content(EpubContent::new("code.xhtml", html_buf.as_bytes()))
.unwrap(); .unwrap();
for img in extractor.img_urls { for img in extractor.img_urls {
let file_path = format!("{}", &img.0); let mut file_path = std::env::temp_dir();
file_path.push(&img.0);
let img_buf = File::open(file_path).expect("Can't read file"); let img_buf = File::open(&file_path).expect("Can't read file");
epub.add_resource(img.0, img_buf, img.1.unwrap()).unwrap(); epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap())
.unwrap();
} }
epub.generate(&mut out_file).unwrap(); epub.generate(&mut out_file).unwrap();
println!("Cleaning up");
remove_dir_all("res/").await.unwrap();
println!("Created {:?}", file_name); println!("Created {:?}", file_name);
} }
} }