Refactor to use temp directory and update surf
Change from using the res directory for image downloads to using the system temp directory. Update surf to v2, which required changing the way Content-Type headers are read.
This commit is contained in:
parent
ab800d0174
commit
ef3efdba81
4 changed files with 1186 additions and 838 deletions
1933
Cargo.lock
generated
1933
Cargo.lock
generated
File diff suppressed because it is too large
Load diff
|
@ -19,6 +19,6 @@ kuchiki = "0.8.1"
|
||||||
lazy_static = "1.3.9"
|
lazy_static = "1.3.9"
|
||||||
md5 = "0.7.0"
|
md5 = "0.7.0"
|
||||||
regex = "1.3.9"
|
regex = "1.3.9"
|
||||||
surf = "1.0.3"
|
surf = "2.1.0"
|
||||||
structopt = { version = "0.3" }
|
structopt = { version = "0.3" }
|
||||||
url = "2.1.1"
|
url = "2.1.1"
|
|
@ -62,22 +62,23 @@ impl Extractor {
|
||||||
pub async fn download_images(&mut self, article_origin: &Url) -> async_std::io::Result<()> {
|
pub async fn download_images(&mut self, article_origin: &Url) -> async_std::io::Result<()> {
|
||||||
let mut async_download_tasks = Vec::with_capacity(self.img_urls.len());
|
let mut async_download_tasks = Vec::with_capacity(self.img_urls.len());
|
||||||
self.extract_img_urls();
|
self.extract_img_urls();
|
||||||
println!("Downloading images to res/");
|
println!("Downloading images...");
|
||||||
for img_url in &self.img_urls {
|
for img_url in &self.img_urls {
|
||||||
let img_url = img_url.0.clone();
|
let img_url = img_url.0.clone();
|
||||||
let abs_url = get_absolute_url(&img_url, article_origin);
|
let abs_url = get_absolute_url(&img_url, article_origin);
|
||||||
|
|
||||||
async_download_tasks.push(task::spawn(async move {
|
async_download_tasks.push(task::spawn(async move {
|
||||||
let mut img_response = surf::get(&abs_url).await.expect("Unable to retrieve file");
|
let mut img_response = surf::get(&abs_url).await.expect("Unable to retrieve file");
|
||||||
let img_content: Vec<u8> = img_response.body_bytes().await.unwrap();
|
let img_content: Vec<u8> = img_response.body_bytes().await.unwrap();
|
||||||
let img_mime = img_response
|
let img_mime = img_response
|
||||||
.header("Content-Type")
|
.content_type()
|
||||||
.map(|header| header.to_string());
|
.map(|mime| mime.essence().to_string());
|
||||||
let img_ext = img_response
|
let img_ext = img_response
|
||||||
.header("Content-Type")
|
.content_type()
|
||||||
.and_then(map_mime_type_to_ext)
|
.map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
let mut img_path = std::env::temp_dir();
|
||||||
let img_path = format!("res/{}{}", hash_url(&abs_url), &img_ext);
|
img_path.push(format!("{}.{}", hash_url(&abs_url), &img_ext));
|
||||||
let mut img_file = File::create(&img_path)
|
let mut img_file = File::create(&img_path)
|
||||||
.await
|
.await
|
||||||
.expect("Unable to create file");
|
.expect("Unable to create file");
|
||||||
|
@ -86,7 +87,19 @@ impl Extractor {
|
||||||
.await
|
.await
|
||||||
.expect("Unable to save to file");
|
.expect("Unable to save to file");
|
||||||
|
|
||||||
(img_url, img_path, img_mime)
|
(
|
||||||
|
img_url,
|
||||||
|
img_path
|
||||||
|
.file_name()
|
||||||
|
.map(|os_str_name| {
|
||||||
|
os_str_name
|
||||||
|
.to_str()
|
||||||
|
.expect("Unable to get image file name")
|
||||||
|
.to_string()
|
||||||
|
})
|
||||||
|
.unwrap(),
|
||||||
|
img_mime,
|
||||||
|
)
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -123,21 +136,15 @@ fn hash_url(url: &str) -> String {
|
||||||
format!("{:x}", md5::compute(url.as_bytes()))
|
format!("{:x}", md5::compute(url.as_bytes()))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Handles getting the extension from a given MIME type. The extension starts with a dot
|
/// Handles getting the extension from a given MIME subtype.
|
||||||
fn map_mime_type_to_ext(mime_type: &str) -> Option<String> {
|
fn map_mime_subtype_to_ext(subtype: &str) -> &str {
|
||||||
mime_type
|
if subtype == ("svg+xml") {
|
||||||
.split("/")
|
return "svg";
|
||||||
.last()
|
} else if subtype == "x-icon" {
|
||||||
.map(|format| {
|
"ico"
|
||||||
if format == ("svg+xml") {
|
} else {
|
||||||
return "svg";
|
subtype
|
||||||
} else if format == "x-icon" {
|
}
|
||||||
"ico"
|
|
||||||
} else {
|
|
||||||
format
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.map(|format| String::from(".") + format)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_absolute_url(url: &str, request_url: &Url) -> String {
|
fn get_absolute_url(url: &str, request_url: &Url) -> String {
|
||||||
|
@ -204,23 +211,15 @@ mod test {
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_map_mime_type_to_ext() {
|
fn test_map_mime_type_to_ext() {
|
||||||
let mime_types = vec![
|
let mime_subtypes = vec![
|
||||||
"image/apng",
|
"apng", "bmp", "gif", "x-icon", "jpeg", "png", "svg+xml", "tiff", "webp",
|
||||||
"image/bmp",
|
|
||||||
"image/gif",
|
|
||||||
"image/x-icon",
|
|
||||||
"image/jpeg",
|
|
||||||
"image/png",
|
|
||||||
"image/svg+xml",
|
|
||||||
"image/tiff",
|
|
||||||
"image/webp",
|
|
||||||
];
|
];
|
||||||
let exts = mime_types
|
let exts = mime_subtypes
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|mime_type| map_mime_type_to_ext(mime_type).unwrap())
|
.map(|mime_type| map_mime_subtype_to_ext(mime_type))
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
vec![".apng", ".bmp", ".gif", ".ico", ".jpeg", ".png", ".svg", ".tiff", ".webp"],
|
vec!["apng", "bmp", "gif", "ico", "jpeg", "png", "svg", "tiff", "webp"],
|
||||||
exts
|
exts
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
18
src/main.rs
18
src/main.rs
|
@ -2,9 +2,8 @@
|
||||||
extern crate lazy_static;
|
extern crate lazy_static;
|
||||||
|
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::path::Path;
|
|
||||||
|
|
||||||
use async_std::{fs::create_dir, fs::remove_dir_all, task};
|
use async_std::task;
|
||||||
use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
|
use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
|
||||||
use structopt::StructOpt;
|
use structopt::StructOpt;
|
||||||
use url::Url;
|
use url::Url;
|
||||||
|
@ -50,11 +49,6 @@ fn download(urls: Vec<String>) {
|
||||||
let mut extractor = Extractor::from_html(&html);
|
let mut extractor = Extractor::from_html(&html);
|
||||||
extractor.extract_content(&url);
|
extractor.extract_content(&url);
|
||||||
if extractor.article().is_some() {
|
if extractor.article().is_some() {
|
||||||
if !Path::new("res/").exists() {
|
|
||||||
create_dir("res/")
|
|
||||||
.await
|
|
||||||
.expect("Unable to create res/ output folder");
|
|
||||||
}
|
|
||||||
extractor
|
extractor
|
||||||
.download_images(&Url::parse(&url).unwrap())
|
.download_images(&Url::parse(&url).unwrap())
|
||||||
.await
|
.await
|
||||||
|
@ -79,14 +73,14 @@ fn download(urls: Vec<String>) {
|
||||||
epub.add_content(EpubContent::new("code.xhtml", html_buf.as_bytes()))
|
epub.add_content(EpubContent::new("code.xhtml", html_buf.as_bytes()))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
for img in extractor.img_urls {
|
for img in extractor.img_urls {
|
||||||
let file_path = format!("{}", &img.0);
|
let mut file_path = std::env::temp_dir();
|
||||||
|
file_path.push(&img.0);
|
||||||
|
|
||||||
let img_buf = File::open(file_path).expect("Can't read file");
|
let img_buf = File::open(&file_path).expect("Can't read file");
|
||||||
epub.add_resource(img.0, img_buf, img.1.unwrap()).unwrap();
|
epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap())
|
||||||
|
.unwrap();
|
||||||
}
|
}
|
||||||
epub.generate(&mut out_file).unwrap();
|
epub.generate(&mut out_file).unwrap();
|
||||||
println!("Cleaning up");
|
|
||||||
remove_dir_all("res/").await.unwrap();
|
|
||||||
println!("Created {:?}", file_name);
|
println!("Created {:?}", file_name);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Reference in a new issue