Change download code to save images to a folder
Add downloaded images to the output epub file
This commit is contained in:
parent
f02973157d
commit
271d3c8951
2 changed files with 34 additions and 18 deletions
|
@ -4,10 +4,12 @@ use async_std::task;
|
||||||
use kuchiki::{traits::*, ElementData, NodeDataRef, NodeRef};
|
use kuchiki::{traits::*, ElementData, NodeDataRef, NodeRef};
|
||||||
use url::Url;
|
use url::Url;
|
||||||
|
|
||||||
|
pub type ResourceInfo = (String, Option<String>);
|
||||||
|
|
||||||
pub struct Extractor {
|
pub struct Extractor {
|
||||||
pub root_node: NodeRef,
|
pub root_node: NodeRef,
|
||||||
pub content: Option<NodeDataRef<ElementData>>,
|
pub content: Option<NodeDataRef<ElementData>>,
|
||||||
img_urls: Vec<String>,
|
pub img_urls: Vec<ResourceInfo>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Extractor {
|
impl Extractor {
|
||||||
|
@ -86,7 +88,7 @@ impl Extractor {
|
||||||
img_ref.as_node().as_element().map(|img_elem| {
|
img_ref.as_node().as_element().map(|img_elem| {
|
||||||
img_elem.attributes.borrow().get("src").map(|img_url| {
|
img_elem.attributes.borrow().get("src").map(|img_url| {
|
||||||
if !img_url.is_empty() {
|
if !img_url.is_empty() {
|
||||||
self.img_urls.push(img_url.to_string())
|
self.img_urls.push((img_url.to_string(), None))
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
});
|
});
|
||||||
|
@ -100,20 +102,22 @@ impl Extractor {
|
||||||
) -> async_std::io::Result<()> {
|
) -> async_std::io::Result<()> {
|
||||||
let mut async_download_tasks = Vec::with_capacity(self.img_urls.len());
|
let mut async_download_tasks = Vec::with_capacity(self.img_urls.len());
|
||||||
self.extract_img_urls();
|
self.extract_img_urls();
|
||||||
|
println!("Downloading images to res/");
|
||||||
for img_url in &self.img_urls {
|
for img_url in &self.img_urls {
|
||||||
let mut img_url = img_url.clone();
|
let mut img_url = img_url.0.clone();
|
||||||
get_absolute_url(&mut img_url, article_origin);
|
get_absolute_url(&mut img_url, article_origin);
|
||||||
async_download_tasks.push(task::spawn(async {
|
async_download_tasks.push(task::spawn(async {
|
||||||
println!("Fetching {}", img_url);
|
|
||||||
let mut img_response = surf::get(&img_url).await.expect("Unable to retrieve file");
|
let mut img_response = surf::get(&img_url).await.expect("Unable to retrieve file");
|
||||||
let img_content: Vec<u8> = img_response.body_bytes().await.unwrap();
|
let img_content: Vec<u8> = img_response.body_bytes().await.unwrap();
|
||||||
|
let img_mime = img_response
|
||||||
|
.header("Content-Type")
|
||||||
|
.map(|header| header.to_string());
|
||||||
let img_ext = img_response
|
let img_ext = img_response
|
||||||
.header("Content-Type")
|
.header("Content-Type")
|
||||||
.and_then(map_mime_type_to_ext)
|
.and_then(map_mime_type_to_ext)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let img_path = format!("{}{}", hash_url(&img_url), &img_ext);
|
let img_path = format!("res/{}{}", hash_url(&img_url), &img_ext);
|
||||||
let mut img_file = File::create(&img_path)
|
let mut img_file = File::create(&img_path)
|
||||||
.await
|
.await
|
||||||
.expect("Unable to create file");
|
.expect("Unable to create file");
|
||||||
|
@ -121,13 +125,15 @@ impl Extractor {
|
||||||
.write_all(&img_content)
|
.write_all(&img_content)
|
||||||
.await
|
.await
|
||||||
.expect("Unable to save to file");
|
.expect("Unable to save to file");
|
||||||
println!("Image file downloaded successfully");
|
|
||||||
(img_url, img_path)
|
(img_url, img_path, img_mime)
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
self.img_urls.clear();
|
||||||
|
|
||||||
for async_task in async_download_tasks {
|
for async_task in async_download_tasks {
|
||||||
let (img_url, img_path) = async_task.await;
|
let (img_url, img_path, img_mime) = async_task.await;
|
||||||
// Update the image sources
|
// Update the image sources
|
||||||
let img_ref = self
|
let img_ref = self
|
||||||
.content
|
.content
|
||||||
|
@ -137,7 +143,8 @@ impl Extractor {
|
||||||
.select_first(&format!("img[src='{}']", img_url))
|
.select_first(&format!("img[src='{}']", img_url))
|
||||||
.expect("Image node does not exist");
|
.expect("Image node does not exist");
|
||||||
let mut img_node = img_ref.attributes.borrow_mut();
|
let mut img_node = img_ref.attributes.borrow_mut();
|
||||||
*img_node.get_mut("src").unwrap() = img_path;
|
*img_node.get_mut("src").unwrap() = img_path.clone();
|
||||||
|
self.img_urls.push((img_path, img_mime));
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
@ -324,7 +331,7 @@ mod test {
|
||||||
extractor.extract_img_urls();
|
extractor.extract_img_urls();
|
||||||
|
|
||||||
assert!(extractor.img_urls.len() > 0);
|
assert!(extractor.img_urls.len() > 0);
|
||||||
assert_eq!(vec!["/img.jpg"], extractor.img_urls);
|
assert_eq!(vec![("/img.jpg".to_string(), None)], extractor.img_urls);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|
23
src/main.rs
23
src/main.rs
|
@ -1,6 +1,6 @@
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
|
|
||||||
use async_std::task;
|
use async_std::{fs::create_dir, fs::remove_dir_all, task};
|
||||||
use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
|
use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
|
||||||
use url::Url;
|
use url::Url;
|
||||||
|
|
||||||
|
@ -17,10 +17,13 @@ fn main() {
|
||||||
"https://medium.com/typeforms-engineering-blog/the-beginners-guide-to-oauth-dancing-4b8f3666de10",
|
"https://medium.com/typeforms-engineering-blog/the-beginners-guide-to-oauth-dancing-4b8f3666de10",
|
||||||
"https://dev.to/steelwolf180/full-stack-development-in-django-3768"
|
"https://dev.to/steelwolf180/full-stack-development-in-django-3768"
|
||||||
];
|
];
|
||||||
let html = fetch_url(urls[5]).await;
|
let html = fetch_url(urls[4]).await;
|
||||||
let mut extractor = Extractor::from_html(&html);
|
let mut extractor = Extractor::from_html(&html);
|
||||||
println!("Extracting");
|
println!("Extracting");
|
||||||
extractor.extract_content();
|
extractor.extract_content();
|
||||||
|
create_dir("res/")
|
||||||
|
.await
|
||||||
|
.expect("Unable to create res/ output folder");
|
||||||
extractor
|
extractor
|
||||||
.download_images(&Url::parse(urls[5]).unwrap())
|
.download_images(&Url::parse(urls[5]).unwrap())
|
||||||
.await
|
.await
|
||||||
|
@ -34,12 +37,18 @@ fn main() {
|
||||||
.serialize(&mut html_buf)
|
.serialize(&mut html_buf)
|
||||||
.expect("Unable to serialize");
|
.expect("Unable to serialize");
|
||||||
let html_buf = std::str::from_utf8(&html_buf).unwrap();
|
let html_buf = std::str::from_utf8(&html_buf).unwrap();
|
||||||
EpubBuilder::new(ZipLibrary::new().unwrap())
|
let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
|
||||||
.unwrap()
|
epub.add_content(EpubContent::new("code.xhtml", html_buf.as_bytes()))
|
||||||
.add_content(EpubContent::new("code.xhtml", html_buf.as_bytes()))
|
|
||||||
.unwrap()
|
|
||||||
.generate(&mut out_file)
|
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
for img in extractor.img_urls {
|
||||||
|
let file_path = format!("{}", &img.0);
|
||||||
|
|
||||||
|
let img_buf = File::open(file_path).expect("Can't read file");
|
||||||
|
epub.add_resource(img.0, img_buf, img.1.unwrap()).unwrap();
|
||||||
|
}
|
||||||
|
epub.generate(&mut out_file).unwrap();
|
||||||
|
println!("Cleaning up");
|
||||||
|
remove_dir_all("res/").await.unwrap();
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Reference in a new issue