Update img tags with new src values to point to the local files
This commit is contained in:
parent
78ba40f57a
commit
e5a318282d
2 changed files with 21 additions and 7 deletions
|
@ -5,7 +5,7 @@ use url::Url;
|
||||||
|
|
||||||
pub struct Extractor {
|
pub struct Extractor {
|
||||||
pub root_node: NodeRef,
|
pub root_node: NodeRef,
|
||||||
content: Option<NodeDataRef<ElementData>>,
|
pub content: Option<NodeDataRef<ElementData>>,
|
||||||
img_urls: Vec<String>,
|
img_urls: Vec<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -96,7 +96,6 @@ impl Extractor {
|
||||||
pub async fn download_images(&mut self, article_origin: &Url) -> async_std::io::Result<()> {
|
pub async fn download_images(&mut self, article_origin: &Url) -> async_std::io::Result<()> {
|
||||||
self.extract_img_urls();
|
self.extract_img_urls();
|
||||||
for img_url in &self.img_urls {
|
for img_url in &self.img_urls {
|
||||||
dbg!(&self.img_urls);
|
|
||||||
let mut img_url = img_url.clone();
|
let mut img_url = img_url.clone();
|
||||||
|
|
||||||
get_absolute_url(&mut img_url, article_origin);
|
get_absolute_url(&mut img_url, article_origin);
|
||||||
|
@ -108,13 +107,21 @@ impl Extractor {
|
||||||
.header("Content-Type")
|
.header("Content-Type")
|
||||||
.and_then(map_mime_type_to_ext)
|
.and_then(map_mime_type_to_ext)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
let img_path = format!("{}{}", hash_url(&img_url), &img_ext);
|
||||||
|
|
||||||
let mut img_file = File::create(format!("{}{}", hash_url(&img_url), &img_ext)).await?;
|
let mut img_file = File::create(&img_path).await?;
|
||||||
img_file.write_all(&img_content).await?;
|
img_file.write_all(&img_content).await?;
|
||||||
println!("Image file downloaded successfully");
|
println!("Image file downloaded successfully");
|
||||||
|
|
||||||
// Update img URLs
|
let img_ref = self
|
||||||
// self.content.as_ref().map(|content_ref| {});
|
.content
|
||||||
|
.as_mut()
|
||||||
|
.expect("Unable to get mutable ref")
|
||||||
|
.as_node()
|
||||||
|
.select_first(&format!("img[src='{}']", img_url))
|
||||||
|
.expect("Image node does not exist");
|
||||||
|
let mut img_node = img_ref.attributes.borrow_mut();
|
||||||
|
*img_node.get_mut("src").unwrap() = img_path;
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
11
src/main.rs
11
src/main.rs
|
@ -16,14 +16,21 @@ fn main() {
|
||||||
"https://medium.com/typeforms-engineering-blog/the-beginners-guide-to-oauth-dancing-4b8f3666de10",
|
"https://medium.com/typeforms-engineering-blog/the-beginners-guide-to-oauth-dancing-4b8f3666de10",
|
||||||
"https://dev.to/steelwolf180/full-stack-development-in-django-3768"
|
"https://dev.to/steelwolf180/full-stack-development-in-django-3768"
|
||||||
];
|
];
|
||||||
let html = fetch_url(urls[3]).await;
|
let html = fetch_url(urls[5]).await;
|
||||||
let mut extractor = Extractor::from_html(&html);
|
let mut extractor = Extractor::from_html(&html);
|
||||||
println!("Extracting");
|
println!("Extracting");
|
||||||
extractor.extract_content();
|
extractor.extract_content();
|
||||||
extractor
|
extractor
|
||||||
.download_images(&Url::parse(urls[3]).unwrap())
|
.download_images(&Url::parse(urls[5]).unwrap())
|
||||||
.await
|
.await
|
||||||
.expect("Unable to download images");
|
.expect("Unable to download images");
|
||||||
|
let mut out_file = File::create("out.html").unwrap();
|
||||||
|
extractor
|
||||||
|
.content
|
||||||
|
.unwrap()
|
||||||
|
.as_node()
|
||||||
|
.serialize(&mut out_file)
|
||||||
|
.expect("Unable to serialize");
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Reference in a new issue