Update img tags with new src values to point to the local files

This commit is contained in:
Kenneth Gitere 2020-05-02 19:06:03 +03:00
parent 78ba40f57a
commit e5a318282d
2 changed files with 21 additions and 7 deletions

View file

@@ -5,7 +5,7 @@ use url::Url;
pub struct Extractor { pub struct Extractor {
pub root_node: NodeRef, pub root_node: NodeRef,
content: Option<NodeDataRef<ElementData>>, pub content: Option<NodeDataRef<ElementData>>,
img_urls: Vec<String>, img_urls: Vec<String>,
} }
@@ -96,7 +96,6 @@ impl Extractor {
pub async fn download_images(&mut self, article_origin: &Url) -> async_std::io::Result<()> { pub async fn download_images(&mut self, article_origin: &Url) -> async_std::io::Result<()> {
self.extract_img_urls(); self.extract_img_urls();
for img_url in &self.img_urls { for img_url in &self.img_urls {
dbg!(&self.img_urls);
let mut img_url = img_url.clone(); let mut img_url = img_url.clone();
get_absolute_url(&mut img_url, article_origin); get_absolute_url(&mut img_url, article_origin);
@@ -108,13 +107,21 @@ impl Extractor {
.header("Content-Type") .header("Content-Type")
.and_then(map_mime_type_to_ext) .and_then(map_mime_type_to_ext)
.unwrap(); .unwrap();
let img_path = format!("{}{}", hash_url(&img_url), &img_ext);
let mut img_file = File::create(format!("{}{}", hash_url(&img_url), &img_ext)).await?; let mut img_file = File::create(&img_path).await?;
img_file.write_all(&img_content).await?; img_file.write_all(&img_content).await?;
println!("Image file downloaded successfully"); println!("Image file downloaded successfully");
// Update img URLs let img_ref = self
// self.content.as_ref().map(|content_ref| {}); .content
.as_mut()
.expect("Unable to get mutable ref")
.as_node()
.select_first(&format!("img[src='{}']", img_url))
.expect("Image node does not exist");
let mut img_node = img_ref.attributes.borrow_mut();
*img_node.get_mut("src").unwrap() = img_path;
} }
Ok(()) Ok(())
} }

View file

@@ -16,14 +16,21 @@ fn main() {
"https://medium.com/typeforms-engineering-blog/the-beginners-guide-to-oauth-dancing-4b8f3666de10", "https://medium.com/typeforms-engineering-blog/the-beginners-guide-to-oauth-dancing-4b8f3666de10",
"https://dev.to/steelwolf180/full-stack-development-in-django-3768" "https://dev.to/steelwolf180/full-stack-development-in-django-3768"
]; ];
let html = fetch_url(urls[3]).await; let html = fetch_url(urls[5]).await;
let mut extractor = Extractor::from_html(&html); let mut extractor = Extractor::from_html(&html);
println!("Extracting"); println!("Extracting");
extractor.extract_content(); extractor.extract_content();
extractor extractor
.download_images(&Url::parse(urls[3]).unwrap()) .download_images(&Url::parse(urls[5]).unwrap())
.await .await
.expect("Unable to download images"); .expect("Unable to download images");
let mut out_file = File::create("out.html").unwrap();
extractor
.content
.unwrap()
.as_node()
.serialize(&mut out_file)
.expect("Unable to serialize");
}); });
} }