From 4581f073309953c61c2e3eb041e6a6ab04f5fdab Mon Sep 17 00:00:00 2001 From: KOVACS Tamas Date: Sat, 8 May 2021 21:30:00 +0200 Subject: [PATCH 1/2] http.rs: extract process_img_response function --- src/http.rs | 124 ++++++++++++++++++++++++++-------------------------- 1 file changed, 63 insertions(+), 61 deletions(-) diff --git a/src/http.rs b/src/http.rs index bb457b1..b076cf5 100644 --- a/src/http.rs +++ b/src/http.rs @@ -72,6 +72,53 @@ pub async fn fetch_html(url: &str) -> Result { }) } +type ImgItem<'a> = (&'a str, String, Option); + +async fn process_img_response<'a>( + img_response: &mut surf::Response, + url: &'a str, +) -> Result, ImgError> { + let img_content: Vec = match img_response.body_bytes().await { + Ok(bytes) => bytes, + Err(e) => return Err(e.into()), + }; + let img_mime = img_response + .content_type() + .map(|mime| mime.essence().to_string()); + let img_ext = match img_response + .content_type() + .map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string()) + { + Some(mime_str) => mime_str, + None => return Err(ErrorKind::HTTPError("Image has no Content-Type".to_owned()).into()), + }; + + let mut img_path = std::env::temp_dir(); + img_path.push(format!("{}.{}", hash_url(url), &img_ext)); + let mut img_file = match File::create(&img_path).await { + Ok(file) => file, + Err(e) => return Err(e.into()), + }; + match img_file.write_all(&img_content).await { + Ok(_) => (), + Err(e) => return Err(e.into()), + } + + Ok(( + url, + img_path + .file_name() + .map(|os_str_name| { + os_str_name + .to_str() + .expect("Unable to get image file name") + .to_string() + }) + .unwrap(), + img_mime, + )) +} + pub async fn download_images( extractor: &mut Extractor, article_origin: &Url, @@ -102,53 +149,9 @@ pub async fn download_images( bar.set_message(format!("Downloading images [{}/{}]", img_idx + 1, img_count).as_str()); match req.await { Ok(mut img_response) => { - let process_response = async { - let img_content: Vec = match img_response.body_bytes().await { - Ok(bytes) => bytes, - Err(e) => return Err(e.into()), - }; - let img_mime = img_response - .content_type() - .map(|mime| mime.essence().to_string()); - let img_ext = match img_response - .content_type() - .map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string()) - { - Some(mime_str) => mime_str, - None => { - return Err(ErrorKind::HTTPError( - "Image has no Content-Type".to_owned(), - ) - .into()) - } - }; - - let mut img_path = std::env::temp_dir(); - img_path.push(format!("{}.{}", hash_url(&url), &img_ext)); - let mut img_file = match File::create(&img_path).await { - Ok(file) => file, - Err(e) => return Err(e.into()), - }; - match img_file.write_all(&img_content).await { - Ok(_) => (), - Err(e) => return Err(e.into()), - } - - Ok(( - url, - img_path - .file_name() - .map(|os_str_name| { - os_str_name - .to_str() - .expect("Unable to get image file name") - .to_string() - }) - .unwrap(), - img_mime, - )) - }; - process_response.await.map_err(|mut e: ImgError| { + let process_response = + process_img_response(&mut img_response, url.as_ref()).await; + process_response.map_err(|mut e: ImgError| { e.set_url(url); e }) @@ -162,20 +165,19 @@ pub async fn download_images( }); // A utility closure used when update the value of an image source after downloading is successful - let replace_existing_img_src = - |img_item: (&String, String, Option)| -> (String, Option) { - let (img_url, img_path, img_mime) = img_item; - let img_ref = extractor - .article() - .select_first(&format!("img[src='{}']", img_url)) - .expect("Image node does not exist"); - let mut img_node = img_ref.attributes.borrow_mut(); - *img_node.get_mut("src").unwrap() = img_path.clone(); - // srcset is removed because readers such as Foliate then fail to display - // the image already downloaded and stored in src - img_node.remove("srcset"); - (img_path, img_mime) - }; + let replace_existing_img_src = |img_item: ImgItem| -> (String, Option) { + let (img_url, img_path, img_mime) = img_item; + let img_ref = extractor + .article() + .select_first(&format!("img[src='{}']", img_url)) + .expect("Image node does not exist"); + let mut img_node = img_ref.attributes.borrow_mut(); + *img_node.get_mut("src").unwrap() = img_path.clone(); + // srcset is removed because readers such as Foliate then fail to display + // the image already downloaded and stored in src + img_node.remove("srcset"); + (img_path, img_mime) + }; let imgs_req_iter = stream::from_iter(imgs_req_iter) .buffered(10) From 8ec491ff06610bf017270d576f95e3e356423b9f Mon Sep 17 00:00:00 2001 From: KOVACS Tamas Date: Sun, 9 May 2021 13:55:26 +0200 Subject: [PATCH 2/2] http.rs: check response status for fetched images This patch checks if fetching an image resulted in a non-success status code. In case of non-success status, the response is discarded and an error is emitted. This relies on having 3xx codes already handled by surf's Redirect middleware, so we should see 4xx and 5xx codes here. Fixes hipstermojo/paperoni#11 --- src/http.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/http.rs b/src/http.rs index b076cf5..efd64b8 100644 --- a/src/http.rs +++ b/src/http.rs @@ -78,6 +78,13 @@ async fn process_img_response<'a>( img_response: &mut surf::Response, url: &'a str, ) -> Result, ImgError> { + if !img_response.status().is_success() { + let kind = ErrorKind::HTTPError(format!( + "Non-success HTTP status code ({})", + img_response.status() + )); + return Err(ImgError::with_kind(kind)); + } let img_content: Vec = match img_response.body_bytes().await { Ok(bytes) => bytes, Err(e) => return Err(e.into()),