From 4581f073309953c61c2e3eb041e6a6ab04f5fdab Mon Sep 17 00:00:00 2001 From: KOVACS Tamas Date: Sat, 8 May 2021 21:30:00 +0200 Subject: [PATCH 1/4] http.rs: extract process_img_response function --- src/http.rs | 124 ++++++++++++++++++++++++++-------------------------- 1 file changed, 63 insertions(+), 61 deletions(-) diff --git a/src/http.rs b/src/http.rs index bb457b1..b076cf5 100644 --- a/src/http.rs +++ b/src/http.rs @@ -72,6 +72,53 @@ pub async fn fetch_html(url: &str) -> Result { }) } +type ImgItem<'a> = (&'a str, String, Option); + +async fn process_img_response<'a>( + img_response: &mut surf::Response, + url: &'a str, +) -> Result, ImgError> { + let img_content: Vec = match img_response.body_bytes().await { + Ok(bytes) => bytes, + Err(e) => return Err(e.into()), + }; + let img_mime = img_response + .content_type() + .map(|mime| mime.essence().to_string()); + let img_ext = match img_response + .content_type() + .map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string()) + { + Some(mime_str) => mime_str, + None => return Err(ErrorKind::HTTPError("Image has no Content-Type".to_owned()).into()), + }; + + let mut img_path = std::env::temp_dir(); + img_path.push(format!("{}.{}", hash_url(url), &img_ext)); + let mut img_file = match File::create(&img_path).await { + Ok(file) => file, + Err(e) => return Err(e.into()), + }; + match img_file.write_all(&img_content).await { + Ok(_) => (), + Err(e) => return Err(e.into()), + } + + Ok(( + url, + img_path + .file_name() + .map(|os_str_name| { + os_str_name + .to_str() + .expect("Unable to get image file name") + .to_string() + }) + .unwrap(), + img_mime, + )) +} + pub async fn download_images( extractor: &mut Extractor, article_origin: &Url, @@ -102,53 +149,9 @@ pub async fn download_images( bar.set_message(format!("Downloading images [{}/{}]", img_idx + 1, img_count).as_str()); match req.await { Ok(mut img_response) => { - let process_response = async { - let img_content: Vec = match img_response.body_bytes().await { - Ok(bytes) => bytes, - Err(e) => return Err(e.into()), - }; - let img_mime = img_response - .content_type() - .map(|mime| mime.essence().to_string()); - let img_ext = match img_response - .content_type() - .map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string()) - { - Some(mime_str) => mime_str, - None => { - return Err(ErrorKind::HTTPError( - "Image has no Content-Type".to_owned(), - ) - .into()) - } - }; - - let mut img_path = std::env::temp_dir(); - img_path.push(format!("{}.{}", hash_url(&url), &img_ext)); - let mut img_file = match File::create(&img_path).await { - Ok(file) => file, - Err(e) => return Err(e.into()), - }; - match img_file.write_all(&img_content).await { - Ok(_) => (), - Err(e) => return Err(e.into()), - } - - Ok(( - url, - img_path - .file_name() - .map(|os_str_name| { - os_str_name - .to_str() - .expect("Unable to get image file name") - .to_string() - }) - .unwrap(), - img_mime, - )) - }; - process_response.await.map_err(|mut e: ImgError| { + let process_response = + process_img_response(&mut img_response, url.as_ref()).await; + process_response.map_err(|mut e: ImgError| { e.set_url(url); e }) @@ -162,20 +165,19 @@ pub async fn download_images( }); // A utility closure used when update the value of an image source after downloading is successful - let replace_existing_img_src = - |img_item: (&String, String, Option)| -> (String, Option) { - let (img_url, img_path, img_mime) = img_item; - let img_ref = extractor - .article() - .select_first(&format!("img[src='{}']", img_url)) - .expect("Image node does not exist"); - let mut img_node = img_ref.attributes.borrow_mut(); - *img_node.get_mut("src").unwrap() = img_path.clone(); - // srcset is removed because readers such as Foliate then fail to display - // the image already downloaded and stored in src - img_node.remove("srcset"); - (img_path, img_mime) - }; + let replace_existing_img_src = |img_item: ImgItem| -> (String, Option) { + let (img_url, img_path, img_mime) = img_item; + let img_ref = extractor + .article() + .select_first(&format!("img[src='{}']", img_url)) + .expect("Image node does not exist"); + let mut img_node = img_ref.attributes.borrow_mut(); + *img_node.get_mut("src").unwrap() = img_path.clone(); + // srcset is removed because readers such as Foliate then fail to display + // the image already downloaded and stored in src + img_node.remove("srcset"); + (img_path, img_mime) + }; let imgs_req_iter = stream::from_iter(imgs_req_iter) .buffered(10) From 8ec491ff06610bf017270d576f95e3e356423b9f Mon Sep 17 00:00:00 2001 From: KOVACS Tamas Date: Sun, 9 May 2021 13:55:26 +0200 Subject: [PATCH 2/4] http.rs: check response status for fetched images This patch checks if fetching an image resulted in a non-success status code. In case of non-success status, the response is discarded and an error is emitted. This relies on having 3xx codes already handled by surf's Redirect middleware, so we should see 4xx and 5xx codes here. Fixes hipstermojo/paperoni#11 --- src/http.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/http.rs b/src/http.rs index b076cf5..efd64b8 100644 --- a/src/http.rs +++ b/src/http.rs @@ -78,6 +78,13 @@ async fn process_img_response<'a>( img_response: &mut surf::Response, url: &'a str, ) -> Result, ImgError> { + if !img_response.status().is_success() { + let kind = ErrorKind::HTTPError(format!( + "Non-success HTTP status code ({})", + img_response.status() + )); + return Err(ImgError::with_kind(kind)); + } let img_content: Vec = match img_response.body_bytes().await { Ok(bytes) => bytes, Err(e) => return Err(e.into()), From d50f08b875316e08a8713a0da86e32bbc9c2628b Mon Sep 17 00:00:00 2001 From: KOVACS Tamas Date: Mon, 10 May 2021 01:30:05 +0200 Subject: [PATCH 3/4] moz_readability/mod.rs: add testcase for issue #13 This patch adds a testcase for issue #13, where an img node without a class attribute is automatically assumed to be lazy and its src is replaced. --- src/moz_readability/mod.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/moz_readability/mod.rs b/src/moz_readability/mod.rs index 38236d3..7af37f7 100644 --- a/src/moz_readability/mod.rs +++ b/src/moz_readability/mod.rs @@ -3160,6 +3160,7 @@ characters. For that reason, this

tag could not be a byline because it's too Flowers + "#; @@ -3189,6 +3190,13 @@ characters. For that reason, this

tag could not be a byline because it's too lazy_loaded_attrs.get("data-src"), lazy_loaded_attrs.get("src") ); + + let no_lazy_class = doc.root_node.select_first("#no-lazy-class").unwrap(); + let no_lazy_class_attrs = no_lazy_class.attributes.borrow(); + assert_eq!( + no_lazy_class_attrs.get("src").unwrap(), + "https://image.url/" + ); } #[test] From 7649f6aa183bac6d1b579beb0a5cd16207005f75 Mon Sep 17 00:00:00 2001 From: KOVACS Tamas Date: Mon, 10 May 2021 01:33:12 +0200 Subject: [PATCH 4/4] moz_readability/mod.rs: fix laziness check in fix_lazy_images fix_lazy_images checks whether an img node is lazily loaded. An img is considered lazily loaded if it does not have an src/srcset attribute, or if it's class contains the 'lazy' string. If an img is considered lazy, fix_lazy_images will make attempts to replace it's src. However, if an img was missing the class attribute, it was incorrectly assumed to be lazy and had it's src replaced. Fixes hipstermojo/paperoni#13 --- src/moz_readability/mod.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/moz_readability/mod.rs b/src/moz_readability/mod.rs index 7af37f7..705fa55 100644 --- a/src/moz_readability/mod.rs +++ b/src/moz_readability/mod.rs @@ -1248,8 +1248,7 @@ impl Readability { let srcset = node_attr.get("srcset"); let class = node_attr.get("class"); if (src.is_some() || srcset.is_some()) - && class.is_some() - && !class.unwrap().contains("lazy") + && class.and_then(|classname| classname.find("lazy")).is_none() { continue; }