Merge branch 'dev' of github.com:hipstermojo/paperoni into dev

2021-05-13 12:25:11 +03:00 · 2021-05-13 12:25:11 +03:00 · 5ccbe1a17a
commit 5ccbe1a17a
parent 538a65f6fd 102304544d
2 changed files with 79 additions and 63 deletions
--- a/src/http.rs
+++ b/src/http.rs
@ -72,6 +72,60 @@ pub async fn fetch_html(url: &str) -> Result<HTMLResource, PaperoniError> {
    })
 }
 /// A successfully saved image, as produced by `process_img_response`:
 /// `(source URL, name of the file written to the temp directory, MIME essence of the response)`.
 type ImgItem<'a> = (&'a str, String, Option<String>);
 /// Validates an image HTTP response and writes its body to a file in the system temp directory.
 ///
 /// The file is named `<hash_url(url)>.<ext>`, where `<ext>` is derived from the subtype of the
 /// response's Content-Type via `map_mime_subtype_to_ext`.
 ///
 /// # Returns
 /// `(url, file name, Some(MIME essence))` on success. The file name is relative to
 /// `std::env::temp_dir()`.
 ///
 /// # Errors
 /// * `ErrorKind::HTTPError` if the status code is not a success code.
 /// * `ErrorKind::HTTPError` if the response carries no Content-Type.
 /// * The converted underlying error if reading the body, creating the file or writing to it fails.
 async fn process_img_response<'a>(
    img_response: &mut surf::Response,
    url: &'a str,
 ) -> Result<ImgItem<'a>, ImgError> {
    if !img_response.status().is_success() {
        let kind = ErrorKind::HTTPError(format!(
            "Non-success HTTP status code ({})",
            img_response.status()
        ));
        return Err(ImgError::with_kind(kind));
    }
    let img_content: Vec<u8> = img_response.body_bytes().await?;
    // The MIME essence and the file extension both come from the same header, so read it once.
    let (img_mime, img_ext) = match img_response.content_type() {
        Some(mime) => (
            mime.essence().to_string(),
            map_mime_subtype_to_ext(mime.subtype()).to_string(),
        ),
        None => return Err(ErrorKind::HTTPError("Image has no Content-Type".to_owned()).into()),
    };
    // Keep the file name as a `String` from the start: it is what callers receive, and it avoids
    // a panicking `OsStr` -> `str` round trip through the joined path.
    let img_file_name = format!("{}.{}", hash_url(url), img_ext);
    let img_path = std::env::temp_dir().join(&img_file_name);
    let mut img_file = File::create(&img_path).await?;
    img_file.write_all(&img_content).await?;
    Ok((url, img_file_name, Some(img_mime)))
 }
 pub async fn download_images(
    extractor: &mut Extractor,
    article_origin: &Url,
@ -102,53 +156,9 @@ pub async fn download_images(
            bar.set_message(format!("Downloading images [{}/{}]", img_idx + 1, img_count).as_str());
            match req.await {
                Ok(mut img_response) => {
-                    let process_response = async {
+                    let process_response =
-                        let img_content: Vec<u8> = match img_response.body_bytes().await {
+                        process_img_response(&mut img_response, url.as_ref()).await;
-                            Ok(bytes) => bytes,
+                    process_response.map_err(|mut e: ImgError| {
                            Err(e) => return Err(e.into()),
                        };
                        let img_mime = img_response
                            .content_type()
                            .map(|mime| mime.essence().to_string());
                        let img_ext = match img_response
                            .content_type()
                            .map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
                        {
                            Some(mime_str) => mime_str,
                            None => {
                                return Err(ErrorKind::HTTPError(
                                    "Image has no Content-Type".to_owned(),
                                )
                                .into())
                            }
                        };
                        let mut img_path = std::env::temp_dir();
                        img_path.push(format!("{}.{}", hash_url(&url), &img_ext));
                        let mut img_file = match File::create(&img_path).await {
                            Ok(file) => file,
                            Err(e) => return Err(e.into()),
                        };
                        match img_file.write_all(&img_content).await {
                            Ok(_) => (),
                            Err(e) => return Err(e.into()),
                        }
                        Ok((
                            url,
                            img_path
                                .file_name()
                                .map(|os_str_name| {
                                    os_str_name
                                        .to_str()
                                        .expect("Unable to get image file name")
                                        .to_string()
                                })
                                .unwrap(),
                            img_mime,
                        ))
                    };
                    process_response.await.map_err(|mut e: ImgError| {
                        e.set_url(url);
                        e
                    })
@ -162,8 +172,7 @@ pub async fn download_images(
        });
    // A utility closure used to update the value of an image source after it has been downloaded successfully
-    let replace_existing_img_src =
+    let replace_existing_img_src = |img_item: ImgItem| -> (String, Option<String>) {
        |img_item: (&String, String, Option<String>)| -> (String, Option<String>) {
        let (img_url, img_path, img_mime) = img_item;
        let img_ref = extractor
            .article()
--- a/src/moz_readability/mod.rs
+++ b/src/moz_readability/mod.rs
@ -1248,8 +1248,7 @@ impl Readability {
            let srcset = node_attr.get("srcset");
            let class = node_attr.get("class");
            if (src.is_some() || srcset.is_some())
-                && class.is_some()
+                && class.and_then(|classname| classname.find("lazy")).is_none()
                && !class.unwrap().contains("lazy")
            {
                continue;
            }
@ -3160,6 +3159,7 @@ characters. For that reason, this <p> tag could not be a byline because it's too
                    <source media="(min-width:465px)" srcset="img_white_flower.jpg">
                    <img src="img_orange_flowers.jpg" alt="Flowers" style="width:auto;">
                </picture>
                <img id="no-lazy-class" src="https://image.url/" data-attrs="{&quot;src&quot;:&quot;https://other.url/1.png&quot;,&quot;alt&quot;:&quot;&quot;}"/>
            </body>
        </html>
        "#;
@ -3189,6 +3189,13 @@ characters. For that reason, this <p> tag could not be a byline because it's too
            lazy_loaded_attrs.get("data-src"),
            lazy_loaded_attrs.get("src")
        );
        let no_lazy_class = doc.root_node.select_first("#no-lazy-class").unwrap();
        let no_lazy_class_attrs = no_lazy_class.attributes.borrow();
        assert_eq!(
            no_lazy_class_attrs.get("src").unwrap(),
            "https://image.url/"
        );
    }
    #[test]