paperoni/src/http.rs

use async_std::io::prelude::*;
use async_std::{fs::File, stream};
use futures::StreamExt;
use indicatif::ProgressBar;
use log::{debug, info};
use url::Url;

use crate::errors::{ErrorKind, ImgError, PaperoniError};
use crate::extractor::Extractor;
/// The successful result of [`fetch_html`]: the URL the document was finally fetched from
/// (after any redirects), followed by the response body as a string.
type HTMLResource = (String, String);

/// Fetches the HTML document at `url`, following redirects manually.
///
/// At most five requests are issued. Every 3xx response consumes one of them and its
/// `Location` header decides the next URL to request. On success the returned tuple holds
/// the URL that finally served the document and the response body.
///
/// # Errors
///
/// Returns a [`PaperoniError`] whose article source is set to the `url` argument when:
/// - `url` or a redirect target cannot be parsed,
/// - sending a request or reading the body fails,
/// - a redirect response carries no `Location` header,
/// - the final response is not a success status,
/// - the final response has no `Content-Type` or one other than `text/html`,
/// - five requests were made without reaching a non-redirect response.
pub async fn fetch_html(url: &str) -> Result<HTMLResource, PaperoniError> {
    let client = surf::Client::new();
    debug!("Fetching {}", url);

    let process_request = async {
        let mut redirect_count: u8 = 0;
        // Shadows the `&str` argument: from here on `url` is the URL of the current request.
        let mut url = Url::parse(url)?;
        while redirect_count < 5 {
            redirect_count += 1;
            let req = surf::get(&url);
            let mut res = client.send(req).await?;
            if res.status().is_redirection() {
                // Without a Location header there is nowhere to go; repeating the same
                // request would only burn the remaining attempts.
                let location = match res.header(surf::http::headers::LOCATION) {
                    Some(values) => values.last().as_str(),
                    None => {
                        let msg = format!(
                            "Redirect (HTTP {}) without a Location header",
                            res.status()
                        );
                        return Err(ErrorKind::HTTPError(msg).into());
                    }
                };
                let next_url = match Url::parse(location) {
                    Ok(absolute_url) => absolute_url,
                    // A relative Location is resolved against the URL that produced this
                    // response, not against the URL the caller originally asked for.
                    Err(url::ParseError::RelativeUrlWithoutBase) => match url.join(location) {
                        Ok(joined_url) => joined_url,
                        Err(e) => return Err(e.into()),
                    },
                    Err(e) => return Err(e.into()),
                };
                info!("Redirecting {} to {}", url, next_url);
                url = next_url;
            } else if res.status().is_success() {
                if let Some(mime) = res.content_type() {
                    if mime.essence() == "text/html" {
                        debug!("Successfully fetched {}", url);
                        return Ok((url.to_string(), res.body_string().await?));
                    } else {
                        let msg = format!(
                            "Invalid HTTP response. Received {} instead of text/html",
                            mime.essence()
                        );

                        return Err(ErrorKind::HTTPError(msg).into());
                    }
                } else {
                    return Err(ErrorKind::HTTPError("Unknown HTTP response".to_owned()).into());
                }
            } else {
                let msg = format!("Request failed: HTTP {}", res.status());
                return Err(ErrorKind::HTTPError(msg).into());
            }
        }
        // Every allowed request ended in another redirect.
        Err(ErrorKind::HTTPError("Unable to fetch HTML".to_owned()).into())
    };

    // Tag any failure with the URL the caller asked for so it can be reported per article.
    process_request.await.map_err(|mut error: PaperoniError| {
        error.set_article_source(url);
        error
    })
}

/// A successfully stored image as produced by `process_img_response`: the image URL exactly as
/// it was passed in, the file name (not the full path) of the copy written to the temporary
/// directory, and the MIME essence taken from the response's `Content-Type`.
type ImgItem<'a> = (&'a str, String, Option<String>);

/// Validates an image response and stores its body in the system temporary directory.
///
/// The file is named `<md5 of url>.<extension>`, where the extension is derived from the
/// subtype of the response's `Content-Type`.
///
/// Returns `url` unchanged, the file name of the stored image and its MIME essence.
///
/// # Errors
///
/// Returns an [`ImgError`] when the status is not a success code, when the response has no
/// `Content-Type`, when the body cannot be read, or when creating, writing or flushing the
/// file fails.
async fn process_img_response<'a>(
    img_response: &mut surf::Response,
    url: &'a str,
) -> Result<ImgItem<'a>, ImgError> {
    if !img_response.status().is_success() {
        let kind = ErrorKind::HTTPError(format!(
            "Non-success HTTP status code ({})",
            img_response.status()
        ));
        return Err(ImgError::with_kind(kind));
    }

    // Check the Content-Type first: a response that is going to be rejected for lacking one
    // should not have its body downloaded beforehand.
    let mime = match img_response.content_type() {
        Some(mime) => mime,
        None => return Err(ErrorKind::HTTPError("Image has no Content-Type".to_owned()).into()),
    };
    let img_mime = mime.essence().to_string();
    let img_file_name = format!(
        "{}.{}",
        hash_url(url),
        map_mime_subtype_to_ext(mime.subtype())
    );

    let img_content: Vec<u8> = match img_response.body_bytes().await {
        Ok(bytes) => bytes,
        Err(e) => return Err(e.into()),
    };

    let mut img_path = std::env::temp_dir();
    img_path.push(&img_file_name);
    let mut img_file = match File::create(&img_path).await {
        Ok(file) => file,
        Err(e) => return Err(e.into()),
    };
    match img_file.write_all(&img_content).await {
        Ok(_) => (),
        Err(e) => return Err(e.into()),
    }
    // Flush explicitly so that a failed write is reported here instead of being discarded
    // when the file handle is dropped.
    match img_file.flush().await {
        Ok(_) => (),
        Err(e) => return Err(e.into()),
    }

    Ok((url, img_file_name, Some(img_mime)))
}

/// Downloads the images listed in `extractor.img_urls` and points the article at the copies.
///
/// Image URLs are resolved against `article_origin` and fetched with up to ten requests in
/// flight at a time. `bar` receives a "Downloading images [i/n]" message as each download
/// starts. For every image that is stored successfully, the matching `<img>` element in the
/// article gets its `src` replaced by the stored file name and its `srcset` removed.
///
/// Afterwards `extractor.img_urls` holds one `(file name, MIME type)` entry per successful
/// download; images that failed are left out of the list and keep their original `src`.
///
/// # Errors
///
/// Returns the collected [`ImgError`]s, each tagged with the image URL, if at least one
/// image could not be downloaded or stored. Successful images are still applied.
///
/// # Panics
///
/// Panics if an image URL cannot be resolved by `get_absolute_url`, or if no `<img>` element
/// with the expected `src` attribute can be selected in the article.
pub async fn download_images(
    extractor: &mut Extractor,
    article_origin: &Url,
    bar: &ProgressBar,
) -> Result<(), Vec<ImgError>> {
    let img_count = extractor.img_urls.len();
    if img_count > 0 {
        debug!("Downloading {} images for {}", img_count, article_origin);
    }

    // A single client (with redirect handling) is shared by every image request instead of
    // building a fresh one per image.
    let client = surf::Client::new().with(surf::middleware::Redirect::default());

    let imgs_req_iter = extractor
        .img_urls
        .iter()
        .map(|(url, _)| (url, client.get(get_absolute_url(url, article_origin))))
        .enumerate()
        .map(|(img_idx, (url, req))| async move {
            bar.set_message(format!("Downloading images [{}/{}]", img_idx + 1, img_count).as_str());
            match req.await {
                Ok(mut img_response) => {
                    let process_response =
                        process_img_response(&mut img_response, url.as_ref()).await;
                    process_response.map_err(|mut e: ImgError| {
                        e.set_url(url);
                        e
                    })
                }
                Err(e) => {
                    let mut img_err: ImgError = e.into();
                    img_err.set_url(url);
                    Err(img_err)
                }
            }
        });

    // A utility closure used to update the source of an image once it has been downloaded
    // successfully
    let replace_existing_img_src = |img_item: ImgItem| -> (String, Option<String>) {
        let (img_url, img_path, img_mime) = img_item;
        let img_ref = extractor
            .article()
            .select_first(&format!("img[src='{}']", img_url))
            .expect("Image node does not exist");
        let mut img_node = img_ref.attributes.borrow_mut();
        *img_node.get_mut("src").unwrap() = img_path.clone();
        // srcset is removed because readers such as Foliate then fail to display
        // the image already downloaded and stored in src
        img_node.remove("srcset");
        (img_path, img_mime)
    };

    // Run the downloads with bounded concurrency; results come back in request order.
    let imgs_req_iter = stream::from_iter(imgs_req_iter)
        .buffered(10)
        .collect::<Vec<Result<_, ImgError>>>()
        .await;
    let mut errors = Vec::new();
    let mut replaced_imgs = Vec::new();
    for img_req_result in imgs_req_iter {
        match img_req_result {
            Ok(img_req) => replaced_imgs.push(replace_existing_img_src(img_req)),
            Err(e) => errors.push(e),
        }
    }
    extractor.img_urls = replaced_imgs;
    if errors.is_empty() {
        Ok(())
    } else {
        Err(errors)
    }
}

/// Translates a MIME subtype into the file extension used for stored images.
///
/// `svg+xml` becomes `svg` and `x-icon` becomes `ico`; any other subtype is returned
/// unchanged and used as the extension directly.
fn map_mime_subtype_to_ext(subtype: &str) -> &str {
    match subtype {
        "svg+xml" => "svg",
        "x-icon" => "ico",
        other => other,
    }
}

/// Returns the MD5 digest of `url` formatted as lowercase hexadecimal.
///
/// The digest serves as the base name of locally stored files, so that files for different
/// URLs get distinct names.
fn hash_url(url: &str) -> String {
    let digest = md5::compute(url.as_bytes());
    format!("{:x}", digest)
}

/// Resolves an image URL found in an article into an absolute URL string.
///
/// If `url` already parses as an absolute URL it is returned unchanged. Otherwise it is
/// resolved as a reference relative to `request_url`. Root-relative (`/img.png`),
/// scheme-relative (`//host/img.png`) and path-relative (`img.png`) references are all
/// handled by [`Url::join`], which keeps the port of `request_url` where applicable.
///
/// # Panics
///
/// Panics if `url` is not absolute and cannot be resolved against `request_url`.
fn get_absolute_url(url: &str, request_url: &Url) -> String {
    if Url::parse(url).is_ok() {
        return url.to_owned();
    }
    // Joining against the full request URL (rather than a base rebuilt from scheme and host
    // only) keeps the port intact and does not require the base to have a host.
    request_url
        .join(url)
        .expect("Unable to resolve image URL against the article URL")
        .to_string()
}

#[cfg(test)]
mod test {
    use super::*;

    /// Each MIME subtype must map to the extension listed next to it.
    #[test]
    fn test_map_mime_type_to_ext() {
        let expectations = [
            ("apng", "apng"),
            ("bmp", "bmp"),
            ("gif", "gif"),
            ("x-icon", "ico"),
            ("jpeg", "jpeg"),
            ("png", "png"),
            ("svg+xml", "svg"),
            ("tiff", "tiff"),
            ("webp", "webp"),
        ];
        for (mime_subtype, expected_ext) in expectations.iter() {
            assert_eq!(*expected_ext, map_mime_subtype_to_ext(mime_subtype));
        }
    }
}
Add http and epub modules 2021-02-06 09:59:03 +00:00			`use async_std::io::prelude::*;`
Refactor image downloading and update README Image downloads uses streams instead of spawned tasks to ensure that it does not start an unbounded number of spawned tasks 2021-02-09 07:33:02 +00:00			`use async_std::{fs::File, stream};`
			`use futures::StreamExt;`
Add progress indicators for the cli 2021-04-17 15:27:38 +01:00			`use indicatif::ProgressBar;`
Add logging configured to send to a file by default 2021-04-24 11:54:47 +01:00			`use log::{debug, info};`
Add http and epub modules 2021-02-06 09:59:03 +00:00			`use url::Url;`

Add ImgError struct for representing errors in downloading article images 2021-04-24 11:57:06 +01:00			`use crate::errors::{ErrorKind, ImgError, PaperoniError};`
			`use crate::extractor::Extractor;`
Add http and epub modules 2021-02-06 09:59:03 +00:00			`type HTMLResource = (String, String);`

Minor refactor Change cli to grab version from the Cargo manifest Rename fetch_url to fetch_html 2021-04-17 10:08:24 +01:00			`pub async fn fetch_html(url: &str) -> Result<HTMLResource, PaperoniError> {`
Add http and epub modules 2021-02-06 09:59:03 +00:00			`let client = surf::Client::new();`
Add logging configured to send to a file by default 2021-04-24 11:54:47 +01:00			`debug!("Fetching {}", url);`
Add http and epub modules 2021-02-06 09:59:03 +00:00
Add printing of table for failed article downloads - Map errors in `fetch_html` to include the source url - Change `article_link` to `article_source` - Add `Into` conversion for `UTF8Error` - Collect errors in `generate_epubs` for displaying in a table 2021-04-20 19:09:38 +01:00			`let process_request = async {`
			`let mut redirect_count: u8 = 0;`
			`let base_url = Url::parse(&url)?;`
			`let mut url = base_url.clone();`
			`while redirect_count < 5 {`
			`redirect_count += 1;`
			`let req = surf::get(&url);`
			`let mut res = client.send(req).await?;`
			`if res.status().is_redirection() {`
			`if let Some(location) = res.header(surf::http::headers::LOCATION) {`
			`match Url::parse(location.last().as_str()) {`
Add logging configured to send to a file by default 2021-04-24 11:54:47 +01:00			`Ok(valid_url) => {`
			`info!("Redirecting {} to {}", url, valid_url);`
			`url = valid_url`
			`}`
Add printing of table for failed article downloads - Map errors in `fetch_html` to include the source url - Change `article_link` to `article_source` - Add `Into` conversion for `UTF8Error` - Collect errors in `generate_epubs` for displaying in a table 2021-04-20 19:09:38 +01:00			`Err(e) => match e {`
			`url::ParseError::RelativeUrlWithoutBase => {`
Add logging configured to send to a file by default 2021-04-24 11:54:47 +01:00			`match base_url.join(location.last().as_str()) {`
			`Ok(joined_url) => {`
			`info!("Redirecting {} to {}", url, joined_url);`
			`url = joined_url;`
			`}`
			`Err(e) => return Err(e.into()),`
			`}`
Add printing of table for failed article downloads - Map errors in `fetch_html` to include the source url - Change `article_link` to `article_source` - Add `Into` conversion for `UTF8Error` - Collect errors in `generate_epubs` for displaying in a table 2021-04-20 19:09:38 +01:00			`}`
			`e => return Err(e.into()),`
			`},`
			`};`
			`}`
			`} else if res.status().is_success() {`
			`if let Some(mime) = res.content_type() {`
			`if mime.essence() == "text/html" {`
Add logging configured to send to a file by default 2021-04-24 11:54:47 +01:00			`debug!("Successfully fetched {}", url);`
Add printing of table for failed article downloads - Map errors in `fetch_html` to include the source url - Change `article_link` to `article_source` - Add `Into` conversion for `UTF8Error` - Collect errors in `generate_epubs` for displaying in a table 2021-04-20 19:09:38 +01:00			`return Ok((url.to_string(), res.body_string().await?));`
			`} else {`
			`let msg = format!(`
			`"Invalid HTTP response. Received {} instead of text/html",`
			`mime.essence()`
			`);`
Add custom error types and ignore failed image downloads Using this custom error type, many instances of unwrap are replaced with mapping to errors that are then logged in main.rs. This allows paperoni to stop crashing when downloading articles when the errors are possibly recoverable or should not affect other downloads. This subsequently introduces ignoring the failed image downloads and instead leaving the original URLs intact. 2021-04-17 10:04:06 +01:00
Add printing of table for failed article downloads - Map errors in `fetch_html` to include the source url - Change `article_link` to `article_source` - Add `Into` conversion for `UTF8Error` - Collect errors in `generate_epubs` for displaying in a table 2021-04-20 19:09:38 +01:00			`return Err(ErrorKind::HTTPError(msg).into());`
			`}`
			`} else {`
			`return Err(ErrorKind::HTTPError("Unknown HTTP response".to_owned()).into());`
Add http and epub modules 2021-02-06 09:59:03 +00:00			`}`
			`} else {`
Add printing of table for failed article downloads - Map errors in `fetch_html` to include the source url - Change `article_link` to `article_source` - Add `Into` conversion for `UTF8Error` - Collect errors in `generate_epubs` for displaying in a table 2021-04-20 19:09:38 +01:00			`let msg = format!("Request failed: HTTP {}", res.status());`
			`return Err(ErrorKind::HTTPError(msg).into());`
Add http and epub modules 2021-02-06 09:59:03 +00:00			`}`
			`}`
Add printing of table for failed article downloads - Map errors in `fetch_html` to include the source url - Change `article_link` to `article_source` - Add `Into` conversion for `UTF8Error` - Collect errors in `generate_epubs` for displaying in a table 2021-04-20 19:09:38 +01:00			`Err(ErrorKind::HTTPError("Unable to fetch HTML".to_owned()).into())`
			`};`

			`process_request.await.map_err(\|mut error: PaperoniError\| {`
			`error.set_article_source(url);`
			`error`
			`})`
Add http and epub modules 2021-02-06 09:59:03 +00:00			`}`

http.rs: extract process_img_response function 2021-05-08 20:30:00 +01:00			`type ImgItem<'a> = (&'a str, String, Option<String>);`

			`async fn process_img_response<'a>(`
			`img_response: &mut surf::Response,`
			`url: &'a str,`
			`) -> Result<ImgItem<'a>, ImgError> {`
http.rs: check response status for fetched images This patch checks if fetching an image resulted in a non-success status code. In case of non-success status, the response is discarded and an error is emitted. This relies on having 3xx codes already handled by surf's Redirect middleware, so we should see 4xx and 5xx codes here. Fixes hipstermojo/paperoni#11 2021-05-09 12:55:26 +01:00			`if !img_response.status().is_success() {`
			`let kind = ErrorKind::HTTPError(format!(`
			`"Non-success HTTP status code ({})",`
			`img_response.status()`
			`));`
			`return Err(ImgError::with_kind(kind));`
			`}`
http.rs: extract process_img_response function 2021-05-08 20:30:00 +01:00			`let img_content: Vec<u8> = match img_response.body_bytes().await {`
			`Ok(bytes) => bytes,`
			`Err(e) => return Err(e.into()),`
			`};`
			`let img_mime = img_response`
			`.content_type()`
			`.map(\|mime\| mime.essence().to_string());`
			`let img_ext = match img_response`
			`.content_type()`
			`.map(\|mime\| map_mime_subtype_to_ext(mime.subtype()).to_string())`
			`{`
			`Some(mime_str) => mime_str,`
			`None => return Err(ErrorKind::HTTPError("Image has no Content-Type".to_owned()).into()),`
			`};`

			`let mut img_path = std::env::temp_dir();`
			`img_path.push(format!("{}.{}", hash_url(url), &img_ext));`
			`let mut img_file = match File::create(&img_path).await {`
			`Ok(file) => file,`
			`Err(e) => return Err(e.into()),`
			`};`
			`match img_file.write_all(&img_content).await {`
			`Ok(_) => (),`
			`Err(e) => return Err(e.into()),`
			`}`

			`Ok((`
			`url,`
			`img_path`
			`.file_name()`
			`.map(\|os_str_name\| {`
			`os_str_name`
			`.to_str()`
			`.expect("Unable to get image file name")`
			`.to_string()`
			`})`
			`.unwrap(),`
			`img_mime,`
			`))`
			`}`

Add http and epub modules 2021-02-06 09:59:03 +00:00			`pub async fn download_images(`
			`extractor: &mut Extractor,`
			`article_origin: &Url,`
Add progress indicators for the cli 2021-04-17 15:27:38 +01:00			`bar: &ProgressBar,`
Add ImgError struct for representing errors in downloading article images 2021-04-24 11:57:06 +01:00			`) -> Result<(), Vec<ImgError>> {`
Add http and epub modules 2021-02-06 09:59:03 +00:00			`if extractor.img_urls.len() > 0 {`
Add logging configured to send to a file by default 2021-04-24 11:54:47 +01:00			`debug!(`
			`"Downloading {} images for {}",`
			`extractor.img_urls.len(),`
			`article_origin`
			`);`
Add http and epub modules 2021-02-06 09:59:03 +00:00			`}`
Add progress indicators for the cli 2021-04-17 15:27:38 +01:00			`let img_count = extractor.img_urls.len();`
Add http and epub modules 2021-02-06 09:59:03 +00:00
Refactor image downloading and update README Image downloads uses streams instead of spawned tasks to ensure that it does not start an unbounded number of spawned tasks 2021-02-09 07:33:02 +00:00			`let imgs_req_iter = extractor`
			`.img_urls`
			`.iter()`
			`.map(\|(url, _)\| {`
			`(`
			`url,`
Update dependencies and restore redirect middleware in `download_images` 2021-04-22 16:01:23 +01:00			`surf::Client::new()`
			`.with(surf::middleware::Redirect::default())`
			`.get(get_absolute_url(&url, article_origin)),`
Refactor image downloading and update README Image downloads uses streams instead of spawned tasks to ensure that it does not start an unbounded number of spawned tasks 2021-02-09 07:33:02 +00:00			`)`
			`})`
Add progress indicators for the cli 2021-04-17 15:27:38 +01:00			`.enumerate()`
			`.map(\|(img_idx, (url, req))\| async move {`
			`bar.set_message(format!("Downloading images [{}/{}]", img_idx + 1, img_count).as_str());`
Add custom error types and ignore failed image downloads Using this custom error type, many instances of unwrap are replaced with mapping to errors that are then logged in main.rs. This allows paperoni to stop crashing when downloading articles when the errors are possibly recoverable or should not affect other downloads. This subsequently introduces ignoring the failed image downloads and instead leaving the original URLs intact. 2021-04-17 10:04:06 +01:00			`match req.await {`
			`Ok(mut img_response) => {`
http.rs: extract process_img_response function 2021-05-08 20:30:00 +01:00			`let process_response =`
			`process_img_response(&mut img_response, url.as_ref()).await;`
			`process_response.map_err(\|mut e: ImgError\| {`
Add ImgError struct for representing errors in downloading article images 2021-04-24 11:57:06 +01:00			`e.set_url(url);`
			`e`
			`})`
			`}`
			`Err(e) => {`
			`let mut img_err: ImgError = e.into();`
			`img_err.set_url(url);`
			`Err(img_err)`
Add custom error types and ignore failed image downloads Using this custom error type, many instances of unwrap are replaced with mapping to errors that are then logged in main.rs. This allows paperoni to stop crashing when downloading articles when the errors are possibly recoverable or should not affect other downloads. This subsequently introduces ignoring the failed image downloads and instead leaving the original URLs intact. 2021-04-17 10:04:06 +01:00			`}`
			`}`
Refactor image downloading and update README Image downloads uses streams instead of spawned tasks to ensure that it does not start an unbounded number of spawned tasks 2021-02-09 07:33:02 +00:00			`});`
Add http and epub modules 2021-02-06 09:59:03 +00:00
Refactor image downloading and update README Image downloads uses streams instead of spawned tasks to ensure that it does not start an unbounded number of spawned tasks 2021-02-09 07:33:02 +00:00			`// A utility closure used when update the value of an image source after downloading is successful`
http.rs: extract process_img_response function 2021-05-08 20:30:00 +01:00			`let replace_existing_img_src = \|img_item: ImgItem\| -> (String, Option<String>) {`
			`let (img_url, img_path, img_mime) = img_item;`
			`let img_ref = extractor`
			`.article()`
			`.select_first(&format!("img[src='{}']", img_url))`
			`.expect("Image node does not exist");`
			`let mut img_node = img_ref.attributes.borrow_mut();`
			`*img_node.get_mut("src").unwrap() = img_path.clone();`
			`// srcset is removed because readers such as Foliate then fail to display`
			`// the image already downloaded and stored in src`
			`img_node.remove("srcset");`
			`(img_path, img_mime)`
			`};`
Add http and epub modules 2021-02-06 09:59:03 +00:00
Add custom error types and ignore failed image downloads Using this custom error type, many instances of unwrap are replaced with mapping to errors that are then logged in main.rs. This allows paperoni to stop crashing when downloading articles when the errors are possibly recoverable or should not affect other downloads. This subsequently introduces ignoring the failed image downloads and instead leaving the original URLs intact. 2021-04-17 10:04:06 +01:00			`let imgs_req_iter = stream::from_iter(imgs_req_iter)`
Refactor image downloading and update README Image downloads uses streams instead of spawned tasks to ensure that it does not start an unbounded number of spawned tasks 2021-02-09 07:33:02 +00:00			`.buffered(10)`
Add ImgError struct for representing errors in downloading article images 2021-04-24 11:57:06 +01:00			`.collect::<Vec<Result<_, ImgError>>>()`
Add custom error types and ignore failed image downloads Using this custom error type, many instances of unwrap are replaced with mapping to errors that are then logged in main.rs. This allows paperoni to stop crashing when downloading articles when the errors are possibly recoverable or should not affect other downloads. This subsequently introduces ignoring the failed image downloads and instead leaving the original URLs intact. 2021-04-17 10:04:06 +01:00			`.await;`
			`let mut errors = Vec::new();`
			`let mut replaced_imgs = Vec::new();`
			`for img_req_result in imgs_req_iter {`
			`match img_req_result {`
			`Ok(img_req) => replaced_imgs.push(replace_existing_img_src(img_req)),`
			`Err(e) => errors.push(e),`
			`}`
			`}`
			`extractor.img_urls = replaced_imgs;`
			`if errors.is_empty() {`
			`Ok(())`
			`} else {`
			`Err(errors)`
			`}`
Add http and epub modules 2021-02-06 09:59:03 +00:00			`}`

			`/// Handles getting the extension from a given MIME subtype.`
			`fn map_mime_subtype_to_ext(subtype: &str) -> &str {`
			`if subtype == ("svg+xml") {`
			`return "svg";`
			`} else if subtype == "x-icon" {`
			`"ico"`
			`} else {`
			`subtype`
			`}`
			`}`

			`/// Utility for hashing URLs. This is used to help store files locally with unique values`
			`fn hash_url(url: &str) -> String {`
			`format!("{:x}", md5::compute(url.as_bytes()))`
			`}`

			`fn get_absolute_url(url: &str, request_url: &Url) -> String {`
			`if Url::parse(url).is_ok() {`
			`url.to_owned()`
			`} else if url.starts_with("/") {`
			`Url::parse(&format!(`
			`"{}://{}",`
			`request_url.scheme(),`
			`request_url.host_str().unwrap()`
			`))`
			`.unwrap()`
			`.join(url)`
			`.unwrap()`
			`.into_string()`
			`} else {`
			`request_url.join(url).unwrap().into_string()`
			`}`
			`}`

			`#[cfg(test)]`
			`mod test {`
			`use super::*;`
			`#[test]`
			`fn test_map_mime_type_to_ext() {`
			`let mime_subtypes = vec![`
			`"apng", "bmp", "gif", "x-icon", "jpeg", "png", "svg+xml", "tiff", "webp",`
			`];`
			`let exts = mime_subtypes`
			`.into_iter()`
			`.map(\|mime_type\| map_mime_subtype_to_ext(mime_type))`
			`.collect::<Vec<_>>();`
			`assert_eq!(`
			`vec!["apng", "bmp", "gif", "ico", "jpeg", "png", "svg", "tiff", "webp"],`
			`exts`
			`);`
			`}`
			`}`