use async_std::io::prelude::*;
use async_std::task;
use async_std::{fs::File, stream};
use futures::StreamExt;
use indicatif::ProgressBar;
use log::{debug, info, warn};
use url::Url;

use crate::cli::AppConfig;
use crate::errors::{ErrorKind, ImgError, PaperoniError};
use crate::extractor::Article;

/// A fetched page as a (final URL, HTML body) pair.
type HTMLResource = (String, String);
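
/// Fetches, extracts and returns the articles for all URLs in `app_config`,
/// downloading up to `max_conn` pages concurrently. Articles whose images
/// could not all be downloaded are recorded in `partial_downloads`; failed
/// fetches and extractions are pushed to `errors`.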
pub fn download(
    app_config: &AppConfig,
    bar: &ProgressBar,
    partial_downloads: &mut Vec<PartialDownload>,
    errors: &mut Vec<PaperoniError>,
) -> Vec<Article> {
    task::block_on(async {
        let urls_iter = app_config.urls.iter().map(|url| fetch_html(url));
        let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn);
        let mut articles = Vec::new();
        while let Some(fetch_result) = responses.next().await {
            match fetch_result {
                Ok((url, html)) => {
                    debug!("Extracting {}", &url);
                    let mut extractor = Article::from_html(&html, &url);
                    bar.set_message("Extracting...");
                    match extractor.extract_content() {
                        Ok(_) => {
                            extractor.extract_img_urls();
                            if let Err(img_errors) =
                                download_images(&mut extractor, &Url::parse(&url).unwrap(), bar)
                                    .await
                            {
                                partial_downloads
                                    .push(PartialDownload::new(&url, extractor.metadata().title()));
                                warn!(
                                    "{} image{} failed to download for {}",
                                    img_errors.len(),
                                    if img_errors.len() > 1 { "s" } else { "" },
                                    url
                                );
                                for img_error in img_errors {
                                    warn!(
                                        "{}\n\t\tReason: {}",
                                        img_error.url().as_ref().unwrap(),
                                        img_error
                                    );
                                }
                            }
                            articles.push(extractor);
                        }
                        Err(mut e) => {
                            e.set_article_source(&url);
                            errors.push(e);
                        }
                    }
                }
                Err(e) => errors.push(e),
            }
            bar.inc(1);
        }
        articles
    })
}
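
/// Fetches the page at `url` and returns the final URL together with its HTML
/// body. Redirects are followed manually (at most five requests in total) so
/// that relative `Location` headers can be resolved against the original URL.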
pub async fn fetch_html(url: &str) -> Result<HTMLResource, PaperoniError> {
    let client = surf::Client::new();
    debug!("Fetching {}", url);

    let process_request = async {
        let mut redirect_count: u8 = 0;
        let base_url = Url::parse(url)?;
        let mut url = base_url.clone();
        while redirect_count < 5 {
            redirect_count += 1;
            let req = surf::get(&url);
            let mut res = client.send(req).await?;
            if res.status().is_redirection() {
                if let Some(location) = res.header(surf::http::headers::LOCATION) {
                    match Url::parse(location.last().as_str()) {
                        Ok(valid_url) => {
                            info!("Redirecting {} to {}", url, valid_url);
                            url = valid_url;
                        }
                        Err(e) => match e {
                            url::ParseError::RelativeUrlWithoutBase => {
                                match base_url.join(location.last().as_str()) {
                                    Ok(joined_url) => {
                                        info!("Redirecting {} to {}", url, joined_url);
                                        url = joined_url;
                                    }
                                    Err(e) => return Err(e.into()),
                                }
                            }
                            e => return Err(e.into()),
                        },
                    };
                }
            } else if res.status().is_success() {
                if let Some(mime) = res.content_type() {
                    if mime.essence() == "text/html" {
                        debug!("Successfully fetched {}", url);
                        return Ok((url.to_string(), res.body_string().await?));
                    } else {
                        let msg = format!(
                            "Invalid HTTP response. Received {} instead of text/html",
                            mime.essence()
                        );
                        return Err(ErrorKind::HTTPError(msg).into());
                    }
                } else {
                    return Err(ErrorKind::HTTPError("Unknown HTTP response".to_owned()).into());
                }
            } else {
                let msg = format!("Request failed: HTTP {}", res.status());
                return Err(ErrorKind::HTTPError(msg).into());
            }
        }
        Err(ErrorKind::HTTPError("Unable to fetch HTML".to_owned()).into())
    };

    process_request.await.map_err(|mut error: PaperoniError| {
        error.set_article_source(url);
        error
    })
}

/// An image download result: the original URL, the local file name it was
/// saved under, and its MIME type if one was reported.
type ImgItem<'a> = (&'a str, String, Option<String>);

/// Validates an image response, writes its bytes to a file in the system temp
/// directory and returns the corresponding `ImgItem`.
async fn process_img_response<'a>(
    img_response: &mut surf::Response,
    url: &'a str,
) -> Result<ImgItem<'a>, ImgError> {
    if !img_response.status().is_success() {
        let kind = ErrorKind::HTTPError(format!(
            "Non-success HTTP status code ({})",
            img_response.status()
        ));
        return Err(ImgError::with_kind(kind));
    }
    let img_content: Vec<u8> = match img_response.body_bytes().await {
        Ok(bytes) => bytes,
        Err(e) => return Err(e.into()),
    };
    let img_mime = img_response
        .content_type()
        .map(|mime| mime.essence().to_string());
    if let Some(mime_str) = &img_mime {
        if !mime_str.starts_with("image/") {
            return Err(ErrorKind::HTTPError(format!(
                "Invalid image MIME type: {} for {}",
                mime_str, url
            ))
            .into());
        }
    }
    let img_ext = match img_response
        .content_type()
        .map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
    {
        Some(mime_str) => mime_str,
        None => return Err(ErrorKind::HTTPError("Image has no Content-Type".to_owned()).into()),
    };

    let mut img_path = std::env::temp_dir();
    img_path.push(format!("{}.{}", hash_url(url), &img_ext));
    let mut img_file = match File::create(&img_path).await {
        Ok(file) => file,
        Err(e) => return Err(e.into()),
    };
    if let Err(e) = img_file.write_all(&img_content).await {
        return Err(e.into());
    }

    Ok((
        url,
        img_path
            .file_name()
            .map(|os_str_name| {
                os_str_name
                    .to_str()
                    .expect("Unable to get image file name")
                    .to_string()
            })
            .unwrap(),
        img_mime,
    ))
}
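
/// Downloads every image collected in `extractor.img_urls` (at most 10
/// concurrently), saves them to the system temp directory and rewrites the
/// matching `<img>` `src` attributes to point at the downloaded files. On
/// partial failure, returns the errors for the images that failed.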
pub async fn download_images(
    extractor: &mut Article,
    article_origin: &Url,
    bar: &ProgressBar,
) -> Result<(), Vec<ImgError>> {
    if !extractor.img_urls.is_empty() {
        debug!(
            "Downloading {} images for {}",
            extractor.img_urls.len(),
            article_origin
        );
    }
    let img_count = extractor.img_urls.len();

    let imgs_req_iter = extractor
        .img_urls
        .iter()
        .map(|(url, _)| {
            (
                url,
                surf::Client::new()
                    .with(surf::middleware::Redirect::default())
                    .get(get_absolute_url(url, article_origin)),
            )
        })
        .enumerate()
        .map(|(img_idx, (url, req))| async move {
            bar.set_message(format!(
                "Downloading images [{}/{}]",
                img_idx + 1,
                img_count
            ));
            match req.await {
                Ok(mut img_response) => {
                    let process_response =
                        process_img_response(&mut img_response, url.as_ref()).await;
                    process_response.map_err(|mut e: ImgError| {
                        e.set_url(url);
                        e
                    })
                }
                Err(e) => {
                    let mut img_err: ImgError = e.into();
                    img_err.set_url(url);
                    Err(img_err)
                }
            }
        });

    // A utility closure used to update the value of an image source after its
    // download succeeds
    let replace_existing_img_src = |img_item: ImgItem| -> (String, Option<String>) {
        let (img_url, img_path, img_mime) = img_item;
        let img_ref = extractor
            .node_ref()
            .select_first(&format!("img[src='{}']", img_url))
            .expect("Image node does not exist");
        let mut img_node = img_ref.attributes.borrow_mut();
        *img_node.get_mut("src").unwrap() = img_path.clone();
        // srcset is removed because readers such as Foliate would otherwise
        // fail to display the image already downloaded and stored in src
        img_node.remove("srcset");
        (img_path, img_mime)
    };

    let imgs_req_iter = stream::from_iter(imgs_req_iter)
        .buffered(10)
        .collect::<Vec<Result<_, ImgError>>>()
        .await;
    let mut errors = Vec::new();
    let mut replaced_imgs = Vec::new();
    for img_req_result in imgs_req_iter {
        match img_req_result {
            Ok(img_req) => replaced_imgs.push(replace_existing_img_src(img_req)),
            Err(e) => errors.push(e),
        }
    }
    extractor.img_urls = replaced_imgs;
    if errors.is_empty() {
        Ok(())
    } else {
        Err(errors)
    }
}

/// An article that was extracted but whose images did not all download.
pub struct PartialDownload {
    pub link: String,
    pub title: String,
}

impl PartialDownload {
    pub fn new(link: &str, title: &str) -> Self {
        Self {
            link: link.into(),
            title: title.into(),
        }
    }
}

/// Gets the file extension for a given MIME subtype.
fn map_mime_subtype_to_ext(subtype: &str) -> &str {
    match subtype {
        "svg+xml" => "svg",
        "x-icon" => "ico",
        _ => subtype,
    }
}

/// Utility for hashing URLs. This is used to store downloaded images locally
/// under unique file names.
fn hash_url(url: &str) -> String {
    format!("{:x}", md5::compute(url.as_bytes()))
}

/// Resolves `url` against the URL of the article it was found in, returning
/// an absolute URL for the image request.
fn get_absolute_url(url: &str, request_url: &Url) -> String {
    if Url::parse(url).is_ok() {
        url.to_owned()
    } else if url.starts_with('/') {
        Url::parse(&format!(
            "{}://{}",
            request_url.scheme(),
            request_url.host_str().unwrap()
        ))
        .unwrap()
        .join(url)
        .unwrap()
        .into()
    } else {
        request_url.join(url).unwrap().into()
    }
}

#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn test_map_mime_type_to_ext() {
        let mime_subtypes = vec![
            "apng", "bmp", "gif", "x-icon", "jpeg", "png", "svg+xml", "tiff", "webp",
        ];
        let exts = mime_subtypes
            .into_iter()
            .map(map_mime_subtype_to_ext)
            .collect::<Vec<_>>();
        assert_eq!(
            vec!["apng", "bmp", "gif", "ico", "jpeg", "png", "svg", "tiff", "webp"],
            exts
        );
    }
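
    // The following tests are illustrative sketches added for the URL helpers;
    // they assume the documented semantics of the `url` and `md5` crates.
    #[test]
    fn test_get_absolute_url() {
        let base = Url::parse("https://example.com/blog/post/").unwrap();
        // Absolute URLs pass through unchanged
        assert_eq!(
            get_absolute_url("https://cdn.example.com/img.png", &base),
            "https://cdn.example.com/img.png"
        );
        // Root-relative paths resolve against the article's host
        assert_eq!(
            get_absolute_url("/images/img.png", &base),
            "https://example.com/images/img.png"
        );
        // Relative paths resolve against the article URL itself
        assert_eq!(
            get_absolute_url("img.png", &base),
            "https://example.com/blog/post/img.png"
        );
    }

    #[test]
    fn test_hash_url() {
        // hash_url should be deterministic and yield a 32-character hex digest
        let hash = hash_url("https://example.com/image.png");
        assert_eq!(hash.len(), 32);
        assert_eq!(hash, hash_url("https://example.com/image.png"));
        assert_ne!(hash, hash_url("https://example.com/other.png"));
    }
}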