07479afeac
chore: add doc comment on ResourceType alias fix: add error when image MIME type is invalid on an image
345 lines
12 KiB
Rust
345 lines
12 KiB
Rust
use async_std::io::prelude::*;
|
|
use async_std::task;
|
|
use async_std::{fs::File, stream};
|
|
use futures::StreamExt;
|
|
use indicatif::ProgressBar;
|
|
use log::warn;
|
|
use log::{debug, info};
|
|
use url::Url;
|
|
|
|
use crate::cli::AppConfig;
|
|
use crate::errors::{ErrorKind, ImgError, PaperoniError};
|
|
use crate::extractor::Article;
|
|
/// The result of fetching a page: the URL that served the page and the HTML body it returned.
type HTMLResource = (String, String);
|
|
|
|
pub fn download(
|
|
app_config: &AppConfig,
|
|
bar: &ProgressBar,
|
|
partial_downloads: &mut Vec<PartialDownload>,
|
|
errors: &mut Vec<PaperoniError>,
|
|
) -> Vec<Article> {
|
|
task::block_on(async {
|
|
let urls_iter = app_config.urls.iter().map(|url| fetch_html(url));
|
|
let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn);
|
|
let mut articles = Vec::new();
|
|
while let Some(fetch_result) = responses.next().await {
|
|
match fetch_result {
|
|
Ok((url, html)) => {
|
|
debug!("Extracting {}", &url);
|
|
let mut extractor = Article::from_html(&html, &url);
|
|
bar.set_message("Extracting...");
|
|
match extractor.extract_content() {
|
|
Ok(_) => {
|
|
extractor.extract_img_urls();
|
|
if let Err(img_errors) =
|
|
download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar)
|
|
.await
|
|
{
|
|
partial_downloads
|
|
.push(PartialDownload::new(&url, extractor.metadata().title()));
|
|
warn!(
|
|
"{} image{} failed to download for {}",
|
|
img_errors.len(),
|
|
if img_errors.len() > 1 { "s" } else { "" },
|
|
url
|
|
);
|
|
for img_error in img_errors {
|
|
warn!(
|
|
"{}\n\t\tReason {}",
|
|
img_error.url().as_ref().unwrap(),
|
|
img_error
|
|
);
|
|
}
|
|
}
|
|
articles.push(extractor);
|
|
}
|
|
Err(mut e) => {
|
|
e.set_article_source(&url);
|
|
errors.push(e);
|
|
}
|
|
}
|
|
}
|
|
Err(e) => errors.push(e),
|
|
}
|
|
bar.inc(1);
|
|
}
|
|
articles
|
|
})
|
|
}
|
|
|
|
pub async fn fetch_html(url: &str) -> Result<HTMLResource, PaperoniError> {
|
|
let client = surf::Client::new();
|
|
debug!("Fetching {}", url);
|
|
|
|
let process_request = async {
|
|
let mut redirect_count: u8 = 0;
|
|
let base_url = Url::parse(&url)?;
|
|
let mut url = base_url.clone();
|
|
while redirect_count < 5 {
|
|
redirect_count += 1;
|
|
let req = surf::get(&url);
|
|
let mut res = client.send(req).await?;
|
|
if res.status().is_redirection() {
|
|
if let Some(location) = res.header(surf::http::headers::LOCATION) {
|
|
match Url::parse(location.last().as_str()) {
|
|
Ok(valid_url) => {
|
|
info!("Redirecting {} to {}", url, valid_url);
|
|
url = valid_url
|
|
}
|
|
Err(e) => match e {
|
|
url::ParseError::RelativeUrlWithoutBase => {
|
|
match base_url.join(location.last().as_str()) {
|
|
Ok(joined_url) => {
|
|
info!("Redirecting {} to {}", url, joined_url);
|
|
url = joined_url;
|
|
}
|
|
Err(e) => return Err(e.into()),
|
|
}
|
|
}
|
|
e => return Err(e.into()),
|
|
},
|
|
};
|
|
}
|
|
} else if res.status().is_success() {
|
|
if let Some(mime) = res.content_type() {
|
|
if mime.essence() == "text/html" {
|
|
debug!("Successfully fetched {}", url);
|
|
return Ok((url.to_string(), res.body_string().await?));
|
|
} else {
|
|
let msg = format!(
|
|
"Invalid HTTP response. Received {} instead of text/html",
|
|
mime.essence()
|
|
);
|
|
|
|
return Err(ErrorKind::HTTPError(msg).into());
|
|
}
|
|
} else {
|
|
return Err(ErrorKind::HTTPError("Unknown HTTP response".to_owned()).into());
|
|
}
|
|
} else {
|
|
let msg = format!("Request failed: HTTP {}", res.status());
|
|
return Err(ErrorKind::HTTPError(msg).into());
|
|
}
|
|
}
|
|
Err(ErrorKind::HTTPError("Unable to fetch HTML".to_owned()).into())
|
|
};
|
|
|
|
process_request.await.map_err(|mut error: PaperoniError| {
|
|
error.set_article_source(url);
|
|
error
|
|
})
|
|
}
|
|
|
|
/// A downloaded image: the URL it was requested for, the file name it was saved under, and its
/// MIME type when one is available.
type ImgItem<'a> = (&'a str, String, Option<String>);
|
|
|
|
async fn process_img_response<'a>(
|
|
img_response: &mut surf::Response,
|
|
url: &'a str,
|
|
) -> Result<ImgItem<'a>, ImgError> {
|
|
if !img_response.status().is_success() {
|
|
let kind = ErrorKind::HTTPError(format!(
|
|
"Non-success HTTP status code ({})",
|
|
img_response.status()
|
|
));
|
|
return Err(ImgError::with_kind(kind));
|
|
}
|
|
let img_content: Vec<u8> = match img_response.body_bytes().await {
|
|
Ok(bytes) => bytes,
|
|
Err(e) => return Err(e.into()),
|
|
};
|
|
let img_mime = img_response
|
|
.content_type()
|
|
.map(|mime| mime.essence().to_string());
|
|
if let Some(mime_str) = &img_mime {
|
|
if !mime_str.starts_with("image/") {
|
|
return Err(ErrorKind::HTTPError(format!(
|
|
"Invalid image MIME type: {} for {}",
|
|
mime_str, url
|
|
))
|
|
.into());
|
|
}
|
|
}
|
|
let img_ext = match img_response
|
|
.content_type()
|
|
.map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
|
|
{
|
|
Some(mime_str) => mime_str,
|
|
None => return Err(ErrorKind::HTTPError("Image has no Content-Type".to_owned()).into()),
|
|
};
|
|
|
|
let mut img_path = std::env::temp_dir();
|
|
img_path.push(format!("{}.{}", hash_url(url), &img_ext));
|
|
let mut img_file = match File::create(&img_path).await {
|
|
Ok(file) => file,
|
|
Err(e) => return Err(e.into()),
|
|
};
|
|
match img_file.write_all(&img_content).await {
|
|
Ok(_) => (),
|
|
Err(e) => return Err(e.into()),
|
|
}
|
|
|
|
Ok((
|
|
url,
|
|
img_path
|
|
.file_name()
|
|
.map(|os_str_name| {
|
|
os_str_name
|
|
.to_str()
|
|
.expect("Unable to get image file name")
|
|
.to_string()
|
|
})
|
|
.unwrap(),
|
|
img_mime,
|
|
))
|
|
}
|
|
|
|
pub async fn download_images(
|
|
extractor: &mut Article,
|
|
article_origin: &Url,
|
|
bar: &ProgressBar,
|
|
) -> Result<(), Vec<ImgError>> {
|
|
if extractor.img_urls.len() > 0 {
|
|
debug!(
|
|
"Downloading {} images for {}",
|
|
extractor.img_urls.len(),
|
|
article_origin
|
|
);
|
|
}
|
|
let img_count = extractor.img_urls.len();
|
|
|
|
let imgs_req_iter = extractor
|
|
.img_urls
|
|
.iter()
|
|
.map(|(url, _)| {
|
|
(
|
|
url,
|
|
surf::Client::new()
|
|
.with(surf::middleware::Redirect::default())
|
|
.get(get_absolute_url(&url, article_origin)),
|
|
)
|
|
})
|
|
.enumerate()
|
|
.map(|(img_idx, (url, req))| async move {
|
|
bar.set_message(format!(
|
|
"Downloading images [{}/{}]",
|
|
img_idx + 1,
|
|
img_count
|
|
));
|
|
match req.await {
|
|
Ok(mut img_response) => {
|
|
let process_response =
|
|
process_img_response(&mut img_response, url.as_ref()).await;
|
|
process_response.map_err(|mut e: ImgError| {
|
|
e.set_url(url);
|
|
e
|
|
})
|
|
}
|
|
Err(e) => {
|
|
let mut img_err: ImgError = e.into();
|
|
img_err.set_url(url);
|
|
Err(img_err)
|
|
}
|
|
}
|
|
});
|
|
|
|
// A utility closure used when update the value of an image source after downloading is successful
|
|
let replace_existing_img_src = |img_item: ImgItem| -> (String, Option<String>) {
|
|
let (img_url, img_path, img_mime) = img_item;
|
|
let img_ref = extractor
|
|
.node_ref()
|
|
.select_first(&format!("img[src='{}']", img_url))
|
|
.expect("Image node does not exist");
|
|
let mut img_node = img_ref.attributes.borrow_mut();
|
|
*img_node.get_mut("src").unwrap() = img_path.clone();
|
|
// srcset is removed because readers such as Foliate then fail to display
|
|
// the image already downloaded and stored in src
|
|
img_node.remove("srcset");
|
|
(img_path, img_mime)
|
|
};
|
|
|
|
let imgs_req_iter = stream::from_iter(imgs_req_iter)
|
|
.buffered(10)
|
|
.collect::<Vec<Result<_, ImgError>>>()
|
|
.await;
|
|
let mut errors = Vec::new();
|
|
let mut replaced_imgs = Vec::new();
|
|
for img_req_result in imgs_req_iter {
|
|
match img_req_result {
|
|
Ok(img_req) => replaced_imgs.push(replace_existing_img_src(img_req)),
|
|
Err(e) => errors.push(e),
|
|
}
|
|
}
|
|
extractor.img_urls = replaced_imgs;
|
|
if errors.is_empty() {
|
|
Ok(())
|
|
} else {
|
|
Err(errors)
|
|
}
|
|
}
|
|
|
|
/// Identifies an article by its link and title.
///
/// `download` records one of these for every article that had at least one image fail to
/// download.
pub struct PartialDownload {
    /// Link of the article.
    pub link: String,
    /// Title of the article.
    pub title: String,
}

impl PartialDownload {
    /// Creates a record for an article, copying `link` and `title` into owned strings.
    pub fn new(link: &str, title: &str) -> Self {
        let link = link.to_owned();
        let title = title.to_owned();
        Self { link, title }
    }
}
|
|
|
|
/// Maps the subtype of an image MIME type to the file extension used when saving the image.
///
/// Subtypes whose name differs from the usual extension are translated: `svg+xml` becomes
/// `svg`, and both `x-icon` and the IANA-registered `vnd.microsoft.icon` become `ico`. Any
/// other subtype is returned unchanged and used as the extension as-is.
fn map_mime_subtype_to_ext(subtype: &str) -> &str {
    match subtype {
        "svg+xml" => "svg",
        "x-icon" | "vnd.microsoft.icon" => "ico",
        other => other,
    }
}
|
|
|
|
/// Utility for hashing URLs. This is used to help store files locally with unique values
|
|
fn hash_url(url: &str) -> String {
|
|
format!("{:x}", md5::compute(url.as_bytes()))
|
|
}
|
|
|
|
fn get_absolute_url(url: &str, request_url: &Url) -> String {
|
|
if Url::parse(url).is_ok() {
|
|
url.to_owned()
|
|
} else if url.starts_with("/") {
|
|
Url::parse(&format!(
|
|
"{}://{}",
|
|
request_url.scheme(),
|
|
request_url.host_str().unwrap()
|
|
))
|
|
.unwrap()
|
|
.join(url)
|
|
.unwrap()
|
|
.into()
|
|
} else {
|
|
request_url.join(url).unwrap().into()
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod test {
    use super::*;

    /// Checks the extension chosen for each supported image MIME subtype.
    #[test]
    fn test_map_mime_type_to_ext() {
        // (MIME subtype, expected file extension)
        let cases = [
            ("apng", "apng"),
            ("bmp", "bmp"),
            ("gif", "gif"),
            ("x-icon", "ico"),
            ("jpeg", "jpeg"),
            ("png", "png"),
            ("svg+xml", "svg"),
            ("tiff", "tiff"),
            ("webp", "webp"),
        ];
        for &(subtype, expected_ext) in cases.iter() {
            assert_eq!(expected_ext, map_mime_subtype_to_ext(subtype));
        }
    }
}
|