7e9dcfc2b7
Using this custom error type, many instances of `unwrap` are replaced by mapping failures to errors that are then logged in main.rs. This stops paperoni from crashing while downloading articles when an error is possibly recoverable or should not affect other downloads. As a consequence, failed image downloads are now ignored and the original image URLs are left intact.
215 lines
7.4 KiB
Rust
215 lines
7.4 KiB
Rust
use async_std::io::prelude::*;
|
|
use async_std::{fs::File, stream};
|
|
use futures::StreamExt;
|
|
use url::Url;
|
|
|
|
use crate::{errors::ErrorKind, errors::PaperoniError, extractor::Extractor};
|
|
|
|
/// A fetched HTML document as returned by `fetch_url`: the URL the document
/// was finally served from, paired with the response body.
type HTMLResource = (String, String);
|
|
|
|
pub async fn fetch_url(url: &str) -> Result<HTMLResource, PaperoniError> {
|
|
let client = surf::Client::new();
|
|
println!("Fetching...");
|
|
|
|
let mut redirect_count: u8 = 0;
|
|
let base_url = Url::parse(&url)?;
|
|
let mut url = base_url.clone();
|
|
while redirect_count < 5 {
|
|
redirect_count += 1;
|
|
let req = surf::get(&url);
|
|
let mut res = client.send(req).await?;
|
|
if res.status().is_redirection() {
|
|
if let Some(location) = res.header(surf::http::headers::LOCATION) {
|
|
match Url::parse(location.last().as_str()) {
|
|
Ok(valid_url) => url = valid_url,
|
|
Err(e) => match e {
|
|
url::ParseError::RelativeUrlWithoutBase => {
|
|
url = base_url.join(location.last().as_str())?
|
|
}
|
|
e => return Err(e.into()),
|
|
},
|
|
};
|
|
}
|
|
} else if res.status().is_success() {
|
|
if let Some(mime) = res.content_type() {
|
|
if mime.essence() == "text/html" {
|
|
return Ok((url.to_string(), res.body_string().await?));
|
|
} else {
|
|
let msg = format!(
|
|
"Invalid HTTP response. Received {} instead of text/html",
|
|
mime.essence()
|
|
);
|
|
|
|
return Err(ErrorKind::HTTPError(msg).into());
|
|
}
|
|
} else {
|
|
return Err(ErrorKind::HTTPError("Unknown HTTP response".to_owned()).into());
|
|
}
|
|
} else {
|
|
let msg = format!("Request failed: HTTP {}", res.status());
|
|
return Err(ErrorKind::HTTPError(msg).into());
|
|
}
|
|
}
|
|
Err(ErrorKind::HTTPError("Unable to fetch HTML".to_owned()).into())
|
|
}
|
|
|
|
pub async fn download_images(
|
|
extractor: &mut Extractor,
|
|
article_origin: &Url,
|
|
) -> Result<(), Vec<PaperoniError>> {
|
|
if extractor.img_urls.len() > 0 {
|
|
println!("Downloading images...");
|
|
}
|
|
|
|
let imgs_req_iter = extractor
|
|
.img_urls
|
|
.iter()
|
|
.map(|(url, _)| {
|
|
(
|
|
url,
|
|
surf::Client::new().get(get_absolute_url(&url, article_origin)),
|
|
)
|
|
})
|
|
.map(|(url, req)| async move {
|
|
match req.await {
|
|
Ok(mut img_response) => {
|
|
// let mut img_response = req.await.expect("Unable to retrieve image");
|
|
let img_content: Vec<u8> = match img_response.body_bytes().await {
|
|
Ok(bytes) => bytes,
|
|
Err(e) => return Err(e.into()),
|
|
};
|
|
let img_mime = img_response
|
|
.content_type()
|
|
.map(|mime| mime.essence().to_string());
|
|
let img_ext = match img_response
|
|
.content_type()
|
|
.map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
|
|
{
|
|
Some(mime_str) => mime_str,
|
|
None => {
|
|
return Err(ErrorKind::HTTPError(
|
|
"Image has no Content-Type".to_owned(),
|
|
)
|
|
.into())
|
|
}
|
|
};
|
|
|
|
let mut img_path = std::env::temp_dir();
|
|
img_path.push(format!("{}.{}", hash_url(&url), &img_ext));
|
|
let mut img_file = match File::create(&img_path).await {
|
|
Ok(file) => file,
|
|
Err(e) => return Err(e.into()),
|
|
};
|
|
match img_file.write_all(&img_content).await {
|
|
Ok(_) => (),
|
|
Err(e) => return Err(e.into()),
|
|
}
|
|
|
|
Ok((
|
|
url,
|
|
img_path
|
|
.file_name()
|
|
.map(|os_str_name| {
|
|
os_str_name
|
|
.to_str()
|
|
.expect("Unable to get image file name")
|
|
.to_string()
|
|
})
|
|
.unwrap(),
|
|
img_mime,
|
|
))
|
|
}
|
|
Err(e) => Err(e.into()),
|
|
}
|
|
});
|
|
|
|
// A utility closure used when update the value of an image source after downloading is successful
|
|
let replace_existing_img_src =
|
|
|img_item: (&String, String, Option<String>)| -> (String, Option<String>) {
|
|
let (img_url, img_path, img_mime) = img_item;
|
|
let img_ref = extractor
|
|
.article()
|
|
.as_mut()
|
|
.expect("Unable to get mutable ref")
|
|
.select_first(&format!("img[src='{}']", img_url))
|
|
.expect("Image node does not exist");
|
|
let mut img_node = img_ref.attributes.borrow_mut();
|
|
*img_node.get_mut("src").unwrap() = img_path.clone();
|
|
// srcset is removed because readers such as Foliate then fail to display
|
|
// the image already downloaded and stored in src
|
|
img_node.remove("srcset");
|
|
(img_path, img_mime)
|
|
};
|
|
|
|
let imgs_req_iter = stream::from_iter(imgs_req_iter)
|
|
.buffered(10)
|
|
.collect::<Vec<Result<_, PaperoniError>>>()
|
|
.await;
|
|
let mut errors = Vec::new();
|
|
let mut replaced_imgs = Vec::new();
|
|
for img_req_result in imgs_req_iter {
|
|
match img_req_result {
|
|
Ok(img_req) => replaced_imgs.push(replace_existing_img_src(img_req)),
|
|
Err(e) => errors.push(e),
|
|
}
|
|
}
|
|
extractor.img_urls = replaced_imgs;
|
|
if errors.is_empty() {
|
|
Ok(())
|
|
} else {
|
|
Err(errors)
|
|
}
|
|
}
|
|
|
|
/// Maps an image MIME subtype to the file extension used when saving the image.
///
/// Subtypes whose conventional extension differs from the subtype itself are
/// translated (`svg+xml` -> `svg`; `x-icon` and the IANA-registered
/// `vnd.microsoft.icon` -> `ico`). Any other subtype is returned unchanged.
fn map_mime_subtype_to_ext(subtype: &str) -> &str {
    match subtype {
        "svg+xml" => "svg",
        "x-icon" | "vnd.microsoft.icon" => "ico",
        other => other,
    }
}
|
|
|
|
/// Utility for hashing URLs. This is used to help store files locally with unique values
|
|
fn hash_url(url: &str) -> String {
|
|
format!("{:x}", md5::compute(url.as_bytes()))
|
|
}
|
|
|
|
fn get_absolute_url(url: &str, request_url: &Url) -> String {
|
|
if Url::parse(url).is_ok() {
|
|
url.to_owned()
|
|
} else if url.starts_with("/") {
|
|
Url::parse(&format!(
|
|
"{}://{}",
|
|
request_url.scheme(),
|
|
request_url.host_str().unwrap()
|
|
))
|
|
.unwrap()
|
|
.join(url)
|
|
.unwrap()
|
|
.into_string()
|
|
} else {
|
|
request_url.join(url).unwrap().into_string()
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod test {
    use super::*;

    /// Every supported image MIME subtype maps to its expected file extension.
    #[test]
    fn test_map_mime_type_to_ext() {
        // (MIME subtype, expected extension)
        let cases = [
            ("apng", "apng"),
            ("bmp", "bmp"),
            ("gif", "gif"),
            ("x-icon", "ico"),
            ("jpeg", "jpeg"),
            ("png", "png"),
            ("svg+xml", "svg"),
            ("tiff", "tiff"),
            ("webp", "webp"),
        ];
        for (subtype, expected_ext) in cases.iter() {
            assert_eq!(*expected_ext, map_mime_subtype_to_ext(subtype));
        }
    }
}
|