// paperoni/src/http.rs
use async_std::io::prelude::*;
use async_std::task;
use async_std::{fs::File, stream};
use futures::StreamExt;
use indicatif::ProgressBar;
use log::{debug, info, warn};
use url::Url;

use crate::cli::AppConfig;
use crate::errors::{ErrorKind, ImgError, PaperoniError};
use crate::extractor::Article;
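/// A fetched page: the resolved URL (after any redirects) paired with its raw HTML body.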
type HTMLResource = (String, String);
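// A rough usage sketch for `download` (illustrative only; it assumes an
// `AppConfig` already built by the CLI layer and relies on its `urls` and
// `max_conn` fields):
//
//     let bar = ProgressBar::new(app_config.urls.len() as u64);
//     let mut partial_downloads = Vec::new();
//     let mut errors = Vec::new();
//     let articles = download(&app_config, &bar, &mut partial_downloads, &mut errors);

/// Fetches every URL in `app_config` (at most `max_conn` requests in flight at
/// a time), extracts the article content and images from each page, and returns
/// the successfully extracted articles. Fetch and extraction failures are pushed
/// into `errors`; articles whose images did not all download are recorded in
/// `partial_downloads`.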
pub fn download(
app_config: &AppConfig,
bar: &ProgressBar,
partial_downloads: &mut Vec<PartialDownload>,
errors: &mut Vec<PaperoniError>,
) -> Vec<Article> {
task::block_on(async {
let urls_iter = app_config.urls.iter().map(|url| fetch_html(url));
let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn);
let mut articles = Vec::new();
while let Some(fetch_result) = responses.next().await {
match fetch_result {
Ok((url, html)) => {
debug!("Extracting {}", &url);
let mut extractor = Article::from_html(&html, &url);
bar.set_message("Extracting...");
match extractor.extract_content() {
Ok(_) => {
extractor.extract_img_urls();
if let Err(img_errors) =
download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar)
.await
{
partial_downloads
.push(PartialDownload::new(&url, extractor.metadata().title()));
warn!(
"{} image{} failed to download for {}",
img_errors.len(),
if img_errors.len() > 1 { "s" } else { "" },
url
);
for img_error in img_errors {
warn!(
"{}\n\t\tReason {}",
img_error.url().as_ref().unwrap(),
img_error
);
}
}
articles.push(extractor);
}
Err(mut e) => {
e.set_article_source(&url);
errors.push(e);
}
}
}
Err(e) => errors.push(e),
}
bar.inc(1);
}
articles
})
}
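// A rough usage sketch for `fetch_html` (illustrative URL; the caller must be
// inside an async context):
//
//     let (resolved_url, html) = fetch_html("https://example.com/article").await?;

/// Fetches a single page, manually following redirects (up to five requests in
/// total), and returns the final URL paired with the HTML body. Responses whose
/// Content-Type is not `text/html` are rejected.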
pub async fn fetch_html(url: &str) -> Result<HTMLResource, PaperoniError> {
let client = surf::Client::new();
debug!("Fetching {}", url);
let process_request = async {
let mut redirect_count: u8 = 0;
let base_url = Url::parse(&url)?;
let mut url = base_url.clone();
while redirect_count < 5 {
redirect_count += 1;
let req = surf::get(&url);
let mut res = client.send(req).await?;
if res.status().is_redirection() {
if let Some(location) = res.header(surf::http::headers::LOCATION) {
match Url::parse(location.last().as_str()) {
Ok(valid_url) => {
info!("Redirecting {} to {}", url, valid_url);
url = valid_url
}
Err(e) => match e {
url::ParseError::RelativeUrlWithoutBase => {
match base_url.join(location.last().as_str()) {
Ok(joined_url) => {
info!("Redirecting {} to {}", url, joined_url);
url = joined_url;
}
Err(e) => return Err(e.into()),
}
}
e => return Err(e.into()),
},
};
}
} else if res.status().is_success() {
if let Some(mime) = res.content_type() {
if mime.essence() == "text/html" {
debug!("Successfully fetched {}", url);
return Ok((url.to_string(), res.body_string().await?));
} else {
let msg = format!(
"Invalid HTTP response. Received {} instead of text/html",
mime.essence()
);
return Err(ErrorKind::HTTPError(msg).into());
}
} else {
return Err(ErrorKind::HTTPError("Unknown HTTP response".to_owned()).into());
}
} else {
let msg = format!("Request failed: HTTP {}", res.status());
return Err(ErrorKind::HTTPError(msg).into());
}
}
Err(ErrorKind::HTTPError("Unable to fetch HTML".to_owned()).into())
};
process_request.await.map_err(|mut error: PaperoniError| {
error.set_article_source(url);
error
})
}
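/// A downloaded image: its source URL, the local file name it was stored under,
/// and its MIME type (when the server reported one).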
type ImgItem<'a> = (&'a str, String, Option<String>);
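/// Validates that `img_response` is a successful response with an `image/*`
/// Content-Type, writes its body to a file in the system temp directory (named
/// after the MD5 hash of `url` plus the mapped extension), and returns the URL,
/// the local file name, and the MIME type.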
async fn process_img_response<'a>(
img_response: &mut surf::Response,
url: &'a str,
) -> Result<ImgItem<'a>, ImgError> {
if !img_response.status().is_success() {
let kind = ErrorKind::HTTPError(format!(
"Non-success HTTP status code ({})",
img_response.status()
));
return Err(ImgError::with_kind(kind));
}
let img_content: Vec<u8> = match img_response.body_bytes().await {
Ok(bytes) => bytes,
Err(e) => return Err(e.into()),
};
let img_mime = img_response
.content_type()
.map(|mime| mime.essence().to_string());
if let Some(mime_str) = &img_mime {
if !mime_str.starts_with("image/") {
return Err(ErrorKind::HTTPError(format!(
"Invalid image MIME type: {} for {}",
mime_str, url
))
.into());
}
}
let img_ext = match img_response
.content_type()
.map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
{
Some(mime_str) => mime_str,
None => return Err(ErrorKind::HTTPError("Image has no Content-Type".to_owned()).into()),
};
let mut img_path = std::env::temp_dir();
img_path.push(format!("{}.{}", hash_url(url), &img_ext));
let mut img_file = match File::create(&img_path).await {
Ok(file) => file,
Err(e) => return Err(e.into()),
};
match img_file.write_all(&img_content).await {
Ok(_) => (),
Err(e) => return Err(e.into()),
}
Ok((
url,
img_path
.file_name()
.map(|os_str_name| {
os_str_name
.to_str()
.expect("Unable to get image file name")
.to_string()
})
.unwrap(),
img_mime,
))
}
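/// Downloads the images referenced by the extracted article (at most 10
/// concurrently), saves them to the system temp directory, and rewrites each
/// matching `<img>` node's `src` attribute to point at the local file (dropping
/// `srcset`). Returns the individual `ImgError`s if any image fails to download.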
pub async fn download_images(
extractor: &mut Article,
article_origin: &Url,
bar: &ProgressBar,
) -> Result<(), Vec<ImgError>> {
if !extractor.img_urls.is_empty() {
debug!(
"Downloading {} images for {}",
extractor.img_urls.len(),
article_origin
);
}
let img_count = extractor.img_urls.len();
let imgs_req_iter = extractor
.img_urls
.iter()
.map(|(url, _)| {
(
url,
surf::Client::new()
.with(surf::middleware::Redirect::default())
.get(get_absolute_url(&url, article_origin)),
)
})
.enumerate()
.map(|(img_idx, (url, req))| async move {
bar.set_message(format!(
"Downloading images [{}/{}]",
img_idx + 1,
img_count
));
match req.await {
Ok(mut img_response) => {
let process_response =
process_img_response(&mut img_response, url.as_ref()).await;
process_response.map_err(|mut e: ImgError| {
e.set_url(url);
e
})
}
Err(e) => {
let mut img_err: ImgError = e.into();
img_err.set_url(url);
Err(img_err)
}
}
});
// A utility closure that updates an image's `src` attribute once its download succeeds
let replace_existing_img_src = |img_item: ImgItem| -> (String, Option<String>) {
let (img_url, img_path, img_mime) = img_item;
let img_ref = extractor
.node_ref()
.select_first(&format!("img[src='{}']", img_url))
.expect("Image node does not exist");
let mut img_node = img_ref.attributes.borrow_mut();
*img_node.get_mut("src").unwrap() = img_path.clone();
// srcset is removed because readers such as Foliate then fail to display
// the image already downloaded and stored in src
img_node.remove("srcset");
(img_path, img_mime)
};
let imgs_req_iter = stream::from_iter(imgs_req_iter)
.buffered(10)
.collect::<Vec<Result<_, ImgError>>>()
.await;
let mut errors = Vec::new();
let mut replaced_imgs = Vec::new();
for img_req_result in imgs_req_iter {
match img_req_result {
Ok(img_req) => replaced_imgs.push(replace_existing_img_src(img_req)),
Err(e) => errors.push(e),
}
}
extractor.img_urls = replaced_imgs;
if errors.is_empty() {
Ok(())
} else {
Err(errors)
}
}
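/// An article whose content was extracted but whose images did not all download
/// successfully.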
pub struct PartialDownload {
pub link: String,
pub title: String,
}
impl PartialDownload {
pub fn new(link: &str, title: &str) -> Self {
Self {
link: link.into(),
title: title.into(),
}
}
}
/// Maps an image MIME subtype to the file extension used when saving it to disk.
fn map_mime_subtype_to_ext(subtype: &str) -> &str {
    match subtype {
        "svg+xml" => "svg",
        "x-icon" => "ico",
        _ => subtype,
    }
}
/// Hashes a URL with MD5 so downloaded images can be stored locally under unique file names.
fn hash_url(url: &str) -> String {
format!("{:x}", md5::compute(url.as_bytes()))
}
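/// Resolves `url` against `request_url`: absolute URLs are returned unchanged,
/// root-relative paths are joined onto the request's scheme and host, and all
/// other paths are joined onto the request URL itself.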
fn get_absolute_url(url: &str, request_url: &Url) -> String {
if Url::parse(url).is_ok() {
url.to_owned()
} else if url.starts_with("/") {
Url::parse(&format!(
"{}://{}",
request_url.scheme(),
request_url.host_str().unwrap()
))
.unwrap()
.join(url)
.unwrap()
.into()
} else {
request_url.join(url).unwrap().into()
}
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn test_map_mime_type_to_ext() {
let mime_subtypes = vec![
"apng", "bmp", "gif", "x-icon", "jpeg", "png", "svg+xml", "tiff", "webp",
];
let exts = mime_subtypes
.into_iter()
.map(|mime_type| map_mime_subtype_to_ext(mime_type))
.collect::<Vec<_>>();
assert_eq!(
vec!["apng", "bmp", "gif", "ico", "jpeg", "png", "svg", "tiff", "webp"],
exts
);
}
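
    // The tests below are illustrative additions: they exercise `hash_url`
    // against well-known MD5 test vectors and `get_absolute_url` against a
    // made-up `example.com` article URL.
    #[test]
    fn test_hash_url() {
        // Standard MD5 test vectors, rendered as lowercase hex.
        assert_eq!("d41d8cd98f00b204e9800998ecf8427e", hash_url(""));
        assert_eq!("900150983cd24fb0d6963f7d28e17f72", hash_url("abc"));
    }

    #[test]
    fn test_get_absolute_url() {
        let request_url = Url::parse("https://example.com/posts/article-1").unwrap();

        // Already-absolute URLs are returned unchanged.
        assert_eq!(
            "https://cdn.example.com/img/a.png",
            get_absolute_url("https://cdn.example.com/img/a.png", &request_url)
        );
        // Root-relative paths are joined onto the request's scheme and host.
        assert_eq!(
            "https://example.com/img/a.png",
            get_absolute_url("/img/a.png", &request_url)
        );
        // Other relative paths are resolved against the request URL itself.
        assert_eq!(
            "https://example.com/posts/a.png",
            get_absolute_url("a.png", &request_url)
        );
    }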
}