Add ImgError struct for representing errors in downloading article images
This commit is contained in:
parent
910c45abf7
commit
a3de3fb6ff
3 changed files with 110 additions and 47 deletions
|
@ -14,6 +14,53 @@ pub enum ErrorKind {
|
||||||
ReadabilityError(String),
|
ReadabilityError(String),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Error, Debug)]
|
||||||
|
#[error("{kind}")]
|
||||||
|
/// Used to represent errors from downloading images. Errors from here are used solely for debugging
|
||||||
|
/// as they are considered recoverable.
|
||||||
|
pub struct ImgError {
|
||||||
|
kind: ErrorKind,
|
||||||
|
url: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ImgError {
|
||||||
|
pub fn with_kind(kind: ErrorKind) -> Self {
|
||||||
|
ImgError { url: None, kind }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn set_url(&mut self, url: &str) {
|
||||||
|
self.url = Some(url.to_string());
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn url(&self) -> &Option<String> {
|
||||||
|
&self.url
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Wraps a bare [`ErrorKind`] in an [`ImgError`] that has no URL attached.
impl From<ErrorKind> for ImgError {
    fn from(kind: ErrorKind) -> Self {
        Self::with_kind(kind)
    }
}
|
||||||
|
|
||||||
|
impl From<surf::Error> for ImgError {
|
||||||
|
fn from(err: surf::Error) -> Self {
|
||||||
|
ImgError::with_kind(ErrorKind::HTTPError(err.to_string()))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<url::ParseError> for ImgError {
|
||||||
|
fn from(err: url::ParseError) -> Self {
|
||||||
|
ImgError::with_kind(ErrorKind::HTTPError(err.to_string()))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<std::io::Error> for ImgError {
|
||||||
|
fn from(err: std::io::Error) -> Self {
|
||||||
|
ImgError::with_kind(ErrorKind::IOError(err.to_string()))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Error, Debug)]
|
#[derive(Error, Debug)]
|
||||||
#[error("{kind}")]
|
#[error("{kind}")]
|
||||||
pub struct PaperoniError {
|
pub struct PaperoniError {
|
||||||
|
|
103
src/http.rs
103
src/http.rs
|
@ -5,8 +5,8 @@ use indicatif::ProgressBar;
|
||||||
use log::{debug, info};
|
use log::{debug, info};
|
||||||
use url::Url;
|
use url::Url;
|
||||||
|
|
||||||
use crate::{errors::ErrorKind, errors::PaperoniError, extractor::Extractor};
|
use crate::errors::{ErrorKind, ImgError, PaperoniError};
|
||||||
|
use crate::extractor::Extractor;
|
||||||
type HTMLResource = (String, String);
|
type HTMLResource = (String, String);
|
||||||
|
|
||||||
pub async fn fetch_html(url: &str) -> Result<HTMLResource, PaperoniError> {
|
pub async fn fetch_html(url: &str) -> Result<HTMLResource, PaperoniError> {
|
||||||
|
@ -76,7 +76,7 @@ pub async fn download_images(
|
||||||
extractor: &mut Extractor,
|
extractor: &mut Extractor,
|
||||||
article_origin: &Url,
|
article_origin: &Url,
|
||||||
bar: &ProgressBar,
|
bar: &ProgressBar,
|
||||||
) -> Result<(), Vec<PaperoniError>> {
|
) -> Result<(), Vec<ImgError>> {
|
||||||
if extractor.img_urls.len() > 0 {
|
if extractor.img_urls.len() > 0 {
|
||||||
debug!(
|
debug!(
|
||||||
"Downloading {} images for {}",
|
"Downloading {} images for {}",
|
||||||
|
@ -102,53 +102,62 @@ pub async fn download_images(
|
||||||
bar.set_message(format!("Downloading images [{}/{}]", img_idx + 1, img_count).as_str());
|
bar.set_message(format!("Downloading images [{}/{}]", img_idx + 1, img_count).as_str());
|
||||||
match req.await {
|
match req.await {
|
||||||
Ok(mut img_response) => {
|
Ok(mut img_response) => {
|
||||||
// let mut img_response = req.await.expect("Unable to retrieve image");
|
let process_response = async {
|
||||||
let img_content: Vec<u8> = match img_response.body_bytes().await {
|
let img_content: Vec<u8> = match img_response.body_bytes().await {
|
||||||
Ok(bytes) => bytes,
|
Ok(bytes) => bytes,
|
||||||
Err(e) => return Err(e.into()),
|
Err(e) => return Err(e.into()),
|
||||||
};
|
};
|
||||||
let img_mime = img_response
|
let img_mime = img_response
|
||||||
.content_type()
|
.content_type()
|
||||||
.map(|mime| mime.essence().to_string());
|
.map(|mime| mime.essence().to_string());
|
||||||
let img_ext = match img_response
|
let img_ext = match img_response
|
||||||
.content_type()
|
.content_type()
|
||||||
.map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
|
.map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
|
||||||
{
|
{
|
||||||
Some(mime_str) => mime_str,
|
Some(mime_str) => mime_str,
|
||||||
None => {
|
None => {
|
||||||
return Err(ErrorKind::HTTPError(
|
return Err(ErrorKind::HTTPError(
|
||||||
"Image has no Content-Type".to_owned(),
|
"Image has no Content-Type".to_owned(),
|
||||||
)
|
)
|
||||||
.into())
|
.into())
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut img_path = std::env::temp_dir();
|
||||||
|
img_path.push(format!("{}.{}", hash_url(&url), &img_ext));
|
||||||
|
let mut img_file = match File::create(&img_path).await {
|
||||||
|
Ok(file) => file,
|
||||||
|
Err(e) => return Err(e.into()),
|
||||||
|
};
|
||||||
|
match img_file.write_all(&img_content).await {
|
||||||
|
Ok(_) => (),
|
||||||
|
Err(e) => return Err(e.into()),
|
||||||
}
|
}
|
||||||
};
|
|
||||||
|
|
||||||
let mut img_path = std::env::temp_dir();
|
Ok((
|
||||||
img_path.push(format!("{}.{}", hash_url(&url), &img_ext));
|
url,
|
||||||
let mut img_file = match File::create(&img_path).await {
|
img_path
|
||||||
Ok(file) => file,
|
.file_name()
|
||||||
Err(e) => return Err(e.into()),
|
.map(|os_str_name| {
|
||||||
|
os_str_name
|
||||||
|
.to_str()
|
||||||
|
.expect("Unable to get image file name")
|
||||||
|
.to_string()
|
||||||
|
})
|
||||||
|
.unwrap(),
|
||||||
|
img_mime,
|
||||||
|
))
|
||||||
};
|
};
|
||||||
match img_file.write_all(&img_content).await {
|
process_response.await.map_err(|mut e: ImgError| {
|
||||||
Ok(_) => (),
|
e.set_url(url);
|
||||||
Err(e) => return Err(e.into()),
|
e
|
||||||
}
|
})
|
||||||
|
}
|
||||||
Ok((
|
Err(e) => {
|
||||||
url,
|
let mut img_err: ImgError = e.into();
|
||||||
img_path
|
img_err.set_url(url);
|
||||||
.file_name()
|
Err(img_err)
|
||||||
.map(|os_str_name| {
|
|
||||||
os_str_name
|
|
||||||
.to_str()
|
|
||||||
.expect("Unable to get image file name")
|
|
||||||
.to_string()
|
|
||||||
})
|
|
||||||
.unwrap(),
|
|
||||||
img_mime,
|
|
||||||
))
|
|
||||||
}
|
}
|
||||||
Err(e) => Err(e.into()),
|
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -170,7 +179,7 @@ pub async fn download_images(
|
||||||
|
|
||||||
let imgs_req_iter = stream::from_iter(imgs_req_iter)
|
let imgs_req_iter = stream::from_iter(imgs_req_iter)
|
||||||
.buffered(10)
|
.buffered(10)
|
||||||
.collect::<Vec<Result<_, PaperoniError>>>()
|
.collect::<Vec<Result<_, ImgError>>>()
|
||||||
.await;
|
.await;
|
||||||
let mut errors = Vec::new();
|
let mut errors = Vec::new();
|
||||||
let mut replaced_imgs = Vec::new();
|
let mut replaced_imgs = Vec::new();
|
||||||
|
|
|
@ -70,6 +70,13 @@ fn download(app_config: AppConfig) {
|
||||||
if img_errors.len() > 1 { "s" } else { "" },
|
if img_errors.len() > 1 { "s" } else { "" },
|
||||||
url
|
url
|
||||||
);
|
);
|
||||||
|
for img_error in img_errors {
|
||||||
|
warn!(
|
||||||
|
"{}\n\t\tReason {}",
|
||||||
|
img_error.url().as_ref().unwrap(),
|
||||||
|
img_error
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
articles.push(extractor);
|
articles.push(extractor);
|
||||||
}
|
}
|
||||||
|
|
Reference in a new issue