Add ImgError struct for representing errors in downloading article images

This commit is contained in:
Kenneth Gitere 2021-04-24 13:57:06 +03:00
parent 910c45abf7
commit a3de3fb6ff
3 changed files with 110 additions and 47 deletions

View file

@ -14,6 +14,53 @@ pub enum ErrorKind {
ReadabilityError(String), ReadabilityError(String),
} }
#[derive(Error, Debug)]
#[error("{kind}")]
/// Used to represent errors from downloading images. Errors from here are used solely for debugging
/// as they are considered recoverable.
///
/// The `#[error("{kind}")]` attribute makes the displayed message that of the `kind` field; the
/// `url` field is not part of the displayed message and must be read through the accessor.
pub struct ImgError {
// Category of the failure, together with its message.
kind: ErrorKind,
// URL of the image the error relates to; `None` until one has been attached.
url: Option<String>,
}
impl ImgError {
pub fn with_kind(kind: ErrorKind) -> Self {
ImgError { url: None, kind }
}
pub fn set_url(&mut self, url: &str) {
self.url = Some(url.to_string());
}
pub fn url(&self) -> &Option<String> {
&self.url
}
}
impl From<ErrorKind> for ImgError {
fn from(kind: ErrorKind) -> Self {
ImgError::with_kind(kind)
}
}
impl From<surf::Error> for ImgError {
fn from(err: surf::Error) -> Self {
ImgError::with_kind(ErrorKind::HTTPError(err.to_string()))
}
}
impl From<url::ParseError> for ImgError {
    /// Converts a URL parse error into an `ImgError`, keeping only the
    /// error's string representation. No URL is attached.
    fn from(parse_err: url::ParseError) -> Self {
        // NOTE(review): parse failures are reported under `HTTPError`; confirm
        // that sharing the variant with network errors is intended.
        let message = parse_err.to_string();
        Self::with_kind(ErrorKind::HTTPError(message))
    }
}
impl From<std::io::Error> for ImgError {
    /// Converts an I/O error into an `ImgError` of kind `IOError`, keeping
    /// only the error's string representation. No URL is attached.
    fn from(io_err: std::io::Error) -> Self {
        let message = io_err.to_string();
        Self::with_kind(ErrorKind::IOError(message))
    }
}
#[derive(Error, Debug)] #[derive(Error, Debug)]
#[error("{kind}")] #[error("{kind}")]
pub struct PaperoniError { pub struct PaperoniError {

View file

@ -5,8 +5,8 @@ use indicatif::ProgressBar;
use log::{debug, info}; use log::{debug, info};
use url::Url; use url::Url;
use crate::{errors::ErrorKind, errors::PaperoniError, extractor::Extractor}; use crate::errors::{ErrorKind, ImgError, PaperoniError};
use crate::extractor::Extractor;
type HTMLResource = (String, String); type HTMLResource = (String, String);
pub async fn fetch_html(url: &str) -> Result<HTMLResource, PaperoniError> { pub async fn fetch_html(url: &str) -> Result<HTMLResource, PaperoniError> {
@ -76,7 +76,7 @@ pub async fn download_images(
extractor: &mut Extractor, extractor: &mut Extractor,
article_origin: &Url, article_origin: &Url,
bar: &ProgressBar, bar: &ProgressBar,
) -> Result<(), Vec<PaperoniError>> { ) -> Result<(), Vec<ImgError>> {
if extractor.img_urls.len() > 0 { if extractor.img_urls.len() > 0 {
debug!( debug!(
"Downloading {} images for {}", "Downloading {} images for {}",
@ -102,53 +102,62 @@ pub async fn download_images(
bar.set_message(format!("Downloading images [{}/{}]", img_idx + 1, img_count).as_str()); bar.set_message(format!("Downloading images [{}/{}]", img_idx + 1, img_count).as_str());
match req.await { match req.await {
Ok(mut img_response) => { Ok(mut img_response) => {
// let mut img_response = req.await.expect("Unable to retrieve image"); let process_response = async {
let img_content: Vec<u8> = match img_response.body_bytes().await { let img_content: Vec<u8> = match img_response.body_bytes().await {
Ok(bytes) => bytes, Ok(bytes) => bytes,
Err(e) => return Err(e.into()), Err(e) => return Err(e.into()),
}; };
let img_mime = img_response let img_mime = img_response
.content_type() .content_type()
.map(|mime| mime.essence().to_string()); .map(|mime| mime.essence().to_string());
let img_ext = match img_response let img_ext = match img_response
.content_type() .content_type()
.map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string()) .map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
{ {
Some(mime_str) => mime_str, Some(mime_str) => mime_str,
None => { None => {
return Err(ErrorKind::HTTPError( return Err(ErrorKind::HTTPError(
"Image has no Content-Type".to_owned(), "Image has no Content-Type".to_owned(),
) )
.into()) .into())
}
};
let mut img_path = std::env::temp_dir();
img_path.push(format!("{}.{}", hash_url(&url), &img_ext));
let mut img_file = match File::create(&img_path).await {
Ok(file) => file,
Err(e) => return Err(e.into()),
};
match img_file.write_all(&img_content).await {
Ok(_) => (),
Err(e) => return Err(e.into()),
} }
};
let mut img_path = std::env::temp_dir(); Ok((
img_path.push(format!("{}.{}", hash_url(&url), &img_ext)); url,
let mut img_file = match File::create(&img_path).await { img_path
Ok(file) => file, .file_name()
Err(e) => return Err(e.into()), .map(|os_str_name| {
os_str_name
.to_str()
.expect("Unable to get image file name")
.to_string()
})
.unwrap(),
img_mime,
))
}; };
match img_file.write_all(&img_content).await { process_response.await.map_err(|mut e: ImgError| {
Ok(_) => (), e.set_url(url);
Err(e) => return Err(e.into()), e
} })
}
Ok(( Err(e) => {
url, let mut img_err: ImgError = e.into();
img_path img_err.set_url(url);
.file_name() Err(img_err)
.map(|os_str_name| {
os_str_name
.to_str()
.expect("Unable to get image file name")
.to_string()
})
.unwrap(),
img_mime,
))
} }
Err(e) => Err(e.into()),
} }
}); });
@ -170,7 +179,7 @@ pub async fn download_images(
let imgs_req_iter = stream::from_iter(imgs_req_iter) let imgs_req_iter = stream::from_iter(imgs_req_iter)
.buffered(10) .buffered(10)
.collect::<Vec<Result<_, PaperoniError>>>() .collect::<Vec<Result<_, ImgError>>>()
.await; .await;
let mut errors = Vec::new(); let mut errors = Vec::new();
let mut replaced_imgs = Vec::new(); let mut replaced_imgs = Vec::new();

View file

@ -70,6 +70,13 @@ fn download(app_config: AppConfig) {
if img_errors.len() > 1 { "s" } else { "" }, if img_errors.len() > 1 { "s" } else { "" },
url url
); );
for img_error in img_errors {
warn!(
"{}\n\t\tReason {}",
img_error.url().as_ref().unwrap(),
img_error
);
}
} }
articles.push(extractor); articles.push(extractor);
} }