Add custom error types and ignore failed image downloads

Using this custom error type, many instances of unwrap are replaced
with mapping to errors that are then logged in main.rs. This keeps
paperoni from crashing while downloading articles when the errors
are recoverable or should not affect other downloads.

This subsequently introduces ignoring the failed image downloads
and instead leaving the original URLs intact.
This commit is contained in:
Kenneth Gitere 2021-04-17 12:04:06 +03:00
parent d6cbbe405b
commit 7e9dcfc2b7
7 changed files with 178 additions and 72 deletions

9
Cargo.lock generated
View file

@ -1283,6 +1283,7 @@ dependencies = [
"md5", "md5",
"regex", "regex",
"surf", "surf",
"thiserror",
"url", "url",
] ]
@ -1960,18 +1961,18 @@ checksum = "8eaa81235c7058867fa8c0e7314f33dcce9c215f535d1913822a2b3f5e289f3c"
[[package]] [[package]]
name = "thiserror" name = "thiserror"
version = "1.0.22" version = "1.0.24"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0e9ae34b84616eedaaf1e9dd6026dbe00dcafa92aa0c8077cb69df1fcfe5e53e" checksum = "e0f4a65597094d4483ddaed134f409b2cb7c1beccf25201a9f73c719254fa98e"
dependencies = [ dependencies = [
"thiserror-impl", "thiserror-impl",
] ]
[[package]] [[package]]
name = "thiserror-impl" name = "thiserror-impl"
version = "1.0.22" version = "1.0.24"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ba20f23e85b10754cd195504aebf6a27e2e6cbe28c17778a0c930724628dd56" checksum = "7765189610d8241a44529806d6fd1f2e0a08734313a35d5b3a556f92b381f3c0"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",

View file

@ -22,4 +22,5 @@ lazy_static = "1.4.0"
md5 = "0.7.0" md5 = "0.7.0"
regex = "1.4.2" regex = "1.4.2"
surf = "2.1.0" surf = "2.1.0"
thiserror = "1.0.24"
url = "2.2.0" url = "2.2.0"

View file

@ -2,12 +2,18 @@ use std::fs::File;
use epub_builder::{EpubBuilder, EpubContent, ZipLibrary}; use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
use crate::extractor::{self, Extractor}; use crate::{
errors::PaperoniError,
extractor::{self, Extractor},
};
pub fn generate_epubs(articles: Vec<Extractor>, merged: Option<&String>) { pub fn generate_epubs(
articles: Vec<Extractor>,
merged: Option<&String>,
) -> Result<(), PaperoniError> {
match merged { match merged {
Some(name) => { Some(name) => {
let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap(); let mut epub = EpubBuilder::new(ZipLibrary::new()?)?;
epub.inline_toc(); epub.inline_toc();
epub = articles epub = articles
.iter() .iter()
@ -41,12 +47,12 @@ pub fn generate_epubs(articles: Vec<Extractor>, merged: Option<&String>) {
epub epub
}); });
let mut out_file = File::create(&name).unwrap(); let mut out_file = File::create(&name).unwrap();
epub.generate(&mut out_file).unwrap(); epub.generate(&mut out_file)?;
println!("Created {:?}", name); println!("Created {:?}", name);
} }
None => { None => {
for article in articles { for article in articles {
let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap(); let mut epub = EpubBuilder::new(ZipLibrary::new()?)?;
let file_name = format!( let file_name = format!(
"{}.epub", "{}.epub",
article article
@ -61,26 +67,23 @@ pub fn generate_epubs(articles: Vec<Extractor>, merged: Option<&String>) {
.expect("Unable to serialize to xhtml"); .expect("Unable to serialize to xhtml");
let html_str = std::str::from_utf8(&html_buf).unwrap(); let html_str = std::str::from_utf8(&html_buf).unwrap();
if let Some(author) = article.metadata().byline() { if let Some(author) = article.metadata().byline() {
epub.metadata("author", replace_metadata_value(author)) epub.metadata("author", replace_metadata_value(author))?;
.unwrap();
} }
epub.metadata("title", replace_metadata_value(article.metadata().title())) epub.metadata("title", replace_metadata_value(article.metadata().title()))?;
.unwrap(); epub.add_content(EpubContent::new("index.xhtml", html_str.as_bytes()))?;
epub.add_content(EpubContent::new("index.xhtml", html_str.as_bytes()))
.unwrap();
for img in article.img_urls { for img in article.img_urls {
let mut file_path = std::env::temp_dir(); let mut file_path = std::env::temp_dir();
file_path.push(&img.0); file_path.push(&img.0);
let img_buf = File::open(&file_path).expect("Can't read file"); let img_buf = File::open(&file_path).expect("Can't read file");
epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap()) epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap())?;
.unwrap();
} }
epub.generate(&mut out_file).unwrap(); epub.generate(&mut out_file)?;
println!("Created {:?}", file_name); println!("Created {:?}", file_name);
} }
} }
} }
Ok(())
} }
/// Replaces characters that have to be escaped before adding to the epub's metadata /// Replaces characters that have to be escaped before adding to the epub's metadata

61
src/errors.rs Normal file
View file

@ -0,0 +1,61 @@
use thiserror::Error;
/// Broad categories of failures paperoni can hit while fetching articles
/// and building epubs. Each variant carries a human-readable message;
/// the `#[error]` attribute renders it with a bracketed tag prefix.
#[derive(Error, Debug)]
pub enum ErrorKind {
    /// Failure while assembling or writing an EPUB file.
    #[error("[EpubError]: {0}")]
    EpubError(String),
    /// Failure in an HTTP request/response (also used for URL parse errors,
    /// per the `From<url::ParseError>` impl below).
    #[error("[HTTPError]: {0}")]
    HTTPError(String),
    /// Failure reading or writing the local filesystem.
    #[error("[IOError]: {0}")]
    IOError(String),
}
/// The error type surfaced to callers (logged in main.rs). Wraps an
/// `ErrorKind` and, optionally, the link of the article being processed
/// when the error occurred. `Display` delegates to the inner kind.
#[derive(Error, Debug)]
#[error("{kind}")]
pub struct PaperoniError {
    // URL of the article associated with this error, if known.
    article_link: Option<String>,
    // The underlying category and message of the failure.
    kind: ErrorKind,
}
impl PaperoniError {
pub fn with_kind(kind: ErrorKind) -> Self {
PaperoniError {
article_link: None,
kind,
}
}
pub fn set_article_link(&mut self, article_link: String) {
self.article_link = Some(article_link);
}
}
impl From<ErrorKind> for PaperoniError {
fn from(kind: ErrorKind) -> Self {
PaperoniError::with_kind(kind)
}
}
impl From<epub_builder::Error> for PaperoniError {
    /// Converts an epub-builder error into a `PaperoniError` of kind
    /// `EpubError`, capturing the error's display text.
    fn from(err: epub_builder::Error) -> Self {
        // `Error::description()` has been deprecated since Rust 1.42;
        // `to_string()` goes through the `Display` impl, which carries
        // the same (or richer) detail without the deprecation warning.
        PaperoniError::with_kind(ErrorKind::EpubError(err.to_string()))
    }
}
impl From<surf::Error> for PaperoniError {
    /// Maps an HTTP client error to the `HTTPError` kind.
    fn from(err: surf::Error) -> Self {
        ErrorKind::HTTPError(err.to_string()).into()
    }
}
impl From<url::ParseError> for PaperoniError {
    /// URL parse failures are reported under the `HTTPError` kind, since
    /// they occur while preparing network requests.
    fn from(err: url::ParseError) -> Self {
        Self::with_kind(ErrorKind::HTTPError(err.to_string()))
    }
}
impl From<std::io::Error> for PaperoniError {
    /// Maps a filesystem/IO error to the `IOError` kind.
    fn from(err: std::io::Error) -> Self {
        ErrorKind::IOError(err.to_string()).into()
    }
}

View file

@ -2,6 +2,7 @@ use std::collections::HashMap;
use kuchiki::{traits::*, NodeRef}; use kuchiki::{traits::*, NodeRef};
use crate::errors::PaperoniError;
use crate::moz_readability::{MetaData, Readability}; use crate::moz_readability::{MetaData, Readability};
pub type ResourceInfo = (String, Option<String>); pub type ResourceInfo = (String, Option<String>);
@ -75,7 +76,7 @@ impl Extractor {
pub fn serialize_to_xhtml<W: std::io::Write>( pub fn serialize_to_xhtml<W: std::io::Write>(
node_ref: &NodeRef, node_ref: &NodeRef,
mut w: &mut W, mut w: &mut W,
) -> Result<(), Box<dyn std::error::Error>> { ) -> Result<(), PaperoniError> {
let mut escape_map = HashMap::new(); let mut escape_map = HashMap::new();
escape_map.insert("<", "&lt;"); escape_map.insert("<", "&lt;");
escape_map.insert(">", "&gt;"); escape_map.insert(">", "&gt;");

View file

@ -3,13 +3,11 @@ use async_std::{fs::File, stream};
use futures::StreamExt; use futures::StreamExt;
use url::Url; use url::Url;
use crate::extractor::Extractor; use crate::{errors::ErrorKind, errors::PaperoniError, extractor::Extractor};
type HTMLResource = (String, String); type HTMLResource = (String, String);
pub async fn fetch_url( pub async fn fetch_url(url: &str) -> Result<HTMLResource, PaperoniError> {
url: &str,
) -> Result<HTMLResource, Box<dyn std::error::Error + Send + Sync>> {
let client = surf::Client::new(); let client = surf::Client::new();
println!("Fetching..."); println!("Fetching...");
@ -37,26 +35,28 @@ pub async fn fetch_url(
if mime.essence() == "text/html" { if mime.essence() == "text/html" {
return Ok((url.to_string(), res.body_string().await?)); return Ok((url.to_string(), res.body_string().await?));
} else { } else {
return Err(format!( let msg = format!(
"Invalid HTTP response. Received {} instead of text/html", "Invalid HTTP response. Received {} instead of text/html",
mime.essence() mime.essence()
) );
.into());
return Err(ErrorKind::HTTPError(msg).into());
} }
} else { } else {
return Err("Unknown HTTP response".into()); return Err(ErrorKind::HTTPError("Unknown HTTP response".to_owned()).into());
} }
} else { } else {
return Err(format!("Request failed: HTTP {}", res.status()).into()); let msg = format!("Request failed: HTTP {}", res.status());
return Err(ErrorKind::HTTPError(msg).into());
} }
} }
Err("Unable to fetch HTML".into()) Err(ErrorKind::HTTPError("Unable to fetch HTML".to_owned()).into())
} }
pub async fn download_images( pub async fn download_images(
extractor: &mut Extractor, extractor: &mut Extractor,
article_origin: &Url, article_origin: &Url,
) -> async_std::io::Result<()> { ) -> Result<(), Vec<PaperoniError>> {
if extractor.img_urls.len() > 0 { if extractor.img_urls.len() > 0 {
println!("Downloading images..."); println!("Downloading images...");
} }
@ -71,39 +71,56 @@ pub async fn download_images(
) )
}) })
.map(|(url, req)| async move { .map(|(url, req)| async move {
let mut img_response = req.await.expect("Unable to retrieve image"); match req.await {
let img_content: Vec<u8> = img_response.body_bytes().await.unwrap(); Ok(mut img_response) => {
let img_mime = img_response // let mut img_response = req.await.expect("Unable to retrieve image");
.content_type() let img_content: Vec<u8> = match img_response.body_bytes().await {
.map(|mime| mime.essence().to_string()); Ok(bytes) => bytes,
let img_ext = img_response Err(e) => return Err(e.into()),
.content_type() };
.map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string()) let img_mime = img_response
.unwrap(); .content_type()
.map(|mime| mime.essence().to_string());
let img_ext = match img_response
.content_type()
.map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
{
Some(mime_str) => mime_str,
None => {
return Err(ErrorKind::HTTPError(
"Image has no Content-Type".to_owned(),
)
.into())
}
};
let mut img_path = std::env::temp_dir(); let mut img_path = std::env::temp_dir();
img_path.push(format!("{}.{}", hash_url(&url), &img_ext)); img_path.push(format!("{}.{}", hash_url(&url), &img_ext));
let mut img_file = File::create(&img_path) let mut img_file = match File::create(&img_path).await {
.await Ok(file) => file,
.expect("Unable to create file"); Err(e) => return Err(e.into()),
img_file };
.write_all(&img_content) match img_file.write_all(&img_content).await {
.await Ok(_) => (),
.expect("Unable to save to file"); Err(e) => return Err(e.into()),
}
( Ok((
url, url,
img_path img_path
.file_name() .file_name()
.map(|os_str_name| { .map(|os_str_name| {
os_str_name os_str_name
.to_str() .to_str()
.expect("Unable to get image file name") .expect("Unable to get image file name")
.to_string() .to_string()
}) })
.unwrap(), .unwrap(),
img_mime, img_mime,
) ))
}
Err(e) => Err(e.into()),
}
}); });
// A utility closure used when update the value of an image source after downloading is successful // A utility closure used when update the value of an image source after downloading is successful
@ -124,14 +141,24 @@ pub async fn download_images(
(img_path, img_mime) (img_path, img_mime)
}; };
extractor.img_urls = stream::from_iter(imgs_req_iter) let imgs_req_iter = stream::from_iter(imgs_req_iter)
.buffered(10) .buffered(10)
.collect::<Vec<_>>() .collect::<Vec<Result<_, PaperoniError>>>()
.await .await;
.into_iter() let mut errors = Vec::new();
.map(replace_existing_img_src) let mut replaced_imgs = Vec::new();
.collect(); for img_req_result in imgs_req_iter {
Ok(()) match img_req_result {
Ok(img_req) => replaced_imgs.push(replace_existing_img_src(img_req)),
Err(e) => errors.push(e),
}
}
extractor.img_urls = replaced_imgs;
if errors.is_empty() {
Ok(())
} else {
Err(errors)
}
} }
/// Handles getting the extension from a given MIME subtype. /// Handles getting the extension from a given MIME subtype.

View file

@ -8,6 +8,7 @@ use url::Url;
mod cli; mod cli;
mod epub; mod epub;
mod errors;
mod extractor; mod extractor;
/// This module is responsible for async HTTP calls for downloading /// This module is responsible for async HTTP calls for downloading
/// the HTML content and images /// the HTML content and images
@ -41,9 +42,17 @@ fn download(app_config: AppConfig) {
if extractor.article().is_some() { if extractor.article().is_some() {
extractor.extract_img_urls(); extractor.extract_img_urls();
download_images(&mut extractor, &Url::parse(&url).unwrap())
.await if let Err(img_errors) =
.expect("Unable to download images"); download_images(&mut extractor, &Url::parse(&url).unwrap()).await
{
eprintln!(
"{} image{} failed to download for {}",
img_errors.len(),
if img_errors.len() > 1 { "s" } else { "" },
url
);
}
articles.push(extractor); articles.push(extractor);
} }
} }
@ -52,5 +61,8 @@ fn download(app_config: AppConfig) {
} }
articles articles
}); });
generate_epubs(articles, app_config.merged()); match generate_epubs(articles, app_config.merged()) {
Ok(_) => (),
Err(e) => eprintln!("{}", e),
};
} }