Add custom error types and ignore failed image downloads

Using this custom error type, many instances of unwrap are replaced
with mapping to errors that are then logged in main.rs. This allows
paperoni to stop crashing when downloading articles in cases where the
errors are possibly recoverable or should not affect other downloads.

This subsequently introduces ignoring the failed image downloads
and instead leaving the original URLs intact.
This commit is contained in:
Kenneth Gitere 2021-04-17 12:04:06 +03:00
parent d6cbbe405b
commit 7e9dcfc2b7
7 changed files with 178 additions and 72 deletions

9
Cargo.lock generated
View file

@ -1283,6 +1283,7 @@ dependencies = [
"md5",
"regex",
"surf",
"thiserror",
"url",
]
@ -1960,18 +1961,18 @@ checksum = "8eaa81235c7058867fa8c0e7314f33dcce9c215f535d1913822a2b3f5e289f3c"
[[package]]
name = "thiserror"
version = "1.0.22"
version = "1.0.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0e9ae34b84616eedaaf1e9dd6026dbe00dcafa92aa0c8077cb69df1fcfe5e53e"
checksum = "e0f4a65597094d4483ddaed134f409b2cb7c1beccf25201a9f73c719254fa98e"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "1.0.22"
version = "1.0.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ba20f23e85b10754cd195504aebf6a27e2e6cbe28c17778a0c930724628dd56"
checksum = "7765189610d8241a44529806d6fd1f2e0a08734313a35d5b3a556f92b381f3c0"
dependencies = [
"proc-macro2",
"quote",

View file

@ -22,4 +22,5 @@ lazy_static = "1.4.0"
md5 = "0.7.0"
regex = "1.4.2"
surf = "2.1.0"
thiserror = "1.0.24"
url = "2.2.0"

View file

@ -2,12 +2,18 @@ use std::fs::File;
use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
use crate::extractor::{self, Extractor};
use crate::{
errors::PaperoniError,
extractor::{self, Extractor},
};
pub fn generate_epubs(articles: Vec<Extractor>, merged: Option<&String>) {
pub fn generate_epubs(
articles: Vec<Extractor>,
merged: Option<&String>,
) -> Result<(), PaperoniError> {
match merged {
Some(name) => {
let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
let mut epub = EpubBuilder::new(ZipLibrary::new()?)?;
epub.inline_toc();
epub = articles
.iter()
@ -41,12 +47,12 @@ pub fn generate_epubs(articles: Vec<Extractor>, merged: Option<&String>) {
epub
});
let mut out_file = File::create(&name).unwrap();
epub.generate(&mut out_file).unwrap();
epub.generate(&mut out_file)?;
println!("Created {:?}", name);
}
None => {
for article in articles {
let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
let mut epub = EpubBuilder::new(ZipLibrary::new()?)?;
let file_name = format!(
"{}.epub",
article
@ -61,26 +67,23 @@ pub fn generate_epubs(articles: Vec<Extractor>, merged: Option<&String>) {
.expect("Unable to serialize to xhtml");
let html_str = std::str::from_utf8(&html_buf).unwrap();
if let Some(author) = article.metadata().byline() {
epub.metadata("author", replace_metadata_value(author))
.unwrap();
epub.metadata("author", replace_metadata_value(author))?;
}
epub.metadata("title", replace_metadata_value(article.metadata().title()))
.unwrap();
epub.add_content(EpubContent::new("index.xhtml", html_str.as_bytes()))
.unwrap();
epub.metadata("title", replace_metadata_value(article.metadata().title()))?;
epub.add_content(EpubContent::new("index.xhtml", html_str.as_bytes()))?;
for img in article.img_urls {
let mut file_path = std::env::temp_dir();
file_path.push(&img.0);
let img_buf = File::open(&file_path).expect("Can't read file");
epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap())
.unwrap();
epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap())?;
}
epub.generate(&mut out_file).unwrap();
epub.generate(&mut out_file)?;
println!("Created {:?}", file_name);
}
}
}
Ok(())
}
/// Replaces characters that have to be escaped before adding to the epub's metadata

61
src/errors.rs Normal file
View file

@ -0,0 +1,61 @@
use thiserror::Error;
/// The category of failure encountered while processing an article.
///
/// Each variant carries a human-readable message; the `#[error]` attribute
/// prefixes it with a bracketed tag naming the category when displayed.
#[derive(Error, Debug)]
pub enum ErrorKind {
    /// Failure while building or writing an epub file.
    #[error("[EpubError]: {0}")]
    EpubError(String),
    /// Failure in an HTTP request/response (also used for URL parse errors).
    #[error("[HTTPError]: {0}")]
    HTTPError(String),
    /// Failure in local file I/O.
    #[error("[IOError]: {0}")]
    IOError(String),
}
/// An error raised while downloading or converting a single article.
///
/// Displays as its inner [`ErrorKind`]. Optionally records the link of the
/// article the error belongs to (set via `set_article_link`) so callers can
/// report which article failed.
#[derive(Error, Debug)]
#[error("{kind}")]
pub struct PaperoniError {
    // URL of the article this error relates to, if known.
    article_link: Option<String>,
    // What went wrong.
    kind: ErrorKind,
}
impl PaperoniError {
pub fn with_kind(kind: ErrorKind) -> Self {
PaperoniError {
article_link: None,
kind,
}
}
pub fn set_article_link(&mut self, article_link: String) {
self.article_link = Some(article_link);
}
}
impl From<ErrorKind> for PaperoniError {
fn from(kind: ErrorKind) -> Self {
PaperoniError::with_kind(kind)
}
}
/// Converts epub-builder failures into the epub error category.
impl From<epub_builder::Error> for PaperoniError {
    fn from(err: epub_builder::Error) -> Self {
        // `Error::description()` is deprecated (and often lossier than the
        // `Display` output); `to_string()` renders the error via `Display`,
        // which every `std::error::Error` type is required to implement.
        PaperoniError::with_kind(ErrorKind::EpubError(err.to_string()))
    }
}
/// Converts HTTP client (surf) failures into the HTTP error category.
impl From<surf::Error> for PaperoniError {
    fn from(err: surf::Error) -> Self {
        let msg = err.to_string();
        ErrorKind::HTTPError(msg).into()
    }
}
/// URL parsing problems surface while resolving resources to fetch, so they
/// are reported under the HTTP error category.
impl From<url::ParseError> for PaperoniError {
    fn from(err: url::ParseError) -> Self {
        ErrorKind::HTTPError(err.to_string()).into()
    }
}
/// Converts local file-system failures into the I/O error category.
impl From<std::io::Error> for PaperoniError {
    fn from(err: std::io::Error) -> Self {
        let message = err.to_string();
        Self::with_kind(ErrorKind::IOError(message))
    }
}

View file

@ -2,6 +2,7 @@ use std::collections::HashMap;
use kuchiki::{traits::*, NodeRef};
use crate::errors::PaperoniError;
use crate::moz_readability::{MetaData, Readability};
pub type ResourceInfo = (String, Option<String>);
@ -75,7 +76,7 @@ impl Extractor {
pub fn serialize_to_xhtml<W: std::io::Write>(
node_ref: &NodeRef,
mut w: &mut W,
) -> Result<(), Box<dyn std::error::Error>> {
) -> Result<(), PaperoniError> {
let mut escape_map = HashMap::new();
escape_map.insert("<", "&lt;");
escape_map.insert(">", "&gt;");

View file

@ -3,13 +3,11 @@ use async_std::{fs::File, stream};
use futures::StreamExt;
use url::Url;
use crate::extractor::Extractor;
use crate::{errors::ErrorKind, errors::PaperoniError, extractor::Extractor};
type HTMLResource = (String, String);
pub async fn fetch_url(
url: &str,
) -> Result<HTMLResource, Box<dyn std::error::Error + Send + Sync>> {
pub async fn fetch_url(url: &str) -> Result<HTMLResource, PaperoniError> {
let client = surf::Client::new();
println!("Fetching...");
@ -37,26 +35,28 @@ pub async fn fetch_url(
if mime.essence() == "text/html" {
return Ok((url.to_string(), res.body_string().await?));
} else {
return Err(format!(
let msg = format!(
"Invalid HTTP response. Received {} instead of text/html",
mime.essence()
)
.into());
);
return Err(ErrorKind::HTTPError(msg).into());
}
} else {
return Err("Unknown HTTP response".into());
return Err(ErrorKind::HTTPError("Unknown HTTP response".to_owned()).into());
}
} else {
return Err(format!("Request failed: HTTP {}", res.status()).into());
let msg = format!("Request failed: HTTP {}", res.status());
return Err(ErrorKind::HTTPError(msg).into());
}
}
Err("Unable to fetch HTML".into())
Err(ErrorKind::HTTPError("Unable to fetch HTML".to_owned()).into())
}
pub async fn download_images(
extractor: &mut Extractor,
article_origin: &Url,
) -> async_std::io::Result<()> {
) -> Result<(), Vec<PaperoniError>> {
if extractor.img_urls.len() > 0 {
println!("Downloading images...");
}
@ -71,27 +71,41 @@ pub async fn download_images(
)
})
.map(|(url, req)| async move {
let mut img_response = req.await.expect("Unable to retrieve image");
let img_content: Vec<u8> = img_response.body_bytes().await.unwrap();
match req.await {
Ok(mut img_response) => {
// let mut img_response = req.await.expect("Unable to retrieve image");
let img_content: Vec<u8> = match img_response.body_bytes().await {
Ok(bytes) => bytes,
Err(e) => return Err(e.into()),
};
let img_mime = img_response
.content_type()
.map(|mime| mime.essence().to_string());
let img_ext = img_response
let img_ext = match img_response
.content_type()
.map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
.unwrap();
{
Some(mime_str) => mime_str,
None => {
return Err(ErrorKind::HTTPError(
"Image has no Content-Type".to_owned(),
)
.into())
}
};
let mut img_path = std::env::temp_dir();
img_path.push(format!("{}.{}", hash_url(&url), &img_ext));
let mut img_file = File::create(&img_path)
.await
.expect("Unable to create file");
img_file
.write_all(&img_content)
.await
.expect("Unable to save to file");
let mut img_file = match File::create(&img_path).await {
Ok(file) => file,
Err(e) => return Err(e.into()),
};
match img_file.write_all(&img_content).await {
Ok(_) => (),
Err(e) => return Err(e.into()),
}
(
Ok((
url,
img_path
.file_name()
@ -103,7 +117,10 @@ pub async fn download_images(
})
.unwrap(),
img_mime,
)
))
}
Err(e) => Err(e.into()),
}
});
// A utility closure used when updating the value of an image source after downloading is successful
@ -124,14 +141,24 @@ pub async fn download_images(
(img_path, img_mime)
};
extractor.img_urls = stream::from_iter(imgs_req_iter)
let imgs_req_iter = stream::from_iter(imgs_req_iter)
.buffered(10)
.collect::<Vec<_>>()
.await
.into_iter()
.map(replace_existing_img_src)
.collect();
.collect::<Vec<Result<_, PaperoniError>>>()
.await;
let mut errors = Vec::new();
let mut replaced_imgs = Vec::new();
for img_req_result in imgs_req_iter {
match img_req_result {
Ok(img_req) => replaced_imgs.push(replace_existing_img_src(img_req)),
Err(e) => errors.push(e),
}
}
extractor.img_urls = replaced_imgs;
if errors.is_empty() {
Ok(())
} else {
Err(errors)
}
}
/// Handles getting the extension from a given MIME subtype.

View file

@ -8,6 +8,7 @@ use url::Url;
mod cli;
mod epub;
mod errors;
mod extractor;
/// This module is responsible for async HTTP calls for downloading
/// the HTML content and images
@ -41,9 +42,17 @@ fn download(app_config: AppConfig) {
if extractor.article().is_some() {
extractor.extract_img_urls();
download_images(&mut extractor, &Url::parse(&url).unwrap())
.await
.expect("Unable to download images");
if let Err(img_errors) =
download_images(&mut extractor, &Url::parse(&url).unwrap()).await
{
eprintln!(
"{} image{} failed to download for {}",
img_errors.len(),
if img_errors.len() > 1 { "s" } else { "" },
url
);
}
articles.push(extractor);
}
}
@ -52,5 +61,8 @@ fn download(app_config: AppConfig) {
}
articles
});
generate_epubs(articles, app_config.merged());
match generate_epubs(articles, app_config.merged()) {
Ok(_) => (),
Err(e) => eprintln!("{}", e),
};
}