Add custom error types and ignore failed image downloads
Using this custom error type, many instances of unwrap are replaced with mappings to errors that are then logged in main.rs. This stops paperoni from crashing while downloading articles when an error is possibly recoverable or should not affect other downloads. As a consequence, failed image downloads are now ignored and the original image URLs are left intact.
This commit is contained in:
parent
d6cbbe405b
commit
7e9dcfc2b7
7 changed files with 178 additions and 72 deletions
9
Cargo.lock
generated
9
Cargo.lock
generated
|
@ -1283,6 +1283,7 @@ dependencies = [
|
||||||
"md5",
|
"md5",
|
||||||
"regex",
|
"regex",
|
||||||
"surf",
|
"surf",
|
||||||
|
"thiserror",
|
||||||
"url",
|
"url",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -1960,18 +1961,18 @@ checksum = "8eaa81235c7058867fa8c0e7314f33dcce9c215f535d1913822a2b3f5e289f3c"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "thiserror"
|
name = "thiserror"
|
||||||
version = "1.0.22"
|
version = "1.0.24"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "0e9ae34b84616eedaaf1e9dd6026dbe00dcafa92aa0c8077cb69df1fcfe5e53e"
|
checksum = "e0f4a65597094d4483ddaed134f409b2cb7c1beccf25201a9f73c719254fa98e"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"thiserror-impl",
|
"thiserror-impl",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "thiserror-impl"
|
name = "thiserror-impl"
|
||||||
version = "1.0.22"
|
version = "1.0.24"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "9ba20f23e85b10754cd195504aebf6a27e2e6cbe28c17778a0c930724628dd56"
|
checksum = "7765189610d8241a44529806d6fd1f2e0a08734313a35d5b3a556f92b381f3c0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
|
|
|
@ -22,4 +22,5 @@ lazy_static = "1.4.0"
|
||||||
md5 = "0.7.0"
|
md5 = "0.7.0"
|
||||||
regex = "1.4.2"
|
regex = "1.4.2"
|
||||||
surf = "2.1.0"
|
surf = "2.1.0"
|
||||||
|
thiserror = "1.0.24"
|
||||||
url = "2.2.0"
|
url = "2.2.0"
|
||||||
|
|
31
src/epub.rs
31
src/epub.rs
|
@ -2,12 +2,18 @@ use std::fs::File;
|
||||||
|
|
||||||
use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
|
use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
|
||||||
|
|
||||||
use crate::extractor::{self, Extractor};
|
use crate::{
|
||||||
|
errors::PaperoniError,
|
||||||
|
extractor::{self, Extractor},
|
||||||
|
};
|
||||||
|
|
||||||
pub fn generate_epubs(articles: Vec<Extractor>, merged: Option<&String>) {
|
pub fn generate_epubs(
|
||||||
|
articles: Vec<Extractor>,
|
||||||
|
merged: Option<&String>,
|
||||||
|
) -> Result<(), PaperoniError> {
|
||||||
match merged {
|
match merged {
|
||||||
Some(name) => {
|
Some(name) => {
|
||||||
let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
|
let mut epub = EpubBuilder::new(ZipLibrary::new()?)?;
|
||||||
epub.inline_toc();
|
epub.inline_toc();
|
||||||
epub = articles
|
epub = articles
|
||||||
.iter()
|
.iter()
|
||||||
|
@ -41,12 +47,12 @@ pub fn generate_epubs(articles: Vec<Extractor>, merged: Option<&String>) {
|
||||||
epub
|
epub
|
||||||
});
|
});
|
||||||
let mut out_file = File::create(&name).unwrap();
|
let mut out_file = File::create(&name).unwrap();
|
||||||
epub.generate(&mut out_file).unwrap();
|
epub.generate(&mut out_file)?;
|
||||||
println!("Created {:?}", name);
|
println!("Created {:?}", name);
|
||||||
}
|
}
|
||||||
None => {
|
None => {
|
||||||
for article in articles {
|
for article in articles {
|
||||||
let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
|
let mut epub = EpubBuilder::new(ZipLibrary::new()?)?;
|
||||||
let file_name = format!(
|
let file_name = format!(
|
||||||
"{}.epub",
|
"{}.epub",
|
||||||
article
|
article
|
||||||
|
@ -61,26 +67,23 @@ pub fn generate_epubs(articles: Vec<Extractor>, merged: Option<&String>) {
|
||||||
.expect("Unable to serialize to xhtml");
|
.expect("Unable to serialize to xhtml");
|
||||||
let html_str = std::str::from_utf8(&html_buf).unwrap();
|
let html_str = std::str::from_utf8(&html_buf).unwrap();
|
||||||
if let Some(author) = article.metadata().byline() {
|
if let Some(author) = article.metadata().byline() {
|
||||||
epub.metadata("author", replace_metadata_value(author))
|
epub.metadata("author", replace_metadata_value(author))?;
|
||||||
.unwrap();
|
|
||||||
}
|
}
|
||||||
epub.metadata("title", replace_metadata_value(article.metadata().title()))
|
epub.metadata("title", replace_metadata_value(article.metadata().title()))?;
|
||||||
.unwrap();
|
epub.add_content(EpubContent::new("index.xhtml", html_str.as_bytes()))?;
|
||||||
epub.add_content(EpubContent::new("index.xhtml", html_str.as_bytes()))
|
|
||||||
.unwrap();
|
|
||||||
for img in article.img_urls {
|
for img in article.img_urls {
|
||||||
let mut file_path = std::env::temp_dir();
|
let mut file_path = std::env::temp_dir();
|
||||||
file_path.push(&img.0);
|
file_path.push(&img.0);
|
||||||
|
|
||||||
let img_buf = File::open(&file_path).expect("Can't read file");
|
let img_buf = File::open(&file_path).expect("Can't read file");
|
||||||
epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap())
|
epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap())?;
|
||||||
.unwrap();
|
|
||||||
}
|
}
|
||||||
epub.generate(&mut out_file).unwrap();
|
epub.generate(&mut out_file)?;
|
||||||
println!("Created {:?}", file_name);
|
println!("Created {:?}", file_name);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Replaces characters that have to be escaped before adding to the epub's metadata
|
/// Replaces characters that have to be escaped before adding to the epub's metadata
|
||||||
|
|
61
src/errors.rs
Normal file
61
src/errors.rs
Normal file
|
@ -0,0 +1,61 @@
|
||||||
|
use thiserror::Error;
|
||||||
|
|
||||||
|
#[derive(Error, Debug)]
|
||||||
|
pub enum ErrorKind {
|
||||||
|
#[error("[EpubError]: {0}")]
|
||||||
|
EpubError(String),
|
||||||
|
#[error("[HTTPError]: {0}")]
|
||||||
|
HTTPError(String),
|
||||||
|
#[error("[IOError]: {0}")]
|
||||||
|
IOError(String),
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Error, Debug)]
|
||||||
|
#[error("{kind}")]
|
||||||
|
pub struct PaperoniError {
|
||||||
|
article_link: Option<String>,
|
||||||
|
kind: ErrorKind,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PaperoniError {
|
||||||
|
pub fn with_kind(kind: ErrorKind) -> Self {
|
||||||
|
PaperoniError {
|
||||||
|
article_link: None,
|
||||||
|
kind,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn set_article_link(&mut self, article_link: String) {
|
||||||
|
self.article_link = Some(article_link);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<ErrorKind> for PaperoniError {
|
||||||
|
fn from(kind: ErrorKind) -> Self {
|
||||||
|
PaperoniError::with_kind(kind)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<epub_builder::Error> for PaperoniError {
|
||||||
|
fn from(err: epub_builder::Error) -> Self {
|
||||||
|
PaperoniError::with_kind(ErrorKind::EpubError(err.description().to_owned()))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<surf::Error> for PaperoniError {
|
||||||
|
fn from(err: surf::Error) -> Self {
|
||||||
|
PaperoniError::with_kind(ErrorKind::HTTPError(err.to_string()))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<url::ParseError> for PaperoniError {
|
||||||
|
fn from(err: url::ParseError) -> Self {
|
||||||
|
PaperoniError::with_kind(ErrorKind::HTTPError(err.to_string()))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<std::io::Error> for PaperoniError {
|
||||||
|
fn from(err: std::io::Error) -> Self {
|
||||||
|
PaperoniError::with_kind(ErrorKind::IOError(err.to_string()))
|
||||||
|
}
|
||||||
|
}
|
|
@ -2,6 +2,7 @@ use std::collections::HashMap;
|
||||||
|
|
||||||
use kuchiki::{traits::*, NodeRef};
|
use kuchiki::{traits::*, NodeRef};
|
||||||
|
|
||||||
|
use crate::errors::PaperoniError;
|
||||||
use crate::moz_readability::{MetaData, Readability};
|
use crate::moz_readability::{MetaData, Readability};
|
||||||
|
|
||||||
pub type ResourceInfo = (String, Option<String>);
|
pub type ResourceInfo = (String, Option<String>);
|
||||||
|
@ -75,7 +76,7 @@ impl Extractor {
|
||||||
pub fn serialize_to_xhtml<W: std::io::Write>(
|
pub fn serialize_to_xhtml<W: std::io::Write>(
|
||||||
node_ref: &NodeRef,
|
node_ref: &NodeRef,
|
||||||
mut w: &mut W,
|
mut w: &mut W,
|
||||||
) -> Result<(), Box<dyn std::error::Error>> {
|
) -> Result<(), PaperoniError> {
|
||||||
let mut escape_map = HashMap::new();
|
let mut escape_map = HashMap::new();
|
||||||
escape_map.insert("<", "<");
|
escape_map.insert("<", "<");
|
||||||
escape_map.insert(">", ">");
|
escape_map.insert(">", ">");
|
||||||
|
|
125
src/http.rs
125
src/http.rs
|
@ -3,13 +3,11 @@ use async_std::{fs::File, stream};
|
||||||
use futures::StreamExt;
|
use futures::StreamExt;
|
||||||
use url::Url;
|
use url::Url;
|
||||||
|
|
||||||
use crate::extractor::Extractor;
|
use crate::{errors::ErrorKind, errors::PaperoniError, extractor::Extractor};
|
||||||
|
|
||||||
type HTMLResource = (String, String);
|
type HTMLResource = (String, String);
|
||||||
|
|
||||||
pub async fn fetch_url(
|
pub async fn fetch_url(url: &str) -> Result<HTMLResource, PaperoniError> {
|
||||||
url: &str,
|
|
||||||
) -> Result<HTMLResource, Box<dyn std::error::Error + Send + Sync>> {
|
|
||||||
let client = surf::Client::new();
|
let client = surf::Client::new();
|
||||||
println!("Fetching...");
|
println!("Fetching...");
|
||||||
|
|
||||||
|
@ -37,26 +35,28 @@ pub async fn fetch_url(
|
||||||
if mime.essence() == "text/html" {
|
if mime.essence() == "text/html" {
|
||||||
return Ok((url.to_string(), res.body_string().await?));
|
return Ok((url.to_string(), res.body_string().await?));
|
||||||
} else {
|
} else {
|
||||||
return Err(format!(
|
let msg = format!(
|
||||||
"Invalid HTTP response. Received {} instead of text/html",
|
"Invalid HTTP response. Received {} instead of text/html",
|
||||||
mime.essence()
|
mime.essence()
|
||||||
)
|
);
|
||||||
.into());
|
|
||||||
|
return Err(ErrorKind::HTTPError(msg).into());
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
return Err("Unknown HTTP response".into());
|
return Err(ErrorKind::HTTPError("Unknown HTTP response".to_owned()).into());
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
return Err(format!("Request failed: HTTP {}", res.status()).into());
|
let msg = format!("Request failed: HTTP {}", res.status());
|
||||||
|
return Err(ErrorKind::HTTPError(msg).into());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Err("Unable to fetch HTML".into())
|
Err(ErrorKind::HTTPError("Unable to fetch HTML".to_owned()).into())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn download_images(
|
pub async fn download_images(
|
||||||
extractor: &mut Extractor,
|
extractor: &mut Extractor,
|
||||||
article_origin: &Url,
|
article_origin: &Url,
|
||||||
) -> async_std::io::Result<()> {
|
) -> Result<(), Vec<PaperoniError>> {
|
||||||
if extractor.img_urls.len() > 0 {
|
if extractor.img_urls.len() > 0 {
|
||||||
println!("Downloading images...");
|
println!("Downloading images...");
|
||||||
}
|
}
|
||||||
|
@ -71,39 +71,56 @@ pub async fn download_images(
|
||||||
)
|
)
|
||||||
})
|
})
|
||||||
.map(|(url, req)| async move {
|
.map(|(url, req)| async move {
|
||||||
let mut img_response = req.await.expect("Unable to retrieve image");
|
match req.await {
|
||||||
let img_content: Vec<u8> = img_response.body_bytes().await.unwrap();
|
Ok(mut img_response) => {
|
||||||
let img_mime = img_response
|
// let mut img_response = req.await.expect("Unable to retrieve image");
|
||||||
.content_type()
|
let img_content: Vec<u8> = match img_response.body_bytes().await {
|
||||||
.map(|mime| mime.essence().to_string());
|
Ok(bytes) => bytes,
|
||||||
let img_ext = img_response
|
Err(e) => return Err(e.into()),
|
||||||
.content_type()
|
};
|
||||||
.map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
|
let img_mime = img_response
|
||||||
.unwrap();
|
.content_type()
|
||||||
|
.map(|mime| mime.essence().to_string());
|
||||||
|
let img_ext = match img_response
|
||||||
|
.content_type()
|
||||||
|
.map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
|
||||||
|
{
|
||||||
|
Some(mime_str) => mime_str,
|
||||||
|
None => {
|
||||||
|
return Err(ErrorKind::HTTPError(
|
||||||
|
"Image has no Content-Type".to_owned(),
|
||||||
|
)
|
||||||
|
.into())
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
let mut img_path = std::env::temp_dir();
|
let mut img_path = std::env::temp_dir();
|
||||||
img_path.push(format!("{}.{}", hash_url(&url), &img_ext));
|
img_path.push(format!("{}.{}", hash_url(&url), &img_ext));
|
||||||
let mut img_file = File::create(&img_path)
|
let mut img_file = match File::create(&img_path).await {
|
||||||
.await
|
Ok(file) => file,
|
||||||
.expect("Unable to create file");
|
Err(e) => return Err(e.into()),
|
||||||
img_file
|
};
|
||||||
.write_all(&img_content)
|
match img_file.write_all(&img_content).await {
|
||||||
.await
|
Ok(_) => (),
|
||||||
.expect("Unable to save to file");
|
Err(e) => return Err(e.into()),
|
||||||
|
}
|
||||||
|
|
||||||
(
|
Ok((
|
||||||
url,
|
url,
|
||||||
img_path
|
img_path
|
||||||
.file_name()
|
.file_name()
|
||||||
.map(|os_str_name| {
|
.map(|os_str_name| {
|
||||||
os_str_name
|
os_str_name
|
||||||
.to_str()
|
.to_str()
|
||||||
.expect("Unable to get image file name")
|
.expect("Unable to get image file name")
|
||||||
.to_string()
|
.to_string()
|
||||||
})
|
})
|
||||||
.unwrap(),
|
.unwrap(),
|
||||||
img_mime,
|
img_mime,
|
||||||
)
|
))
|
||||||
|
}
|
||||||
|
Err(e) => Err(e.into()),
|
||||||
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
// A utility closure used when update the value of an image source after downloading is successful
|
// A utility closure used when update the value of an image source after downloading is successful
|
||||||
|
@ -124,14 +141,24 @@ pub async fn download_images(
|
||||||
(img_path, img_mime)
|
(img_path, img_mime)
|
||||||
};
|
};
|
||||||
|
|
||||||
extractor.img_urls = stream::from_iter(imgs_req_iter)
|
let imgs_req_iter = stream::from_iter(imgs_req_iter)
|
||||||
.buffered(10)
|
.buffered(10)
|
||||||
.collect::<Vec<_>>()
|
.collect::<Vec<Result<_, PaperoniError>>>()
|
||||||
.await
|
.await;
|
||||||
.into_iter()
|
let mut errors = Vec::new();
|
||||||
.map(replace_existing_img_src)
|
let mut replaced_imgs = Vec::new();
|
||||||
.collect();
|
for img_req_result in imgs_req_iter {
|
||||||
Ok(())
|
match img_req_result {
|
||||||
|
Ok(img_req) => replaced_imgs.push(replace_existing_img_src(img_req)),
|
||||||
|
Err(e) => errors.push(e),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
extractor.img_urls = replaced_imgs;
|
||||||
|
if errors.is_empty() {
|
||||||
|
Ok(())
|
||||||
|
} else {
|
||||||
|
Err(errors)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Handles getting the extension from a given MIME subtype.
|
/// Handles getting the extension from a given MIME subtype.
|
||||||
|
|
20
src/main.rs
20
src/main.rs
|
@ -8,6 +8,7 @@ use url::Url;
|
||||||
|
|
||||||
mod cli;
|
mod cli;
|
||||||
mod epub;
|
mod epub;
|
||||||
|
mod errors;
|
||||||
mod extractor;
|
mod extractor;
|
||||||
/// This module is responsible for async HTTP calls for downloading
|
/// This module is responsible for async HTTP calls for downloading
|
||||||
/// the HTML content and images
|
/// the HTML content and images
|
||||||
|
@ -41,9 +42,17 @@ fn download(app_config: AppConfig) {
|
||||||
|
|
||||||
if extractor.article().is_some() {
|
if extractor.article().is_some() {
|
||||||
extractor.extract_img_urls();
|
extractor.extract_img_urls();
|
||||||
download_images(&mut extractor, &Url::parse(&url).unwrap())
|
|
||||||
.await
|
if let Err(img_errors) =
|
||||||
.expect("Unable to download images");
|
download_images(&mut extractor, &Url::parse(&url).unwrap()).await
|
||||||
|
{
|
||||||
|
eprintln!(
|
||||||
|
"{} image{} failed to download for {}",
|
||||||
|
img_errors.len(),
|
||||||
|
if img_errors.len() > 1 { "s" } else { "" },
|
||||||
|
url
|
||||||
|
);
|
||||||
|
}
|
||||||
articles.push(extractor);
|
articles.push(extractor);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -52,5 +61,8 @@ fn download(app_config: AppConfig) {
|
||||||
}
|
}
|
||||||
articles
|
articles
|
||||||
});
|
});
|
||||||
generate_epubs(articles, app_config.merged());
|
match generate_epubs(articles, app_config.merged()) {
|
||||||
|
Ok(_) => (),
|
||||||
|
Err(e) => eprintln!("{}", e),
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
Reference in a new issue