Add custom error types and ignore failed image downloads
Using this custom error type, many instances of unwrap are replaced with mapping to errors that are then logged in main.rs. This allows paperoni to stop crashing when downloading articles when the errors are possibly recoverable or should not affect other downloads. This subsequently introduces ignoring the failed image downloads and instead leaving the original URLs intact.
This commit is contained in:
parent
d6cbbe405b
commit
7e9dcfc2b7
7 changed files with 178 additions and 72 deletions
9
Cargo.lock
generated
9
Cargo.lock
generated
|
@ -1283,6 +1283,7 @@ dependencies = [
|
|||
"md5",
|
||||
"regex",
|
||||
"surf",
|
||||
"thiserror",
|
||||
"url",
|
||||
]
|
||||
|
||||
|
@ -1960,18 +1961,18 @@ checksum = "8eaa81235c7058867fa8c0e7314f33dcce9c215f535d1913822a2b3f5e289f3c"
|
|||
|
||||
[[package]]
|
||||
name = "thiserror"
|
||||
version = "1.0.22"
|
||||
version = "1.0.24"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0e9ae34b84616eedaaf1e9dd6026dbe00dcafa92aa0c8077cb69df1fcfe5e53e"
|
||||
checksum = "e0f4a65597094d4483ddaed134f409b2cb7c1beccf25201a9f73c719254fa98e"
|
||||
dependencies = [
|
||||
"thiserror-impl",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror-impl"
|
||||
version = "1.0.22"
|
||||
version = "1.0.24"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9ba20f23e85b10754cd195504aebf6a27e2e6cbe28c17778a0c930724628dd56"
|
||||
checksum = "7765189610d8241a44529806d6fd1f2e0a08734313a35d5b3a556f92b381f3c0"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
|
|
|
@ -22,4 +22,5 @@ lazy_static = "1.4.0"
|
|||
md5 = "0.7.0"
|
||||
regex = "1.4.2"
|
||||
surf = "2.1.0"
|
||||
thiserror = "1.0.24"
|
||||
url = "2.2.0"
|
||||
|
|
31
src/epub.rs
31
src/epub.rs
|
@ -2,12 +2,18 @@ use std::fs::File;
|
|||
|
||||
use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
|
||||
|
||||
use crate::extractor::{self, Extractor};
|
||||
use crate::{
|
||||
errors::PaperoniError,
|
||||
extractor::{self, Extractor},
|
||||
};
|
||||
|
||||
pub fn generate_epubs(articles: Vec<Extractor>, merged: Option<&String>) {
|
||||
pub fn generate_epubs(
|
||||
articles: Vec<Extractor>,
|
||||
merged: Option<&String>,
|
||||
) -> Result<(), PaperoniError> {
|
||||
match merged {
|
||||
Some(name) => {
|
||||
let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
|
||||
let mut epub = EpubBuilder::new(ZipLibrary::new()?)?;
|
||||
epub.inline_toc();
|
||||
epub = articles
|
||||
.iter()
|
||||
|
@ -41,12 +47,12 @@ pub fn generate_epubs(articles: Vec<Extractor>, merged: Option<&String>) {
|
|||
epub
|
||||
});
|
||||
let mut out_file = File::create(&name).unwrap();
|
||||
epub.generate(&mut out_file).unwrap();
|
||||
epub.generate(&mut out_file)?;
|
||||
println!("Created {:?}", name);
|
||||
}
|
||||
None => {
|
||||
for article in articles {
|
||||
let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
|
||||
let mut epub = EpubBuilder::new(ZipLibrary::new()?)?;
|
||||
let file_name = format!(
|
||||
"{}.epub",
|
||||
article
|
||||
|
@ -61,26 +67,23 @@ pub fn generate_epubs(articles: Vec<Extractor>, merged: Option<&String>) {
|
|||
.expect("Unable to serialize to xhtml");
|
||||
let html_str = std::str::from_utf8(&html_buf).unwrap();
|
||||
if let Some(author) = article.metadata().byline() {
|
||||
epub.metadata("author", replace_metadata_value(author))
|
||||
.unwrap();
|
||||
epub.metadata("author", replace_metadata_value(author))?;
|
||||
}
|
||||
epub.metadata("title", replace_metadata_value(article.metadata().title()))
|
||||
.unwrap();
|
||||
epub.add_content(EpubContent::new("index.xhtml", html_str.as_bytes()))
|
||||
.unwrap();
|
||||
epub.metadata("title", replace_metadata_value(article.metadata().title()))?;
|
||||
epub.add_content(EpubContent::new("index.xhtml", html_str.as_bytes()))?;
|
||||
for img in article.img_urls {
|
||||
let mut file_path = std::env::temp_dir();
|
||||
file_path.push(&img.0);
|
||||
|
||||
let img_buf = File::open(&file_path).expect("Can't read file");
|
||||
epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap())
|
||||
.unwrap();
|
||||
epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap())?;
|
||||
}
|
||||
epub.generate(&mut out_file).unwrap();
|
||||
epub.generate(&mut out_file)?;
|
||||
println!("Created {:?}", file_name);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Replaces characters that have to be escaped before adding to the epub's metadata
|
||||
|
|
61
src/errors.rs
Normal file
61
src/errors.rs
Normal file
|
@ -0,0 +1,61 @@
|
|||
use thiserror::Error;
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum ErrorKind {
|
||||
#[error("[EpubError]: {0}")]
|
||||
EpubError(String),
|
||||
#[error("[HTTPError]: {0}")]
|
||||
HTTPError(String),
|
||||
#[error("[IOError]: {0}")]
|
||||
IOError(String),
|
||||
}
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
#[error("{kind}")]
|
||||
pub struct PaperoniError {
|
||||
article_link: Option<String>,
|
||||
kind: ErrorKind,
|
||||
}
|
||||
|
||||
impl PaperoniError {
|
||||
pub fn with_kind(kind: ErrorKind) -> Self {
|
||||
PaperoniError {
|
||||
article_link: None,
|
||||
kind,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn set_article_link(&mut self, article_link: String) {
|
||||
self.article_link = Some(article_link);
|
||||
}
|
||||
}
|
||||
|
||||
impl From<ErrorKind> for PaperoniError {
|
||||
fn from(kind: ErrorKind) -> Self {
|
||||
PaperoniError::with_kind(kind)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<epub_builder::Error> for PaperoniError {
|
||||
fn from(err: epub_builder::Error) -> Self {
|
||||
PaperoniError::with_kind(ErrorKind::EpubError(err.description().to_owned()))
|
||||
}
|
||||
}
|
||||
|
||||
impl From<surf::Error> for PaperoniError {
|
||||
fn from(err: surf::Error) -> Self {
|
||||
PaperoniError::with_kind(ErrorKind::HTTPError(err.to_string()))
|
||||
}
|
||||
}
|
||||
|
||||
impl From<url::ParseError> for PaperoniError {
|
||||
fn from(err: url::ParseError) -> Self {
|
||||
PaperoniError::with_kind(ErrorKind::HTTPError(err.to_string()))
|
||||
}
|
||||
}
|
||||
|
||||
impl From<std::io::Error> for PaperoniError {
|
||||
fn from(err: std::io::Error) -> Self {
|
||||
PaperoniError::with_kind(ErrorKind::IOError(err.to_string()))
|
||||
}
|
||||
}
|
|
@ -2,6 +2,7 @@ use std::collections::HashMap;
|
|||
|
||||
use kuchiki::{traits::*, NodeRef};
|
||||
|
||||
use crate::errors::PaperoniError;
|
||||
use crate::moz_readability::{MetaData, Readability};
|
||||
|
||||
pub type ResourceInfo = (String, Option<String>);
|
||||
|
@ -75,7 +76,7 @@ impl Extractor {
|
|||
pub fn serialize_to_xhtml<W: std::io::Write>(
|
||||
node_ref: &NodeRef,
|
||||
mut w: &mut W,
|
||||
) -> Result<(), Box<dyn std::error::Error>> {
|
||||
) -> Result<(), PaperoniError> {
|
||||
let mut escape_map = HashMap::new();
|
||||
escape_map.insert("<", "<");
|
||||
escape_map.insert(">", ">");
|
||||
|
|
125
src/http.rs
125
src/http.rs
|
@ -3,13 +3,11 @@ use async_std::{fs::File, stream};
|
|||
use futures::StreamExt;
|
||||
use url::Url;
|
||||
|
||||
use crate::extractor::Extractor;
|
||||
use crate::{errors::ErrorKind, errors::PaperoniError, extractor::Extractor};
|
||||
|
||||
type HTMLResource = (String, String);
|
||||
|
||||
pub async fn fetch_url(
|
||||
url: &str,
|
||||
) -> Result<HTMLResource, Box<dyn std::error::Error + Send + Sync>> {
|
||||
pub async fn fetch_url(url: &str) -> Result<HTMLResource, PaperoniError> {
|
||||
let client = surf::Client::new();
|
||||
println!("Fetching...");
|
||||
|
||||
|
@ -37,26 +35,28 @@ pub async fn fetch_url(
|
|||
if mime.essence() == "text/html" {
|
||||
return Ok((url.to_string(), res.body_string().await?));
|
||||
} else {
|
||||
return Err(format!(
|
||||
let msg = format!(
|
||||
"Invalid HTTP response. Received {} instead of text/html",
|
||||
mime.essence()
|
||||
)
|
||||
.into());
|
||||
);
|
||||
|
||||
return Err(ErrorKind::HTTPError(msg).into());
|
||||
}
|
||||
} else {
|
||||
return Err("Unknown HTTP response".into());
|
||||
return Err(ErrorKind::HTTPError("Unknown HTTP response".to_owned()).into());
|
||||
}
|
||||
} else {
|
||||
return Err(format!("Request failed: HTTP {}", res.status()).into());
|
||||
let msg = format!("Request failed: HTTP {}", res.status());
|
||||
return Err(ErrorKind::HTTPError(msg).into());
|
||||
}
|
||||
}
|
||||
Err("Unable to fetch HTML".into())
|
||||
Err(ErrorKind::HTTPError("Unable to fetch HTML".to_owned()).into())
|
||||
}
|
||||
|
||||
pub async fn download_images(
|
||||
extractor: &mut Extractor,
|
||||
article_origin: &Url,
|
||||
) -> async_std::io::Result<()> {
|
||||
) -> Result<(), Vec<PaperoniError>> {
|
||||
if extractor.img_urls.len() > 0 {
|
||||
println!("Downloading images...");
|
||||
}
|
||||
|
@ -71,39 +71,56 @@ pub async fn download_images(
|
|||
)
|
||||
})
|
||||
.map(|(url, req)| async move {
|
||||
let mut img_response = req.await.expect("Unable to retrieve image");
|
||||
let img_content: Vec<u8> = img_response.body_bytes().await.unwrap();
|
||||
let img_mime = img_response
|
||||
.content_type()
|
||||
.map(|mime| mime.essence().to_string());
|
||||
let img_ext = img_response
|
||||
.content_type()
|
||||
.map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
|
||||
.unwrap();
|
||||
match req.await {
|
||||
Ok(mut img_response) => {
|
||||
// let mut img_response = req.await.expect("Unable to retrieve image");
|
||||
let img_content: Vec<u8> = match img_response.body_bytes().await {
|
||||
Ok(bytes) => bytes,
|
||||
Err(e) => return Err(e.into()),
|
||||
};
|
||||
let img_mime = img_response
|
||||
.content_type()
|
||||
.map(|mime| mime.essence().to_string());
|
||||
let img_ext = match img_response
|
||||
.content_type()
|
||||
.map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
|
||||
{
|
||||
Some(mime_str) => mime_str,
|
||||
None => {
|
||||
return Err(ErrorKind::HTTPError(
|
||||
"Image has no Content-Type".to_owned(),
|
||||
)
|
||||
.into())
|
||||
}
|
||||
};
|
||||
|
||||
let mut img_path = std::env::temp_dir();
|
||||
img_path.push(format!("{}.{}", hash_url(&url), &img_ext));
|
||||
let mut img_file = File::create(&img_path)
|
||||
.await
|
||||
.expect("Unable to create file");
|
||||
img_file
|
||||
.write_all(&img_content)
|
||||
.await
|
||||
.expect("Unable to save to file");
|
||||
let mut img_path = std::env::temp_dir();
|
||||
img_path.push(format!("{}.{}", hash_url(&url), &img_ext));
|
||||
let mut img_file = match File::create(&img_path).await {
|
||||
Ok(file) => file,
|
||||
Err(e) => return Err(e.into()),
|
||||
};
|
||||
match img_file.write_all(&img_content).await {
|
||||
Ok(_) => (),
|
||||
Err(e) => return Err(e.into()),
|
||||
}
|
||||
|
||||
(
|
||||
url,
|
||||
img_path
|
||||
.file_name()
|
||||
.map(|os_str_name| {
|
||||
os_str_name
|
||||
.to_str()
|
||||
.expect("Unable to get image file name")
|
||||
.to_string()
|
||||
})
|
||||
.unwrap(),
|
||||
img_mime,
|
||||
)
|
||||
Ok((
|
||||
url,
|
||||
img_path
|
||||
.file_name()
|
||||
.map(|os_str_name| {
|
||||
os_str_name
|
||||
.to_str()
|
||||
.expect("Unable to get image file name")
|
||||
.to_string()
|
||||
})
|
||||
.unwrap(),
|
||||
img_mime,
|
||||
))
|
||||
}
|
||||
Err(e) => Err(e.into()),
|
||||
}
|
||||
});
|
||||
|
||||
// A utility closure used when update the value of an image source after downloading is successful
|
||||
|
@ -124,14 +141,24 @@ pub async fn download_images(
|
|||
(img_path, img_mime)
|
||||
};
|
||||
|
||||
extractor.img_urls = stream::from_iter(imgs_req_iter)
|
||||
let imgs_req_iter = stream::from_iter(imgs_req_iter)
|
||||
.buffered(10)
|
||||
.collect::<Vec<_>>()
|
||||
.await
|
||||
.into_iter()
|
||||
.map(replace_existing_img_src)
|
||||
.collect();
|
||||
Ok(())
|
||||
.collect::<Vec<Result<_, PaperoniError>>>()
|
||||
.await;
|
||||
let mut errors = Vec::new();
|
||||
let mut replaced_imgs = Vec::new();
|
||||
for img_req_result in imgs_req_iter {
|
||||
match img_req_result {
|
||||
Ok(img_req) => replaced_imgs.push(replace_existing_img_src(img_req)),
|
||||
Err(e) => errors.push(e),
|
||||
}
|
||||
}
|
||||
extractor.img_urls = replaced_imgs;
|
||||
if errors.is_empty() {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(errors)
|
||||
}
|
||||
}
|
||||
|
||||
/// Handles getting the extension from a given MIME subtype.
|
||||
|
|
20
src/main.rs
20
src/main.rs
|
@ -8,6 +8,7 @@ use url::Url;
|
|||
|
||||
mod cli;
|
||||
mod epub;
|
||||
mod errors;
|
||||
mod extractor;
|
||||
/// This module is responsible for async HTTP calls for downloading
|
||||
/// the HTML content and images
|
||||
|
@ -41,9 +42,17 @@ fn download(app_config: AppConfig) {
|
|||
|
||||
if extractor.article().is_some() {
|
||||
extractor.extract_img_urls();
|
||||
download_images(&mut extractor, &Url::parse(&url).unwrap())
|
||||
.await
|
||||
.expect("Unable to download images");
|
||||
|
||||
if let Err(img_errors) =
|
||||
download_images(&mut extractor, &Url::parse(&url).unwrap()).await
|
||||
{
|
||||
eprintln!(
|
||||
"{} image{} failed to download for {}",
|
||||
img_errors.len(),
|
||||
if img_errors.len() > 1 { "s" } else { "" },
|
||||
url
|
||||
);
|
||||
}
|
||||
articles.push(extractor);
|
||||
}
|
||||
}
|
||||
|
@ -52,5 +61,8 @@ fn download(app_config: AppConfig) {
|
|||
}
|
||||
articles
|
||||
});
|
||||
generate_epubs(articles, app_config.merged());
|
||||
match generate_epubs(articles, app_config.merged()) {
|
||||
Ok(_) => (),
|
||||
Err(e) => eprintln!("{}", e),
|
||||
};
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue