From 7e9dcfc2b77315fe96c049f097ba2dd075c21c86 Mon Sep 17 00:00:00 2001
From: Kenneth Gitere
Date: Sat, 17 Apr 2021 12:04:06 +0300
Subject: [PATCH] Add custom error types and ignore failed image downloads

Using this custom error type, many instances of unwrap are replaced with
mappings to errors that are then logged in main.rs. This stops paperoni
from crashing while downloading articles when an error is recoverable or
should not affect the other downloads.

As part of this, failed image downloads are now ignored and the original
image URLs are left intact.
---
 Cargo.lock       |   9 ++--
 Cargo.toml       |   1 +
 src/epub.rs      |  31 ++++++------
 src/errors.rs    |  61 +++++++++++++++++++++++
 src/extractor.rs |   3 +-
 src/http.rs      | 125 ++++++++++++++++++++++++++++------------------
 src/main.rs      |  20 ++++++--
 7 files changed, 178 insertions(+), 72 deletions(-)
 create mode 100644 src/errors.rs

diff --git a/Cargo.lock b/Cargo.lock
index 8466dbf..2c8d164 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1283,6 +1283,7 @@ dependencies = [
  "md5",
  "regex",
  "surf",
+ "thiserror",
  "url",
 ]
 
@@ -1960,18 +1961,18 @@ checksum = "8eaa81235c7058867fa8c0e7314f33dcce9c215f535d1913822a2b3f5e289f3c"
 
 [[package]]
 name = "thiserror"
-version = "1.0.22"
+version = "1.0.24"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0e9ae34b84616eedaaf1e9dd6026dbe00dcafa92aa0c8077cb69df1fcfe5e53e"
+checksum = "e0f4a65597094d4483ddaed134f409b2cb7c1beccf25201a9f73c719254fa98e"
 dependencies = [
  "thiserror-impl",
 ]
 
 [[package]]
 name = "thiserror-impl"
-version = "1.0.22"
+version = "1.0.24"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9ba20f23e85b10754cd195504aebf6a27e2e6cbe28c17778a0c930724628dd56"
+checksum = "7765189610d8241a44529806d6fd1f2e0a08734313a35d5b3a556f92b381f3c0"
 dependencies = [
  "proc-macro2",
  "quote",
diff --git a/Cargo.toml b/Cargo.toml
index 05660ed..451bf38 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -22,4 +22,5 @@ lazy_static = "1.4.0"
 md5 = "0.7.0"
 regex = "1.4.2"
 surf = "2.1.0"
+thiserror = "1.0.24"
 url = "2.2.0"
diff --git a/src/epub.rs b/src/epub.rs
index e6e0376..a7a8cbc 100644
--- a/src/epub.rs
+++ b/src/epub.rs
@@ -2,12 +2,18 @@ use std::fs::File;
 
 use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
 
-use crate::extractor::{self, Extractor};
+use crate::{
+    errors::PaperoniError,
+    extractor::{self, Extractor},
+};
 
-pub fn generate_epubs(articles: Vec<Extractor>, merged: Option<&String>) {
+pub fn generate_epubs(
+    articles: Vec<Extractor>,
+    merged: Option<&String>,
+) -> Result<(), PaperoniError> {
     match merged {
         Some(name) => {
-            let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
+            let mut epub = EpubBuilder::new(ZipLibrary::new()?)?;
             epub.inline_toc();
             epub = articles
                 .iter()
@@ -41,12 +47,12 @@ pub fn generate_epubs(articles: Vec<Extractor>, merged: Option<&String>) {
                     epub
                 });
             let mut out_file = File::create(&name).unwrap();
-            epub.generate(&mut out_file).unwrap();
+            epub.generate(&mut out_file)?;
             println!("Created {:?}", name);
         }
         None => {
             for article in articles {
-                let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
+                let mut epub = EpubBuilder::new(ZipLibrary::new()?)?;
                 let file_name = format!(
                     "{}.epub",
                     article
@@ -61,26 +67,23 @@ pub fn generate_epubs(articles: Vec<Extractor>, merged: Option<&String>) {
                     .expect("Unable to serialize to xhtml");
                 let html_str = std::str::from_utf8(&html_buf).unwrap();
                 if let Some(author) = article.metadata().byline() {
-                    epub.metadata("author", replace_metadata_value(author))
-                        .unwrap();
+                    epub.metadata("author", replace_metadata_value(author))?;
                 }
-                epub.metadata("title", replace_metadata_value(article.metadata().title()))
-                    .unwrap();
-                epub.add_content(EpubContent::new("index.xhtml", html_str.as_bytes()))
-                    .unwrap();
+                epub.metadata("title", replace_metadata_value(article.metadata().title()))?;
+                epub.add_content(EpubContent::new("index.xhtml", html_str.as_bytes()))?;
                 for img in article.img_urls {
                     let mut file_path = std::env::temp_dir();
                     file_path.push(&img.0);
 
                     let img_buf = File::open(&file_path).expect("Can't read file");
-                    epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap())
-                        .unwrap();
+                    epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap())?;
                 }
-                epub.generate(&mut out_file).unwrap();
+                epub.generate(&mut out_file)?;
                 println!("Created {:?}", file_name);
             }
         }
     }
+    Ok(())
 }
 
 /// Replaces characters that have to be escaped before adding to the epub's metadata
diff --git a/src/errors.rs b/src/errors.rs
new file mode 100644
index 0000000..f0b3d9c
--- /dev/null
+++ b/src/errors.rs
@@ -0,0 +1,61 @@
+use thiserror::Error;
+
+#[derive(Error, Debug)]
+pub enum ErrorKind {
+    #[error("[EpubError]: {0}")]
+    EpubError(String),
+    #[error("[HTTPError]: {0}")]
+    HTTPError(String),
+    #[error("[IOError]: {0}")]
+    IOError(String),
+}
+
+#[derive(Error, Debug)]
+#[error("{kind}")]
+pub struct PaperoniError {
+    article_link: Option<String>,
+    kind: ErrorKind,
+}
+
+impl PaperoniError {
+    pub fn with_kind(kind: ErrorKind) -> Self {
+        PaperoniError {
+            article_link: None,
+            kind,
+        }
+    }
+
+    pub fn set_article_link(&mut self, article_link: String) {
+        self.article_link = Some(article_link);
+    }
+}
+
+impl From<ErrorKind> for PaperoniError {
+    fn from(kind: ErrorKind) -> Self {
+        PaperoniError::with_kind(kind)
+    }
+}
+
+impl From<epub_builder::Error> for PaperoniError {
+    fn from(err: epub_builder::Error) -> Self {
+        PaperoniError::with_kind(ErrorKind::EpubError(err.description().to_owned()))
+    }
+}
+
+impl From<surf::Error> for PaperoniError {
+    fn from(err: surf::Error) -> Self {
+        PaperoniError::with_kind(ErrorKind::HTTPError(err.to_string()))
+    }
+}
+
+impl From<url::ParseError> for PaperoniError {
+    fn from(err: url::ParseError) -> Self {
+        PaperoniError::with_kind(ErrorKind::HTTPError(err.to_string()))
+    }
+}
+
+impl From<std::io::Error> for PaperoniError {
+    fn from(err: std::io::Error) -> Self {
+        PaperoniError::with_kind(ErrorKind::IOError(err.to_string()))
+    }
+}
diff --git a/src/extractor.rs b/src/extractor.rs
index 0fcc5e8..64b9a2a 100644
--- a/src/extractor.rs
+++ b/src/extractor.rs
@@ -2,6 +2,7 @@ use std::collections::HashMap;
 
 use kuchiki::{traits::*, NodeRef};
 
+use crate::errors::PaperoniError;
 use crate::moz_readability::{MetaData, Readability};
 
 pub type ResourceInfo = (String, Option<String>);
@@ -75,7 +76,7 @@ impl Extractor {
 pub fn serialize_to_xhtml<W: std::io::Write>(
     node_ref: &NodeRef,
     mut w: &mut W,
-) -> Result<(), Box<dyn std::error::Error>> {
+) -> Result<(), PaperoniError> {
     let mut escape_map = HashMap::new();
     escape_map.insert("<", "&lt;");
     escape_map.insert(">", "&gt;");
diff --git a/src/http.rs b/src/http.rs
index faf9428..9ff7ef8 100644
--- a/src/http.rs
+++ b/src/http.rs
@@ -3,13 +3,11 @@ use async_std::{fs::File, stream};
 use futures::StreamExt;
 use url::Url;
 
-use crate::extractor::Extractor;
+use crate::{errors::ErrorKind, errors::PaperoniError, extractor::Extractor};
 
 type HTMLResource = (String, String);
 
-pub async fn fetch_url(
-    url: &str,
-) -> Result<HTMLResource, Box<dyn std::error::Error + Send + Sync>> {
+pub async fn fetch_url(url: &str) -> Result<HTMLResource, PaperoniError> {
     let client = surf::Client::new();
     println!("Fetching...");
 
@@ -37,26 +35,28 @@ pub async fn fetch_url(
                 if mime.essence() == "text/html" {
                     return Ok((url.to_string(), res.body_string().await?));
                 } else {
-                    let msg = format!(
-                        "Invalid HTTP response. Received {} instead of text/html",
-                        mime.essence()
-                    )
-                    .into());
+                    let msg = format!(
+                        "Invalid HTTP response. Received {} instead of text/html",
+                        mime.essence()
+                    );
+
+                    return Err(ErrorKind::HTTPError(msg).into());
                 }
             } else {
-                return Err("Unknown HTTP response".into());
+                return Err(ErrorKind::HTTPError("Unknown HTTP response".to_owned()).into());
             }
         } else {
-            return Err(format!("Request failed: HTTP {}", res.status()).into());
+            let msg = format!("Request failed: HTTP {}", res.status());
+            return Err(ErrorKind::HTTPError(msg).into());
        }
    }
-    Err("Unable to fetch HTML".into())
+    Err(ErrorKind::HTTPError("Unable to fetch HTML".to_owned()).into())
 }
 
 pub async fn download_images(
     extractor: &mut Extractor,
     article_origin: &Url,
-) -> async_std::io::Result<()> {
+) -> Result<(), Vec<PaperoniError>> {
     if extractor.img_urls.len() > 0 {
         println!("Downloading images...");
     }
@@ -71,39 +71,56 @@ pub async fn download_images(
             )
         })
         .map(|(url, req)| async move {
-            let mut img_response = req.await.expect("Unable to retrieve image");
-            let img_content: Vec<u8> = img_response.body_bytes().await.unwrap();
-            let img_mime = img_response
-                .content_type()
-                .map(|mime| mime.essence().to_string());
-            let img_ext = img_response
-                .content_type()
-                .map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
-                .unwrap();
+            match req.await {
+                Ok(mut img_response) => {
+                    // let mut img_response = req.await.expect("Unable to retrieve image");
+                    let img_content: Vec<u8> = match img_response.body_bytes().await {
+                        Ok(bytes) => bytes,
+                        Err(e) => return Err(e.into()),
+                    };
+                    let img_mime = img_response
+                        .content_type()
+                        .map(|mime| mime.essence().to_string());
+                    let img_ext = match img_response
+                        .content_type()
+                        .map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
+                    {
+                        Some(mime_str) => mime_str,
+                        None => {
+                            return Err(ErrorKind::HTTPError(
+                                "Image has no Content-Type".to_owned(),
+                            )
+                            .into())
+                        }
+                    };
 
-            let mut img_path = std::env::temp_dir();
-            img_path.push(format!("{}.{}", hash_url(&url), &img_ext));
-            let mut img_file = File::create(&img_path)
-                .await
-                .expect("Unable to create file");
-            img_file
-                .write_all(&img_content)
-                .await
-                .expect("Unable to save to file");
+                    let mut img_path = std::env::temp_dir();
+                    img_path.push(format!("{}.{}", hash_url(&url), &img_ext));
+                    let mut img_file = match File::create(&img_path).await {
+                        Ok(file) => file,
+                        Err(e) => return Err(e.into()),
+                    };
+                    match img_file.write_all(&img_content).await {
+                        Ok(_) => (),
+                        Err(e) => return Err(e.into()),
+                    }
 
-            (
-                url,
-                img_path
-                    .file_name()
-                    .map(|os_str_name| {
-                        os_str_name
-                            .to_str()
-                            .expect("Unable to get image file name")
-                            .to_string()
-                    })
-                    .unwrap(),
-                img_mime,
-            )
+                    Ok((
+                        url,
+                        img_path
+                            .file_name()
+                            .map(|os_str_name| {
+                                os_str_name
+                                    .to_str()
+                                    .expect("Unable to get image file name")
+                                    .to_string()
+                            })
+                            .unwrap(),
+                        img_mime,
+                    ))
+                }
+                Err(e) => Err(e.into()),
+            }
         });
 
     // A utility closure used when update the value of an image source after downloading is successful
@@ -124,14 +141,24 @@ pub async fn download_images(
         (img_path, img_mime)
     };
 
-    extractor.img_urls = stream::from_iter(imgs_req_iter)
+    let imgs_req_iter = stream::from_iter(imgs_req_iter)
         .buffered(10)
-        .collect::<Vec<_>>()
-        .await
-        .into_iter()
-        .map(replace_existing_img_src)
-        .collect();
-    Ok(())
+        .collect::<Vec<Result<_, PaperoniError>>>()
+        .await;
+    let mut errors = Vec::new();
+    let mut replaced_imgs = Vec::new();
+    for img_req_result in imgs_req_iter {
+        match img_req_result {
+            Ok(img_req) => replaced_imgs.push(replace_existing_img_src(img_req)),
+            Err(e) => errors.push(e),
+        }
+    }
+    extractor.img_urls = replaced_imgs;
+    if errors.is_empty() {
+        Ok(())
+    } else {
+        Err(errors)
+    }
 }
 
 /// Handles getting the extension from a given MIME subtype.
diff --git a/src/main.rs b/src/main.rs
index 0467712..7ad2560 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -8,6 +8,7 @@ use url::Url;
 
 mod cli;
 mod epub;
+mod errors;
 mod extractor;
 /// This module is responsible for async HTTP calls for downloading
 /// the HTML content and images
@@ -41,9 +42,17 @@ fn download(app_config: AppConfig) {
 
                     if extractor.article().is_some() {
                         extractor.extract_img_urls();
-                        download_images(&mut extractor, &Url::parse(&url).unwrap())
-                            .await
-                            .expect("Unable to download images");
+
+                        if let Err(img_errors) =
+                            download_images(&mut extractor, &Url::parse(&url).unwrap()).await
+                        {
+                            eprintln!(
+                                "{} image{} failed to download for {}",
+                                img_errors.len(),
+                                if img_errors.len() > 1 { "s" } else { "" },
+                                url
+                            );
+                        }
                         articles.push(extractor);
                     }
                 }
@@ -52,5 +61,8 @@ fn download(app_config: AppConfig) {
         }
         articles
     });
-    generate_epubs(articles, app_config.merged());
+    match generate_epubs(articles, app_config.merged()) {
+        Ok(_) => (),
+        Err(e) => eprintln!("{}", e),
+    };
 }
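
Note on the pattern this patch relies on: each impl From<...> for PaperoniError is what
lets the new `?` operators compile, because `?` converts the underlying error through
From, and it is also what lets download_images collect one error per failed image instead
of panicking. Below is a minimal, self-contained sketch of that pattern. It is only an
illustration, not code from the patch: the single-field error, the fetch_image helper, and
the file names are made up, and std::io::Error stands in for the surf and epub-builder
errors handled above.

    use thiserror::Error;

    #[derive(Error, Debug)]
    #[error("[IOError]: {0}")]
    struct PaperoniError(String);

    // The From impl is what `?` uses to convert the underlying error.
    impl From<std::io::Error> for PaperoniError {
        fn from(err: std::io::Error) -> Self {
            PaperoniError(err.to_string())
        }
    }

    // Stand-in for a single image download; the real code awaits a surf request.
    fn fetch_image(path: &str) -> Result<Vec<u8>, PaperoniError> {
        Ok(std::fs::read(path)?) // io::Error -> PaperoniError via From
    }

    fn main() {
        // Collect failures instead of aborting, mirroring download_images.
        let errors: Vec<PaperoniError> = ["a.png", "b.png"]
            .iter()
            .filter_map(|path| fetch_image(path).err())
            .collect();
        if !errors.is_empty() {
            eprintln!("{} image(s) failed to download", errors.len());
        }
    }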