From 8407c613dfa1ee896b2e70aaca922b60a5fcaa72 Mon Sep 17 00:00:00 2001
From: Kenneth Gitere
Date: Thu, 24 Dec 2020 12:16:30 +0300
Subject: [PATCH] Bug fixes

- Prevent downloading images with base64 strings as the source
- Add escaping of quotation characters in the serializer
- Disable redirects when downloading images which fails on multiple sites
- Remove invalid characters for making the epub export file name
- Fix version number in release
---
 Cargo.lock       |  2 +-
 Cargo.toml       |  2 +-
 src/cli.rs       |  2 +-
 src/extractor.rs | 11 ++++++++---
 src/main.rs      | 11 +++++++++--
 5 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 01c1426..e1d51de 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1242,7 +1242,7 @@ dependencies = [
 
 [[package]]
 name = "paperoni"
-version = "0.2.0-alpha1"
+version = "0.2.1-alpha1"
 dependencies = [
  "async-std",
  "clap",
diff --git a/Cargo.toml b/Cargo.toml
index 7559efa..c4c64ac 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,7 +3,7 @@ description = "A web article downloader"
 homepage = "https://github.com/hipstermojo/paperoni"
 repository = "https://github.com/hipstermojo/paperoni"
 name = "paperoni"
-version = "0.2.0-alpha1"
+version = "0.2.1-alpha1"
 authors = ["Kenneth Gitere "]
 edition = "2018"
 license = "MIT"
diff --git a/src/cli.rs b/src/cli.rs
index f62b0c5..92b56f4 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -6,7 +6,7 @@ pub fn cli_init() -> App<'static, 'static> {
             AppSettings::ArgRequiredElseHelp,
             AppSettings::UnifiedHelpMessage,
         ])
-        .version("0.1.0-alpha1")
+        .version("0.2.1-alpha1")
         .about(
             "
 Paperoni is an article downloader.
diff --git a/src/extractor.rs b/src/extractor.rs
index ea7066f..9294ae6 100644
--- a/src/extractor.rs
+++ b/src/extractor.rs
@@ -11,7 +11,7 @@ use crate::moz_readability::{MetaData, Readability};
 pub type ResourceInfo = (String, Option<String>);
 
 lazy_static! {
-    static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r"(&|<|>)").unwrap();
+    static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r#"(&|<|>|'|")"#).unwrap();
 }
 
 pub struct Extractor {
@@ -56,7 +56,7 @@ impl Extractor {
         for img_ref in content_ref.select("img").unwrap() {
             img_ref.as_node().as_element().map(|img_elem| {
                 img_elem.attributes.borrow().get("src").map(|img_url| {
-                    if !img_url.is_empty() {
+                    if !(img_url.is_empty() || img_url.starts_with("data:image")) {
                         self.img_urls.push((img_url.to_string(), None))
                     }
                 })
@@ -75,7 +75,9 @@ impl Extractor {
 
             async_download_tasks.push(task::spawn(async move {
                 let mut img_response = surf::Client::new()
-                    .with(surf::middleware::Redirect::default())
+                    // The middleware has been temporarily commented out because it happens
+                    // to affect downloading images when there is no redirecting
+                    // .with(surf::middleware::Redirect::default())
                     .get(&abs_url)
                     .await
                     .expect("Unable to retrieve file");
@@ -185,6 +187,8 @@ pub fn serialize_to_xhtml(
     escape_map.insert("<", "&lt;");
     escape_map.insert(">", "&gt;");
     escape_map.insert("&", "&amp;");
+    escape_map.insert("\"", "&quot;");
+    escape_map.insert("'", "&apos;");
     for edge in node_ref.traverse_inclusive() {
         match edge {
             kuchiki::iter::NodeEdge::Start(n) => match n.data() {
@@ -248,6 +252,7 @@ mod test {
                     <p>Some Lorem Ipsum text here</p>
                     <p>Observe this picture</p>
                     <img src="./img.jpg" alt="Random image">
+                    <img src="data:image/png;base64,..." alt="Base64 image">