From 8407c613dfa1ee896b2e70aaca922b60a5fcaa72 Mon Sep 17 00:00:00 2001
From: Kenneth Gitere
Date: Thu, 24 Dec 2020 12:16:30 +0300
Subject: [PATCH] Bug fixes

- Prevent downloading images with base64 strings as the source
- Add escaping of quotation characters in the serializer
- Disable redirects when downloading images which fails on multiple sites
- Remove invalid characters for making the epub export file name
- Fix version number in release
---
 Cargo.lock       |  2 +-
 Cargo.toml       |  2 +-
 src/cli.rs       |  2 +-
 src/extractor.rs | 11 ++++++++---
 src/main.rs      | 11 +++++++++--
 5 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 01c1426..e1d51de 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1242,7 +1242,7 @@ dependencies = [
 
 [[package]]
 name = "paperoni"
-version = "0.2.0-alpha1"
+version = "0.2.1-alpha1"
 dependencies = [
  "async-std",
  "clap",
diff --git a/Cargo.toml b/Cargo.toml
index 7559efa..c4c64ac 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,7 +3,7 @@ description = "A web article downloader"
 homepage = "https://github.com/hipstermojo/paperoni"
 repository = "https://github.com/hipstermojo/paperoni"
 name = "paperoni"
-version = "0.2.0-alpha1"
+version = "0.2.1-alpha1"
 authors = ["Kenneth Gitere "]
 edition = "2018"
 license = "MIT"
diff --git a/src/cli.rs b/src/cli.rs
index f62b0c5..92b56f4 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -6,7 +6,7 @@ pub fn cli_init() -> App<'static, 'static> {
             AppSettings::ArgRequiredElseHelp,
             AppSettings::UnifiedHelpMessage,
         ])
-        .version("0.1.0-alpha1")
+        .version("0.2.1-alpha1")
         .about(
             "
 Paperoni is an article downloader.
diff --git a/src/extractor.rs b/src/extractor.rs
index ea7066f..9294ae6 100644
--- a/src/extractor.rs
+++ b/src/extractor.rs
@@ -11,7 +11,7 @@ use crate::moz_readability::{MetaData, Readability};
 pub type ResourceInfo = (String, Option<String>);
 
 lazy_static! {
-    static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r"(&|<|>)").unwrap();
+    static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r#"(&|<|>|'|")"#).unwrap();
 }
 
 pub struct Extractor {
@@ -56,7 +56,7 @@ impl Extractor {
         for img_ref in content_ref.select("img").unwrap() {
             img_ref.as_node().as_element().map(|img_elem| {
                 img_elem.attributes.borrow().get("src").map(|img_url| {
-                    if !img_url.is_empty() {
+                    if !(img_url.is_empty() || img_url.starts_with("data:image")) {
                         self.img_urls.push((img_url.to_string(), None))
                     }
                 })
@@ -75,7 +75,9 @@ impl Extractor {
 
             async_download_tasks.push(task::spawn(async move {
                 let mut img_response = surf::Client::new()
-                    .with(surf::middleware::Redirect::default())
+                    // The middleware has been temporarily commented out because it happens
+                    // to affect downloading images when there is no redirecting
+                    // .with(surf::middleware::Redirect::default())
                     .get(&abs_url)
                     .await
                     .expect("Unable to retrieve file");
@@ -185,6 +187,8 @@ pub fn serialize_to_xhtml(
     escape_map.insert("<", "&lt;");
     escape_map.insert(">", "&gt;");
     escape_map.insert("&", "&amp;");
+    escape_map.insert("\"", "&quot;");
+    escape_map.insert("'", "&apos;");
     for edge in node_ref.traverse_inclusive() {
         match edge {
             kuchiki::iter::NodeEdge::Start(n) => match n.data() {
@@ -248,6 +252,7 @@ mod test {
                     <p>Some Lorem Ipsum text here</p>
                     <p>Observe this picture</p>
                     <img src="./img.jpg" alt="Random image">
+                    <img src="data:image/png;base64,..." alt="Base64 image">