Bug fixes

- Prevent downloading images whose source is a base64 string (data URI)
- Add escaping of quotation characters in the serializer
- Disable redirects when downloading images, because the redirect handling fails on multiple sites
- Remove invalid characters when making the epub export file name
- Fix version number in release
2020-12-24 12:16:30 +03:00 · 2020-12-24 12:16:30 +03:00 · 8407c613df
commit 8407c613df
parent 3bfa82ba60
5 changed files with 20 additions and 8 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -1242,7 +1242,7 @@ dependencies = [

 [[package]]
 name = "paperoni"
-version = "0.2.0-alpha1"
+version = "0.2.1-alpha1"
 dependencies = [
 "async-std",
 "clap",
--- a/Cargo.toml
+++ b/Cargo.toml
@ -3,7 +3,7 @@ description = "A web article downloader"
 homepage = "https://github.com/hipstermojo/paperoni"
 repository = "https://github.com/hipstermojo/paperoni"
 name = "paperoni"
-version = "0.2.0-alpha1"
+version = "0.2.1-alpha1"
 authors = ["Kenneth Gitere <gitere81@gmail.com>"]
 edition = "2018"
 license = "MIT"
--- a/src/cli.rs
+++ b/src/cli.rs
@ -6,7 +6,7 @@ pub fn cli_init() -> App<'static, 'static> {
            AppSettings::ArgRequiredElseHelp,
            AppSettings::UnifiedHelpMessage,
        ])
-        .version("0.1.0-alpha1")
+        .version("0.2.1-alpha1")
        .about(
            "
 Paperoni is an article downloader.
--- a/src/extractor.rs
+++ b/src/extractor.rs
@ -11,7 +11,7 @@ use crate::moz_readability::{MetaData, Readability};
 pub type ResourceInfo = (String, Option<String>);

 lazy_static! {
-    static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r"(&|<|>)").unwrap();
+    static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r#"(&|<|>|'|")"#).unwrap();
 }

 pub struct Extractor {
@ -56,7 +56,7 @@ impl Extractor {
            for img_ref in content_ref.select("img").unwrap() {
                img_ref.as_node().as_element().map(|img_elem| {
                    img_elem.attributes.borrow().get("src").map(|img_url| {
-                        if !img_url.is_empty() {
+                        if !(img_url.is_empty() || img_url.starts_with("data:image")) {
                            self.img_urls.push((img_url.to_string(), None))
                        }
                    })
@ -75,7 +75,9 @@ impl Extractor {

            async_download_tasks.push(task::spawn(async move {
                let mut img_response = surf::Client::new()
-                    .with(surf::middleware::Redirect::default())
+                    // The middleware has been temporarily commented out because it happens
+                    // to affect downloading images when there is no redirecting
+                    // .with(surf::middleware::Redirect::default())
                    .get(&abs_url)
                    .await
                    .expect("Unable to retrieve file");
@ -185,6 +187,8 @@ pub fn serialize_to_xhtml<W: std::io::Write>(
    escape_map.insert("<", "&lt;");
    escape_map.insert(">", "&gt;");
    escape_map.insert("&", "&amp;");
+    escape_map.insert("\"", "&quot;");
+    escape_map.insert("'", "&apos;");
    for edge in node_ref.traverse_inclusive() {
        match edge {
            kuchiki::iter::NodeEdge::Start(n) => match n.data() {
@ -248,6 +252,7 @@ mod test {
                    <p>Some Lorem Ipsum text here</p>
                    <p>Observe this picture</p>
                    <img src="./img.jpg" alt="Random image">
+                    <img src="data:image/png;base64,lJGWEIUQOIQWIDYVIVEDYFOUYQFWD">
                </article>
                <footer>
                    <p>Made in HTML</p>
--- a/src/main.rs
+++ b/src/main.rs
@ -55,7 +55,14 @@ fn download(urls: Vec<String>) {
                    .download_images(&Url::parse(&url).unwrap())
                    .await
                    .expect("Unable to download images");
-                let file_name = format!("{}.epub", extractor.metadata().title());
+                let file_name = format!(
+                    "{}.epub",
+                    extractor
+                        .metadata()
+                        .title()
+                        .replace("/", " ")
+                        .replace("\\", " ")
+                );
                let mut out_file = File::create(&file_name).unwrap();
                let mut html_buf = Vec::new();
                extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf)
@ -68,7 +75,7 @@ fn download(urls: Vec<String>) {
                }
                epub.metadata("title", extractor.metadata().title().replace("&", "&amp;"))
                    .unwrap();
-                epub.add_content(EpubContent::new("code.xhtml", html_buf.as_bytes()))
+                epub.add_content(EpubContent::new("index.xhtml", html_buf.as_bytes()))
                    .unwrap();
                for img in extractor.img_urls {
                    let mut file_path = std::env::temp_dir();