From 282d2297541752e36a16fdca457cd1d559ea02dd Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Thu, 10 Jun 2021 20:16:31 +0300 Subject: [PATCH] fix: fix ordering issue with merged articles This commit adds the itertools crate which is used to dedup the Vec when downloading urls fix: fix error message feat: change the serif and mono fonts declarations --- Cargo.lock | 16 ++++++++++++++++ Cargo.toml | 1 + README.md | 4 ---- src/assets/writ.min.css | 2 +- src/cli.rs | 31 ++++++++++++++++--------------- src/errors.rs | 2 +- src/extractor.rs | 23 ++++++++++++++--------- 7 files changed, 49 insertions(+), 30 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c2bcea0..e1c29ad 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -758,6 +758,12 @@ dependencies = [ "dtoa", ] +[[package]] +name = "either" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" + [[package]] name = "encode_unicode" version = "0.3.6" @@ -1247,6 +1253,15 @@ dependencies = [ "waker-fn", ] +[[package]] +name = "itertools" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69ddb889f9d0d08a67338271fa9b62996bc788c7796a5c18cf057420aaed5eaf" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "0.4.7" @@ -1550,6 +1565,7 @@ dependencies = [ "futures", "html5ever", "indicatif", + "itertools", "kuchiki", "lazy_static", "log 0.4.14", diff --git a/Cargo.toml b/Cargo.toml index e3f6055..d8305fa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,6 +25,7 @@ flexi_logger = "0.17.1" futures = "0.3.15" html5ever = "0.25.1" indicatif = "0.16.2" +itertools = "0.10.1" kuchiki = "0.8.1" lazy_static = "1.4.0" log = "0.4.14" diff --git a/README.md b/README.md index d80bbe7..f38c741 100644 --- a/README.md +++ b/README.md @@ -118,10 +118,6 @@ into a single epub using the `merge` flag and specifying the output file. paperoni -f links.txt --merge out.epub ``` -### Recommended fonts - -The styling on the EPUB files comes from the [writ.css](https://github.com/causal-agent/writ) library. This uses Palatino as the serif font which you can get online for free. However, you can use whichever serif fonts you have installed. - ### Logging events Logging is disabled by default. This can be activated by either using the `-v` flag or `--log-to-file` flag. If the `--log-to-file` flag is passed the logs are sent to a file in the default Paperoni directory `.paperoni/logs` which is on your home directory. The `-v` flag configures the verbosity levels such that: diff --git a/src/assets/writ.min.css b/src/assets/writ.min.css index afef597..1c9c0b4 100644 --- a/src/assets/writ.min.css +++ b/src/assets/writ.min.css @@ -4,4 +4,4 @@ * Copyright © 2015, Curtis McEnroe * * https://cmcenroe.me/writ/LICENSE (ISC) - */dd,hr,ol ol,ol ul,ul ol,ul ul{margin:0}pre,table{overflow-x:auto}a,ins{text-decoration:none}html{font-family:Palatino,Georgia,Lucida Bright,Book Antiqua,serif;font-size:16px;line-height:1.5rem}code,kbd,pre,samp{font-family:Consolas,Liberation Mono,Menlo,Courier,monospace;font-size:.833rem;color:#111}kbd{font-weight:700}h1,h2,h3,h4,h5,h6,th{font-weight:400}h1{font-size:2.488em}h2{font-size:2.074em}h3{font-size:1.728em}h4{font-size:1.44em}h5{font-size:1.2em}h6{font-size:1em}small{font-size:.833em}h1,h2,h3{line-height:3rem}blockquote,dl,h1,h2,h3,h4,h5,h6,ol,p,pre,table,ul{margin:1.5rem 0 0}pre,table{margin-bottom:-1px}hr{border:none;padding:1.5rem 0 0}table{line-height:calc(1.5rem - 1px);width:100%;border-collapse:collapse}pre{margin-top:calc(1.5rem - 1px)}body{color:#222;margin:1.5rem 1ch}a,a code,header nav a:visited{color:#00e}a:visited,a:visited code{color:#60b}mark{color:inherit;background-color:#fe0}code,pre,samp,tfoot,thead{background-color:rgba(0,0,0,.05)}blockquote,ins,main aside{border:rgba(0,0,0,.05) solid}blockquote,main aside{border-width:0 0 0 .5ch}code,pre,samp{border:rgba(0,0,0,.1) solid}td,th{border:solid #dbdbdb}body>header{text-align:center}body>footer,main{display:block;max-width:78ch;margin:auto}main aside,main figure{float:right;margin:1.5rem 0 0 1ch}main aside{max-width:26ch;padding:0 0 0 .5ch}blockquote{margin-right:3ch;margin-left:1.5ch;padding:0 0 0 1ch}pre{border-width:1px;border-radius:2px;padding:0 .5ch}pre code{border:none;padding:0;background-color:transparent;white-space:inherit}code,ins,samp,td,th{border-width:1px}img{max-width:100%}dd,ol,ul{padding:0 0 0 3ch}ul>li{list-style-type:disc}li ul>li{list-style-type:circle}li li ul>li{list-style-type:square}ol>li{list-style-type:decimal}li ol>li{list-style-type:lower-roman}li li ol>li{list-style-type:lower-alpha}nav ul{padding:0;list-style-type:none}nav ul li{display:inline;padding-left:1ch;white-space:nowrap}nav ul li:first-child{padding-left:0}ins,mark{padding:1px}td,th{padding:0 .5ch}sub,sup{font-size:.75em;line-height:1em}code,samp{border-radius:2px;padding:.1em .2em;white-space:nowrap} + */dd,hr,ol ol,ol ul,ul ol,ul ul{margin:0}pre,table{overflow-x:auto}a,ins{text-decoration:none}html{font-family:Georgia,Lucida Bright,Book Antiqua,serif;font-size:16px;line-height:1.5rem}code,kbd,pre,samp{font-family:Fira Code,Liberation Mono,Menlo,Courier,monospace;font-size:.833rem;color:#111}kbd{font-weight:700}h1,h2,h3,h4,h5,h6,th{font-weight:400}h1{font-size:2.488em}h2{font-size:2.074em}h3{font-size:1.728em}h4{font-size:1.44em}h5{font-size:1.2em}h6{font-size:1em}small{font-size:.833em}h1,h2,h3{line-height:3rem}blockquote,dl,h1,h2,h3,h4,h5,h6,ol,p,pre,table,ul{margin:1.5rem 0 0}pre,table{margin-bottom:-1px}hr{border:none;padding:1.5rem 0 0}table{line-height:calc(1.5rem - 1px);width:100%;border-collapse:collapse}pre{margin-top:calc(1.5rem - 1px)}body{color:#222;margin:1.5rem 1ch}a,a code,header nav a:visited{color:#00e}a:visited,a:visited code{color:#60b}mark{color:inherit;background-color:#fe0}code,pre,samp,tfoot,thead{background-color:rgba(0,0,0,.05)}blockquote,ins,main aside{border:rgba(0,0,0,.05) solid}blockquote,main aside{border-width:0 0 0 .5ch}code,pre,samp{border:rgba(0,0,0,.1) solid}td,th{border:solid #dbdbdb}body>header{text-align:center}body>footer,main{display:block;max-width:78ch;margin:auto}main aside,main figure{float:right;margin:1.5rem 0 0 1ch}main aside{max-width:26ch;padding:0 0 0 .5ch}blockquote{margin-right:3ch;margin-left:1.5ch;padding:0 0 0 1ch}pre{border-width:1px;border-radius:2px;padding:0 .5ch}pre code{border:none;padding:0;background-color:transparent;white-space:inherit}code,ins,samp,td,th{border-width:1px}img{max-width:100%}dd,ol,ul{padding:0 0 0 3ch}ul>li{list-style-type:disc}li ul>li{list-style-type:circle}li li ul>li{list-style-type:square}ol>li{list-style-type:decimal}li ol>li{list-style-type:lower-roman}li li ol>li{list-style-type:lower-alpha}nav ul{padding:0;list-style-type:none}nav ul li{display:inline;padding-left:1ch;white-space:nowrap}nav ul li:first-child{padding-left:0}ins,mark{padding:1px}td,th{padding:0 .5ch}sub,sup{font-size:.75em;line-height:1em}code,samp{border-radius:2px;padding:.1em .2em;white-space:nowrap} diff --git a/src/cli.rs b/src/cli.rs index 22cc156..763898f 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -1,8 +1,9 @@ -use std::{collections::BTreeSet, fs, num::NonZeroUsize, path::Path}; +use std::{fs, num::NonZeroUsize, path::Path}; use chrono::{DateTime, Local}; use clap::{App, AppSettings, Arg, ArgMatches}; use flexi_logger::LevelFilter as LogLevel; +use itertools::Itertools; type Error = crate::errors::CliError; @@ -126,24 +127,24 @@ impl<'a> TryFrom> for AppConfig { }; let direct_urls = arg_matches .values_of("urls") - .and_then(|urls| urls.map(url_filter).collect::>>()); + .and_then(|urls| urls.map(url_filter).collect::>>()) + .unwrap_or(Vec::new()); let file_urls = arg_matches .value_of("file") .map(fs::read_to_string) .transpose()? - .and_then(|content| { - content - .lines() - .map(url_filter) - .collect::>>() - }); - match (direct_urls, file_urls) { - (Some(direct_urls), Some(file_urls)) => Ok(direct_urls - .union(&file_urls) - .map(ToOwned::to_owned) - .collect::>()), - (Some(urls), None) | (None, Some(urls)) => Ok(urls.into_iter().collect()), - (None, None) => Err(Error::NoUrls), + .and_then(|content| content.lines().map(url_filter).collect::>>()) + .unwrap_or(Vec::new()); + + let urls = [direct_urls, file_urls] + .concat() + .into_iter() + .unique() + .collect_vec(); + if !urls.is_empty() { + Ok(urls) + } else { + Err(Error::NoUrls) } }?) .max_conn(match arg_matches.value_of("max-conn") { diff --git a/src/errors.rs b/src/errors.rs index a479268..17ae34a 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -152,7 +152,7 @@ pub enum CliError { InvalidOutputPath(String), #[error("Wrong output directory")] WrongOutputDirectory, - #[error("Output directory not exists")] + #[error("Output directory does not exist")] OutputDirectoryNotExists, #[error("Unable to start logger!\n{0}")] LogError(#[from] LogError), diff --git a/src/extractor.rs b/src/extractor.rs index ef470d9..632f567 100644 --- a/src/extractor.rs +++ b/src/extractor.rs @@ -1,5 +1,6 @@ use std::collections::HashMap; +use itertools::Itertools; use kuchiki::{traits::*, NodeRef}; use crate::errors::PaperoniError; @@ -54,15 +55,19 @@ impl Extractor { /// Traverses the DOM tree of the content and retrieves the IMG URLs pub fn extract_img_urls(&mut self) { if let Some(content_ref) = &self.article { - for img_ref in content_ref.select("img").unwrap() { - img_ref.as_node().as_element().map(|img_elem| { - img_elem.attributes.borrow().get("src").map(|img_url| { - if !(img_url.is_empty() || img_url.starts_with("data:image")) { - self.img_urls.push((img_url.to_string(), None)) - } - }) - }); - } + self.img_urls = content_ref + .select("img") + .unwrap() + .filter_map(|img_ref| { + let attrs = img_ref.attributes.borrow(); + attrs + .get("src") + .filter(|val| !(val.is_empty() || val.starts_with("data:image"))) + .map(ToString::to_string) + }) + .unique() + .map(|val| (val, None)) + .collect(); } }