Bug fixes

- Prevent downloading images whose source is a base64 data URI
- Add escaping of quotation characters in the serializer
- Disable redirects when downloading images, since the redirect middleware caused failures on multiple sites
- Remove invalid characters when building the EPUB export file name
- Fix the version number in the release

Kenneth Gitere 2020-12-24 12:16:30 +03:00
parent 3bfa82ba60
commit 8407c613df
5 changed files with 20 additions and 8 deletions

Cargo.lock (generated)

@@ -1242,7 +1242,7 @@ dependencies = [
 [[package]]
 name = "paperoni"
-version = "0.2.0-alpha1"
+version = "0.2.1-alpha1"
 dependencies = [
  "async-std",
  "clap",

Cargo.toml

@@ -3,7 +3,7 @@ description = "A web article downloader"
 homepage = "https://github.com/hipstermojo/paperoni"
 repository = "https://github.com/hipstermojo/paperoni"
 name = "paperoni"
-version = "0.2.0-alpha1"
+version = "0.2.1-alpha1"
 authors = ["Kenneth Gitere <gitere81@gmail.com>"]
 edition = "2018"
 license = "MIT"

src/cli.rs

@@ -6,7 +6,7 @@ pub fn cli_init() -> App<'static, 'static> {
         AppSettings::ArgRequiredElseHelp,
         AppSettings::UnifiedHelpMessage,
     ])
-    .version("0.1.0-alpha1")
+    .version("0.2.1-alpha1")
     .about(
         "
 Paperoni is an article downloader.

src/extractor.rs

@@ -11,7 +11,7 @@ use crate::moz_readability::{MetaData, Readability};
 pub type ResourceInfo = (String, Option<String>);
 lazy_static! {
-    static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r"(&|<|>)").unwrap();
+    static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r#"(&|<|>|'|")"#).unwrap();
 }
 pub struct Extractor {
@@ -56,7 +56,7 @@ impl Extractor {
         for img_ref in content_ref.select("img").unwrap() {
             img_ref.as_node().as_element().map(|img_elem| {
                 img_elem.attributes.borrow().get("src").map(|img_url| {
-                    if !img_url.is_empty() {
+                    if !(img_url.is_empty() || img_url.starts_with("data:image")) {
                         self.img_urls.push((img_url.to_string(), None))
                     }
                 })
@@ -75,7 +75,9 @@ impl Extractor {
             async_download_tasks.push(task::spawn(async move {
                 let mut img_response = surf::Client::new()
-                    .with(surf::middleware::Redirect::default())
+                    // The middleware has been temporarily commented out because it happens
+                    // to affect downloading images when there is no redirecting
+                    // .with(surf::middleware::Redirect::default())
                     .get(&abs_url)
                     .await
                     .expect("Unable to retrieve file");
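
The hunk above removes surf's Redirect middleware outright because it broke image downloads even on responses that never redirected. A minimal sketch, not part of this commit, of how redirect support could instead stay available behind an opt-in flag; the fetch_image helper and follow_redirects parameter are hypothetical:

// Hypothetical helper sketch: keep redirect handling opt-in rather than
// deleting it. `follow_redirects` is an assumed parameter, not in the commit.
async fn fetch_image(abs_url: &str, follow_redirects: bool) -> surf::Result<surf::Response> {
    let mut client = surf::Client::new();
    if follow_redirects {
        // surf's Redirect middleware transparently follows 3xx responses.
        client = client.with(surf::middleware::Redirect::default());
    }
    client.get(abs_url).await
}
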
@@ -185,6 +187,8 @@ pub fn serialize_to_xhtml<W: std::io::Write>(
     escape_map.insert("<", "&lt;");
     escape_map.insert(">", "&gt;");
     escape_map.insert("&", "&amp;");
+    escape_map.insert("\"", "&quot;");
+    escape_map.insert("'", "&apos;");
     for edge in node_ref.traverse_inclusive() {
         match edge {
             kuchiki::iter::NodeEdge::Start(n) => match n.data() {
@@ -248,6 +252,7 @@ mod test {
         <p>Some Lorem Ipsum text here</p>
         <p>Observe this picture</p>
         <img src="./img.jpg" alt="Random image">
+        <img src="data:image/png;base64,lJGWEIUQOIQWIDYVIVEDYFOUYQFWD">
     </article>
     <footer>
         <p>Made in HTML</p>
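
Taken together, the extractor changes skip inline data-URI image sources before the download stage and widen the escape set to cover both quote characters. A small self-contained sketch, assuming the regex crate and not taken from this commit, that exercises both rules:

fn main() {
    // The new src filter: reject empty sources and inline data URIs.
    let srcs = ["./img.jpg", "", "data:image/png;base64,AAAA"];
    let kept: Vec<&str> = srcs
        .iter()
        .copied()
        .filter(|src| !(src.is_empty() || src.starts_with("data:image")))
        .collect();
    assert_eq!(kept, ["./img.jpg"]);

    // The widened ESC_SEQ_REGEX, applied the way the serializer's escape map is.
    let esc_seq = regex::Regex::new(r#"(&|<|>|'|")"#).unwrap();
    let escaped = esc_seq.replace_all(r#"Tom's "pick" & <code>"#, |caps: &regex::Captures| {
        match &caps[0] {
            "&" => "&amp;",
            "<" => "&lt;",
            ">" => "&gt;",
            "\"" => "&quot;",
            "'" => "&apos;",
            _ => unreachable!(),
        }
        .to_string()
    });
    assert_eq!(escaped, "Tom&apos;s &quot;pick&quot; &amp; &lt;code&gt;");
}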

src/main.rs

@@ -55,7 +55,14 @@ fn download(urls: Vec<String>) {
             .download_images(&Url::parse(&url).unwrap())
             .await
             .expect("Unable to download images");
-        let file_name = format!("{}.epub", extractor.metadata().title());
+        let file_name = format!(
+            "{}.epub",
+            extractor
+                .metadata()
+                .title()
+                .replace("/", " ")
+                .replace("\\", " ")
+        );
         let mut out_file = File::create(&file_name).unwrap();
         let mut html_buf = Vec::new();
         extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf)
@@ -68,7 +75,7 @@ fn download(urls: Vec<String>) {
         }
         epub.metadata("title", extractor.metadata().title().replace("&", "&amp;"))
             .unwrap();
-        epub.add_content(EpubContent::new("code.xhtml", html_buf.as_bytes()))
+        epub.add_content(EpubContent::new("index.xhtml", html_buf.as_bytes()))
             .unwrap();
         for img in extractor.img_urls {
             let mut file_path = std::env::temp_dir();
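
The file-name fix above replaces path separators inline; the same logic written as a standalone helper, as a sketch only (make_epub_file_name is a hypothetical name, not in the commit):

// '/' is invalid in file names on Unix and '\' on Windows, so both are
// replaced with spaces before the .epub extension is appended.
fn make_epub_file_name(title: &str) -> String {
    format!("{}.epub", title.replace("/", " ").replace("\\", " "))
}

fn main() {
    assert_eq!(make_epub_file_name("A/B article"), "A B article.epub");
}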