Bug fixes
- Prevent downloading images that use base64 data URIs as their source
- Add escaping of quotation characters in the serializer
- Disable following redirects when downloading images, since it fails on multiple sites
- Remove invalid characters when building the epub export file name
- Fix the version number in the release
This commit is contained in:
parent
3bfa82ba60
commit
8407c613df
5 changed files with 20 additions and 8 deletions
2
Cargo.lock
generated
2
Cargo.lock
generated
|
@ -1242,7 +1242,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "paperoni"
|
name = "paperoni"
|
||||||
version = "0.2.0-alpha1"
|
version = "0.2.1-alpha1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-std",
|
"async-std",
|
||||||
"clap",
|
"clap",
|
||||||
|
|
|
@ -3,7 +3,7 @@ description = "A web article downloader"
|
||||||
homepage = "https://github.com/hipstermojo/paperoni"
|
homepage = "https://github.com/hipstermojo/paperoni"
|
||||||
repository = "https://github.com/hipstermojo/paperoni"
|
repository = "https://github.com/hipstermojo/paperoni"
|
||||||
name = "paperoni"
|
name = "paperoni"
|
||||||
version = "0.2.0-alpha1"
|
version = "0.2.1-alpha1"
|
||||||
authors = ["Kenneth Gitere <gitere81@gmail.com>"]
|
authors = ["Kenneth Gitere <gitere81@gmail.com>"]
|
||||||
edition = "2018"
|
edition = "2018"
|
||||||
license = "MIT"
|
license = "MIT"
|
||||||
|
|
|
@ -6,7 +6,7 @@ pub fn cli_init() -> App<'static, 'static> {
|
||||||
AppSettings::ArgRequiredElseHelp,
|
AppSettings::ArgRequiredElseHelp,
|
||||||
AppSettings::UnifiedHelpMessage,
|
AppSettings::UnifiedHelpMessage,
|
||||||
])
|
])
|
||||||
.version("0.1.0-alpha1")
|
.version("0.2.1-alpha1")
|
||||||
.about(
|
.about(
|
||||||
"
|
"
|
||||||
Paperoni is an article downloader.
|
Paperoni is an article downloader.
|
||||||
|
|
|
@ -11,7 +11,7 @@ use crate::moz_readability::{MetaData, Readability};
|
||||||
pub type ResourceInfo = (String, Option<String>);
|
pub type ResourceInfo = (String, Option<String>);
|
||||||
|
|
||||||
lazy_static! {
|
lazy_static! {
|
||||||
static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r"(&|<|>)").unwrap();
|
static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r#"(&|<|>|'|")"#).unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct Extractor {
|
pub struct Extractor {
|
||||||
|
@ -56,7 +56,7 @@ impl Extractor {
|
||||||
for img_ref in content_ref.select("img").unwrap() {
|
for img_ref in content_ref.select("img").unwrap() {
|
||||||
img_ref.as_node().as_element().map(|img_elem| {
|
img_ref.as_node().as_element().map(|img_elem| {
|
||||||
img_elem.attributes.borrow().get("src").map(|img_url| {
|
img_elem.attributes.borrow().get("src").map(|img_url| {
|
||||||
if !img_url.is_empty() {
|
if !(img_url.is_empty() || img_url.starts_with("data:image")) {
|
||||||
self.img_urls.push((img_url.to_string(), None))
|
self.img_urls.push((img_url.to_string(), None))
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
@ -75,7 +75,9 @@ impl Extractor {
|
||||||
|
|
||||||
async_download_tasks.push(task::spawn(async move {
|
async_download_tasks.push(task::spawn(async move {
|
||||||
let mut img_response = surf::Client::new()
|
let mut img_response = surf::Client::new()
|
||||||
.with(surf::middleware::Redirect::default())
|
// The middleware has been temporarily commented out because it happens
|
||||||
|
// to affect downloading images when there is no redirecting
|
||||||
|
// .with(surf::middleware::Redirect::default())
|
||||||
.get(&abs_url)
|
.get(&abs_url)
|
||||||
.await
|
.await
|
||||||
.expect("Unable to retrieve file");
|
.expect("Unable to retrieve file");
|
||||||
|
@ -185,6 +187,8 @@ pub fn serialize_to_xhtml<W: std::io::Write>(
|
||||||
escape_map.insert("<", "&lt;");
|
escape_map.insert("<", "&lt;");
|
||||||
escape_map.insert(">", "&gt;");
|
escape_map.insert(">", "&gt;");
|
||||||
escape_map.insert("&", "&amp;");
|
escape_map.insert("&", "&amp;");
|
||||||
|
escape_map.insert("\"", "&quot;");
|
||||||
|
escape_map.insert("'", "&apos;");
|
||||||
for edge in node_ref.traverse_inclusive() {
|
for edge in node_ref.traverse_inclusive() {
|
||||||
match edge {
|
match edge {
|
||||||
kuchiki::iter::NodeEdge::Start(n) => match n.data() {
|
kuchiki::iter::NodeEdge::Start(n) => match n.data() {
|
||||||
|
@ -248,6 +252,7 @@ mod test {
|
||||||
<p>Some Lorem Ipsum text here</p>
|
<p>Some Lorem Ipsum text here</p>
|
||||||
<p>Observe this picture</p>
|
<p>Observe this picture</p>
|
||||||
<img src="./img.jpg" alt="Random image">
|
<img src="./img.jpg" alt="Random image">
|
||||||
|
<img src="data:image/png;base64,lJGWEIUQOIQWIDYVIVEDYFOUYQFWD">
|
||||||
</article>
|
</article>
|
||||||
<footer>
|
<footer>
|
||||||
<p>Made in HTML</p>
|
<p>Made in HTML</p>
|
||||||
|
|
11
src/main.rs
11
src/main.rs
|
@ -55,7 +55,14 @@ fn download(urls: Vec<String>) {
|
||||||
.download_images(&Url::parse(&url).unwrap())
|
.download_images(&Url::parse(&url).unwrap())
|
||||||
.await
|
.await
|
||||||
.expect("Unable to download images");
|
.expect("Unable to download images");
|
||||||
let file_name = format!("{}.epub", extractor.metadata().title());
|
let file_name = format!(
|
||||||
|
"{}.epub",
|
||||||
|
extractor
|
||||||
|
.metadata()
|
||||||
|
.title()
|
||||||
|
.replace("/", " ")
|
||||||
|
.replace("\\", " ")
|
||||||
|
);
|
||||||
let mut out_file = File::create(&file_name).unwrap();
|
let mut out_file = File::create(&file_name).unwrap();
|
||||||
let mut html_buf = Vec::new();
|
let mut html_buf = Vec::new();
|
||||||
extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf)
|
extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf)
|
||||||
|
@ -68,7 +75,7 @@ fn download(urls: Vec<String>) {
|
||||||
}
|
}
|
||||||
epub.metadata("title", extractor.metadata().title().replace("&", "&amp;"))
|
epub.metadata("title", extractor.metadata().title().replace("&", "&amp;"))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
epub.add_content(EpubContent::new("code.xhtml", html_buf.as_bytes()))
|
epub.add_content(EpubContent::new("index.xhtml", html_buf.as_bytes()))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
for img in extractor.img_urls {
|
for img in extractor.img_urls {
|
||||||
let mut file_path = std::env::temp_dir();
|
let mut file_path = std::env::temp_dir();
|
||||||
|
|
Reference in a new issue