Bug fixes
- Prevent downloading images that use base64 strings as the source
- Add escaping of quotation characters in the serializer
- Disable redirects when downloading images, since following them fails on multiple sites
- Remove invalid characters when making the epub export file name
- Fix version number in release
This commit is contained in:
parent
3bfa82ba60
commit
8407c613df
5 changed files with 20 additions and 8 deletions
2
Cargo.lock
generated
2
Cargo.lock
generated
|
@ -1242,7 +1242,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "paperoni"
|
||||
version = "0.2.0-alpha1"
|
||||
version = "0.2.1-alpha1"
|
||||
dependencies = [
|
||||
"async-std",
|
||||
"clap",
|
||||
|
|
|
@ -3,7 +3,7 @@ description = "A web article downloader"
|
|||
homepage = "https://github.com/hipstermojo/paperoni"
|
||||
repository = "https://github.com/hipstermojo/paperoni"
|
||||
name = "paperoni"
|
||||
version = "0.2.0-alpha1"
|
||||
version = "0.2.1-alpha1"
|
||||
authors = ["Kenneth Gitere <gitere81@gmail.com>"]
|
||||
edition = "2018"
|
||||
license = "MIT"
|
||||
|
|
|
@ -6,7 +6,7 @@ pub fn cli_init() -> App<'static, 'static> {
|
|||
AppSettings::ArgRequiredElseHelp,
|
||||
AppSettings::UnifiedHelpMessage,
|
||||
])
|
||||
.version("0.1.0-alpha1")
|
||||
.version("0.2.1-alpha1")
|
||||
.about(
|
||||
"
|
||||
Paperoni is an article downloader.
|
||||
|
|
|
@ -11,7 +11,7 @@ use crate::moz_readability::{MetaData, Readability};
|
|||
pub type ResourceInfo = (String, Option<String>);
|
||||
|
||||
lazy_static! {
|
||||
static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r"(&|<|>)").unwrap();
|
||||
static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r#"(&|<|>|'|")"#).unwrap();
|
||||
}
|
||||
|
||||
pub struct Extractor {
|
||||
|
@ -56,7 +56,7 @@ impl Extractor {
|
|||
for img_ref in content_ref.select("img").unwrap() {
|
||||
img_ref.as_node().as_element().map(|img_elem| {
|
||||
img_elem.attributes.borrow().get("src").map(|img_url| {
|
||||
if !img_url.is_empty() {
|
||||
if !(img_url.is_empty() || img_url.starts_with("data:image")) {
|
||||
self.img_urls.push((img_url.to_string(), None))
|
||||
}
|
||||
})
|
||||
|
@ -75,7 +75,9 @@ impl Extractor {
|
|||
|
||||
async_download_tasks.push(task::spawn(async move {
|
||||
let mut img_response = surf::Client::new()
|
||||
.with(surf::middleware::Redirect::default())
|
||||
// The middleware has been temporarily commented out because it happens
|
||||
// to affect downloading images when there is no redirecting
|
||||
// .with(surf::middleware::Redirect::default())
|
||||
.get(&abs_url)
|
||||
.await
|
||||
.expect("Unable to retrieve file");
|
||||
|
@ -185,6 +187,8 @@ pub fn serialize_to_xhtml<W: std::io::Write>(
|
|||
escape_map.insert("<", "&lt;");
|
||||
escape_map.insert(">", "&gt;");
|
||||
escape_map.insert("&", "&amp;");
|
||||
escape_map.insert("\"", "&quot;");
|
||||
escape_map.insert("'", "&apos;");
|
||||
for edge in node_ref.traverse_inclusive() {
|
||||
match edge {
|
||||
kuchiki::iter::NodeEdge::Start(n) => match n.data() {
|
||||
|
@ -248,6 +252,7 @@ mod test {
|
|||
<p>Some Lorem Ipsum text here</p>
|
||||
<p>Observe this picture</p>
|
||||
<img src="./img.jpg" alt="Random image">
|
||||
<img src="">
|
||||
</article>
|
||||
<footer>
|
||||
<p>Made in HTML</p>
|
||||
|
|
11
src/main.rs
11
src/main.rs
|
@ -55,7 +55,14 @@ fn download(urls: Vec<String>) {
|
|||
.download_images(&Url::parse(&url).unwrap())
|
||||
.await
|
||||
.expect("Unable to download images");
|
||||
let file_name = format!("{}.epub", extractor.metadata().title());
|
||||
let file_name = format!(
|
||||
"{}.epub",
|
||||
extractor
|
||||
.metadata()
|
||||
.title()
|
||||
.replace("/", " ")
|
||||
.replace("\\", " ")
|
||||
);
|
||||
let mut out_file = File::create(&file_name).unwrap();
|
||||
let mut html_buf = Vec::new();
|
||||
extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf)
|
||||
|
@ -68,7 +75,7 @@ fn download(urls: Vec<String>) {
|
|||
}
|
||||
epub.metadata("title", extractor.metadata().title().replace("&", "&amp;"))
|
||||
.unwrap();
|
||||
epub.add_content(EpubContent::new("code.xhtml", html_buf.as_bytes()))
|
||||
epub.add_content(EpubContent::new("index.xhtml", html_buf.as_bytes()))
|
||||
.unwrap();
|
||||
for img in extractor.img_urls {
|
||||
let mut file_path = std::env::temp_dir();
|
||||
|
|
Reference in a new issue