Merge pull request #4 from hipstermojo/dev

Update to 0.2.1-alpha1
commit ca1f9e2800
Kenneth Gitere, 2020-12-24 14:11:42 +03:00, committed by GitHub
GPG key ID: 4AEE18F83AFDEB23 (no known key found for this signature in the database)
5 changed files with 20 additions and 8 deletions

Cargo.lock (generated, 2 lines changed)

@@ -1242,7 +1242,7 @@ dependencies = [

 [[package]]
 name = "paperoni"
-version = "0.2.0-alpha1"
+version = "0.2.1-alpha1"
 dependencies = [
  "async-std",
  "clap",

Cargo.toml

@@ -3,7 +3,7 @@ description = "A web article downloader"
 homepage = "https://github.com/hipstermojo/paperoni"
 repository = "https://github.com/hipstermojo/paperoni"
 name = "paperoni"
-version = "0.2.0-alpha1"
+version = "0.2.1-alpha1"
 authors = ["Kenneth Gitere <gitere81@gmail.com>"]
 edition = "2018"
 license = "MIT"

src/cli.rs

@@ -6,7 +6,7 @@ pub fn cli_init() -> App<'static, 'static> {
         AppSettings::ArgRequiredElseHelp,
         AppSettings::UnifiedHelpMessage,
     ])
-    .version("0.1.0-alpha1")
+    .version("0.2.1-alpha1")
    .about(
        "
Paperoni is an article downloader.
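
Note: the banner had drifted a full minor version behind the crate (it still said 0.1.0-alpha1). Assuming clap 2.x as used here, the crate_version! macro rules this class of bug out by reading the version from Cargo.toml at compile time; a minimal sketch, not the committed code:

    use clap::{crate_version, App, AppSettings};

    // Sketch only: crate_version!() expands to the version in Cargo.toml,
    // so the CLI banner can no longer drift from the published crate.
    pub fn cli_init() -> App<'static, 'static> {
        App::new("paperoni")
            .settings(&[
                AppSettings::ArgRequiredElseHelp,
                AppSettings::UnifiedHelpMessage,
            ])
            .version(crate_version!())
    }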

src/extractor.rs

@@ -11,7 +11,7 @@ use crate::moz_readability::{MetaData, Readability};
 pub type ResourceInfo = (String, Option<String>);

 lazy_static! {
-    static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r"(&|<|>)").unwrap();
+    static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r#"(&|<|>|'|")"#).unwrap();
 }

 pub struct Extractor {
@@ -56,7 +56,7 @@ impl Extractor {
         for img_ref in content_ref.select("img").unwrap() {
             img_ref.as_node().as_element().map(|img_elem| {
                 img_elem.attributes.borrow().get("src").map(|img_url| {
-                    if !img_url.is_empty() {
+                    if !(img_url.is_empty() || img_url.starts_with("data:image")) {
                         self.img_urls.push((img_url.to_string(), None))
                     }
                 })
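
The widened guard now skips data: URIs as well as empty src attributes: a data: URI carries its bytes inline, so there is nothing to fetch over HTTP. A standalone sketch of the predicate (is_downloadable is our illustrative name, not paperoni's):

    /// Sketch: only sources that point at a remote resource should be
    /// queued for download; inline data: images are left in the document.
    fn is_downloadable(src: &str) -> bool {
        !(src.is_empty() || src.starts_with("data:image"))
    }

    #[test]
    fn skips_inline_and_empty_sources() {
        assert!(is_downloadable("https://example.com/img.jpg"));
        assert!(!is_downloadable(""));
        assert!(!is_downloadable("data:image/png;base64,iVBORw0KGgo="));
    }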
@@ -75,7 +75,9 @@
             async_download_tasks.push(task::spawn(async move {
                 let mut img_response = surf::Client::new()
-                    .with(surf::middleware::Redirect::default())
+                    // The middleware has been temporarily commented out because it happens
+                    // to affect downloading images when there is no redirecting
+                    // .with(surf::middleware::Redirect::default())
                     .get(&abs_url)
                     .await
                     .expect("Unable to retrieve file");
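
With the Redirect middleware disabled, each image is fetched with a bare surf client, so a 3xx response now surfaces to the caller instead of being followed. A minimal sketch of the resulting request path (fetch_image is a hypothetical helper, not paperoni's code):

    // Sketch: plain GET without redirect-following middleware.
    async fn fetch_image(abs_url: &str) -> Result<Vec<u8>, surf::Error> {
        let mut response = surf::Client::new().get(abs_url).await?;
        response.body_bytes().await
    }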
@@ -185,6 +187,8 @@ pub fn serialize_to_xhtml<W: std::io::Write>(
     escape_map.insert("<", "&lt;");
     escape_map.insert(">", "&gt;");
     escape_map.insert("&", "&amp;");
+    escape_map.insert("\"", "&quot;");
+    escape_map.insert("'", "&apos;");
     for edge in node_ref.traverse_inclusive() {
         match edge {
             kuchiki::iter::NodeEdge::Start(n) => match n.data() {
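
Together with the widened ESC_SEQ_REGEX above, this maps all five XML-reserved characters to entities, which matters once quotes can appear inside serialized attribute values. A self-contained sketch of the combined behavior (escape_xhtml is our illustrative name; the real code escapes during tree traversal):

    use regex::Regex;

    // Sketch: a single pass over the text, so entities the replacement
    // produces (e.g. "&amp;") are never escaped a second time.
    fn escape_xhtml(text: &str) -> String {
        let esc = Regex::new(r#"(&|<|>|'|")"#).unwrap();
        esc.replace_all(text, |caps: &regex::Captures| {
            match &caps[1] {
                "&" => "&amp;",
                "<" => "&lt;",
                ">" => "&gt;",
                "\"" => "&quot;",
                "'" => "&apos;",
                _ => unreachable!(),
            }
            .to_string()
        })
        .into_owned()
    }

    // escape_xhtml(r#"A "random" <img>"#) == "A &quot;random&quot; &lt;img&gt;"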
@@ -248,6 +252,7 @@ mod test {
                 <p>Some Lorem Ipsum text here</p>
                 <p>Observe this picture</p>
                 <img src="./img.jpg" alt="Random image">
+                <img src="data:image/png;base64,lJGWEIUQOIQWIDYVIVEDYFOUYQFWD">
             </article>
             <footer>
                 <p>Made in HTML</p>

src/main.rs

@@ -55,7 +55,14 @@ fn download(urls: Vec<String>) {
             .download_images(&Url::parse(&url).unwrap())
             .await
             .expect("Unable to download images");
-        let file_name = format!("{}.epub", extractor.metadata().title());
+        let file_name = format!(
+            "{}.epub",
+            extractor
+                .metadata()
+                .title()
+                .replace("/", " ")
+                .replace("\\", " ")
+        );
         let mut out_file = File::create(&file_name).unwrap();
         let mut html_buf = Vec::new();
         extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf)
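
Sanitizing the title matters because File::create treats / (and, on Windows, \) as a path separator: an article titled "TCP/IP Basics" would otherwise try to create "IP Basics.epub" inside a nonexistent "TCP" directory and fail. A sketch of the equivalent helper (epub_file_name is our name for illustration):

    // Sketch: replace path separators so the title yields a single,
    // valid file name on both Unix and Windows.
    fn epub_file_name(title: &str) -> String {
        format!("{}.epub", title.replace("/", " ").replace("\\", " "))
    }

    // epub_file_name("TCP/IP Basics") == "TCP IP Basics.epub"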
@@ -68,7 +75,7 @@ fn download(urls: Vec<String>) {
         }
         epub.metadata("title", extractor.metadata().title().replace("&", "&amp;"))
             .unwrap();
-        epub.add_content(EpubContent::new("code.xhtml", html_buf.as_bytes()))
+        epub.add_content(EpubContent::new("index.xhtml", html_buf.as_bytes()))
             .unwrap();
         for img in extractor.img_urls {
             let mut file_path = std::env::temp_dir();
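
Renaming the chapter from code.xhtml to index.xhtml gives the book's single content file a conventional entry-point name. A condensed sketch of the epub-builder calls involved, assuming the crate's ZipLibrary backend as used above:

    use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};

    // Sketch: assemble a one-chapter EPUB from already-serialized XHTML.
    fn build_epub(title: &str, html_buf: &[u8]) -> Result<Vec<u8>, epub_builder::Error> {
        let mut out = Vec::new();
        let mut epub = EpubBuilder::new(ZipLibrary::new()?)?;
        epub.metadata("title", title)?;
        epub.add_content(EpubContent::new("index.xhtml", html_buf))?;
        epub.generate(&mut out)?;
        Ok(out)
    }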