Merge pull request #6 from hipstermojo/dev

Update to 0.3.0
Kenneth Gitere 2021-02-24 13:13:36 +03:00 committed by GitHub
commit e9f96d2970
9 changed files with 520 additions and 283 deletions

Cargo.lock (generated, 104 changed lines)

@@ -187,7 +187,7 @@ dependencies = [
  "memchr",
  "num_cpus",
  "once_cell",
- "pin-project-lite",
+ "pin-project-lite 0.1.11",
  "pin-utils",
  "slab",
  "wasm-bindgen-futures",
@@ -684,25 +684,52 @@ dependencies = [
 ]
 
 [[package]]
-name = "futures-channel"
-version = "0.3.8"
+name = "futures"
+version = "0.3.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4b7109687aa4e177ef6fe84553af6280ef2778bdb7783ba44c9dc3399110fe64"
+checksum = "da9052a1a50244d8d5aa9bf55cbc2fb6f357c86cc52e46c62ed390a7180cf150"
+dependencies = [
+ "futures-channel",
+ "futures-core",
+ "futures-executor",
+ "futures-io",
+ "futures-sink",
+ "futures-task",
+ "futures-util",
+]
+
+[[package]]
+name = "futures-channel"
+version = "0.3.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f2d31b7ec7efab6eefc7c57233bb10b847986139d88cc2f5a02a1ae6871a1846"
 dependencies = [
  "futures-core",
+ "futures-sink",
 ]
 
 [[package]]
 name = "futures-core"
-version = "0.3.8"
+version = "0.3.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "847ce131b72ffb13b6109a221da9ad97a64cbe48feb1028356b836b47b8f1748"
+checksum = "79e5145dde8da7d1b3892dad07a9c98fc04bc39892b1ecc9692cf53e2b780a65"
+
+[[package]]
+name = "futures-executor"
+version = "0.3.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e9e59fdc009a4b3096bf94f740a0f2424c082521f20a9b08c5c07c48d90fd9b9"
+dependencies = [
+ "futures-core",
+ "futures-task",
+ "futures-util",
+]
 
 [[package]]
 name = "futures-io"
-version = "0.3.8"
+version = "0.3.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "611834ce18aaa1bd13c4b374f5d653e1027cf99b6b502584ff8c9a64413b30bb"
+checksum = "28be053525281ad8259d47e4de5de657b25e7bac113458555bb4b70bc6870500"
 
 [[package]]
 name = "futures-lite"
@@ -715,15 +742,15 @@ dependencies = [
  "futures-io",
  "memchr",
  "parking",
- "pin-project-lite",
+ "pin-project-lite 0.1.11",
  "waker-fn",
 ]
 
 [[package]]
 name = "futures-macro"
-version = "0.3.8"
+version = "0.3.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "77408a692f1f97bcc61dc001d752e00643408fbc922e4d634c655df50d595556"
+checksum = "c287d25add322d9f9abdcdc5927ca398917996600182178774032e9f8258fedd"
 dependencies = [
  "proc-macro-hack",
  "proc-macro2",
@@ -733,31 +760,33 @@ dependencies = [
 [[package]]
 name = "futures-sink"
-version = "0.3.8"
+version = "0.3.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f878195a49cee50e006b02b93cf7e0a95a38ac7b776b4c4d9cc1207cd20fcb3d"
+checksum = "caf5c69029bda2e743fddd0582d1083951d65cc9539aebf8812f36c3491342d6"
 
 [[package]]
 name = "futures-task"
-version = "0.3.8"
+version = "0.3.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7c554eb5bf48b2426c4771ab68c6b14468b6e76cc90996f528c3338d761a4d0d"
+checksum = "13de07eb8ea81ae445aca7b69f5f7bf15d7bf4912d8ca37d6645c77ae8a58d86"
 dependencies = [
  "once_cell",
 ]
 
 [[package]]
 name = "futures-util"
-version = "0.3.8"
+version = "0.3.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d304cff4a7b99cfb7986f7d43fbe93d175e72e704a8860787cc95e9ffd85cbd2"
+checksum = "632a8cd0f2a4b3fdea1657f08bde063848c3bd00f9bbf6e256b8be78802e624b"
 dependencies = [
+ "futures-channel",
  "futures-core",
  "futures-io",
  "futures-macro",
+ "futures-sink",
  "futures-task",
  "memchr",
- "pin-project 1.0.2",
+ "pin-project-lite 0.2.4",
  "pin-utils",
  "proc-macro-hack",
  "proc-macro-nested",
@@ -911,7 +940,7 @@ dependencies = [
  "cookie",
  "futures-lite",
  "infer",
- "pin-project-lite",
+ "pin-project-lite 0.1.11",
  "rand 0.7.3",
  "serde",
  "serde_json",
@@ -1242,11 +1271,12 @@ dependencies = [
 [[package]]
 name = "paperoni"
-version = "0.2.2-alpha1"
+version = "0.3.0-alpha1"
 dependencies = [
  "async-std",
  "clap",
  "epub-builder",
+ "futures",
  "html5ever",
  "kuchiki",
  "lazy_static",
@@ -1328,16 +1358,7 @@ version = "0.4.27"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2ffbc8e94b38ea3d2d8ba92aea2983b503cd75d0888d75b86bb37970b5698e15"
 dependencies = [
- "pin-project-internal 0.4.27",
-]
-
-[[package]]
-name = "pin-project"
-version = "1.0.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9ccc2237c2c489783abd8c4c80e5450fc0e98644555b1364da68cc29aa151ca7"
-dependencies = [
- "pin-project-internal 1.0.2",
+ "pin-project-internal",
 ]
 
 [[package]]
@@ -1351,23 +1372,18 @@ dependencies = [
  "syn",
 ]
 
-[[package]]
-name = "pin-project-internal"
-version = "1.0.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f8e8d2bf0b23038a4424865103a4df472855692821aab4e4f5c3312d461d9e5f"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn",
-]
-
 [[package]]
 name = "pin-project-lite"
 version = "0.1.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c917123afa01924fc84bb20c4c03f004d9c38e5127e3c039bbf7f4b9c76a2f6b"
 
+[[package]]
+name = "pin-project-lite"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "439697af366c49a6d0a010c56a0d97685bc140ce0d377b13a2ea2aa42d64a827"
+
 [[package]]
 name = "pin-utils"
 version = "0.1.0"
@@ -1889,7 +1905,7 @@ dependencies = [
  "log 0.4.11",
  "mime_guess",
  "once_cell",
- "pin-project-lite",
+ "pin-project-lite 0.1.11",
  "serde",
  "serde_json",
  "web-sys",
@@ -2043,7 +2059,7 @@ checksum = "b0987850db3733619253fe60e17cb59b82d37c7e6c0236bb81e4d6b87c879f27"
 dependencies = [
  "cfg-if 0.1.10",
  "log 0.4.11",
- "pin-project-lite",
+ "pin-project-lite 0.1.11",
  "tracing-attributes",
  "tracing-core",
 ]
@@ -2074,7 +2090,7 @@ version = "0.2.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ab7bb6f14721aa00656086e9335d363c5c8747bae02ebe32ea2c7dece5689b4c"
 dependencies = [
- "pin-project 0.4.27",
+ "pin-project",
  "tracing",
 ]

Cargo.toml

@@ -3,7 +3,7 @@ description = "A web article downloader"
 homepage = "https://github.com/hipstermojo/paperoni"
 repository = "https://github.com/hipstermojo/paperoni"
 name = "paperoni"
-version = "0.2.2-alpha1"
+version = "0.3.0-alpha1"
 authors = ["Kenneth Gitere <gitere81@gmail.com>"]
 edition = "2018"
 license = "MIT"
@@ -15,6 +15,7 @@ readme = "README.md"
 async-std = "1.7.0"
 clap = "2.33.3"
 epub-builder = "0.4.8"
+futures = "0.3.12"
 html5ever = "0.25.1"
 kuchiki = "0.8.1"
 lazy_static = "1.4.0"

README.md

@@ -12,12 +12,24 @@ Paperoni is a web article downloader written in Rust. The downloaded articles ar
 paperoni https://en.wikipedia.org/wiki/Pepperoni
 ```
 
-Paperoni also supports passing multiple links as arguments. If you are on a Unix-like OS, you can simply do something like this:
+Paperoni also supports passing multiple links as arguments.
+
+```sh
+paperoni https://en.wikipedia.org/wiki/Pepperoni https://en.wikipedia.org/wiki/Salami
+```
+
+Alternatively, if you are on a Unix-like OS, you can simply do something like this:
 
 ```sh
 cat links.txt | xargs paperoni
 ```
 
+These can also be read from a file using the `-f` flag.
+
+```sh
+paperoni -f links.txt
+```
+
 ## How it works
 
 The URL passed to Paperoni is fetched and the returned HTML response is passed to the extractor.
@@ -27,11 +39,11 @@ This extractor retrieves a possible article using a port of the [Mozilla Readabi
 
 ## How it (currently) doesn't work
 
-This program is still in alpha so a number of things currently break:
-- Certain links with redirects can't be extracted. Such links include urls that are proxying Medium.
+This program is still in alpha so a number of things won't work:
 - Websites that only run with JavaScript cannot be extracted.
 - Website articles that cannot be extracted by Readability cannot be extracted by Paperoni either.
+- Code snippets on Medium articles that are lazy loaded will not appear in the EPUB.
 
 ## Running locally
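The new CLI flags introduced by this release (`-f`, `--merge`, and `--max_conn`, defined in src/cli.rs below) compose naturally. A plausible invocation, with `collection` as a hypothetical output name:

```sh
# Read links from links.txt, fetch at most 20 articles concurrently,
# and merge the results into a single collection.epub
paperoni -f links.txt --merge collection --max_conn 20
```

Per the cli.rs code, `--merge` appends the `.epub` extension when it is missing, so `collection` and `collection.epub` name the same output file.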

src/cli.rs

@@ -1,12 +1,14 @@
+use std::{fs::File, io::Read};
+
 use clap::{App, AppSettings, Arg};
 
-pub fn cli_init() -> App<'static, 'static> {
-    App::new("paperoni")
+pub fn cli_init() -> AppConfig {
+    let app = App::new("paperoni")
         .settings(&[
             AppSettings::ArgRequiredElseHelp,
             AppSettings::UnifiedHelpMessage,
         ])
-        .version("0.2.2-alpha1")
+        .version("0.3.0-alpha1")
         .about(
             "
 Paperoni is an article downloader.
@@ -18,4 +20,104 @@ It takes a url and downloads the article content from it and saves it to an epub
                 .help("Urls of web articles")
                 .multiple(true),
         )
+        .arg(
+            Arg::with_name("file")
+                .short("f")
+                .long("file")
+                .help("Input file containing links")
+                .takes_value(true),
+        )
+        .arg(
+            Arg::with_name("output_name")
+                .long("merge")
+                .help("Merge multiple articles into a single epub")
+                .long_help("Merge multiple articles into a single epub that will be given the name provided")
+                .takes_value(true),
+        ).arg(
+            Arg::with_name("max_conn")
+                .long("max_conn")
+                .help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8")
+                .long_help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8.\nNOTE: It is advised to use as few connections as needed i.e between 1 and 50. Using more connections can end up overloading your network card with too many concurrent requests.")
+                .takes_value(true));
+
+    let arg_matches = app.get_matches();
+    let mut urls: Vec<String> = match arg_matches.value_of("file") {
+        Some(file_name) => {
+            if let Ok(mut file) = File::open(file_name) {
+                let mut content = String::new();
+                match file.read_to_string(&mut content) {
+                    Ok(_) => content
+                        .lines()
+                        .filter(|line| !line.is_empty())
+                        .map(|line| line.to_owned())
+                        .collect(),
+                    Err(_) => vec![],
+                }
+            } else {
+                println!("Unable to open file: {}", file_name);
+                vec![]
+            }
+        }
+        None => vec![],
+    };
+    if let Some(vals) = arg_matches.values_of("urls") {
+        urls.extend(
+            vals.filter(|val| !val.is_empty())
+                .map(|val| val.to_string()),
+        );
+    }
+
+    let max_conn = arg_matches
+        .value_of("max_conn")
+        .map(|conn_str| conn_str.parse::<usize>().ok())
+        .flatten()
+        .map(|max| if max > 0 { max } else { 1 })
+        .unwrap_or(8);
+
+    let mut app_config = AppConfig::new(max_conn);
+    app_config.set_urls(urls);
+
+    if let Some(name) = arg_matches.value_of("output_name") {
+        let file_name = if name.ends_with(".epub") && name.len() > 5 {
+            name.to_owned()
+        } else {
+            name.to_owned() + ".epub"
+        };
+        app_config.set_merged(file_name);
+    }
+    app_config
+}
+
+pub struct AppConfig {
+    urls: Vec<String>,
+    max_conn: usize,
+    merged: Option<String>,
+}
+
+impl AppConfig {
+    fn new(max_conn: usize) -> Self {
+        Self {
+            urls: vec![],
+            max_conn,
+            merged: None,
+        }
+    }
+
+    fn set_urls(&mut self, urls: Vec<String>) {
+        self.urls.extend(urls);
+    }
+
+    fn set_merged(&mut self, name: String) {
+        self.merged = Some(name);
+    }
+
+    pub fn urls(&self) -> &Vec<String> {
+        &self.urls
+    }
+
+    pub fn max_conn(&self) -> usize {
+        self.max_conn
+    }
+
+    pub fn merged(&self) -> Option<&String> {
+        self.merged.as_ref()
+    }
 }
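The `max_conn` handling above chains `.map(...).flatten()`, which is equivalent to `.and_then(...)`: any absent, unparsable, or zero value quietly degrades to a safe default. A standalone sketch of that fallback logic (the function name here is illustrative, not part of the codebase):

```rust
/// Mirrors the max_conn parsing in cli_init: default to 8 connections,
/// clamp zero up to 1, and ignore values that fail to parse.
fn parse_max_conn(raw: Option<&str>) -> usize {
    raw.and_then(|s| s.parse::<usize>().ok())
        .map(|n| if n > 0 { n } else { 1 })
        .unwrap_or(8)
}

fn main() {
    assert_eq!(parse_max_conn(None), 8); // flag not given
    assert_eq!(parse_max_conn(Some("abc")), 8); // unparsable input
    assert_eq!(parse_max_conn(Some("0")), 1); // clamped to at least 1
    assert_eq!(parse_max_conn(Some("20")), 20);
}
```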

src/epub.rs (new file, 113 lines)

use std::fs::File;

use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};

use crate::extractor::{self, Extractor};

pub fn generate_epubs(articles: Vec<Extractor>, merged: Option<&String>) {
    match merged {
        Some(name) => {
            let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
            epub.inline_toc();
            epub = articles
                .iter()
                .enumerate()
                .fold(epub, |mut epub, (idx, article)| {
                    let mut html_buf = Vec::new();
                    extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)
                        .expect("Unable to serialize to xhtml");
                    let html_str = std::str::from_utf8(&html_buf).unwrap();
                    epub.metadata("title", replace_metadata_value(name))
                        .unwrap();
                    let section_name = article.metadata().title();
                    epub.add_content(
                        EpubContent::new(format!("article_{}.xhtml", idx), html_str.as_bytes())
                            .title(replace_metadata_value(section_name)),
                    )
                    .unwrap();
                    article.img_urls.iter().for_each(|img| {
                        let mut file_path = std::env::temp_dir();
                        file_path.push(&img.0);
                        let img_buf = File::open(&file_path).expect("Can't read file");
                        epub.add_resource(
                            file_path.file_name().unwrap(),
                            img_buf,
                            img.1.as_ref().unwrap(),
                        )
                        .unwrap();
                    });
                    epub
                });
            let mut out_file = File::create(&name).unwrap();
            epub.generate(&mut out_file).unwrap();
            println!("Created {:?}", name);
        }
        None => {
            for article in articles {
                let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
                let file_name = format!(
                    "{}.epub",
                    article
                        .metadata()
                        .title()
                        .replace("/", " ")
                        .replace("\\", " ")
                );
                let mut out_file = File::create(&file_name).unwrap();
                let mut html_buf = Vec::new();
                extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)
                    .expect("Unable to serialize to xhtml");
                let html_str = std::str::from_utf8(&html_buf).unwrap();
                if let Some(author) = article.metadata().byline() {
                    epub.metadata("author", replace_metadata_value(author))
                        .unwrap();
                }
                epub.metadata("title", replace_metadata_value(article.metadata().title()))
                    .unwrap();
                epub.add_content(EpubContent::new("index.xhtml", html_str.as_bytes()))
                    .unwrap();
                for img in article.img_urls {
                    let mut file_path = std::env::temp_dir();
                    file_path.push(&img.0);
                    let img_buf = File::open(&file_path).expect("Can't read file");
                    epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap())
                        .unwrap();
                }
                epub.generate(&mut out_file).unwrap();
                println!("Created {:?}", file_name);
            }
        }
    }
}

/// Replaces characters that have to be escaped before adding to the epub's metadata
fn replace_metadata_value(value: &str) -> String {
    value
        .replace("&", "&amp;")
        .replace("<", "&lt;")
        .replace(">", "&gt;")
}

#[cfg(test)]
mod test {
    use super::replace_metadata_value;

    #[test]
    fn test_replace_metadata_value() {
        let mut value = "Lorem ipsum";
        assert_eq!(replace_metadata_value(value), "Lorem ipsum");
        value = "Memory safe > memory unsafe";
        assert_eq!(
            replace_metadata_value(value),
            "Memory safe &gt; memory unsafe"
        );
        value = "Author Name <author@mail.example>";
        assert_eq!(
            replace_metadata_value(value),
            "Author Name &lt;author@mail.example&gt;"
        );
    }
}
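One subtlety in `replace_metadata_value` above: the ampersand must be escaped first. A small counter-example (a hypothetical function, for illustration only) shows what goes wrong with the opposite order:

```rust
// Escaping '&' last corrupts the entities produced by the earlier replacements.
fn escape_in_wrong_order(value: &str) -> String {
    value
        .replace("<", "&lt;")
        .replace(">", "&gt;")
        .replace("&", "&amp;") // also rewrites the '&' inside "&lt;" and "&gt;"
}

fn main() {
    assert_eq!(escape_in_wrong_order("<b>"), "&amp;lt;b&amp;gt;"); // double-escaped
    // The committed order yields the intended "&lt;b&gt;".
}
```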

src/extractor.rs

@@ -1,10 +1,6 @@
 use std::collections::HashMap;
 
-use async_std::fs::File;
-use async_std::io::prelude::*;
-use async_std::task;
 use kuchiki::{traits::*, NodeRef};
-use url::Url;
 
 use crate::moz_readability::{MetaData, Readability};
@@ -51,8 +47,8 @@ impl Extractor {
     }
 
     /// Traverses the DOM tree of the content and retrieves the IMG URLs
-    fn extract_img_urls(&mut self) {
-        if let Some(content_ref) = &self.readability.article_node {
+    pub fn extract_img_urls(&mut self) {
+        if let Some(content_ref) = &self.article {
             for img_ref in content_ref.select("img").unwrap() {
                 img_ref.as_node().as_element().map(|img_elem| {
                     img_elem.attributes.borrow().get("src").map(|img_url| {
@@ -65,80 +61,6 @@ impl Extractor {
         }
     }
 
-    pub async fn download_images(&mut self, article_origin: &Url) -> async_std::io::Result<()> {
-        let mut async_download_tasks = Vec::with_capacity(self.img_urls.len());
-        self.extract_img_urls();
-        if self.img_urls.len() > 0 {
-            println!("Downloading images...");
-        }
-
-        for img_url in &self.img_urls {
-            let img_url = img_url.0.clone();
-            let abs_url = get_absolute_url(&img_url, article_origin);
-
-            async_download_tasks.push(task::spawn(async move {
-                let mut img_response = surf::Client::new()
-                    // The middleware has been temporarily commented out because it happens
-                    // to affect downloading images when there is no redirecting
-                    // .with(surf::middleware::Redirect::default())
-                    .get(&abs_url)
-                    .await
-                    .expect("Unable to retrieve file");
-                let img_content: Vec<u8> = img_response.body_bytes().await.unwrap();
-                let img_mime = img_response
-                    .content_type()
-                    .map(|mime| mime.essence().to_string());
-                let img_ext = img_response
-                    .content_type()
-                    .map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
-                    .unwrap();
-
-                let mut img_path = std::env::temp_dir();
-                img_path.push(format!("{}.{}", hash_url(&abs_url), &img_ext));
-                let mut img_file = File::create(&img_path)
-                    .await
-                    .expect("Unable to create file");
-                img_file
-                    .write_all(&img_content)
-                    .await
-                    .expect("Unable to save to file");
-
-                (
-                    img_url,
-                    img_path
-                        .file_name()
-                        .map(|os_str_name| {
-                            os_str_name
-                                .to_str()
-                                .expect("Unable to get image file name")
-                                .to_string()
-                        })
-                        .unwrap(),
-                    img_mime,
-                )
-            }));
-        }
-
-        self.img_urls.clear();
-        for async_task in async_download_tasks {
-            let (img_url, img_path, img_mime) = async_task.await;
-            // Update the image sources
-            let img_ref = self
-                .readability
-                .article_node
-                .as_mut()
-                .expect("Unable to get mutable ref")
-                .select_first(&format!("img[src='{}']", img_url))
-                .expect("Image node does not exist");
-            let mut img_node = img_ref.attributes.borrow_mut();
-            *img_node.get_mut("src").unwrap() = img_path.clone();
-            // srcset is removed because readers such as Foliate then fail to display
-            // the image already downloaded and stored in src
-            img_node.remove("srcset");
-            self.img_urls.push((img_path, img_mime));
-        }
-        Ok(())
-    }
-
     pub fn article(&self) -> Option<&NodeRef> {
         self.article.as_ref()
     }
@@ -148,40 +70,6 @@ impl Extractor {
     }
 }
 
-/// Utility for hashing URLs. This is used to help store files locally with unique values
-fn hash_url(url: &str) -> String {
-    format!("{:x}", md5::compute(url.as_bytes()))
-}
-
-/// Handles getting the extension from a given MIME subtype.
-fn map_mime_subtype_to_ext(subtype: &str) -> &str {
-    if subtype == ("svg+xml") {
-        return "svg";
-    } else if subtype == "x-icon" {
-        "ico"
-    } else {
-        subtype
-    }
-}
-
-fn get_absolute_url(url: &str, request_url: &Url) -> String {
-    if Url::parse(url).is_ok() {
-        url.to_owned()
-    } else if url.starts_with("/") {
-        Url::parse(&format!(
-            "{}://{}",
-            request_url.scheme(),
-            request_url.host_str().unwrap()
-        ))
-        .unwrap()
-        .join(url)
-        .unwrap()
-        .into_string()
-    } else {
-        request_url.join(url).unwrap().into_string()
-    }
-}
-
 /// Serializes a NodeRef to a string that is XHTML compatible
 /// The only DOM nodes serialized are Text and Element nodes
 pub fn serialize_to_xhtml<W: std::io::Write>(
@@ -278,19 +166,4 @@ mod test {
             extractor.img_urls
         );
     }
-
-    #[test]
-    fn test_map_mime_type_to_ext() {
-        let mime_subtypes = vec![
-            "apng", "bmp", "gif", "x-icon", "jpeg", "png", "svg+xml", "tiff", "webp",
-        ];
-        let exts = mime_subtypes
-            .into_iter()
-            .map(|mime_type| map_mime_subtype_to_ext(mime_type))
-            .collect::<Vec<_>>();
-        assert_eq!(
-            vec!["apng", "bmp", "gif", "ico", "jpeg", "png", "svg", "tiff", "webp"],
-            exts
-        );
-    }
 }

src/http.rs (new file, 188 lines)

use async_std::io::prelude::*;
use async_std::{fs::File, stream};
use futures::StreamExt;
use url::Url;

use crate::extractor::Extractor;

type HTMLResource = (String, String);

pub async fn fetch_url(
    url: &str,
) -> Result<HTMLResource, Box<dyn std::error::Error + Send + Sync>> {
    let client = surf::Client::new();
    println!("Fetching...");

    let mut redirect_count: u8 = 0;
    let base_url = Url::parse(&url)?;
    let mut url = base_url.clone();
    while redirect_count < 5 {
        redirect_count += 1;
        let req = surf::get(&url);
        let mut res = client.send(req).await?;
        if res.status().is_redirection() {
            if let Some(location) = res.header(surf::http::headers::LOCATION) {
                match Url::parse(location.last().as_str()) {
                    Ok(valid_url) => url = valid_url,
                    Err(e) => match e {
                        url::ParseError::RelativeUrlWithoutBase => {
                            url = base_url.join(location.last().as_str())?
                        }
                        e => return Err(e.into()),
                    },
                };
            }
        } else if res.status().is_success() {
            if let Some(mime) = res.content_type() {
                if mime.essence() == "text/html" {
                    return Ok((url.to_string(), res.body_string().await?));
                } else {
                    return Err(format!(
                        "Invalid HTTP response. Received {} instead of text/html",
                        mime.essence()
                    )
                    .into());
                }
            } else {
                return Err("Unknown HTTP response".into());
            }
        } else {
            return Err(format!("Request failed: HTTP {}", res.status()).into());
        }
    }
    Err("Unable to fetch HTML".into())
}

pub async fn download_images(
    extractor: &mut Extractor,
    article_origin: &Url,
) -> async_std::io::Result<()> {
    if extractor.img_urls.len() > 0 {
        println!("Downloading images...");
    }
    let imgs_req_iter = extractor
        .img_urls
        .iter()
        .map(|(url, _)| {
            (
                url,
                surf::Client::new().get(get_absolute_url(&url, article_origin)),
            )
        })
        .map(|(url, req)| async move {
            let mut img_response = req.await.expect("Unable to retrieve image");
            let img_content: Vec<u8> = img_response.body_bytes().await.unwrap();
            let img_mime = img_response
                .content_type()
                .map(|mime| mime.essence().to_string());
            let img_ext = img_response
                .content_type()
                .map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
                .unwrap();

            let mut img_path = std::env::temp_dir();
            img_path.push(format!("{}.{}", hash_url(&url), &img_ext));
            let mut img_file = File::create(&img_path)
                .await
                .expect("Unable to create file");
            img_file
                .write_all(&img_content)
                .await
                .expect("Unable to save to file");

            (
                url,
                img_path
                    .file_name()
                    .map(|os_str_name| {
                        os_str_name
                            .to_str()
                            .expect("Unable to get image file name")
                            .to_string()
                    })
                    .unwrap(),
                img_mime,
            )
        });

    // A utility closure used when updating the value of an image source after downloading is successful
    let replace_existing_img_src =
        |img_item: (&String, String, Option<String>)| -> (String, Option<String>) {
            let (img_url, img_path, img_mime) = img_item;
            let img_ref = extractor
                .article()
                .as_mut()
                .expect("Unable to get mutable ref")
                .select_first(&format!("img[src='{}']", img_url))
                .expect("Image node does not exist");
            let mut img_node = img_ref.attributes.borrow_mut();
            *img_node.get_mut("src").unwrap() = img_path.clone();
            // srcset is removed because readers such as Foliate then fail to display
            // the image already downloaded and stored in src
            img_node.remove("srcset");
            (img_path, img_mime)
        };

    extractor.img_urls = stream::from_iter(imgs_req_iter)
        .buffered(10)
        .collect::<Vec<_>>()
        .await
        .into_iter()
        .map(replace_existing_img_src)
        .collect();
    Ok(())
}

/// Handles getting the extension from a given MIME subtype.
fn map_mime_subtype_to_ext(subtype: &str) -> &str {
    if subtype == ("svg+xml") {
        return "svg";
    } else if subtype == "x-icon" {
        "ico"
    } else {
        subtype
    }
}

/// Utility for hashing URLs. This is used to help store files locally with unique values
fn hash_url(url: &str) -> String {
    format!("{:x}", md5::compute(url.as_bytes()))
}

fn get_absolute_url(url: &str, request_url: &Url) -> String {
    if Url::parse(url).is_ok() {
        url.to_owned()
    } else if url.starts_with("/") {
        Url::parse(&format!(
            "{}://{}",
            request_url.scheme(),
            request_url.host_str().unwrap()
        ))
        .unwrap()
        .join(url)
        .unwrap()
        .into_string()
    } else {
        request_url.join(url).unwrap().into_string()
    }
}

#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn test_map_mime_type_to_ext() {
        let mime_subtypes = vec![
            "apng", "bmp", "gif", "x-icon", "jpeg", "png", "svg+xml", "tiff", "webp",
        ];
        let exts = mime_subtypes
            .into_iter()
            .map(|mime_type| map_mime_subtype_to_ext(mime_type))
            .collect::<Vec<_>>();
        assert_eq!(
            vec!["apng", "bmp", "gif", "ico", "jpeg", "png", "svg", "tiff", "webp"],
            exts
        );
    }
}
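Both `download_images` above and `download` in src/main.rs lean on `futures::stream::StreamExt::buffered`, which polls up to N futures concurrently while still yielding results in submission order. A minimal, self-contained sketch of the pattern, with toy sleep jobs standing in for HTTP requests:

```rust
use async_std::{stream, task};
use futures::stream::StreamExt;
use std::time::Duration;

fn main() {
    task::block_on(async {
        // Four toy "downloads"; at most two are in flight at any moment.
        // Later jobs finish sooner, yet results arrive in submission order.
        let jobs = (1u64..=4).map(|n| async move {
            task::sleep(Duration::from_millis(50 / n)).await;
            n * n
        });
        let results: Vec<u64> = stream::from_iter(jobs).buffered(2).collect().await;
        assert_eq!(results, vec![1, 4, 9, 16]);
    });
}
```

In `download_images` the buffer is fixed at 10 in-flight image fetches, while `download` in src/main.rs uses the user-supplied `--max_conn` value.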

src/main.rs

@@ -1,129 +1,56 @@
 #[macro_use]
 extern crate lazy_static;
 
-use std::fs::File;
-
+use async_std::stream;
 use async_std::task;
-use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
+use futures::stream::StreamExt;
 use url::Url;
 
 mod cli;
+mod epub;
 mod extractor;
+/// This module is responsible for async HTTP calls for downloading
+/// the HTML content and images
+mod http;
 mod moz_readability;
 
+use cli::AppConfig;
+use epub::generate_epubs;
 use extractor::Extractor;
+use http::{download_images, fetch_url};
 
 fn main() {
-    let app = cli::cli_init();
-    let arg_matches = app.get_matches();
-    if let Some(vals) = arg_matches.values_of("urls") {
-        let urls = vals.map(|val| val.to_string()).collect::<Vec<_>>();
-        download(urls);
+    let app_config = cli::cli_init();
+
+    if !app_config.urls().is_empty() {
+        download(app_config);
     }
 }
 
-type HTMLResource = (String, String);
-
-async fn fetch_url(url: &str) -> Result<HTMLResource, Box<dyn std::error::Error + Send + Sync>> {
-    let client = surf::Client::new();
-    println!("Fetching...");
-
-    let mut redirect_count: u8 = 0;
-    let base_url = Url::parse(&url)?;
-    let mut url = base_url.clone();
-    while redirect_count < 5 {
-        redirect_count += 1;
-        let req = surf::get(&url);
-        let mut res = client.send(req).await?;
-        if res.status().is_redirection() {
-            if let Some(location) = res.header(surf::http::headers::LOCATION) {
-                match Url::parse(location.last().as_str()) {
-                    Ok(valid_url) => url = valid_url,
-                    Err(e) => match e {
-                        url::ParseError::RelativeUrlWithoutBase => {
-                            url = base_url.join(location.last().as_str())?
-                        }
-                        e => return Err(e.into()),
-                    },
-                };
-            }
-        } else if res.status().is_success() {
-            if let Some(mime) = res.content_type() {
-                if mime.essence() == "text/html" {
-                    return Ok((url.to_string(), res.body_string().await?));
-                } else {
-                    return Err(format!(
-                        "Invalid HTTP response. Received {} instead of text/html",
-                        mime.essence()
-                    )
-                    .into());
-                }
-            } else {
-                return Err("Unknown HTTP response".into());
-            }
-        } else {
-            return Err(format!("Request failed: HTTP {}", res.status()).into());
-        }
-    }
-    Err("Unable to fetch HTML".into())
-}
-
-fn download(urls: Vec<String>) {
-    let mut async_url_tasks = Vec::with_capacity(urls.len());
-    for url in urls {
-        async_url_tasks.push(task::spawn(async move { fetch_url(&url).await }));
-    }
-    task::block_on(async {
-        for url_task in async_url_tasks {
-            match url_task.await {
+fn download(app_config: AppConfig) {
+    let articles = task::block_on(async {
+        let urls_iter = app_config.urls().iter().map(|url| fetch_url(url));
+        let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn());
+        let mut articles = Vec::new();
+        while let Some(fetch_result) = responses.next().await {
+            match fetch_result {
                 Ok((url, html)) => {
                     println!("Extracting");
                     let mut extractor = Extractor::from_html(&html);
                     extractor.extract_content(&url);
 
                     if extractor.article().is_some() {
-                        extractor
-                            .download_images(&Url::parse(&url).unwrap())
+                        extractor.extract_img_urls();
+                        download_images(&mut extractor, &Url::parse(&url).unwrap())
                             .await
                             .expect("Unable to download images");
-                        let file_name = format!(
-                            "{}.epub",
-                            extractor
-                                .metadata()
-                                .title()
-                                .replace("/", " ")
-                                .replace("\\", " ")
-                        );
-                        let mut out_file = File::create(&file_name).unwrap();
-                        let mut html_buf = Vec::new();
-                        extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf)
-                            .expect("Unable to serialize to xhtml");
-                        let html_buf = std::str::from_utf8(&html_buf).unwrap();
-                        let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
-                        if let Some(author) = extractor.metadata().byline() {
-                            epub.metadata("author", author.replace("&", "&amp;"))
-                                .unwrap();
-                        }
-                        epub.metadata("title", extractor.metadata().title().replace("&", "&amp;"))
-                            .unwrap();
-                        epub.add_content(EpubContent::new("index.xhtml", html_buf.as_bytes()))
-                            .unwrap();
-                        for img in extractor.img_urls {
-                            let mut file_path = std::env::temp_dir();
-                            file_path.push(&img.0);
-                            let img_buf = File::open(&file_path).expect("Can't read file");
-                            epub.add_resource(
-                                file_path.file_name().unwrap(),
-                                img_buf,
-                                img.1.unwrap(),
-                            )
-                            .unwrap();
-                        }
-                        epub.generate(&mut out_file).unwrap();
-                        println!("Created {:?}", file_name);
+                        articles.push(extractor);
                     }
                 }
-                Err(e) => println!("{}", e),
+                Err(e) => eprintln!("{}", e),
             }
         }
-    })
+        articles
+    });
+    generate_epubs(articles, app_config.merged());
 }

src/moz_readability.rs

@@ -462,7 +462,12 @@ impl Readability {
             .iter()
             .find(|key| values.contains_key(**key))
         {
-            values.get(*key).map(|title| title.to_owned()).unwrap()
+            let title = values.get(*key).map(|title| title.to_owned()).unwrap();
+            if title.is_empty() {
+                self.get_article_title()
+            } else {
+                title
+            }
         } else {
             self.get_article_title()
         };
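This change guards against pages whose meta tags carry a title key with an empty value: instead of producing an article (and hence an EPUB) with a blank title, extraction now falls back to `get_article_title()`. A standalone restatement of the decision (names here are illustrative, not from the codebase):

```rust
// An empty meta title defers to the fallback, mirroring the diff above.
fn choose_title(meta_title: Option<&str>, fallback: &str) -> String {
    match meta_title {
        Some(t) if !t.is_empty() => t.to_string(),
        _ => fallback.to_string(), // stands in for self.get_article_title()
    }
}

fn main() {
    assert_eq!(choose_title(Some(""), "From <h1>"), "From <h1>");
    assert_eq!(choose_title(Some("Meta title"), "ignored"), "Meta title");
}
```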