commit
e9f96d2970
9 changed files with 520 additions and 283 deletions
104
Cargo.lock
generated
104
Cargo.lock
generated
|
@ -187,7 +187,7 @@ dependencies = [
|
|||
"memchr",
|
||||
"num_cpus",
|
||||
"once_cell",
|
||||
"pin-project-lite",
|
||||
"pin-project-lite 0.1.11",
|
||||
"pin-utils",
|
||||
"slab",
|
||||
"wasm-bindgen-futures",
|
||||
|
@ -684,25 +684,52 @@ dependencies = [
|
|||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures-channel"
|
||||
version = "0.3.8"
|
||||
name = "futures"
|
||||
version = "0.3.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4b7109687aa4e177ef6fe84553af6280ef2778bdb7783ba44c9dc3399110fe64"
|
||||
checksum = "da9052a1a50244d8d5aa9bf55cbc2fb6f357c86cc52e46c62ed390a7180cf150"
|
||||
dependencies = [
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
"futures-executor",
|
||||
"futures-io",
|
||||
"futures-sink",
|
||||
"futures-task",
|
||||
"futures-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures-channel"
|
||||
version = "0.3.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f2d31b7ec7efab6eefc7c57233bb10b847986139d88cc2f5a02a1ae6871a1846"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-sink",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures-core"
|
||||
version = "0.3.8"
|
||||
version = "0.3.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "847ce131b72ffb13b6109a221da9ad97a64cbe48feb1028356b836b47b8f1748"
|
||||
checksum = "79e5145dde8da7d1b3892dad07a9c98fc04bc39892b1ecc9692cf53e2b780a65"
|
||||
|
||||
[[package]]
|
||||
name = "futures-executor"
|
||||
version = "0.3.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e9e59fdc009a4b3096bf94f740a0f2424c082521f20a9b08c5c07c48d90fd9b9"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-task",
|
||||
"futures-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures-io"
|
||||
version = "0.3.8"
|
||||
version = "0.3.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "611834ce18aaa1bd13c4b374f5d653e1027cf99b6b502584ff8c9a64413b30bb"
|
||||
checksum = "28be053525281ad8259d47e4de5de657b25e7bac113458555bb4b70bc6870500"
|
||||
|
||||
[[package]]
|
||||
name = "futures-lite"
|
||||
|
@ -715,15 +742,15 @@ dependencies = [
|
|||
"futures-io",
|
||||
"memchr",
|
||||
"parking",
|
||||
"pin-project-lite",
|
||||
"pin-project-lite 0.1.11",
|
||||
"waker-fn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures-macro"
|
||||
version = "0.3.8"
|
||||
version = "0.3.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "77408a692f1f97bcc61dc001d752e00643408fbc922e4d634c655df50d595556"
|
||||
checksum = "c287d25add322d9f9abdcdc5927ca398917996600182178774032e9f8258fedd"
|
||||
dependencies = [
|
||||
"proc-macro-hack",
|
||||
"proc-macro2",
|
||||
|
@ -733,31 +760,33 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "futures-sink"
|
||||
version = "0.3.8"
|
||||
version = "0.3.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f878195a49cee50e006b02b93cf7e0a95a38ac7b776b4c4d9cc1207cd20fcb3d"
|
||||
checksum = "caf5c69029bda2e743fddd0582d1083951d65cc9539aebf8812f36c3491342d6"
|
||||
|
||||
[[package]]
|
||||
name = "futures-task"
|
||||
version = "0.3.8"
|
||||
version = "0.3.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7c554eb5bf48b2426c4771ab68c6b14468b6e76cc90996f528c3338d761a4d0d"
|
||||
checksum = "13de07eb8ea81ae445aca7b69f5f7bf15d7bf4912d8ca37d6645c77ae8a58d86"
|
||||
dependencies = [
|
||||
"once_cell",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures-util"
|
||||
version = "0.3.8"
|
||||
version = "0.3.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d304cff4a7b99cfb7986f7d43fbe93d175e72e704a8860787cc95e9ffd85cbd2"
|
||||
checksum = "632a8cd0f2a4b3fdea1657f08bde063848c3bd00f9bbf6e256b8be78802e624b"
|
||||
dependencies = [
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
"futures-io",
|
||||
"futures-macro",
|
||||
"futures-sink",
|
||||
"futures-task",
|
||||
"memchr",
|
||||
"pin-project 1.0.2",
|
||||
"pin-project-lite 0.2.4",
|
||||
"pin-utils",
|
||||
"proc-macro-hack",
|
||||
"proc-macro-nested",
|
||||
|
@ -911,7 +940,7 @@ dependencies = [
|
|||
"cookie",
|
||||
"futures-lite",
|
||||
"infer",
|
||||
"pin-project-lite",
|
||||
"pin-project-lite 0.1.11",
|
||||
"rand 0.7.3",
|
||||
"serde",
|
||||
"serde_json",
|
||||
|
@ -1242,11 +1271,12 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "paperoni"
|
||||
version = "0.2.2-alpha1"
|
||||
version = "0.3.0-alpha1"
|
||||
dependencies = [
|
||||
"async-std",
|
||||
"clap",
|
||||
"epub-builder",
|
||||
"futures",
|
||||
"html5ever",
|
||||
"kuchiki",
|
||||
"lazy_static",
|
||||
|
@ -1328,16 +1358,7 @@ version = "0.4.27"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2ffbc8e94b38ea3d2d8ba92aea2983b503cd75d0888d75b86bb37970b5698e15"
|
||||
dependencies = [
|
||||
"pin-project-internal 0.4.27",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pin-project"
|
||||
version = "1.0.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9ccc2237c2c489783abd8c4c80e5450fc0e98644555b1364da68cc29aa151ca7"
|
||||
dependencies = [
|
||||
"pin-project-internal 1.0.2",
|
||||
"pin-project-internal",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -1351,23 +1372,18 @@ dependencies = [
|
|||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pin-project-internal"
|
||||
version = "1.0.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f8e8d2bf0b23038a4424865103a4df472855692821aab4e4f5c3312d461d9e5f"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pin-project-lite"
|
||||
version = "0.1.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c917123afa01924fc84bb20c4c03f004d9c38e5127e3c039bbf7f4b9c76a2f6b"
|
||||
|
||||
[[package]]
|
||||
name = "pin-project-lite"
|
||||
version = "0.2.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "439697af366c49a6d0a010c56a0d97685bc140ce0d377b13a2ea2aa42d64a827"
|
||||
|
||||
[[package]]
|
||||
name = "pin-utils"
|
||||
version = "0.1.0"
|
||||
|
@ -1889,7 +1905,7 @@ dependencies = [
|
|||
"log 0.4.11",
|
||||
"mime_guess",
|
||||
"once_cell",
|
||||
"pin-project-lite",
|
||||
"pin-project-lite 0.1.11",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"web-sys",
|
||||
|
@ -2043,7 +2059,7 @@ checksum = "b0987850db3733619253fe60e17cb59b82d37c7e6c0236bb81e4d6b87c879f27"
|
|||
dependencies = [
|
||||
"cfg-if 0.1.10",
|
||||
"log 0.4.11",
|
||||
"pin-project-lite",
|
||||
"pin-project-lite 0.1.11",
|
||||
"tracing-attributes",
|
||||
"tracing-core",
|
||||
]
|
||||
|
@ -2074,7 +2090,7 @@ version = "0.2.4"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ab7bb6f14721aa00656086e9335d363c5c8747bae02ebe32ea2c7dece5689b4c"
|
||||
dependencies = [
|
||||
"pin-project 0.4.27",
|
||||
"pin-project",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@ description = "A web article downloader"
|
|||
homepage = "https://github.com/hipstermojo/paperoni"
|
||||
repository = "https://github.com/hipstermojo/paperoni"
|
||||
name = "paperoni"
|
||||
version = "0.2.2-alpha1"
|
||||
version = "0.3.0-alpha1"
|
||||
authors = ["Kenneth Gitere <gitere81@gmail.com>"]
|
||||
edition = "2018"
|
||||
license = "MIT"
|
||||
|
@ -15,6 +15,7 @@ readme = "README.md"
|
|||
async-std = "1.7.0"
|
||||
clap = "2.33.3"
|
||||
epub-builder = "0.4.8"
|
||||
futures = "0.3.12"
|
||||
html5ever = "0.25.1"
|
||||
kuchiki = "0.8.1"
|
||||
lazy_static = "1.4.0"
|
||||
|
|
18
README.md
18
README.md
|
@ -12,12 +12,24 @@ Paperoni is a web article downloader written in Rust. The downloaded articles ar
|
|||
paperoni https://en.wikipedia.org/wiki/Pepperoni
|
||||
```
|
||||
|
||||
Paperoni also supports passing multiple links as arguments. If you are on a Unix-like OS, you can simply do something like this:
|
||||
Paperoni also supports passing multiple links as arguments.
|
||||
|
||||
```sh
|
||||
paperoni https://en.wikipedia.org/wiki/Pepperoni https://en.wikipedia.org/wiki/Salami
|
||||
```
|
||||
|
||||
Alternatively, if you are on a Unix-like OS, you can simply do something like this:
|
||||
|
||||
```sh
|
||||
cat links.txt | xargs paperoni
|
||||
```
|
||||
|
||||
Links can also be read from a file, one per line, using the `-f` flag.
|
||||
|
||||
```sh
|
||||
paperoni -f links.txt
|
||||
```
|
||||
|
||||
## How it works
|
||||
|
||||
The URL passed to Paperoni is fetched and the returned HTML response is passed to the extractor.
|
||||
|
@ -27,11 +39,11 @@ This extractor retrieves a possible article using a port of the [Mozilla Readabi
|
|||
|
||||
## How it (currently) doesn't work
|
||||
|
||||
This program is still in alpha so a number of things currently break:
|
||||
This program is still in alpha so a number of things won't work:
|
||||
|
||||
- Certain links with redirects can't be extracted. Such links include urls that are proxying Medium.
|
||||
- Websites that only run with JavaScript cannot be extracted.
|
||||
- Website articles that cannot be extracted by Readability cannot be extracted by Paperoni either.
|
||||
- Code snippets on Medium articles that are lazy loaded will not appear in the EPUB.
|
||||
|
||||
## Running locally
|
||||
|
||||
|
|
108
src/cli.rs
108
src/cli.rs
|
@ -1,12 +1,14 @@
|
|||
use std::{fs::File, io::Read};
|
||||
|
||||
use clap::{App, AppSettings, Arg};
|
||||
|
||||
pub fn cli_init() -> App<'static, 'static> {
|
||||
App::new("paperoni")
|
||||
pub fn cli_init() -> AppConfig {
|
||||
let app = App::new("paperoni")
|
||||
.settings(&[
|
||||
AppSettings::ArgRequiredElseHelp,
|
||||
AppSettings::UnifiedHelpMessage,
|
||||
])
|
||||
.version("0.2.2-alpha1")
|
||||
.version("0.3.0-alpha1")
|
||||
.about(
|
||||
"
|
||||
Paperoni is an article downloader.
|
||||
|
@ -18,4 +20,104 @@ It takes a url and downloads the article content from it and saves it to an epub
|
|||
.help("Urls of web articles")
|
||||
.multiple(true),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("file")
|
||||
.short("f")
|
||||
.long("file")
|
||||
.help("Input file containing links")
|
||||
.takes_value(true),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("output_name")
|
||||
.long("merge")
|
||||
.help("Merge multiple articles into a single epub")
|
||||
.long_help("Merge multiple articles into a single epub that will be given the name provided")
|
||||
.takes_value(true),
|
||||
).arg(
|
||||
Arg::with_name("max_conn")
|
||||
.long("max_conn")
|
||||
.help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8")
|
||||
.long_help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8.\nNOTE: It is advised to use as few connections as needed i.e between 1 and 50. Using more connections can end up overloading your network card with too many concurrent requests.")
|
||||
.takes_value(true));
|
||||
let arg_matches = app.get_matches();
|
||||
let mut urls: Vec<String> = match arg_matches.value_of("file") {
|
||||
Some(file_name) => {
|
||||
if let Ok(mut file) = File::open(file_name) {
|
||||
let mut content = String::new();
|
||||
match file.read_to_string(&mut content) {
|
||||
Ok(_) => content
|
||||
.lines()
|
||||
.filter(|line| !line.is_empty())
|
||||
.map(|line| line.to_owned())
|
||||
.collect(),
|
||||
Err(_) => vec![],
|
||||
}
|
||||
} else {
|
||||
println!("Unable to open file: {}", file_name);
|
||||
vec![]
|
||||
}
|
||||
}
|
||||
None => vec![],
|
||||
};
|
||||
|
||||
if let Some(vals) = arg_matches.values_of("urls") {
|
||||
urls.extend(
|
||||
vals.filter(|val| !val.is_empty())
|
||||
.map(|val| val.to_string()),
|
||||
);
|
||||
}
|
||||
|
||||
let max_conn = arg_matches
|
||||
.value_of("max_conn")
|
||||
.map(|conn_str| conn_str.parse::<usize>().ok())
|
||||
.flatten()
|
||||
.map(|max| if max > 0 { max } else { 1 })
|
||||
.unwrap_or(8);
|
||||
|
||||
let mut app_config = AppConfig::new(max_conn);
|
||||
app_config.set_urls(urls);
|
||||
if let Some(name) = arg_matches.value_of("output_name") {
|
||||
let file_name = if name.ends_with(".epub") && name.len() > 5 {
|
||||
name.to_owned()
|
||||
} else {
|
||||
name.to_owned() + ".epub"
|
||||
};
|
||||
app_config.set_merged(file_name);
|
||||
}
|
||||
app_config
|
||||
}
|
||||
|
||||
/// Settings resolved from the command line for a single run of the program.
#[derive(Debug, Clone)]
pub struct AppConfig {
    /// Links to download, gathered from the input file and the positional arguments.
    urls: Vec<String>,
    /// Maximum number of concurrent HTTP connections used when downloading articles.
    max_conn: usize,
    /// File name of the merged epub, or `None` when every article gets its own epub.
    merged: Option<String>,
}

impl AppConfig {
    /// Creates a configuration with no links and no merged output name.
    fn new(max_conn: usize) -> Self {
        Self {
            urls: vec![],
            max_conn,
            merged: None,
        }
    }

    /// Appends `urls` to the stored links; links added earlier are kept.
    fn set_urls(&mut self, urls: Vec<String>) {
        self.urls.extend(urls);
    }

    /// Records the file name to use for the merged epub.
    fn set_merged(&mut self, name: String) {
        self.merged = Some(name);
    }

    /// Returns the links to download.
    pub fn urls(&self) -> &Vec<String> {
        &self.urls
    }

    /// Returns the maximum number of concurrent HTTP connections.
    pub fn max_conn(&self) -> usize {
        self.max_conn
    }

    /// Returns the merged epub file name, if merging was requested.
    pub fn merged(&self) -> Option<&String> {
        self.merged.as_ref()
    }
}
|
||||
|
|
113
src/epub.rs
Normal file
113
src/epub.rs
Normal file
|
@ -0,0 +1,113 @@
|
|||
use std::fs::File;
|
||||
|
||||
use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
|
||||
|
||||
use crate::extractor::{self, Extractor};
|
||||
|
||||
pub fn generate_epubs(articles: Vec<Extractor>, merged: Option<&String>) {
|
||||
match merged {
|
||||
Some(name) => {
|
||||
let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
|
||||
epub.inline_toc();
|
||||
epub = articles
|
||||
.iter()
|
||||
.enumerate()
|
||||
.fold(epub, |mut epub, (idx, article)| {
|
||||
let mut html_buf = Vec::new();
|
||||
extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)
|
||||
.expect("Unable to serialize to xhtml");
|
||||
let html_str = std::str::from_utf8(&html_buf).unwrap();
|
||||
epub.metadata("title", replace_metadata_value(name))
|
||||
.unwrap();
|
||||
let section_name = article.metadata().title();
|
||||
epub.add_content(
|
||||
EpubContent::new(format!("article_{}.xhtml", idx), html_str.as_bytes())
|
||||
.title(replace_metadata_value(section_name)),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
article.img_urls.iter().for_each(|img| {
|
||||
let mut file_path = std::env::temp_dir();
|
||||
file_path.push(&img.0);
|
||||
|
||||
let img_buf = File::open(&file_path).expect("Can't read file");
|
||||
epub.add_resource(
|
||||
file_path.file_name().unwrap(),
|
||||
img_buf,
|
||||
img.1.as_ref().unwrap(),
|
||||
)
|
||||
.unwrap();
|
||||
});
|
||||
epub
|
||||
});
|
||||
let mut out_file = File::create(&name).unwrap();
|
||||
epub.generate(&mut out_file).unwrap();
|
||||
println!("Created {:?}", name);
|
||||
}
|
||||
None => {
|
||||
for article in articles {
|
||||
let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
|
||||
let file_name = format!(
|
||||
"{}.epub",
|
||||
article
|
||||
.metadata()
|
||||
.title()
|
||||
.replace("/", " ")
|
||||
.replace("\\", " ")
|
||||
);
|
||||
let mut out_file = File::create(&file_name).unwrap();
|
||||
let mut html_buf = Vec::new();
|
||||
extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)
|
||||
.expect("Unable to serialize to xhtml");
|
||||
let html_str = std::str::from_utf8(&html_buf).unwrap();
|
||||
if let Some(author) = article.metadata().byline() {
|
||||
epub.metadata("author", replace_metadata_value(author))
|
||||
.unwrap();
|
||||
}
|
||||
epub.metadata("title", replace_metadata_value(article.metadata().title()))
|
||||
.unwrap();
|
||||
epub.add_content(EpubContent::new("index.xhtml", html_str.as_bytes()))
|
||||
.unwrap();
|
||||
for img in article.img_urls {
|
||||
let mut file_path = std::env::temp_dir();
|
||||
file_path.push(&img.0);
|
||||
|
||||
let img_buf = File::open(&file_path).expect("Can't read file");
|
||||
epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap())
|
||||
.unwrap();
|
||||
}
|
||||
epub.generate(&mut out_file).unwrap();
|
||||
println!("Created {:?}", file_name);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Escapes the characters that are reserved in XML (`&`, `<` and `>`) so that
/// `value` can be added to the epub's metadata.
///
/// `&` is replaced first; otherwise the ampersands introduced by the `<` and
/// `>` replacements would themselves be escaped a second time.
fn replace_metadata_value(value: &str) -> String {
    value
        .replace("&", "&amp;")
        .replace("<", "&lt;")
        .replace(">", "&gt;")
}

#[cfg(test)]
mod test {
    use super::replace_metadata_value;

    #[test]
    fn test_replace_metadata_value() {
        let mut value = "Lorem ipsum";
        assert_eq!(replace_metadata_value(value), "Lorem ipsum");
        value = "Memory safe > memory unsafe";
        assert_eq!(
            replace_metadata_value(value),
            "Memory safe &gt; memory unsafe"
        );
        value = "Author Name <author@mail.example>";
        assert_eq!(
            replace_metadata_value(value),
            "Author Name &lt;author@mail.example&gt;"
        );
    }
}
|
131
src/extractor.rs
131
src/extractor.rs
|
@ -1,10 +1,6 @@
|
|||
use std::collections::HashMap;
|
||||
|
||||
use async_std::fs::File;
|
||||
use async_std::io::prelude::*;
|
||||
use async_std::task;
|
||||
use kuchiki::{traits::*, NodeRef};
|
||||
use url::Url;
|
||||
|
||||
use crate::moz_readability::{MetaData, Readability};
|
||||
|
||||
|
@ -51,8 +47,8 @@ impl Extractor {
|
|||
}
|
||||
|
||||
/// Traverses the DOM tree of the content and retrieves the IMG URLs
|
||||
fn extract_img_urls(&mut self) {
|
||||
if let Some(content_ref) = &self.readability.article_node {
|
||||
pub fn extract_img_urls(&mut self) {
|
||||
if let Some(content_ref) = &self.article {
|
||||
for img_ref in content_ref.select("img").unwrap() {
|
||||
img_ref.as_node().as_element().map(|img_elem| {
|
||||
img_elem.attributes.borrow().get("src").map(|img_url| {
|
||||
|
@ -65,80 +61,6 @@ impl Extractor {
|
|||
}
|
||||
}
|
||||
|
||||
pub async fn download_images(&mut self, article_origin: &Url) -> async_std::io::Result<()> {
|
||||
let mut async_download_tasks = Vec::with_capacity(self.img_urls.len());
|
||||
self.extract_img_urls();
|
||||
if self.img_urls.len() > 0 {
|
||||
println!("Downloading images...");
|
||||
}
|
||||
for img_url in &self.img_urls {
|
||||
let img_url = img_url.0.clone();
|
||||
let abs_url = get_absolute_url(&img_url, article_origin);
|
||||
|
||||
async_download_tasks.push(task::spawn(async move {
|
||||
let mut img_response = surf::Client::new()
|
||||
// The middleware has been temporarily commented out because it happens
|
||||
// to affect downloading images when there is no redirecting
|
||||
// .with(surf::middleware::Redirect::default())
|
||||
.get(&abs_url)
|
||||
.await
|
||||
.expect("Unable to retrieve file");
|
||||
let img_content: Vec<u8> = img_response.body_bytes().await.unwrap();
|
||||
let img_mime = img_response
|
||||
.content_type()
|
||||
.map(|mime| mime.essence().to_string());
|
||||
let img_ext = img_response
|
||||
.content_type()
|
||||
.map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
|
||||
.unwrap();
|
||||
let mut img_path = std::env::temp_dir();
|
||||
img_path.push(format!("{}.{}", hash_url(&abs_url), &img_ext));
|
||||
let mut img_file = File::create(&img_path)
|
||||
.await
|
||||
.expect("Unable to create file");
|
||||
img_file
|
||||
.write_all(&img_content)
|
||||
.await
|
||||
.expect("Unable to save to file");
|
||||
|
||||
(
|
||||
img_url,
|
||||
img_path
|
||||
.file_name()
|
||||
.map(|os_str_name| {
|
||||
os_str_name
|
||||
.to_str()
|
||||
.expect("Unable to get image file name")
|
||||
.to_string()
|
||||
})
|
||||
.unwrap(),
|
||||
img_mime,
|
||||
)
|
||||
}));
|
||||
}
|
||||
|
||||
self.img_urls.clear();
|
||||
|
||||
for async_task in async_download_tasks {
|
||||
let (img_url, img_path, img_mime) = async_task.await;
|
||||
// Update the image sources
|
||||
let img_ref = self
|
||||
.readability
|
||||
.article_node
|
||||
.as_mut()
|
||||
.expect("Unable to get mutable ref")
|
||||
.select_first(&format!("img[src='{}']", img_url))
|
||||
.expect("Image node does not exist");
|
||||
let mut img_node = img_ref.attributes.borrow_mut();
|
||||
*img_node.get_mut("src").unwrap() = img_path.clone();
|
||||
// srcset is removed because readers such as Foliate then fail to display
|
||||
// the image already downloaded and stored in src
|
||||
img_node.remove("srcset");
|
||||
self.img_urls.push((img_path, img_mime));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn article(&self) -> Option<&NodeRef> {
|
||||
self.article.as_ref()
|
||||
}
|
||||
|
@ -148,40 +70,6 @@ impl Extractor {
|
|||
}
|
||||
}
|
||||
|
||||
/// Utility for hashing URLs. This is used to help store files locally with unique values
|
||||
fn hash_url(url: &str) -> String {
|
||||
format!("{:x}", md5::compute(url.as_bytes()))
|
||||
}
|
||||
|
||||
/// Handles getting the extension from a given MIME subtype.
|
||||
fn map_mime_subtype_to_ext(subtype: &str) -> &str {
|
||||
if subtype == ("svg+xml") {
|
||||
return "svg";
|
||||
} else if subtype == "x-icon" {
|
||||
"ico"
|
||||
} else {
|
||||
subtype
|
||||
}
|
||||
}
|
||||
|
||||
fn get_absolute_url(url: &str, request_url: &Url) -> String {
|
||||
if Url::parse(url).is_ok() {
|
||||
url.to_owned()
|
||||
} else if url.starts_with("/") {
|
||||
Url::parse(&format!(
|
||||
"{}://{}",
|
||||
request_url.scheme(),
|
||||
request_url.host_str().unwrap()
|
||||
))
|
||||
.unwrap()
|
||||
.join(url)
|
||||
.unwrap()
|
||||
.into_string()
|
||||
} else {
|
||||
request_url.join(url).unwrap().into_string()
|
||||
}
|
||||
}
|
||||
|
||||
/// Serializes a NodeRef to a string that is XHTML compatible
|
||||
/// The only DOM nodes serialized are Text and Element nodes
|
||||
pub fn serialize_to_xhtml<W: std::io::Write>(
|
||||
|
@ -278,19 +166,4 @@ mod test {
|
|||
extractor.img_urls
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_map_mime_type_to_ext() {
|
||||
let mime_subtypes = vec![
|
||||
"apng", "bmp", "gif", "x-icon", "jpeg", "png", "svg+xml", "tiff", "webp",
|
||||
];
|
||||
let exts = mime_subtypes
|
||||
.into_iter()
|
||||
.map(|mime_type| map_mime_subtype_to_ext(mime_type))
|
||||
.collect::<Vec<_>>();
|
||||
assert_eq!(
|
||||
vec!["apng", "bmp", "gif", "ico", "jpeg", "png", "svg", "tiff", "webp"],
|
||||
exts
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
188
src/http.rs
Normal file
188
src/http.rs
Normal file
|
@ -0,0 +1,188 @@
|
|||
use async_std::io::prelude::*;
|
||||
use async_std::{fs::File, stream};
|
||||
use futures::StreamExt;
|
||||
use url::Url;
|
||||
|
||||
use crate::extractor::Extractor;
|
||||
|
||||
type HTMLResource = (String, String);
|
||||
|
||||
pub async fn fetch_url(
|
||||
url: &str,
|
||||
) -> Result<HTMLResource, Box<dyn std::error::Error + Send + Sync>> {
|
||||
let client = surf::Client::new();
|
||||
println!("Fetching...");
|
||||
|
||||
let mut redirect_count: u8 = 0;
|
||||
let base_url = Url::parse(&url)?;
|
||||
let mut url = base_url.clone();
|
||||
while redirect_count < 5 {
|
||||
redirect_count += 1;
|
||||
let req = surf::get(&url);
|
||||
let mut res = client.send(req).await?;
|
||||
if res.status().is_redirection() {
|
||||
if let Some(location) = res.header(surf::http::headers::LOCATION) {
|
||||
match Url::parse(location.last().as_str()) {
|
||||
Ok(valid_url) => url = valid_url,
|
||||
Err(e) => match e {
|
||||
url::ParseError::RelativeUrlWithoutBase => {
|
||||
url = base_url.join(location.last().as_str())?
|
||||
}
|
||||
e => return Err(e.into()),
|
||||
},
|
||||
};
|
||||
}
|
||||
} else if res.status().is_success() {
|
||||
if let Some(mime) = res.content_type() {
|
||||
if mime.essence() == "text/html" {
|
||||
return Ok((url.to_string(), res.body_string().await?));
|
||||
} else {
|
||||
return Err(format!(
|
||||
"Invalid HTTP response. Received {} instead of text/html",
|
||||
mime.essence()
|
||||
)
|
||||
.into());
|
||||
}
|
||||
} else {
|
||||
return Err("Unknown HTTP response".into());
|
||||
}
|
||||
} else {
|
||||
return Err(format!("Request failed: HTTP {}", res.status()).into());
|
||||
}
|
||||
}
|
||||
Err("Unable to fetch HTML".into())
|
||||
}
|
||||
|
||||
pub async fn download_images(
|
||||
extractor: &mut Extractor,
|
||||
article_origin: &Url,
|
||||
) -> async_std::io::Result<()> {
|
||||
if extractor.img_urls.len() > 0 {
|
||||
println!("Downloading images...");
|
||||
}
|
||||
|
||||
let imgs_req_iter = extractor
|
||||
.img_urls
|
||||
.iter()
|
||||
.map(|(url, _)| {
|
||||
(
|
||||
url,
|
||||
surf::Client::new().get(get_absolute_url(&url, article_origin)),
|
||||
)
|
||||
})
|
||||
.map(|(url, req)| async move {
|
||||
let mut img_response = req.await.expect("Unable to retrieve image");
|
||||
let img_content: Vec<u8> = img_response.body_bytes().await.unwrap();
|
||||
let img_mime = img_response
|
||||
.content_type()
|
||||
.map(|mime| mime.essence().to_string());
|
||||
let img_ext = img_response
|
||||
.content_type()
|
||||
.map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
|
||||
.unwrap();
|
||||
|
||||
let mut img_path = std::env::temp_dir();
|
||||
img_path.push(format!("{}.{}", hash_url(&url), &img_ext));
|
||||
let mut img_file = File::create(&img_path)
|
||||
.await
|
||||
.expect("Unable to create file");
|
||||
img_file
|
||||
.write_all(&img_content)
|
||||
.await
|
||||
.expect("Unable to save to file");
|
||||
|
||||
(
|
||||
url,
|
||||
img_path
|
||||
.file_name()
|
||||
.map(|os_str_name| {
|
||||
os_str_name
|
||||
.to_str()
|
||||
.expect("Unable to get image file name")
|
||||
.to_string()
|
||||
})
|
||||
.unwrap(),
|
||||
img_mime,
|
||||
)
|
||||
});
|
||||
|
||||
// A utility closure used when update the value of an image source after downloading is successful
|
||||
let replace_existing_img_src =
|
||||
|img_item: (&String, String, Option<String>)| -> (String, Option<String>) {
|
||||
let (img_url, img_path, img_mime) = img_item;
|
||||
let img_ref = extractor
|
||||
.article()
|
||||
.as_mut()
|
||||
.expect("Unable to get mutable ref")
|
||||
.select_first(&format!("img[src='{}']", img_url))
|
||||
.expect("Image node does not exist");
|
||||
let mut img_node = img_ref.attributes.borrow_mut();
|
||||
*img_node.get_mut("src").unwrap() = img_path.clone();
|
||||
// srcset is removed because readers such as Foliate then fail to display
|
||||
// the image already downloaded and stored in src
|
||||
img_node.remove("srcset");
|
||||
(img_path, img_mime)
|
||||
};
|
||||
|
||||
extractor.img_urls = stream::from_iter(imgs_req_iter)
|
||||
.buffered(10)
|
||||
.collect::<Vec<_>>()
|
||||
.await
|
||||
.into_iter()
|
||||
.map(replace_existing_img_src)
|
||||
.collect();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Maps a MIME subtype to the file extension used when saving an image.
///
/// `svg+xml` becomes `svg` and `x-icon` becomes `ico`; every other subtype is
/// returned unchanged and used as the extension directly.
fn map_mime_subtype_to_ext(subtype: &str) -> &str {
    match subtype {
        "svg+xml" => "svg",
        "x-icon" => "ico",
        other => other,
    }
}
|
||||
|
||||
/// Utility for hashing URLs. This is used to help store files locally with unique values
|
||||
fn hash_url(url: &str) -> String {
|
||||
format!("{:x}", md5::compute(url.as_bytes()))
|
||||
}
|
||||
|
||||
fn get_absolute_url(url: &str, request_url: &Url) -> String {
|
||||
if Url::parse(url).is_ok() {
|
||||
url.to_owned()
|
||||
} else if url.starts_with("/") {
|
||||
Url::parse(&format!(
|
||||
"{}://{}",
|
||||
request_url.scheme(),
|
||||
request_url.host_str().unwrap()
|
||||
))
|
||||
.unwrap()
|
||||
.join(url)
|
||||
.unwrap()
|
||||
.into_string()
|
||||
} else {
|
||||
request_url.join(url).unwrap().into_string()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
#[test]
|
||||
fn test_map_mime_type_to_ext() {
|
||||
let mime_subtypes = vec![
|
||||
"apng", "bmp", "gif", "x-icon", "jpeg", "png", "svg+xml", "tiff", "webp",
|
||||
];
|
||||
let exts = mime_subtypes
|
||||
.into_iter()
|
||||
.map(|mime_type| map_mime_subtype_to_ext(mime_type))
|
||||
.collect::<Vec<_>>();
|
||||
assert_eq!(
|
||||
vec!["apng", "bmp", "gif", "ico", "jpeg", "png", "svg", "tiff", "webp"],
|
||||
exts
|
||||
);
|
||||
}
|
||||
}
|
131
src/main.rs
131
src/main.rs
|
@ -1,129 +1,56 @@
|
|||
#[macro_use]
|
||||
extern crate lazy_static;
|
||||
|
||||
use std::fs::File;
|
||||
|
||||
use async_std::stream;
|
||||
use async_std::task;
|
||||
use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
|
||||
use futures::stream::StreamExt;
|
||||
use url::Url;
|
||||
|
||||
mod cli;
|
||||
mod epub;
|
||||
mod extractor;
|
||||
/// This module is responsible for async HTTP calls for downloading
|
||||
/// the HTML content and images
|
||||
mod http;
|
||||
mod moz_readability;
|
||||
|
||||
use cli::AppConfig;
|
||||
use epub::generate_epubs;
|
||||
use extractor::Extractor;
|
||||
use http::{download_images, fetch_url};
|
||||
|
||||
fn main() {
|
||||
let app = cli::cli_init();
|
||||
let arg_matches = app.get_matches();
|
||||
if let Some(vals) = arg_matches.values_of("urls") {
|
||||
let urls = vals.map(|val| val.to_string()).collect::<Vec<_>>();
|
||||
download(urls);
|
||||
let app_config = cli::cli_init();
|
||||
|
||||
if !app_config.urls().is_empty() {
|
||||
download(app_config);
|
||||
}
|
||||
}
|
||||
|
||||
type HTMLResource = (String, String);
|
||||
|
||||
async fn fetch_url(url: &str) -> Result<HTMLResource, Box<dyn std::error::Error + Send + Sync>> {
|
||||
let client = surf::Client::new();
|
||||
println!("Fetching...");
|
||||
|
||||
let mut redirect_count: u8 = 0;
|
||||
let base_url = Url::parse(&url)?;
|
||||
let mut url = base_url.clone();
|
||||
while redirect_count < 5 {
|
||||
redirect_count += 1;
|
||||
let req = surf::get(&url);
|
||||
let mut res = client.send(req).await?;
|
||||
if res.status().is_redirection() {
|
||||
if let Some(location) = res.header(surf::http::headers::LOCATION) {
|
||||
match Url::parse(location.last().as_str()) {
|
||||
Ok(valid_url) => url = valid_url,
|
||||
Err(e) => match e {
|
||||
url::ParseError::RelativeUrlWithoutBase => {
|
||||
url = base_url.join(location.last().as_str())?
|
||||
}
|
||||
e => return Err(e.into()),
|
||||
},
|
||||
};
|
||||
}
|
||||
} else if res.status().is_success() {
|
||||
if let Some(mime) = res.content_type() {
|
||||
if mime.essence() == "text/html" {
|
||||
return Ok((url.to_string(), res.body_string().await?));
|
||||
} else {
|
||||
return Err(format!(
|
||||
"Invalid HTTP response. Received {} instead of text/html",
|
||||
mime.essence()
|
||||
)
|
||||
.into());
|
||||
}
|
||||
} else {
|
||||
return Err("Unknown HTTP response".into());
|
||||
}
|
||||
} else {
|
||||
return Err(format!("Request failed: HTTP {}", res.status()).into());
|
||||
}
|
||||
}
|
||||
Err("Unable to fetch HTML".into())
|
||||
}
|
||||
|
||||
fn download(urls: Vec<String>) {
|
||||
let mut async_url_tasks = Vec::with_capacity(urls.len());
|
||||
for url in urls {
|
||||
async_url_tasks.push(task::spawn(async move { fetch_url(&url).await }));
|
||||
}
|
||||
task::block_on(async {
|
||||
for url_task in async_url_tasks {
|
||||
match url_task.await {
|
||||
fn download(app_config: AppConfig) {
|
||||
let articles = task::block_on(async {
|
||||
let urls_iter = app_config.urls().iter().map(|url| fetch_url(url));
|
||||
let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn());
|
||||
let mut articles = Vec::new();
|
||||
while let Some(fetch_result) = responses.next().await {
|
||||
match fetch_result {
|
||||
Ok((url, html)) => {
|
||||
println!("Extracting");
|
||||
let mut extractor = Extractor::from_html(&html);
|
||||
extractor.extract_content(&url);
|
||||
|
||||
if extractor.article().is_some() {
|
||||
extractor
|
||||
.download_images(&Url::parse(&url).unwrap())
|
||||
extractor.extract_img_urls();
|
||||
download_images(&mut extractor, &Url::parse(&url).unwrap())
|
||||
.await
|
||||
.expect("Unable to download images");
|
||||
let file_name = format!(
|
||||
"{}.epub",
|
||||
extractor
|
||||
.metadata()
|
||||
.title()
|
||||
.replace("/", " ")
|
||||
.replace("\\", " ")
|
||||
);
|
||||
let mut out_file = File::create(&file_name).unwrap();
|
||||
let mut html_buf = Vec::new();
|
||||
extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf)
|
||||
.expect("Unable to serialize to xhtml");
|
||||
let html_buf = std::str::from_utf8(&html_buf).unwrap();
|
||||
let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
|
||||
if let Some(author) = extractor.metadata().byline() {
|
||||
epub.metadata("author", author.replace("&", "&"))
|
||||
.unwrap();
|
||||
}
|
||||
epub.metadata("title", extractor.metadata().title().replace("&", "&"))
|
||||
.unwrap();
|
||||
epub.add_content(EpubContent::new("index.xhtml", html_buf.as_bytes()))
|
||||
.unwrap();
|
||||
for img in extractor.img_urls {
|
||||
let mut file_path = std::env::temp_dir();
|
||||
file_path.push(&img.0);
|
||||
|
||||
let img_buf = File::open(&file_path).expect("Can't read file");
|
||||
epub.add_resource(
|
||||
file_path.file_name().unwrap(),
|
||||
img_buf,
|
||||
img.1.unwrap(),
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
epub.generate(&mut out_file).unwrap();
|
||||
println!("Created {:?}", file_name);
|
||||
articles.push(extractor);
|
||||
}
|
||||
}
|
||||
Err(e) => println!("{}", e),
|
||||
Err(e) => eprintln!("{}", e),
|
||||
}
|
||||
}
|
||||
})
|
||||
articles
|
||||
});
|
||||
generate_epubs(articles, app_config.merged());
|
||||
}
|
||||
|
|
|
@ -462,7 +462,12 @@ impl Readability {
|
|||
.iter()
|
||||
.find(|key| values.contains_key(**key))
|
||||
{
|
||||
values.get(*key).map(|title| title.to_owned()).unwrap()
|
||||
let title = values.get(*key).map(|title| title.to_owned()).unwrap();
|
||||
if title.is_empty() {
|
||||
self.get_article_title()
|
||||
} else {
|
||||
title
|
||||
}
|
||||
} else {
|
||||
self.get_article_title()
|
||||
};
|
||||
|
|
Loading…
Reference in a new issue