commit 3958261cda
16 changed files with 746 additions and 148 deletions

.gitignore (vendored) | 3

@@ -1,4 +1,7 @@
 /target
 *.epub
+# Only ignore top level html files which may be made when testing
+/*.html
+*.pdf
 *.log
 .vscode/

Cargo.lock (generated) | 10

@@ -395,6 +395,7 @@ dependencies = [
 "textwrap",
 "unicode-width",
 "vec_map",
+"yaml-rust",
 ]

 [[package]]
@@ -1551,9 +1552,10 @@ dependencies = [

 [[package]]
 name = "paperoni"
-version = "0.5.0-alpha1"
+version = "0.6.0-alpha1"
 dependencies = [
 "async-std",
+"base64",
 "chrono",
 "clap",
 "colored",
@@ -2756,6 +2758,12 @@ version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
+
+[[package]]
+name = "yaml-rust"
+version = "0.3.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e66366e18dc58b46801afbf2ca7661a9f59cc8c5962c29892b6039b4f86fa992"

 [[package]]
 name = "yansi"
 version = "0.5.0"

Cargo.toml

@@ -3,7 +3,7 @@ description = "A web article downloader"
 homepage = "https://github.com/hipstermojo/paperoni"
 repository = "https://github.com/hipstermojo/paperoni"
 name = "paperoni"
-version = "0.5.0-alpha1"
+version = "0.6.0-alpha1"
 authors = ["Kenneth Gitere <gitere81@gmail.com>"]
 edition = "2018"
 license = "MIT"
@@ -14,8 +14,9 @@ readme = "README.md"
 [dependencies]
 # atty = "0.2.14"
 async-std = "1.9.0"
+base64 = "0.13.0"
 chrono = "0.4.19"
-clap = "2.33.3"
+clap = { version = "2.33.3", features = ["yaml"] }
 colored = "2.0.0"
 comfy-table = "3.0.0"
 derive_builder = "0.10.2"

README.md | 108

@@ -8,7 +8,7 @@
 </a>
 </div>

-Paperoni is a CLI tool made in Rust for downloading web articles as EPUBs. There is provisional<sup><a href="#pdf-exports">\*</a></sup> support for exporting to PDF as well.
+Paperoni is a CLI tool made in Rust for downloading web articles as EPUB or HTML files. There is provisional<sup><a href="#pdf-exports">\*</a></sup> support for exporting to PDF as well.

 > This project is in an alpha release so it might crash when you use it. Please open an [issue on Github](https://github.com/hipstermojo/paperoni/issues/new) if it does crash.

@@ -23,7 +23,7 @@ Check the [releases](https://github.com/hipstermojo/paperoni/releases) page for
 Paperoni is published on [crates.io](https://crates.io). If you have [cargo](https://github.com/rust-lang/cargo) installed, then run:

 ```sh
-cargo install paperoni --version 0.5.0-alpha1
+cargo install paperoni --version 0.6.0-alpha1
 ```

 _Paperoni is still in alpha so the `version` flag has to be passed._
@@ -48,39 +48,54 @@ USAGE:
     paperoni [OPTIONS] [urls]...

 OPTIONS:
+        --export <type>
+            Specify the file type of the export. The type must be in lower case. [default: epub] [possible values:
+            html, epub]
     -f, --file <file>
             Input file containing links

     -h, --help
             Prints help information

+        --inline-images
+            Inlines the article images when exporting to HTML using base64.
+            This is used when you do not want a separate folder created for images during HTML export.
+            NOTE: It uses base64 encoding on the images which results in larger HTML export sizes as each image
+            increases in size by about 25%-33%.
         --inline-toc
-            Add an inlined Table of Contents page at the start of the merged article.
+            Add an inlined Table of Contents page at the start of the merged article. This does not affect the Table of Contents navigation

         --log-to-file
            Enables logging of events to a file located in .paperoni/logs with a default log level of debug. Use -v to
            specify the logging level
-        --max-conn <max_conn>
+        --max-conn <max-conn>
            The maximum number of concurrent HTTP connections when downloading articles. Default is 8.
            NOTE: It is advised to use as few connections as needed i.e between 1 and 50. Using more connections can end
            up overloading your network card with too many concurrent requests.
-    -o, --output-dir <output_directory>
-            Directory for saving epub documents
-        --merge <output_name>
+        --no-css
+            Removes the stylesheets used in the EPUB generation.
+            The EPUB file will then be laid out based on your e-reader's default stylesheets.
+            Images and code blocks may overflow when this flag is set and layout of generated
+            PDFs will be affected. Use --no-header-css if you want to only disable the styling on headers.
+        --no-header-css
+            Removes the header CSS styling but preserves styling of images and codeblocks. To remove all the default
+            CSS, use --no-css instead.
+        --merge <output-name>
             Merge multiple articles into a single epub that will be given the name provided

+    -o, --output-dir <output_directory>
+            Directory to store output epub documents
+
     -V, --version
             Prints version information

     -v
             This takes upto 4 levels of verbosity in the following order.
             - Error (-v)
             - Warn (-vv)
             - Info (-vvv)
             - Debug (-vvvv)
             When this flag is passed, it disables the progress bars and logs to stderr.
             If you would like to send the logs to a file (and enable progress bars), pass the log-to-file flag.

 ARGS:
     <urls>...
@@ -112,6 +127,41 @@ These can also be read from a file using the `-f/--file` flag.
 paperoni -f links.txt
 ```

+### Exporting articles
+
+By default, Paperoni exports to EPUB files but you can change to HTML by passing the `--export html` flag.
+
+```sh
+paperoni https://en.wikipedia.org/wiki/Pepperoni --export html
+```
+
+HTML exports allow you to read the articles as plain HTML documents on your browser but can also be used to convert to PDF as explained [here](#).
+
+When exporting to HTML, Paperoni will download the article's images to a folder named similar to the article. Therefore the folder structure would look like this for the command ran above:
+
+```
+.
+├── Pepperoni - Wikipedia
+│   ├── 1a9f886e9b58db72e0003a2cd52681d8.png
+│   ├── 216f8a4265a1ceb3f8cfba4c2f9057b1.jpeg
+│   ...
+└── Pepperoni - Wikipedia.html
+```
+
+If you would instead prefer to have the images inlined directly to the HTML export, pass the `inline-images` flag, i.e.:
+
+```sh
+paperoni https://en.wikipedia.org/wiki/Pepperoni --export html --inline-images
+```
+
+This is especially useful when exporting multiple links.
+
+**NOTE**: The inlining of images for HTML exports uses base64 encoding which is known to increase the overall size of images by about 25% to 33%.
+
+### Disabling CSS
+
+The `no-css` and `no-header-css` flags can be used to remove the default styling added by Paperoni. Refer to `--help` to see the usage of the flags.
+
 ### Merging articles

 By default, Paperoni generates an epub file for each link. You can also merge multiple links
@@ -153,7 +203,11 @@ There are also web pages it won't work on in general such as Twitter and Reddit

 ## PDF exports

-As of version 0.5-alpha1, you can now export to PDF using a third party tool. This requires that you install [Calibre](https://calibre-ebook.com/) which comes with a ebook conversion. You can convert the epub to a pdf through the terminal with `ebook-convert`:
+PDF conversion can be done using a third party tool. There are 2 options to do so:

+### EPUB to PDF
+
+This requires that you install [Calibre](https://calibre-ebook.com/) which comes with a ebook conversion. You can convert the epub to a pdf through the terminal with `ebook-convert`:
+
 ```sh
 # Assuming the downloaded epub was called foo.epub
@@ -161,3 +215,25 @@ ebook-convert foo.epub foo.pdf
 ```

 Alternatively, you can use the Calibre GUI to do the file conversion.
+
+### HTML to PDF
+
+The recommended approach is to use [Weasyprint](https://weasyprint.org/start/), a free and open-source tool that converts HTML documents to PDF. It is available on Linux, MacOS and Windows. Using the CLI, it can be done as follows:
+
+```sh
+paperoni https://en.wikipedia.org/wiki/Pepperoni --export html
+weasyprint "Pepperoni - Wikipedia.html" Pepperoni.pdf
+```
+
+Inlining images is not mandatory as Weasyprint will be able to find the files on its own.
+
+### Comparison of PDF conversion methods
+
+Either of the conversion methods is sufficient for most use cases. The main differences are listed below:
+| | EPUB to PDF | HTML to PDF |
+|----------------------|----------------------------|------------------|
+| Wrapping code blocks | Yes | No |
+| CSS customization | No | Yes |
+| Generated file size | Slightly larger | Slightly smaller |
+
+The difference in file size is due to the additional fonts added to the PDF file by `ebook-convert`.

src/assets/body.min.css (vendored, new file) | 7

@@ -0,0 +1,7 @@
/*!
 * Writ v1.0.4
 *
 * Copyright © 2015, Curtis McEnroe <curtis@cmcenroe.me>
 *
 * https://cmcenroe.me/writ/LICENSE (ISC)
*/dd,hr,ol ol,ol ul,ul ol,ul ul{margin:0}pre,table{overflow-x:auto}a,ins{text-decoration:none}html{font-family:Georgia,Lucida Bright,Book Antiqua,serif;font-size:16px;line-height:1.5rem}code,kbd,pre,samp{font-family:Fira Code,Liberation Mono,Menlo,Courier,monospace;font-size:.833rem;color:#111}kbd{font-weight:700}small{font-size:.833em}th{font-weight:400}blockquote,dl,ol,p,pre,table,ul{margin:1.5rem 0 0}pre,table{margin-bottom:-1px}hr{border:none;padding:1.5rem 0 0}table{line-height:calc(1.5rem - 1px);width:100%;border-collapse:collapse}pre{margin-top:calc(1.5rem - 1px)}body{color:#222;margin:1.5rem 1ch}a,a code,header nav a:visited{color:#00e}a:visited,a:visited code{color:#60b}mark{color:inherit;background-color:#fe0}code,pre,samp,tfoot,thead{background-color:rgba(0,0,0,.05)}blockquote,ins,main aside{border:rgba(0,0,0,.05) solid}blockquote,main aside{border-width:0 0 0 .5ch}code,pre,samp{border:rgba(0,0,0,.1) solid}td,th{border:solid #dbdbdb}body>header{text-align:center}body>footer,main{display:block;max-width:78ch;margin:auto}main aside,main figure{float:right;margin:1.5rem 0 0 1ch}main aside{max-width:26ch;padding:0 0 0 .5ch}blockquote{margin-right:3ch;margin-left:1.5ch;padding:0 0 0 1ch}pre{border-width:1px;border-radius:2px;padding:0 .5ch}pre code{border:none;padding:0;background-color:transparent;white-space:inherit}code,ins,samp,td,th{border-width:1px}img{max-width:100%}dd,ol,ul{padding:0 0 0 3ch}ul>li{list-style-type:disc}li ul>li{list-style-type:circle}li li ul>li{list-style-type:square}ol>li{list-style-type:decimal}li ol>li{list-style-type:lower-roman}li li ol>li{list-style-type:lower-alpha}nav ul{padding:0;list-style-type:none}nav ul li{display:inline;padding-left:1ch;white-space:nowrap}nav ul li:first-child{padding-left:0}ins,mark{padding:1px}td,th{padding:0 .5ch}sub,sup{font-size:.75em;line-height:1em}code,samp{border-radius:2px;padding:.1em .2em;white-space:nowrap}

src/assets/headers.min.css (vendored, new file) | 7

@@ -0,0 +1,7 @@
/*!
 * Writ v1.0.4
 *
 * Copyright © 2015, Curtis McEnroe <curtis@cmcenroe.me>
 *
 * https://cmcenroe.me/writ/LICENSE (ISC)
*/h1,h2,h3,h4,h5,h6,th{font-weight:400}h1{font-size:2.488em}h2{font-size:2.074em}h3{font-size:1.728em}h4{font-size:1.44em}h5{font-size:1.2em}h6{font-size:1em}h1,h2,h3{line-height:3rem}h1,h2,h3,h4,h5,h6{margin:1.5rem 0 0}

src/assets/writ.min.css (vendored) | 7

@@ -1,7 +0,0 @@
/*!
 * Writ v1.0.4
 *
 * Copyright © 2015, Curtis McEnroe <curtis@cmcenroe.me>
 *
 * https://cmcenroe.me/writ/LICENSE (ISC)
*/dd,hr,ol ol,ol ul,ul ol,ul ul{margin:0}pre,table{overflow-x:auto}a,ins{text-decoration:none}html{font-family:Georgia,Lucida Bright,Book Antiqua,serif;font-size:16px;line-height:1.5rem}code,kbd,pre,samp{font-family:Fira Code,Liberation Mono,Menlo,Courier,monospace;font-size:.833rem;color:#111}kbd{font-weight:700}h1,h2,h3,h4,h5,h6,th{font-weight:400}h1{font-size:2.488em}h2{font-size:2.074em}h3{font-size:1.728em}h4{font-size:1.44em}h5{font-size:1.2em}h6{font-size:1em}small{font-size:.833em}h1,h2,h3{line-height:3rem}blockquote,dl,h1,h2,h3,h4,h5,h6,ol,p,pre,table,ul{margin:1.5rem 0 0}pre,table{margin-bottom:-1px}hr{border:none;padding:1.5rem 0 0}table{line-height:calc(1.5rem - 1px);width:100%;border-collapse:collapse}pre{margin-top:calc(1.5rem - 1px)}body{color:#222;margin:1.5rem 1ch}a,a code,header nav a:visited{color:#00e}a:visited,a:visited code{color:#60b}mark{color:inherit;background-color:#fe0}code,pre,samp,tfoot,thead{background-color:rgba(0,0,0,.05)}blockquote,ins,main aside{border:rgba(0,0,0,.05) solid}blockquote,main aside{border-width:0 0 0 .5ch}code,pre,samp{border:rgba(0,0,0,.1) solid}td,th{border:solid #dbdbdb}body>header{text-align:center}body>footer,main{display:block;max-width:78ch;margin:auto}main aside,main figure{float:right;margin:1.5rem 0 0 1ch}main aside{max-width:26ch;padding:0 0 0 .5ch}blockquote{margin-right:3ch;margin-left:1.5ch;padding:0 0 0 1ch}pre{border-width:1px;border-radius:2px;padding:0 .5ch}pre code{border:none;padding:0;background-color:transparent;white-space:inherit}code,ins,samp,td,th{border-width:1px}img{max-width:100%}dd,ol,ul{padding:0 0 0 3ch}ul>li{list-style-type:disc}li ul>li{list-style-type:circle}li li ul>li{list-style-type:square}ol>li{list-style-type:decimal}li ol>li{list-style-type:lower-roman}li li ol>li{list-style-type:lower-alpha}nav ul{padding:0;list-style-type:none}nav ul li{display:inline;padding-left:1ch;white-space:nowrap}nav ul li:first-child{padding-left:0}ins,mark{padding:1px}td,th{padding:0 .5ch}sub,sup{font-size:.75em;line-height:1em}code,samp{border-radius:2px;padding:.1em .2em;white-space:nowrap}

src/cli.rs | 132

@@ -1,7 +1,7 @@
 use std::{fs, num::NonZeroUsize, path::Path};

 use chrono::{DateTime, Local};
-use clap::{App, AppSettings, Arg, ArgMatches};
+use clap::{load_yaml, App, ArgMatches};
 use flexi_logger::LevelFilter as LogLevel;
 use itertools::Itertools;

@@ -11,10 +11,10 @@ const DEFAULT_MAX_CONN: usize = 8;

 #[derive(derive_builder::Builder)]
 pub struct AppConfig {
-    /// Urls for store in epub
+    /// Article urls
     pub urls: Vec<String>,
     pub max_conn: usize,
-    /// Path to file of multiple articles into a single epub
+    /// Path to file of multiple articles into a single article
     pub merged: Option<String>,
     pub output_directory: Option<String>,
     pub log_level: LogLevel,
@@ -22,80 +22,15 @@ pub struct AppConfig {
     pub start_time: DateTime<Local>,
     pub is_logging_to_file: bool,
     pub inline_toc: bool,
+    pub css_config: CSSConfig,
+    pub export_type: ExportType,
+    pub is_inlining_images: bool,
 }

 impl AppConfig {
     pub fn init_with_cli() -> Result<AppConfig, Error> {
-        let app = App::new("paperoni")
-            .settings(&[
-                AppSettings::ArgRequiredElseHelp,
-                AppSettings::UnifiedHelpMessage,
-            ])
-            .version(clap::crate_version!())
-            .about(
-                "Paperoni is a CLI tool made in Rust for downloading web articles as EPUBs",
-            )
-            .arg(
-                Arg::with_name("urls")
-                    .help("Urls of web articles")
-                    .multiple(true),
-            )
-            .arg(
-                Arg::with_name("file")
-                    .short("f")
-                    .long("file")
-                    .help("Input file containing links")
-                    .takes_value(true),
-            )
-            .arg(
-                Arg::with_name("output_directory")
-                    .long("output-dir")
-                    .short("o")
-                    .help("Directory to store output epub documents")
-                    .conflicts_with("output_name")
-                    .takes_value(true),
-            )
-            .arg(
-                Arg::with_name("output_name")
-                    .long("merge")
-                    .help("Merge multiple articles into a single epub")
-                    .long_help("Merge multiple articles into a single epub that will be given the name provided")
-                    .conflicts_with("output_directory")
-                    .takes_value(true),
-            ).arg(
-                Arg::with_name("max-conn")
-                    .long("max_conn")
-                    .help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8")
-                    .long_help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8.\nNOTE: It is advised to use as few connections as needed i.e between 1 and 50. Using more connections can end up overloading your network card with too many concurrent requests.")
-                    .takes_value(true))
-            .arg(
-                Arg::with_name("verbosity")
-                    .short("v")
-                    .multiple(true)
-                    .help("Enables logging of events and set the verbosity level. Use --help to read on its usage")
-                    .long_help(
-                        "This takes upto 4 levels of verbosity in the following order.
-- Error (-v)
-- Warn (-vv)
-- Info (-vvv)
-- Debug (-vvvv)
-When this flag is passed, it disables the progress bars and logs to stderr.
-If you would like to send the logs to a file (and enable progress bars), pass the log-to-file flag."
-                    )
-                    .takes_value(false))
-            .arg(
-                Arg::with_name("log-to-file")
-                    .long("log-to-file")
-                    .help("Enables logging of events to a file located in .paperoni/logs with a default log level of debug. Use -v to specify the logging level")
-                    .takes_value(false))
-            .arg(
-                Arg::with_name("inline-toc")
-                    .long("inline-toc")
-                    .requires("output_name")
-                    .help("Add an inlined Table of Contents page at the start of the merged article.")
-                    .long_help("Add an inlined Table of Contents page at the start of the merged article. This does not affect the Table of Contents navigation")
-            );
+        let yaml_config = load_yaml!("cli_config.yml");
+        let app = App::from_yaml(yaml_config).version(clap::crate_version!());

         Self::try_from(app.get_matches())
     }
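
For context on the switch above: clap 2.x can build its `App` from a YAML definition when the crate's `yaml` feature is enabled, which is exactly what the Cargo.toml change earlier in this commit turns on. A minimal sketch of that pattern (not taken from this commit, names are illustrative):

```rust
// Sketch only: assumes clap = { version = "2.33", features = ["yaml"] } in Cargo.toml.
use clap::{load_yaml, App};

fn main() {
    // load_yaml! embeds and parses cli_config.yml at compile time,
    // relative to this source file.
    let yaml = load_yaml!("cli_config.yml");
    let matches = App::from_yaml(yaml)
        .version(clap::crate_version!())
        .get_matches();

    // Arguments are looked up by the names declared in the YAML `args` list.
    if let Some(urls) = matches.values_of("urls") {
        for url in urls {
            println!("queued: {}", url);
        }
    }
}
```
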
@@ -159,11 +94,12 @@ impl<'a> TryFrom<ArgMatches<'a>> for AppConfig {
                 Some(max_conn) => max_conn.parse::<NonZeroUsize>()?.get(),
                 None => DEFAULT_MAX_CONN,
             })
-            .merged(arg_matches.value_of("output_name").map(|name| {
-                if name.ends_with(".epub") {
+            .merged(arg_matches.value_of("output-name").map(|name| {
+                let file_ext = format!(".{}", arg_matches.value_of("export").unwrap());
+                if name.ends_with(&file_ext) {
                     name.to_owned()
                 } else {
-                    name.to_string() + ".epub"
+                    name.to_string() + &file_ext
                 }
             }))
             .can_disable_progress_bar(
@@ -183,7 +119,17 @@ impl<'a> TryFrom<ArgMatches<'a>> for AppConfig {
                 4..=u64::MAX => LogLevel::Debug,
             })
             .is_logging_to_file(arg_matches.is_present("log-to-file"))
-            .inline_toc(arg_matches.is_present("inline-toc"))
+            .inline_toc(
+                (if arg_matches.is_present("inline-toc") {
+                    if arg_matches.value_of("export") == Some("epub") {
+                        Ok(true)
+                    } else {
+                        Err(Error::WrongExportInliningToC)
+                    }
+                } else {
+                    Ok(false)
+                })?,
+            )
             .output_directory(
                 arg_matches
                     .value_of("output_directory")
@@ -200,6 +146,25 @@ impl<'a> TryFrom<ArgMatches<'a>> for AppConfig {
                     .transpose()?,
             )
             .start_time(Local::now())
+            .css_config(
+                match (
+                    arg_matches.is_present("no-css"),
+                    arg_matches.is_present("no-header-css"),
+                ) {
+                    (true, _) => CSSConfig::None,
+                    (_, true) => CSSConfig::NoHeaders,
+                    _ => CSSConfig::All,
+                },
+            )
+            .export_type({
+                let export_type = arg_matches.value_of("export").unwrap();
+                if export_type == "html" {
+                    ExportType::HTML
+                } else {
+                    ExportType::EPUB
+                }
+            })
+            .is_inlining_images(arg_matches.is_present("inline-images"))
             .try_init()
     }
 }
@@ -212,3 +177,16 @@ impl AppConfigBuilder {
             .init_merge_file()
     }
 }
+
+#[derive(Clone, Debug)]
+pub enum CSSConfig {
+    All,
+    NoHeaders,
+    None,
+}
+
+#[derive(Clone, Debug)]
+pub enum ExportType {
+    HTML,
+    EPUB,
+}
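
The new `ExportType` and `CSSConfig` enums are consumed elsewhere in the crate. A hypothetical sketch of how a caller could branch on `export_type` (the real dispatch is in code not shown in this diff); `generate_epubs` and `generate_html_exports` carry the signatures introduced below in src/epub.rs and src/html.rs:

```rust
// Hypothetical dispatch, not part of this commit: picks a generator based on
// the ExportType parsed by AppConfig::init_with_cli().
fn export(
    articles: Vec<crate::extractor::Article>,
    app_config: &crate::cli::AppConfig,
    table: &mut comfy_table::Table,
) -> Result<(), Vec<crate::errors::PaperoniError>> {
    match app_config.export_type {
        crate::cli::ExportType::EPUB => crate::epub::generate_epubs(articles, app_config, table),
        crate::cli::ExportType::HTML => crate::html::generate_html_exports(articles, app_config, table),
    }
}
```
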

src/cli_config.yml (new file) | 82

@@ -0,0 +1,82 @@
name: paperoni
about: Paperoni is a CLI tool made in Rust for downloading web articles as EPUBs
settings:
  - ArgRequiredElseHelp
  - UnifiedHelpMessage
args:
  - urls:
      help: Urls of web articles
      multiple: true
  - file:
      short: f
      long: file
      help: Input file containing links
      takes_value: true
  - output_directory:
      short: o
      long: output-dir
      help: Directory to store output epub documents
      conflicts_with: output-name
      takes_value: true
  - output-name:
      long: merge
      help: Merge multiple articles into a single epub
      long_help: Merge multiple articles into a single epub that will be given the name provided
      conflicts_with: output_directory
      takes_value: true
  - max-conn:
      long: max-conn
      help: The maximum number of concurrent HTTP connections when downloading articles. Default is 8
      long_help: "The maximum number of concurrent HTTP connections when downloading articles. Default is 8.\nNOTE: It is advised to use as few connections as needed i.e between 1 and 50. Using more connections can end up overloading your network card with too many concurrent requests."
      takes_value: true
  - verbosity:
      short: v
      multiple: true
      help: Enables logging of events and set the verbosity level. Use --help to read on its usage
      long_help: "This takes upto 4 levels of verbosity in the following order.
        \n- Error (-v)
        \n- Warn (-vv)
        \n- Info (-vvv)
        \n- Debug (-vvvv)
        \nWhen this flag is passed, it disables the progress bars and logs to stderr.
        \nIf you would like to send the logs to a file (and enable progress bars), pass the log-to-file flag."
      takes_value: false
  - log-to-file:
      long: log-to-file
      help: Enables logging of events to a file located in .paperoni/logs with a default log level of debug. Use -v to specify the logging level
      takes_value: false
  - inline-toc:
      long: inline-toc
      requires: output-name
      help: Add an inlined Table of Contents page at the start of the merged article.
      long_help: Add an inlined Table of Contents page at the start of the merged article. This does not affect the Table of Contents navigation
  - no-css:
      long: no-css
      conflicts_with: no-header-css
      help: Removes the stylesheets used in the EPUB generation. Pass --help to learn more
      long_help: "Removes the stylesheets used in the EPUB generation.
        \nThe EPUB file will then be laid out based on your e-reader's default stylesheets.
        \nImages and code blocks may overflow when this flag is set and layout of generated
        \nPDFs will be affected. Use --no-header-css if you want to only disable the styling on headers."
      takes_value: false
  - no-header-css:
      long: no-header-css
      conflicts_with: no-css
      help: Removes the header CSS styling but preserves styling of images and codeblocks. To remove all the default CSS, use --no-css instead.
      takes_value: false
  - export:
      long: export
      help: Specify the file type of the export. The type must be in lower case.
      possible_values: [html, epub]
      value_name: type
      takes_value: true
      default_value: epub
  - inline-images:
      long: inline-images
      help: Inlines the article images when exporting to HTML using base64. Pass --help to learn more.
      long_help: "Inlines the article images when exporting to HTML using base64.
        \nThis is used when you do not want a separate folder created for images during HTML export.
        \nNOTE: It uses base64 encoding on the images which results in larger HTML export sizes as each image
        increases in size by about 25%-33%."
      takes_value: false
      requires: export

src/epub.rs | 63

@@ -8,14 +8,15 @@ use indicatif::{ProgressBar, ProgressStyle};
 use kuchiki::NodeRef;
 use log::{debug, error, info};

-use crate::{cli::AppConfig, errors::PaperoniError, extractor::Extractor};
+use crate::{cli::AppConfig, errors::PaperoniError, extractor::Article};

 lazy_static! {
     static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r#"(&|<|>|'|")"#).unwrap();
+    static ref VALID_ATTR_CHARS_REGEX: regex::Regex = regex::Regex::new(r#"[a-z0-9\-_:]"#).unwrap();
 }

 pub fn generate_epubs(
-    articles: Vec<Extractor>,
+    articles: Vec<Article>,
     app_config: &AppConfig,
     successful_articles_table: &mut Table,
 ) -> Result<(), Vec<PaperoniError>> {
@@ -37,8 +38,6 @@ pub fn generate_epubs(
         enabled_bar
     };

-    let stylesheet = include_bytes!("./assets/writ.min.css");
-
     let mut errors: Vec<PaperoniError> = Vec::new();

     match app_config.merged {
@@ -71,7 +70,7 @@ pub fn generate_epubs(
                 epub.inline_toc();
             }

-            match epub.stylesheet(stylesheet.as_bytes()) {
+            match add_stylesheets(&mut epub, app_config) {
                 Ok(_) => (),
                 Err(e) => {
                     error!("Unable to add stylesheets to epub file");
@@ -89,9 +88,9 @@ pub fn generate_epubs(
                 let content_url = format!("article_{}.xhtml", idx);
                 let mut xhtml_buf = Vec::new();
                 let header_level_tocs =
-                    get_header_level_toc_vec(&content_url, article.article());
+                    get_header_level_toc_vec(&content_url, article.node_ref());

-                serialize_to_xhtml(article.article(), &mut xhtml_buf)?;
+                serialize_to_xhtml(article.node_ref(), &mut xhtml_buf)?;
                 let xhtml_str = std::str::from_utf8(&xhtml_buf)?;
                 let section_name = article.metadata().title();
                 let mut content = EpubContent::new(&content_url, xhtml_str.as_bytes())
@@ -146,6 +145,8 @@ pub fn generate_epubs(
                 let mut paperoni_err: PaperoniError = err.into();
                 paperoni_err.set_article_source(&name);
                 errors.push(paperoni_err);
+                error!("Failed to generate epub: {}", name);
+                bar.finish_with_message("epub generation failed\n");
                 return Err(errors);
             }
         }
@@ -178,8 +179,8 @@ pub fn generate_epubs(
                 let mut out_file = File::create(&file_name).unwrap();
                 let mut xhtml_buf = Vec::new();
                 let header_level_tocs =
-                    get_header_level_toc_vec("index.xhtml", article.article());
-                serialize_to_xhtml(article.article(), &mut xhtml_buf)
+                    get_header_level_toc_vec("index.xhtml", article.node_ref());
+                serialize_to_xhtml(article.node_ref(), &mut xhtml_buf)
                     .expect("Unable to serialize to xhtml");
                 let xhtml_str = std::str::from_utf8(&xhtml_buf).unwrap();

@@ -187,8 +188,7 @@ pub fn generate_epubs(
                     epub.metadata("author", replace_escaped_characters(author))?;
                 }

-                epub.stylesheet(stylesheet.as_bytes())?;
-
+                add_stylesheets(&mut epub, app_config)?;
                 let title = replace_escaped_characters(article.metadata().title());
                 epub.metadata("title", &title)?;

@@ -205,7 +205,7 @@ pub fn generate_epubs(
                     let mut file_path = std::env::temp_dir();
                     file_path.push(&img.0);

-                    let img_buf = File::open(&file_path).expect("Can't read file");
+                    let img_buf = File::open(&file_path).expect("Can't read image file");
                     epub.add_resource(
                         file_path.file_name().unwrap(),
                         img_buf,
@@ -249,8 +249,27 @@ fn replace_escaped_characters(value: &str) -> String {
         .replace(">", "&gt;")
 }

+fn add_stylesheets<T: epub_builder::Zip>(
+    epub: &mut EpubBuilder<T>,
+    app_config: &AppConfig,
+) -> Result<(), epub_builder::Error> {
+    let body_stylesheet: &[u8] = include_bytes!("./assets/body.min.css");
+    let header_stylesheet: &[u8] = include_bytes!("./assets/headers.min.css");
+    match app_config.css_config {
+        crate::cli::CSSConfig::All => {
+            epub.stylesheet([header_stylesheet, body_stylesheet].concat().as_bytes())?;
+            Ok(())
+        }
+        crate::cli::CSSConfig::NoHeaders => {
+            epub.stylesheet(body_stylesheet.as_bytes())?;
+            Ok(())
+        }
+        _ => Ok(()),
+    }
+}
+
 //TODO: The type signature of the argument should change as it requires that merged articles create an entirely new Vec of references
-fn generate_appendix(articles: Vec<&Extractor>) -> String {
+fn generate_appendix(articles: Vec<&Article>) -> String {
     let link_tags: String = articles
         .iter()
         .map(|article| {
@@ -292,6 +311,10 @@ fn generate_header_ids(root_node: &NodeRef) {
     let headers_no_id = headers.filter(|node_data_ref| {
         let attrs = node_data_ref.attributes.borrow();
         !attrs.contains("id")
+            || attrs
+                .get("id")
+                .map(|val| !VALID_ATTR_CHARS_REGEX.is_match(&val))
+                .unwrap()
     });
     for header in headers_no_id {
         let mut attrs = header.attributes.borrow_mut();
@@ -410,6 +433,15 @@ fn serialize_to_xhtml<W: std::io::Write>(
     node_ref: &NodeRef,
     mut w: &mut W,
 ) -> Result<(), PaperoniError> {
+    {
+        // Add XHTML attributes
+        let html_elem = node_ref
+            .select_first("html")
+            .expect("Unable to get <html> element in article");
+        let mut html_attrs = html_elem.attributes.borrow_mut();
+        html_attrs.insert("xmlns", "http://www.w3.org/1999/xhtml".into());
+        html_attrs.insert("xmlns:epub", "http://www.idpf.org/2007/ops".into());
+    }
     let mut escape_map = HashMap::new();
     escape_map.insert("<", "&lt;");
     escape_map.insert(">", "&gt;");
@@ -430,7 +462,10 @@ fn serialize_to_xhtml<W: std::io::Write>(
             let attrs_str = attrs
                 .map
                 .iter()
-                .filter(|(k, _)| !k.local.contains("\""))
+                .filter(|(k, _)| {
+                    let attr_key: &str = &k.local;
+                    attr_key.is_ascii() && VALID_ATTR_CHARS_REGEX.is_match(attr_key)
+                })
                 .map(|(k, v)| {
                     format!(
                         "{}=\"{}\"",

src/errors.rs

@@ -156,4 +156,6 @@ pub enum CliError<BuilderError: Debug + Display> {
     OutputDirectoryNotExists,
     #[error("Unable to start logger!\n{0}")]
     LogError(#[from] LogError),
+    #[error("The --inline-toc can only be used exporting to epub")]
+    WrongExportInliningToC,
 }

src/extractor.rs

@@ -6,18 +6,18 @@ use crate::moz_readability::{MetaData, Readability};

 pub type ResourceInfo = (String, Option<String>);

-pub struct Extractor {
-    article: Option<NodeRef>,
+pub struct Article {
+    node_ref_opt: Option<NodeRef>,
     pub img_urls: Vec<ResourceInfo>,
     readability: Readability,
     pub url: String,
 }

-impl Extractor {
+impl Article {
     /// Create a new instance of an HTML extractor given an HTML string
     pub fn from_html(html_str: &str, url: &str) -> Self {
-        Extractor {
-            article: None,
+        Self {
+            node_ref_opt: None,
             img_urls: Vec::new(),
             readability: Readability::new(html_str),
             url: url.to_string(),
@@ -30,7 +30,8 @@ impl Extractor {
         self.readability.parse(&self.url)?;
         if let Some(article_node_ref) = &self.readability.article_node {
             let template = r#"
-            <html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
+            <!DOCTYPE html>
+            <html>
             <head>
             <link rel="stylesheet" href="stylesheet.css" type="text/css"></link>
             </head>
@@ -41,14 +42,14 @@ impl Extractor {
             let doc = kuchiki::parse_html().one(template);
             let body = doc.select_first("body").unwrap();
             body.as_node().append(article_node_ref.clone());
-            self.article = Some(doc);
+            self.node_ref_opt = Some(doc);
         }
         Ok(())
     }

     /// Traverses the DOM tree of the content and retrieves the IMG URLs
     pub fn extract_img_urls(&mut self) {
-        if let Some(content_ref) = &self.article {
+        if let Some(content_ref) = &self.node_ref_opt {
             self.img_urls = content_ref
                 .select("img")
                 .unwrap()
@@ -66,8 +67,8 @@ impl Extractor {
     }

     /// Returns the extracted article [NodeRef]. It should only be called *AFTER* calling parse
-    pub fn article(&self) -> &NodeRef {
-        self.article.as_ref().expect(
+    pub fn node_ref(&self) -> &NodeRef {
+        self.node_ref_opt.as_ref().expect(
             "Article node doesn't exist. This may be because the document has not been parsed",
         )
     }
@@ -111,16 +112,16 @@ mod test {

     #[test]
     fn test_extract_img_urls() {
-        let mut extractor = Extractor::from_html(TEST_HTML, "http://example.com/");
-        extractor
+        let mut article = Article::from_html(TEST_HTML, "http://example.com/");
+        article
             .extract_content()
             .expect("Article extraction failed unexpectedly");
-        extractor.extract_img_urls();
+        article.extract_img_urls();

-        assert!(extractor.img_urls.len() > 0);
+        assert!(article.img_urls.len() > 0);
         assert_eq!(
             vec![("http://example.com/img.jpg".to_string(), None)],
-            extractor.img_urls
+            article.img_urls
         );
     }
 }
391
src/html.rs
Normal file
391
src/html.rs
Normal file
|
@ -0,0 +1,391 @@
|
||||||
|
use std::{
|
||||||
|
collections::{BTreeMap, HashSet},
|
||||||
|
fs::{self, File},
|
||||||
|
path::Path,
|
||||||
|
};
|
||||||
|
|
||||||
|
use base64::encode;
|
||||||
|
use comfy_table::{Attribute, Cell, CellAlignment, Color, ContentArrangement, Table};
|
||||||
|
use html5ever::{LocalName, Namespace, QualName};
|
||||||
|
use indicatif::{ProgressBar, ProgressStyle};
|
||||||
|
use kuchiki::{traits::*, NodeRef};
|
||||||
|
use log::{debug, error, info};
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
cli::{self, AppConfig},
|
||||||
|
errors::PaperoniError,
|
||||||
|
extractor::Article,
|
||||||
|
moz_readability::MetaData,
|
||||||
|
};
|
||||||
|
|
||||||
|
const HEAD_ELEM_NOT_FOUND: &str =
|
||||||
|
"Unable to get <head> element to inline css. Ensure that the root node is the HTML document.";
|
||||||
|
const BASE_HTML_TEMPLATE: &str = r#"<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
</head>
|
||||||
|
<body></body>
|
||||||
|
</html>"#;
|
||||||
|
|
||||||
|
pub fn generate_html_exports(
|
||||||
|
articles: Vec<Article>,
|
||||||
|
app_config: &AppConfig,
|
||||||
|
successful_articles_table: &mut Table,
|
||||||
|
) -> Result<(), Vec<PaperoniError>> {
|
||||||
|
if articles.is_empty() {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
let bar = if app_config.can_disable_progress_bar {
|
||||||
|
ProgressBar::hidden()
|
||||||
|
} else {
|
||||||
|
let enabled_bar = ProgressBar::new(articles.len() as u64);
|
||||||
|
let style = ProgressStyle::default_bar().template(
|
||||||
|
"{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} html {pos}/{len:7} {msg:.green}",
|
||||||
|
);
|
||||||
|
enabled_bar.set_style(style);
|
||||||
|
if !articles.is_empty() {
|
||||||
|
enabled_bar.set_message("Generating html files");
|
||||||
|
}
|
||||||
|
enabled_bar
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut errors: Vec<PaperoniError> = Vec::new();
|
||||||
|
|
||||||
|
match app_config.merged {
|
||||||
|
Some(ref name) => {
|
||||||
|
successful_articles_table.set_header(vec![Cell::new("Table of Contents")
|
||||||
|
.add_attribute(Attribute::Bold)
|
||||||
|
.set_alignment(CellAlignment::Center)
|
||||||
|
.fg(Color::Green)]);
|
||||||
|
|
||||||
|
debug!("Creating {:?}", name);
|
||||||
|
|
||||||
|
let base_html_elem = kuchiki::parse_html().one(BASE_HTML_TEMPLATE);
|
||||||
|
let body_elem = base_html_elem.select_first("body").unwrap();
|
||||||
|
let base_path = Path::new(app_config.output_directory.as_deref().unwrap_or("."));
|
||||||
|
let img_dirs_path_name = name.trim_end_matches(".html");
|
||||||
|
let imgs_dir_path = base_path.join(img_dirs_path_name);
|
||||||
|
|
||||||
|
if !(app_config.is_inlining_images || imgs_dir_path.exists()) {
|
||||||
|
info!("Creating imgs dir in {:?} for {}", imgs_dir_path, name);
|
||||||
|
if let Err(e) = std::fs::create_dir(&imgs_dir_path) {
|
||||||
|
error!("Unable to create imgs dir for HTML file");
|
||||||
|
let err: PaperoniError = e.into();
|
||||||
|
errors.push(err);
|
||||||
|
return Err(errors);
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
for (idx, article) in articles.iter().enumerate() {
|
||||||
|
let article_elem = article
|
||||||
|
.node_ref()
|
||||||
|
.select_first("div[id=\"readability-page-1\"]")
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let title = article.metadata().title();
|
||||||
|
|
||||||
|
let mut elem_attr = article_elem.attributes.borrow_mut();
|
||||||
|
if let Some(id_attr) = elem_attr.get_mut("id") {
|
||||||
|
*id_attr = format!("readability-page-{}", idx);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (img_url, mime_type_opt) in &article.img_urls {
|
||||||
|
if app_config.is_inlining_images {
|
||||||
|
info!("Inlining images for {}", title);
|
||||||
|
let result = update_imgs_base64(
|
||||||
|
article,
|
||||||
|
img_url,
|
||||||
|
mime_type_opt.as_deref().unwrap_or("image/*"),
|
||||||
|
);
|
||||||
|
|
||||||
|
if let Err(e) = result {
|
||||||
|
let mut err: PaperoniError = e.into();
|
||||||
|
err.set_article_source(title);
|
||||||
|
error!("Unable to copy images to imgs dir for {}", title);
|
||||||
|
errors.push(err);
|
||||||
|
}
|
||||||
|
|
||||||
|
info!("Completed inlining images for {}", title);
|
||||||
|
} else {
|
||||||
|
info!("Copying images to imgs dir for {}", title);
|
||||||
|
let result = update_img_urls(article, &imgs_dir_path).map_err(|e| {
|
||||||
|
let mut err: PaperoniError = e.into();
|
||||||
|
err.set_article_source(title);
|
||||||
|
err
|
||||||
|
});
|
||||||
|
if let Err(e) = result {
|
||||||
|
error!("Unable to copy images to imgs dir for {}", title);
|
||||||
|
errors.push(e);
|
||||||
|
} else {
|
||||||
|
info!("Successfully copied images to imgs dir for {}", title);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
bar.inc(1);
|
||||||
|
successful_articles_table.add_row(vec![title]);
|
||||||
|
body_elem.as_node().append(article_elem.as_node().clone());
|
||||||
|
debug!("Added {} to the export HTML file", title);
|
||||||
|
}
|
||||||
|
|
||||||
|
insert_title_elem(&base_html_elem, name);
|
||||||
|
insert_appendix(
|
||||||
|
&base_html_elem,
|
||||||
|
articles
|
||||||
|
.iter()
|
||||||
|
.map(|article| (article.metadata(), article.url.as_str()))
|
||||||
|
.collect(),
|
||||||
|
);
|
||||||
|
inline_css(&base_html_elem, app_config);
|
||||||
|
|
||||||
|
info!("Added title, footer and inlined styles for {}", name);
|
||||||
|
|
||||||
|
info!("Creating export HTML file: {}", name);
|
||||||
|
if let Err(mut err) = File::create(name)
|
||||||
|
.and_then(|mut out_file| base_html_elem.serialize(&mut out_file))
|
||||||
|
.map_err(|e| -> PaperoniError { e.into() })
|
||||||
|
{
|
||||||
|
error!("Failed to serialize articles to file: {}", name);
|
||||||
|
err.set_article_source(&name);
|
||||||
|
errors.push(err);
|
||||||
|
bar.finish_with_message("html generation failed");
|
||||||
|
return Err(errors);
|
||||||
|
};
|
||||||
|
|
||||||
|
bar.finish_with_message("Generated html file\n");
|
||||||
|
debug!("Created {:?}", name);
|
||||||
|
println!("Created {:?}", name);
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
successful_articles_table
|
||||||
|
.set_header(vec![Cell::new("Downloaded articles")
|
||||||
|
.add_attribute(Attribute::Bold)
|
||||||
|
.set_alignment(CellAlignment::Center)
|
||||||
|
.fg(Color::Green)])
|
||||||
|
.set_content_arrangement(ContentArrangement::Dynamic);
|
||||||
|
|
||||||
|
let mut file_names: HashSet<String> = HashSet::new();
|
||||||
|
|
||||||
|
for article in &articles {
|
||||||
|
let mut file_name = format!(
|
||||||
|
"{}/{}.html",
|
||||||
|
app_config.output_directory.as_deref().unwrap_or("."),
|
||||||
|
article
|
||||||
|
.metadata()
|
||||||
|
.title()
|
||||||
|
.replace("/", " ")
|
||||||
|
.replace("\\", " ")
|
||||||
|
);
|
||||||
|
|
||||||
|
if file_names.contains(&file_name) {
|
||||||
|
info!("Article name {:?} already exists", file_name);
|
||||||
|
file_name = format!(
|
||||||
|
"{}/{}_{}.html",
|
||||||
|
app_config.output_directory.as_deref().unwrap_or("."),
|
||||||
|
article
|
||||||
|
.metadata()
|
||||||
|
.title()
|
||||||
|
.replace("/", " ")
|
||||||
|
.replace("\\", " "),
|
||||||
|
file_names.len()
|
||||||
|
);
|
||||||
|
info!("Renamed to {:?}", file_name);
|
||||||
|
}
|
||||||
|
file_names.insert(file_name.clone());
|
||||||
|
|
||||||
|
debug!("Creating {:?}", file_name);
|
||||||
|
let export_article = || -> Result<(), PaperoniError> {
|
||||||
|
let mut out_file = File::create(&file_name)?;
|
||||||
|
|
||||||
|
if app_config.is_inlining_images {
|
||||||
|
for (img_url, mime_type_opt) in &article.img_urls {
|
||||||
|
update_imgs_base64(
|
||||||
|
article,
|
||||||
|
img_url,
|
||||||
|
mime_type_opt.as_deref().unwrap_or("image/*"),
|
||||||
|
)?
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
let base_path =
|
||||||
|
Path::new(app_config.output_directory.as_deref().unwrap_or("."));
|
||||||
|
let imgs_dir_name = article.metadata().title();
|
||||||
|
|
||||||
|
if !base_path.join(imgs_dir_name).exists() {
|
||||||
|
std::fs::create_dir(base_path.join(imgs_dir_name))?;
|
||||||
|
}
|
||||||
|
|
||||||
|
let imgs_dir_path = base_path.join(imgs_dir_name);
|
||||||
|
update_img_urls(article, &imgs_dir_path)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
let utf8_encoding =
|
||||||
|
NodeRef::new_element(create_qualname("meta"), BTreeMap::new());
|
||||||
|
if let Some(elem_node) = utf8_encoding.as_element() {
|
||||||
|
let mut elem_attrs = elem_node.attributes.borrow_mut();
|
||||||
|
elem_attrs.insert("charset", "UTF-8".into());
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Ok(head_elem) = article.node_ref().select_first("head") {
|
||||||
|
let head_elem_node = head_elem.as_node();
|
||||||
|
head_elem_node.append(utf8_encoding);
|
||||||
|
};
|
||||||
|
|
||||||
|
insert_title_elem(article.node_ref(), article.metadata().title());
|
||||||
|
insert_appendix(article.node_ref(), vec![(article.metadata(), &article.url)]);
|
||||||
|
inline_css(article.node_ref(), app_config);
|
||||||
|
|
||||||
|
article.node_ref().serialize(&mut out_file)?;
|
||||||
|
Ok(())
|
||||||
|
};
|
||||||
|
|
||||||
|
if let Err(mut err) = export_article() {
|
||||||
|
err.set_article_source(&article.url);
|
||||||
|
errors.push(err);
|
||||||
|
}
|
||||||
|
debug!("Created {:?}", file_name);
|
||||||
|
|
||||||
|
bar.inc(1);
|
||||||
|
successful_articles_table.add_row(vec![article.metadata().title()]);
|
||||||
|
}
|
||||||
|
bar.finish_with_message("Generated HTML files\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if errors.is_empty() {
|
||||||
|
Ok(())
|
||||||
|
} else {
|
||||||
|
Err(errors)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn create_qualname(name: &str) -> QualName {
|
||||||
|
QualName::new(
|
||||||
|
None,
|
||||||
|
Namespace::from("http://www.w3.org/1999/xhtml"),
|
||||||
|
LocalName::from(name),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Updates the src attribute of `<img>` elements with a base64 encoded string of the image data
|
||||||
|
fn update_imgs_base64(
|
||||||
|
article: &Article,
|
||||||
|
img_url: &str,
|
||||||
|
mime_type: &str,
|
||||||
|
) -> Result<(), std::io::Error> {
|
||||||
|
let temp_dir = std::env::temp_dir();
|
||||||
|
let img_path = temp_dir.join(img_url);
|
||||||
|
let img_bytes = std::fs::read(img_path)?;
|
||||||
|
let img_base64_str = format!("data:image:{};base64,{}", mime_type, encode(img_bytes));
|
||||||
|
|
||||||
|
let img_elems = article
|
||||||
|
.node_ref()
|
||||||
|
.select(&format!("img[src=\"{}\"]", img_url))
|
||||||
|
.unwrap();
|
||||||
|
for img_elem in img_elems {
|
||||||
|
let mut img_attr = img_elem.attributes.borrow_mut();
|
||||||
|
if let Some(src_attr) = img_attr.get_mut("src") {
|
||||||
|
*src_attr = img_base64_str.clone();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Updates the src attribute of `<img>` elements to the new `imgs_dir_path` and copies the image to the new file location
|
||||||
|
fn update_img_urls(article: &Article, imgs_dir_path: &Path) -> Result<(), std::io::Error> {
|
||||||
|
let temp_dir = std::env::temp_dir();
|
||||||
|
for (img_url, _) in &article.img_urls {
|
||||||
|
let (from, to) = (temp_dir.join(img_url), imgs_dir_path.join(img_url));
|
||||||
|
info!("Copying {:?} to {:?}", from, to);
|
||||||
|
fs::copy(from, to)?;
|
||||||
|
let img_elems = article
|
||||||
|
.node_ref()
|
||||||
|
.select(&format!("img[src=\"{}\"]", img_url))
|
||||||
|
.unwrap();
|
||||||
|
for img_elem in img_elems {
|
||||||
|
let mut img_attr = img_elem.attributes.borrow_mut();
|
||||||
|
if let Some(src_attr) = img_attr.get_mut("src") {
|
||||||
|
*src_attr = imgs_dir_path.join(img_url).to_str().unwrap().into();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates a `<title>` element in an HTML document with the value set to the article's title
|
||||||
|
fn insert_title_elem(root_node: &NodeRef, title: &str) {
|
||||||
|
let title_content = NodeRef::new_text(title);
|
||||||
|
let title_elem = NodeRef::new_element(create_qualname("title"), BTreeMap::new());
|
||||||
|
title_elem.append(title_content);
|
||||||
|
match root_node.select_first("head") {
|
||||||
|
Ok(head_elem) => {
|
||||||
|
head_elem.as_node().append(title_elem);
|
||||||
|
}
|
||||||
|
Err(_) => {
|
||||||
|
debug!("{}", HEAD_ELEM_NOT_FOUND);
|
||||||
|
let html_elem = root_node.select_first("html").unwrap();
|
||||||
|
let head_elem = NodeRef::new_element(create_qualname("head"), BTreeMap::new());
|
||||||
|
head_elem.append(title_elem);
|
||||||
|
html_elem.as_node().prepend(head_elem);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
/// Creates the appendix in an HTML document where article sources are added in a `<footer>` element
fn insert_appendix(root_node: &NodeRef, article_links: Vec<(&MetaData, &str)>) {
    let link_tags: String = article_links
        .iter()
        .map(|(meta_data, url)| {
            let article_name = if !meta_data.title().is_empty() {
                meta_data.title()
            } else {
                url
            };
            format!("<a href=\"{}\">{}</a><br></br>", url, article_name)
        })
        .collect();
    let footer_inner_html = format!("<h2>Appendix</h2><h3>Article sources</h3>{}", link_tags);
    let footer_elem =
        kuchiki::parse_fragment(create_qualname("footer"), Vec::new()).one(footer_inner_html);
    root_node.append(footer_elem);
}

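A sketch of the footer this appends for two sources, one with a title and one falling back to its URL (URLs and titles illustrative):

// <footer>
//   <h2>Appendix</h2><h3>Article sources</h3>
//   <a href="https://example.com/first">First article title</a><br></br>
//   <a href="https://example.com/second">https://example.com/second</a><br></br>
// </footer>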
/// Inlines the CSS stylesheets into the HTML article node
fn inline_css(root_node: &NodeRef, app_config: &AppConfig) {
    let body_stylesheet = include_str!("./assets/body.min.css");
    let header_stylesheet = include_str!("./assets/headers.min.css");
    let mut css_str = String::new();
    match app_config.css_config {
        cli::CSSConfig::NoHeaders => {
            css_str.push_str(body_stylesheet);
        }
        cli::CSSConfig::All => {
            css_str.push_str(body_stylesheet);
            css_str.push_str(header_stylesheet);
        }
        cli::CSSConfig::None => {
            return;
        }
    }
    let css_html_str = format!("<style>{}</style>", css_str);
    let style_container =
        kuchiki::parse_fragment(create_qualname("div"), Vec::new()).one(css_html_str);
    let style_elem = style_container.select_first("style").unwrap();
    match root_node.select_first("head") {
        Ok(head_elem) => {
            head_elem.as_node().prepend(style_elem.as_node().to_owned());
        }
        Err(_) => {
            debug!("{}", HEAD_ELEM_NOT_FOUND);
            let html_elem = root_node.select_first("html").unwrap();
            let head_elem = NodeRef::new_element(create_qualname("head"), BTreeMap::new());
            head_elem.prepend(style_elem.as_node().to_owned());
            html_elem.as_node().prepend(head_elem);
        }
    }

    // Remove the <link> of the stylesheet since styles are now inlined
    if let Ok(style_link_elem) = root_node.select_first("link[href=\"stylesheet.css\"]") {
        style_link_elem.as_node().detach();
    };
}
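How the three CSS configurations behave, restated as a short sketch:

// CSSConfig::All       -> <style> holds body.min.css + headers.min.css, prepended to <head>
// CSSConfig::NoHeaders -> <style> holds body.min.css only
// CSSConfig::None      -> returns early: no <style> is inserted and the stylesheet <link> is kept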
10
src/http.rs
@@ -9,7 +9,7 @@ use url::Url;
 use crate::cli::AppConfig;
 use crate::errors::{ErrorKind, ImgError, PaperoniError};
-use crate::extractor::Extractor;
+use crate::extractor::Article;
 
 type HTMLResource = (String, String);
 
 pub fn download(
@@ -17,7 +17,7 @@ pub fn download(
     bar: &ProgressBar,
     partial_downloads: &mut Vec<PartialDownload>,
     errors: &mut Vec<PaperoniError>,
-) -> Vec<Extractor> {
+) -> Vec<Article> {
     task::block_on(async {
         let urls_iter = app_config.urls.iter().map(|url| fetch_html(url));
         let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn);
@@ -26,7 +26,7 @@ pub fn download(
             match fetch_result {
                 Ok((url, html)) => {
                     debug!("Extracting {}", &url);
-                    let mut extractor = Extractor::from_html(&html, &url);
+                    let mut extractor = Article::from_html(&html, &url);
                     bar.set_message("Extracting...");
                     match extractor.extract_content() {
                         Ok(_) => {
@@ -185,7 +185,7 @@ async fn process_img_response<'a>(
 }
 
 pub async fn download_images(
-    extractor: &mut Extractor,
+    extractor: &mut Article,
     article_origin: &Url,
     bar: &ProgressBar,
 ) -> Result<(), Vec<ImgError>> {
@@ -237,7 +237,7 @@ pub async fn download_images(
     let replace_existing_img_src = |img_item: ImgItem| -> (String, Option<String>) {
         let (img_url, img_path, img_mime) = img_item;
         let img_ref = extractor
-            .article()
+            .node_ref()
             .select_first(&format!("img[src='{}']", img_url))
             .expect("Image node does not exist");
         let mut img_node = img_ref.attributes.borrow_mut();
src/logs.rs
@@ -11,7 +11,7 @@ use crate::errors::PaperoniError;
 
 pub fn display_summary(
     initial_article_count: usize,
-    succesful_articles_table: Table,
+    successful_articles_table: Table,
     partial_downloads: Vec<PartialDownload>,
     errors: Vec<PaperoniError>,
 ) {
@@ -31,7 +31,7 @@ pub fn display_summary(
     );
 
     if successfully_downloaded_count > 0 {
-        println!("{}", succesful_articles_table);
+        println!("{}", successful_articles_table);
     }
 
     if partial_downloads_count > 0 {
32
src/main.rs
@@ -3,6 +3,7 @@ extern crate lazy_static;
 
 use std::process::exit;
 
+use colored::Colorize;
 use comfy_table::presets::{UTF8_FULL, UTF8_HORIZONTAL_BORDERS_ONLY};
 use comfy_table::{ContentArrangement, Table};
 use http::download;
@@ -12,6 +13,7 @@ mod cli;
 mod epub;
 mod errors;
 mod extractor;
+mod html;
 /// This module is responsible for async HTTP calls for downloading
 /// the HTML content and images
 mod http;
@@ -20,13 +22,14 @@ mod moz_readability;
 
 use cli::AppConfig;
 use epub::generate_epubs;
+use html::generate_html_exports;
 use logs::display_summary;
 
 fn main() {
     let app_config = match cli::AppConfig::init_with_cli() {
         Ok(app_config) => app_config,
         Err(err) => {
-            eprintln!("{}", err);
+            eprintln!("{}: {}", "ERROR".bold().bright_red(), err);
             exit(1);
         }
     };
@@ -64,22 +67,33 @@ fn run(app_config: AppConfig) {
     let articles = download(&app_config, &bar, &mut partial_downloads, &mut errors);
     bar.finish_with_message("Downloaded articles");
 
-    let mut succesful_articles_table = Table::new();
-    succesful_articles_table
+    let mut successful_articles_table = Table::new();
+    successful_articles_table
         .load_preset(UTF8_FULL)
        .load_preset(UTF8_HORIZONTAL_BORDERS_ONLY)
         .set_content_arrangement(ContentArrangement::Dynamic);
-    match generate_epubs(articles, &app_config, &mut succesful_articles_table) {
-        Ok(_) => (),
-        Err(gen_epub_errors) => {
-            errors.extend(gen_epub_errors);
-        }
-    };
+
+    match app_config.export_type {
+        cli::ExportType::EPUB => {
+            match generate_epubs(articles, &app_config, &mut successful_articles_table) {
+                Ok(_) => (),
+                Err(gen_epub_errors) => {
+                    errors.extend(gen_epub_errors);
+                }
+            };
+        }
+        cli::ExportType::HTML => {
+            match generate_html_exports(articles, &app_config, &mut successful_articles_table) {
+                Ok(_) => (),
+                Err(gen_html_errors) => errors.extend(gen_html_errors),
+            }
+        }
+    }
 
     let has_errors = !errors.is_empty() || !partial_downloads.is_empty();
     display_summary(
         app_config.urls.len(),
-        succesful_articles_table,
+        successful_articles_table,
         partial_downloads,
         errors,
     );
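The new match in run() depends on an export_type field and an ExportType enum added on the cli side, which this section does not show; a plausible sketch of their shape, assumed rather than taken from the diff (only the EPUB and HTML variant names are grounded in the match above):

// Assumed sketch of the cli-side additions this match relies on.
pub enum ExportType {
    EPUB,
    HTML,
}

pub struct AppConfig {
    // ...existing fields such as urls, max_conn and css_config...
    pub export_type: ExportType,
}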