Merge pull request #20 from hipstermojo/dev

v0.6.0 release
This commit is contained in:
Kenneth Gitere 2021-07-24 13:54:50 +03:00 committed by GitHub
commit 3958261cda
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
16 changed files with 746 additions and 148 deletions

3
.gitignore vendored
View file

@@ -1,4 +1,7 @@
/target
*.epub
# Only ignore top level html files which may be made when testing
/*.html
*.pdf
*.log
.vscode/

10
Cargo.lock generated
View file

@@ -395,6 +395,7 @@ dependencies = [
"textwrap",
"unicode-width",
"vec_map",
"yaml-rust",
]
[[package]]
@@ -1551,9 +1552,10 @@ dependencies = [
[[package]]
name = "paperoni"
version = "0.5.0-alpha1"
version = "0.6.0-alpha1"
dependencies = [
"async-std",
"base64",
"chrono",
"clap",
"colored",
@@ -2756,6 +2758,12 @@ version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "yaml-rust"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e66366e18dc58b46801afbf2ca7661a9f59cc8c5962c29892b6039b4f86fa992"
[[package]]
name = "yansi"
version = "0.5.0"

Cargo.toml
View file

@@ -3,7 +3,7 @@ description = "A web article downloader"
homepage = "https://github.com/hipstermojo/paperoni"
repository = "https://github.com/hipstermojo/paperoni"
name = "paperoni"
version = "0.5.0-alpha1"
version = "0.6.0-alpha1"
authors = ["Kenneth Gitere <gitere81@gmail.com>"]
edition = "2018"
license = "MIT"
@@ -14,8 +14,9 @@ readme = "README.md"
[dependencies]
# atty = "0.2.14"
async-std = "1.9.0"
base64 = "0.13.0"
chrono = "0.4.19"
clap = "2.33.3"
clap = { version = "2.33.3", features = ["yaml"] }
colored = "2.0.0"
comfy-table = "3.0.0"
derive_builder = "0.10.2"

README.md
View file

@@ -8,7 +8,7 @@
</a>
</div>
Paperoni is a CLI tool made in Rust for downloading web articles as EPUBs. There is provisional<sup><a href="#pdf-exports">\*</a></sup> support for exporting to PDF as well.
Paperoni is a CLI tool made in Rust for downloading web articles as EPUB or HTML files. There is provisional<sup><a href="#pdf-exports">\*</a></sup> support for exporting to PDF as well.
> This project is in an alpha release so it might crash when you use it. Please open an [issue on GitHub](https://github.com/hipstermojo/paperoni/issues/new) if it does crash.
@@ -23,7 +23,7 @@ Check the [releases](https://github.com/hipstermojo/paperoni/releases) page for
Paperoni is published on [crates.io](https://crates.io). If you have [cargo](https://github.com/rust-lang/cargo) installed, then run:
```sh
cargo install paperoni --version 0.5.0-alpha1
cargo install paperoni --version 0.6.0-alpha1
```
_Paperoni is still in alpha so the `version` flag has to be passed._
@@ -48,28 +48,43 @@ USAGE:
paperoni [OPTIONS] [urls]...
OPTIONS:
--export <type>
Specify the file type of the export. The type must be in lower case. [default: epub] [possible values:
html, epub]
-f, --file <file>
Input file containing links
-h, --help
Prints help information
--inline-images
Inlines the article images when exporting to HTML using base64.
This is used when you do not want a separate folder created for images during HTML export.
NOTE: It uses base64 encoding on the images which results in larger HTML export sizes as each image
increases in size by about 25%-33%.
--inline-toc
Add an inlined Table of Contents page at the start of the merged article.
Add an inlined Table of Contents page at the start of the merged article. This does not affect the Table of Contents navigation
--log-to-file
Enables logging of events to a file located in .paperoni/logs with a default log level of debug. Use -v to
specify the logging level
--max-conn <max_conn>
--max-conn <max-conn>
The maximum number of concurrent HTTP connections when downloading articles. Default is 8.
NOTE: It is advised to use as few connections as needed i.e. between 1 and 50. Using more connections can end
up overloading your network card with too many concurrent requests.
-o, --output-dir <output_directory>
Directory for saving epub documents
--merge <output_name>
--no-css
Removes the stylesheets used in the EPUB generation.
The EPUB file will then be laid out based on your e-reader's default stylesheets.
Images and code blocks may overflow when this flag is set and layout of generated
PDFs will be affected. Use --no-header-css if you want to only disable the styling on headers.
--no-header-css
Removes the header CSS styling but preserves styling of images and code blocks. To remove all the default
CSS, use --no-css instead.
--merge <output-name>
Merge multiple articles into a single epub that will be given the name provided
-o, --output-dir <output_directory>
Directory to store output epub documents
-V, --version
Prints version information
@@ -112,6 +127,41 @@ These can also be read from a file using the `-f/--file` flag.
paperoni -f links.txt
```
### Exporting articles
By default, Paperoni exports to EPUB files but you can change to HTML by passing the `--export html` flag.
```sh
paperoni https://en.wikipedia.org/wiki/Pepperoni --export html
```
HTML exports allow you to read the articles as plain HTML documents in your browser but can also be converted to PDF as explained [here](#html-to-pdf).
When exporting to HTML, Paperoni downloads the article's images to a folder named after the article. For the command run above, the folder structure would therefore look like this:
```
.
├── Pepperoni - Wikipedia
│ ├── 1a9f886e9b58db72e0003a2cd52681d8.png
│ ├── 216f8a4265a1ceb3f8cfba4c2f9057b1.jpeg
│ ...
└── Pepperoni - Wikipedia.html
```
If you would instead prefer to have the images inlined directly into the HTML export, pass the `--inline-images` flag:
```sh
paperoni https://en.wikipedia.org/wiki/Pepperoni --export html --inline-images
```
This is especially useful when exporting multiple links.
**NOTE**: The inlining of images for HTML exports uses base64 encoding which is known to increase the overall size of images by about 25% to 33%.
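The overhead comes from base64 mapping every 3 bytes of image data to 4 ASCII characters. As a quick sanity check, you can compare raw and encoded sizes yourself — a sketch assuming the coreutils `base64` tool and a placeholder `image.png`:

```sh
# Raw size in bytes
wc -c < image.png

# Encoded size, roughly 4/3 of the raw size
base64 image.png | wc -c
```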
### Disabling CSS
The `no-css` and `no-header-css` flags can be used to remove the default styling added by Paperoni. Refer to `--help` to see the usage of the flags.
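For example, to keep the image and code block styling while dropping the header styling, or to strip the default styling entirely (the two flags conflict, so use one at a time):

```sh
# Preserve body styling but remove header styling
paperoni https://en.wikipedia.org/wiki/Pepperoni --no-header-css

# Remove all default styling and defer to the e-reader's stylesheets
paperoni https://en.wikipedia.org/wiki/Pepperoni --no-css
```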
### Merging articles
By default, Paperoni generates an epub file for each link. You can also merge multiple links
@@ -153,7 +203,11 @@ There are also web pages it won't work on in general such as Twitter and Reddit
## PDF exports
As of version 0.5-alpha1, you can now export to PDF using a third party tool. This requires that you install [Calibre](https://calibre-ebook.com/) which comes with a ebook conversion. You can convert the epub to a pdf through the terminal with `ebook-convert`:
PDF conversion can be done using a third-party tool. There are two options:
### EPUB to PDF
This requires that you install [Calibre](https://calibre-ebook.com/), which comes with an ebook conversion tool. You can convert the epub to a pdf in the terminal with `ebook-convert`:
```sh
# Assuming the downloaded epub was called foo.epub
@@ -161,3 +215,25 @@ ebook-convert foo.epub foo.pdf
```
Alternatively, you can use the Calibre GUI to do the file conversion.
### HTML to PDF
The recommended approach is to use [WeasyPrint](https://weasyprint.org/start/), a free and open-source tool that converts HTML documents to PDF. It is available on Linux, macOS and Windows. Using the CLI, the conversion can be done as follows:
```sh
paperoni https://en.wikipedia.org/wiki/Pepperoni --export html
weasyprint "Pepperoni - Wikipedia.html" Pepperoni.pdf
```
Inlining images is not mandatory, as WeasyPrint can find the image files on its own.
### Comparison of PDF conversion methods
Either of the conversion methods is sufficient for most use cases. The main differences are listed below:
| | EPUB to PDF | HTML to PDF |
|----------------------|----------------------------|------------------|
| Wrapping code blocks | Yes | No |
| CSS customization | No | Yes |
| Generated file size | Slightly larger | Slightly smaller |
The difference in file size is due to the additional fonts added to the PDF file by `ebook-convert`.

7
src/assets/body.min.css vendored Normal file
View file

@@ -0,0 +1,7 @@
/*!
* Writ v1.0.4
*
* Copyright © 2015, Curtis McEnroe <curtis@cmcenroe.me>
*
* https://cmcenroe.me/writ/LICENSE (ISC)
*/dd,hr,ol ol,ol ul,ul ol,ul ul{margin:0}pre,table{overflow-x:auto}a,ins{text-decoration:none}html{font-family:Georgia,Lucida Bright,Book Antiqua,serif;font-size:16px;line-height:1.5rem}code,kbd,pre,samp{font-family:Fira Code,Liberation Mono,Menlo,Courier,monospace;font-size:.833rem;color:#111}kbd{font-weight:700}small{font-size:.833em}th{font-weight:400}blockquote,dl,ol,p,pre,table,ul{margin:1.5rem 0 0}pre,table{margin-bottom:-1px}hr{border:none;padding:1.5rem 0 0}table{line-height:calc(1.5rem - 1px);width:100%;border-collapse:collapse}pre{margin-top:calc(1.5rem - 1px)}body{color:#222;margin:1.5rem 1ch}a,a code,header nav a:visited{color:#00e}a:visited,a:visited code{color:#60b}mark{color:inherit;background-color:#fe0}code,pre,samp,tfoot,thead{background-color:rgba(0,0,0,.05)}blockquote,ins,main aside{border:rgba(0,0,0,.05) solid}blockquote,main aside{border-width:0 0 0 .5ch}code,pre,samp{border:rgba(0,0,0,.1) solid}td,th{border:solid #dbdbdb}body>header{text-align:center}body>footer,main{display:block;max-width:78ch;margin:auto}main aside,main figure{float:right;margin:1.5rem 0 0 1ch}main aside{max-width:26ch;padding:0 0 0 .5ch}blockquote{margin-right:3ch;margin-left:1.5ch;padding:0 0 0 1ch}pre{border-width:1px;border-radius:2px;padding:0 .5ch}pre code{border:none;padding:0;background-color:transparent;white-space:inherit}code,ins,samp,td,th{border-width:1px}img{max-width:100%}dd,ol,ul{padding:0 0 0 3ch}ul>li{list-style-type:disc}li ul>li{list-style-type:circle}li li ul>li{list-style-type:square}ol>li{list-style-type:decimal}li ol>li{list-style-type:lower-roman}li li ol>li{list-style-type:lower-alpha}nav ul{padding:0;list-style-type:none}nav ul li{display:inline;padding-left:1ch;white-space:nowrap}nav ul li:first-child{padding-left:0}ins,mark{padding:1px}td,th{padding:0 .5ch}sub,sup{font-size:.75em;line-height:1em}code,samp{border-radius:2px;padding:.1em .2em;white-space:nowrap}

7
src/assets/headers.min.css vendored Normal file
View file

@@ -0,0 +1,7 @@
/*!
* Writ v1.0.4
*
* Copyright © 2015, Curtis McEnroe <curtis@cmcenroe.me>
*
* https://cmcenroe.me/writ/LICENSE (ISC)
*/h1,h2,h3,h4,h5,h6,th{font-weight:400}h1{font-size:2.488em}h2{font-size:2.074em}h3{font-size:1.728em}h4{font-size:1.44em}h5{font-size:1.2em}h6{font-size:1em}h1,h2,h3{line-height:3rem}h1,h2,h3,h4,h5,h6{margin:1.5rem 0 0}

src/assets/writ.min.css vendored
View file

@@ -1,7 +0,0 @@
/*!
* Writ v1.0.4
*
* Copyright © 2015, Curtis McEnroe <curtis@cmcenroe.me>
*
* https://cmcenroe.me/writ/LICENSE (ISC)
*/dd,hr,ol ol,ol ul,ul ol,ul ul{margin:0}pre,table{overflow-x:auto}a,ins{text-decoration:none}html{font-family:Georgia,Lucida Bright,Book Antiqua,serif;font-size:16px;line-height:1.5rem}code,kbd,pre,samp{font-family:Fira Code,Liberation Mono,Menlo,Courier,monospace;font-size:.833rem;color:#111}kbd{font-weight:700}h1,h2,h3,h4,h5,h6,th{font-weight:400}h1{font-size:2.488em}h2{font-size:2.074em}h3{font-size:1.728em}h4{font-size:1.44em}h5{font-size:1.2em}h6{font-size:1em}small{font-size:.833em}h1,h2,h3{line-height:3rem}blockquote,dl,h1,h2,h3,h4,h5,h6,ol,p,pre,table,ul{margin:1.5rem 0 0}pre,table{margin-bottom:-1px}hr{border:none;padding:1.5rem 0 0}table{line-height:calc(1.5rem - 1px);width:100%;border-collapse:collapse}pre{margin-top:calc(1.5rem - 1px)}body{color:#222;margin:1.5rem 1ch}a,a code,header nav a:visited{color:#00e}a:visited,a:visited code{color:#60b}mark{color:inherit;background-color:#fe0}code,pre,samp,tfoot,thead{background-color:rgba(0,0,0,.05)}blockquote,ins,main aside{border:rgba(0,0,0,.05) solid}blockquote,main aside{border-width:0 0 0 .5ch}code,pre,samp{border:rgba(0,0,0,.1) solid}td,th{border:solid #dbdbdb}body>header{text-align:center}body>footer,main{display:block;max-width:78ch;margin:auto}main aside,main figure{float:right;margin:1.5rem 0 0 1ch}main aside{max-width:26ch;padding:0 0 0 .5ch}blockquote{margin-right:3ch;margin-left:1.5ch;padding:0 0 0 1ch}pre{border-width:1px;border-radius:2px;padding:0 .5ch}pre code{border:none;padding:0;background-color:transparent;white-space:inherit}code,ins,samp,td,th{border-width:1px}img{max-width:100%}dd,ol,ul{padding:0 0 0 3ch}ul>li{list-style-type:disc}li ul>li{list-style-type:circle}li li ul>li{list-style-type:square}ol>li{list-style-type:decimal}li ol>li{list-style-type:lower-roman}li li ol>li{list-style-type:lower-alpha}nav ul{padding:0;list-style-type:none}nav ul li{display:inline;padding-left:1ch;white-space:nowrap}nav ul li:first-child{padding-left:0}ins,mark{padding:1px}td,th{padding:0 .5ch}sub,sup{font-size:.75em;line-height:1em}code,samp{border-radius:2px;padding:.1em .2em;white-space:nowrap}

src/cli.rs
View file

@@ -1,7 +1,7 @@
use std::{fs, num::NonZeroUsize, path::Path};
use chrono::{DateTime, Local};
use clap::{App, AppSettings, Arg, ArgMatches};
use clap::{load_yaml, App, ArgMatches};
use flexi_logger::LevelFilter as LogLevel;
use itertools::Itertools;
@@ -11,10 +11,10 @@ const DEFAULT_MAX_CONN: usize = 8;
#[derive(derive_builder::Builder)]
pub struct AppConfig {
/// Urls for store in epub
/// Article urls
pub urls: Vec<String>,
pub max_conn: usize,
/// Path to file of multiple articles into a single epub
/// Output file name used when merging multiple articles into a single file
pub merged: Option<String>,
pub output_directory: Option<String>,
pub log_level: LogLevel,
@@ -22,80 +22,15 @@ pub struct AppConfig {
pub start_time: DateTime<Local>,
pub is_logging_to_file: bool,
pub inline_toc: bool,
pub css_config: CSSConfig,
pub export_type: ExportType,
pub is_inlining_images: bool,
}
impl AppConfig {
pub fn init_with_cli() -> Result<AppConfig, Error> {
let app = App::new("paperoni")
.settings(&[
AppSettings::ArgRequiredElseHelp,
AppSettings::UnifiedHelpMessage,
])
.version(clap::crate_version!())
.about(
"Paperoni is a CLI tool made in Rust for downloading web articles as EPUBs",
)
.arg(
Arg::with_name("urls")
.help("Urls of web articles")
.multiple(true),
)
.arg(
Arg::with_name("file")
.short("f")
.long("file")
.help("Input file containing links")
.takes_value(true),
)
.arg(
Arg::with_name("output_directory")
.long("output-dir")
.short("o")
.help("Directory to store output epub documents")
.conflicts_with("output_name")
.takes_value(true),
)
.arg(
Arg::with_name("output_name")
.long("merge")
.help("Merge multiple articles into a single epub")
.long_help("Merge multiple articles into a single epub that will be given the name provided")
.conflicts_with("output_directory")
.takes_value(true),
).arg(
Arg::with_name("max-conn")
.long("max_conn")
.help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8")
.long_help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8.\nNOTE: It is advised to use as few connections as needed i.e between 1 and 50. Using more connections can end up overloading your network card with too many concurrent requests.")
.takes_value(true))
.arg(
Arg::with_name("verbosity")
.short("v")
.multiple(true)
.help("Enables logging of events and set the verbosity level. Use --help to read on its usage")
.long_help(
"This takes upto 4 levels of verbosity in the following order.
- Error (-v)
- Warn (-vv)
- Info (-vvv)
- Debug (-vvvv)
When this flag is passed, it disables the progress bars and logs to stderr.
If you would like to send the logs to a file (and enable progress bars), pass the log-to-file flag."
)
.takes_value(false))
.arg(
Arg::with_name("log-to-file")
.long("log-to-file")
.help("Enables logging of events to a file located in .paperoni/logs with a default log level of debug. Use -v to specify the logging level")
.takes_value(false))
.arg(
Arg::with_name("inline-toc")
.long("inline-toc")
.requires("output_name")
.help("Add an inlined Table of Contents page at the start of the merged article.")
.long_help("Add an inlined Table of Contents page at the start of the merged article. This does not affect the Table of Contents navigation")
);
let yaml_config = load_yaml!("cli_config.yml");
let app = App::from_yaml(yaml_config).version(clap::crate_version!());
Self::try_from(app.get_matches())
}
@@ -159,11 +94,12 @@ impl<'a> TryFrom<ArgMatches<'a>> for AppConfig {
Some(max_conn) => max_conn.parse::<NonZeroUsize>()?.get(),
None => DEFAULT_MAX_CONN,
})
.merged(arg_matches.value_of("output_name").map(|name| {
if name.ends_with(".epub") {
.merged(arg_matches.value_of("output-name").map(|name| {
let file_ext = format!(".{}", arg_matches.value_of("export").unwrap());
if name.ends_with(&file_ext) {
name.to_owned()
} else {
name.to_string() + ".epub"
name.to_string() + &file_ext
}
}))
.can_disable_progress_bar(
@@ -183,7 +119,17 @@ impl<'a> TryFrom<ArgMatches<'a>> for AppConfig {
4..=u64::MAX => LogLevel::Debug,
})
.is_logging_to_file(arg_matches.is_present("log-to-file"))
.inline_toc(arg_matches.is_present("inline-toc"))
.inline_toc(
(if arg_matches.is_present("inline-toc") {
if arg_matches.value_of("export") == Some("epub") {
Ok(true)
} else {
Err(Error::WrongExportInliningToC)
}
} else {
Ok(false)
})?,
)
.output_directory(
arg_matches
.value_of("output_directory")
@@ -200,6 +146,25 @@ impl<'a> TryFrom<ArgMatches<'a>> for AppConfig {
.transpose()?,
)
.start_time(Local::now())
.css_config(
match (
arg_matches.is_present("no-css"),
arg_matches.is_present("no-header-css"),
) {
(true, _) => CSSConfig::None,
(_, true) => CSSConfig::NoHeaders,
_ => CSSConfig::All,
},
)
.export_type({
let export_type = arg_matches.value_of("export").unwrap();
if export_type == "html" {
ExportType::HTML
} else {
ExportType::EPUB
}
})
.is_inlining_images(arg_matches.is_present("inline-images"))
.try_init()
}
}
@@ -212,3 +177,16 @@ impl AppConfigBuilder {
.init_merge_file()
}
}
#[derive(Clone, Debug)]
pub enum CSSConfig {
All,
NoHeaders,
None,
}
#[derive(Clone, Debug)]
pub enum ExportType {
HTML,
EPUB,
}

82
src/cli_config.yml Normal file
View file

@@ -0,0 +1,82 @@
name: paperoni
about: Paperoni is a CLI tool made in Rust for downloading web articles as EPUBs
settings:
- ArgRequiredElseHelp
- UnifiedHelpMessage
args:
- urls:
help: Urls of web articles
multiple: true
- file:
short: f
long: file
help: Input file containing links
takes_value: true
- output_directory:
short: o
long: output-dir
help: Directory to store output epub documents
conflicts_with: output-name
takes_value: true
- output-name:
long: merge
help: Merge multiple articles into a single epub
long_help: Merge multiple articles into a single epub that will be given the name provided
conflicts_with: output_directory
takes_value: true
- max-conn:
long: max-conn
help: The maximum number of concurrent HTTP connections when downloading articles. Default is 8
long_help: "The maximum number of concurrent HTTP connections when downloading articles. Default is 8.\nNOTE: It is advised to use as few connections as needed i.e between 1 and 50. Using more connections can end up overloading your network card with too many concurrent requests."
takes_value: true
- verbosity:
short: v
multiple: true
help: Enables logging of events and sets the verbosity level. Use --help to read about its usage
long_help: "This takes up to 4 levels of verbosity in the following order.
\n- Error (-v)
\n- Warn (-vv)
\n- Info (-vvv)
\n- Debug (-vvvv)
\nWhen this flag is passed, it disables the progress bars and logs to stderr.
\nIf you would like to send the logs to a file (and enable progress bars), pass the log-to-file flag."
takes_value: false
- log-to-file:
long: log-to-file
help: Enables logging of events to a file located in .paperoni/logs with a default log level of debug. Use -v to specify the logging level
takes_value: false
- inline-toc:
long: inline-toc
requires: output-name
help: Add an inlined Table of Contents page at the start of the merged article.
long_help: Add an inlined Table of Contents page at the start of the merged article. This does not affect the Table of Contents navigation
- no-css:
long: no-css
conflicts_with: no-header-css
help: Removes the stylesheets used in the EPUB generation. Pass --help to learn more
long_help: "Removes the stylesheets used in the EPUB generation.
\nThe EPUB file will then be laid out based on your e-reader's default stylesheets.
\nImages and code blocks may overflow when this flag is set and layout of generated
\nPDFs will be affected. Use --no-header-css if you want to only disable the styling on headers."
takes_value: false
- no-header-css:
long: no-header-css
conflicts_with: no-css
help: Removes the header CSS styling but preserves styling of images and code blocks. To remove all the default CSS, use --no-css instead.
takes_value: false
- export:
long: export
help: Specify the file type of the export. The type must be in lower case.
possible_values: [html, epub]
value_name: type
takes_value: true
default_value: epub
- inline-images:
long: inline-images
help: Inlines the article images when exporting to HTML using base64. Pass --help to learn more.
long_help: "Inlines the article images when exporting to HTML using base64.
\nThis is used when you do not want a separate folder created for images during HTML export.
\nNOTE: It uses base64 encoding on the images which results in larger HTML export sizes as each image
increases in size by about 25%-33%."
takes_value: false
requires: export
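For reference, this file is consumed at compile time in `src/cli.rs` through clap's `yaml` feature (enabled in `Cargo.toml` above). A minimal sketch of the pattern — not Paperoni's actual code:

```rust
use clap::{load_yaml, App};

fn main() {
    // load_yaml! embeds cli_config.yml at compile time (clap 2.x, "yaml" feature).
    let yaml = load_yaml!("cli_config.yml");
    let matches = App::from_yaml(yaml)
        .version(clap::crate_version!())
        .get_matches();

    // Arguments are then read back by the names defined in the YAML.
    if let Some(urls) = matches.values_of("urls") {
        for url in urls {
            println!("queued: {}", url);
        }
    }
}
```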

src/epub.rs
View file

@@ -8,14 +8,15 @@ use indicatif::{ProgressBar, ProgressStyle};
use kuchiki::NodeRef;
use log::{debug, error, info};
use crate::{cli::AppConfig, errors::PaperoniError, extractor::Extractor};
use crate::{cli::AppConfig, errors::PaperoniError, extractor::Article};
lazy_static! {
static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r#"(&|<|>|'|")"#).unwrap();
static ref VALID_ATTR_CHARS_REGEX: regex::Regex = regex::Regex::new(r#"[a-z0-9\-_:]"#).unwrap();
}
pub fn generate_epubs(
articles: Vec<Extractor>,
articles: Vec<Article>,
app_config: &AppConfig,
successful_articles_table: &mut Table,
) -> Result<(), Vec<PaperoniError>> {
@@ -37,8 +38,6 @@ pub fn generate_epubs(
enabled_bar
};
let stylesheet = include_bytes!("./assets/writ.min.css");
let mut errors: Vec<PaperoniError> = Vec::new();
match app_config.merged {
@@ -71,7 +70,7 @@
epub.inline_toc();
}
match epub.stylesheet(stylesheet.as_bytes()) {
match add_stylesheets(&mut epub, app_config) {
Ok(_) => (),
Err(e) => {
error!("Unable to add stylesheets to epub file");
@@ -89,9 +88,9 @@
let content_url = format!("article_{}.xhtml", idx);
let mut xhtml_buf = Vec::new();
let header_level_tocs =
get_header_level_toc_vec(&content_url, article.article());
get_header_level_toc_vec(&content_url, article.node_ref());
serialize_to_xhtml(article.article(), &mut xhtml_buf)?;
serialize_to_xhtml(article.node_ref(), &mut xhtml_buf)?;
let xhtml_str = std::str::from_utf8(&xhtml_buf)?;
let section_name = article.metadata().title();
let mut content = EpubContent::new(&content_url, xhtml_str.as_bytes())
@@ -146,6 +145,8 @@ pub fn generate_epubs(
let mut paperoni_err: PaperoniError = err.into();
paperoni_err.set_article_source(&name);
errors.push(paperoni_err);
error!("Failed to generate epub: {}", name);
bar.finish_with_message("epub generation failed\n");
return Err(errors);
}
}
@@ -178,8 +179,8 @@ pub fn generate_epubs(
let mut out_file = File::create(&file_name).unwrap();
let mut xhtml_buf = Vec::new();
let header_level_tocs =
get_header_level_toc_vec("index.xhtml", article.article());
serialize_to_xhtml(article.article(), &mut xhtml_buf)
get_header_level_toc_vec("index.xhtml", article.node_ref());
serialize_to_xhtml(article.node_ref(), &mut xhtml_buf)
.expect("Unable to serialize to xhtml");
let xhtml_str = std::str::from_utf8(&xhtml_buf).unwrap();
@@ -187,8 +188,7 @@
epub.metadata("author", replace_escaped_characters(author))?;
}
epub.stylesheet(stylesheet.as_bytes())?;
add_stylesheets(&mut epub, app_config)?;
let title = replace_escaped_characters(article.metadata().title());
epub.metadata("title", &title)?;
@@ -205,7 +205,7 @@
let mut file_path = std::env::temp_dir();
file_path.push(&img.0);
let img_buf = File::open(&file_path).expect("Can't read file");
let img_buf = File::open(&file_path).expect("Can't read image file");
epub.add_resource(
file_path.file_name().unwrap(),
img_buf,
@@ -249,8 +249,27 @@ fn replace_escaped_characters(value: &str) -> String {
.replace(">", "&gt;")
}
fn add_stylesheets<T: epub_builder::Zip>(
epub: &mut EpubBuilder<T>,
app_config: &AppConfig,
) -> Result<(), epub_builder::Error> {
let body_stylesheet: &[u8] = include_bytes!("./assets/body.min.css");
let header_stylesheet: &[u8] = include_bytes!("./assets/headers.min.css");
match app_config.css_config {
crate::cli::CSSConfig::All => {
epub.stylesheet([header_stylesheet, body_stylesheet].concat().as_bytes())?;
Ok(())
}
crate::cli::CSSConfig::NoHeaders => {
epub.stylesheet(body_stylesheet.as_bytes())?;
Ok(())
}
_ => Ok(()),
}
}
//TODO: The type signature of the argument should change as it requires that merged articles create an entirely new Vec of references
fn generate_appendix(articles: Vec<&Extractor>) -> String {
fn generate_appendix(articles: Vec<&Article>) -> String {
let link_tags: String = articles
.iter()
.map(|article| {
@@ -292,6 +311,10 @@ fn generate_header_ids(root_node: &NodeRef) {
let headers_no_id = headers.filter(|node_data_ref| {
let attrs = node_data_ref.attributes.borrow();
!attrs.contains("id")
|| attrs
.get("id")
.map(|val| !VALID_ATTR_CHARS_REGEX.is_match(&val))
.unwrap()
});
for header in headers_no_id {
let mut attrs = header.attributes.borrow_mut();
@@ -410,6 +433,15 @@ fn serialize_to_xhtml<W: std::io::Write>(
node_ref: &NodeRef,
mut w: &mut W,
) -> Result<(), PaperoniError> {
{
// Add XHTML attributes
let html_elem = node_ref
.select_first("html")
.expect("Unable to get <html> element in article");
let mut html_attrs = html_elem.attributes.borrow_mut();
html_attrs.insert("xmlns", "http://www.w3.org/1999/xhtml".into());
html_attrs.insert("xmlns:epub", "http://www.idpf.org/2007/ops".into());
}
let mut escape_map = HashMap::new();
escape_map.insert("<", "&lt;");
escape_map.insert(">", "&gt;");
@@ -430,7 +462,10 @@ fn serialize_to_xhtml<W: std::io::Write>(
let attrs_str = attrs
.map
.iter()
.filter(|(k, _)| !k.local.contains("\""))
.filter(|(k, _)| {
let attr_key: &str = &k.local;
attr_key.is_ascii() && VALID_ATTR_CHARS_REGEX.is_match(attr_key)
})
.map(|(k, v)| {
format!(
"{}=\"{}\"",

src/errors.rs
View file

@@ -156,4 +156,6 @@ pub enum CliError<BuilderError: Debug + Display> {
OutputDirectoryNotExists,
#[error("Unable to start logger!\n{0}")]
LogError(#[from] LogError),
#[error("The --inline-toc can only be used exporting to epub")]
WrongExportInliningToC,
}

src/extractor.rs
View file

@@ -6,18 +6,18 @@ use crate::moz_readability::{MetaData, Readability};
pub type ResourceInfo = (String, Option<String>);
pub struct Extractor {
article: Option<NodeRef>,
pub struct Article {
node_ref_opt: Option<NodeRef>,
pub img_urls: Vec<ResourceInfo>,
readability: Readability,
pub url: String,
}
impl Extractor {
impl Article {
/// Create a new instance of an HTML extractor given an HTML string
pub fn from_html(html_str: &str, url: &str) -> Self {
Extractor {
article: None,
Self {
node_ref_opt: None,
img_urls: Vec::new(),
readability: Readability::new(html_str),
url: url.to_string(),
@@ -30,7 +30,8 @@ impl Extractor {
self.readability.parse(&self.url)?;
if let Some(article_node_ref) = &self.readability.article_node {
let template = r#"
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
<!DOCTYPE html>
<html>
<head>
<link rel="stylesheet" href="stylesheet.css" type="text/css"></link>
</head>
@@ -41,14 +42,14 @@ impl Extractor {
let doc = kuchiki::parse_html().one(template);
let body = doc.select_first("body").unwrap();
body.as_node().append(article_node_ref.clone());
self.article = Some(doc);
self.node_ref_opt = Some(doc);
}
Ok(())
}
/// Traverses the DOM tree of the content and retrieves the IMG URLs
pub fn extract_img_urls(&mut self) {
if let Some(content_ref) = &self.article {
if let Some(content_ref) = &self.node_ref_opt {
self.img_urls = content_ref
.select("img")
.unwrap()
@@ -66,8 +67,8 @@ impl Extractor {
}
/// Returns the extracted article [NodeRef]. It should only be called *AFTER* calling parse
pub fn article(&self) -> &NodeRef {
self.article.as_ref().expect(
pub fn node_ref(&self) -> &NodeRef {
self.node_ref_opt.as_ref().expect(
"Article node doesn't exist. This may be because the document has not been parsed",
)
}
@@ -111,16 +112,16 @@ mod test {
#[test]
fn test_extract_img_urls() {
let mut extractor = Extractor::from_html(TEST_HTML, "http://example.com/");
extractor
let mut article = Article::from_html(TEST_HTML, "http://example.com/");
article
.extract_content()
.expect("Article extraction failed unexpectedly");
extractor.extract_img_urls();
article.extract_img_urls();
assert!(extractor.img_urls.len() > 0);
assert!(article.img_urls.len() > 0);
assert_eq!(
vec![("http://example.com/img.jpg".to_string(), None)],
extractor.img_urls
article.img_urls
);
}
}

391
src/html.rs Normal file
View file

@@ -0,0 +1,391 @@
use std::{
collections::{BTreeMap, HashSet},
fs::{self, File},
path::Path,
};
use base64::encode;
use comfy_table::{Attribute, Cell, CellAlignment, Color, ContentArrangement, Table};
use html5ever::{LocalName, Namespace, QualName};
use indicatif::{ProgressBar, ProgressStyle};
use kuchiki::{traits::*, NodeRef};
use log::{debug, error, info};
use crate::{
cli::{self, AppConfig},
errors::PaperoniError,
extractor::Article,
moz_readability::MetaData,
};
const HEAD_ELEM_NOT_FOUND: &str =
"Unable to get <head> element to inline css. Ensure that the root node is the HTML document.";
const BASE_HTML_TEMPLATE: &str = r#"<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
</head>
<body></body>
</html>"#;
pub fn generate_html_exports(
articles: Vec<Article>,
app_config: &AppConfig,
successful_articles_table: &mut Table,
) -> Result<(), Vec<PaperoniError>> {
if articles.is_empty() {
return Ok(());
}
let bar = if app_config.can_disable_progress_bar {
ProgressBar::hidden()
} else {
let enabled_bar = ProgressBar::new(articles.len() as u64);
let style = ProgressStyle::default_bar().template(
"{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} html {pos}/{len:7} {msg:.green}",
);
enabled_bar.set_style(style);
if !articles.is_empty() {
enabled_bar.set_message("Generating html files");
}
enabled_bar
};
let mut errors: Vec<PaperoniError> = Vec::new();
match app_config.merged {
Some(ref name) => {
successful_articles_table.set_header(vec![Cell::new("Table of Contents")
.add_attribute(Attribute::Bold)
.set_alignment(CellAlignment::Center)
.fg(Color::Green)]);
debug!("Creating {:?}", name);
let base_html_elem = kuchiki::parse_html().one(BASE_HTML_TEMPLATE);
let body_elem = base_html_elem.select_first("body").unwrap();
let base_path = Path::new(app_config.output_directory.as_deref().unwrap_or("."));
let img_dirs_path_name = name.trim_end_matches(".html");
let imgs_dir_path = base_path.join(img_dirs_path_name);
if !(app_config.is_inlining_images || imgs_dir_path.exists()) {
info!("Creating imgs dir in {:?} for {}", imgs_dir_path, name);
if let Err(e) = std::fs::create_dir(&imgs_dir_path) {
error!("Unable to create imgs dir for HTML file");
let err: PaperoniError = e.into();
errors.push(err);
return Err(errors);
};
}
for (idx, article) in articles.iter().enumerate() {
let article_elem = article
.node_ref()
.select_first("div[id=\"readability-page-1\"]")
.unwrap();
let title = article.metadata().title();
let mut elem_attr = article_elem.attributes.borrow_mut();
if let Some(id_attr) = elem_attr.get_mut("id") {
*id_attr = format!("readability-page-{}", idx);
}
for (img_url, mime_type_opt) in &article.img_urls {
if app_config.is_inlining_images {
info!("Inlining images for {}", title);
let result = update_imgs_base64(
article,
img_url,
mime_type_opt.as_deref().unwrap_or("image/*"),
);
if let Err(e) = result {
let mut err: PaperoniError = e.into();
err.set_article_source(title);
error!("Unable to copy images to imgs dir for {}", title);
errors.push(err);
}
info!("Completed inlining images for {}", title);
} else {
info!("Copying images to imgs dir for {}", title);
let result = update_img_urls(article, &imgs_dir_path).map_err(|e| {
let mut err: PaperoniError = e.into();
err.set_article_source(title);
err
});
if let Err(e) = result {
error!("Unable to copy images to imgs dir for {}", title);
errors.push(e);
} else {
info!("Successfully copied images to imgs dir for {}", title);
}
}
}
bar.inc(1);
successful_articles_table.add_row(vec![title]);
body_elem.as_node().append(article_elem.as_node().clone());
debug!("Added {} to the export HTML file", title);
}
insert_title_elem(&base_html_elem, name);
insert_appendix(
&base_html_elem,
articles
.iter()
.map(|article| (article.metadata(), article.url.as_str()))
.collect(),
);
inline_css(&base_html_elem, app_config);
info!("Added title, footer and inlined styles for {}", name);
info!("Creating export HTML file: {}", name);
if let Err(mut err) = File::create(name)
.and_then(|mut out_file| base_html_elem.serialize(&mut out_file))
.map_err(|e| -> PaperoniError { e.into() })
{
error!("Failed to serialize articles to file: {}", name);
err.set_article_source(&name);
errors.push(err);
bar.finish_with_message("html generation failed");
return Err(errors);
};
bar.finish_with_message("Generated html file\n");
debug!("Created {:?}", name);
println!("Created {:?}", name);
}
None => {
successful_articles_table
.set_header(vec![Cell::new("Downloaded articles")
.add_attribute(Attribute::Bold)
.set_alignment(CellAlignment::Center)
.fg(Color::Green)])
.set_content_arrangement(ContentArrangement::Dynamic);
let mut file_names: HashSet<String> = HashSet::new();
for article in &articles {
let mut file_name = format!(
"{}/{}.html",
app_config.output_directory.as_deref().unwrap_or("."),
article
.metadata()
.title()
.replace("/", " ")
.replace("\\", " ")
);
if file_names.contains(&file_name) {
info!("Article name {:?} already exists", file_name);
file_name = format!(
"{}/{}_{}.html",
app_config.output_directory.as_deref().unwrap_or("."),
article
.metadata()
.title()
.replace("/", " ")
.replace("\\", " "),
file_names.len()
);
info!("Renamed to {:?}", file_name);
}
file_names.insert(file_name.clone());
debug!("Creating {:?}", file_name);
let export_article = || -> Result<(), PaperoniError> {
let mut out_file = File::create(&file_name)?;
if app_config.is_inlining_images {
for (img_url, mime_type_opt) in &article.img_urls {
update_imgs_base64(
article,
img_url,
mime_type_opt.as_deref().unwrap_or("image/*"),
)?
}
} else {
let base_path =
Path::new(app_config.output_directory.as_deref().unwrap_or("."));
let imgs_dir_name = article.metadata().title();
if !base_path.join(imgs_dir_name).exists() {
std::fs::create_dir(base_path.join(imgs_dir_name))?;
}
let imgs_dir_path = base_path.join(imgs_dir_name);
update_img_urls(article, &imgs_dir_path)?;
}
let utf8_encoding =
NodeRef::new_element(create_qualname("meta"), BTreeMap::new());
if let Some(elem_node) = utf8_encoding.as_element() {
let mut elem_attrs = elem_node.attributes.borrow_mut();
elem_attrs.insert("charset", "UTF-8".into());
}
if let Ok(head_elem) = article.node_ref().select_first("head") {
let head_elem_node = head_elem.as_node();
head_elem_node.append(utf8_encoding);
};
insert_title_elem(article.node_ref(), article.metadata().title());
insert_appendix(article.node_ref(), vec![(article.metadata(), &article.url)]);
inline_css(article.node_ref(), app_config);
article.node_ref().serialize(&mut out_file)?;
Ok(())
};
if let Err(mut err) = export_article() {
err.set_article_source(&article.url);
errors.push(err);
}
debug!("Created {:?}", file_name);
bar.inc(1);
successful_articles_table.add_row(vec![article.metadata().title()]);
}
bar.finish_with_message("Generated HTML files\n");
}
}
if errors.is_empty() {
Ok(())
} else {
Err(errors)
}
}
fn create_qualname(name: &str) -> QualName {
QualName::new(
None,
Namespace::from("http://www.w3.org/1999/xhtml"),
LocalName::from(name),
)
}
/// Updates the src attribute of `<img>` elements with a base64 encoded string of the image data
fn update_imgs_base64(
article: &Article,
img_url: &str,
mime_type: &str,
) -> Result<(), std::io::Error> {
let temp_dir = std::env::temp_dir();
let img_path = temp_dir.join(img_url);
let img_bytes = std::fs::read(img_path)?;
let img_base64_str = format!("data:{};base64,{}", mime_type, encode(img_bytes));
let img_elems = article
.node_ref()
.select(&format!("img[src=\"{}\"]", img_url))
.unwrap();
for img_elem in img_elems {
let mut img_attr = img_elem.attributes.borrow_mut();
if let Some(src_attr) = img_attr.get_mut("src") {
*src_attr = img_base64_str.clone();
}
}
Ok(())
}
/// Updates the src attribute of `<img>` elements to the new `imgs_dir_path` and copies the image to the new file location
fn update_img_urls(article: &Article, imgs_dir_path: &Path) -> Result<(), std::io::Error> {
let temp_dir = std::env::temp_dir();
for (img_url, _) in &article.img_urls {
let (from, to) = (temp_dir.join(img_url), imgs_dir_path.join(img_url));
info!("Copying {:?} to {:?}", from, to);
fs::copy(from, to)?;
let img_elems = article
.node_ref()
.select(&format!("img[src=\"{}\"]", img_url))
.unwrap();
for img_elem in img_elems {
let mut img_attr = img_elem.attributes.borrow_mut();
if let Some(src_attr) = img_attr.get_mut("src") {
*src_attr = imgs_dir_path.join(img_url).to_str().unwrap().into();
}
}
}
Ok(())
}
/// Creates a `<title>` element in an HTML document with the value set to the article's title
fn insert_title_elem(root_node: &NodeRef, title: &str) {
let title_content = NodeRef::new_text(title);
let title_elem = NodeRef::new_element(create_qualname("title"), BTreeMap::new());
title_elem.append(title_content);
match root_node.select_first("head") {
Ok(head_elem) => {
head_elem.as_node().append(title_elem);
}
Err(_) => {
debug!("{}", HEAD_ELEM_NOT_FOUND);
let html_elem = root_node.select_first("html").unwrap();
let head_elem = NodeRef::new_element(create_qualname("head"), BTreeMap::new());
head_elem.append(title_elem);
html_elem.as_node().prepend(head_elem);
}
}
}
/// Creates the appendix in an HTML document where article sources are added in a `<footer>` element
fn insert_appendix(root_node: &NodeRef, article_links: Vec<(&MetaData, &str)>) {
let link_tags: String = article_links
.iter()
.map(|(meta_data, url)| {
let article_name = if !meta_data.title().is_empty() {
meta_data.title()
} else {
url
};
format!("<a href=\"{}\">{}</a><br></br>", url, article_name)
})
.collect();
let footer_inner_html = format!("<h2>Appendix</h2><h3>Article sources</h3>{}", link_tags);
let footer_elem =
kuchiki::parse_fragment(create_qualname("footer"), Vec::new()).one(footer_inner_html);
root_node.append(footer_elem);
}
/// Inlines the CSS stylesheets into the HTML article node
fn inline_css(root_node: &NodeRef, app_config: &AppConfig) {
let body_stylesheet = include_str!("./assets/body.min.css");
let header_stylesheet = include_str!("./assets/headers.min.css");
let mut css_str = String::new();
match app_config.css_config {
cli::CSSConfig::NoHeaders => {
css_str.push_str(body_stylesheet);
}
cli::CSSConfig::All => {
css_str.push_str(body_stylesheet);
css_str.push_str(header_stylesheet);
}
cli::CSSConfig::None => {
return;
}
}
let css_html_str = format!("<style>{}</style>", css_str);
let style_container =
kuchiki::parse_fragment(create_qualname("div"), Vec::new()).one(css_html_str);
let style_elem = style_container.select_first("style").unwrap();
match root_node.select_first("head") {
Ok(head_elem) => {
head_elem.as_node().prepend(style_elem.as_node().to_owned());
}
Err(_) => {
debug!("{}", HEAD_ELEM_NOT_FOUND);
let html_elem = root_node.select_first("html").unwrap();
let head_elem = NodeRef::new_element(create_qualname("head"), BTreeMap::new());
head_elem.prepend(style_elem.as_node().to_owned());
html_elem.as_node().prepend(head_elem);
}
}
// Remove the <link> of the stylesheet since styles are now inlined
if let Ok(style_link_elem) = root_node.select_first("link[href=\"stylesheet.css\"]") {
style_link_elem.as_node().detach();
};
}

src/http.rs
View file

@@ -9,7 +9,7 @@ use url::Url;
use crate::cli::AppConfig;
use crate::errors::{ErrorKind, ImgError, PaperoniError};
use crate::extractor::Extractor;
use crate::extractor::Article;
type HTMLResource = (String, String);
pub fn download(
@@ -17,7 +17,7 @@ pub fn download(
bar: &ProgressBar,
partial_downloads: &mut Vec<PartialDownload>,
errors: &mut Vec<PaperoniError>,
) -> Vec<Extractor> {
) -> Vec<Article> {
task::block_on(async {
let urls_iter = app_config.urls.iter().map(|url| fetch_html(url));
let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn);
@@ -26,7 +26,7 @@ pub fn download(
match fetch_result {
Ok((url, html)) => {
debug!("Extracting {}", &url);
let mut extractor = Extractor::from_html(&html, &url);
let mut extractor = Article::from_html(&html, &url);
bar.set_message("Extracting...");
match extractor.extract_content() {
Ok(_) => {
@@ -185,7 +185,7 @@ async fn process_img_response<'a>(
}
pub async fn download_images(
extractor: &mut Extractor,
extractor: &mut Article,
article_origin: &Url,
bar: &ProgressBar,
) -> Result<(), Vec<ImgError>> {
@@ -237,7 +237,7 @@ pub async fn download_images(
let replace_existing_img_src = |img_item: ImgItem| -> (String, Option<String>) {
let (img_url, img_path, img_mime) = img_item;
let img_ref = extractor
.article()
.node_ref()
.select_first(&format!("img[src='{}']", img_url))
.expect("Image node does not exist");
let mut img_node = img_ref.attributes.borrow_mut();

src/logs.rs
View file

@@ -11,7 +11,7 @@ use crate::errors::PaperoniError;
pub fn display_summary(
initial_article_count: usize,
succesful_articles_table: Table,
successful_articles_table: Table,
partial_downloads: Vec<PartialDownload>,
errors: Vec<PaperoniError>,
) {
@@ -31,7 +31,7 @@
);
if successfully_downloaded_count > 0 {
println!("{}", succesful_articles_table);
println!("{}", successful_articles_table);
}
if partial_downloads_count > 0 {

src/main.rs
View file

@@ -3,6 +3,7 @@ extern crate lazy_static;
use std::process::exit;
use colored::Colorize;
use comfy_table::presets::{UTF8_FULL, UTF8_HORIZONTAL_BORDERS_ONLY};
use comfy_table::{ContentArrangement, Table};
use http::download;
@@ -12,6 +13,7 @@ mod cli;
mod epub;
mod errors;
mod extractor;
mod html;
/// This module is responsible for async HTTP calls for downloading
/// the HTML content and images
mod http;
@@ -20,13 +22,14 @@ mod moz_readability;
use cli::AppConfig;
use epub::generate_epubs;
use html::generate_html_exports;
use logs::display_summary;
fn main() {
let app_config = match cli::AppConfig::init_with_cli() {
Ok(app_config) => app_config,
Err(err) => {
eprintln!("{}", err);
eprintln!("{}: {}", "ERROR".bold().bright_red(), err);
exit(1);
}
};
@@ -64,22 +67,33 @@ fn run(app_config: AppConfig) {
let articles = download(&app_config, &bar, &mut partial_downloads, &mut errors);
bar.finish_with_message("Downloaded articles");
let mut succesful_articles_table = Table::new();
succesful_articles_table
let mut successful_articles_table = Table::new();
successful_articles_table
.load_preset(UTF8_FULL)
.load_preset(UTF8_HORIZONTAL_BORDERS_ONLY)
.set_content_arrangement(ContentArrangement::Dynamic);
match generate_epubs(articles, &app_config, &mut succesful_articles_table) {
match app_config.export_type {
cli::ExportType::EPUB => {
match generate_epubs(articles, &app_config, &mut successful_articles_table) {
Ok(_) => (),
Err(gen_epub_errors) => {
errors.extend(gen_epub_errors);
}
};
}
cli::ExportType::HTML => {
match generate_html_exports(articles, &app_config, &mut successful_articles_table) {
Ok(_) => (),
Err(gen_html_errors) => errors.extend(gen_html_errors),
}
}
}
let has_errors = !errors.is_empty() || !partial_downloads.is_empty();
display_summary(
app_config.urls.len(),
succesful_articles_table,
successful_articles_table,
partial_downloads,
errors,
);