diff --git a/.gitignore b/.gitignore index 8e42494..2b8060a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ /target *.epub -*.log \ No newline at end of file +*.log +.vscode/ \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index d07b91c..8489e38 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,5 +1,7 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. +version = 3 + [[package]] name = "addr2line" version = "0.14.1" @@ -71,9 +73,9 @@ dependencies = [ [[package]] name = "aho-corasick" -version = "0.7.15" +version = "0.7.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7404febffaa47dac81aa44dba71523c9d069b1bdc50a77db41195149e17f68e5" +checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" dependencies = [ "memchr", ] @@ -389,7 +391,7 @@ dependencies = [ "ansi_term", "atty", "bitflags", - "strsim", + "strsim 0.8.0", "textwrap", "unicode-width", "vec_map", @@ -408,9 +410,9 @@ dependencies = [ [[package]] name = "comfy-table" -version = "2.1.0" +version = "3.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17b99e9022e080d384b58d8eaf5976b42a311ff7a9669f8200eb2453c0b2b81a" +checksum = "c93d79ba722818d1a6aedfbe2cf4889330c856d0c6772951efbbf3dd283c070a" dependencies = [ "crossterm", "strum", @@ -435,9 +437,7 @@ dependencies = [ "encode_unicode", "lazy_static", "libc", - "regex", "terminal_size", - "unicode-width", "winapi", ] @@ -504,25 +504,25 @@ dependencies = [ [[package]] name = "crossterm" -version = "0.19.0" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c36c10130df424b2f3552fcc2ddcd9b28a27b1e54b358b45874f88d1ca6888c" +checksum = "c0ebde6a9dd5e331cd6c6f48253254d117642c31653baa475e394657c59c1f7d" dependencies = [ "bitflags", "crossterm_winapi", - "lazy_static", "libc", "mio", "parking_lot", "signal-hook", + "signal-hook-mio", "winapi", ] [[package]] name = "crossterm_winapi" -version = "0.7.0" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0da8964ace4d3e4a044fd027919b2237000b24315a37c916f61809f1ff2140b9" +checksum = "3a6966607622438301997d3dac0d2f6e9a90c68bb6bc1785ea98456ab93c0507" dependencies = [ "winapi", ] @@ -614,6 +614,41 @@ dependencies = [ "winapi", ] +[[package]] +name = "darling" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f2c43f534ea4b0b049015d00269734195e6d3f0f6635cb692251aca6f9f8b3c" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e91455b86830a1c21799d94524df0845183fa55bafd9aa137b01c7d1065fa36" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim 0.10.0", + "syn", +] + +[[package]] +name = "darling_macro" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29b5acf0dea37a7f66f7b25d2c5e93fd46f8f6968b1a5d7a3e02e97768afc95a" +dependencies = [ + "darling_core", + "quote", + "syn", +] + [[package]] name = "dashmap" version = "4.0.2" @@ -630,6 +665,37 @@ version = "2.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3ee2393c4a91429dffb4bedf19f4d6abf27d8a732c8ce4980305d782e5426d57" +[[package]] +name = "derive_builder" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d13202debe11181040ae9063d739fa32cfcaaebe2275fe387703460ae2365b30" +dependencies = [ + "derive_builder_macro", +] + +[[package]] +name = "derive_builder_core" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66e616858f6187ed828df7c64a6d71720d83767a7f19740b2d1b6fe6327b36e5" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "derive_builder_macro" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "58a94ace95092c5acb1e97a7e846b310cfbd499652f72297da7493f618a98d73" +dependencies = [ + "derive_builder_core", + "syn", +] + [[package]] name = "derive_more" version = "0.99.13" @@ -692,6 +758,12 @@ dependencies = [ "dtoa", ] +[[package]] +name = "either" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" + [[package]] name = "encode_unicode" version = "0.3.6" @@ -763,9 +835,9 @@ dependencies = [ [[package]] name = "flexi_logger" -version = "0.17.1" +version = "0.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33ab94b6ac8eb69f1496a6993f26f785b5fd6d99b7416023eb2a6175c0b242b1" +checksum = "8ba2265890613939b533fa11c3728651531419ac549ccf527896201581f23991" dependencies = [ "atty", "chrono", @@ -822,9 +894,9 @@ dependencies = [ [[package]] name = "futures" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9d5813545e459ad3ca1bff9915e9ad7f1a47dc6a91b627ce321d5863b7dd253" +checksum = "0e7e43a803dae2fa37c1f6a8fe121e1f7bf9548b4dfc0522a42f34145dadfc27" dependencies = [ "futures-channel", "futures-core", @@ -837,9 +909,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce79c6a52a299137a6013061e0cf0e688fce5d7f1bc60125f520912fdb29ec25" +checksum = "e682a68b29a882df0545c143dc3646daefe80ba479bcdede94d5a703de2871e2" dependencies = [ "futures-core", "futures-sink", @@ -847,15 +919,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "098cd1c6dda6ca01650f1a37a794245eb73181d0d4d4e955e2f3c37db7af1815" +checksum = "0402f765d8a89a26043b889b26ce3c4679d268fa6bb22cd7c6aad98340e179d1" [[package]] name = "futures-executor" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10f6cb7042eda00f0049b1d2080aa4b93442997ee507eb3828e8bd7577f94c9d" +checksum = "badaa6a909fac9e7236d0620a2f57f7664640c56575b71a7552fbd68deafab79" dependencies = [ "futures-core", "futures-task", @@ -864,9 +936,9 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "365a1a1fb30ea1c03a830fdb2158f5236833ac81fa0ad12fe35b29cddc35cb04" +checksum = "acc499defb3b348f8d8f3f66415835a9131856ff7714bf10dadfc4ec4bdb29a1" [[package]] name = "futures-lite" @@ -885,10 +957,11 @@ dependencies = [ [[package]] name = "futures-macro" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "668c6733a182cd7deb4f1de7ba3bf2120823835b3bcfbeacf7d2c4a773c1bb8b" +checksum = "a4c40298486cdf52cc00cd6d6987892ba502c7656a16a4192a9992b1ccedd121" dependencies = 
[ + "autocfg", "proc-macro-hack", "proc-macro2", "quote", @@ -897,22 +970,23 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c5629433c555de3d82861a7a4e3794a4c40040390907cfbfd7143a92a426c23" +checksum = "a57bead0ceff0d6dde8f465ecd96c9338121bb7717d3e7b108059531870c4282" [[package]] name = "futures-task" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba7aa51095076f3ba6d9a1f702f74bd05ec65f555d70d2033d55ba8d69f581bc" +checksum = "8a16bef9fc1a4dddb5bee51c989e3fbba26569cbb0e31f5b303c184e3dd33dae" [[package]] name = "futures-util" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c144ad54d60f23927f0a6b6d816e4271278b64f005ad65e4e35291d2de9c025" +checksum = "feb5c238d27e2bf94ffdfd27b2c29e3df4a68c4193bb6427384259e2bf191967" dependencies = [ + "autocfg", "futures-channel", "futures-core", "futures-io", @@ -1112,6 +1186,12 @@ dependencies = [ "url", ] +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + [[package]] name = "idna" version = "0.2.3" @@ -1125,9 +1205,9 @@ dependencies = [ [[package]] name = "indicatif" -version = "0.15.0" +version = "0.16.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7baab56125e25686df467fe470785512329883aab42696d661247aca2a2896e4" +checksum = "2d207dc617c7a380ab07ff572a6e52fa202a2a8f355860ac9c38e23f8196be1b" dependencies = [ "console", "lazy_static", @@ -1173,6 +1253,15 @@ dependencies = [ "waker-fn", ] +[[package]] +name = "itertools" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69ddb889f9d0d08a67338271fa9b62996bc788c7796a5c18cf057420aaed5eaf" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "0.4.7" @@ -1305,9 +1394,9 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "memchr" -version = "2.3.4" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ee1c47aaa256ecabcaea351eae4a9b01ef39ed810004e298d2511ed284b1525" +checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc" [[package]] name = "mime" @@ -1419,9 +1508,9 @@ dependencies = [ [[package]] name = "number_prefix" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17b02fc0ff9a9e4b35b3342880f48e896ebf69f2967921fe8646bf5b7125956a" +checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" [[package]] name = "object" @@ -1462,19 +1551,21 @@ dependencies = [ [[package]] name = "paperoni" -version = "0.4.1-alpha1" +version = "0.5.0-alpha1" dependencies = [ "async-std", "chrono", "clap", "colored", "comfy-table", + "derive_builder", "directories", "epub-builder", "flexi_logger", "futures", "html5ever", "indicatif", + "itertools", "kuchiki", "lazy_static", "log 0.4.14", @@ -1829,9 +1920,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.4.6" +version = "1.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a26af418b574bd56588335b3a3659a65725d4e636eb1016c2f9e3b38c7cc759" +checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" dependencies = [ 
"aho-corasick", "memchr", @@ -1840,9 +1931,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.6.23" +version = "0.6.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24d5f089152e60f62d28b835fbff2cd2e8dc0baf1ac13343bef92ab7eed84548" +checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" [[package]] name = "remove_dir_all" @@ -2011,20 +2102,30 @@ dependencies = [ [[package]] name = "signal-hook" -version = "0.1.17" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e31d442c16f047a671b5a71e2161d6e68814012b7f5379d269ebd915fac2729" +checksum = "470c5a6397076fae0094aaf06a08e6ba6f37acb77d3b1b91ea92b4d6c8650c39" dependencies = [ "libc", - "mio", "signal-hook-registry", ] [[package]] -name = "signal-hook-registry" -version = "1.3.0" +name = "signal-hook-mio" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16f1d0fef1604ba8f7a073c7e701f213e056707210e9020af4528e0101ce11a6" +checksum = "29fd5867f1c4f2c5be079aee7a2adf1152ebb04a4bc4d341f504b7dece607ed4" +dependencies = [ + "libc", + "mio", + "signal-hook", +] + +[[package]] +name = "signal-hook-registry" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51e73328dc4ac0c7ccbda3a494dfa03df1de2f46018127f60c693f2648455b0" dependencies = [ "libc", ] @@ -2173,16 +2274,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" [[package]] -name = "strum" -version = "0.20.0" +name = "strsim" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7318c509b5ba57f18533982607f24070a55d353e90d4cae30c467cdb2ad5ac5c" +checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" + +[[package]] +name = "strum" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aaf86bbcfd1fa9670b7a129f64fc0c9fcbbfe4f1bc4210e9e98fe71ffc12cde2" [[package]] name = "strum_macros" -version = "0.20.1" +version = "0.21.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee8bc6b87a5112aeeab1f4a9f7ab634fe6cbefc4850006df31267f4cfb9e3149" +checksum = "d06aaeeee809dbc59eb4556183dd927df67db1540de5be8d3ec0b6636358a5ec" dependencies = [ "heck", "proc-macro2", @@ -2277,18 +2384,18 @@ checksum = "8eaa81235c7058867fa8c0e7314f33dcce9c215f535d1913822a2b3f5e289f3c" [[package]] name = "thiserror" -version = "1.0.24" +version = "1.0.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0f4a65597094d4483ddaed134f409b2cb7c1beccf25201a9f73c719254fa98e" +checksum = "fa6f76457f59514c7eeb4e59d891395fab0b2fd1d40723ae737d64153392e9c6" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.24" +version = "1.0.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7765189610d8241a44529806d6fd1f2e0a08734313a35d5b3a556f92b381f3c0" +checksum = "8a36768c0fbf1bb15eca10defa29526bda730a2376c2ab4393ccfa16fb1a318d" dependencies = [ "proc-macro2", "quote", @@ -2465,9 +2572,9 @@ dependencies = [ [[package]] name = "url" -version = "2.2.1" +version = "2.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ccd964113622c8e9322cfac19eb1004a07e636c545f325da085d5cdde6f1f8b" +checksum = "a507c383b2d33b5fc35d1861e77e6b383d158b2da5e14fe51b83dfedf6fd578c" dependencies = [ 
"form_urlencoded", "idna", diff --git a/Cargo.toml b/Cargo.toml index 8b8b6e6..3594149 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ description = "A web article downloader" homepage = "https://github.com/hipstermojo/paperoni" repository = "https://github.com/hipstermojo/paperoni" name = "paperoni" -version = "0.4.1-alpha1" +version = "0.5.0-alpha1" authors = ["Kenneth Gitere "] edition = "2018" license = "MIT" @@ -12,23 +12,25 @@ readme = "README.md" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -async-std = "1.9.0" # atty = "0.2.14" +async-std = "1.9.0" chrono = "0.4.19" clap = "2.33.3" colored = "2.0.0" -comfy-table = "2.1.0" +comfy-table = "3.0.0" +derive_builder = "0.10.2" directories = "3.0.2" epub-builder = "0.4.8" -flexi_logger = "0.17.1" -futures = "0.3.14" +flexi_logger = "0.18.0" +futures = "0.3.15" html5ever = "0.25.1" -indicatif = "0.15.0" +indicatif = "0.16.2" +itertools = "0.10.1" kuchiki = "0.8.1" lazy_static = "1.4.0" log = "0.4.14" md5 = "0.7.0" -regex = "1.4.5" +regex = "1.5.4" surf = "2.2.0" -thiserror = "1.0.24" -url = "2.2.1" +thiserror = "1.0.25" +url = "2.2.2" diff --git a/README.md b/README.md index 2cfba38..5c547b0 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ -Paperoni is a CLI tool made in Rust for downloading web articles as EPUBs. +Paperoni is a CLI tool made in Rust for downloading web articles as EPUBs. There is provisional\* support for exporting to PDF as well. > This project is in an alpha release so it might crash when you use it. Please open an [issue on Github](https://github.com/hipstermojo/paperoni/issues/new) if it does crash. @@ -23,7 +23,7 @@ Check the [releases](https://github.com/hipstermojo/paperoni/releases) page for Paperoni is published on [crates.io](https://crates.io). If you have [cargo](https://github.com/rust-lang/cargo) installed, then run: ```sh -cargo install paperoni --version 0.4.1-alpha1 +cargo install paperoni --version 0.5.0-alpha1 ``` _Paperoni is still in alpha so the `version` flag has to be passed._ @@ -48,18 +48,44 @@ USAGE: paperoni [OPTIONS] [urls]... OPTIONS: - -f, --file Input file containing links - -h, --help Prints help information - --log-to-file Enables logging of events to a file located in .paperoni/logs with a default log level - of debug. Use -v to specify the logging level - --max_conn The maximum number of concurrent HTTP connections when downloading articles. Default is - 8 - --merge Merge multiple articles into a single epub - -V, --version Prints version information - -v Enables logging of events and set the verbosity level. Use -h to read on its usage + -f, --file + Input file containing links + + -h, --help + Prints help information + + --inline-toc + Add an inlined Table of Contents page at the start of the merged article. + + --log-to-file + Enables logging of events to a file located in .paperoni/logs with a default log level of debug. Use -v to + specify the logging level + --max-conn + The maximum number of concurrent HTTP connections when downloading articles. Default is 8. + NOTE: It is advised to use as few connections as needed i.e between 1 and 50. Using more connections can end + up overloading your network card with too many concurrent requests. 
+ -o, --output-dir + Directory for saving epub documents + + --merge + Merge multiple articles into a single epub that will be given the name provided + + -V, --version + Prints version information + + -v + This takes upto 4 levels of verbosity in the following order. + - Error (-v) + - Warn (-vv) + - Info (-vvv) + - Debug (-vvvv) + When this flag is passed, it disables the progress bars and logs to stderr. + If you would like to send the logs to a file (and enable progress bars), pass the log-to-file flag. ARGS: - ... Urls of web articles + ... + Urls of web articles + ``` To download a single article pass in its URL @@ -124,3 +150,14 @@ This program is still in alpha so a number of things won't work: - Code snippets on Medium articles that are lazy loaded will not appear in the EPUB. There are also web pages it won't work on in general such as Twitter and Reddit threads. + +## PDF exports + +As of version 0.5-alpha1, you can now export to PDF using a third party tool. This requires that you install [Calibre](https://calibre-ebook.com/) which comes with a ebook conversion. You can convert the epub to a pdf through the terminal with `ebook-convert`: + +```sh +# Assuming the downloaded epub was called foo.epub +ebook-convert foo.epub foo.pdf +``` + +Alternatively, you can use the Calibre GUI to do the file conversion. diff --git a/rust-toolchain b/rust-toolchain new file mode 100644 index 0000000..154cb93 --- /dev/null +++ b/rust-toolchain @@ -0,0 +1 @@ +1.52.1 diff --git a/src/assets/writ.min.css b/src/assets/writ.min.css new file mode 100644 index 0000000..1c9c0b4 --- /dev/null +++ b/src/assets/writ.min.css @@ -0,0 +1,7 @@ +/*! + * Writ v1.0.4 + * + * Copyright © 2015, Curtis McEnroe + * + * https://cmcenroe.me/writ/LICENSE (ISC) + */dd,hr,ol ol,ol ul,ul ol,ul ul{margin:0}pre,table{overflow-x:auto}a,ins{text-decoration:none}html{font-family:Georgia,Lucida Bright,Book Antiqua,serif;font-size:16px;line-height:1.5rem}code,kbd,pre,samp{font-family:Fira Code,Liberation Mono,Menlo,Courier,monospace;font-size:.833rem;color:#111}kbd{font-weight:700}h1,h2,h3,h4,h5,h6,th{font-weight:400}h1{font-size:2.488em}h2{font-size:2.074em}h3{font-size:1.728em}h4{font-size:1.44em}h5{font-size:1.2em}h6{font-size:1em}small{font-size:.833em}h1,h2,h3{line-height:3rem}blockquote,dl,h1,h2,h3,h4,h5,h6,ol,p,pre,table,ul{margin:1.5rem 0 0}pre,table{margin-bottom:-1px}hr{border:none;padding:1.5rem 0 0}table{line-height:calc(1.5rem - 1px);width:100%;border-collapse:collapse}pre{margin-top:calc(1.5rem - 1px)}body{color:#222;margin:1.5rem 1ch}a,a code,header nav a:visited{color:#00e}a:visited,a:visited code{color:#60b}mark{color:inherit;background-color:#fe0}code,pre,samp,tfoot,thead{background-color:rgba(0,0,0,.05)}blockquote,ins,main aside{border:rgba(0,0,0,.05) solid}blockquote,main aside{border-width:0 0 0 .5ch}code,pre,samp{border:rgba(0,0,0,.1) solid}td,th{border:solid #dbdbdb}body>header{text-align:center}body>footer,main{display:block;max-width:78ch;margin:auto}main aside,main figure{float:right;margin:1.5rem 0 0 1ch}main aside{max-width:26ch;padding:0 0 0 .5ch}blockquote{margin-right:3ch;margin-left:1.5ch;padding:0 0 0 1ch}pre{border-width:1px;border-radius:2px;padding:0 .5ch}pre code{border:none;padding:0;background-color:transparent;white-space:inherit}code,ins,samp,td,th{border-width:1px}img{max-width:100%}dd,ol,ul{padding:0 0 0 3ch}ul>li{list-style-type:disc}li ul>li{list-style-type:circle}li li ul>li{list-style-type:square}ol>li{list-style-type:decimal}li ol>li{list-style-type:lower-roman}li li 
ol>li{list-style-type:lower-alpha}nav ul{padding:0;list-style-type:none}nav ul li{display:inline;padding-left:1ch;white-space:nowrap}nav ul li:first-child{padding-left:0}ins,mark{padding:1px}td,th{padding:0 .5ch}sub,sup{font-size:.75em;line-height:1em}code,samp{border-radius:2px;padding:.1em .2em;white-space:nowrap} diff --git a/src/cli.rs b/src/cli.rs index 19ce379..62937e7 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -1,13 +1,32 @@ -use std::{fs::File, io::Read, path::Path}; +use std::{fs, num::NonZeroUsize, path::Path}; use chrono::{DateTime, Local}; -use clap::{App, AppSettings, Arg}; +use clap::{App, AppSettings, Arg, ArgMatches}; use flexi_logger::LevelFilter as LogLevel; +use itertools::Itertools; -use crate::logs::init_logger; +type Error = crate::errors::CliError; -pub fn cli_init() -> AppConfig { - let app = App::new("paperoni") +const DEFAULT_MAX_CONN: usize = 8; + +#[derive(derive_builder::Builder)] +pub struct AppConfig { + /// Urls for store in epub + pub urls: Vec, + pub max_conn: usize, + /// Path to file of multiple articles into a single epub + pub merged: Option, + pub output_directory: Option, + pub log_level: LogLevel, + pub can_disable_progress_bar: bool, + pub start_time: DateTime, + pub is_logging_to_file: bool, + pub inline_toc: bool, +} + +impl AppConfig { + pub fn init_with_cli() -> Result { + let app = App::new("paperoni") .settings(&[ AppSettings::ArgRequiredElseHelp, AppSettings::UnifiedHelpMessage, @@ -28,14 +47,23 @@ pub fn cli_init() -> AppConfig { .help("Input file containing links") .takes_value(true), ) + .arg( + Arg::with_name("output_directory") + .long("output-dir") + .short("o") + .help("Directory to store output epub documents") + .conflicts_with("output_name") + .takes_value(true), + ) .arg( Arg::with_name("output_name") .long("merge") .help("Merge multiple articles into a single epub") .long_help("Merge multiple articles into a single epub that will be given the name provided") + .conflicts_with("output_directory") .takes_value(true), ).arg( - Arg::with_name("max_conn") + Arg::with_name("max-conn") .long("max_conn") .help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8") .long_help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8.\nNOTE: It is advised to use as few connections as needed i.e between 1 and 50. Using more connections can end up overloading your network card with too many concurrent requests.") @@ -59,144 +87,128 @@ pub fn cli_init() -> AppConfig { Arg::with_name("log-to-file") .long("log-to-file") .help("Enables logging of events to a file located in .paperoni/logs with a default log level of debug. 
Use -v to specify the logging level") - .takes_value(false)); - let arg_matches = app.get_matches(); - - let mut urls: Vec = match arg_matches.value_of("file") { - Some(file_name) => { - if let Ok(mut file) = File::open(file_name) { - let mut content = String::new(); - match file.read_to_string(&mut content) { - Ok(_) => content - .lines() - .filter(|line| !line.is_empty()) - .map(|line| line.to_owned()) - .collect(), - Err(_) => vec![], - } - } else { - println!("Unable to open file: {}", file_name); - vec![] - } - } - None => vec![], - }; - - if let Some(vals) = arg_matches.values_of("urls") { - urls.extend( - vals.filter(|val| !val.is_empty()) - .map(|val| val.to_string()), + .takes_value(false)) + .arg( + Arg::with_name("inline-toc") + .long("inline-toc") + .requires("output_name") + .help("Add an inlined Table of Contents page at the start of the merged article.") + .long_help("Add an inlined Table of Contents page at the start of the merged article. This does not affect the Table of Contents navigation") ); + + Self::try_from(app.get_matches()) } - let max_conn = arg_matches - .value_of("max_conn") - .map(|conn_str| conn_str.parse::().ok()) - .flatten() - .map(|max| if max > 0 { max } else { 1 }) - .unwrap_or(8); - - let mut app_config = AppConfig::new(max_conn); - app_config.set_urls(urls); - - if let Some(name) = arg_matches.value_of("output_name") { - let file_path = Path::new(name); - if file_path.is_dir() { - eprintln!("{:?} is a directory", name); - std::process::exit(1); - } - - let file_name = if file_path.extension().is_some() { - name.to_owned() - } else { - name.to_owned() + ".epub" - }; - - match std::fs::File::create(&file_name) { - Ok(_) => (), - Err(e) => { - eprintln!("Unable to create file {:?}\n{}", file_path, e); - std::process::exit(1) - } - } - app_config.merged = Some(file_name); + fn init_merge_file(self) -> Result { + self.merged + .as_deref() + .map(fs::File::create) + .transpose() + .err() + .map(|err| Err(Error::InvalidOutputPath(err.to_string()))) + .unwrap_or(Ok(self)) } - if arg_matches.is_present("verbosity") { - if !arg_matches.is_present("log-to-file") { - app_config.can_disable_progress_bar = true; - } - let log_levels: [LogLevel; 5] = [ - LogLevel::Off, - LogLevel::Error, - LogLevel::Warn, - LogLevel::Info, - LogLevel::Debug, - ]; - let level = arg_matches.occurrences_of("verbosity").clamp(0, 4) as usize; - app_config.log_level = log_levels[level]; - } - if arg_matches.is_present("log-to-file") { - app_config.log_level = LogLevel::Debug; - app_config.is_logging_to_file = true; - } - - init_logger(&app_config); - - app_config -} - -pub struct AppConfig { - urls: Vec, - max_conn: usize, - merged: Option, - log_level: LogLevel, - can_disable_progress_bar: bool, - start_time: DateTime, - is_logging_to_file: bool, -} - -impl AppConfig { - fn new(max_conn: usize) -> Self { - Self { - urls: vec![], - max_conn, - merged: None, - log_level: LogLevel::Off, - can_disable_progress_bar: false, - start_time: Local::now(), - is_logging_to_file: false, - } - } - - fn set_urls(&mut self, urls: Vec) { - self.urls.extend(urls); - } - - pub fn urls(&self) -> &Vec { - &self.urls - } - pub fn max_conn(&self) -> usize { - self.max_conn - } - - pub fn merged(&self) -> Option<&String> { - self.merged.as_ref() - } - - pub fn log_level(&self) -> LogLevel { - self.log_level - } - - pub fn can_disable_progress_bar(&self) -> bool { - self.can_disable_progress_bar - } - - pub fn start_time(&self) -> &DateTime { - &self.start_time - } - - pub fn is_logging_to_file(&self) -> bool { 
- self.is_logging_to_file + fn init_logger(self) -> Result { + use crate::logs; + logs::init_logger(self.log_level, &self.start_time, self.is_logging_to_file) + .map(|_| self) + .map_err(Error::LogError) + } +} + +use std::convert::TryFrom; + +impl<'a> TryFrom> for AppConfig { + type Error = Error; + + fn try_from(arg_matches: ArgMatches<'a>) -> Result { + AppConfigBuilder::default() + .urls({ + let url_filter = |url: &str| { + let url = url.trim(); + if !url.is_empty() { + Some(url.to_owned()) + } else { + None + } + }; + let direct_urls = arg_matches + .values_of("urls") + .and_then(|urls| urls.map(url_filter).collect::>>()) + .unwrap_or(Vec::new()); + let file_urls = arg_matches + .value_of("file") + .map(fs::read_to_string) + .transpose()? + .and_then(|content| content.lines().map(url_filter).collect::>>()) + .unwrap_or(Vec::new()); + + let urls = [direct_urls, file_urls] + .concat() + .into_iter() + .unique() + .collect_vec(); + if !urls.is_empty() { + Ok(urls) + } else { + Err(Error::NoUrls) + } + }?) + .max_conn(match arg_matches.value_of("max-conn") { + Some(max_conn) => max_conn.parse::()?.get(), + None => DEFAULT_MAX_CONN, + }) + .merged(arg_matches.value_of("output_name").map(|name| { + if name.ends_with(".epub") { + name.to_owned() + } else { + name.to_string() + ".epub" + } + })) + .can_disable_progress_bar( + arg_matches.is_present("verbosity") && !arg_matches.is_present("log-to-file"), + ) + .log_level(match arg_matches.occurrences_of("verbosity") { + 0 => { + if !arg_matches.is_present("log-to-file") { + LogLevel::Off + } else { + LogLevel::Debug + } + } + 1 => LogLevel::Error, + 2 => LogLevel::Warn, + 3 => LogLevel::Info, + 4..=u64::MAX => LogLevel::Debug, + }) + .is_logging_to_file(arg_matches.is_present("log-to-file")) + .inline_toc(arg_matches.is_present("inline-toc")) + .output_directory( + arg_matches + .value_of("output_directory") + .map(|output_directory| { + let path = Path::new(output_directory); + if !path.exists() { + Err(Error::OutputDirectoryNotExists) + } else if !path.is_dir() { + Err(Error::WrongOutputDirectory) + } else { + Ok(output_directory.to_owned()) + } + }) + .transpose()?, + ) + .start_time(Local::now()) + .try_init() + } +} + +impl AppConfigBuilder { + pub fn try_init(&self) -> Result { + self.build() + .map_err(Error::AppBuildError)? + .init_logger()? + .init_merge_file() } } diff --git a/src/epub.rs b/src/epub.rs index 75f2b9e..f3e37f4 100644 --- a/src/epub.rs +++ b/src/epub.rs @@ -1,28 +1,35 @@ +use std::collections::HashMap; use std::fs::File; use comfy_table::{Attribute, Cell, CellAlignment, Color, ContentArrangement, Table}; -use epub_builder::{EpubBuilder, EpubContent, ZipLibrary}; +use epub_builder::{EpubBuilder, EpubContent, TocElement, ZipLibrary}; +use html5ever::tendril::fmt::Slice; use indicatif::{ProgressBar, ProgressStyle}; -use log::{debug, info}; +use kuchiki::NodeRef; +use log::{debug, error, info}; -use crate::{ - cli::AppConfig, - errors::PaperoniError, - extractor::{self, Extractor}, -}; +use crate::{cli::AppConfig, errors::PaperoniError, extractor::Extractor}; + +lazy_static! 
{ + static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r#"(&|<|>|'|")"#).unwrap(); +} pub fn generate_epubs( articles: Vec, app_config: &AppConfig, successful_articles_table: &mut Table, ) -> Result<(), Vec> { - let bar = if app_config.can_disable_progress_bar() { + if articles.is_empty() { + return Ok(()); + } + + let bar = if app_config.can_disable_progress_bar { ProgressBar::hidden() } else { let enabled_bar = ProgressBar::new(articles.len() as u64); let style = ProgressStyle::default_bar().template( - "{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} epub {pos}/{len:7} {msg:.green}", - ); + "{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} epub {pos}/{len:7} {msg:.green}", + ); enabled_bar.set_style(style); if !articles.is_empty() { enabled_bar.set_message("Generating epubs"); @@ -30,10 +37,12 @@ pub fn generate_epubs( enabled_bar }; + let stylesheet = include_bytes!("./assets/writ.min.css"); + let mut errors: Vec = Vec::new(); - match app_config.merged() { - Some(name) => { + match app_config.merged { + Some(ref name) => { successful_articles_table.set_header(vec![Cell::new("Table of Contents") .add_attribute(Attribute::Bold) .set_alignment(CellAlignment::Center) @@ -57,21 +66,43 @@ pub fn generate_epubs( } }; debug!("Creating {:?}", name); - epub.inline_toc(); + + if app_config.inline_toc { + epub.inline_toc(); + } + + match epub.stylesheet(stylesheet.as_bytes()) { + Ok(_) => (), + Err(e) => { + error!("Unable to add stylesheets to epub file"); + let mut paperoni_err: PaperoniError = e.into(); + paperoni_err.set_article_source(name); + errors.push(paperoni_err); + return Err(errors); + } + } articles .iter() .enumerate() .fold(&mut epub, |epub, (idx, article)| { let mut article_result = || -> Result<(), PaperoniError> { - let mut html_buf = Vec::new(); - extractor::serialize_to_xhtml(article.article(), &mut html_buf)?; - let html_str = std::str::from_utf8(&html_buf)?; - epub.metadata("title", replace_metadata_value(name))?; + let content_url = format!("article_{}.xhtml", idx); + let mut xhtml_buf = Vec::new(); + let header_level_tocs = + get_header_level_toc_vec(&content_url, article.article()); + + serialize_to_xhtml(article.article(), &mut xhtml_buf)?; + let xhtml_str = std::str::from_utf8(&xhtml_buf)?; let section_name = article.metadata().title(); - epub.add_content( - EpubContent::new(format!("article_{}.xhtml", idx), html_str.as_bytes()) - .title(replace_metadata_value(section_name)), - )?; + let mut content = EpubContent::new(&content_url, xhtml_str.as_bytes()) + .title(replace_escaped_characters(section_name)); + + for toc_element in header_level_tocs { + content = content.child(toc_element); + } + + epub.metadata("title", replace_escaped_characters(name))?; + epub.add_content(content)?; info!("Adding images for {:?}", name); article.img_urls.iter().for_each(|img| { // TODO: Add error handling and return errors as a vec @@ -100,10 +131,10 @@ pub fn generate_epubs( let appendix = generate_appendix(articles.iter().collect()); if let Err(err) = epub.add_content( EpubContent::new("appendix.xhtml", appendix.as_bytes()) - .title(replace_metadata_value("Article Sources")), + .title(replace_escaped_characters("Article Sources")), ) { let mut paperoni_err: PaperoniError = err.into(); - paperoni_err.set_article_source(name); + paperoni_err.set_article_source(&name); errors.push(paperoni_err); return Err(errors); } @@ -113,7 +144,7 @@ pub fn generate_epubs( Ok(_) => (), Err(err) => { let mut paperoni_err: PaperoniError = err.into(); - 
paperoni_err.set_article_source(name); + paperoni_err.set_article_source(&name); errors.push(paperoni_err); return Err(errors); } @@ -135,7 +166,8 @@ pub fn generate_epubs( let mut result = || -> Result<(), PaperoniError> { let mut epub = EpubBuilder::new(ZipLibrary::new()?)?; let file_name = format!( - "{}.epub", + "{}/{}.epub", + app_config.output_directory.as_deref().unwrap_or("."), article .metadata() .title() @@ -144,15 +176,31 @@ pub fn generate_epubs( ); debug!("Creating {:?}", file_name); let mut out_file = File::create(&file_name).unwrap(); - let mut html_buf = Vec::new(); - extractor::serialize_to_xhtml(article.article(), &mut html_buf) + let mut xhtml_buf = Vec::new(); + let header_level_tocs = + get_header_level_toc_vec("index.xhtml", article.article()); + serialize_to_xhtml(article.article(), &mut xhtml_buf) .expect("Unable to serialize to xhtml"); - let html_str = std::str::from_utf8(&html_buf).unwrap(); + let xhtml_str = std::str::from_utf8(&xhtml_buf).unwrap(); + if let Some(author) = article.metadata().byline() { - epub.metadata("author", replace_metadata_value(author))?; + epub.metadata("author", replace_escaped_characters(author))?; } - epub.metadata("title", replace_metadata_value(article.metadata().title()))?; - epub.add_content(EpubContent::new("index.xhtml", html_str.as_bytes()))?; + + epub.stylesheet(stylesheet.as_bytes())?; + + let title = replace_escaped_characters(article.metadata().title()); + epub.metadata("title", &title)?; + + let mut content = + EpubContent::new("index.xhtml", xhtml_str.as_bytes()).title(title); + + for toc_element in header_level_tocs { + content = content.child(toc_element); + } + + epub.add_content(content)?; + for img in &article.img_urls { let mut file_path = std::env::temp_dir(); file_path.push(&img.0); @@ -167,7 +215,7 @@ pub fn generate_epubs( let appendix = generate_appendix(vec![&article]); epub.add_content( EpubContent::new("appendix.xhtml", appendix.as_bytes()) - .title(replace_metadata_value("Article Source")), + .title(replace_escaped_characters("Article Source")), )?; epub.generate(&mut out_file)?; bar.inc(1); @@ -194,7 +242,7 @@ pub fn generate_epubs( } /// Replaces characters that have to be escaped before adding to the epub's metadata -fn replace_metadata_value(value: &str) -> String { +fn replace_escaped_characters(value: &str) -> String { value .replace("&", "&") .replace("<", "<") @@ -213,14 +261,15 @@ fn generate_appendix(articles: Vec<&Extractor>) -> String { }; format!( "{}

", - replace_metadata_value(&article.url), - replace_metadata_value(article_name) + replace_escaped_characters(&article.url), + replace_escaped_characters(article_name) ) }) .collect(); let template = format!( r#" +

            <h2>Appendix</h2><h3>Article sources</h3>

@@ -232,23 +281,334 @@ fn generate_appendix(articles: Vec<&Extractor>) -> String { template } +/// Adds an id attribute to header elements and assigns a value based on +/// the hash of the text content. Headers with id attributes are not modified. +/// The headers here are known to have text because the grabbed article from +/// readability removes headers with no text. +fn generate_header_ids(root_node: &NodeRef) { + let headers = root_node + .select("h1, h2, h3, h4") + .expect("Unable to create selector for headings"); + let headers_no_id = headers.filter(|node_data_ref| { + let attrs = node_data_ref.attributes.borrow(); + !attrs.contains("id") + }); + for header in headers_no_id { + let mut attrs = header.attributes.borrow_mut(); + let text = header.text_contents(); + // The value of the id begins with an underscore because the hexadecimal + // digest might start with a number which would make it an invalid id + // when querying with selectors + let value = format!("_{:x}", md5::compute(text)); + attrs.insert("id", value); + } +} + +/// Returns a vector of `TocElement` from a NodeRef used for adding to the Table of Contents for navigation +fn get_header_level_toc_vec(content_url: &str, article: &NodeRef) -> Vec { + // Depth starts from 1 + const HEADER_LEVEL_MAX_DEPTH: usize = 4; + let mut headers_vec: Vec = Vec::new(); + + let mut header_levels = HashMap::with_capacity(HEADER_LEVEL_MAX_DEPTH); + header_levels.insert("h1", 1); + header_levels.insert("h2", 2); + header_levels.insert("h3", 3); + header_levels.insert("h4", 4); + + generate_header_ids(article); + + let headings = article + .select("h1, h2, h3, h4") + .expect("Unable to create selector for headings"); + + // The header list will be generated using some sort of backtracking algorithm + // There will be a stack of maximum size 4 (since it only goes to h4 now) + let mut stack: Vec> = std::iter::repeat(None) + .take(HEADER_LEVEL_MAX_DEPTH) + .collect::<_>(); + + for heading in headings { + let elem_name: &str = &heading.name.local; + let attrs = heading.attributes.borrow(); + let id = attrs + .get("id") + .map(ToOwned::to_owned) + .expect("Unable to get id value in get_header_level_toc_vec"); + let url = format!("{}#{}", content_url, id); + + let level = header_levels[elem_name]; + let index = level - 1; + + if let Some(mut existing_toc) = stack.get_mut(index).take().cloned().flatten() { + // If a toc element already exists at that header level, consume all the toc elements + // of a lower hierarchy e.g if the existing toc is a h2, then the h3 and h4 in the stack + // will be consumed. + // We collapse the children by folding from the right to the left of the stack. + let descendants_levels = HEADER_LEVEL_MAX_DEPTH - level; + let folded_descendants = stack + .iter_mut() + .rev() + .take(descendants_levels) + .map(|toc_elem| toc_elem.take()) + .filter(|toc_elem| toc_elem.is_some()) + .map(|toc_elem| toc_elem.unwrap()) + .reduce(|child, parent| parent.child(child)); + + if let Some(child) = folded_descendants { + existing_toc = existing_toc.child(child); + }; + + // Find the nearest ancestor to embed into. + // If this toc_elem was a h1, then just add it to the headers_vec + if index == 0 { + headers_vec.push(existing_toc); + } else { + // Otherwise, find the nearest ancestor to add it to. 
If none exists, add it to the headers_vec + let first_ancestor = stack + .iter_mut() + .take(level - 1) + .map(|toc_elem| toc_elem.as_mut()) + .rfind(|toc_elem| toc_elem.is_some()) + .flatten(); + + match first_ancestor { + Some(ancestor_toc_elem) => { + *ancestor_toc_elem = ancestor_toc_elem.clone().child(existing_toc); + } + None => { + headers_vec.push(existing_toc); + } + } + } + } + + if let Some(toc_elem) = stack.get_mut(index) { + *toc_elem = Some(TocElement::new( + url, + replace_escaped_characters(&heading.text_contents()), + )); + } + } + + let folded_stack = stack + .into_iter() + .rev() + .filter(|toc_elem| toc_elem.is_some()) + .map(|opt_toc_elem| opt_toc_elem.unwrap()) + .reduce(|child, parent| parent.child(child)); + if let Some(toc_elem) = folded_stack { + headers_vec.push(toc_elem) + } + + headers_vec +} + +/// Serializes a NodeRef to a string that is XHTML compatible +/// The only DOM nodes serialized are Text and Element nodes +fn serialize_to_xhtml( + node_ref: &NodeRef, + mut w: &mut W, +) -> Result<(), PaperoniError> { + let mut escape_map = HashMap::new(); + escape_map.insert("<", "<"); + escape_map.insert(">", ">"); + escape_map.insert("&", "&"); + escape_map.insert("\"", """); + escape_map.insert("'", "'"); + for edge in node_ref.traverse_inclusive() { + match edge { + kuchiki::iter::NodeEdge::Start(n) => match n.data() { + kuchiki::NodeData::Text(rc_text) => { + let text = rc_text.borrow(); + let esc_text = ESC_SEQ_REGEX + .replace_all(&text, |captures: ®ex::Captures| escape_map[&captures[1]]); + write!(&mut w, "{}", esc_text)?; + } + kuchiki::NodeData::Element(elem_data) => { + let attrs = elem_data.attributes.borrow(); + let attrs_str = attrs + .map + .iter() + .filter(|(k, _)| !k.local.contains("\"")) + .map(|(k, v)| { + format!( + "{}=\"{}\"", + k.local, + ESC_SEQ_REGEX + .replace_all(&v.value, |captures: ®ex::Captures| { + escape_map[&captures[1]] + }) + ) + }) + .fold("".to_string(), |acc, val| acc + " " + &val); + write!(&mut w, "<{}{}>", &elem_data.name.local, attrs_str)?; + } + _ => (), + }, + kuchiki::iter::NodeEdge::End(n) => match n.data() { + kuchiki::NodeData::Element(elem_data) => { + write!(&mut w, "", &elem_data.name.local)?; + } + _ => (), + }, + } + } + Ok(()) +} + #[cfg(test)] mod test { - use super::replace_metadata_value; + use kuchiki::traits::*; + + use super::{generate_header_ids, get_header_level_toc_vec, replace_escaped_characters}; #[test] - fn test_replace_metadata_value() { + fn test_replace_escaped_characters() { let mut value = "Lorem ipsum"; - assert_eq!(replace_metadata_value(value), "Lorem ipsum"); + assert_eq!(replace_escaped_characters(value), "Lorem ipsum"); value = "Memory safe > memory unsafe"; assert_eq!( - replace_metadata_value(value), + replace_escaped_characters(value), "Memory safe > memory unsafe" ); value = "Author Name "; assert_eq!( - replace_metadata_value(value), + replace_escaped_characters(value), "Author Name <author@mail.example>" ); } + + #[test] + fn test_generate_header_ids() { + let html_str = r#" + + + +

+                <h1>Heading 1</h1>
+                <h2>Heading 2</h2>
+                <h2 id="heading-2-again">Heading 2 again</h2>
+                <h4>Heading 4</h4>
+                <h1>Heading 1 again</h1>
+                <h3>Heading 3</h3>
+ + + "#; + let doc = kuchiki::parse_html().one(html_str); + generate_header_ids(&doc); + + let mut headers = doc.select("h1, h2, h3, h4").unwrap(); + let all_headers_have_ids = headers.all(|node_data_ref| { + let attrs = node_data_ref.attributes.borrow(); + if let Some(id) = attrs.get("id") { + !id.trim().is_empty() + } else { + false + } + }); + assert_eq!(true, all_headers_have_ids); + + let selector = format!("h1#_{:x}", md5::compute("Heading 1")); + assert_eq!(true, doc.select_first(&selector).is_ok()); + + let selector = format!("h1#_{:x}", md5::compute("Heading 1 again")); + assert_eq!(true, doc.select_first(&selector).is_ok()); + + let selector = "h2#heading-2-again"; + assert_eq!(true, doc.select_first(selector).is_ok()); + } + + #[test] + fn test_get_header_level_toc_vec() { + // NOTE: Due to `TocElement` not implementing PartialEq, the tests here + // will need to be manually written to cover for this + let html_str = r#" + + + +

+                <p>Lorem ipsum</p>
+ + + "#; + let doc = kuchiki::parse_html().one(html_str); + + let toc_vec = get_header_level_toc_vec("index.xhtml", &doc); + assert_eq!(0, toc_vec.len()); + + let html_str = r#" + + + +

+                <h1>Heading 1</h1>
+                <p>Lorem ipsum</p>
+                <h2>Heading 2</h2>
+                <p>Lorem ipsum</p>
+                <p>Lorem ipsum</p>
+                <h3>Subheading 3</h3>
+                <p>Lorem ipsum</p>
+                <h1>Second Heading 1</h1>
+                <p>Lorem ipsum</p>
+ + + "#; + let doc = kuchiki::parse_html().one(html_str); + + let toc_vec = get_header_level_toc_vec("index.xhtml", &doc); + assert_eq!(2, toc_vec.len()); + + let first_h1_toc = toc_vec.first().unwrap(); + assert_eq!("Heading 1", first_h1_toc.title); + assert_eq!(1, first_h1_toc.children.len()); + + let h2_toc = first_h1_toc.children.first().unwrap(); + assert_eq!("Heading 2", h2_toc.title); + assert_eq!(1, h2_toc.children.len()); + + let h3_toc = h2_toc.children.first().unwrap(); + assert_eq!("Subheading 3", h3_toc.title); + assert_eq!(0, h3_toc.children.len()); + + let last_h1_toc = toc_vec.last().unwrap(); + assert_eq!("Second Heading 1", last_h1_toc.title); + assert_eq!(0, last_h1_toc.children.len()); + + let html_str = r#" + + + +

+                <h1>Heading 1</h1>
+                <p>Lorem ipsum</p>
+                <h2>Heading 2</h2>
+                <p>Lorem ipsum</p>
+                <p>Lorem ipsum</p>
+                <h3>Subheading 3</h3>
+                <p>Lorem ipsum</p>
+                <h2>Heading 2</h2>
+                <p>Lorem ipsum</p>
+                <h4>Subheading 4</h4>
+                <h2>Conclusion</h2>
+ + + "#; + let doc = kuchiki::parse_html().one(html_str); + + let toc_vec = get_header_level_toc_vec("index.xhtml", &doc); + assert_eq!(1, toc_vec.len()); + + let h1_toc = toc_vec.first().unwrap(); + assert_eq!("Heading 1", h1_toc.title); + assert_eq!(3, h1_toc.children.len()); + + let first_h2_toc = h1_toc.children.first().unwrap(); + assert_eq!("Heading 2", first_h2_toc.title); + assert_eq!(1, first_h2_toc.children.len()); + + let h3_toc = first_h2_toc.children.first().unwrap(); + assert_eq!("Subheading 3", h3_toc.title); + assert_eq!(0, h3_toc.children.len()); + } } diff --git a/src/errors.rs b/src/errors.rs index 84d1535..17ae34a 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -1,3 +1,6 @@ +use std::fmt::{Debug, Display}; + +use flexi_logger::FlexiLoggerError; use thiserror::Error; #[derive(Error, Debug)] @@ -124,3 +127,33 @@ impl From for PaperoniError { PaperoniError::with_kind(ErrorKind::UTF8Error(err.to_string())) } } + +#[derive(Debug, Error)] +pub enum LogError { + #[error(transparent)] + FlexiError(#[from] FlexiLoggerError), + #[error("Unable to get user directories for logging purposes")] + UserDirectoriesError, + #[error("Can't create log directory: {0}")] + CreateLogDirectoryError(#[from] std::io::Error), +} + +#[derive(Debug, Error)] +pub enum CliError { + #[error("Failed to open file with urls: {0}")] + UrlFileError(#[from] std::io::Error), + #[error("Failed to parse max connection value: {0}")] + InvalidMaxConnectionCount(#[from] std::num::ParseIntError), + #[error("No urls were provided")] + NoUrls, + #[error("Failed to build cli application: {0}")] + AppBuildError(BuilderError), + #[error("Invalid output path name for merged epubs: {0}")] + InvalidOutputPath(String), + #[error("Wrong output directory")] + WrongOutputDirectory, + #[error("Output directory does not exist")] + OutputDirectoryNotExists, + #[error("Unable to start logger!\n{0}")] + LogError(#[from] LogError), +} diff --git a/src/extractor.rs b/src/extractor.rs index 2cf3f25..f427e0f 100644 --- a/src/extractor.rs +++ b/src/extractor.rs @@ -1,5 +1,4 @@ -use std::collections::HashMap; - +use itertools::Itertools; use kuchiki::{traits::*, NodeRef}; use crate::errors::PaperoniError; @@ -7,10 +6,6 @@ use crate::moz_readability::{MetaData, Readability}; pub type ResourceInfo = (String, Option); -lazy_static! 
{ - static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r#"(&|<|>|'|")"#).unwrap(); -} - pub struct Extractor { article: Option, pub img_urls: Vec, @@ -37,6 +32,7 @@ impl Extractor { let template = r#" + @@ -53,15 +49,19 @@ impl Extractor { /// Traverses the DOM tree of the content and retrieves the IMG URLs pub fn extract_img_urls(&mut self) { if let Some(content_ref) = &self.article { - for img_ref in content_ref.select("img").unwrap() { - img_ref.as_node().as_element().map(|img_elem| { - img_elem.attributes.borrow().get("src").map(|img_url| { - if !(img_url.is_empty() || img_url.starts_with("data:image")) { - self.img_urls.push((img_url.to_string(), None)) - } - }) - }); - } + self.img_urls = content_ref + .select("img") + .unwrap() + .filter_map(|img_ref| { + let attrs = img_ref.attributes.borrow(); + attrs + .get("src") + .filter(|val| !(val.is_empty() || val.starts_with("data:image"))) + .map(ToString::to_string) + }) + .unique() + .map(|val| (val, None)) + .collect(); } } @@ -77,59 +77,6 @@ impl Extractor { } } -/// Serializes a NodeRef to a string that is XHTML compatible -/// The only DOM nodes serialized are Text and Element nodes -pub fn serialize_to_xhtml( - node_ref: &NodeRef, - mut w: &mut W, -) -> Result<(), PaperoniError> { - let mut escape_map = HashMap::new(); - escape_map.insert("<", "<"); - escape_map.insert(">", ">"); - escape_map.insert("&", "&"); - escape_map.insert("\"", """); - escape_map.insert("'", "'"); - for edge in node_ref.traverse_inclusive() { - match edge { - kuchiki::iter::NodeEdge::Start(n) => match n.data() { - kuchiki::NodeData::Text(rc_text) => { - let text = rc_text.borrow(); - let esc_text = ESC_SEQ_REGEX - .replace_all(&text, |captures: ®ex::Captures| escape_map[&captures[1]]); - write!(&mut w, "{}", esc_text)?; - } - kuchiki::NodeData::Element(elem_data) => { - let attrs = elem_data.attributes.borrow(); - let attrs_str = attrs - .map - .iter() - .filter(|(k, _)| &k.local != "\"") - .map(|(k, v)| { - format!( - "{}=\"{}\"", - k.local, - ESC_SEQ_REGEX - .replace_all(&v.value, |captures: ®ex::Captures| { - escape_map[&captures[1]] - }) - ) - }) - .fold("".to_string(), |acc, val| acc + " " + &val); - write!(&mut w, "<{}{}>", &elem_data.name.local, attrs_str)?; - } - _ => (), - }, - kuchiki::iter::NodeEdge::End(n) => match n.data() { - kuchiki::NodeData::Element(elem_data) => { - write!(&mut w, "", &elem_data.name.local)?; - } - _ => (), - }, - } - } - Ok(()) -} - #[cfg(test)] mod test { use super::*; diff --git a/src/http.rs b/src/http.rs index efd64b8..8707977 100644 --- a/src/http.rs +++ b/src/http.rs @@ -1,14 +1,72 @@ use async_std::io::prelude::*; +use async_std::task; use async_std::{fs::File, stream}; use futures::StreamExt; use indicatif::ProgressBar; +use log::warn; use log::{debug, info}; use url::Url; +use crate::cli::AppConfig; use crate::errors::{ErrorKind, ImgError, PaperoniError}; use crate::extractor::Extractor; type HTMLResource = (String, String); +pub fn download( + app_config: &AppConfig, + bar: &ProgressBar, + partial_downloads: &mut Vec, + errors: &mut Vec, +) -> Vec { + task::block_on(async { + let urls_iter = app_config.urls.iter().map(|url| fetch_html(url)); + let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn); + let mut articles = Vec::new(); + while let Some(fetch_result) = responses.next().await { + match fetch_result { + Ok((url, html)) => { + debug!("Extracting {}", &url); + let mut extractor = Extractor::from_html(&html, &url); + bar.set_message("Extracting..."); + match 
extractor.extract_content() { + Ok(_) => { + extractor.extract_img_urls(); + if let Err(img_errors) = + download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar) + .await + { + partial_downloads + .push(PartialDownload::new(&url, extractor.metadata().title())); + warn!( + "{} image{} failed to download for {}", + img_errors.len(), + if img_errors.len() > 1 { "s" } else { "" }, + url + ); + for img_error in img_errors { + warn!( + "{}\n\t\tReason {}", + img_error.url().as_ref().unwrap(), + img_error + ); + } + } + articles.push(extractor); + } + Err(mut e) => { + e.set_article_source(&url); + errors.push(e); + } + } + } + Err(e) => errors.push(e), + } + bar.inc(1); + } + articles + }) +} + pub async fn fetch_html(url: &str) -> Result { let client = surf::Client::new(); debug!("Fetching {}", url); @@ -153,7 +211,11 @@ pub async fn download_images( }) .enumerate() .map(|(img_idx, (url, req))| async move { - bar.set_message(format!("Downloading images [{}/{}]", img_idx + 1, img_count).as_str()); + bar.set_message(format!( + "Downloading images [{}/{}]", + img_idx + 1, + img_count + )); match req.await { Ok(mut img_response) => { let process_response = @@ -206,6 +268,20 @@ pub async fn download_images( } } +pub struct PartialDownload { + pub link: String, + pub title: String, +} + +impl PartialDownload { + pub fn new(link: &str, title: &str) -> Self { + Self { + link: link.into(), + title: title.into(), + } + } +} + /// Handles getting the extension from a given MIME subtype. fn map_mime_subtype_to_ext(subtype: &str) -> &str { if subtype == ("svg+xml") { @@ -234,9 +310,9 @@ fn get_absolute_url(url: &str, request_url: &Url) -> String { .unwrap() .join(url) .unwrap() - .into_string() + .into() } else { - request_url.join(url).unwrap().into_string() + request_url.join(url).unwrap().into() } } diff --git a/src/logs.rs b/src/logs.rs index 87b5d1b..a0f51d8 100644 --- a/src/logs.rs +++ b/src/logs.rs @@ -1,18 +1,21 @@ +use std::fs; + +use chrono::{DateTime, Local}; use colored::*; use comfy_table::presets::UTF8_HORIZONTAL_BORDERS_ONLY; use comfy_table::{Cell, CellAlignment, ContentArrangement, Table}; -use directories::UserDirs; -use flexi_logger::LogSpecBuilder; +use flexi_logger::{FileSpec, LevelFilter}; use log::error; -use crate::{cli::AppConfig, errors::PaperoniError}; +use crate::errors::PaperoniError; pub fn display_summary( initial_article_count: usize, succesful_articles_table: Table, - partial_downloads_count: usize, + partial_downloads: Vec, errors: Vec, ) { + let partial_downloads_count = partial_downloads.len(); let successfully_downloaded_count = initial_article_count - partial_downloads_count - errors.len(); @@ -30,6 +33,24 @@ pub fn display_summary( if successfully_downloaded_count > 0 { println!("{}", succesful_articles_table); } + + if partial_downloads_count > 0 { + println!("\n{}", "Partially failed downloads".yellow().bold()); + let mut table_partial = Table::new(); + table_partial + .load_preset(UTF8_HORIZONTAL_BORDERS_ONLY) + .set_header(vec![ + Cell::new("Link").set_alignment(CellAlignment::Center), + Cell::new("Title").set_alignment(CellAlignment::Center), + ]) + .set_content_arrangement(ContentArrangement::Dynamic); + + for partial in partial_downloads { + table_partial.add_row(vec![&partial.link, &partial.title]); + } + println!("{}", table_partial); + } + if !errors.is_empty() { println!("\n{}", "Failed article downloads".bright_red().bold()); let mut table_failed = Table::new(); @@ -55,76 +76,55 @@ pub fn display_summary( /// Returns a string summary of the total 
number of failed and successful article downloads fn short_summary(download_count: DownloadCount) -> String { - // TODO: Refactor this if download_count.total != download_count.successful + download_count.failed + download_count.partial { panic!("initial_count must be equal to the sum of failed and successful count") } let get_noun = |count: usize| if count == 1 { "article" } else { "articles" }; - if download_count.successful == download_count.total && download_count.successful == 1 { - "Article downloaded successfully".green().to_string() - } else if download_count.total == download_count.failed && download_count.failed == 1 { - "Article failed to download".red().to_string() - } else if download_count.total == download_count.partial && download_count.partial == 1 { - "Article partially failed to download".yellow().to_string() - } else if download_count.successful == download_count.total { - "All articles downloaded successfully".green().to_string() - } else if download_count.failed == download_count.total { - "All articles failed to download".red().to_string() - } else if download_count.partial == download_count.total { - "All articles partially failed to download" - .yellow() - .to_string() - } else if download_count.partial == 0 { - format!( - "{} {} downloaded successfully, {} {} failed", - download_count.successful, - get_noun(download_count.successful), - download_count.failed, - get_noun(download_count.failed) - ) - .yellow() - .to_string() - } else if download_count.successful == 0 - && download_count.partial > 0 - && download_count.failed > 0 - { - format!( - "{} {} partially failed to download, {} {} failed", - download_count.partial, - get_noun(download_count.partial), - download_count.failed, - get_noun(download_count.failed) - ) - .yellow() - .to_string() - } else if download_count.failed == 0 - && download_count.successful > 0 - && download_count.partial > 0 - { - format!( - "{} {} downloaded successfully, {} {} partially failed to download", - download_count.successful, - get_noun(download_count.successful), - download_count.partial, - get_noun(download_count.partial) - ) - .yellow() + let get_summary = |count, label, color: Color| { + if count == 0 { + return "".to_string(); + }; + + { + if count == 1 && count == download_count.total { + "Article".to_string() + label + } else if count == download_count.total { + "All ".to_string() + get_noun(count) + label + } else { + count.to_string() + " " + get_noun(count) + label + } + } + .color(color) .to_string() + }; + + let mut summary = get_summary( + download_count.successful, + " downloaded successfully", + Color::BrightGreen, + ); + + let partial_summary = get_summary( + download_count.partial, + " partially failed to download", + Color::Yellow, + ); + + if !summary.is_empty() && !partial_summary.is_empty() { + summary = summary + ", " + &partial_summary; } else { - format!( - "{} {} downloaded successfully, {} {} partially failed to download, {} {} failed", - download_count.successful, - get_noun(download_count.successful), - download_count.partial, - get_noun(download_count.partial), - download_count.failed, - get_noun(download_count.failed) - ) - .yellow() - .to_string() + summary = summary + &partial_summary; } + + let failed_summary = get_summary(download_count.failed, " failed to download", Color::Red); + if !summary.is_empty() && !failed_summary.is_empty() { + summary = summary + ", " + &failed_summary; + } else { + summary = summary + &failed_summary; + } + summary } struct DownloadCount { @@ -144,44 +144,43 @@ impl 
     }
 }
 
-pub fn init_logger(app_config: &AppConfig) {
+use crate::errors::LogError as Error;
+use crate::http::PartialDownload;
+
+pub fn init_logger(
+    log_level: LevelFilter,
+    start_time: &DateTime<Local>,
+    is_logging_to_file: bool,
+) -> Result<(), Error> {
+    use directories::UserDirs;
+    use flexi_logger::LogSpecBuilder;
+
     match UserDirs::new() {
         Some(user_dirs) => {
             let home_dir = user_dirs.home_dir();
             let paperoni_dir = home_dir.join(".paperoni");
             let log_dir = paperoni_dir.join("logs");
-            let log_spec = LogSpecBuilder::new()
-                .module("paperoni", app_config.log_level())
-                .build();
-            let formatted_timestamp = app_config.start_time().format("%Y-%m-%d_%H-%M-%S");
+            let log_spec = LogSpecBuilder::new().module("paperoni", log_level).build();
+            let formatted_timestamp = start_time.format("%Y-%m-%d_%H-%M-%S");
             let mut logger = flexi_logger::Logger::with(log_spec);
-            if app_config.is_logging_to_file() && (!paperoni_dir.is_dir() || !log_dir.is_dir()) {
-                match std::fs::create_dir_all(&log_dir) {
-                    Ok(_) => (),
-                    Err(e) => {
-                        eprintln!("Unable to create paperoni directories on home directory for logging purposes\n{}",e);
-                        std::process::exit(1);
-                    }
-                };
-            }
-
-            if app_config.is_logging_to_file() {
-                logger = logger
-                    .directory(log_dir)
-                    .discriminant(formatted_timestamp.to_string())
-                    .suppress_timestamp()
-                    .log_to_file();
-            }
-
-            match logger.start() {
-                Ok(_) => (),
-                Err(e) => eprintln!("Unable to start logger!\n{}", e),
+            if is_logging_to_file {
+                if !paperoni_dir.is_dir() || !log_dir.is_dir() {
+                    fs::create_dir_all(&log_dir)?;
+                }
+                logger = logger.log_to_file(
+                    FileSpec::default()
+                        .directory(log_dir)
+                        .discriminant(formatted_timestamp.to_string())
+                        .suppress_timestamp(),
+                );
             }
+            logger.start()?;
+            Ok(())
         }
-        None => eprintln!("Unable to get user directories for logging purposes"),
-    };
+        None => Err(Error::UserDirectoriesError),
+    }
 }
 
 #[cfg(test)]
@@ -192,7 +191,7 @@ mod tests {
     fn test_short_summary() {
         assert_eq!(
             short_summary(DownloadCount::new(1, 1, 0, 0)),
-            "Article downloaded successfully".green().to_string()
+            "Article downloaded successfully".bright_green().to_string()
         );
         assert_eq!(
             short_summary(DownloadCount::new(1, 0, 0, 1)),
@@ -200,7 +199,9 @@
         );
         assert_eq!(
            short_summary(DownloadCount::new(10, 10, 0, 0)),
-            "All articles downloaded successfully".green().to_string()
+            "All articles downloaded successfully"
+                .bright_green()
+                .to_string()
         );
         assert_eq!(
             short_summary(DownloadCount::new(10, 0, 0, 10)),
@@ -208,39 +209,52 @@
         );
         assert_eq!(
             short_summary(DownloadCount::new(10, 8, 0, 2)),
-            "8 articles downloaded successfully, 2 articles failed"
-                .yellow()
-                .to_string()
+            format!(
+                "{}, {}",
+                "8 articles downloaded successfully".bright_green(),
+                "2 articles failed to download".red()
+            )
         );
         assert_eq!(
             short_summary(DownloadCount::new(10, 1, 0, 9)),
-            "1 article downloaded successfully, 9 articles failed"
-                .yellow()
-                .to_string()
+            format!(
+                "{}, {}",
+                "1 article downloaded successfully".bright_green(),
+                "9 articles failed to download".red()
+            )
         );
         assert_eq!(
             short_summary(DownloadCount::new(7, 6, 0, 1)),
-            "6 articles downloaded successfully, 1 article failed"
-                .yellow()
-                .to_string()
+            format!(
+                "{}, {}",
+                "6 articles downloaded successfully".bright_green(),
+                "1 article failed to download".red()
+            )
         );
         assert_eq!(
             short_summary(DownloadCount::new(7, 4, 2, 1)),
-            "4 articles downloaded successfully, 2 articles partially failed to download, 1 article failed"
-                .yellow()
-                .to_string()
+            format!(
+                "{}, {}, {}",
+                "4 articles downloaded successfully".bright_green(),
+                "2 articles partially failed to download".yellow(),
+                "1 article failed to download".red()
+            )
         );
         assert_eq!(
             short_summary(DownloadCount::new(12, 6, 6, 0)),
-            "6 articles downloaded successfully, 6 articles partially failed to download"
-                .yellow()
-                .to_string()
+            format!(
+                "{}, {}",
+                "6 articles downloaded successfully".bright_green(),
+                "6 articles partially failed to download".yellow()
+            )
         );
         assert_eq!(
             short_summary(DownloadCount::new(5, 0, 4, 1)),
-            "4 articles partially failed to download, 1 article failed"
-                .yellow()
-                .to_string()
+            format!(
+                "{}, {}",
+                "4 articles partially failed to download".yellow(),
+                "1 article failed to download".red()
+            )
         );
         assert_eq!(
             short_summary(DownloadCount::new(4, 0, 4, 0)),
diff --git a/src/main.rs b/src/main.rs
index 0f8b34a..e378115 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,14 +1,12 @@
 #[macro_use]
 extern crate lazy_static;
 
-use async_std::stream;
-use async_std::task;
+use std::process::exit;
+
 use comfy_table::presets::{UTF8_FULL, UTF8_HORIZONTAL_BORDERS_ONLY};
 use comfy_table::{ContentArrangement, Table};
-use futures::stream::StreamExt;
+use http::download;
 use indicatif::{ProgressBar, ProgressStyle};
-use log::{debug, warn};
-use url::Url;
 
 mod cli;
 mod epub;
@@ -22,78 +20,48 @@ mod moz_readability;
 
 use cli::AppConfig;
 use epub::generate_epubs;
-use extractor::Extractor;
-use http::{download_images, fetch_html};
 use logs::display_summary;
 
 fn main() {
-    let app_config = cli::cli_init();
+    let app_config = match cli::AppConfig::init_with_cli() {
+        Ok(app_config) => app_config,
+        Err(err) => {
+            eprintln!("{}", err);
+            exit(1);
+        }
+    };
 
-    if !app_config.urls().is_empty() {
-        download(app_config);
+    if !app_config.urls.is_empty() {
+        run(app_config);
     }
 }
 
-fn download(app_config: AppConfig) {
+fn run(app_config: AppConfig) {
     let mut errors = Vec::new();
-    let mut partial_download_count: usize = 0;
-    let bar = if app_config.can_disable_progress_bar() {
+    let mut partial_downloads = Vec::new();
+
+    if let Some(dir_name) = &app_config.output_directory {
+        let noun = if app_config.urls.len() > 1 {
+            "articles"
+        } else {
+            "article"
+        };
+        println!("Downloading {} to {}", noun, dir_name);
+    }
+
+    let bar = if app_config.can_disable_progress_bar {
         ProgressBar::hidden()
     } else {
-        let enabled_bar = ProgressBar::new(app_config.urls().len() as u64);
+        let enabled_bar = ProgressBar::new(app_config.urls.len() as u64);
         let style = ProgressStyle::default_bar().template(
-            "{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} link {pos}/{len:7} {msg:.yellow/white}",
-        );
+        "{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} link {pos}/{len:7} {msg:.yellow/white}",
+    );
         enabled_bar.set_style(style);
         enabled_bar.enable_steady_tick(500);
         enabled_bar
     };
-    let articles = task::block_on(async {
-        let urls_iter = app_config.urls().iter().map(|url| fetch_html(url));
-        let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn());
-        let mut articles = Vec::new();
-        while let Some(fetch_result) = responses.next().await {
-            match fetch_result {
-                Ok((url, html)) => {
-                    debug!("Extracting {}", &url);
-                    let mut extractor = Extractor::from_html(&html, &url);
-                    bar.set_message("Extracting...");
-                    match extractor.extract_content() {
-                        Ok(_) => {
-                            extractor.extract_img_urls();
-                            if let Err(img_errors) =
-                                download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar)
-                                    .await
-                            {
-                                partial_download_count += 1;
-                                warn!(
-                                    "{} image{} failed to download for {}",
-                                    img_errors.len(),
-                                    if img_errors.len() > 1 { "s" } else { "" },
-                                    url
-                                );
-                                for img_error in img_errors {
-                                    warn!(
-                                        "{}\n\t\tReason {}",
-                                        img_error.url().as_ref().unwrap(),
-                                        img_error
-                                    );
-                                }
-                            }
-                            articles.push(extractor);
-                        }
-                        Err(mut e) => {
-                            e.set_article_source(&url);
-                            errors.push(e);
-                        }
-                    }
-                }
-                Err(e) => errors.push(e),
-            }
-            bar.inc(1);
-        }
-        articles
-    });
+
+    let articles = download(&app_config, &bar, &mut partial_downloads, &mut errors);
     bar.finish_with_message("Downloaded articles");
 
     let mut succesful_articles_table = Table::new();
@@ -107,19 +75,24 @@ fn download(app_config: AppConfig) {
             errors.extend(gen_epub_errors);
         }
     };
-    let has_errors = !errors.is_empty();
+
+    let has_errors = !errors.is_empty() || !partial_downloads.is_empty();
     display_summary(
-        app_config.urls().len(),
+        app_config.urls.len(),
         succesful_articles_table,
-        partial_download_count,
+        partial_downloads,
         errors,
     );
-    if app_config.is_logging_to_file() {
+
+    if app_config.is_logging_to_file {
         println!(
             "Log written to paperoni_{}.log\n",
-            app_config.start_time().format("%Y-%m-%d_%H-%M-%S")
+            app_config.start_time.format("%Y-%m-%d_%H-%M-%S")
         );
+    } else if has_errors && !app_config.is_logging_to_file {
+        println!("\nRun paperoni with the --log-to-file flag to create a log file");
     }
+
     if has_errors {
         std::process::exit(1);
     }
diff --git a/src/moz_readability/mod.rs b/src/moz_readability/mod.rs
index 705fa55..0f4fc66 100644
--- a/src/moz_readability/mod.rs
+++ b/src/moz_readability/mod.rs
@@ -659,10 +659,24 @@ impl Readability {
             .map(|node_ref| {
                 let node_attrs = node_ref.attributes.borrow();
                 let href = node_attrs.get("href").unwrap();
-                if href.trim() == "/" {
-                    document_uri.join("/").unwrap()
-                } else {
-                    Url::parse(href).unwrap()
+
+                match Url::parse(href) {
+                    Ok(url) => url,
+                    Err(e) => match e {
+                        url::ParseError::RelativeUrlWithoutBase => {
+                            match document_uri.join(href) {
+                                Ok(joined_url) => joined_url,
+                                Err(e) => panic!(
+                                    "{:} unable to parse url {:?} on element {}",
+                                    e, href, &node_ref.name.local
+                                ),
+                            }
+                        }
+                        e => panic!(
+                            "{:} unable to parse url {:?} on element {}",
+                            e, href, &node_ref.name.local
+                        ),
+                    },
                 }
             })
             .next()
@@ -1609,13 +1623,11 @@ impl Readability {
         // // class name "comment", etc), and turn divs into P tags where they have been
         // // used inappropriately (as in, where they contain no other block level elements.)
         let mut elements_to_score: Vec = Vec::new();
-        let mut node = Some(
-            self.root_node
-                .select_first("html")
-                .unwrap()
-                .as_node()
-                .clone(),
-        );
+        let mut node = self
+            .root_node
+            .select_first("html")
+            .ok()
+            .map(|n| n.as_node().clone());
 
         while let Some(node_ref) = node {
             let node_elem = node_ref.as_element().unwrap();