Merge pull request #18 from hipstermojo/dev

v0.5.0 release
Kenneth Gitere 2021-06-24 08:36:11 +03:00 committed by GitHub
commit 6b1a826ccc
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
14 changed files with 1106 additions and 524 deletions

.gitignore vendored (1 change)

@@ -1,3 +1,4 @@
 /target
 *.epub
 *.log
+.vscode/

Cargo.lock generated (227 changes)

@@ -1,5 +1,7 @@
 # This file is automatically @generated by Cargo.
 # It is not intended for manual editing.
+version = 3
+
 [[package]]
 name = "addr2line"
 version = "0.14.1"
@@ -71,9 +73,9 @@ dependencies = [
 [[package]]
 name = "aho-corasick"
-version = "0.7.15"
+version = "0.7.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7404febffaa47dac81aa44dba71523c9d069b1bdc50a77db41195149e17f68e5"
+checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f"
 dependencies = [
  "memchr",
 ]
@@ -389,7 +391,7 @@ dependencies = [
  "ansi_term",
  "atty",
  "bitflags",
- "strsim",
+ "strsim 0.8.0",
  "textwrap",
  "unicode-width",
  "vec_map",
@@ -408,9 +410,9 @@ dependencies = [
 [[package]]
 name = "comfy-table"
-version = "2.1.0"
+version = "3.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "17b99e9022e080d384b58d8eaf5976b42a311ff7a9669f8200eb2453c0b2b81a"
+checksum = "c93d79ba722818d1a6aedfbe2cf4889330c856d0c6772951efbbf3dd283c070a"
 dependencies = [
  "crossterm",
  "strum",
@@ -435,9 +437,7 @@ dependencies = [
  "encode_unicode",
  "lazy_static",
  "libc",
- "regex",
  "terminal_size",
- "unicode-width",
  "winapi",
 ]
@@ -504,25 +504,25 @@ dependencies = [
 [[package]]
 name = "crossterm"
-version = "0.19.0"
+version = "0.20.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7c36c10130df424b2f3552fcc2ddcd9b28a27b1e54b358b45874f88d1ca6888c"
+checksum = "c0ebde6a9dd5e331cd6c6f48253254d117642c31653baa475e394657c59c1f7d"
 dependencies = [
  "bitflags",
  "crossterm_winapi",
- "lazy_static",
  "libc",
  "mio",
  "parking_lot",
  "signal-hook",
+ "signal-hook-mio",
  "winapi",
 ]

 [[package]]
 name = "crossterm_winapi"
-version = "0.7.0"
+version = "0.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0da8964ace4d3e4a044fd027919b2237000b24315a37c916f61809f1ff2140b9"
+checksum = "3a6966607622438301997d3dac0d2f6e9a90c68bb6bc1785ea98456ab93c0507"
 dependencies = [
  "winapi",
 ]
@@ -614,6 +614,41 @@ dependencies = [
  "winapi",
 ]

+[[package]]
+name = "darling"
+version = "0.12.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5f2c43f534ea4b0b049015d00269734195e6d3f0f6635cb692251aca6f9f8b3c"
+dependencies = [
+ "darling_core",
+ "darling_macro",
+]
+
+[[package]]
+name = "darling_core"
+version = "0.12.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e91455b86830a1c21799d94524df0845183fa55bafd9aa137b01c7d1065fa36"
+dependencies = [
+ "fnv",
+ "ident_case",
+ "proc-macro2",
+ "quote",
+ "strsim 0.10.0",
+ "syn",
+]
+
+[[package]]
+name = "darling_macro"
+version = "0.12.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "29b5acf0dea37a7f66f7b25d2c5e93fd46f8f6968b1a5d7a3e02e97768afc95a"
+dependencies = [
+ "darling_core",
+ "quote",
+ "syn",
+]
+
 [[package]]
 name = "dashmap"
 version = "4.0.2"
@@ -630,6 +665,37 @@ version = "2.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3ee2393c4a91429dffb4bedf19f4d6abf27d8a732c8ce4980305d782e5426d57"

+[[package]]
+name = "derive_builder"
+version = "0.10.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d13202debe11181040ae9063d739fa32cfcaaebe2275fe387703460ae2365b30"
+dependencies = [
+ "derive_builder_macro",
+]
+
+[[package]]
+name = "derive_builder_core"
+version = "0.10.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "66e616858f6187ed828df7c64a6d71720d83767a7f19740b2d1b6fe6327b36e5"
+dependencies = [
+ "darling",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "derive_builder_macro"
+version = "0.10.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "58a94ace95092c5acb1e97a7e846b310cfbd499652f72297da7493f618a98d73"
+dependencies = [
+ "derive_builder_core",
+ "syn",
+]
+
 [[package]]
 name = "derive_more"
 version = "0.99.13"
@@ -692,6 +758,12 @@ dependencies = [
  "dtoa",
 ]

+[[package]]
+name = "either"
+version = "1.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457"
+
 [[package]]
 name = "encode_unicode"
 version = "0.3.6"
@@ -763,9 +835,9 @@ dependencies = [
 [[package]]
 name = "flexi_logger"
-version = "0.17.1"
+version = "0.18.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "33ab94b6ac8eb69f1496a6993f26f785b5fd6d99b7416023eb2a6175c0b242b1"
+checksum = "8ba2265890613939b533fa11c3728651531419ac549ccf527896201581f23991"
 dependencies = [
  "atty",
  "chrono",
@@ -822,9 +894,9 @@ dependencies = [
 [[package]]
 name = "futures"
-version = "0.3.14"
+version = "0.3.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a9d5813545e459ad3ca1bff9915e9ad7f1a47dc6a91b627ce321d5863b7dd253"
+checksum = "0e7e43a803dae2fa37c1f6a8fe121e1f7bf9548b4dfc0522a42f34145dadfc27"
 dependencies = [
  "futures-channel",
  "futures-core",
@@ -837,9 +909,9 @@ dependencies = [
 [[package]]
 name = "futures-channel"
-version = "0.3.14"
+version = "0.3.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ce79c6a52a299137a6013061e0cf0e688fce5d7f1bc60125f520912fdb29ec25"
+checksum = "e682a68b29a882df0545c143dc3646daefe80ba479bcdede94d5a703de2871e2"
 dependencies = [
  "futures-core",
  "futures-sink",
@@ -847,15 +919,15 @@ dependencies = [
 [[package]]
 name = "futures-core"
-version = "0.3.14"
+version = "0.3.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "098cd1c6dda6ca01650f1a37a794245eb73181d0d4d4e955e2f3c37db7af1815"
+checksum = "0402f765d8a89a26043b889b26ce3c4679d268fa6bb22cd7c6aad98340e179d1"

 [[package]]
 name = "futures-executor"
-version = "0.3.14"
+version = "0.3.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "10f6cb7042eda00f0049b1d2080aa4b93442997ee507eb3828e8bd7577f94c9d"
+checksum = "badaa6a909fac9e7236d0620a2f57f7664640c56575b71a7552fbd68deafab79"
 dependencies = [
  "futures-core",
  "futures-task",
@@ -864,9 +936,9 @@ dependencies = [
 [[package]]
 name = "futures-io"
-version = "0.3.14"
+version = "0.3.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "365a1a1fb30ea1c03a830fdb2158f5236833ac81fa0ad12fe35b29cddc35cb04"
+checksum = "acc499defb3b348f8d8f3f66415835a9131856ff7714bf10dadfc4ec4bdb29a1"

 [[package]]
 name = "futures-lite"
@@ -885,10 +957,11 @@ dependencies = [
 [[package]]
 name = "futures-macro"
-version = "0.3.14"
+version = "0.3.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "668c6733a182cd7deb4f1de7ba3bf2120823835b3bcfbeacf7d2c4a773c1bb8b"
+checksum = "a4c40298486cdf52cc00cd6d6987892ba502c7656a16a4192a9992b1ccedd121"
 dependencies = [
+ "autocfg",
  "proc-macro-hack",
  "proc-macro2",
  "quote",
@@ -897,22 +970,23 @@ dependencies = [
 [[package]]
 name = "futures-sink"
-version = "0.3.14"
+version = "0.3.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5c5629433c555de3d82861a7a4e3794a4c40040390907cfbfd7143a92a426c23"
+checksum = "a57bead0ceff0d6dde8f465ecd96c9338121bb7717d3e7b108059531870c4282"

 [[package]]
 name = "futures-task"
-version = "0.3.14"
+version = "0.3.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ba7aa51095076f3ba6d9a1f702f74bd05ec65f555d70d2033d55ba8d69f581bc"
+checksum = "8a16bef9fc1a4dddb5bee51c989e3fbba26569cbb0e31f5b303c184e3dd33dae"

 [[package]]
 name = "futures-util"
-version = "0.3.14"
+version = "0.3.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3c144ad54d60f23927f0a6b6d816e4271278b64f005ad65e4e35291d2de9c025"
+checksum = "feb5c238d27e2bf94ffdfd27b2c29e3df4a68c4193bb6427384259e2bf191967"
 dependencies = [
+ "autocfg",
  "futures-channel",
  "futures-core",
  "futures-io",
@@ -1112,6 +1186,12 @@ dependencies = [
  "url",
 ]

+[[package]]
+name = "ident_case"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
+
 [[package]]
 name = "idna"
 version = "0.2.3"
@@ -1125,9 +1205,9 @@ dependencies = [
 [[package]]
 name = "indicatif"
-version = "0.15.0"
+version = "0.16.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7baab56125e25686df467fe470785512329883aab42696d661247aca2a2896e4"
+checksum = "2d207dc617c7a380ab07ff572a6e52fa202a2a8f355860ac9c38e23f8196be1b"
 dependencies = [
  "console",
  "lazy_static",
@@ -1173,6 +1253,15 @@ dependencies = [
  "waker-fn",
 ]

+[[package]]
+name = "itertools"
+version = "0.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "69ddb889f9d0d08a67338271fa9b62996bc788c7796a5c18cf057420aaed5eaf"
+dependencies = [
+ "either",
+]
+
 [[package]]
 name = "itoa"
 version = "0.4.7"
@@ -1305,9 +1394,9 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
 [[package]]
 name = "memchr"
-version = "2.3.4"
+version = "2.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0ee1c47aaa256ecabcaea351eae4a9b01ef39ed810004e298d2511ed284b1525"
+checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc"

 [[package]]
 name = "mime"
@@ -1419,9 +1508,9 @@ dependencies = [
 [[package]]
 name = "number_prefix"
-version = "0.3.0"
+version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "17b02fc0ff9a9e4b35b3342880f48e896ebf69f2967921fe8646bf5b7125956a"
+checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"

 [[package]]
 name = "object"
@@ -1462,19 +1551,21 @@ dependencies = [
 [[package]]
 name = "paperoni"
-version = "0.4.1-alpha1"
+version = "0.5.0-alpha1"
 dependencies = [
  "async-std",
  "chrono",
  "clap",
  "colored",
  "comfy-table",
+ "derive_builder",
  "directories",
  "epub-builder",
  "flexi_logger",
  "futures",
  "html5ever",
  "indicatif",
+ "itertools",
  "kuchiki",
  "lazy_static",
  "log 0.4.14",
@@ -1829,9 +1920,9 @@ dependencies = [
 [[package]]
 name = "regex"
-version = "1.4.6"
+version = "1.5.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2a26af418b574bd56588335b3a3659a65725d4e636eb1016c2f9e3b38c7cc759"
+checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461"
 dependencies = [
  "aho-corasick",
  "memchr",
@@ -1840,9 +1931,9 @@ dependencies = [
 [[package]]
 name = "regex-syntax"
-version = "0.6.23"
+version = "0.6.25"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "24d5f089152e60f62d28b835fbff2cd2e8dc0baf1ac13343bef92ab7eed84548"
+checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b"

 [[package]]
 name = "remove_dir_all"
@@ -2011,20 +2102,30 @@ dependencies = [
 [[package]]
 name = "signal-hook"
-version = "0.1.17"
+version = "0.3.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7e31d442c16f047a671b5a71e2161d6e68814012b7f5379d269ebd915fac2729"
+checksum = "470c5a6397076fae0094aaf06a08e6ba6f37acb77d3b1b91ea92b4d6c8650c39"
 dependencies = [
  "libc",
+ "mio",
  "signal-hook-registry",
 ]

 [[package]]
-name = "signal-hook-registry"
-version = "1.3.0"
+name = "signal-hook-mio"
+version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "16f1d0fef1604ba8f7a073c7e701f213e056707210e9020af4528e0101ce11a6"
+checksum = "29fd5867f1c4f2c5be079aee7a2adf1152ebb04a4bc4d341f504b7dece607ed4"
+dependencies = [
+ "libc",
+ "mio",
+ "signal-hook",
+]
+
+[[package]]
+name = "signal-hook-registry"
+version = "1.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e51e73328dc4ac0c7ccbda3a494dfa03df1de2f46018127f60c693f2648455b0"
 dependencies = [
  "libc",
 ]
@@ -2173,16 +2274,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a"

 [[package]]
-name = "strum"
-version = "0.20.0"
+name = "strsim"
+version = "0.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7318c509b5ba57f18533982607f24070a55d353e90d4cae30c467cdb2ad5ac5c"
+checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
+
+[[package]]
+name = "strum"
+version = "0.21.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "aaf86bbcfd1fa9670b7a129f64fc0c9fcbbfe4f1bc4210e9e98fe71ffc12cde2"

 [[package]]
 name = "strum_macros"
-version = "0.20.1"
+version = "0.21.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ee8bc6b87a5112aeeab1f4a9f7ab634fe6cbefc4850006df31267f4cfb9e3149"
+checksum = "d06aaeeee809dbc59eb4556183dd927df67db1540de5be8d3ec0b6636358a5ec"
 dependencies = [
  "heck",
  "proc-macro2",
@@ -2277,18 +2384,18 @@ checksum = "8eaa81235c7058867fa8c0e7314f33dcce9c215f535d1913822a2b3f5e289f3c"
 [[package]]
 name = "thiserror"
-version = "1.0.24"
+version = "1.0.25"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e0f4a65597094d4483ddaed134f409b2cb7c1beccf25201a9f73c719254fa98e"
+checksum = "fa6f76457f59514c7eeb4e59d891395fab0b2fd1d40723ae737d64153392e9c6"
 dependencies = [
  "thiserror-impl",
 ]

 [[package]]
 name = "thiserror-impl"
-version = "1.0.24"
+version = "1.0.25"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7765189610d8241a44529806d6fd1f2e0a08734313a35d5b3a556f92b381f3c0"
+checksum = "8a36768c0fbf1bb15eca10defa29526bda730a2376c2ab4393ccfa16fb1a318d"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -2465,9 +2572,9 @@ dependencies = [
 [[package]]
 name = "url"
-version = "2.2.1"
+version = "2.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9ccd964113622c8e9322cfac19eb1004a07e636c545f325da085d5cdde6f1f8b"
+checksum = "a507c383b2d33b5fc35d1861e77e6b383d158b2da5e14fe51b83dfedf6fd578c"
 dependencies = [
  "form_urlencoded",
  "idna",

Cargo.toml

@@ -3,7 +3,7 @@ description = "A web article downloader"
 homepage = "https://github.com/hipstermojo/paperoni"
 repository = "https://github.com/hipstermojo/paperoni"
 name = "paperoni"
-version = "0.4.1-alpha1"
+version = "0.5.0-alpha1"
 authors = ["Kenneth Gitere <gitere81@gmail.com>"]
 edition = "2018"
 license = "MIT"
@@ -12,23 +12,25 @@ readme = "README.md"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

 [dependencies]
-async-std = "1.9.0"
 # atty = "0.2.14"
+async-std = "1.9.0"
 chrono = "0.4.19"
 clap = "2.33.3"
 colored = "2.0.0"
-comfy-table = "2.1.0"
+comfy-table = "3.0.0"
+derive_builder = "0.10.2"
 directories = "3.0.2"
 epub-builder = "0.4.8"
-flexi_logger = "0.17.1"
-futures = "0.3.14"
+flexi_logger = "0.18.0"
+futures = "0.3.15"
 html5ever = "0.25.1"
-indicatif = "0.15.0"
+indicatif = "0.16.2"
+itertools = "0.10.1"
 kuchiki = "0.8.1"
 lazy_static = "1.4.0"
 log = "0.4.14"
 md5 = "0.7.0"
-regex = "1.4.5"
+regex = "1.5.4"
 surf = "2.2.0"
-thiserror = "1.0.24"
-url = "2.2.1"
+thiserror = "1.0.25"
+url = "2.2.2"

README.md

@@ -8,7 +8,7 @@
     </a>
 </div>

-Paperoni is a CLI tool made in Rust for downloading web articles as EPUBs.
+Paperoni is a CLI tool made in Rust for downloading web articles as EPUBs. There is provisional<sup><a href="#pdf-exports">\*</a></sup> support for exporting to PDF as well.

 > This project is in an alpha release so it might crash when you use it. Please open an [issue on Github](https://github.com/hipstermojo/paperoni/issues/new) if it does crash.

@@ -23,7 +23,7 @@ Check the [releases](https://github.com/hipstermojo/paperoni/releases) page for
 Paperoni is published on [crates.io](https://crates.io). If you have [cargo](https://github.com/rust-lang/cargo) installed, then run:

 ```sh
-cargo install paperoni --version 0.4.1-alpha1
+cargo install paperoni --version 0.5.0-alpha1
 ```

 _Paperoni is still in alpha so the `version` flag has to be passed._

@@ -48,18 +48,44 @@ USAGE:
     paperoni [OPTIONS] [urls]...

 OPTIONS:
-    -f, --file <file>          Input file containing links
-    -h, --help                 Prints help information
-        --log-to-file          Enables logging of events to a file located in .paperoni/logs with a default log level
-                               of debug. Use -v to specify the logging level
-        --max_conn <max_conn>  The maximum number of concurrent HTTP connections when downloading articles. Default is
-                               8
-        --merge <output_name>  Merge multiple articles into a single epub
-    -V, --version              Prints version information
-    -v                         Enables logging of events and set the verbosity level. Use -h to read on its usage
+    -f, --file <file>
+            Input file containing links
+
+    -h, --help
+            Prints help information
+
+        --inline-toc
+            Add an inlined Table of Contents page at the start of the merged article.
+
+        --log-to-file
+            Enables logging of events to a file located in .paperoni/logs with a default log level of debug. Use -v to
+            specify the logging level
+
+        --max-conn <max_conn>
+            The maximum number of concurrent HTTP connections when downloading articles. Default is 8.
+            NOTE: It is advised to use as few connections as needed i.e between 1 and 50. Using more connections can end
+            up overloading your network card with too many concurrent requests.
+
+    -o, --output-dir <output_directory>
+            Directory for saving epub documents
+
+        --merge <output_name>
+            Merge multiple articles into a single epub that will be given the name provided
+
+    -V, --version
+            Prints version information
+
+    -v
+            This takes up to 4 levels of verbosity in the following order.
+            - Error (-v)
+            - Warn (-vv)
+            - Info (-vvv)
+            - Debug (-vvvv)
+            When this flag is passed, it disables the progress bars and logs to stderr.
+            If you would like to send the logs to a file (and enable progress bars), pass the log-to-file flag.

 ARGS:
-    <urls>...    Urls of web articles
+    <urls>...
+            Urls of web articles
 ```

 To download a single article pass in its URL

@@ -124,3 +150,14 @@ This program is still in alpha so a number of things won't work:
 - Code snippets on Medium articles that are lazy loaded will not appear in the EPUB.

 There are also web pages it won't work on in general such as Twitter and Reddit threads.
+
+## PDF exports
+
+As of version 0.5.0-alpha1, you can now export to PDF using a third party tool. This requires that you install [Calibre](https://calibre-ebook.com/), which comes with an ebook conversion tool. You can convert the epub to a pdf through the terminal with `ebook-convert`:
+
+```sh
+# Assuming the downloaded epub was called foo.epub
+ebook-convert foo.epub foo.pdf
+```
+
+Alternatively, you can use the Calibre GUI to do the file conversion.
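The flags introduced in this release can also be combined. A hypothetical example of the new options working together (the URLs are placeholders, not real articles):

```sh
# Merge two articles into one EPUB with an inline Table of Contents,
# logging at the Info level (-vvv). --inline-toc requires --merge.
paperoni --merge rust-reads.epub --inline-toc -vvv \
    https://example.com/article-1 \
    https://example.com/article-2

# Save a single article's EPUB into an existing directory instead.
# --output-dir conflicts with --merge, so it is used on its own here.
paperoni -o ./downloads https://example.com/article-3
```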

rust-toolchain Normal file (1 change)

@@ -0,0 +1 @@
+1.52.1

src/assets/writ.min.css vendored Normal file (7 changes)

@@ -0,0 +1,7 @@
+/*!
+ * Writ v1.0.4
+ *
+ * Copyright © 2015, Curtis McEnroe <curtis@cmcenroe.me>
+ *
+ * https://cmcenroe.me/writ/LICENSE (ISC)
+ */dd,hr,ol ol,ol ul,ul ol,ul ul{margin:0}pre,table{overflow-x:auto}a,ins{text-decoration:none}html{font-family:Georgia,Lucida Bright,Book Antiqua,serif;font-size:16px;line-height:1.5rem}code,kbd,pre,samp{font-family:Fira Code,Liberation Mono,Menlo,Courier,monospace;font-size:.833rem;color:#111}kbd{font-weight:700}h1,h2,h3,h4,h5,h6,th{font-weight:400}h1{font-size:2.488em}h2{font-size:2.074em}h3{font-size:1.728em}h4{font-size:1.44em}h5{font-size:1.2em}h6{font-size:1em}small{font-size:.833em}h1,h2,h3{line-height:3rem}blockquote,dl,h1,h2,h3,h4,h5,h6,ol,p,pre,table,ul{margin:1.5rem 0 0}pre,table{margin-bottom:-1px}hr{border:none;padding:1.5rem 0 0}table{line-height:calc(1.5rem - 1px);width:100%;border-collapse:collapse}pre{margin-top:calc(1.5rem - 1px)}body{color:#222;margin:1.5rem 1ch}a,a code,header nav a:visited{color:#00e}a:visited,a:visited code{color:#60b}mark{color:inherit;background-color:#fe0}code,pre,samp,tfoot,thead{background-color:rgba(0,0,0,.05)}blockquote,ins,main aside{border:rgba(0,0,0,.05) solid}blockquote,main aside{border-width:0 0 0 .5ch}code,pre,samp{border:rgba(0,0,0,.1) solid}td,th{border:solid #dbdbdb}body>header{text-align:center}body>footer,main{display:block;max-width:78ch;margin:auto}main aside,main figure{float:right;margin:1.5rem 0 0 1ch}main aside{max-width:26ch;padding:0 0 0 .5ch}blockquote{margin-right:3ch;margin-left:1.5ch;padding:0 0 0 1ch}pre{border-width:1px;border-radius:2px;padding:0 .5ch}pre code{border:none;padding:0;background-color:transparent;white-space:inherit}code,ins,samp,td,th{border-width:1px}img{max-width:100%}dd,ol,ul{padding:0 0 0 3ch}ul>li{list-style-type:disc}li ul>li{list-style-type:circle}li li ul>li{list-style-type:square}ol>li{list-style-type:decimal}li ol>li{list-style-type:lower-roman}li li ol>li{list-style-type:lower-alpha}nav ul{padding:0;list-style-type:none}nav ul li{display:inline;padding-left:1ch;white-space:nowrap}nav ul li:first-child{padding-left:0}ins,mark{padding:1px}td,th{padding:0 .5ch}sub,sup{font-size:.75em;line-height:1em}code,samp{border-radius:2px;padding:.1em .2em;white-space:nowrap}

src/cli.rs

@@ -1,12 +1,31 @@
-use std::{fs::File, io::Read, path::Path};
+use std::{fs, num::NonZeroUsize, path::Path};

 use chrono::{DateTime, Local};
-use clap::{App, AppSettings, Arg};
+use clap::{App, AppSettings, Arg, ArgMatches};
 use flexi_logger::LevelFilter as LogLevel;
+use itertools::Itertools;

-use crate::logs::init_logger;
+type Error = crate::errors::CliError<AppConfigBuilderError>;
+
+const DEFAULT_MAX_CONN: usize = 8;
+
+#[derive(derive_builder::Builder)]
+pub struct AppConfig {
+    /// Urls of articles to store in the epub
+    pub urls: Vec<String>,
+    pub max_conn: usize,
+    /// Name of the file to merge multiple articles into
+    pub merged: Option<String>,
+    pub output_directory: Option<String>,
+    pub log_level: LogLevel,
+    pub can_disable_progress_bar: bool,
+    pub start_time: DateTime<Local>,
+    pub is_logging_to_file: bool,
+    pub inline_toc: bool,
+}

-pub fn cli_init() -> AppConfig {
+impl AppConfig {
+    pub fn init_with_cli() -> Result<AppConfig, Error> {
     let app = App::new("paperoni")
         .settings(&[
             AppSettings::ArgRequiredElseHelp,
@@ -28,14 +47,23 @@ pub fn cli_init() -> AppConfig {
                 .help("Input file containing links")
                 .takes_value(true),
         )
+        .arg(
+            Arg::with_name("output_directory")
+                .long("output-dir")
+                .short("o")
+                .help("Directory to store output epub documents")
+                .conflicts_with("output_name")
+                .takes_value(true),
+        )
         .arg(
             Arg::with_name("output_name")
                 .long("merge")
                 .help("Merge multiple articles into a single epub")
                 .long_help("Merge multiple articles into a single epub that will be given the name provided")
+                .conflicts_with("output_directory")
                 .takes_value(true),
         ).arg(
-            Arg::with_name("max_conn")
+            Arg::with_name("max-conn")
                 .long("max_conn")
                 .help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8")
                 .long_help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8.\nNOTE: It is advised to use as few connections as needed i.e between 1 and 50. Using more connections can end up overloading your network card with too many concurrent requests.")
@@ -59,144 +87,128 @@ pub fn cli_init() -> AppConfig {
             Arg::with_name("log-to-file")
                 .long("log-to-file")
                 .help("Enables logging of events to a file located in .paperoni/logs with a default log level of debug. Use -v to specify the logging level")
-                .takes_value(false));
-    let arg_matches = app.get_matches();
-
-    let mut urls: Vec<String> = match arg_matches.value_of("file") {
-        Some(file_name) => {
-            if let Ok(mut file) = File::open(file_name) {
-                let mut content = String::new();
-                match file.read_to_string(&mut content) {
-                    Ok(_) => content
-                        .lines()
-                        .filter(|line| !line.is_empty())
-                        .map(|line| line.to_owned())
-                        .collect(),
-                    Err(_) => vec![],
-                }
-            } else {
-                println!("Unable to open file: {}", file_name);
-                vec![]
-            }
-        }
-        None => vec![],
-    };
-    if let Some(vals) = arg_matches.values_of("urls") {
-        urls.extend(
-            vals.filter(|val| !val.is_empty())
-                .map(|val| val.to_string()),
-        );
-    }
-
-    let max_conn = arg_matches
-        .value_of("max_conn")
-        .map(|conn_str| conn_str.parse::<usize>().ok())
-        .flatten()
-        .map(|max| if max > 0 { max } else { 1 })
-        .unwrap_or(8);
-
-    let mut app_config = AppConfig::new(max_conn);
-    app_config.set_urls(urls);
-
-    if let Some(name) = arg_matches.value_of("output_name") {
-        let file_path = Path::new(name);
-        if file_path.is_dir() {
-            eprintln!("{:?} is a directory", name);
-            std::process::exit(1);
-        }
-
-        let file_name = if file_path.extension().is_some() {
-            name.to_owned()
-        } else {
-            name.to_owned() + ".epub"
-        };
-        match std::fs::File::create(&file_name) {
-            Ok(_) => (),
-            Err(e) => {
-                eprintln!("Unable to create file {:?}\n{}", file_path, e);
-                std::process::exit(1)
-            }
-        }
-        app_config.merged = Some(file_name);
-    }
-
-    if arg_matches.is_present("verbosity") {
-        if !arg_matches.is_present("log-to-file") {
-            app_config.can_disable_progress_bar = true;
-        }
-        let log_levels: [LogLevel; 5] = [
-            LogLevel::Off,
-            LogLevel::Error,
-            LogLevel::Warn,
-            LogLevel::Info,
-            LogLevel::Debug,
-        ];
-        let level = arg_matches.occurrences_of("verbosity").clamp(0, 4) as usize;
-        app_config.log_level = log_levels[level];
-    }
-    if arg_matches.is_present("log-to-file") {
-        app_config.log_level = LogLevel::Debug;
-        app_config.is_logging_to_file = true;
-    }
-
-    init_logger(&app_config);
-
-    app_config
-}
-
-pub struct AppConfig {
-    urls: Vec<String>,
-    max_conn: usize,
-    merged: Option<String>,
-    log_level: LogLevel,
-    can_disable_progress_bar: bool,
-    start_time: DateTime<Local>,
-    is_logging_to_file: bool,
-}
-
-impl AppConfig {
-    fn new(max_conn: usize) -> Self {
-        Self {
-            urls: vec![],
-            max_conn,
-            merged: None,
-            log_level: LogLevel::Off,
-            can_disable_progress_bar: false,
-            start_time: Local::now(),
-            is_logging_to_file: false,
-        }
-    }
-
-    fn set_urls(&mut self, urls: Vec<String>) {
-        self.urls.extend(urls);
-    }
-
-    pub fn urls(&self) -> &Vec<String> {
-        &self.urls
-    }
-    pub fn max_conn(&self) -> usize {
-        self.max_conn
-    }
-    pub fn merged(&self) -> Option<&String> {
-        self.merged.as_ref()
-    }
-    pub fn log_level(&self) -> LogLevel {
-        self.log_level
-    }
-    pub fn can_disable_progress_bar(&self) -> bool {
-        self.can_disable_progress_bar
-    }
-    pub fn start_time(&self) -> &DateTime<Local> {
-        &self.start_time
-    }
-    pub fn is_logging_to_file(&self) -> bool {
-        self.is_logging_to_file
-    }
-}
+                .takes_value(false))
+        .arg(
+            Arg::with_name("inline-toc")
+                .long("inline-toc")
+                .requires("output_name")
+                .help("Add an inlined Table of Contents page at the start of the merged article.")
+                .long_help("Add an inlined Table of Contents page at the start of the merged article. This does not affect the Table of Contents navigation")
+        );
+
+        Self::try_from(app.get_matches())
+    }
+
+    fn init_merge_file(self) -> Result<Self, Error> {
+        self.merged
+            .as_deref()
+            .map(fs::File::create)
+            .transpose()
+            .err()
+            .map(|err| Err(Error::InvalidOutputPath(err.to_string())))
+            .unwrap_or(Ok(self))
+    }
+
+    fn init_logger(self) -> Result<Self, Error> {
+        use crate::logs;
+        logs::init_logger(self.log_level, &self.start_time, self.is_logging_to_file)
+            .map(|_| self)
+            .map_err(Error::LogError)
+    }
+}
+
+use std::convert::TryFrom;
+
+impl<'a> TryFrom<ArgMatches<'a>> for AppConfig {
+    type Error = Error;
+
+    fn try_from(arg_matches: ArgMatches<'a>) -> Result<Self, Self::Error> {
+        AppConfigBuilder::default()
+            .urls({
+                let url_filter = |url: &str| {
+                    let url = url.trim();
+                    if !url.is_empty() {
+                        Some(url.to_owned())
+                    } else {
+                        None
+                    }
+                };
+                let direct_urls = arg_matches
+                    .values_of("urls")
+                    .and_then(|urls| urls.map(url_filter).collect::<Option<Vec<_>>>())
+                    .unwrap_or(Vec::new());
+                let file_urls = arg_matches
+                    .value_of("file")
+                    .map(fs::read_to_string)
+                    .transpose()?
+                    .and_then(|content| content.lines().map(url_filter).collect::<Option<Vec<_>>>())
+                    .unwrap_or(Vec::new());
+
+                let urls = [direct_urls, file_urls]
+                    .concat()
+                    .into_iter()
+                    .unique()
+                    .collect_vec();
+                if !urls.is_empty() {
+                    Ok(urls)
+                } else {
+                    Err(Error::NoUrls)
+                }
+            }?)
+            .max_conn(match arg_matches.value_of("max-conn") {
+                Some(max_conn) => max_conn.parse::<NonZeroUsize>()?.get(),
+                None => DEFAULT_MAX_CONN,
+            })
+            .merged(arg_matches.value_of("output_name").map(|name| {
+                if name.ends_with(".epub") {
+                    name.to_owned()
+                } else {
+                    name.to_string() + ".epub"
+                }
+            }))
+            .can_disable_progress_bar(
+                arg_matches.is_present("verbosity") && !arg_matches.is_present("log-to-file"),
+            )
+            .log_level(match arg_matches.occurrences_of("verbosity") {
+                0 => {
+                    if !arg_matches.is_present("log-to-file") {
+                        LogLevel::Off
+                    } else {
+                        LogLevel::Debug
+                    }
+                }
+                1 => LogLevel::Error,
+                2 => LogLevel::Warn,
+                3 => LogLevel::Info,
+                4..=u64::MAX => LogLevel::Debug,
+            })
+            .is_logging_to_file(arg_matches.is_present("log-to-file"))
+            .inline_toc(arg_matches.is_present("inline-toc"))
+            .output_directory(
+                arg_matches
+                    .value_of("output_directory")
+                    .map(|output_directory| {
+                        let path = Path::new(output_directory);
+                        if !path.exists() {
+                            Err(Error::OutputDirectoryNotExists)
+                        } else if !path.is_dir() {
+                            Err(Error::WrongOutputDirectory)
+                        } else {
+                            Ok(output_directory.to_owned())
+                        }
+                    })
+                    .transpose()?,
+            )
+            .start_time(Local::now())
+            .try_init()
+    }
+}
+
+impl AppConfigBuilder {
+    pub fn try_init(&self) -> Result<AppConfig, Error> {
+        self.build()
+            .map_err(Error::AppBuildError)?
+            .init_logger()?
+            .init_merge_file()
+    }
+}
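For readers unfamiliar with `derive_builder` (added as a dependency above), the derive macro generates an `AppConfigBuilder` with chainable setters and a `build()` method returning `Result<AppConfig, AppConfigBuilderError>`, which is what `try_init` wraps. Below is a minimal, self-contained sketch of that pattern; the `Config` type and its fields are illustrative, not paperoni's actual config:

```rust
// Minimal sketch of the derive_builder pattern used in cli.rs above.
// `Config` and its fields are hypothetical, simplified for illustration.
use derive_builder::Builder;

#[derive(Builder, Debug)]
struct Config {
    urls: Vec<String>,
    // A default makes this field optional at build time.
    #[builder(default = "8")]
    max_conn: usize,
}

fn main() {
    // A setter is generated per field; build() returns an error if a
    // required field (here, `urls`) was never set.
    let config = ConfigBuilder::default()
        .urls(vec!["https://example.com".to_string()])
        .build()
        .expect("`urls` was provided, so build() succeeds");
    println!("max_conn defaults to {}", config.max_conn);
    println!("{:?}", config.urls);
}
```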

src/epub.rs

@@ -1,22 +1,29 @@
+use std::collections::HashMap;
 use std::fs::File;

 use comfy_table::{Attribute, Cell, CellAlignment, Color, ContentArrangement, Table};
-use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
+use epub_builder::{EpubBuilder, EpubContent, TocElement, ZipLibrary};
+use html5ever::tendril::fmt::Slice;
 use indicatif::{ProgressBar, ProgressStyle};
-use log::{debug, info};
+use kuchiki::NodeRef;
+use log::{debug, error, info};

-use crate::{
-    cli::AppConfig,
-    errors::PaperoniError,
-    extractor::{self, Extractor},
-};
+use crate::{cli::AppConfig, errors::PaperoniError, extractor::Extractor};
+
+lazy_static! {
+    static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r#"(&|<|>|'|")"#).unwrap();
+}

 pub fn generate_epubs(
     articles: Vec<Extractor>,
     app_config: &AppConfig,
     successful_articles_table: &mut Table,
 ) -> Result<(), Vec<PaperoniError>> {
-    let bar = if app_config.can_disable_progress_bar() {
+    if articles.is_empty() {
+        return Ok(());
+    }
+
+    let bar = if app_config.can_disable_progress_bar {
         ProgressBar::hidden()
     } else {
         let enabled_bar = ProgressBar::new(articles.len() as u64);
@@ -30,10 +37,12 @@ pub fn generate_epubs(
         enabled_bar
     };
+    let stylesheet = include_bytes!("./assets/writ.min.css");
     let mut errors: Vec<PaperoniError> = Vec::new();

-    match app_config.merged() {
-        Some(name) => {
+    match app_config.merged {
+        Some(ref name) => {
             successful_articles_table.set_header(vec![Cell::new("Table of Contents")
                 .add_attribute(Attribute::Bold)
                 .set_alignment(CellAlignment::Center)
@@ -57,21 +66,43 @@
                 }
             };
             debug!("Creating {:?}", name);
-            epub.inline_toc();
+            if app_config.inline_toc {
+                epub.inline_toc();
+            }
+
+            match epub.stylesheet(stylesheet.as_bytes()) {
+                Ok(_) => (),
+                Err(e) => {
+                    error!("Unable to add stylesheets to epub file");
+                    let mut paperoni_err: PaperoniError = e.into();
+                    paperoni_err.set_article_source(name);
+                    errors.push(paperoni_err);
+                    return Err(errors);
+                }
+            }
+
             articles
                 .iter()
                 .enumerate()
                 .fold(&mut epub, |epub, (idx, article)| {
                     let mut article_result = || -> Result<(), PaperoniError> {
-                        let mut html_buf = Vec::new();
-                        extractor::serialize_to_xhtml(article.article(), &mut html_buf)?;
-                        let html_str = std::str::from_utf8(&html_buf)?;
-                        epub.metadata("title", replace_metadata_value(name))?;
+                        let content_url = format!("article_{}.xhtml", idx);
+                        let mut xhtml_buf = Vec::new();
+                        let header_level_tocs =
+                            get_header_level_toc_vec(&content_url, article.article());
+                        serialize_to_xhtml(article.article(), &mut xhtml_buf)?;
+                        let xhtml_str = std::str::from_utf8(&xhtml_buf)?;
                         let section_name = article.metadata().title();
-                        epub.add_content(
-                            EpubContent::new(format!("article_{}.xhtml", idx), html_str.as_bytes())
-                                .title(replace_metadata_value(section_name)),
-                        )?;
+                        let mut content = EpubContent::new(&content_url, xhtml_str.as_bytes())
+                            .title(replace_escaped_characters(section_name));
+
+                        for toc_element in header_level_tocs {
+                            content = content.child(toc_element);
+                        }
+
+                        epub.metadata("title", replace_escaped_characters(name))?;
+                        epub.add_content(content)?;
                         info!("Adding images for {:?}", name);
                         article.img_urls.iter().for_each(|img| {
                             // TODO: Add error handling and return errors as a vec
@@ -100,10 +131,10 @@ pub fn generate_epubs(
             let appendix = generate_appendix(articles.iter().collect());
             if let Err(err) = epub.add_content(
                 EpubContent::new("appendix.xhtml", appendix.as_bytes())
-                    .title(replace_metadata_value("Article Sources")),
+                    .title(replace_escaped_characters("Article Sources")),
             ) {
                 let mut paperoni_err: PaperoniError = err.into();
-                paperoni_err.set_article_source(name);
+                paperoni_err.set_article_source(&name);
                 errors.push(paperoni_err);
                 return Err(errors);
             }
@@ -113,7 +144,7 @@
                 Ok(_) => (),
                 Err(err) => {
                     let mut paperoni_err: PaperoniError = err.into();
-                    paperoni_err.set_article_source(name);
+                    paperoni_err.set_article_source(&name);
                     errors.push(paperoni_err);
                     return Err(errors);
                 }
@@ -135,7 +166,8 @@
                 let mut result = || -> Result<(), PaperoniError> {
                     let mut epub = EpubBuilder::new(ZipLibrary::new()?)?;
                     let file_name = format!(
-                        "{}.epub",
+                        "{}/{}.epub",
+                        app_config.output_directory.as_deref().unwrap_or("."),
                         article
                             .metadata()
                             .title()
@@ -144,15 +176,31 @@
                     );
                     debug!("Creating {:?}", file_name);
                     let mut out_file = File::create(&file_name).unwrap();
-                    let mut html_buf = Vec::new();
-                    extractor::serialize_to_xhtml(article.article(), &mut html_buf)
+                    let mut xhtml_buf = Vec::new();
+                    let header_level_tocs =
+                        get_header_level_toc_vec("index.xhtml", article.article());
+                    serialize_to_xhtml(article.article(), &mut xhtml_buf)
                         .expect("Unable to serialize to xhtml");
-                    let html_str = std::str::from_utf8(&html_buf).unwrap();
+                    let xhtml_str = std::str::from_utf8(&xhtml_buf).unwrap();
+
                     if let Some(author) = article.metadata().byline() {
-                        epub.metadata("author", replace_metadata_value(author))?;
+                        epub.metadata("author", replace_escaped_characters(author))?;
                     }
-                    epub.metadata("title", replace_metadata_value(article.metadata().title()))?;
-                    epub.add_content(EpubContent::new("index.xhtml", html_str.as_bytes()))?;
+
+                    epub.stylesheet(stylesheet.as_bytes())?;
+
+                    let title = replace_escaped_characters(article.metadata().title());
+                    epub.metadata("title", &title)?;
+
+                    let mut content =
+                        EpubContent::new("index.xhtml", xhtml_str.as_bytes()).title(title);
+
+                    for toc_element in header_level_tocs {
+                        content = content.child(toc_element);
+                    }
+
+                    epub.add_content(content)?;
+
                     for img in &article.img_urls {
                         let mut file_path = std::env::temp_dir();
                         file_path.push(&img.0);
@@ -167,7 +215,7 @@
                     let appendix = generate_appendix(vec![&article]);
                     epub.add_content(
                         EpubContent::new("appendix.xhtml", appendix.as_bytes())
-                            .title(replace_metadata_value("Article Source")),
+                            .title(replace_escaped_characters("Article Source")),
                     )?;
                     epub.generate(&mut out_file)?;
                     bar.inc(1);
@@ -194,7 +242,7 @@
 }

 /// Replaces characters that have to be escaped before adding to the epub's metadata
-fn replace_metadata_value(value: &str) -> String {
+fn replace_escaped_characters(value: &str) -> String {
     value
         .replace("&", "&amp;")
         .replace("<", "&lt;")
@@ -213,14 +261,15 @@ fn generate_appendix(articles: Vec<&Extractor>) -> String {
             };
             format!(
                 "<a href=\"{}\">{}</a><br></br>",
-                replace_metadata_value(&article.url),
-                replace_metadata_value(article_name)
+                replace_escaped_characters(&article.url),
+                replace_escaped_characters(article_name)
             )
         })
         .collect();
     let template = format!(
         r#"<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
     <head>
+        <link rel="stylesheet" href="stylesheet.css" type="text/css"></link>
     </head>
     <body>
         <h2>Appendix</h2><h3>Article sources</h3>
@@ -232,23 +281,334 @@ fn generate_appendix(articles: Vec<&Extractor>) -> String {
     template
 }

+/// Adds an id attribute to header elements and assigns a value based on
+/// the hash of the text content. Headers with id attributes are not modified.
+/// The headers here are known to have text because the grabbed article from
+/// readability removes headers with no text.
+fn generate_header_ids(root_node: &NodeRef) {
+    let headers = root_node
+        .select("h1, h2, h3, h4")
+        .expect("Unable to create selector for headings");
+    let headers_no_id = headers.filter(|node_data_ref| {
+        let attrs = node_data_ref.attributes.borrow();
+        !attrs.contains("id")
+    });
+    for header in headers_no_id {
+        let mut attrs = header.attributes.borrow_mut();
+        let text = header.text_contents();
+        // The value of the id begins with an underscore because the hexadecimal
+        // digest might start with a number which would make it an invalid id
+        // when querying with selectors
+        let value = format!("_{:x}", md5::compute(text));
+        attrs.insert("id", value);
+    }
+}
+
+/// Returns a vector of `TocElement` from a NodeRef used for adding to the Table of Contents for navigation
+fn get_header_level_toc_vec(content_url: &str, article: &NodeRef) -> Vec<TocElement> {
+    // Depth starts from 1
+    const HEADER_LEVEL_MAX_DEPTH: usize = 4;
+
+    let mut headers_vec: Vec<TocElement> = Vec::new();
+
+    let mut header_levels = HashMap::with_capacity(HEADER_LEVEL_MAX_DEPTH);
+    header_levels.insert("h1", 1);
+    header_levels.insert("h2", 2);
+    header_levels.insert("h3", 3);
+    header_levels.insert("h4", 4);
+
+    generate_header_ids(article);
+
+    let headings = article
+        .select("h1, h2, h3, h4")
+        .expect("Unable to create selector for headings");
+
+    // The header list will be generated using some sort of backtracking algorithm
+    // There will be a stack of maximum size 4 (since it only goes to h4 now)
+    let mut stack: Vec<Option<TocElement>> = std::iter::repeat(None)
+        .take(HEADER_LEVEL_MAX_DEPTH)
+        .collect::<_>();
+
+    for heading in headings {
+        let elem_name: &str = &heading.name.local;
+        let attrs = heading.attributes.borrow();
+        let id = attrs
+            .get("id")
+            .map(ToOwned::to_owned)
+            .expect("Unable to get id value in get_header_level_toc_vec");
+        let url = format!("{}#{}", content_url, id);
+
+        let level = header_levels[elem_name];
+        let index = level - 1;
+
+        if let Some(mut existing_toc) = stack.get_mut(index).take().cloned().flatten() {
+            // If a toc element already exists at that header level, consume all the toc elements
+            // of a lower hierarchy e.g if the existing toc is a h2, then the h3 and h4 in the stack
+            // will be consumed.
+            // We collapse the children by folding from the right to the left of the stack.
+            let descendants_levels = HEADER_LEVEL_MAX_DEPTH - level;
+            let folded_descendants = stack
+                .iter_mut()
+                .rev()
+                .take(descendants_levels)
+                .map(|toc_elem| toc_elem.take())
+                .filter(|toc_elem| toc_elem.is_some())
+                .map(|toc_elem| toc_elem.unwrap())
+                .reduce(|child, parent| parent.child(child));
+
+            if let Some(child) = folded_descendants {
+                existing_toc = existing_toc.child(child);
+            };
+
+            // Find the nearest ancestor to embed into.
+            // If this toc_elem was a h1, then just add it to the headers_vec
+            if index == 0 {
+                headers_vec.push(existing_toc);
+            } else {
+                // Otherwise, find the nearest ancestor to add it to. If none exists, add it to the headers_vec
+                let first_ancestor = stack
+                    .iter_mut()
+                    .take(level - 1)
+                    .map(|toc_elem| toc_elem.as_mut())
+                    .rfind(|toc_elem| toc_elem.is_some())
+                    .flatten();
+
+                match first_ancestor {
+                    Some(ancestor_toc_elem) => {
+                        *ancestor_toc_elem = ancestor_toc_elem.clone().child(existing_toc);
+                    }
+                    None => {
+                        headers_vec.push(existing_toc);
+                    }
+                }
+            }
+        }
+
+        if let Some(toc_elem) = stack.get_mut(index) {
+            *toc_elem = Some(TocElement::new(
+                url,
+                replace_escaped_characters(&heading.text_contents()),
+            ));
+        }
+    }
+
+    let folded_stack = stack
+        .into_iter()
+        .rev()
+        .filter(|toc_elem| toc_elem.is_some())
+        .map(|opt_toc_elem| opt_toc_elem.unwrap())
+        .reduce(|child, parent| parent.child(child));
+    if let Some(toc_elem) = folded_stack {
+        headers_vec.push(toc_elem)
+    }
+
+    headers_vec
+}
+
+/// Serializes a NodeRef to a string that is XHTML compatible
+/// The only DOM nodes serialized are Text and Element nodes
+fn serialize_to_xhtml<W: std::io::Write>(
+    node_ref: &NodeRef,
+    mut w: &mut W,
+) -> Result<(), PaperoniError> {
+    let mut escape_map = HashMap::new();
+    escape_map.insert("<", "&lt;");
+    escape_map.insert(">", "&gt;");
+    escape_map.insert("&", "&amp;");
+    escape_map.insert("\"", "&quot;");
+    escape_map.insert("'", "&apos;");
+    for edge in node_ref.traverse_inclusive() {
+        match edge {
+            kuchiki::iter::NodeEdge::Start(n) => match n.data() {
+                kuchiki::NodeData::Text(rc_text) => {
+                    let text = rc_text.borrow();
+                    let esc_text = ESC_SEQ_REGEX
+                        .replace_all(&text, |captures: &regex::Captures| escape_map[&captures[1]]);
+                    write!(&mut w, "{}", esc_text)?;
+                }
+                kuchiki::NodeData::Element(elem_data) => {
+                    let attrs = elem_data.attributes.borrow();
+                    let attrs_str = attrs
+                        .map
+                        .iter()
+                        .filter(|(k, _)| !k.local.contains("\""))
+                        .map(|(k, v)| {
+                            format!(
+                                "{}=\"{}\"",
+                                k.local,
+                                ESC_SEQ_REGEX
+                                    .replace_all(&v.value, |captures: &regex::Captures| {
+                                        escape_map[&captures[1]]
+                                    })
+                            )
+                        })
+                        .fold("".to_string(), |acc, val| acc + " " + &val);
+                    write!(&mut w, "<{}{}>", &elem_data.name.local, attrs_str)?;
+                }
+                _ => (),
+            },
+            kuchiki::iter::NodeEdge::End(n) => match n.data() {
+                kuchiki::NodeData::Element(elem_data) => {
+                    write!(&mut w, "</{}>", &elem_data.name.local)?;
+                }
+                _ => (),
+            },
+        }
+    }
+    Ok(())
+}
+
 #[cfg(test)]
 mod test {
-    use super::replace_metadata_value;
+    use kuchiki::traits::*;
+
+    use super::{generate_header_ids, get_header_level_toc_vec, replace_escaped_characters};

     #[test]
-    fn test_replace_metadata_value() {
+    fn test_replace_escaped_characters() {
         let mut value = "Lorem ipsum";
-        assert_eq!(replace_metadata_value(value), "Lorem ipsum");
+        assert_eq!(replace_escaped_characters(value), "Lorem ipsum");
         value = "Memory safe > memory unsafe";
         assert_eq!(
-            replace_metadata_value(value),
+            replace_escaped_characters(value),
             "Memory safe &gt; memory unsafe"
         );
         value = "Author Name <author@mail.example>";
         assert_eq!(
-            replace_metadata_value(value),
+            replace_escaped_characters(value),
             "Author Name &lt;author@mail.example&gt;"
         );
     }
+
+    #[test]
+    fn test_generate_header_ids() {
+        let html_str = r#"
+        <!DOCTYPE html>
+        <html>
+            <body>
+                <h1>Heading 1</h1>
+                <h2 id="heading-2">Heading 2</h2>
+                <h2 id="heading-2-again">Heading 2 again</h2>
+                <h4>Heading 4</h4>
+                <h1>Heading 1 again</h1>
+                <h3 class="heading">Heading 3</h3>
+            </body>
+        </html>
+        "#;
+        let doc = kuchiki::parse_html().one(html_str);
+        generate_header_ids(&doc);
+
+        let mut headers = doc.select("h1, h2, h3, h4").unwrap();
+        let all_headers_have_ids = headers.all(|node_data_ref| {
+            let attrs = node_data_ref.attributes.borrow();
+            if let Some(id) = attrs.get("id") {
+                !id.trim().is_empty()
+            } else {
+                false
+            }
+        });
+        assert_eq!(true, all_headers_have_ids);
+
+        let selector = format!("h1#_{:x}", md5::compute("Heading 1"));
+        assert_eq!(true, doc.select_first(&selector).is_ok());
+
+        let selector = format!("h1#_{:x}", md5::compute("Heading 1 again"));
+        assert_eq!(true, doc.select_first(&selector).is_ok());
+
+        let selector = "h2#heading-2-again";
+        assert_eq!(true, doc.select_first(selector).is_ok());
+    }
+
+    #[test]
+    fn test_get_header_level_toc_vec() {
+        // NOTE: Due to `TocElement` not implementing PartialEq, the tests here
+        // will need to be manually written to cover for this
+        let html_str = r#"
+        <!DOCTYPE html>
+        <html>
+            <body>
+                <p>Lorem ipsum</p>
+            </body>
+        </html>
+        "#;
+        let doc = kuchiki::parse_html().one(html_str);
+
+        let toc_vec = get_header_level_toc_vec("index.xhtml", &doc);
+        assert_eq!(0, toc_vec.len());
+
+        let html_str = r#"
+        <!DOCTYPE html>
+        <html>
+            <body>
+                <h1 id="heading-1">Heading 1</h1>
+                <p>Lorem ipsum</p>
+                <div>
+                    <h2 id="heading-2">Heading 2</h2>
+                    <p>Lorem ipsum</p>
+                    <p>Lorem ipsum</p>
+                </div>
+                <h3 id="subheading-3">Subheading 3</h3>
+                <p>Lorem ipsum</p>
+                <h1 id="heading-2">Second Heading 1</h1>
+                <p>Lorem ipsum</p>
+            </body>
+        </html>
+        "#;
+        let doc = kuchiki::parse_html().one(html_str);
+
+        let toc_vec = get_header_level_toc_vec("index.xhtml", &doc);
+        assert_eq!(2, toc_vec.len());
+
+        let first_h1_toc = toc_vec.first().unwrap();
+        assert_eq!("Heading 1", first_h1_toc.title);
+        assert_eq!(1, first_h1_toc.children.len());
+
+        let h2_toc = first_h1_toc.children.first().unwrap();
+        assert_eq!("Heading 2", h2_toc.title);
+        assert_eq!(1, h2_toc.children.len());
+
+        let h3_toc = h2_toc.children.first().unwrap();
+        assert_eq!("Subheading 3", h3_toc.title);
+        assert_eq!(0, h3_toc.children.len());
+
+        let last_h1_toc = toc_vec.last().unwrap();
+        assert_eq!("Second Heading 1", last_h1_toc.title);
+        assert_eq!(0, last_h1_toc.children.len());
+
+        let html_str = r#"
+        <!DOCTYPE html>
+        <html>
+            <body>
+                <h1 id="heading-1">Heading 1</h1>
+                <p>Lorem ipsum</p>
+                <div>
+                    <h2 id="heading-2">Heading 2</h2>
+                    <p>Lorem ipsum</p>
+                    <p>Lorem ipsum</p>
+                    <h3 id="subheading-3">Subheading 3</h3>
+                    <p>Lorem ipsum</p>
+                </div>
+                <h2 id="heading-2">Heading 2</h2>
+                <p>Lorem ipsum</p>
+                <h4 id="subheading-4">Subheading 4</h4>
+                <h2 id="conclusion">Conclusion</h2>
+            </body>
+        </html>
+        "#;
+        let doc = kuchiki::parse_html().one(html_str);
+
+        let toc_vec = get_header_level_toc_vec("index.xhtml", &doc);
+        assert_eq!(1, toc_vec.len());
+
+        let h1_toc = toc_vec.first().unwrap();
+        assert_eq!("Heading 1", h1_toc.title);
+        assert_eq!(3, h1_toc.children.len());
+
+        let first_h2_toc = h1_toc.children.first().unwrap();
+        assert_eq!("Heading 2", first_h2_toc.title);
+        assert_eq!(1, first_h2_toc.children.len());
+
+        let h3_toc = first_h2_toc.children.first().unwrap();
+        assert_eq!("Subheading 3", h3_toc.title);
+        assert_eq!(0, h3_toc.children.len());
+    }
 }

src/errors.rs

@@ -1,3 +1,6 @@
+use std::fmt::{Debug, Display};
+
+use flexi_logger::FlexiLoggerError;
 use thiserror::Error;

 #[derive(Error, Debug)]
@@ -124,3 +127,33 @@ impl From<std::str::Utf8Error> for PaperoniError {
         PaperoniError::with_kind(ErrorKind::UTF8Error(err.to_string()))
     }
 }
+
+#[derive(Debug, Error)]
+pub enum LogError {
+    #[error(transparent)]
+    FlexiError(#[from] FlexiLoggerError),
+    #[error("Unable to get user directories for logging purposes")]
+    UserDirectoriesError,
+    #[error("Can't create log directory: {0}")]
+    CreateLogDirectoryError(#[from] std::io::Error),
+}
+
+#[derive(Debug, Error)]
+pub enum CliError<BuilderError: Debug + Display> {
+    #[error("Failed to open file with urls: {0}")]
+    UrlFileError(#[from] std::io::Error),
+    #[error("Failed to parse max connection value: {0}")]
+    InvalidMaxConnectionCount(#[from] std::num::ParseIntError),
+    #[error("No urls were provided")]
+    NoUrls,
+    #[error("Failed to build cli application: {0}")]
+    AppBuildError(BuilderError),
+    #[error("Invalid output path name for merged epubs: {0}")]
+    InvalidOutputPath(String),
+    #[error("Wrong output directory")]
+    WrongOutputDirectory,
+    #[error("Output directory does not exist")]
+    OutputDirectoryNotExists,
+    #[error("Unable to start logger!\n{0}")]
+    LogError(#[from] LogError),
+}

src/extractor.rs

@ -1,5 +1,4 @@
use std::collections::HashMap; use itertools::Itertools;
use kuchiki::{traits::*, NodeRef}; use kuchiki::{traits::*, NodeRef};
use crate::errors::PaperoniError; use crate::errors::PaperoniError;
@ -7,10 +6,6 @@ use crate::moz_readability::{MetaData, Readability};
pub type ResourceInfo = (String, Option<String>); pub type ResourceInfo = (String, Option<String>);
lazy_static! {
static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r#"(&|<|>|'|")"#).unwrap();
}
pub struct Extractor { pub struct Extractor {
article: Option<NodeRef>, article: Option<NodeRef>,
pub img_urls: Vec<ResourceInfo>, pub img_urls: Vec<ResourceInfo>,
@ -37,6 +32,7 @@ impl Extractor {
let template = r#" let template = r#"
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops"> <html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
<head> <head>
<link rel="stylesheet" href="stylesheet.css" type="text/css"></link>
</head> </head>
<body> <body>
</body> </body>
@ -53,15 +49,19 @@ impl Extractor {
/// Traverses the DOM tree of the content and retrieves the IMG URLs /// Traverses the DOM tree of the content and retrieves the IMG URLs
pub fn extract_img_urls(&mut self) { pub fn extract_img_urls(&mut self) {
if let Some(content_ref) = &self.article { if let Some(content_ref) = &self.article {
for img_ref in content_ref.select("img").unwrap() { self.img_urls = content_ref
img_ref.as_node().as_element().map(|img_elem| { .select("img")
img_elem.attributes.borrow().get("src").map(|img_url| { .unwrap()
if !(img_url.is_empty() || img_url.starts_with("data:image")) { .filter_map(|img_ref| {
self.img_urls.push((img_url.to_string(), None)) let attrs = img_ref.attributes.borrow();
} attrs
.get("src")
.filter(|val| !(val.is_empty() || val.starts_with("data:image")))
.map(ToString::to_string)
}) })
}); .unique()
} .map(|val| (val, None))
.collect();
} }
} }
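
The rewritten `extract_img_urls` collapses the nested `map` closures into a single iterator pipeline and, via `unique()` from the itertools crate, stops recording the same image URL twice. A self-contained sketch of that pipeline over plain values (the inputs here are made up):

    use itertools::Itertools;

    fn collect_img_srcs(srcs: Vec<Option<&str>>) -> Vec<(String, Option<String>)> {
        srcs.into_iter()
            // Keep only non-empty src values that are not inline data URIs.
            .filter_map(|src| {
                src.filter(|val| !(val.is_empty() || val.starts_with("data:image")))
                    .map(ToString::to_string)
            })
            .unique() // itertools: drops duplicate URLs
            .map(|val| (val, None))
            .collect()
    }

    fn main() {
        let srcs = vec![Some("/a.png"), Some("/a.png"), Some("data:image/png;..."), None];
        assert_eq!(collect_img_srcs(srcs), vec![("/a.png".to_string(), None)]);
    }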
@@ -77,59 +77,6 @@ impl Extractor {
} }
} }
/// Serializes a NodeRef to a string that is XHTML compatible
/// The only DOM nodes serialized are Text and Element nodes
pub fn serialize_to_xhtml<W: std::io::Write>(
node_ref: &NodeRef,
mut w: &mut W,
) -> Result<(), PaperoniError> {
let mut escape_map = HashMap::new();
escape_map.insert("<", "&lt;");
escape_map.insert(">", "&gt;");
escape_map.insert("&", "&amp;");
escape_map.insert("\"", "&quot;");
escape_map.insert("'", "&apos;");
for edge in node_ref.traverse_inclusive() {
match edge {
kuchiki::iter::NodeEdge::Start(n) => match n.data() {
kuchiki::NodeData::Text(rc_text) => {
let text = rc_text.borrow();
let esc_text = ESC_SEQ_REGEX
.replace_all(&text, |captures: &regex::Captures| escape_map[&captures[1]]);
write!(&mut w, "{}", esc_text)?;
}
kuchiki::NodeData::Element(elem_data) => {
let attrs = elem_data.attributes.borrow();
let attrs_str = attrs
.map
.iter()
.filter(|(k, _)| &k.local != "\"")
.map(|(k, v)| {
format!(
"{}=\"{}\"",
k.local,
ESC_SEQ_REGEX
.replace_all(&v.value, |captures: &regex::Captures| {
escape_map[&captures[1]]
})
)
})
.fold("".to_string(), |acc, val| acc + " " + &val);
write!(&mut w, "<{}{}>", &elem_data.name.local, attrs_str)?;
}
_ => (),
},
kuchiki::iter::NodeEdge::End(n) => match n.data() {
kuchiki::NodeData::Element(elem_data) => {
write!(&mut w, "</{}>", &elem_data.name.local)?;
}
_ => (),
},
}
}
Ok(())
}
#[cfg(test)] #[cfg(test)]
mod test { mod test {
use super::*; use super::*;

View file

@@ -1,14 +1,72 @@
use async_std::io::prelude::*; use async_std::io::prelude::*;
use async_std::task;
use async_std::{fs::File, stream}; use async_std::{fs::File, stream};
use futures::StreamExt; use futures::StreamExt;
use indicatif::ProgressBar; use indicatif::ProgressBar;
use log::warn;
use log::{debug, info}; use log::{debug, info};
use url::Url; use url::Url;
use crate::cli::AppConfig;
use crate::errors::{ErrorKind, ImgError, PaperoniError}; use crate::errors::{ErrorKind, ImgError, PaperoniError};
use crate::extractor::Extractor; use crate::extractor::Extractor;
type HTMLResource = (String, String); type HTMLResource = (String, String);
pub fn download(
app_config: &AppConfig,
bar: &ProgressBar,
partial_downloads: &mut Vec<PartialDownload>,
errors: &mut Vec<PaperoniError>,
) -> Vec<Extractor> {
task::block_on(async {
let urls_iter = app_config.urls.iter().map(|url| fetch_html(url));
let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn);
let mut articles = Vec::new();
while let Some(fetch_result) = responses.next().await {
match fetch_result {
Ok((url, html)) => {
debug!("Extracting {}", &url);
let mut extractor = Extractor::from_html(&html, &url);
bar.set_message("Extracting...");
match extractor.extract_content() {
Ok(_) => {
extractor.extract_img_urls();
if let Err(img_errors) =
download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar)
.await
{
partial_downloads
.push(PartialDownload::new(&url, extractor.metadata().title()));
warn!(
"{} image{} failed to download for {}",
img_errors.len(),
if img_errors.len() > 1 { "s" } else { "" },
url
);
for img_error in img_errors {
warn!(
"{}\n\t\tReason {}",
img_error.url().as_ref().unwrap(),
img_error
);
}
}
articles.push(extractor);
}
Err(mut e) => {
e.set_article_source(&url);
errors.push(e);
}
}
}
Err(e) => errors.push(e),
}
bar.inc(1);
}
articles
})
}
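
The `download` function moved here from main.rs mostly unchanged; the concurrency it relies on is `stream::from_iter(...).buffered(max_conn)`, which keeps at most `max_conn` fetches in flight and yields each result as it completes. A minimal sketch of that pattern using the futures crate directly (the fetch function is a stand-in, and `block_on` here is `futures::executor::block_on` rather than async-std's):

    use futures::{executor::block_on, stream, StreamExt};

    async fn fetch(url: &str) -> Result<String, String> {
        // Stand-in for a real HTTP request.
        Ok(format!("<html>from {}</html>", url))
    }

    fn main() {
        let urls = vec!["https://example.com/a", "https://example.com/b"];
        let bodies: Vec<Result<String, String>> = block_on(
            stream::iter(urls.iter().map(|u| fetch(u)))
                .buffered(2) // at most two requests in flight at a time
                .collect(),
        );
        assert_eq!(bodies.len(), 2);
    }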
pub async fn fetch_html(url: &str) -> Result<HTMLResource, PaperoniError> { pub async fn fetch_html(url: &str) -> Result<HTMLResource, PaperoniError> {
let client = surf::Client::new(); let client = surf::Client::new();
debug!("Fetching {}", url); debug!("Fetching {}", url);
@@ -153,7 +211,11 @@ pub async fn download_images(
}) })
.enumerate() .enumerate()
.map(|(img_idx, (url, req))| async move { .map(|(img_idx, (url, req))| async move {
bar.set_message(format!("Downloading images [{}/{}]", img_idx + 1, img_count).as_str()); bar.set_message(format!(
"Downloading images [{}/{}]",
img_idx + 1,
img_count
));
match req.await { match req.await {
Ok(mut img_response) => { Ok(mut img_response) => {
let process_response = let process_response =
@@ -206,6 +268,20 @@ pub async fn download_images(
} }
} }
pub struct PartialDownload {
pub link: String,
pub title: String,
}
impl PartialDownload {
pub fn new(link: &str, title: &str) -> Self {
Self {
link: link.into(),
title: title.into(),
}
}
}
/// Handles getting the extension from a given MIME subtype. /// Handles getting the extension from a given MIME subtype.
fn map_mime_subtype_to_ext(subtype: &str) -> &str { fn map_mime_subtype_to_ext(subtype: &str) -> &str {
if subtype == ("svg+xml") { if subtype == ("svg+xml") {
@@ -234,9 +310,9 @@ fn get_absolute_url(url: &str, request_url: &Url) -> String {
.unwrap() .unwrap()
.join(url) .join(url)
.unwrap() .unwrap()
.into_string() .into()
} else { } else {
request_url.join(url).unwrap().into_string() request_url.join(url).unwrap().into()
} }
} }
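
The `.into_string()` to `.into()` change tracks the url crate, where `Url::into_string` was deprecated in favor of the plain `Into<String>` conversion (`impl From<Url> for String`). The join-then-convert pattern in isolation:

    use url::Url;

    fn absolutize(base: &str, href: &str) -> String {
        let base = Url::parse(base).expect("valid base URL");
        // join() resolves the reference against the base;
        // the final conversion goes through From<Url> for String.
        base.join(href).expect("valid reference").into()
    }

    fn main() {
        assert_eq!(
            absolutize("https://example.com/articles/one", "/img/cover.png"),
            "https://example.com/img/cover.png"
        );
    }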

View file

@@ -1,18 +1,21 @@
use std::fs;
use chrono::{DateTime, Local};
use colored::*; use colored::*;
use comfy_table::presets::UTF8_HORIZONTAL_BORDERS_ONLY; use comfy_table::presets::UTF8_HORIZONTAL_BORDERS_ONLY;
use comfy_table::{Cell, CellAlignment, ContentArrangement, Table}; use comfy_table::{Cell, CellAlignment, ContentArrangement, Table};
use directories::UserDirs; use flexi_logger::{FileSpec, LevelFilter};
use flexi_logger::LogSpecBuilder;
use log::error; use log::error;
use crate::{cli::AppConfig, errors::PaperoniError}; use crate::errors::PaperoniError;
pub fn display_summary( pub fn display_summary(
initial_article_count: usize, initial_article_count: usize,
succesful_articles_table: Table, succesful_articles_table: Table,
partial_downloads_count: usize, partial_downloads: Vec<PartialDownload>,
errors: Vec<PaperoniError>, errors: Vec<PaperoniError>,
) { ) {
let partial_downloads_count = partial_downloads.len();
let successfully_downloaded_count = let successfully_downloaded_count =
initial_article_count - partial_downloads_count - errors.len(); initial_article_count - partial_downloads_count - errors.len();
@@ -30,6 +33,24 @@ pub fn display_summary(
if successfully_downloaded_count > 0 { if successfully_downloaded_count > 0 {
println!("{}", succesful_articles_table); println!("{}", succesful_articles_table);
} }
if partial_downloads_count > 0 {
println!("\n{}", "Partially failed downloads".yellow().bold());
let mut table_partial = Table::new();
table_partial
.load_preset(UTF8_HORIZONTAL_BORDERS_ONLY)
.set_header(vec![
Cell::new("Link").set_alignment(CellAlignment::Center),
Cell::new("Title").set_alignment(CellAlignment::Center),
])
.set_content_arrangement(ContentArrangement::Dynamic);
for partial in partial_downloads {
table_partial.add_row(vec![&partial.link, &partial.title]);
}
println!("{}", table_partial);
}
if !errors.is_empty() { if !errors.is_empty() {
println!("\n{}", "Failed article downloads".bright_red().bold()); println!("\n{}", "Failed article downloads".bright_red().bold());
let mut table_failed = Table::new(); let mut table_failed = Table::new();
@@ -55,77 +76,56 @@ pub fn display_summary(
/// Returns a string summary of the total number of failed and successful article downloads /// Returns a string summary of the total number of failed and successful article downloads
fn short_summary(download_count: DownloadCount) -> String { fn short_summary(download_count: DownloadCount) -> String {
// TODO: Refactor this
if download_count.total if download_count.total
!= download_count.successful + download_count.failed + download_count.partial != download_count.successful + download_count.failed + download_count.partial
{ {
panic!("initial_count must be equal to the sum of failed and successful count") panic!("initial_count must be equal to the sum of failed and successful count")
} }
let get_noun = |count: usize| if count == 1 { "article" } else { "articles" }; let get_noun = |count: usize| if count == 1 { "article" } else { "articles" };
if download_count.successful == download_count.total && download_count.successful == 1 { let get_summary = |count, label, color: Color| {
"Article downloaded successfully".green().to_string() if count == 0 {
} else if download_count.total == download_count.failed && download_count.failed == 1 { return "".to_string();
"Article failed to download".red().to_string() };
} else if download_count.total == download_count.partial && download_count.partial == 1 {
"Article partially failed to download".yellow().to_string()
} else if download_count.successful == download_count.total {
"All articles downloaded successfully".green().to_string()
} else if download_count.failed == download_count.total {
"All articles failed to download".red().to_string()
} else if download_count.partial == download_count.total {
"All articles partially failed to download"
.yellow()
.to_string()
} else if download_count.partial == 0 {
format!(
"{} {} downloaded successfully, {} {} failed",
download_count.successful,
get_noun(download_count.successful),
download_count.failed,
get_noun(download_count.failed)
)
.yellow()
.to_string()
} else if download_count.successful == 0
&& download_count.partial > 0
&& download_count.failed > 0
{ {
format!( if count == 1 && count == download_count.total {
"{} {} partially failed to download, {} {} failed", "Article".to_string() + label
download_count.partial, } else if count == download_count.total {
get_noun(download_count.partial), "All ".to_string() + get_noun(count) + label
download_count.failed,
get_noun(download_count.failed)
)
.yellow()
.to_string()
} else if download_count.failed == 0
&& download_count.successful > 0
&& download_count.partial > 0
{
format!(
"{} {} downloaded successfully, {} {} partially failed to download",
download_count.successful,
get_noun(download_count.successful),
download_count.partial,
get_noun(download_count.partial)
)
.yellow()
.to_string()
} else { } else {
format!( count.to_string() + " " + get_noun(count) + label
"{} {} downloaded successfully, {} {} partially failed to download, {} {} failed",
download_count.successful,
get_noun(download_count.successful),
download_count.partial,
get_noun(download_count.partial),
download_count.failed,
get_noun(download_count.failed)
)
.yellow()
.to_string()
} }
} }
.color(color)
.to_string()
};
let mut summary = get_summary(
download_count.successful,
" downloaded successfully",
Color::BrightGreen,
);
let partial_summary = get_summary(
download_count.partial,
" partially failed to download",
Color::Yellow,
);
if !summary.is_empty() && !partial_summary.is_empty() {
summary = summary + ", " + &partial_summary;
} else {
summary = summary + &partial_summary;
}
let failed_summary = get_summary(download_count.failed, " failed to download", Color::Red);
if !summary.is_empty() && !failed_summary.is_empty() {
summary = summary + ", " + &failed_summary;
} else {
summary = summary + &failed_summary;
}
summary
}
struct DownloadCount { struct DownloadCount {
total: usize, total: usize,
@@ -144,44 +144,43 @@ impl DownloadCount {
} }
} }
pub fn init_logger(app_config: &AppConfig) { use crate::errors::LogError as Error;
use crate::http::PartialDownload;
pub fn init_logger(
log_level: LevelFilter,
start_time: &DateTime<Local>,
is_logging_to_file: bool,
) -> Result<(), Error> {
use directories::UserDirs;
use flexi_logger::LogSpecBuilder;
match UserDirs::new() { match UserDirs::new() {
Some(user_dirs) => { Some(user_dirs) => {
let home_dir = user_dirs.home_dir(); let home_dir = user_dirs.home_dir();
let paperoni_dir = home_dir.join(".paperoni"); let paperoni_dir = home_dir.join(".paperoni");
let log_dir = paperoni_dir.join("logs"); let log_dir = paperoni_dir.join("logs");
let log_spec = LogSpecBuilder::new() let log_spec = LogSpecBuilder::new().module("paperoni", log_level).build();
.module("paperoni", app_config.log_level()) let formatted_timestamp = start_time.format("%Y-%m-%d_%H-%M-%S");
.build();
let formatted_timestamp = app_config.start_time().format("%Y-%m-%d_%H-%M-%S");
let mut logger = flexi_logger::Logger::with(log_spec); let mut logger = flexi_logger::Logger::with(log_spec);
if app_config.is_logging_to_file() && (!paperoni_dir.is_dir() || !log_dir.is_dir()) { if is_logging_to_file {
match std::fs::create_dir_all(&log_dir) { if !paperoni_dir.is_dir() || !log_dir.is_dir() {
Ok(_) => (), fs::create_dir_all(&log_dir)?;
Err(e) => {
eprintln!("Unable to create paperoni directories on home directory for logging purposes\n{}",e);
std::process::exit(1);
} }
}; logger = logger.log_to_file(
} FileSpec::default()
if app_config.is_logging_to_file() {
logger = logger
.directory(log_dir) .directory(log_dir)
.discriminant(formatted_timestamp.to_string()) .discriminant(formatted_timestamp.to_string())
.suppress_timestamp() .suppress_timestamp(),
.log_to_file(); );
} }
logger.start()?;
match logger.start() { Ok(())
Ok(_) => (),
Err(e) => eprintln!("Unable to start logger!\n{}", e),
} }
None => Err(Error::UserDirectoriesError),
} }
None => eprintln!("Unable to get user directories for logging purposes"),
};
} }
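
Besides returning `Result<(), LogError>` instead of printing and exiting, `init_logger` now follows the newer flexi_logger API, where file-output settings hang off a `FileSpec` passed to `log_to_file` rather than off the `Logger` builder itself. A condensed sketch of that shape (directory and discriminant values are illustrative; as in the code above, the handle returned by `start()` is discarded):

    use flexi_logger::{FileSpec, LevelFilter, Logger, LogSpecBuilder};

    fn start_file_logger(log_level: LevelFilter) -> Result<(), flexi_logger::FlexiLoggerError> {
        let spec = LogSpecBuilder::new().module("paperoni", log_level).build();
        Logger::with(spec)
            .log_to_file(
                FileSpec::default()
                    .directory("logs")           // where the log file lands
                    .discriminant("2021-06-24")  // tells this run's file apart
                    .suppress_timestamp(),       // drop the default timestamp suffix
            )
            .start()?;
        Ok(())
    }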
#[cfg(test)] #[cfg(test)]
@@ -192,7 +191,7 @@ mod tests {
fn test_short_summary() { fn test_short_summary() {
assert_eq!( assert_eq!(
short_summary(DownloadCount::new(1, 1, 0, 0)), short_summary(DownloadCount::new(1, 1, 0, 0)),
"Article downloaded successfully".green().to_string() "Article downloaded successfully".bright_green().to_string()
); );
assert_eq!( assert_eq!(
short_summary(DownloadCount::new(1, 0, 0, 1)), short_summary(DownloadCount::new(1, 0, 0, 1)),
@@ -200,7 +199,9 @@ mod tests {
); );
assert_eq!( assert_eq!(
short_summary(DownloadCount::new(10, 10, 0, 0)), short_summary(DownloadCount::new(10, 10, 0, 0)),
"All articles downloaded successfully".green().to_string() "All articles downloaded successfully"
.bright_green()
.to_string()
); );
assert_eq!( assert_eq!(
short_summary(DownloadCount::new(10, 0, 0, 10)), short_summary(DownloadCount::new(10, 0, 0, 10)),
@@ -208,39 +209,52 @@ mod tests {
); );
assert_eq!( assert_eq!(
short_summary(DownloadCount::new(10, 8, 0, 2)), short_summary(DownloadCount::new(10, 8, 0, 2)),
"8 articles downloaded successfully, 2 articles failed" format!(
.yellow() "{}, {}",
.to_string() "8 articles downloaded successfully".bright_green(),
"2 articles failed to download".red()
)
); );
assert_eq!( assert_eq!(
short_summary(DownloadCount::new(10, 1, 0, 9)), short_summary(DownloadCount::new(10, 1, 0, 9)),
"1 article downloaded successfully, 9 articles failed" format!(
.yellow() "{}, {}",
.to_string() "1 article downloaded successfully".bright_green(),
"9 articles failed to download".red()
)
); );
assert_eq!( assert_eq!(
short_summary(DownloadCount::new(7, 6, 0, 1)), short_summary(DownloadCount::new(7, 6, 0, 1)),
"6 articles downloaded successfully, 1 article failed" format!(
.yellow() "{}, {}",
.to_string() "6 articles downloaded successfully".bright_green(),
"1 article failed to download".red()
)
); );
assert_eq!( assert_eq!(
short_summary(DownloadCount::new(7, 4, 2, 1)), short_summary(DownloadCount::new(7, 4, 2, 1)),
"4 articles downloaded successfully, 2 articles partially failed to download, 1 article failed" format!(
.yellow() "{}, {}, {}",
.to_string() "4 articles downloaded successfully".bright_green(),
"2 articles partially failed to download".yellow(),
"1 article failed to download".red()
)
); );
assert_eq!( assert_eq!(
short_summary(DownloadCount::new(12, 6, 6, 0)), short_summary(DownloadCount::new(12, 6, 6, 0)),
"6 articles downloaded successfully, 6 articles partially failed to download" format!(
.yellow() "{}, {}",
.to_string() "6 articles downloaded successfully".bright_green(),
"6 articles partially failed to download".yellow()
)
); );
assert_eq!( assert_eq!(
short_summary(DownloadCount::new(5, 0, 4, 1)), short_summary(DownloadCount::new(5, 0, 4, 1)),
"4 articles partially failed to download, 1 article failed" format!(
.yellow() "{}, {}",
.to_string() "4 articles partially failed to download".yellow(),
"1 article failed to download".red()
)
); );
assert_eq!( assert_eq!(
short_summary(DownloadCount::new(4, 0, 4, 0)), short_summary(DownloadCount::new(4, 0, 4, 0)),

View file

@@ -1,14 +1,12 @@
#[macro_use] #[macro_use]
extern crate lazy_static; extern crate lazy_static;
use async_std::stream; use std::process::exit;
use async_std::task;
use comfy_table::presets::{UTF8_FULL, UTF8_HORIZONTAL_BORDERS_ONLY}; use comfy_table::presets::{UTF8_FULL, UTF8_HORIZONTAL_BORDERS_ONLY};
use comfy_table::{ContentArrangement, Table}; use comfy_table::{ContentArrangement, Table};
use futures::stream::StreamExt; use http::download;
use indicatif::{ProgressBar, ProgressStyle}; use indicatif::{ProgressBar, ProgressStyle};
use log::{debug, warn};
use url::Url;
mod cli; mod cli;
mod epub; mod epub;
@@ -22,25 +20,39 @@ mod moz_readability;
use cli::AppConfig; use cli::AppConfig;
use epub::generate_epubs; use epub::generate_epubs;
use extractor::Extractor;
use http::{download_images, fetch_html};
use logs::display_summary; use logs::display_summary;
fn main() { fn main() {
let app_config = cli::cli_init(); let app_config = match cli::AppConfig::init_with_cli() {
Ok(app_config) => app_config,
Err(err) => {
eprintln!("{}", err);
exit(1);
}
};
if !app_config.urls().is_empty() { if !app_config.urls.is_empty() {
download(app_config); run(app_config);
} }
} }
fn download(app_config: AppConfig) { fn run(app_config: AppConfig) {
let mut errors = Vec::new(); let mut errors = Vec::new();
let mut partial_download_count: usize = 0; let mut partial_downloads = Vec::new();
let bar = if app_config.can_disable_progress_bar() {
if let Some(dir_name) = &app_config.output_directory {
let noun = if app_config.urls.len() > 1 {
"articles"
} else {
"article"
};
println!("Downloading {} to {}", noun, dir_name);
}
let bar = if app_config.can_disable_progress_bar {
ProgressBar::hidden() ProgressBar::hidden()
} else { } else {
let enabled_bar = ProgressBar::new(app_config.urls().len() as u64); let enabled_bar = ProgressBar::new(app_config.urls.len() as u64);
let style = ProgressStyle::default_bar().template( let style = ProgressStyle::default_bar().template(
"{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} link {pos}/{len:7} {msg:.yellow/white}", "{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} link {pos}/{len:7} {msg:.yellow/white}",
); );
@@ -48,52 +60,8 @@ fn download(app_config: AppConfig) {
enabled_bar.enable_steady_tick(500); enabled_bar.enable_steady_tick(500);
enabled_bar enabled_bar
}; };
let articles = task::block_on(async {
let urls_iter = app_config.urls().iter().map(|url| fetch_html(url)); let articles = download(&app_config, &bar, &mut partial_downloads, &mut errors);
let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn());
let mut articles = Vec::new();
while let Some(fetch_result) = responses.next().await {
match fetch_result {
Ok((url, html)) => {
debug!("Extracting {}", &url);
let mut extractor = Extractor::from_html(&html, &url);
bar.set_message("Extracting...");
match extractor.extract_content() {
Ok(_) => {
extractor.extract_img_urls();
if let Err(img_errors) =
download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar)
.await
{
partial_download_count += 1;
warn!(
"{} image{} failed to download for {}",
img_errors.len(),
if img_errors.len() > 1 { "s" } else { "" },
url
);
for img_error in img_errors {
warn!(
"{}\n\t\tReason {}",
img_error.url().as_ref().unwrap(),
img_error
);
}
}
articles.push(extractor);
}
Err(mut e) => {
e.set_article_source(&url);
errors.push(e);
}
}
}
Err(e) => errors.push(e),
}
bar.inc(1);
}
articles
});
bar.finish_with_message("Downloaded articles"); bar.finish_with_message("Downloaded articles");
let mut succesful_articles_table = Table::new(); let mut succesful_articles_table = Table::new();
@@ -107,19 +75,24 @@ fn download(app_config: AppConfig) {
errors.extend(gen_epub_errors); errors.extend(gen_epub_errors);
} }
}; };
let has_errors = !errors.is_empty();
let has_errors = !errors.is_empty() || !partial_downloads.is_empty();
display_summary( display_summary(
app_config.urls().len(), app_config.urls.len(),
succesful_articles_table, succesful_articles_table,
partial_download_count, partial_downloads,
errors, errors,
); );
if app_config.is_logging_to_file() {
if app_config.is_logging_to_file {
println!( println!(
"Log written to paperoni_{}.log\n", "Log written to paperoni_{}.log\n",
app_config.start_time().format("%Y-%m-%d_%H-%M-%S") app_config.start_time.format("%Y-%m-%d_%H-%M-%S")
); );
} else if has_errors && !app_config.is_logging_to_file {
println!("\nRun paperoni with the --log-to-file flag to create a log file");
} }
if has_errors { if has_errors {
std::process::exit(1); std::process::exit(1);
} }

View file

@@ -659,10 +659,24 @@ impl Readability {
.map(|node_ref| { .map(|node_ref| {
let node_attrs = node_ref.attributes.borrow(); let node_attrs = node_ref.attributes.borrow();
let href = node_attrs.get("href").unwrap(); let href = node_attrs.get("href").unwrap();
if href.trim() == "/" {
document_uri.join("/").unwrap() match Url::parse(href) {
} else { Ok(url) => url,
Url::parse(href).unwrap() Err(e) => match e {
url::ParseError::RelativeUrlWithoutBase => {
match document_uri.join(href) {
Ok(joined_url) => joined_url,
Err(e) => panic!(
"{:} unable to parse url {:?} on element {}",
e, href, &node_ref.name.local
),
}
}
e => panic!(
"{:} unable to parse url {:?} on element {}",
e, href, &node_ref.name.local
),
},
} }
}) })
.next() .next()
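
This change makes the canonical-link lookup tolerate relative `href` values: `Url::parse` rejects those with `ParseError::RelativeUrlWithoutBase`, and the new code retries by joining against the document URI. The fallback on its own:

    use url::{ParseError, Url};

    fn resolve(document_uri: &Url, href: &str) -> Result<Url, ParseError> {
        match Url::parse(href) {
            Ok(url) => Ok(url), // href was already absolute
            // A bare path like "/canonical" has no scheme or host,
            // so parse it relative to the page's own URL instead.
            Err(ParseError::RelativeUrlWithoutBase) => document_uri.join(href),
            Err(e) => Err(e),
        }
    }

    fn main() {
        let base = Url::parse("https://example.com/post/1").unwrap();
        let url = resolve(&base, "/canonical").unwrap();
        assert_eq!(url.as_str(), "https://example.com/canonical");
    }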
@@ -1609,13 +1623,11 @@ impl Readability {
// // class name "comment", etc), and turn divs into P tags where they have been // // class name "comment", etc), and turn divs into P tags where they have been
// // used inappropriately (as in, where they contain no other block level elements.) // // used inappropriately (as in, where they contain no other block level elements.)
let mut elements_to_score: Vec<NodeRef> = Vec::new(); let mut elements_to_score: Vec<NodeRef> = Vec::new();
let mut node = Some( let mut node = self
self.root_node .root_node
.select_first("html") .select_first("html")
.unwrap() .ok()
.as_node() .map(|n| n.as_node().clone());
.clone(),
);
while let Some(node_ref) = node { while let Some(node_ref) = node {
let node_elem = node_ref.as_element().unwrap(); let node_elem = node_ref.as_element().unwrap();