Merge pull request #10 from hipstermojo/dev

v0.4.0 release
This commit is contained in:
Kenneth Gitere 2021-04-30 08:48:11 +03:00 committed by GitHub
commit 474d97c6bd
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 1424 additions and 281 deletions

3
.gitignore vendored
View file

@ -1,2 +1,3 @@
/target
*.epub
*.epub
*.log

480
Cargo.lock generated
View file

@ -126,12 +126,15 @@ dependencies = [
[[package]]
name = "async-global-executor"
version = "1.4.3"
version = "2.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73079b49cd26b8fd5a15f68fc7707fc78698dc2a3d61430f2a7a9430230dfa04"
checksum = "9586ec52317f36de58453159d48351bc244bc24ced3effc1fce22f3d48664af6"
dependencies = [
"async-channel",
"async-executor",
"async-io",
"async-mutex",
"blocking",
"futures-lite",
"num_cpus",
"once_cell",
@ -147,7 +150,7 @@ dependencies = [
"fastrand",
"futures-lite",
"libc",
"log 0.4.11",
"log 0.4.14",
"nb-connect",
"once_cell",
"parking",
@ -157,6 +160,15 @@ dependencies = [
"winapi",
]
[[package]]
name = "async-lock"
version = "2.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6a8ea61bf9947a1007c5cada31e647dbc77b103c679858150003ba697ea798b"
dependencies = [
"event-listener",
]
[[package]]
name = "async-mutex"
version = "1.4.0"
@ -168,14 +180,14 @@ dependencies = [
[[package]]
name = "async-std"
version = "1.7.0"
version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a7e82538bc65a25dbdff70e4c5439d52f068048ab97cdea0acd73f131594caa1"
checksum = "d9f06685bad74e0570f5213741bea82158279a4103d988e57bfada11ad230341"
dependencies = [
"async-channel",
"async-global-executor",
"async-io",
"async-mutex",
"blocking",
"async-lock",
"crossbeam-utils",
"futures-channel",
"futures-core",
@ -183,11 +195,11 @@ dependencies = [
"futures-lite",
"gloo-timers",
"kv-log-macro",
"log 0.4.11",
"log 0.4.14",
"memchr",
"num_cpus",
"once_cell",
"pin-project-lite 0.1.11",
"pin-project-lite 0.2.4",
"pin-utils",
"slab",
"wasm-bindgen-futures",
@ -394,6 +406,28 @@ dependencies = [
"vec_map",
]
[[package]]
name = "colored"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b3616f750b84d8f0de8a58bda93e08e2a81ad3f523089b05f1dffecab48c6cbd"
dependencies = [
"atty",
"lazy_static",
"winapi",
]
[[package]]
name = "comfy-table"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "17b99e9022e080d384b58d8eaf5976b42a311ff7a9669f8200eb2453c0b2b81a"
dependencies = [
"crossterm",
"strum",
"strum_macros",
]
[[package]]
name = "concurrent-queue"
version = "1.2.2"
@ -403,6 +437,21 @@ dependencies = [
"cache-padded",
]
[[package]]
name = "console"
version = "0.14.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3993e6445baa160675931ec041a5e03ca84b9c6e32a056150d3aa2bdda0a1f45"
dependencies = [
"encode_unicode",
"lazy_static",
"libc",
"regex",
"terminal_size",
"unicode-width",
"winapi",
]
[[package]]
name = "const_fn"
version = "0.4.3"
@ -453,6 +502,31 @@ dependencies = [
"lazy_static",
]
[[package]]
name = "crossterm"
version = "0.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c36c10130df424b2f3552fcc2ddcd9b28a27b1e54b358b45874f88d1ca6888c"
dependencies = [
"bitflags",
"crossterm_winapi",
"lazy_static",
"libc",
"mio",
"parking_lot",
"signal-hook",
"winapi",
]
[[package]]
name = "crossterm_winapi"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0da8964ace4d3e4a044fd027919b2237000b24315a37c916f61809f1ff2140b9"
dependencies = [
"winapi",
]
[[package]]
name = "crypto-mac"
version = "0.10.0"
@ -490,6 +564,16 @@ dependencies = [
"syn",
]
[[package]]
name = "ctor"
version = "0.1.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7fbaabec2c953050352311293be5c6aba8e141ba19d6811862b232d6fd020484"
dependencies = [
"quote",
"syn",
]
[[package]]
name = "ctr"
version = "0.6.0"
@ -530,6 +614,16 @@ dependencies = [
"winapi",
]
[[package]]
name = "dashmap"
version = "4.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e77a43b28d0668df09411cb0bc9a8c2adc40f9a048afe863e05fd43251e8e39c"
dependencies = [
"cfg-if 1.0.0",
"num_cpus",
]
[[package]]
name = "data-encoding"
version = "2.3.1"
@ -556,6 +650,26 @@ dependencies = [
"generic-array",
]
[[package]]
name = "directories"
version = "3.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e69600ff1703123957937708eb27f7a564e48885c537782722ed0ba3189ce1d7"
dependencies = [
"dirs-sys",
]
[[package]]
name = "dirs-sys"
version = "0.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "03d86534ed367a67548dc68113a0f5db55432fdfbb6e6f9d77704397d95d5780"
dependencies = [
"libc",
"redox_users",
"winapi",
]
[[package]]
name = "discard"
version = "1.0.4"
@ -577,6 +691,12 @@ dependencies = [
"dtoa",
]
[[package]]
name = "encode_unicode"
version = "0.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f"
[[package]]
name = "encoding_rs"
version = "0.8.26"
@ -640,6 +760,22 @@ dependencies = [
"miniz_oxide 0.3.7",
]
[[package]]
name = "flexi_logger"
version = "0.17.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "33ab94b6ac8eb69f1496a6993f26f785b5fd6d99b7416023eb2a6175c0b242b1"
dependencies = [
"atty",
"chrono",
"glob",
"lazy_static",
"log 0.4.14",
"regex",
"thiserror",
"yansi",
]
[[package]]
name = "flume"
version = "0.9.2"
@ -685,9 +821,9 @@ dependencies = [
[[package]]
name = "futures"
version = "0.3.12"
version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da9052a1a50244d8d5aa9bf55cbc2fb6f357c86cc52e46c62ed390a7180cf150"
checksum = "a9d5813545e459ad3ca1bff9915e9ad7f1a47dc6a91b627ce321d5863b7dd253"
dependencies = [
"futures-channel",
"futures-core",
@ -700,9 +836,9 @@ dependencies = [
[[package]]
name = "futures-channel"
version = "0.3.12"
version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f2d31b7ec7efab6eefc7c57233bb10b847986139d88cc2f5a02a1ae6871a1846"
checksum = "ce79c6a52a299137a6013061e0cf0e688fce5d7f1bc60125f520912fdb29ec25"
dependencies = [
"futures-core",
"futures-sink",
@ -710,15 +846,15 @@ dependencies = [
[[package]]
name = "futures-core"
version = "0.3.12"
version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79e5145dde8da7d1b3892dad07a9c98fc04bc39892b1ecc9692cf53e2b780a65"
checksum = "098cd1c6dda6ca01650f1a37a794245eb73181d0d4d4e955e2f3c37db7af1815"
[[package]]
name = "futures-executor"
version = "0.3.12"
version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e9e59fdc009a4b3096bf94f740a0f2424c082521f20a9b08c5c07c48d90fd9b9"
checksum = "10f6cb7042eda00f0049b1d2080aa4b93442997ee507eb3828e8bd7577f94c9d"
dependencies = [
"futures-core",
"futures-task",
@ -727,9 +863,9 @@ dependencies = [
[[package]]
name = "futures-io"
version = "0.3.12"
version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28be053525281ad8259d47e4de5de657b25e7bac113458555bb4b70bc6870500"
checksum = "365a1a1fb30ea1c03a830fdb2158f5236833ac81fa0ad12fe35b29cddc35cb04"
[[package]]
name = "futures-lite"
@ -748,9 +884,9 @@ dependencies = [
[[package]]
name = "futures-macro"
version = "0.3.12"
version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c287d25add322d9f9abdcdc5927ca398917996600182178774032e9f8258fedd"
checksum = "668c6733a182cd7deb4f1de7ba3bf2120823835b3bcfbeacf7d2c4a773c1bb8b"
dependencies = [
"proc-macro-hack",
"proc-macro2",
@ -760,24 +896,21 @@ dependencies = [
[[package]]
name = "futures-sink"
version = "0.3.12"
version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "caf5c69029bda2e743fddd0582d1083951d65cc9539aebf8812f36c3491342d6"
checksum = "5c5629433c555de3d82861a7a4e3794a4c40040390907cfbfd7143a92a426c23"
[[package]]
name = "futures-task"
version = "0.3.12"
version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13de07eb8ea81ae445aca7b69f5f7bf15d7bf4912d8ca37d6645c77ae8a58d86"
dependencies = [
"once_cell",
]
checksum = "ba7aa51095076f3ba6d9a1f702f74bd05ec65f555d70d2033d55ba8d69f581bc"
[[package]]
name = "futures-util"
version = "0.3.12"
version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "632a8cd0f2a4b3fdea1657f08bde063848c3bd00f9bbf6e256b8be78802e624b"
checksum = "3c144ad54d60f23927f0a6b6d816e4271278b64f005ad65e4e35291d2de9c025"
dependencies = [
"futures-channel",
"futures-core",
@ -823,6 +956,17 @@ dependencies = [
"wasi 0.9.0+wasi-snapshot-preview1",
]
[[package]]
name = "getrandom"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c9495705279e7140bf035dde1f6e750c162df8b625267cd52cc44e0b156732c8"
dependencies = [
"cfg-if 1.0.0",
"libc",
"wasi 0.10.0+wasi-snapshot-preview1",
]
[[package]]
name = "ghash"
version = "0.3.0"
@ -838,6 +982,12 @@ version = "0.23.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6503fe142514ca4799d4c26297c4248239fe8838d827db6bd6065c6ed29a6ce"
[[package]]
name = "glob"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574"
[[package]]
name = "gloo-timers"
version = "0.2.1"
@ -851,6 +1001,15 @@ dependencies = [
"web-sys",
]
[[package]]
name = "heck"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "87cbf45460356b7deeb5e3415b5563308c0a9b057c85e12b06ad551f98d0a6ac"
dependencies = [
"unicode-segmentation",
]
[[package]]
name = "hermit-abi"
version = "0.1.17"
@ -895,7 +1054,7 @@ version = "0.25.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aafcf38a1a36118242d29b92e1b08ef84e67e4a5ed06e0a80be20e6a32bfed6b"
dependencies = [
"log 0.4.11",
"log 0.4.14",
"mac",
"markup5ever",
"proc-macro2",
@ -916,15 +1075,17 @@ dependencies = [
[[package]]
name = "http-client"
version = "6.2.0"
version = "6.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "010092b71b94ee49293995625ce7a607778b8b4099c8088fa84fd66bd3e0f21c"
checksum = "5566ecc26bc6b04e773e680d66141fced78e091ad818e420d726c152b05a64ff"
dependencies = [
"async-std",
"async-trait",
"cfg-if 1.0.0",
"dashmap",
"http-types",
"isahc",
"log 0.4.11",
"log 0.4.14",
]
[[package]]
@ -960,6 +1121,18 @@ dependencies = [
"unicode-normalization",
]
[[package]]
name = "indicatif"
version = "0.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7baab56125e25686df467fe470785512329883aab42696d661247aca2a2896e4"
dependencies = [
"console",
"lazy_static",
"number_prefix",
"regex",
]
[[package]]
name = "infer"
version = "0.2.3"
@ -988,7 +1161,7 @@ dependencies = [
"flume",
"futures-lite",
"http",
"log 0.4.11",
"log 0.4.14",
"once_cell",
"slab",
"sluice",
@ -1031,7 +1204,7 @@ version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0de8b303297635ad57c9f5059fd9cee7a47f8e8daa09df0fcd07dd39fb22977f"
dependencies = [
"log 0.4.11",
"log 0.4.14",
]
[[package]]
@ -1042,9 +1215,9 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]]
name = "libc"
version = "0.2.80"
version = "0.2.93"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4d58d1b70b004888f764dfbf6a26a3b0342a1632d33968e4a179d8011c760614"
checksum = "9385f66bf6105b241aa65a61cb923ef20efc665cb9f9bb50ac2f0c4b7f378d41"
[[package]]
name = "libnghttp2-sys"
@ -1083,16 +1256,17 @@ version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e19e8d5c34a3e0e2223db8e060f9e8264aeeb5c5fc64a4ee9965c062211c024b"
dependencies = [
"log 0.4.11",
"log 0.4.14",
]
[[package]]
name = "log"
version = "0.4.11"
version = "0.4.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fabed175da42fed1fa0746b0ea71f412aa9d35e76e95e59b192c64b9dc2bf8b"
checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710"
dependencies = [
"cfg-if 0.1.10",
"cfg-if 1.0.0",
"value-bag",
]
[[package]]
@ -1107,7 +1281,7 @@ version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aae38d669396ca9b707bfc3db254bc382ddb94f57cc5c235f34623a669a01dab"
dependencies = [
"log 0.4.11",
"log 0.4.14",
"phf",
"phf_codegen",
"serde",
@ -1171,6 +1345,28 @@ dependencies = [
"autocfg",
]
[[package]]
name = "mio"
version = "0.7.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cf80d3e903b34e0bd7282b218398aec54e082c840d9baf8339e0080a0c542956"
dependencies = [
"libc",
"log 0.4.14",
"miow",
"ntapi",
"winapi",
]
[[package]]
name = "miow"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9f1c5b025cda876f66ef43a113f91ebc9f4ccef34843000e0adf6ebbab84e21"
dependencies = [
"winapi",
]
[[package]]
name = "mustache"
version = "0.9.0"
@ -1203,6 +1399,15 @@ version = "0.1.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb"
[[package]]
name = "ntapi"
version = "0.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f6bb902e437b6d86e03cce10a7e2af662292c5dfef23b65899ea3ac9354ad44"
dependencies = [
"winapi",
]
[[package]]
name = "num-integer"
version = "0.1.44"
@ -1232,6 +1437,12 @@ dependencies = [
"libc",
]
[[package]]
name = "number_prefix"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "17b02fc0ff9a9e4b35b3342880f48e896ebf69f2967921fe8646bf5b7125956a"
[[package]]
name = "object"
version = "0.22.0"
@ -1271,18 +1482,26 @@ dependencies = [
[[package]]
name = "paperoni"
version = "0.3.0-alpha1"
version = "0.4.0-alpha1"
dependencies = [
"async-std",
"chrono",
"clap",
"colored",
"comfy-table",
"directories",
"epub-builder",
"flexi_logger",
"futures",
"html5ever",
"indicatif",
"kuchiki",
"lazy_static",
"log 0.4.14",
"md5",
"regex",
"surf",
"thiserror",
"url",
]
@ -1292,6 +1511,31 @@ version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "427c3892f9e783d91cc128285287e70a59e206ca452770ece88a76f7a3eddd72"
[[package]]
name = "parking_lot"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d7744ac029df22dca6284efe4e898991d28e3085c706c972bcd7da4a27a15eb"
dependencies = [
"instant",
"lock_api",
"parking_lot_core",
]
[[package]]
name = "parking_lot_core"
version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa7a782938e745763fe6907fc6ba86946d72f49fe7e21de074e08128a99fb018"
dependencies = [
"cfg-if 1.0.0",
"instant",
"libc",
"redox_syscall 0.2.6",
"smallvec",
"winapi",
]
[[package]]
name = "percent-encoding"
version = "2.1.0"
@ -1404,7 +1648,7 @@ checksum = "a2a7bc6b2a29e632e45451c941832803a18cce6781db04de8a04696cdca8bde4"
dependencies = [
"cfg-if 0.1.10",
"libc",
"log 0.4.11",
"log 0.4.14",
"wepoll-sys",
"winapi",
]
@ -1480,7 +1724,7 @@ version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03"
dependencies = [
"getrandom",
"getrandom 0.1.15",
"libc",
"rand_chacha",
"rand_core 0.5.1",
@ -1519,7 +1763,7 @@ version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19"
dependencies = [
"getrandom",
"getrandom 0.1.15",
]
[[package]]
@ -1556,22 +1800,40 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41cc0f7e4d5d4544e8861606a285bb08d3e70712ccc7d2b84d7c0ccfaf4b05ce"
[[package]]
name = "regex"
version = "1.4.2"
name = "redox_syscall"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38cf2c13ed4745de91a5eb834e11c00bcc3709e773173b2ce4c56c9fbde04b9c"
checksum = "8270314b5ccceb518e7e578952f0b72b88222d02e8f77f5ecf7abbb673539041"
dependencies = [
"bitflags",
]
[[package]]
name = "redox_users"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "528532f3d801c87aec9def2add9ca802fe569e44a544afe633765267840abe64"
dependencies = [
"getrandom 0.2.2",
"redox_syscall 0.2.6",
]
[[package]]
name = "regex"
version = "1.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "957056ecddbeba1b26965114e191d2e8589ce74db242b6ea25fc4062427a5c19"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
"thread_local",
]
[[package]]
name = "regex-syntax"
version = "0.6.21"
version = "0.6.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b181ba2dcf07aaccad5448e8ead58db5b742cf85dfe035e2227f137a539a189"
checksum = "24d5f089152e60f62d28b835fbff2cd2e8dc0baf1ac13343bef92ab7eed84548"
[[package]]
name = "remove_dir_all"
@ -1629,7 +1891,7 @@ dependencies = [
"cssparser",
"derive_more",
"fxhash",
"log 0.4.11",
"log 0.4.14",
"matches",
"phf",
"phf_codegen",
@ -1738,6 +2000,26 @@ dependencies = [
"opaque-debug",
]
[[package]]
name = "signal-hook"
version = "0.1.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7e31d442c16f047a671b5a71e2161d6e68814012b7f5379d269ebd915fac2729"
dependencies = [
"libc",
"mio",
"signal-hook-registry",
]
[[package]]
name = "signal-hook-registry"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "16f1d0fef1604ba8f7a073c7e701f213e056707210e9020af4528e0101ce11a6"
dependencies = [
"libc",
]
[[package]]
name = "siphasher"
version = "0.3.3"
@ -1763,9 +2045,9 @@ dependencies = [
[[package]]
name = "smallvec"
version = "1.5.0"
version = "1.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7acad6f34eb9e8a259d3283d1e8c1d34d7415943d4895f65cc73813c7396fc85"
checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e"
[[package]]
name = "socket2"
@ -1775,7 +2057,7 @@ checksum = "2c29947abdee2a218277abeca306f25789c938e500ea5a9d4b12a5a504466902"
dependencies = [
"cfg-if 1.0.0",
"libc",
"redox_syscall",
"redox_syscall 0.1.57",
"winapi",
]
@ -1883,6 +2165,24 @@ version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a"
[[package]]
name = "strum"
version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7318c509b5ba57f18533982607f24070a55d353e90d4cae30c467cdb2ad5ac5c"
[[package]]
name = "strum_macros"
version = "0.20.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee8bc6b87a5112aeeab1f4a9f7ab634fe6cbefc4850006df31267f4cfb9e3149"
dependencies = [
"heck",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "subtle"
version = "2.3.0"
@ -1891,21 +2191,21 @@ checksum = "343f3f510c2915908f155e94f17220b19ccfacf2a64a2a5d8004f2c3e311e7fd"
[[package]]
name = "surf"
version = "2.1.0"
version = "2.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7189c787d96fe18fef704950de76d590022d9d70858a4a201e1f07a0666882ea"
checksum = "2a154d33ca6b5e1fe6fd1c760e5a5cc1202425f6cca2e13229f16a69009f6328"
dependencies = [
"async-std",
"async-trait",
"cfg-if 0.1.10",
"cfg-if 1.0.0",
"encoding_rs",
"futures-util",
"http-client",
"http-types",
"log 0.4.11",
"log 0.4.14",
"mime_guess",
"once_cell",
"pin-project-lite 0.1.11",
"pin-project-lite 0.2.4",
"serde",
"serde_json",
"web-sys",
@ -1943,6 +2243,16 @@ dependencies = [
"utf-8",
]
[[package]]
name = "terminal_size"
version = "0.1.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "86ca8ced750734db02076f44132d802af0b33b09942331f4459dde8636fd2406"
dependencies = [
"libc",
"winapi",
]
[[package]]
name = "textwrap"
version = "0.11.0"
@ -1960,33 +2270,24 @@ checksum = "8eaa81235c7058867fa8c0e7314f33dcce9c215f535d1913822a2b3f5e289f3c"
[[package]]
name = "thiserror"
version = "1.0.22"
version = "1.0.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0e9ae34b84616eedaaf1e9dd6026dbe00dcafa92aa0c8077cb69df1fcfe5e53e"
checksum = "e0f4a65597094d4483ddaed134f409b2cb7c1beccf25201a9f73c719254fa98e"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "1.0.22"
version = "1.0.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ba20f23e85b10754cd195504aebf6a27e2e6cbe28c17778a0c930724628dd56"
checksum = "7765189610d8241a44529806d6fd1f2e0a08734313a35d5b3a556f92b381f3c0"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "thread_local"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14"
dependencies = [
"lazy_static",
]
[[package]]
name = "time"
version = "0.1.44"
@ -2058,7 +2359,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b0987850db3733619253fe60e17cb59b82d37c7e6c0236bb81e4d6b87c879f27"
dependencies = [
"cfg-if 0.1.10",
"log 0.4.11",
"log 0.4.14",
"pin-project-lite 0.1.11",
"tracing-attributes",
"tracing-core",
@ -2127,6 +2428,12 @@ dependencies = [
"tinyvec",
]
[[package]]
name = "unicode-segmentation"
version = "1.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bb0d2e7be6ae3a5fa87eed5fb451aff96f2573d2694942e40543ae0bbe19c796"
[[package]]
name = "unicode-width"
version = "0.1.8"
@ -2151,9 +2458,9 @@ dependencies = [
[[package]]
name = "url"
version = "2.2.0"
version = "2.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5909f2b0817350449ed73e8bcd81c8c3c8d9a7a5d8acba4b27db277f1868976e"
checksum = "9ccd964113622c8e9322cfac19eb1004a07e636c545f325da085d5cdde6f1f8b"
dependencies = [
"form_urlencoded",
"idna",
@ -2183,6 +2490,15 @@ dependencies = [
"rand 0.7.3",
]
[[package]]
name = "value-bag"
version = "1.0.0-alpha.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6b676010e055c99033117c2343b33a40a30b91fecd6c49055ac9cd2d6c305ab1"
dependencies = [
"ctor",
]
[[package]]
name = "vcpkg"
version = "0.2.10"
@ -2243,7 +2559,7 @@ checksum = "f22b422e2a757c35a73774860af8e112bff612ce6cb604224e8e47641a9e4f68"
dependencies = [
"bumpalo",
"lazy_static",
"log 0.4.11",
"log 0.4.14",
"proc-macro2",
"quote",
"syn",
@ -2332,6 +2648,12 @@ version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "yansi"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9fc79f4a1e39857fc00c3f662cbf2651c771f00e9c15fe2abc341806bd46bd71"
[[package]]
name = "zip"
version = "0.5.8"

View file

@ -3,7 +3,7 @@ description = "A web article downloader"
homepage = "https://github.com/hipstermojo/paperoni"
repository = "https://github.com/hipstermojo/paperoni"
name = "paperoni"
version = "0.3.0-alpha1"
version = "0.4.0-alpha1"
authors = ["Kenneth Gitere <gitere81@gmail.com>"]
edition = "2018"
license = "MIT"
@ -12,14 +12,23 @@ readme = "README.md"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
async-std = "1.7.0"
async-std = "1.9.0"
# atty = "0.2.14"
chrono = "0.4.19"
clap = "2.33.3"
colored = "2.0.0"
comfy-table = "2.1.0"
directories = "3.0.2"
epub-builder = "0.4.8"
futures = "0.3.12"
flexi_logger = "0.17.1"
futures = "0.3.14"
html5ever = "0.25.1"
indicatif = "0.15.0"
kuchiki = "0.8.1"
lazy_static = "1.4.0"
log = "0.4.14"
md5 = "0.7.0"
regex = "1.4.2"
surf = "2.1.0"
url = "2.2.0"
regex = "1.4.5"
surf = "2.2.0"
thiserror = "1.0.24"
url = "2.2.1"

View file

@ -1,8 +1,10 @@
![crates.io](https://img.shields.io/crates/v/paperoni.svg)
<p align="center"><img src="./paperoni-dark.png"></p>
<p align="center"><i>Salami not included</i></p>
Paperoni is a web article downloader written in Rust. The downloaded articles are then exported as EPUB files.
Paperoni is a CLI tool made in Rust for downloading web articles as EPUBs.
> This project is in an alpha release so it might crash when you use it. Please open an [issue on GitHub](https://github.com/hipstermojo/paperoni/issues/new) if it does crash.
@ -17,7 +19,7 @@ Check the [releases](https://github.com/hipstermojo/paperoni/releases) page for
Paperoni is published on [crates.io](https://crates.io). If you have [cargo](https://github.com/rust-lang/cargo) installed, then run:
```sh
cargo install paperoni --version 0.3.0-alpha1
cargo install paperoni --version 0.4.0-alpha1
```
_Paperoni is still in alpha so the `version` flag has to be passed._
@ -37,6 +39,27 @@ cargo run -- # pass your url here
## Usage
```
USAGE:
paperoni [OPTIONS] [urls]...
OPTIONS:
-f, --file <file> Input file containing links
-h, --help Prints help information
--log-to-file Enables logging of events to a file located in .paperoni/logs with a default log level
of debug. Use -v to specify the logging level
--max_conn <max_conn> The maximum number of concurrent HTTP connections when downloading articles. Default is
8
--merge <output_name> Merge multiple articles into a single epub
-V, --version Prints version information
-v Enables logging of events and set the verbosity level. Use --help to read on its usage
ARGS:
<urls>... Urls of web articles
```
To download a single article pass in its URL
```sh
paperoni https://en.wikipedia.org/wiki/Pepperoni
```
@ -68,10 +91,23 @@ into a single epub using the `merge` flag and specifying the output file.
paperoni -f links.txt --merge out.epub
```
### Logging events
Logging is disabled by default. It can be activated by using either the `-v` flag or the `--log-to-file` flag. If the `--log-to-file` flag is passed, the logs are sent to a file in the default Paperoni directory `.paperoni/logs`, which is in your home directory. The `-v` flag configures the verbosity levels such that:
```
-v Logs the error level only
-vv Logs the warn level and above
-vvv Logs the info level and above
-vvvv Logs the debug level and above
```
If only the `-v` flag is passed, the progress bars are disabled. If both `-v` and `--log-to-file` are passed then the progress bars will still be shown.
## How it works
The URL passed to Paperoni is fetched and the returned HTML response is passed to the extractor.
This extractor retrieves a possible article using a port of the [Mozilla Readability algorithm](https://github.com/mozilla/readability). This article is then saved in an EPUB.
This extractor retrieves a possible article using a [custom port](https://github.com/hipstermojo/paperoni/blob/master/src/moz_readability/mod.rs) of the [Mozilla Readability algorithm](https://github.com/mozilla/readability). This article is then saved in an EPUB.
> The port of the algorithm is still unstable as well so it is not fully compatible with all the websites that can be extracted using Readability.
@ -82,3 +118,5 @@ This program is still in alpha so a number of things won't work:
- Websites that only run with JavaScript cannot be extracted.
- Website articles that cannot be extracted by Readability cannot be extracted by Paperoni either.
- Code snippets on Medium articles that are lazy loaded will not appear in the EPUB.
In general, Paperoni also won't work on some kinds of web pages, such as Twitter and Reddit threads.

View file

@ -1,6 +1,10 @@
use std::{fs::File, io::Read};
use std::{fs::File, io::Read, path::Path};
use chrono::{DateTime, Local};
use clap::{App, AppSettings, Arg};
use flexi_logger::LevelFilter as LogLevel;
use crate::logs::init_logger;
pub fn cli_init() -> AppConfig {
let app = App::new("paperoni")
@ -8,12 +12,9 @@ pub fn cli_init() -> AppConfig {
AppSettings::ArgRequiredElseHelp,
AppSettings::UnifiedHelpMessage,
])
.version("0.3.0-alpha1")
.version(clap::crate_version!())
.about(
"
Paperoni is an article downloader.
It takes a url and downloads the article content from it and saves it to an epub.
",
"Paperoni is a CLI tool made in Rust for downloading web articles as EPUBs",
)
.arg(
Arg::with_name("urls")
@ -38,8 +39,29 @@ It takes a url and downloads the article content from it and saves it to an epub
.long("max_conn")
.help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8")
.long_help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8.\nNOTE: It is advised to use as few connections as needed i.e between 1 and 50. Using more connections can end up overloading your network card with too many concurrent requests.")
.takes_value(true));
.takes_value(true))
.arg(
Arg::with_name("verbosity")
.short("v")
.multiple(true)
.help("Enables logging of events and set the verbosity level. Use --help to read on its usage")
.long_help(
"This takes upto 4 levels of verbosity in the following order.
- Error (-v)
- Warn (-vv)
- Info (-vvv)
- Debug (-vvvv)
When this flag is passed, it disables the progress bars and logs to stderr.
If you would like to send the logs to a file (and enable progress bars), pass the log-to-file flag."
)
.takes_value(false))
.arg(
Arg::with_name("log-to-file")
.long("log-to-file")
.help("Enables logging of events to a file located in .paperoni/logs with a default log level of debug. Use -v to specify the logging level")
.takes_value(false));
let arg_matches = app.get_matches();
let mut urls: Vec<String> = match arg_matches.value_of("file") {
Some(file_name) => {
if let Ok(mut file) = File::open(file_name) {
@ -76,14 +98,51 @@ It takes a url and downloads the article content from it and saves it to an epub
let mut app_config = AppConfig::new(max_conn);
app_config.set_urls(urls);
if let Some(name) = arg_matches.value_of("output_name") {
let file_name = if name.ends_with(".epub") && name.len() > 5 {
let file_path = Path::new(name);
if file_path.is_dir() {
eprintln!("{:?} is a directory", name);
std::process::exit(1);
}
let file_name = if file_path.extension().is_some() {
name.to_owned()
} else {
name.to_owned() + ".epub"
};
app_config.set_merged(file_name);
match std::fs::File::create(&file_name) {
Ok(_) => (),
Err(e) => {
eprintln!("Unable to create file {:?}\n{}", file_path, e);
std::process::exit(1)
}
}
app_config.merged = Some(file_name);
}
if arg_matches.is_present("verbosity") {
if !arg_matches.is_present("log-to-file") {
app_config.can_disable_progress_bar = true;
}
let log_levels: [LogLevel; 5] = [
LogLevel::Off,
LogLevel::Error,
LogLevel::Warn,
LogLevel::Info,
LogLevel::Debug,
];
let level = arg_matches.occurrences_of("verbosity").clamp(0, 4) as usize;
app_config.log_level = log_levels[level];
}
if arg_matches.is_present("log-to-file") {
app_config.log_level = LogLevel::Debug;
app_config.is_logging_to_file = true;
}
init_logger(&app_config);
app_config
}
@ -91,6 +150,10 @@ pub struct AppConfig {
urls: Vec<String>,
max_conn: usize,
merged: Option<String>,
log_level: LogLevel,
can_disable_progress_bar: bool,
start_time: DateTime<Local>,
is_logging_to_file: bool,
}
impl AppConfig {
@ -99,6 +162,10 @@ impl AppConfig {
urls: vec![],
max_conn,
merged: None,
log_level: LogLevel::Off,
can_disable_progress_bar: false,
start_time: Local::now(),
is_logging_to_file: false,
}
}
@ -106,10 +173,6 @@ impl AppConfig {
self.urls.extend(urls);
}
fn set_merged(&mut self, name: String) {
self.merged = Some(name);
}
/// Returns the list of URLs stored in this configuration.
pub fn urls(&self) -> &Vec<String> {
&self.urls
}
@ -120,4 +183,20 @@ impl AppConfig {
/// Returns the output file name for the merged EPUB, or `None` when no
/// merge output name was stored (the field defaults to `None` in `new`).
pub fn merged(&self) -> Option<&String> {
self.merged.as_ref()
}
/// Returns the log level filter stored in this configuration.
///
/// Defaults to `LogLevel::Off` in `new`; `cli_init` changes it when the
/// `-v` or `--log-to-file` flags are present.
pub fn log_level(&self) -> LogLevel {
self.log_level
}
/// Returns `true` when the progress bars should be hidden.
///
/// `cli_init` sets this flag when `-v` is passed without `--log-to-file`.
pub fn can_disable_progress_bar(&self) -> bool {
self.can_disable_progress_bar
}
/// Returns the local timestamp captured with `Local::now()` when this
/// configuration was constructed in `new`.
pub fn start_time(&self) -> &DateTime<Local> {
&self.start_time
}
/// Returns `true` when the `--log-to-file` flag was passed, as recorded
/// by `cli_init`.
pub fn is_logging_to_file(&self) -> bool {
self.is_logging_to_file
}
}

View file

@ -1,32 +1,159 @@
use std::fs::File;
use comfy_table::{Attribute, Cell, CellAlignment, Color, ContentArrangement, Table};
use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
use indicatif::{ProgressBar, ProgressStyle};
use log::{debug, info};
use crate::extractor::{self, Extractor};
use crate::{
cli::AppConfig,
errors::PaperoniError,
extractor::{self, Extractor},
};
pub fn generate_epubs(articles: Vec<Extractor>, merged: Option<&String>) {
match merged {
pub fn generate_epubs(
articles: Vec<Extractor>,
app_config: &AppConfig,
successful_articles_table: &mut Table,
) -> Result<(), Vec<PaperoniError>> {
let bar = if app_config.can_disable_progress_bar() {
ProgressBar::hidden()
} else {
let enabled_bar = ProgressBar::new(articles.len() as u64);
let style = ProgressStyle::default_bar().template(
"{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} epub {pos}/{len:7} {msg:.green}",
);
enabled_bar.set_style(style);
if !articles.is_empty() {
enabled_bar.set_message("Generating epubs");
}
enabled_bar
};
let mut errors: Vec<PaperoniError> = Vec::new();
match app_config.merged() {
Some(name) => {
let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
successful_articles_table.set_header(vec![Cell::new("Table of Contents")
.add_attribute(Attribute::Bold)
.set_alignment(CellAlignment::Center)
.fg(Color::Green)]);
let mut epub = match EpubBuilder::new(match ZipLibrary::new() {
Ok(zip_library) => zip_library,
Err(err) => {
let mut paperoni_err: PaperoniError = err.into();
paperoni_err.set_article_source(name);
errors.push(paperoni_err);
return Err(errors);
}
}) {
Ok(epub) => epub,
Err(err) => {
let mut paperoni_err: PaperoniError = err.into();
paperoni_err.set_article_source(name);
errors.push(paperoni_err);
return Err(errors);
}
};
debug!("Creating {:?}", name);
epub.inline_toc();
epub = articles
articles
.iter()
.enumerate()
.fold(epub, |mut epub, (idx, article)| {
.fold(&mut epub, |epub, (idx, article)| {
let mut article_result = || -> Result<(), PaperoniError> {
let mut html_buf = Vec::new();
extractor::serialize_to_xhtml(article.article(), &mut html_buf)?;
let html_str = std::str::from_utf8(&html_buf)?;
epub.metadata("title", replace_metadata_value(name))?;
let section_name = article.metadata().title();
epub.add_content(
EpubContent::new(format!("article_{}.xhtml", idx), html_str.as_bytes())
.title(replace_metadata_value(section_name)),
)?;
info!("Adding images for {:?}", name);
article.img_urls.iter().for_each(|img| {
// TODO: Add error handling and return errors as a vec
let mut file_path = std::env::temp_dir();
file_path.push(&img.0);
let img_buf = File::open(&file_path).expect("Can't read file");
epub.add_resource(
file_path.file_name().unwrap(),
img_buf,
img.1.as_ref().unwrap(),
)
.unwrap();
});
info!("Added images for {:?}", name);
Ok(())
};
if let Err(mut error) = article_result() {
error.set_article_source(&article.url);
errors.push(error);
}
bar.inc(1);
successful_articles_table.add_row(vec![article.metadata().title()]);
epub
});
let appendix = generate_appendix(articles.iter().collect());
if let Err(err) = epub.add_content(
EpubContent::new("appendix.xhtml", appendix.as_bytes())
.title(replace_metadata_value("Article Sources")),
) {
let mut paperoni_err: PaperoniError = err.into();
paperoni_err.set_article_source(name);
errors.push(paperoni_err);
return Err(errors);
}
let mut out_file = File::create(&name).unwrap();
match epub.generate(&mut out_file) {
Ok(_) => (),
Err(err) => {
let mut paperoni_err: PaperoniError = err.into();
paperoni_err.set_article_source(name);
errors.push(paperoni_err);
return Err(errors);
}
}
bar.finish_with_message("Generated epub\n");
debug!("Created {:?}", name);
println!("Created {:?}", name);
}
None => {
successful_articles_table
.set_header(vec![Cell::new("Downloaded articles")
.add_attribute(Attribute::Bold)
.set_alignment(CellAlignment::Center)
.fg(Color::Green)])
.set_content_arrangement(ContentArrangement::Dynamic);
for article in &articles {
let mut result = || -> Result<(), PaperoniError> {
let mut epub = EpubBuilder::new(ZipLibrary::new()?)?;
let file_name = format!(
"{}.epub",
article
.metadata()
.title()
.replace("/", " ")
.replace("\\", " ")
);
debug!("Creating {:?}", file_name);
let mut out_file = File::create(&file_name).unwrap();
let mut html_buf = Vec::new();
extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)
extractor::serialize_to_xhtml(article.article(), &mut html_buf)
.expect("Unable to serialize to xhtml");
let html_str = std::str::from_utf8(&html_buf).unwrap();
epub.metadata("title", replace_metadata_value(name))
.unwrap();
let section_name = article.metadata().title();
epub.add_content(
EpubContent::new(format!("article_{}.xhtml", idx), html_str.as_bytes())
.title(replace_metadata_value(section_name)),
)
.unwrap();
article.img_urls.iter().for_each(|img| {
if let Some(author) = article.metadata().byline() {
epub.metadata("author", replace_metadata_value(author))?;
}
epub.metadata("title", replace_metadata_value(article.metadata().title()))?;
epub.add_content(EpubContent::new("index.xhtml", html_str.as_bytes()))?;
for img in &article.img_urls {
let mut file_path = std::env::temp_dir();
file_path.push(&img.0);
@ -35,52 +162,35 @@ pub fn generate_epubs(articles: Vec<Extractor>, merged: Option<&String>) {
file_path.file_name().unwrap(),
img_buf,
img.1.as_ref().unwrap(),
)
.unwrap();
});
epub
});
let mut out_file = File::create(&name).unwrap();
epub.generate(&mut out_file).unwrap();
println!("Created {:?}", name);
}
None => {
for article in articles {
let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
let file_name = format!(
"{}.epub",
article
.metadata()
.title()
.replace("/", " ")
.replace("\\", " ")
);
let mut out_file = File::create(&file_name).unwrap();
let mut html_buf = Vec::new();
extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)
.expect("Unable to serialize to xhtml");
let html_str = std::str::from_utf8(&html_buf).unwrap();
if let Some(author) = article.metadata().byline() {
epub.metadata("author", replace_metadata_value(author))
.unwrap();
}
epub.metadata("title", replace_metadata_value(article.metadata().title()))
.unwrap();
epub.add_content(EpubContent::new("index.xhtml", html_str.as_bytes()))
.unwrap();
for img in article.img_urls {
let mut file_path = std::env::temp_dir();
file_path.push(&img.0);
)?;
}
let appendix = generate_appendix(vec![&article]);
epub.add_content(
EpubContent::new("appendix.xhtml", appendix.as_bytes())
.title(replace_metadata_value("Article Source")),
)?;
epub.generate(&mut out_file)?;
bar.inc(1);
let img_buf = File::open(&file_path).expect("Can't read file");
epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap())
.unwrap();
successful_articles_table.add_row(vec![article.metadata().title()]);
debug!("Created {:?}", file_name);
Ok(())
};
if let Err(mut error) = result() {
error.set_article_source(&article.url);
errors.push(error);
}
epub.generate(&mut out_file).unwrap();
println!("Created {:?}", file_name);
}
bar.finish_with_message("Generated epubs\n");
}
}
if errors.is_empty() {
Ok(())
} else {
Err(errors)
}
}
/// Replaces characters that have to be escaped before adding to the epub's metadata
@ -91,6 +201,37 @@ fn replace_metadata_value(value: &str) -> String {
.replace(">", "&gt;")
}
//TODO: The type signature of the argument should change as it requires that merged articles create an entirely new Vec of references
/// Builds the XHTML appendix page that lists the source link of every article.
///
/// Each article contributes one `<a>` element pointing at its URL. The link
/// text is the article title, or the URL itself when the title is empty.
/// Both the URL and the link text are escaped with `replace_metadata_value`.
fn generate_appendix(articles: Vec<&Extractor>) -> String {
    let mut link_tags = String::new();
    for article in &articles {
        let title = article.metadata().title();
        // Fall back to the raw URL so that a link never has empty text.
        let link_text: &str = if title.is_empty() {
            &article.url
        } else {
            title
        };
        link_tags.push_str(&format!(
            "<a href=\"{}\">{}</a><br></br>",
            replace_metadata_value(&article.url),
            replace_metadata_value(link_text)
        ));
    }
    format!(
        r#"<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
<head>
</head>
<body>
<h2>Appendix</h2><h3>Article sources</h3>
{}
</body>
</html>"#,
        link_tags
    )
}
#[cfg(test)]
mod test {
use super::replace_metadata_value;

126
src/errors.rs Normal file
View file

@ -0,0 +1,126 @@
use thiserror::Error;
/// The category of a failure together with a human readable message.
///
/// The `Display` output of each variant is its bracketed category tag
/// followed by the wrapped message.
#[derive(Error, Debug)]
pub enum ErrorKind {
/// Failure reported while building an epub.
#[error("[EpubError]: {0}")]
EpubError(String),
/// Failure of an HTTP request or of parsing a URL.
#[error("[HTTPError]: {0}")]
HTTPError(String),
/// Failure of a file system or other I/O operation.
#[error("[IOError]: {0}")]
IOError(String),
/// Content that could not be interpreted as UTF-8.
#[error("[UTF8Error]: {0}")]
UTF8Error(String),
/// Presumably raised by the readability/article extraction step; its
/// construction site is not part of this module.
#[error("[ReadabilityError]: {0}")]
ReadabilityError(String),
}
#[derive(Error, Debug)]
#[error("{kind}")]
/// Used to represent errors from downloading images. Errors from here are used solely for debugging
/// as they are considered recoverable.
///
/// The `Display` output is that of the wrapped [`ErrorKind`]; the image URL
/// is carried separately and read through [`ImgError::url`].
pub struct ImgError {
    /// What went wrong.
    kind: ErrorKind,
    /// URL of the image the error refers to, once it has been attached.
    url: Option<String>,
}

impl ImgError {
    /// Creates an error of the given kind that is not yet tied to an image URL.
    pub fn with_kind(kind: ErrorKind) -> Self {
        Self { kind, url: None }
    }

    /// Attaches (or replaces) the URL of the image this error refers to.
    pub fn set_url(&mut self, url: &str) {
        self.url = Some(url.to_owned());
    }

    /// Returns the URL of the offending image, if one has been attached.
    pub fn url(&self) -> &Option<String> {
        &self.url
    }
}
impl From<ErrorKind> for ImgError {
fn from(kind: ErrorKind) -> Self {
ImgError::with_kind(kind)
}
}
impl From<surf::Error> for ImgError {
fn from(err: surf::Error) -> Self {
ImgError::with_kind(ErrorKind::HTTPError(err.to_string()))
}
}
impl From<url::ParseError> for ImgError {
fn from(err: url::ParseError) -> Self {
ImgError::with_kind(ErrorKind::HTTPError(err.to_string()))
}
}
impl From<std::io::Error> for ImgError {
fn from(err: std::io::Error) -> Self {
ImgError::with_kind(ErrorKind::IOError(err.to_string()))
}
}
/// An error tied to the processing of one article.
///
/// The `Display` output is that of the wrapped [`ErrorKind`]; the article the
/// error belongs to is carried separately in `article_source`.
#[derive(Error, Debug)]
#[error("{kind}")]
pub struct PaperoniError {
    /// Identifies the article (or output) the error belongs to, once attached.
    article_source: Option<String>,
    /// What went wrong.
    kind: ErrorKind,
}

impl PaperoniError {
    /// Creates an error of the given kind with no article source attached.
    pub fn with_kind(kind: ErrorKind) -> Self {
        Self {
            kind,
            article_source: None,
        }
    }

    /// Returns the category and message of this error.
    pub fn kind(&self) -> &ErrorKind {
        &self.kind
    }

    /// Returns the attached article source, if any.
    pub fn article_source(&self) -> &Option<String> {
        &self.article_source
    }

    /// Attaches (or replaces) the article source this error belongs to.
    pub fn set_article_source(&mut self, article_source: &str) {
        self.article_source = Some(String::from(article_source));
    }
}
impl From<ErrorKind> for PaperoniError {
fn from(kind: ErrorKind) -> Self {
PaperoniError::with_kind(kind)
}
}
/// Epub generation failures are reported as [`ErrorKind::EpubError`].
impl From<epub_builder::Error> for PaperoniError {
    fn from(err: epub_builder::Error) -> Self {
        // Use the `Display` output like every other conversion in this module:
        // `description()` is deprecated in favour of `Display`/`to_string()`.
        PaperoniError::with_kind(ErrorKind::EpubError(err.to_string()))
    }
}
impl From<surf::Error> for PaperoniError {
fn from(err: surf::Error) -> Self {
PaperoniError::with_kind(ErrorKind::HTTPError(err.to_string()))
}
}
impl From<url::ParseError> for PaperoniError {
fn from(err: url::ParseError) -> Self {
PaperoniError::with_kind(ErrorKind::HTTPError(err.to_string()))
}
}
impl From<std::io::Error> for PaperoniError {
fn from(err: std::io::Error) -> Self {
PaperoniError::with_kind(ErrorKind::IOError(err.to_string()))
}
}
impl From<std::str::Utf8Error> for PaperoniError {
fn from(err: std::str::Utf8Error) -> Self {
PaperoniError::with_kind(ErrorKind::UTF8Error(err.to_string()))
}
}

View file

@ -2,6 +2,7 @@ use std::collections::HashMap;
use kuchiki::{traits::*, NodeRef};
use crate::errors::PaperoniError;
use crate::moz_readability::{MetaData, Readability};
pub type ResourceInfo = (String, Option<String>);
@ -14,22 +15,24 @@ pub struct Extractor {
article: Option<NodeRef>,
pub img_urls: Vec<ResourceInfo>,
readability: Readability,
pub url: String,
}
impl Extractor {
/// Create a new instance of an HTML extractor given an HTML string
pub fn from_html(html_str: &str) -> Self {
pub fn from_html(html_str: &str, url: &str) -> Self {
Extractor {
article: None,
img_urls: Vec::new(),
readability: Readability::new(html_str),
url: url.to_string(),
}
}
/// Locates and extracts the HTML in a document which is determined to be
/// the source of the content
pub fn extract_content(&mut self, url: &str) {
self.readability.parse(url);
pub fn extract_content(&mut self) -> Result<(), PaperoniError> {
self.readability.parse(&self.url)?;
if let Some(article_node_ref) = &self.readability.article_node {
let template = r#"
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
@ -44,6 +47,7 @@ impl Extractor {
body.as_node().append(article_node_ref.clone());
self.article = Some(doc);
}
Ok(())
}
/// Traverses the DOM tree of the content and retrieves the IMG URLs
@ -61,8 +65,11 @@ impl Extractor {
}
}
pub fn article(&self) -> Option<&NodeRef> {
self.article.as_ref()
/// Returns the extracted article [NodeRef]. It should only be called *AFTER* calling parse
pub fn article(&self) -> &NodeRef {
self.article.as_ref().expect(
"Article node doesn't exist. This may be because the document has not been parsed",
)
}
pub fn metadata(&self) -> &MetaData {
@ -75,7 +82,7 @@ impl Extractor {
pub fn serialize_to_xhtml<W: std::io::Write>(
node_ref: &NodeRef,
mut w: &mut W,
) -> Result<(), Box<dyn std::error::Error>> {
) -> Result<(), PaperoniError> {
let mut escape_map = HashMap::new();
escape_map.insert("<", "&lt;");
escape_map.insert(">", "&gt;");
@ -96,6 +103,7 @@ pub fn serialize_to_xhtml<W: std::io::Write>(
let attrs_str = attrs
.map
.iter()
.filter(|(k, _)| &k.local != "\"")
.map(|(k, v)| {
format!(
"{}=\"{}\"",
@ -156,8 +164,10 @@ mod test {
#[test]
fn test_extract_img_urls() {
let mut extractor = Extractor::from_html(TEST_HTML);
extractor.extract_content("http://example.com/");
let mut extractor = Extractor::from_html(TEST_HTML, "http://example.com/");
extractor
.extract_content()
.expect("Article extraction failed unexpectedly");
extractor.extract_img_urls();
assert!(extractor.img_urls.len() > 0);

View file

@ -1,65 +1,90 @@
use async_std::io::prelude::*;
use async_std::{fs::File, stream};
use futures::StreamExt;
use indicatif::ProgressBar;
use log::{debug, info};
use url::Url;
use crate::errors::{ErrorKind, ImgError, PaperoniError};
use crate::extractor::Extractor;
type HTMLResource = (String, String);
pub async fn fetch_url(
url: &str,
) -> Result<HTMLResource, Box<dyn std::error::Error + Send + Sync>> {
pub async fn fetch_html(url: &str) -> Result<HTMLResource, PaperoniError> {
let client = surf::Client::new();
println!("Fetching...");
debug!("Fetching {}", url);
let mut redirect_count: u8 = 0;
let base_url = Url::parse(&url)?;
let mut url = base_url.clone();
while redirect_count < 5 {
redirect_count += 1;
let req = surf::get(&url);
let mut res = client.send(req).await?;
if res.status().is_redirection() {
if let Some(location) = res.header(surf::http::headers::LOCATION) {
match Url::parse(location.last().as_str()) {
Ok(valid_url) => url = valid_url,
Err(e) => match e {
url::ParseError::RelativeUrlWithoutBase => {
url = base_url.join(location.last().as_str())?
let process_request = async {
let mut redirect_count: u8 = 0;
let base_url = Url::parse(&url)?;
let mut url = base_url.clone();
while redirect_count < 5 {
redirect_count += 1;
let req = surf::get(&url);
let mut res = client.send(req).await?;
if res.status().is_redirection() {
if let Some(location) = res.header(surf::http::headers::LOCATION) {
match Url::parse(location.last().as_str()) {
Ok(valid_url) => {
info!("Redirecting {} to {}", url, valid_url);
url = valid_url
}
e => return Err(e.into()),
},
};
}
} else if res.status().is_success() {
if let Some(mime) = res.content_type() {
if mime.essence() == "text/html" {
return Ok((url.to_string(), res.body_string().await?));
Err(e) => match e {
url::ParseError::RelativeUrlWithoutBase => {
match base_url.join(location.last().as_str()) {
Ok(joined_url) => {
info!("Redirecting {} to {}", url, joined_url);
url = joined_url;
}
Err(e) => return Err(e.into()),
}
}
e => return Err(e.into()),
},
};
}
} else if res.status().is_success() {
if let Some(mime) = res.content_type() {
if mime.essence() == "text/html" {
debug!("Successfully fetched {}", url);
return Ok((url.to_string(), res.body_string().await?));
} else {
let msg = format!(
"Invalid HTTP response. Received {} instead of text/html",
mime.essence()
);
return Err(ErrorKind::HTTPError(msg).into());
}
} else {
return Err(format!(
"Invalid HTTP response. Received {} instead of text/html",
mime.essence()
)
.into());
return Err(ErrorKind::HTTPError("Unknown HTTP response".to_owned()).into());
}
} else {
return Err("Unknown HTTP response".into());
let msg = format!("Request failed: HTTP {}", res.status());
return Err(ErrorKind::HTTPError(msg).into());
}
} else {
return Err(format!("Request failed: HTTP {}", res.status()).into());
}
}
Err("Unable to fetch HTML".into())
Err(ErrorKind::HTTPError("Unable to fetch HTML".to_owned()).into())
};
process_request.await.map_err(|mut error: PaperoniError| {
error.set_article_source(url);
error
})
}
pub async fn download_images(
extractor: &mut Extractor,
article_origin: &Url,
) -> async_std::io::Result<()> {
bar: &ProgressBar,
) -> Result<(), Vec<ImgError>> {
if extractor.img_urls.len() > 0 {
println!("Downloading images...");
debug!(
"Downloading {} images for {}",
extractor.img_urls.len(),
article_origin
);
}
let img_count = extractor.img_urls.len();
let imgs_req_iter = extractor
.img_urls
@ -67,43 +92,73 @@ pub async fn download_images(
.map(|(url, _)| {
(
url,
surf::Client::new().get(get_absolute_url(&url, article_origin)),
surf::Client::new()
.with(surf::middleware::Redirect::default())
.get(get_absolute_url(&url, article_origin)),
)
})
.map(|(url, req)| async move {
let mut img_response = req.await.expect("Unable to retrieve image");
let img_content: Vec<u8> = img_response.body_bytes().await.unwrap();
let img_mime = img_response
.content_type()
.map(|mime| mime.essence().to_string());
let img_ext = img_response
.content_type()
.map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
.unwrap();
.enumerate()
.map(|(img_idx, (url, req))| async move {
bar.set_message(format!("Downloading images [{}/{}]", img_idx + 1, img_count).as_str());
match req.await {
Ok(mut img_response) => {
let process_response = async {
let img_content: Vec<u8> = match img_response.body_bytes().await {
Ok(bytes) => bytes,
Err(e) => return Err(e.into()),
};
let img_mime = img_response
.content_type()
.map(|mime| mime.essence().to_string());
let img_ext = match img_response
.content_type()
.map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
{
Some(mime_str) => mime_str,
None => {
return Err(ErrorKind::HTTPError(
"Image has no Content-Type".to_owned(),
)
.into())
}
};
let mut img_path = std::env::temp_dir();
img_path.push(format!("{}.{}", hash_url(&url), &img_ext));
let mut img_file = File::create(&img_path)
.await
.expect("Unable to create file");
img_file
.write_all(&img_content)
.await
.expect("Unable to save to file");
let mut img_path = std::env::temp_dir();
img_path.push(format!("{}.{}", hash_url(&url), &img_ext));
let mut img_file = match File::create(&img_path).await {
Ok(file) => file,
Err(e) => return Err(e.into()),
};
match img_file.write_all(&img_content).await {
Ok(_) => (),
Err(e) => return Err(e.into()),
}
(
url,
img_path
.file_name()
.map(|os_str_name| {
os_str_name
.to_str()
.expect("Unable to get image file name")
.to_string()
Ok((
url,
img_path
.file_name()
.map(|os_str_name| {
os_str_name
.to_str()
.expect("Unable to get image file name")
.to_string()
})
.unwrap(),
img_mime,
))
};
process_response.await.map_err(|mut e: ImgError| {
e.set_url(url);
e
})
.unwrap(),
img_mime,
)
}
Err(e) => {
let mut img_err: ImgError = e.into();
img_err.set_url(url);
Err(img_err)
}
}
});
// A utility closure used when update the value of an image source after downloading is successful
@ -112,8 +167,6 @@ pub async fn download_images(
let (img_url, img_path, img_mime) = img_item;
let img_ref = extractor
.article()
.as_mut()
.expect("Unable to get mutable ref")
.select_first(&format!("img[src='{}']", img_url))
.expect("Image node does not exist");
let mut img_node = img_ref.attributes.borrow_mut();
@ -124,14 +177,24 @@ pub async fn download_images(
(img_path, img_mime)
};
extractor.img_urls = stream::from_iter(imgs_req_iter)
let imgs_req_iter = stream::from_iter(imgs_req_iter)
.buffered(10)
.collect::<Vec<_>>()
.await
.into_iter()
.map(replace_existing_img_src)
.collect();
Ok(())
.collect::<Vec<Result<_, ImgError>>>()
.await;
let mut errors = Vec::new();
let mut replaced_imgs = Vec::new();
for img_req_result in imgs_req_iter {
match img_req_result {
Ok(img_req) => replaced_imgs.push(replace_existing_img_src(img_req)),
Err(e) => errors.push(e),
}
}
extractor.img_urls = replaced_imgs;
if errors.is_empty() {
Ok(())
} else {
Err(errors)
}
}
/// Handles getting the extension from a given MIME subtype.

260
src/logs.rs Normal file
View file

@ -0,0 +1,260 @@
use colored::*;
use comfy_table::presets::UTF8_HORIZONTAL_BORDERS_ONLY;
use comfy_table::{Cell, CellAlignment, ContentArrangement, Table};
use directories::UserDirs;
use flexi_logger::LogSpecBuilder;
use log::error;
use crate::{cli::AppConfig, errors::PaperoniError};
/// Prints the end-of-run report to stdout.
///
/// The report consists of a one-line coloured summary, the table of
/// successfully processed articles (only when at least one succeeded) and a
/// table of failures with their reasons. Every failure is also sent to the
/// logger at `error` level.
///
/// * `initial_article_count` - number of articles that were requested.
/// * `succesful_articles_table` - pre-filled table of the articles that succeeded.
/// * `partial_downloads_count` - number of articles that had image failures.
/// * `errors` - every error collected during the run.
pub fn display_summary(
    initial_article_count: usize,
    succesful_articles_table: Table,
    partial_downloads_count: usize,
    errors: Vec<PaperoniError>,
) {
    // The same article can be reported both as a partial download and as a
    // failure, and more errors than articles can be collected. Clamp the
    // counts so that the subtraction cannot underflow and the three figures
    // always add up to `initial_article_count`, as `short_summary` requires.
    let failed_count = errors.len().min(initial_article_count);
    let partial_count = partial_downloads_count.min(initial_article_count - failed_count);
    let successfully_downloaded_count = initial_article_count - failed_count - partial_count;

    println!(
        "{}",
        short_summary(DownloadCount::new(
            initial_article_count,
            successfully_downloaded_count,
            partial_count,
            failed_count
        ))
        .bold()
    );

    if successfully_downloaded_count > 0 {
        println!("{}", succesful_articles_table);
    }

    if !errors.is_empty() {
        println!("\n{}", "Failed article downloads".bright_red().bold());
        let mut table_failed = Table::new();
        table_failed
            .load_preset(UTF8_HORIZONTAL_BORDERS_ONLY)
            .set_header(vec![
                Cell::new("Link").set_alignment(CellAlignment::Center),
                Cell::new("Reason").set_alignment(CellAlignment::Center),
            ])
            .set_content_arrangement(ContentArrangement::Dynamic);

        for error in errors {
            // Borrow the source instead of cloning the whole `Option<String>`.
            let error_source = error
                .article_source()
                .as_deref()
                .unwrap_or("<unknown link>");
            let reason = error.kind().to_string();
            table_failed.add_row(vec![error_source, reason.as_str()]);
            error!("{}\n - {}", error, error_source);
        }
        println!("{}", table_failed);
    }
}
/// Returns a string summary of the total number of failed and successful article downloads
fn short_summary(download_count: DownloadCount) -> String {
// TODO: Refactor this
if download_count.total
!= download_count.successful + download_count.failed + download_count.partial
{
panic!("initial_count must be equal to the sum of failed and successful count")
}
let get_noun = |count: usize| if count == 1 { "article" } else { "articles" };
if download_count.successful == download_count.total && download_count.successful == 1 {
"Article downloaded successfully".green().to_string()
} else if download_count.total == download_count.failed && download_count.failed == 1 {
"Article failed to download".red().to_string()
} else if download_count.total == download_count.partial && download_count.partial == 1 {
"Article partially failed to download".yellow().to_string()
} else if download_count.successful == download_count.total {
"All articles downloaded successfully".green().to_string()
} else if download_count.failed == download_count.total {
"All articles failed to download".red().to_string()
} else if download_count.partial == download_count.total {
"All articles partially failed to download"
.yellow()
.to_string()
} else if download_count.partial == 0 {
format!(
"{} {} downloaded successfully, {} {} failed",
download_count.successful,
get_noun(download_count.successful),
download_count.failed,
get_noun(download_count.failed)
)
.yellow()
.to_string()
} else if download_count.successful == 0
&& download_count.partial > 0
&& download_count.failed > 0
{
format!(
"{} {} partially failed to download, {} {} failed",
download_count.partial,
get_noun(download_count.partial),
download_count.failed,
get_noun(download_count.failed)
)
.yellow()
.to_string()
} else if download_count.failed == 0
&& download_count.successful > 0
&& download_count.partial > 0
{
format!(
"{} {} downloaded successfully, {} {} partially failed to download",
download_count.successful,
get_noun(download_count.successful),
download_count.partial,
get_noun(download_count.partial)
)
.yellow()
.to_string()
} else {
format!(
"{} {} downloaded successfully, {} {} partially failed to download, {} {} failed",
download_count.successful,
get_noun(download_count.successful),
download_count.partial,
get_noun(download_count.partial),
download_count.failed,
get_noun(download_count.failed)
)
.yellow()
.to_string()
}
}
/// Tally of article outcomes that is handed to `short_summary`.
struct DownloadCount {
    /// Number of articles that were requested.
    total: usize,
    /// Number of articles counted as fully successful.
    successful: usize,
    /// Number of articles counted as partially failed.
    partial: usize,
    /// Number of articles counted as failed.
    failed: usize,
}

impl DownloadCount {
    /// Bundles the four counts; no consistency check is performed here.
    fn new(total: usize, successful: usize, partial: usize, failed: usize) -> Self {
        DownloadCount {
            failed,
            partial,
            successful,
            total,
        }
    }
}
/// Configures and starts the logger for the `paperoni` module.
///
/// The log level is taken from `app_config`. When file logging is enabled the
/// log is written to `<home>/.paperoni/logs`, using the run's start time as
/// the file discriminant; the directory is created if it is missing and the
/// process exits with status 1 when that fails. If the user directories
/// cannot be determined, a message is printed and no logger is started.
pub fn init_logger(app_config: &AppConfig) {
    let user_dirs = match UserDirs::new() {
        Some(dirs) => dirs,
        None => {
            eprintln!("Unable to get user directories for logging purposes");
            return;
        }
    };

    let paperoni_dir = user_dirs.home_dir().join(".paperoni");
    let log_dir = paperoni_dir.join("logs");

    let log_spec = LogSpecBuilder::new()
        .module("paperoni", app_config.log_level())
        .build();
    let formatted_timestamp = app_config.start_time().format("%Y-%m-%d_%H-%M-%S");
    let base_logger = flexi_logger::Logger::with(log_spec);

    let logger = if app_config.is_logging_to_file() {
        // Only touch the file system when one of the directories is missing.
        if !(paperoni_dir.is_dir() && log_dir.is_dir()) {
            if let Err(e) = std::fs::create_dir_all(&log_dir) {
                eprintln!("Unable to create paperoni directories on home directory for logging purposes\n{}",e);
                std::process::exit(1);
            }
        }
        base_logger
            .directory(log_dir)
            .discriminant(formatted_timestamp.to_string())
            .suppress_timestamp()
            .log_to_file()
    } else {
        base_logger
    };

    if let Err(e) = logger.start() {
        eprintln!("Unable to start logger!\n{}", e);
    }
}
// Unit tests for `short_summary`: each assertion pins the exact wording and
// colour of the message produced for one combination of counts.
#[cfg(test)]
mod tests {
use super::{short_summary, DownloadCount};
use colored::*;
#[test]
fn test_short_summary() {
// Arguments of `DownloadCount::new` are (total, successful, partial, failed).
// Single article outcomes.
assert_eq!(
short_summary(DownloadCount::new(1, 1, 0, 0)),
"Article downloaded successfully".green().to_string()
);
assert_eq!(
short_summary(DownloadCount::new(1, 0, 0, 1)),
"Article failed to download".red().to_string()
);
// Several articles, all with the same outcome.
assert_eq!(
short_summary(DownloadCount::new(10, 10, 0, 0)),
"All articles downloaded successfully".green().to_string()
);
assert_eq!(
short_summary(DownloadCount::new(10, 0, 0, 10)),
"All articles failed to download".red().to_string()
);
// Mixed successes and failures, including singular/plural nouns.
assert_eq!(
short_summary(DownloadCount::new(10, 8, 0, 2)),
"8 articles downloaded successfully, 2 articles failed"
.yellow()
.to_string()
);
assert_eq!(
short_summary(DownloadCount::new(10, 1, 0, 9)),
"1 article downloaded successfully, 9 articles failed"
.yellow()
.to_string()
);
assert_eq!(
short_summary(DownloadCount::new(7, 6, 0, 1)),
"6 articles downloaded successfully, 1 article failed"
.yellow()
.to_string()
);
// Outcomes that involve partial downloads.
assert_eq!(
short_summary(DownloadCount::new(7, 4, 2, 1)),
"4 articles downloaded successfully, 2 articles partially failed to download, 1 article failed"
.yellow()
.to_string()
);
assert_eq!(
short_summary(DownloadCount::new(12, 6, 6, 0)),
"6 articles downloaded successfully, 6 articles partially failed to download"
.yellow()
.to_string()
);
assert_eq!(
short_summary(DownloadCount::new(5, 0, 4, 1)),
"4 articles partially failed to download, 1 article failed"
.yellow()
.to_string()
);
assert_eq!(
short_summary(DownloadCount::new(4, 0, 4, 0)),
"All articles partially failed to download"
.yellow()
.to_string()
);
}
// The counts below do not add up to the total, which must trigger the panic.
#[test]
#[should_panic(
expected = "initial_count must be equal to the sum of failed and successful count"
)]
fn test_short_summary_panics_on_invalid_input() {
short_summary(DownloadCount::new(0, 12, 0, 43));
}
}

View file

@ -3,21 +3,28 @@ extern crate lazy_static;
use async_std::stream;
use async_std::task;
use comfy_table::presets::{UTF8_FULL, UTF8_HORIZONTAL_BORDERS_ONLY};
use comfy_table::{ContentArrangement, Table};
use futures::stream::StreamExt;
use indicatif::{ProgressBar, ProgressStyle};
use log::{debug, warn};
use url::Url;
mod cli;
mod epub;
mod errors;
mod extractor;
/// This module is responsible for async HTTP calls for downloading
/// the HTML content and images
mod http;
mod logs;
mod moz_readability;
use cli::AppConfig;
use epub::generate_epubs;
use extractor::Extractor;
use http::{download_images, fetch_url};
use http::{download_images, fetch_html};
use logs::display_summary;
fn main() {
let app_config = cli::cli_init();
@ -28,29 +35,92 @@ fn main() {
}
fn download(app_config: AppConfig) {
let mut errors = Vec::new();
let mut partial_download_count: usize = 0;
let bar = if app_config.can_disable_progress_bar() {
ProgressBar::hidden()
} else {
let enabled_bar = ProgressBar::new(app_config.urls().len() as u64);
let style = ProgressStyle::default_bar().template(
"{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} link {pos}/{len:7} {msg:.yellow/white}",
);
enabled_bar.set_style(style);
enabled_bar.enable_steady_tick(500);
enabled_bar
};
let articles = task::block_on(async {
let urls_iter = app_config.urls().iter().map(|url| fetch_url(url));
let urls_iter = app_config.urls().iter().map(|url| fetch_html(url));
let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn());
let mut articles = Vec::new();
while let Some(fetch_result) = responses.next().await {
match fetch_result {
Ok((url, html)) => {
println!("Extracting");
let mut extractor = Extractor::from_html(&html);
extractor.extract_content(&url);
if extractor.article().is_some() {
extractor.extract_img_urls();
download_images(&mut extractor, &Url::parse(&url).unwrap())
.await
.expect("Unable to download images");
articles.push(extractor);
debug!("Extracting {}", &url);
let mut extractor = Extractor::from_html(&html, &url);
bar.set_message("Extracting...");
match extractor.extract_content() {
Ok(_) => {
extractor.extract_img_urls();
if let Err(img_errors) =
download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar)
.await
{
partial_download_count += 1;
warn!(
"{} image{} failed to download for {}",
img_errors.len(),
if img_errors.len() > 1 { "s" } else { "" },
url
);
for img_error in img_errors {
warn!(
"{}\n\t\tReason {}",
img_error.url().as_ref().unwrap(),
img_error
);
}
}
articles.push(extractor);
}
Err(mut e) => {
e.set_article_source(&url);
errors.push(e);
}
}
}
Err(e) => eprintln!("{}", e),
Err(e) => errors.push(e),
}
bar.inc(1);
}
articles
});
generate_epubs(articles, app_config.merged());
bar.finish_with_message("Downloaded articles");
let mut succesful_articles_table = Table::new();
succesful_articles_table
.load_preset(UTF8_FULL)
.load_preset(UTF8_HORIZONTAL_BORDERS_ONLY)
.set_content_arrangement(ContentArrangement::Dynamic);
match generate_epubs(articles, &app_config, &mut succesful_articles_table) {
Ok(_) => (),
Err(gen_epub_errors) => {
errors.extend(gen_epub_errors);
}
};
let has_errors = !errors.is_empty();
display_summary(
app_config.urls().len(),
succesful_articles_table,
partial_download_count,
errors,
);
if app_config.is_logging_to_file() {
println!(
"Log written to paperoni_{}.log\n",
app_config.start_time().format("%Y-%m-%d_%H-%M-%S")
);
}
if has_errors {
std::process::exit(1);
}
}

View file

@ -7,8 +7,11 @@ use kuchiki::{
traits::*,
NodeData, NodeRef,
};
use log::info;
use url::Url;
use crate::errors::{ErrorKind, PaperoniError};
const DEFAULT_CHAR_THRESHOLD: usize = 500;
const FLAG_STRIP_UNLIKELYS: u32 = 0x1;
const FLAG_WEIGHT_CLASSES: u32 = 0x2;
@ -76,14 +79,15 @@ impl Readability {
metadata: MetaData::new(),
}
}
pub fn parse(&mut self, url: &str) {
pub fn parse(&mut self, url: &str) -> Result<(), PaperoniError> {
self.unwrap_no_script_tags();
self.remove_scripts();
self.prep_document();
self.metadata = self.get_article_metadata();
self.article_title = self.metadata.title.clone();
self.grab_article();
self.grab_article()?;
self.post_process_content(url);
Ok(())
}
/// Recursively check if node is image, or if node contains exactly only one image
@ -426,8 +430,7 @@ impl Readability {
let mut matches = None;
if let Some(property) = node_attr.get("property") {
matches = regexes::PROPERTY_REGEX.captures(property);
if matches.is_some() {
let captures = matches.as_ref().unwrap();
if let Some(captures) = &matches {
for capture in captures.iter() {
let mut name = capture.unwrap().as_str().to_lowercase();
name = regexes::REPLACE_WHITESPACE_REGEX
@ -561,7 +564,7 @@ impl Readability {
.root_node
.select_first("title")
.map(|title| title.text_contents().trim().to_string())
.expect("This file has no <title> tag to extract a title from");
.unwrap_or("".to_string());
let orig_title = cur_title.clone();
let mut title_had_hierarchical_separators = false;
let word_count = |s: &str| -> usize { s.split_whitespace().count() };
@ -595,8 +598,8 @@ impl Readability {
}
} else if cur_title.len() > 150 || cur_title.len() < 15 {
let mut h1_nodes = self.root_node.select("h1").unwrap();
let (_, h1_count) = h1_nodes.size_hint();
if Some(1) == h1_count {
let h1_count = self.root_node.select("h1").unwrap().count();
if h1_count == 1 {
cur_title = Self::get_inner_text(h1_nodes.next().unwrap().as_node(), None);
}
}
@ -799,6 +802,7 @@ impl Readability {
state = State::ReadProp;
decl.1 = Some(token.trim().to_string());
tokens.push(decl.clone());
decl = (None, None);
token.clear();
} else {
token.push(c);
@ -819,11 +823,18 @@ impl Readability {
}
}
if !token.is_empty() {
decl.1 = Some(token.trim().to_string());
tokens.push(decl);
match state {
State::ReadVal => {
decl.1 = Some(token.trim().to_string());
tokens.push(decl);
}
_ => (),
}
}
tokens
.into_iter()
.filter(|tok_pair| tok_pair.0.is_some() && tok_pair.1.is_some())
.map(|tok_pair| (tok_pair.0.unwrap(), tok_pair.1.unwrap()))
.collect()
}
@ -1576,16 +1587,14 @@ impl Readability {
/// Using a variety of metrics (content score, classname, element types), find the content that is most likely to be the stuff
/// a user wants to read. Then return it wrapped up in a div.
fn grab_article(&mut self) {
println!("Grabbing article");
fn grab_article(&mut self) -> Result<(), PaperoniError> {
info!("Grabbing article {:?}", self.metadata.title);
// var doc = this._doc;
// var isPaging = (page !== null ? true: false);
// page = page ? page : this._doc.body;
let page = self.root_node.select_first("body");
if page.is_err() {
// TODO:Have error logging for this
println!("Document has no <body>");
return;
return Err(ErrorKind::ReadabilityError("Document has no <body>".into()).into());
}
let page = page.unwrap();
let mut attempts: Vec<ExtractAttempt> = Vec::new();
@ -2075,8 +2084,10 @@ impl Readability {
attempts.push(ExtractAttempt::new(article_content.clone(), text_length));
attempts.sort_by(|a, b| b.length.partial_cmp(&a.length).unwrap());
if attempts.first().as_ref().unwrap().length == 0 {
println!("Unable to extract content");
break;
return Err(ErrorKind::ReadabilityError(
"Unable to extract content".into(),
)
.into());
}
article_content = attempts[0].article.clone();
parse_successful = true;
@ -2102,7 +2113,8 @@ impl Readability {
false
});
self.article_node = Some(article_content);
return;
info!("Successfully grabbed article {:?}", self.metadata.title);
return Ok(());
}
}
}
@ -2460,12 +2472,24 @@ mod test {
css_map.insert("align-items".to_string(), "center".to_string());
css_map.insert("border".to_string(), "2px solid black".to_string());
let css_str_to_vec = Readability::inline_css_str_to_map(css_str);
assert_eq!(css_map, css_str_to_vec);
let css_str_to_map = Readability::inline_css_str_to_map(css_str);
assert_eq!(css_map, css_str_to_map);
let mut css_map = HashMap::new();
css_map.insert("color".to_string(), "red".to_string());
css_map.insert("background-image".to_string(), "url('data:image/jpeg;base64,/wgARCAALABQDASIAAhEBAxEB/8QAFwABAQEBAAAAAAAAAAAAAAAAAgADBP/')".to_string());
assert_eq!(css_map, Readability::inline_css_str_to_map("color: red;background-image: url('data:image/jpeg;base64,/wgARCAALABQDASIAAhEBAxEB/8QAFwABAQEBAAAAAAAAAAAAAAAAAgADBP/')"));
let empty_map = HashMap::new();
assert_eq!(empty_map, Readability::inline_css_str_to_map(" \n \t \r"));
assert_eq!(empty_map, Readability::inline_css_str_to_map("color"));
let mut css_map = HashMap::new();
css_map.insert("color".to_string(), "red".to_string());
css_map.insert("height".to_string(), "300px".to_string());
assert_eq!(
css_map,
Readability::inline_css_str_to_map("color: red;height: 300px;width")
);
}
#[test]