Merge pull request #10 from hipstermojo/dev

v0.4.0 release
Kenneth Gitere 2021-04-30 08:48:11 +03:00 committed by GitHub
commit 474d97c6bd
12 changed files with 1424 additions and 281 deletions

.gitignore (vendored): 3 changes

@@ -1,2 +1,3 @@
 /target
 *.epub
+*.log

Cargo.lock (generated): 480 changes

@ -126,12 +126,15 @@ dependencies = [
[[package]] [[package]]
name = "async-global-executor" name = "async-global-executor"
version = "1.4.3" version = "2.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73079b49cd26b8fd5a15f68fc7707fc78698dc2a3d61430f2a7a9430230dfa04" checksum = "9586ec52317f36de58453159d48351bc244bc24ced3effc1fce22f3d48664af6"
dependencies = [ dependencies = [
"async-channel",
"async-executor", "async-executor",
"async-io", "async-io",
"async-mutex",
"blocking",
"futures-lite", "futures-lite",
"num_cpus", "num_cpus",
"once_cell", "once_cell",
@ -147,7 +150,7 @@ dependencies = [
"fastrand", "fastrand",
"futures-lite", "futures-lite",
"libc", "libc",
"log 0.4.11", "log 0.4.14",
"nb-connect", "nb-connect",
"once_cell", "once_cell",
"parking", "parking",
@ -157,6 +160,15 @@ dependencies = [
"winapi", "winapi",
] ]
[[package]]
name = "async-lock"
version = "2.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6a8ea61bf9947a1007c5cada31e647dbc77b103c679858150003ba697ea798b"
dependencies = [
"event-listener",
]
[[package]] [[package]]
name = "async-mutex" name = "async-mutex"
version = "1.4.0" version = "1.4.0"
@ -168,14 +180,14 @@ dependencies = [
[[package]] [[package]]
name = "async-std" name = "async-std"
version = "1.7.0" version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a7e82538bc65a25dbdff70e4c5439d52f068048ab97cdea0acd73f131594caa1" checksum = "d9f06685bad74e0570f5213741bea82158279a4103d988e57bfada11ad230341"
dependencies = [ dependencies = [
"async-channel",
"async-global-executor", "async-global-executor",
"async-io", "async-io",
"async-mutex", "async-lock",
"blocking",
"crossbeam-utils", "crossbeam-utils",
"futures-channel", "futures-channel",
"futures-core", "futures-core",
@ -183,11 +195,11 @@ dependencies = [
"futures-lite", "futures-lite",
"gloo-timers", "gloo-timers",
"kv-log-macro", "kv-log-macro",
"log 0.4.11", "log 0.4.14",
"memchr", "memchr",
"num_cpus", "num_cpus",
"once_cell", "once_cell",
"pin-project-lite 0.1.11", "pin-project-lite 0.2.4",
"pin-utils", "pin-utils",
"slab", "slab",
"wasm-bindgen-futures", "wasm-bindgen-futures",
@ -394,6 +406,28 @@ dependencies = [
"vec_map", "vec_map",
] ]
[[package]]
name = "colored"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b3616f750b84d8f0de8a58bda93e08e2a81ad3f523089b05f1dffecab48c6cbd"
dependencies = [
"atty",
"lazy_static",
"winapi",
]
[[package]]
name = "comfy-table"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "17b99e9022e080d384b58d8eaf5976b42a311ff7a9669f8200eb2453c0b2b81a"
dependencies = [
"crossterm",
"strum",
"strum_macros",
]
[[package]] [[package]]
name = "concurrent-queue" name = "concurrent-queue"
version = "1.2.2" version = "1.2.2"
@ -403,6 +437,21 @@ dependencies = [
"cache-padded", "cache-padded",
] ]
[[package]]
name = "console"
version = "0.14.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3993e6445baa160675931ec041a5e03ca84b9c6e32a056150d3aa2bdda0a1f45"
dependencies = [
"encode_unicode",
"lazy_static",
"libc",
"regex",
"terminal_size",
"unicode-width",
"winapi",
]
[[package]] [[package]]
name = "const_fn" name = "const_fn"
version = "0.4.3" version = "0.4.3"
@ -453,6 +502,31 @@ dependencies = [
"lazy_static", "lazy_static",
] ]
[[package]]
name = "crossterm"
version = "0.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c36c10130df424b2f3552fcc2ddcd9b28a27b1e54b358b45874f88d1ca6888c"
dependencies = [
"bitflags",
"crossterm_winapi",
"lazy_static",
"libc",
"mio",
"parking_lot",
"signal-hook",
"winapi",
]
[[package]]
name = "crossterm_winapi"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0da8964ace4d3e4a044fd027919b2237000b24315a37c916f61809f1ff2140b9"
dependencies = [
"winapi",
]
[[package]] [[package]]
name = "crypto-mac" name = "crypto-mac"
version = "0.10.0" version = "0.10.0"
@ -490,6 +564,16 @@ dependencies = [
"syn", "syn",
] ]
[[package]]
name = "ctor"
version = "0.1.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7fbaabec2c953050352311293be5c6aba8e141ba19d6811862b232d6fd020484"
dependencies = [
"quote",
"syn",
]
[[package]] [[package]]
name = "ctr" name = "ctr"
version = "0.6.0" version = "0.6.0"
@ -530,6 +614,16 @@ dependencies = [
"winapi", "winapi",
] ]
[[package]]
name = "dashmap"
version = "4.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e77a43b28d0668df09411cb0bc9a8c2adc40f9a048afe863e05fd43251e8e39c"
dependencies = [
"cfg-if 1.0.0",
"num_cpus",
]
[[package]] [[package]]
name = "data-encoding" name = "data-encoding"
version = "2.3.1" version = "2.3.1"
@ -556,6 +650,26 @@ dependencies = [
"generic-array", "generic-array",
] ]
[[package]]
name = "directories"
version = "3.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e69600ff1703123957937708eb27f7a564e48885c537782722ed0ba3189ce1d7"
dependencies = [
"dirs-sys",
]
[[package]]
name = "dirs-sys"
version = "0.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "03d86534ed367a67548dc68113a0f5db55432fdfbb6e6f9d77704397d95d5780"
dependencies = [
"libc",
"redox_users",
"winapi",
]
[[package]] [[package]]
name = "discard" name = "discard"
version = "1.0.4" version = "1.0.4"
@ -577,6 +691,12 @@ dependencies = [
"dtoa", "dtoa",
] ]
[[package]]
name = "encode_unicode"
version = "0.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f"
[[package]] [[package]]
name = "encoding_rs" name = "encoding_rs"
version = "0.8.26" version = "0.8.26"
@ -640,6 +760,22 @@ dependencies = [
"miniz_oxide 0.3.7", "miniz_oxide 0.3.7",
] ]
[[package]]
name = "flexi_logger"
version = "0.17.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "33ab94b6ac8eb69f1496a6993f26f785b5fd6d99b7416023eb2a6175c0b242b1"
dependencies = [
"atty",
"chrono",
"glob",
"lazy_static",
"log 0.4.14",
"regex",
"thiserror",
"yansi",
]
[[package]] [[package]]
name = "flume" name = "flume"
version = "0.9.2" version = "0.9.2"
@ -685,9 +821,9 @@ dependencies = [
[[package]] [[package]]
name = "futures" name = "futures"
version = "0.3.12" version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da9052a1a50244d8d5aa9bf55cbc2fb6f357c86cc52e46c62ed390a7180cf150" checksum = "a9d5813545e459ad3ca1bff9915e9ad7f1a47dc6a91b627ce321d5863b7dd253"
dependencies = [ dependencies = [
"futures-channel", "futures-channel",
"futures-core", "futures-core",
@ -700,9 +836,9 @@ dependencies = [
[[package]] [[package]]
name = "futures-channel" name = "futures-channel"
version = "0.3.12" version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f2d31b7ec7efab6eefc7c57233bb10b847986139d88cc2f5a02a1ae6871a1846" checksum = "ce79c6a52a299137a6013061e0cf0e688fce5d7f1bc60125f520912fdb29ec25"
dependencies = [ dependencies = [
"futures-core", "futures-core",
"futures-sink", "futures-sink",
@ -710,15 +846,15 @@ dependencies = [
[[package]] [[package]]
name = "futures-core" name = "futures-core"
version = "0.3.12" version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79e5145dde8da7d1b3892dad07a9c98fc04bc39892b1ecc9692cf53e2b780a65" checksum = "098cd1c6dda6ca01650f1a37a794245eb73181d0d4d4e955e2f3c37db7af1815"
[[package]] [[package]]
name = "futures-executor" name = "futures-executor"
version = "0.3.12" version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e9e59fdc009a4b3096bf94f740a0f2424c082521f20a9b08c5c07c48d90fd9b9" checksum = "10f6cb7042eda00f0049b1d2080aa4b93442997ee507eb3828e8bd7577f94c9d"
dependencies = [ dependencies = [
"futures-core", "futures-core",
"futures-task", "futures-task",
@ -727,9 +863,9 @@ dependencies = [
[[package]] [[package]]
name = "futures-io" name = "futures-io"
version = "0.3.12" version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28be053525281ad8259d47e4de5de657b25e7bac113458555bb4b70bc6870500" checksum = "365a1a1fb30ea1c03a830fdb2158f5236833ac81fa0ad12fe35b29cddc35cb04"
[[package]] [[package]]
name = "futures-lite" name = "futures-lite"
@ -748,9 +884,9 @@ dependencies = [
[[package]] [[package]]
name = "futures-macro" name = "futures-macro"
version = "0.3.12" version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c287d25add322d9f9abdcdc5927ca398917996600182178774032e9f8258fedd" checksum = "668c6733a182cd7deb4f1de7ba3bf2120823835b3bcfbeacf7d2c4a773c1bb8b"
dependencies = [ dependencies = [
"proc-macro-hack", "proc-macro-hack",
"proc-macro2", "proc-macro2",
@ -760,24 +896,21 @@ dependencies = [
[[package]] [[package]]
name = "futures-sink" name = "futures-sink"
version = "0.3.12" version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "caf5c69029bda2e743fddd0582d1083951d65cc9539aebf8812f36c3491342d6" checksum = "5c5629433c555de3d82861a7a4e3794a4c40040390907cfbfd7143a92a426c23"
[[package]] [[package]]
name = "futures-task" name = "futures-task"
version = "0.3.12" version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13de07eb8ea81ae445aca7b69f5f7bf15d7bf4912d8ca37d6645c77ae8a58d86" checksum = "ba7aa51095076f3ba6d9a1f702f74bd05ec65f555d70d2033d55ba8d69f581bc"
dependencies = [
"once_cell",
]
[[package]] [[package]]
name = "futures-util" name = "futures-util"
version = "0.3.12" version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "632a8cd0f2a4b3fdea1657f08bde063848c3bd00f9bbf6e256b8be78802e624b" checksum = "3c144ad54d60f23927f0a6b6d816e4271278b64f005ad65e4e35291d2de9c025"
dependencies = [ dependencies = [
"futures-channel", "futures-channel",
"futures-core", "futures-core",
@ -823,6 +956,17 @@ dependencies = [
"wasi 0.9.0+wasi-snapshot-preview1", "wasi 0.9.0+wasi-snapshot-preview1",
] ]
[[package]]
name = "getrandom"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c9495705279e7140bf035dde1f6e750c162df8b625267cd52cc44e0b156732c8"
dependencies = [
"cfg-if 1.0.0",
"libc",
"wasi 0.10.0+wasi-snapshot-preview1",
]
[[package]] [[package]]
name = "ghash" name = "ghash"
version = "0.3.0" version = "0.3.0"
@ -838,6 +982,12 @@ version = "0.23.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6503fe142514ca4799d4c26297c4248239fe8838d827db6bd6065c6ed29a6ce" checksum = "f6503fe142514ca4799d4c26297c4248239fe8838d827db6bd6065c6ed29a6ce"
[[package]]
name = "glob"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574"
[[package]] [[package]]
name = "gloo-timers" name = "gloo-timers"
version = "0.2.1" version = "0.2.1"
@ -851,6 +1001,15 @@ dependencies = [
"web-sys", "web-sys",
] ]
[[package]]
name = "heck"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "87cbf45460356b7deeb5e3415b5563308c0a9b057c85e12b06ad551f98d0a6ac"
dependencies = [
"unicode-segmentation",
]
[[package]] [[package]]
name = "hermit-abi" name = "hermit-abi"
version = "0.1.17" version = "0.1.17"
@ -895,7 +1054,7 @@ version = "0.25.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aafcf38a1a36118242d29b92e1b08ef84e67e4a5ed06e0a80be20e6a32bfed6b" checksum = "aafcf38a1a36118242d29b92e1b08ef84e67e4a5ed06e0a80be20e6a32bfed6b"
dependencies = [ dependencies = [
"log 0.4.11", "log 0.4.14",
"mac", "mac",
"markup5ever", "markup5ever",
"proc-macro2", "proc-macro2",
@ -916,15 +1075,17 @@ dependencies = [
[[package]] [[package]]
name = "http-client" name = "http-client"
version = "6.2.0" version = "6.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "010092b71b94ee49293995625ce7a607778b8b4099c8088fa84fd66bd3e0f21c" checksum = "5566ecc26bc6b04e773e680d66141fced78e091ad818e420d726c152b05a64ff"
dependencies = [ dependencies = [
"async-std", "async-std",
"async-trait", "async-trait",
"cfg-if 1.0.0",
"dashmap",
"http-types", "http-types",
"isahc", "isahc",
"log 0.4.11", "log 0.4.14",
] ]
[[package]] [[package]]
@ -960,6 +1121,18 @@ dependencies = [
"unicode-normalization", "unicode-normalization",
] ]
[[package]]
name = "indicatif"
version = "0.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7baab56125e25686df467fe470785512329883aab42696d661247aca2a2896e4"
dependencies = [
"console",
"lazy_static",
"number_prefix",
"regex",
]
[[package]] [[package]]
name = "infer" name = "infer"
version = "0.2.3" version = "0.2.3"
@ -988,7 +1161,7 @@ dependencies = [
"flume", "flume",
"futures-lite", "futures-lite",
"http", "http",
"log 0.4.11", "log 0.4.14",
"once_cell", "once_cell",
"slab", "slab",
"sluice", "sluice",
@ -1031,7 +1204,7 @@ version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0de8b303297635ad57c9f5059fd9cee7a47f8e8daa09df0fcd07dd39fb22977f" checksum = "0de8b303297635ad57c9f5059fd9cee7a47f8e8daa09df0fcd07dd39fb22977f"
dependencies = [ dependencies = [
"log 0.4.11", "log 0.4.14",
] ]
[[package]] [[package]]
@ -1042,9 +1215,9 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]] [[package]]
name = "libc" name = "libc"
version = "0.2.80" version = "0.2.93"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4d58d1b70b004888f764dfbf6a26a3b0342a1632d33968e4a179d8011c760614" checksum = "9385f66bf6105b241aa65a61cb923ef20efc665cb9f9bb50ac2f0c4b7f378d41"
[[package]] [[package]]
name = "libnghttp2-sys" name = "libnghttp2-sys"
@ -1083,16 +1256,17 @@ version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e19e8d5c34a3e0e2223db8e060f9e8264aeeb5c5fc64a4ee9965c062211c024b" checksum = "e19e8d5c34a3e0e2223db8e060f9e8264aeeb5c5fc64a4ee9965c062211c024b"
dependencies = [ dependencies = [
"log 0.4.11", "log 0.4.14",
] ]
[[package]] [[package]]
name = "log" name = "log"
version = "0.4.11" version = "0.4.14"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fabed175da42fed1fa0746b0ea71f412aa9d35e76e95e59b192c64b9dc2bf8b" checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710"
dependencies = [ dependencies = [
"cfg-if 0.1.10", "cfg-if 1.0.0",
"value-bag",
] ]
[[package]] [[package]]
@ -1107,7 +1281,7 @@ version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aae38d669396ca9b707bfc3db254bc382ddb94f57cc5c235f34623a669a01dab" checksum = "aae38d669396ca9b707bfc3db254bc382ddb94f57cc5c235f34623a669a01dab"
dependencies = [ dependencies = [
"log 0.4.11", "log 0.4.14",
"phf", "phf",
"phf_codegen", "phf_codegen",
"serde", "serde",
@ -1171,6 +1345,28 @@ dependencies = [
"autocfg", "autocfg",
] ]
[[package]]
name = "mio"
version = "0.7.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cf80d3e903b34e0bd7282b218398aec54e082c840d9baf8339e0080a0c542956"
dependencies = [
"libc",
"log 0.4.14",
"miow",
"ntapi",
"winapi",
]
[[package]]
name = "miow"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9f1c5b025cda876f66ef43a113f91ebc9f4ccef34843000e0adf6ebbab84e21"
dependencies = [
"winapi",
]
[[package]] [[package]]
name = "mustache" name = "mustache"
version = "0.9.0" version = "0.9.0"
@ -1203,6 +1399,15 @@ version = "0.1.14"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb" checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb"
[[package]]
name = "ntapi"
version = "0.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f6bb902e437b6d86e03cce10a7e2af662292c5dfef23b65899ea3ac9354ad44"
dependencies = [
"winapi",
]
[[package]] [[package]]
name = "num-integer" name = "num-integer"
version = "0.1.44" version = "0.1.44"
@ -1232,6 +1437,12 @@ dependencies = [
"libc", "libc",
] ]
[[package]]
name = "number_prefix"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "17b02fc0ff9a9e4b35b3342880f48e896ebf69f2967921fe8646bf5b7125956a"
[[package]] [[package]]
name = "object" name = "object"
version = "0.22.0" version = "0.22.0"
@ -1271,18 +1482,26 @@ dependencies = [
[[package]] [[package]]
name = "paperoni" name = "paperoni"
version = "0.3.0-alpha1" version = "0.4.0-alpha1"
dependencies = [ dependencies = [
"async-std", "async-std",
"chrono",
"clap", "clap",
"colored",
"comfy-table",
"directories",
"epub-builder", "epub-builder",
"flexi_logger",
"futures", "futures",
"html5ever", "html5ever",
"indicatif",
"kuchiki", "kuchiki",
"lazy_static", "lazy_static",
"log 0.4.14",
"md5", "md5",
"regex", "regex",
"surf", "surf",
"thiserror",
"url", "url",
] ]
@ -1292,6 +1511,31 @@ version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "427c3892f9e783d91cc128285287e70a59e206ca452770ece88a76f7a3eddd72" checksum = "427c3892f9e783d91cc128285287e70a59e206ca452770ece88a76f7a3eddd72"
[[package]]
name = "parking_lot"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d7744ac029df22dca6284efe4e898991d28e3085c706c972bcd7da4a27a15eb"
dependencies = [
"instant",
"lock_api",
"parking_lot_core",
]
[[package]]
name = "parking_lot_core"
version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa7a782938e745763fe6907fc6ba86946d72f49fe7e21de074e08128a99fb018"
dependencies = [
"cfg-if 1.0.0",
"instant",
"libc",
"redox_syscall 0.2.6",
"smallvec",
"winapi",
]
[[package]] [[package]]
name = "percent-encoding" name = "percent-encoding"
version = "2.1.0" version = "2.1.0"
@ -1404,7 +1648,7 @@ checksum = "a2a7bc6b2a29e632e45451c941832803a18cce6781db04de8a04696cdca8bde4"
dependencies = [ dependencies = [
"cfg-if 0.1.10", "cfg-if 0.1.10",
"libc", "libc",
"log 0.4.11", "log 0.4.14",
"wepoll-sys", "wepoll-sys",
"winapi", "winapi",
] ]
@ -1480,7 +1724,7 @@ version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03"
dependencies = [ dependencies = [
"getrandom", "getrandom 0.1.15",
"libc", "libc",
"rand_chacha", "rand_chacha",
"rand_core 0.5.1", "rand_core 0.5.1",
@ -1519,7 +1763,7 @@ version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19"
dependencies = [ dependencies = [
"getrandom", "getrandom 0.1.15",
] ]
[[package]] [[package]]
@ -1556,22 +1800,40 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41cc0f7e4d5d4544e8861606a285bb08d3e70712ccc7d2b84d7c0ccfaf4b05ce" checksum = "41cc0f7e4d5d4544e8861606a285bb08d3e70712ccc7d2b84d7c0ccfaf4b05ce"
[[package]] [[package]]
name = "regex" name = "redox_syscall"
version = "1.4.2" version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38cf2c13ed4745de91a5eb834e11c00bcc3709e773173b2ce4c56c9fbde04b9c" checksum = "8270314b5ccceb518e7e578952f0b72b88222d02e8f77f5ecf7abbb673539041"
dependencies = [
"bitflags",
]
[[package]]
name = "redox_users"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "528532f3d801c87aec9def2add9ca802fe569e44a544afe633765267840abe64"
dependencies = [
"getrandom 0.2.2",
"redox_syscall 0.2.6",
]
[[package]]
name = "regex"
version = "1.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "957056ecddbeba1b26965114e191d2e8589ce74db242b6ea25fc4062427a5c19"
dependencies = [ dependencies = [
"aho-corasick", "aho-corasick",
"memchr", "memchr",
"regex-syntax", "regex-syntax",
"thread_local",
] ]
[[package]] [[package]]
name = "regex-syntax" name = "regex-syntax"
version = "0.6.21" version = "0.6.23"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b181ba2dcf07aaccad5448e8ead58db5b742cf85dfe035e2227f137a539a189" checksum = "24d5f089152e60f62d28b835fbff2cd2e8dc0baf1ac13343bef92ab7eed84548"
[[package]] [[package]]
name = "remove_dir_all" name = "remove_dir_all"
@ -1629,7 +1891,7 @@ dependencies = [
"cssparser", "cssparser",
"derive_more", "derive_more",
"fxhash", "fxhash",
"log 0.4.11", "log 0.4.14",
"matches", "matches",
"phf", "phf",
"phf_codegen", "phf_codegen",
@ -1738,6 +2000,26 @@ dependencies = [
"opaque-debug", "opaque-debug",
] ]
[[package]]
name = "signal-hook"
version = "0.1.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7e31d442c16f047a671b5a71e2161d6e68814012b7f5379d269ebd915fac2729"
dependencies = [
"libc",
"mio",
"signal-hook-registry",
]
[[package]]
name = "signal-hook-registry"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "16f1d0fef1604ba8f7a073c7e701f213e056707210e9020af4528e0101ce11a6"
dependencies = [
"libc",
]
[[package]] [[package]]
name = "siphasher" name = "siphasher"
version = "0.3.3" version = "0.3.3"
@ -1763,9 +2045,9 @@ dependencies = [
[[package]] [[package]]
name = "smallvec" name = "smallvec"
version = "1.5.0" version = "1.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7acad6f34eb9e8a259d3283d1e8c1d34d7415943d4895f65cc73813c7396fc85" checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e"
[[package]] [[package]]
name = "socket2" name = "socket2"
@ -1775,7 +2057,7 @@ checksum = "2c29947abdee2a218277abeca306f25789c938e500ea5a9d4b12a5a504466902"
dependencies = [ dependencies = [
"cfg-if 1.0.0", "cfg-if 1.0.0",
"libc", "libc",
"redox_syscall", "redox_syscall 0.1.57",
"winapi", "winapi",
] ]
@ -1883,6 +2165,24 @@ version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a"
[[package]]
name = "strum"
version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7318c509b5ba57f18533982607f24070a55d353e90d4cae30c467cdb2ad5ac5c"
[[package]]
name = "strum_macros"
version = "0.20.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee8bc6b87a5112aeeab1f4a9f7ab634fe6cbefc4850006df31267f4cfb9e3149"
dependencies = [
"heck",
"proc-macro2",
"quote",
"syn",
]
[[package]] [[package]]
name = "subtle" name = "subtle"
version = "2.3.0" version = "2.3.0"
@ -1891,21 +2191,21 @@ checksum = "343f3f510c2915908f155e94f17220b19ccfacf2a64a2a5d8004f2c3e311e7fd"
[[package]] [[package]]
name = "surf" name = "surf"
version = "2.1.0" version = "2.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7189c787d96fe18fef704950de76d590022d9d70858a4a201e1f07a0666882ea" checksum = "2a154d33ca6b5e1fe6fd1c760e5a5cc1202425f6cca2e13229f16a69009f6328"
dependencies = [ dependencies = [
"async-std", "async-std",
"async-trait", "async-trait",
"cfg-if 0.1.10", "cfg-if 1.0.0",
"encoding_rs", "encoding_rs",
"futures-util", "futures-util",
"http-client", "http-client",
"http-types", "http-types",
"log 0.4.11", "log 0.4.14",
"mime_guess", "mime_guess",
"once_cell", "once_cell",
"pin-project-lite 0.1.11", "pin-project-lite 0.2.4",
"serde", "serde",
"serde_json", "serde_json",
"web-sys", "web-sys",
@ -1943,6 +2243,16 @@ dependencies = [
"utf-8", "utf-8",
] ]
[[package]]
name = "terminal_size"
version = "0.1.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "86ca8ced750734db02076f44132d802af0b33b09942331f4459dde8636fd2406"
dependencies = [
"libc",
"winapi",
]
[[package]] [[package]]
name = "textwrap" name = "textwrap"
version = "0.11.0" version = "0.11.0"
@ -1960,33 +2270,24 @@ checksum = "8eaa81235c7058867fa8c0e7314f33dcce9c215f535d1913822a2b3f5e289f3c"
[[package]] [[package]]
name = "thiserror" name = "thiserror"
version = "1.0.22" version = "1.0.24"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0e9ae34b84616eedaaf1e9dd6026dbe00dcafa92aa0c8077cb69df1fcfe5e53e" checksum = "e0f4a65597094d4483ddaed134f409b2cb7c1beccf25201a9f73c719254fa98e"
dependencies = [ dependencies = [
"thiserror-impl", "thiserror-impl",
] ]
[[package]] [[package]]
name = "thiserror-impl" name = "thiserror-impl"
version = "1.0.22" version = "1.0.24"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ba20f23e85b10754cd195504aebf6a27e2e6cbe28c17778a0c930724628dd56" checksum = "7765189610d8241a44529806d6fd1f2e0a08734313a35d5b3a556f92b381f3c0"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn", "syn",
] ]
[[package]]
name = "thread_local"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14"
dependencies = [
"lazy_static",
]
[[package]] [[package]]
name = "time" name = "time"
version = "0.1.44" version = "0.1.44"
@ -2058,7 +2359,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b0987850db3733619253fe60e17cb59b82d37c7e6c0236bb81e4d6b87c879f27" checksum = "b0987850db3733619253fe60e17cb59b82d37c7e6c0236bb81e4d6b87c879f27"
dependencies = [ dependencies = [
"cfg-if 0.1.10", "cfg-if 0.1.10",
"log 0.4.11", "log 0.4.14",
"pin-project-lite 0.1.11", "pin-project-lite 0.1.11",
"tracing-attributes", "tracing-attributes",
"tracing-core", "tracing-core",
@ -2127,6 +2428,12 @@ dependencies = [
"tinyvec", "tinyvec",
] ]
[[package]]
name = "unicode-segmentation"
version = "1.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bb0d2e7be6ae3a5fa87eed5fb451aff96f2573d2694942e40543ae0bbe19c796"
[[package]] [[package]]
name = "unicode-width" name = "unicode-width"
version = "0.1.8" version = "0.1.8"
@ -2151,9 +2458,9 @@ dependencies = [
[[package]] [[package]]
name = "url" name = "url"
version = "2.2.0" version = "2.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5909f2b0817350449ed73e8bcd81c8c3c8d9a7a5d8acba4b27db277f1868976e" checksum = "9ccd964113622c8e9322cfac19eb1004a07e636c545f325da085d5cdde6f1f8b"
dependencies = [ dependencies = [
"form_urlencoded", "form_urlencoded",
"idna", "idna",
@ -2183,6 +2490,15 @@ dependencies = [
"rand 0.7.3", "rand 0.7.3",
] ]
[[package]]
name = "value-bag"
version = "1.0.0-alpha.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6b676010e055c99033117c2343b33a40a30b91fecd6c49055ac9cd2d6c305ab1"
dependencies = [
"ctor",
]
[[package]] [[package]]
name = "vcpkg" name = "vcpkg"
version = "0.2.10" version = "0.2.10"
@ -2243,7 +2559,7 @@ checksum = "f22b422e2a757c35a73774860af8e112bff612ce6cb604224e8e47641a9e4f68"
dependencies = [ dependencies = [
"bumpalo", "bumpalo",
"lazy_static", "lazy_static",
"log 0.4.11", "log 0.4.14",
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn", "syn",
@ -2332,6 +2648,12 @@ version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "yansi"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9fc79f4a1e39857fc00c3f662cbf2651c771f00e9c15fe2abc341806bd46bd71"
[[package]] [[package]]
name = "zip" name = "zip"
version = "0.5.8" version = "0.5.8"

Cargo.toml

@@ -3,7 +3,7 @@ description = "A web article downloader"
 homepage = "https://github.com/hipstermojo/paperoni"
 repository = "https://github.com/hipstermojo/paperoni"
 name = "paperoni"
-version = "0.3.0-alpha1"
+version = "0.4.0-alpha1"
 authors = ["Kenneth Gitere <gitere81@gmail.com>"]
 edition = "2018"
 license = "MIT"
@@ -12,14 +12,23 @@ readme = "README.md"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 [dependencies]
-async-std = "1.7.0"
+async-std = "1.9.0"
+# atty = "0.2.14"
+chrono = "0.4.19"
 clap = "2.33.3"
+colored = "2.0.0"
+comfy-table = "2.1.0"
+directories = "3.0.2"
 epub-builder = "0.4.8"
-futures = "0.3.12"
+flexi_logger = "0.17.1"
+futures = "0.3.14"
 html5ever = "0.25.1"
+indicatif = "0.15.0"
 kuchiki = "0.8.1"
 lazy_static = "1.4.0"
+log = "0.4.14"
 md5 = "0.7.0"
-regex = "1.4.2"
+regex = "1.4.5"
-surf = "2.1.0"
+surf = "2.2.0"
-url = "2.2.0"
+thiserror = "1.0.24"
+url = "2.2.1"

README.md

@@ -1,8 +1,10 @@
+![crates.io](https://img.shields.io/crates/v/paperoni.svg)
 <p align="center"><img src="./paperoni-dark.png"></p>
 <p align="center"><i>Salami not included</i></p>
-Paperoni is a web article downloader written in Rust. The downloaded articles are then exported as EPUB files.
+Paperoni is a CLI tool made in Rust for downloading web articles as EPUBs.
 > This project is in an alpha release so it might crash when you use it. Please open an [issue on Github](https://github.com/hipstermojo/paperoni/issues/new) if it does crash.
@@ -17,7 +19,7 @@ Check the [releases](https://github.com/hipstermojo/paperoni/releases) page for
 Paperoni is published on [crates.io](https://crates.io). If you have [cargo](https://github.com/rust-lang/cargo) installed, then run:
 ```sh
-cargo install paperoni --version 0.3.0-alpha1
+cargo install paperoni --version 0.4.0-alpha1
 ```
 _Paperoni is still in alpha so the `version` flag has to be passed._
@@ -37,6 +39,27 @@ cargo run -- # pass your url here
 ## Usage
+
+```
+USAGE:
+    paperoni [OPTIONS] [urls]...
+
+OPTIONS:
+    -f, --file <file>          Input file containing links
+    -h, --help                 Prints help information
+        --log-to-file          Enables logging of events to a file located in .paperoni/logs with a default log level
+                               of debug. Use -v to specify the logging level
+        --max_conn <max_conn>  The maximum number of concurrent HTTP connections when downloading articles. Default is
+                               8
+        --merge <output_name>  Merge multiple articles into a single epub
+    -V, --version              Prints version information
+    -v                         Enables logging of events and sets the verbosity level. Use -h to read on its usage
+
+ARGS:
+    <urls>...    Urls of web articles
+```
+
+To download a single article, pass in its URL
 ```sh
 paperoni https://en.wikipedia.org/wiki/Pepperoni
 ```
@@ -68,10 +91,23 @@ into a single epub using the `merge` flag and specifying the output file.
 paperoni -f links.txt --merge out.epub
 ```
+
+### Logging events
+
+Logging is disabled by default. It can be activated by passing either the `-v` flag or the `--log-to-file` flag. If the `--log-to-file` flag is passed, the logs are sent to a file in the default Paperoni directory `.paperoni/logs`, which is in your home directory. The `-v` flag configures the verbosity levels such that:
+
+```
+-v    Logs only the error level
+-vv   Logs only the warn level
+-vvv  Logs only the info level
+-vvvv Logs only the debug level
+```
+
+If only the `-v` flag is passed, the progress bars are disabled. If both `-v` and `--log-to-file` are passed, then the progress bars will still be shown.
 ## How it works
 The URL passed to Paperoni is fetched and the returned HTML response is passed to the extractor.
-This extractor retrieves a possible article using a port of the [Mozilla Readability algorithm](https://github.com/mozilla/readability). This article is then saved in an EPUB.
+This extractor retrieves a possible article using a [custom port](https://github.com/hipstermojo/paperoni/blob/master/src/moz_readability/mod.rs) of the [Mozilla Readability algorithm](https://github.com/mozilla/readability). This article is then saved in an EPUB.
 > The port of the algorithm is still unstable as well so it is not fully compatible with all the websites that can be extracted using Readability.
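For readers skimming the diff, here is a rough sketch of how the pieces changed in this release fit together. The function and type names come from the files below (`http::fetch_html`, `extractor::Extractor`, `epub::generate_epubs`); the actual wiring in main.rs is not shown in this diff and may differ, and image downloading is omitted.

```rust
// Illustrative wiring of the v0.4.0 pipeline, pieced together from the
// functions touched in this PR. Not the actual main.rs.
async fn run(app_config: &cli::AppConfig) -> Result<(), Vec<errors::PaperoniError>> {
    let mut articles = Vec::new();
    for url in app_config.urls() {
        // fetch_html follows up to 5 redirects and returns (final_url, html_body)
        let (final_url, html) = http::fetch_html(url).await.map_err(|e| vec![e])?;
        let mut extractor = extractor::Extractor::from_html(&html, &final_url);
        // Runs the Readability port; returns Err(PaperoniError) instead of panicking
        extractor.extract_content().map_err(|e| vec![e])?;
        extractor.extract_img_urls();
        articles.push(extractor);
    }
    // One epub per article, or a single merged epub when --merge was passed
    let mut table = comfy_table::Table::new();
    epub::generate_epubs(articles, app_config, &mut table)
}
```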
@@ -82,3 +118,5 @@ This program is still in alpha so a number of things won't work:
 - Websites that only run with JavaScript cannot be extracted.
 - Website articles that cannot be extracted by Readability cannot be extracted by Paperoni either.
 - Code snippets on Medium articles that are lazy loaded will not appear in the EPUB.
+
+There are also web pages it won't work on in general, such as Twitter and Reddit threads.

src/cli.rs

@ -1,6 +1,10 @@
use std::{fs::File, io::Read}; use std::{fs::File, io::Read, path::Path};
use chrono::{DateTime, Local};
use clap::{App, AppSettings, Arg}; use clap::{App, AppSettings, Arg};
use flexi_logger::LevelFilter as LogLevel;
use crate::logs::init_logger;
pub fn cli_init() -> AppConfig { pub fn cli_init() -> AppConfig {
let app = App::new("paperoni") let app = App::new("paperoni")
@ -8,12 +12,9 @@ pub fn cli_init() -> AppConfig {
AppSettings::ArgRequiredElseHelp, AppSettings::ArgRequiredElseHelp,
AppSettings::UnifiedHelpMessage, AppSettings::UnifiedHelpMessage,
]) ])
.version("0.3.0-alpha1") .version(clap::crate_version!())
.about( .about(
" "Paperoni is a CLI tool made in Rust for downloading web articles as EPUBs",
Paperoni is an article downloader.
It takes a url and downloads the article content from it and saves it to an epub.
",
) )
.arg( .arg(
Arg::with_name("urls") Arg::with_name("urls")
@ -38,8 +39,29 @@ It takes a url and downloads the article content from it and saves it to an epub
.long("max_conn") .long("max_conn")
.help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8") .help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8")
.long_help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8.\nNOTE: It is advised to use as few connections as needed i.e between 1 and 50. Using more connections can end up overloading your network card with too many concurrent requests.") .long_help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8.\nNOTE: It is advised to use as few connections as needed i.e between 1 and 50. Using more connections can end up overloading your network card with too many concurrent requests.")
.takes_value(true)); .takes_value(true))
.arg(
Arg::with_name("verbosity")
.short("v")
.multiple(true)
.help("Enables logging of events and set the verbosity level. Use --help to read on its usage")
.long_help(
"This takes upto 4 levels of verbosity in the following order.
- Error (-v)
- Warn (-vv)
- Info (-vvv)
- Debug (-vvvv)
When this flag is passed, it disables the progress bars and logs to stderr.
If you would like to send the logs to a file (and enable progress bars), pass the log-to-file flag."
)
.takes_value(false))
.arg(
Arg::with_name("log-to-file")
.long("log-to-file")
.help("Enables logging of events to a file located in .paperoni/logs with a default log level of debug. Use -v to specify the logging level")
.takes_value(false));
let arg_matches = app.get_matches(); let arg_matches = app.get_matches();
let mut urls: Vec<String> = match arg_matches.value_of("file") { let mut urls: Vec<String> = match arg_matches.value_of("file") {
Some(file_name) => { Some(file_name) => {
if let Ok(mut file) = File::open(file_name) { if let Ok(mut file) = File::open(file_name) {
@ -76,14 +98,51 @@ It takes a url and downloads the article content from it and saves it to an epub
let mut app_config = AppConfig::new(max_conn); let mut app_config = AppConfig::new(max_conn);
app_config.set_urls(urls); app_config.set_urls(urls);
if let Some(name) = arg_matches.value_of("output_name") { if let Some(name) = arg_matches.value_of("output_name") {
let file_name = if name.ends_with(".epub") && name.len() > 5 { let file_path = Path::new(name);
if file_path.is_dir() {
eprintln!("{:?} is a directory", name);
std::process::exit(1);
}
let file_name = if file_path.extension().is_some() {
name.to_owned() name.to_owned()
} else { } else {
name.to_owned() + ".epub" name.to_owned() + ".epub"
}; };
app_config.set_merged(file_name);
match std::fs::File::create(&file_name) {
Ok(_) => (),
Err(e) => {
eprintln!("Unable to create file {:?}\n{}", file_path, e);
std::process::exit(1)
}
}
app_config.merged = Some(file_name);
} }
if arg_matches.is_present("verbosity") {
if !arg_matches.is_present("log-to-file") {
app_config.can_disable_progress_bar = true;
}
let log_levels: [LogLevel; 5] = [
LogLevel::Off,
LogLevel::Error,
LogLevel::Warn,
LogLevel::Info,
LogLevel::Debug,
];
let level = arg_matches.occurrences_of("verbosity").clamp(0, 4) as usize;
app_config.log_level = log_levels[level];
}
if arg_matches.is_present("log-to-file") {
app_config.log_level = LogLevel::Debug;
app_config.is_logging_to_file = true;
}
init_logger(&app_config);
app_config app_config
} }
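cli_init now ends by calling `crate::logs::init_logger(&app_config)`; that module is one of the 12 changed files but is not shown in this diff. Purely as an assumption, a minimal initializer built on flexi_logger 0.17's builder API (`Logger::with_str`, `log_to_file`, `directory`, `start`) could look roughly like this; the real src/logs.rs may be structured differently.

```rust
// Hypothetical sketch of src/logs.rs (not shown in this diff), assuming the
// pre-0.19 flexi_logger builder API. Directory handling is simplified; the
// real code presumably resolves ~/.paperoni/logs via the directories crate.
use flexi_logger::Logger;

use crate::cli::AppConfig;

pub fn init_logger(app_config: &AppConfig) {
    if app_config.log_level() == flexi_logger::LevelFilter::Off {
        return; // logging stays off unless -v or --log-to-file was passed
    }
    // LevelFilter's Display form ("ERROR", "DEBUG", ...) doubles as a log spec string
    let logger = Logger::with_str(app_config.log_level().to_string().to_lowercase());
    let result = if app_config.is_logging_to_file() {
        logger.log_to_file().directory("logs").start()
    } else {
        logger.start()
    };
    if let Err(e) = result {
        eprintln!("Unable to start the logger: {}", e);
    }
}
```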
@ -91,6 +150,10 @@ pub struct AppConfig {
urls: Vec<String>, urls: Vec<String>,
max_conn: usize, max_conn: usize,
merged: Option<String>, merged: Option<String>,
log_level: LogLevel,
can_disable_progress_bar: bool,
start_time: DateTime<Local>,
is_logging_to_file: bool,
} }
impl AppConfig { impl AppConfig {
@ -99,6 +162,10 @@ impl AppConfig {
urls: vec![], urls: vec![],
max_conn, max_conn,
merged: None, merged: None,
log_level: LogLevel::Off,
can_disable_progress_bar: false,
start_time: Local::now(),
is_logging_to_file: false,
} }
} }
@ -106,10 +173,6 @@ impl AppConfig {
self.urls.extend(urls); self.urls.extend(urls);
} }
fn set_merged(&mut self, name: String) {
self.merged = Some(name);
}
pub fn urls(&self) -> &Vec<String> { pub fn urls(&self) -> &Vec<String> {
&self.urls &self.urls
} }
@ -120,4 +183,20 @@ impl AppConfig {
pub fn merged(&self) -> Option<&String> { pub fn merged(&self) -> Option<&String> {
self.merged.as_ref() self.merged.as_ref()
} }
pub fn log_level(&self) -> LogLevel {
self.log_level
}
pub fn can_disable_progress_bar(&self) -> bool {
self.can_disable_progress_bar
}
pub fn start_time(&self) -> &DateTime<Local> {
&self.start_time
}
pub fn is_logging_to_file(&self) -> bool {
self.is_logging_to_file
}
} }

src/epub.rs

@ -1,32 +1,159 @@
use std::fs::File; use std::fs::File;
use comfy_table::{Attribute, Cell, CellAlignment, Color, ContentArrangement, Table};
use epub_builder::{EpubBuilder, EpubContent, ZipLibrary}; use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
use indicatif::{ProgressBar, ProgressStyle};
use log::{debug, info};
use crate::extractor::{self, Extractor}; use crate::{
cli::AppConfig,
errors::PaperoniError,
extractor::{self, Extractor},
};
pub fn generate_epubs(articles: Vec<Extractor>, merged: Option<&String>) { pub fn generate_epubs(
match merged { articles: Vec<Extractor>,
app_config: &AppConfig,
successful_articles_table: &mut Table,
) -> Result<(), Vec<PaperoniError>> {
let bar = if app_config.can_disable_progress_bar() {
ProgressBar::hidden()
} else {
let enabled_bar = ProgressBar::new(articles.len() as u64);
let style = ProgressStyle::default_bar().template(
"{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} epub {pos}/{len:7} {msg:.green}",
);
enabled_bar.set_style(style);
if !articles.is_empty() {
enabled_bar.set_message("Generating epubs");
}
enabled_bar
};
let mut errors: Vec<PaperoniError> = Vec::new();
match app_config.merged() {
Some(name) => { Some(name) => {
let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap(); successful_articles_table.set_header(vec![Cell::new("Table of Contents")
.add_attribute(Attribute::Bold)
.set_alignment(CellAlignment::Center)
.fg(Color::Green)]);
let mut epub = match EpubBuilder::new(match ZipLibrary::new() {
Ok(zip_library) => zip_library,
Err(err) => {
let mut paperoni_err: PaperoniError = err.into();
paperoni_err.set_article_source(name);
errors.push(paperoni_err);
return Err(errors);
}
}) {
Ok(epub) => epub,
Err(err) => {
let mut paperoni_err: PaperoniError = err.into();
paperoni_err.set_article_source(name);
errors.push(paperoni_err);
return Err(errors);
}
};
debug!("Creating {:?}", name);
epub.inline_toc(); epub.inline_toc();
epub = articles articles
.iter() .iter()
.enumerate() .enumerate()
.fold(epub, |mut epub, (idx, article)| { .fold(&mut epub, |epub, (idx, article)| {
let mut article_result = || -> Result<(), PaperoniError> {
let mut html_buf = Vec::new();
extractor::serialize_to_xhtml(article.article(), &mut html_buf)?;
let html_str = std::str::from_utf8(&html_buf)?;
epub.metadata("title", replace_metadata_value(name))?;
let section_name = article.metadata().title();
epub.add_content(
EpubContent::new(format!("article_{}.xhtml", idx), html_str.as_bytes())
.title(replace_metadata_value(section_name)),
)?;
info!("Adding images for {:?}", name);
article.img_urls.iter().for_each(|img| {
// TODO: Add error handling and return errors as a vec
let mut file_path = std::env::temp_dir();
file_path.push(&img.0);
let img_buf = File::open(&file_path).expect("Can't read file");
epub.add_resource(
file_path.file_name().unwrap(),
img_buf,
img.1.as_ref().unwrap(),
)
.unwrap();
});
info!("Added images for {:?}", name);
Ok(())
};
if let Err(mut error) = article_result() {
error.set_article_source(&article.url);
errors.push(error);
}
bar.inc(1);
successful_articles_table.add_row(vec![article.metadata().title()]);
epub
});
let appendix = generate_appendix(articles.iter().collect());
if let Err(err) = epub.add_content(
EpubContent::new("appendix.xhtml", appendix.as_bytes())
.title(replace_metadata_value("Article Sources")),
) {
let mut paperoni_err: PaperoniError = err.into();
paperoni_err.set_article_source(name);
errors.push(paperoni_err);
return Err(errors);
}
let mut out_file = File::create(&name).unwrap();
match epub.generate(&mut out_file) {
Ok(_) => (),
Err(err) => {
let mut paperoni_err: PaperoniError = err.into();
paperoni_err.set_article_source(name);
errors.push(paperoni_err);
return Err(errors);
}
}
bar.finish_with_message("Generated epub\n");
debug!("Created {:?}", name);
println!("Created {:?}", name);
}
None => {
successful_articles_table
.set_header(vec![Cell::new("Downloaded articles")
.add_attribute(Attribute::Bold)
.set_alignment(CellAlignment::Center)
.fg(Color::Green)])
.set_content_arrangement(ContentArrangement::Dynamic);
for article in &articles {
let mut result = || -> Result<(), PaperoniError> {
let mut epub = EpubBuilder::new(ZipLibrary::new()?)?;
let file_name = format!(
"{}.epub",
article
.metadata()
.title()
.replace("/", " ")
.replace("\\", " ")
);
debug!("Creating {:?}", file_name);
let mut out_file = File::create(&file_name).unwrap();
let mut html_buf = Vec::new(); let mut html_buf = Vec::new();
extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf) extractor::serialize_to_xhtml(article.article(), &mut html_buf)
.expect("Unable to serialize to xhtml"); .expect("Unable to serialize to xhtml");
let html_str = std::str::from_utf8(&html_buf).unwrap(); let html_str = std::str::from_utf8(&html_buf).unwrap();
epub.metadata("title", replace_metadata_value(name)) if let Some(author) = article.metadata().byline() {
.unwrap(); epub.metadata("author", replace_metadata_value(author))?;
let section_name = article.metadata().title(); }
epub.add_content( epub.metadata("title", replace_metadata_value(article.metadata().title()))?;
EpubContent::new(format!("article_{}.xhtml", idx), html_str.as_bytes()) epub.add_content(EpubContent::new("index.xhtml", html_str.as_bytes()))?;
.title(replace_metadata_value(section_name)), for img in &article.img_urls {
)
.unwrap();
article.img_urls.iter().for_each(|img| {
let mut file_path = std::env::temp_dir(); let mut file_path = std::env::temp_dir();
file_path.push(&img.0); file_path.push(&img.0);
@ -35,52 +162,35 @@ pub fn generate_epubs(articles: Vec<Extractor>, merged: Option<&String>) {
file_path.file_name().unwrap(), file_path.file_name().unwrap(),
img_buf, img_buf,
img.1.as_ref().unwrap(), img.1.as_ref().unwrap(),
) )?;
.unwrap(); }
}); let appendix = generate_appendix(vec![&article]);
epub epub.add_content(
}); EpubContent::new("appendix.xhtml", appendix.as_bytes())
let mut out_file = File::create(&name).unwrap(); .title(replace_metadata_value("Article Source")),
epub.generate(&mut out_file).unwrap(); )?;
println!("Created {:?}", name); epub.generate(&mut out_file)?;
} bar.inc(1);
None => {
for article in articles {
let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
let file_name = format!(
"{}.epub",
article
.metadata()
.title()
.replace("/", " ")
.replace("\\", " ")
);
let mut out_file = File::create(&file_name).unwrap();
let mut html_buf = Vec::new();
extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)
.expect("Unable to serialize to xhtml");
let html_str = std::str::from_utf8(&html_buf).unwrap();
if let Some(author) = article.metadata().byline() {
epub.metadata("author", replace_metadata_value(author))
.unwrap();
}
epub.metadata("title", replace_metadata_value(article.metadata().title()))
.unwrap();
epub.add_content(EpubContent::new("index.xhtml", html_str.as_bytes()))
.unwrap();
for img in article.img_urls {
let mut file_path = std::env::temp_dir();
file_path.push(&img.0);
let img_buf = File::open(&file_path).expect("Can't read file"); successful_articles_table.add_row(vec![article.metadata().title()]);
epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap())
.unwrap(); debug!("Created {:?}", file_name);
Ok(())
};
if let Err(mut error) = result() {
error.set_article_source(&article.url);
errors.push(error);
} }
epub.generate(&mut out_file).unwrap();
println!("Created {:?}", file_name);
} }
bar.finish_with_message("Generated epubs\n");
} }
} }
if errors.is_empty() {
Ok(())
} else {
Err(errors)
}
} }
/// Replaces characters that have to be escaped before adding to the epub's metadata /// Replaces characters that have to be escaped before adding to the epub's metadata
@ -91,6 +201,37 @@ fn replace_metadata_value(value: &str) -> String {
.replace(">", "&gt;") .replace(">", "&gt;")
} }
//TODO: The type signature of the argument should change as it requires that merged articles create an entirely new Vec of references
fn generate_appendix(articles: Vec<&Extractor>) -> String {
let link_tags: String = articles
.iter()
.map(|article| {
let article_name = if !article.metadata().title().is_empty() {
article.metadata().title()
} else {
&article.url
};
format!(
"<a href=\"{}\">{}</a><br></br>",
replace_metadata_value(&article.url),
replace_metadata_value(article_name)
)
})
.collect();
let template = format!(
r#"<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
<head>
</head>
<body>
<h2>Appendix</h2><h3>Article sources</h3>
{}
</body>
</html>"#,
link_tags
);
template
}
#[cfg(test)] #[cfg(test)]
mod test { mod test {
use super::replace_metadata_value; use super::replace_metadata_value;
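Since generate_epubs now returns `Result<(), Vec<PaperoniError>>` instead of unwrapping everywhere, the caller is responsible for printing the successes table and the error list. A hedged sketch of what that call site could look like, assuming it lives inside the crate with `Extractor`, `AppConfig`, `generate_epubs`, and `PaperoniError` in scope (main.rs is not part of the lines shown here, so treat the details as assumptions):

```rust
// Illustrative call site for the new error-returning generate_epubs.
use colored::Colorize;
use comfy_table::Table;

fn run_epub_generation(articles: Vec<Extractor>, app_config: &AppConfig) {
    let mut successful_articles_table = Table::new();
    match generate_epubs(articles, app_config, &mut successful_articles_table) {
        // On success, print the table of downloaded articles / table of contents
        Ok(()) => println!("{}", successful_articles_table),
        Err(errors) => {
            for error in errors {
                // article_source() names the URL or output file that failed
                eprintln!(
                    "{} {} ({})",
                    "ERROR:".red(),
                    error,
                    error.article_source().as_deref().unwrap_or("unknown source")
                );
            }
        }
    }
}
```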

src/errors.rs (new file): 126 lines

@ -0,0 +1,126 @@
use thiserror::Error;
#[derive(Error, Debug)]
pub enum ErrorKind {
#[error("[EpubError]: {0}")]
EpubError(String),
#[error("[HTTPError]: {0}")]
HTTPError(String),
#[error("[IOError]: {0}")]
IOError(String),
#[error("[UTF8Error]: {0}")]
UTF8Error(String),
#[error("[ReadabilityError]: {0}")]
ReadabilityError(String),
}
#[derive(Error, Debug)]
#[error("{kind}")]
/// Used to represent errors from downloading images. Errors from here are used solely for debugging
/// as they are considered recoverable.
pub struct ImgError {
kind: ErrorKind,
url: Option<String>,
}
impl ImgError {
pub fn with_kind(kind: ErrorKind) -> Self {
ImgError { url: None, kind }
}
pub fn set_url(&mut self, url: &str) {
self.url = Some(url.to_string());
}
pub fn url(&self) -> &Option<String> {
&self.url
}
}
impl From<ErrorKind> for ImgError {
fn from(kind: ErrorKind) -> Self {
ImgError::with_kind(kind)
}
}
impl From<surf::Error> for ImgError {
fn from(err: surf::Error) -> Self {
ImgError::with_kind(ErrorKind::HTTPError(err.to_string()))
}
}
impl From<url::ParseError> for ImgError {
fn from(err: url::ParseError) -> Self {
ImgError::with_kind(ErrorKind::HTTPError(err.to_string()))
}
}
impl From<std::io::Error> for ImgError {
fn from(err: std::io::Error) -> Self {
ImgError::with_kind(ErrorKind::IOError(err.to_string()))
}
}
#[derive(Error, Debug)]
#[error("{kind}")]
pub struct PaperoniError {
article_source: Option<String>,
kind: ErrorKind,
}
impl PaperoniError {
pub fn with_kind(kind: ErrorKind) -> Self {
PaperoniError {
article_source: None,
kind,
}
}
pub fn kind(&self) -> &ErrorKind {
&self.kind
}
pub fn article_source(&self) -> &Option<String> {
&self.article_source
}
pub fn set_article_source(&mut self, article_source: &str) {
self.article_source = Some(article_source.to_owned());
}
}
impl From<ErrorKind> for PaperoniError {
fn from(kind: ErrorKind) -> Self {
PaperoniError::with_kind(kind)
}
}
impl From<epub_builder::Error> for PaperoniError {
fn from(err: epub_builder::Error) -> Self {
PaperoniError::with_kind(ErrorKind::EpubError(err.description().to_owned()))
}
}
impl From<surf::Error> for PaperoniError {
fn from(err: surf::Error) -> Self {
PaperoniError::with_kind(ErrorKind::HTTPError(err.to_string()))
}
}
impl From<url::ParseError> for PaperoniError {
fn from(err: url::ParseError) -> Self {
PaperoniError::with_kind(ErrorKind::HTTPError(err.to_string()))
}
}
impl From<std::io::Error> for PaperoniError {
fn from(err: std::io::Error) -> Self {
PaperoniError::with_kind(ErrorKind::IOError(err.to_string()))
}
}
impl From<std::str::Utf8Error> for PaperoniError {
fn from(err: std::str::Utf8Error) -> Self {
PaperoniError::with_kind(ErrorKind::UTF8Error(err.to_string()))
}
}
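The From implementations above are what allow the rest of the crate to use the `?` operator and attach context afterwards. A small illustrative helper (not part of errors.rs, names made up) showing the intended pattern:

```rust
// Illustrative only: shows how the From impls above are meant to be used.
// `?` converts std errors into PaperoniError automatically, and the caller
// then tags the error with the article it came from.
fn read_cached_article(path: &str, source_url: &str) -> Result<String, PaperoniError> {
    let attempt = || -> Result<String, PaperoniError> {
        let bytes = std::fs::read(path)?;          // std::io::Error -> ErrorKind::IOError
        let text = std::str::from_utf8(&bytes)?;   // Utf8Error -> ErrorKind::UTF8Error
        Ok(text.to_owned())
    };
    attempt().map_err(|mut e| {
        e.set_article_source(source_url);
        e
    })
}
```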

src/extractor.rs

@ -2,6 +2,7 @@ use std::collections::HashMap;
use kuchiki::{traits::*, NodeRef}; use kuchiki::{traits::*, NodeRef};
use crate::errors::PaperoniError;
use crate::moz_readability::{MetaData, Readability}; use crate::moz_readability::{MetaData, Readability};
pub type ResourceInfo = (String, Option<String>); pub type ResourceInfo = (String, Option<String>);
@ -14,22 +15,24 @@ pub struct Extractor {
article: Option<NodeRef>, article: Option<NodeRef>,
pub img_urls: Vec<ResourceInfo>, pub img_urls: Vec<ResourceInfo>,
readability: Readability, readability: Readability,
pub url: String,
} }
impl Extractor { impl Extractor {
/// Create a new instance of an HTML extractor given an HTML string /// Create a new instance of an HTML extractor given an HTML string
pub fn from_html(html_str: &str) -> Self { pub fn from_html(html_str: &str, url: &str) -> Self {
Extractor { Extractor {
article: None, article: None,
img_urls: Vec::new(), img_urls: Vec::new(),
readability: Readability::new(html_str), readability: Readability::new(html_str),
url: url.to_string(),
} }
} }
/// Locates and extracts the HTML in a document which is determined to be /// Locates and extracts the HTML in a document which is determined to be
/// the source of the content /// the source of the content
pub fn extract_content(&mut self, url: &str) { pub fn extract_content(&mut self) -> Result<(), PaperoniError> {
self.readability.parse(url); self.readability.parse(&self.url)?;
if let Some(article_node_ref) = &self.readability.article_node { if let Some(article_node_ref) = &self.readability.article_node {
let template = r#" let template = r#"
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops"> <html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
@ -44,6 +47,7 @@ impl Extractor {
body.as_node().append(article_node_ref.clone()); body.as_node().append(article_node_ref.clone());
self.article = Some(doc); self.article = Some(doc);
} }
Ok(())
} }
/// Traverses the DOM tree of the content and retrieves the IMG URLs /// Traverses the DOM tree of the content and retrieves the IMG URLs
@ -61,8 +65,11 @@ impl Extractor {
} }
} }
pub fn article(&self) -> Option<&NodeRef> { /// Returns the extracted article [NodeRef]. It should only be called *AFTER* calling parse
self.article.as_ref() pub fn article(&self) -> &NodeRef {
self.article.as_ref().expect(
"Article node doesn't exist. This may be because the document has not been parsed",
)
} }
pub fn metadata(&self) -> &MetaData { pub fn metadata(&self) -> &MetaData {
@ -75,7 +82,7 @@ impl Extractor {
pub fn serialize_to_xhtml<W: std::io::Write>( pub fn serialize_to_xhtml<W: std::io::Write>(
node_ref: &NodeRef, node_ref: &NodeRef,
mut w: &mut W, mut w: &mut W,
) -> Result<(), Box<dyn std::error::Error>> { ) -> Result<(), PaperoniError> {
let mut escape_map = HashMap::new(); let mut escape_map = HashMap::new();
escape_map.insert("<", "&lt;"); escape_map.insert("<", "&lt;");
escape_map.insert(">", "&gt;"); escape_map.insert(">", "&gt;");
@ -96,6 +103,7 @@ pub fn serialize_to_xhtml<W: std::io::Write>(
let attrs_str = attrs let attrs_str = attrs
.map .map
.iter() .iter()
.filter(|(k, _)| &k.local != "\"")
.map(|(k, v)| { .map(|(k, v)| {
format!( format!(
"{}=\"{}\"", "{}=\"{}\"",
@@ -156,8 +164,10 @@ mod test {
#[test] #[test]
fn test_extract_img_urls() { fn test_extract_img_urls() {
let mut extractor = Extractor::from_html(TEST_HTML); let mut extractor = Extractor::from_html(TEST_HTML, "http://example.com/");
extractor.extract_content("http://example.com/"); extractor
.extract_content()
.expect("Article extraction failed unexpectedly");
extractor.extract_img_urls(); extractor.extract_img_urls();
assert!(extractor.img_urls.len() > 0); assert!(extractor.img_urls.len() > 0);
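Taken together, the extractor changes mean the article URL now travels with the extractor and a failed extraction surfaces as an error instead of a silent None. A rough sketch of the new call shape; the fetch_and_extract wrapper is illustrative and not part of the crate:

use crate::errors::PaperoniError;
use crate::extractor::Extractor;
use crate::http::fetch_html;

// Illustrative wrapper built only from the APIs introduced in this diff.
async fn fetch_and_extract(target: &str) -> Result<Extractor, PaperoniError> {
    let (url, html) = fetch_html(target).await?;
    let mut extractor = Extractor::from_html(&html, &url);
    extractor.extract_content()?; // fallible now, so there is no Option to check
    extractor.extract_img_urls();
    Ok(extractor)
}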

View file

@@ -1,65 +1,90 @@
use async_std::io::prelude::*; use async_std::io::prelude::*;
use async_std::{fs::File, stream}; use async_std::{fs::File, stream};
use futures::StreamExt; use futures::StreamExt;
use indicatif::ProgressBar;
use log::{debug, info};
use url::Url; use url::Url;
use crate::errors::{ErrorKind, ImgError, PaperoniError};
use crate::extractor::Extractor; use crate::extractor::Extractor;
type HTMLResource = (String, String); type HTMLResource = (String, String);
pub async fn fetch_url( pub async fn fetch_html(url: &str) -> Result<HTMLResource, PaperoniError> {
url: &str,
) -> Result<HTMLResource, Box<dyn std::error::Error + Send + Sync>> {
let client = surf::Client::new(); let client = surf::Client::new();
println!("Fetching..."); debug!("Fetching {}", url);
let mut redirect_count: u8 = 0; let process_request = async {
let base_url = Url::parse(&url)?; let mut redirect_count: u8 = 0;
let mut url = base_url.clone(); let base_url = Url::parse(&url)?;
while redirect_count < 5 { let mut url = base_url.clone();
redirect_count += 1; while redirect_count < 5 {
let req = surf::get(&url); redirect_count += 1;
let mut res = client.send(req).await?; let req = surf::get(&url);
if res.status().is_redirection() { let mut res = client.send(req).await?;
if let Some(location) = res.header(surf::http::headers::LOCATION) { if res.status().is_redirection() {
match Url::parse(location.last().as_str()) { if let Some(location) = res.header(surf::http::headers::LOCATION) {
Ok(valid_url) => url = valid_url, match Url::parse(location.last().as_str()) {
Err(e) => match e { Ok(valid_url) => {
url::ParseError::RelativeUrlWithoutBase => { info!("Redirecting {} to {}", url, valid_url);
url = base_url.join(location.last().as_str())? url = valid_url
} }
e => return Err(e.into()), Err(e) => match e {
}, url::ParseError::RelativeUrlWithoutBase => {
}; match base_url.join(location.last().as_str()) {
} Ok(joined_url) => {
} else if res.status().is_success() { info!("Redirecting {} to {}", url, joined_url);
if let Some(mime) = res.content_type() { url = joined_url;
if mime.essence() == "text/html" { }
return Ok((url.to_string(), res.body_string().await?)); Err(e) => return Err(e.into()),
}
}
e => return Err(e.into()),
},
};
}
} else if res.status().is_success() {
if let Some(mime) = res.content_type() {
if mime.essence() == "text/html" {
debug!("Successfully fetched {}", url);
return Ok((url.to_string(), res.body_string().await?));
} else {
let msg = format!(
"Invalid HTTP response. Received {} instead of text/html",
mime.essence()
);
return Err(ErrorKind::HTTPError(msg).into());
}
} else { } else {
return Err(format!( return Err(ErrorKind::HTTPError("Unknown HTTP response".to_owned()).into());
"Invalid HTTP response. Received {} instead of text/html",
mime.essence()
)
.into());
} }
} else { } else {
return Err("Unknown HTTP response".into()); let msg = format!("Request failed: HTTP {}", res.status());
return Err(ErrorKind::HTTPError(msg).into());
} }
} else {
return Err(format!("Request failed: HTTP {}", res.status()).into());
} }
} Err(ErrorKind::HTTPError("Unable to fetch HTML".to_owned()).into())
Err("Unable to fetch HTML".into()) };
process_request.await.map_err(|mut error: PaperoniError| {
error.set_article_source(url);
error
})
} }
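Since fetch_html now runs the whole request inside an inner async block and tags any failure with the offending URL via set_article_source, callers can report which link failed without tracking it separately. A small illustrative consumer (the report function is hypothetical; article_source is the same accessor the failure table in src/logs.rs uses):

use crate::http::fetch_html;

// Hypothetical caller sketch.
async fn report(url: &str) {
    match fetch_html(url).await {
        Ok((resolved_url, html)) => println!("Fetched {} ({} bytes)", resolved_url, html.len()),
        Err(e) => eprintln!(
            "Failed to fetch {}: {}",
            e.article_source().clone().unwrap_or_else(|| url.to_string()),
            e
        ),
    }
}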
pub async fn download_images( pub async fn download_images(
extractor: &mut Extractor, extractor: &mut Extractor,
article_origin: &Url, article_origin: &Url,
) -> async_std::io::Result<()> { bar: &ProgressBar,
) -> Result<(), Vec<ImgError>> {
if extractor.img_urls.len() > 0 { if extractor.img_urls.len() > 0 {
println!("Downloading images..."); debug!(
"Downloading {} images for {}",
extractor.img_urls.len(),
article_origin
);
} }
let img_count = extractor.img_urls.len();
let imgs_req_iter = extractor let imgs_req_iter = extractor
.img_urls .img_urls
@@ -67,43 +92,73 @@ pub async fn download_images(
.map(|(url, _)| { .map(|(url, _)| {
( (
url, url,
surf::Client::new().get(get_absolute_url(&url, article_origin)), surf::Client::new()
.with(surf::middleware::Redirect::default())
.get(get_absolute_url(&url, article_origin)),
) )
}) })
.map(|(url, req)| async move { .enumerate()
let mut img_response = req.await.expect("Unable to retrieve image"); .map(|(img_idx, (url, req))| async move {
let img_content: Vec<u8> = img_response.body_bytes().await.unwrap(); bar.set_message(format!("Downloading images [{}/{}]", img_idx + 1, img_count).as_str());
let img_mime = img_response match req.await {
.content_type() Ok(mut img_response) => {
.map(|mime| mime.essence().to_string()); let process_response = async {
let img_ext = img_response let img_content: Vec<u8> = match img_response.body_bytes().await {
.content_type() Ok(bytes) => bytes,
.map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string()) Err(e) => return Err(e.into()),
.unwrap(); };
let img_mime = img_response
.content_type()
.map(|mime| mime.essence().to_string());
let img_ext = match img_response
.content_type()
.map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
{
Some(mime_str) => mime_str,
None => {
return Err(ErrorKind::HTTPError(
"Image has no Content-Type".to_owned(),
)
.into())
}
};
let mut img_path = std::env::temp_dir(); let mut img_path = std::env::temp_dir();
img_path.push(format!("{}.{}", hash_url(&url), &img_ext)); img_path.push(format!("{}.{}", hash_url(&url), &img_ext));
let mut img_file = File::create(&img_path) let mut img_file = match File::create(&img_path).await {
.await Ok(file) => file,
.expect("Unable to create file"); Err(e) => return Err(e.into()),
img_file };
.write_all(&img_content) match img_file.write_all(&img_content).await {
.await Ok(_) => (),
.expect("Unable to save to file"); Err(e) => return Err(e.into()),
}
( Ok((
url, url,
img_path img_path
.file_name() .file_name()
.map(|os_str_name| { .map(|os_str_name| {
os_str_name os_str_name
.to_str() .to_str()
.expect("Unable to get image file name") .expect("Unable to get image file name")
.to_string() .to_string()
})
.unwrap(),
img_mime,
))
};
process_response.await.map_err(|mut e: ImgError| {
e.set_url(url);
e
}) })
.unwrap(), }
img_mime, Err(e) => {
) let mut img_err: ImgError = e.into();
img_err.set_url(url);
Err(img_err)
}
}
}); });
// A utility closure used to update the value of an image source after downloading is successful
@@ -112,8 +167,6 @@ pub async fn download_images(
let (img_url, img_path, img_mime) = img_item; let (img_url, img_path, img_mime) = img_item;
let img_ref = extractor let img_ref = extractor
.article() .article()
.as_mut()
.expect("Unable to get mutable ref")
.select_first(&format!("img[src='{}']", img_url)) .select_first(&format!("img[src='{}']", img_url))
.expect("Image node does not exist"); .expect("Image node does not exist");
let mut img_node = img_ref.attributes.borrow_mut(); let mut img_node = img_ref.attributes.borrow_mut();
@@ -124,14 +177,24 @@ pub async fn download_images(
(img_path, img_mime) (img_path, img_mime)
}; };
extractor.img_urls = stream::from_iter(imgs_req_iter) let imgs_req_iter = stream::from_iter(imgs_req_iter)
.buffered(10) .buffered(10)
.collect::<Vec<_>>() .collect::<Vec<Result<_, ImgError>>>()
.await .await;
.into_iter() let mut errors = Vec::new();
.map(replace_existing_img_src) let mut replaced_imgs = Vec::new();
.collect(); for img_req_result in imgs_req_iter {
Ok(()) match img_req_result {
Ok(img_req) => replaced_imgs.push(replace_existing_img_src(img_req)),
Err(e) => errors.push(e),
}
}
extractor.img_urls = replaced_imgs;
if errors.is_empty() {
Ok(())
} else {
Err(errors)
}
} }
/// Handles getting the extension from a given MIME subtype. /// Handles getting the extension from a given MIME subtype.

260
src/logs.rs Normal file
View file

@@ -0,0 +1,260 @@
use colored::*;
use comfy_table::presets::UTF8_HORIZONTAL_BORDERS_ONLY;
use comfy_table::{Cell, CellAlignment, ContentArrangement, Table};
use directories::UserDirs;
use flexi_logger::LogSpecBuilder;
use log::error;
use crate::{cli::AppConfig, errors::PaperoniError};
pub fn display_summary(
initial_article_count: usize,
succesful_articles_table: Table,
partial_downloads_count: usize,
errors: Vec<PaperoniError>,
) {
let successfully_downloaded_count =
initial_article_count - partial_downloads_count - errors.len();
println!(
"{}",
short_summary(DownloadCount::new(
initial_article_count,
successfully_downloaded_count,
partial_downloads_count,
errors.len()
))
.bold()
);
if successfully_downloaded_count > 0 {
println!("{}", succesful_articles_table);
}
if !errors.is_empty() {
println!("\n{}", "Failed article downloads".bright_red().bold());
let mut table_failed = Table::new();
table_failed
.load_preset(UTF8_HORIZONTAL_BORDERS_ONLY)
.set_header(vec![
Cell::new("Link").set_alignment(CellAlignment::Center),
Cell::new("Reason").set_alignment(CellAlignment::Center),
])
.set_content_arrangement(ContentArrangement::Dynamic);
for error in errors {
let error_source = error
.article_source()
.clone()
.unwrap_or_else(|| "<unknown link>".to_string());
table_failed.add_row(vec![&error_source, &format!("{}", error.kind())]);
error!("{}\n - {}", error, error_source);
}
println!("{}", table_failed);
}
}
/// Returns a string summary of the total number of failed and successful article downloads
fn short_summary(download_count: DownloadCount) -> String {
// TODO: Refactor this
if download_count.total
!= download_count.successful + download_count.failed + download_count.partial
{
panic!("initial_count must be equal to the sum of failed and successful count")
}
let get_noun = |count: usize| if count == 1 { "article" } else { "articles" };
if download_count.successful == download_count.total && download_count.successful == 1 {
"Article downloaded successfully".green().to_string()
} else if download_count.total == download_count.failed && download_count.failed == 1 {
"Article failed to download".red().to_string()
} else if download_count.total == download_count.partial && download_count.partial == 1 {
"Article partially failed to download".yellow().to_string()
} else if download_count.successful == download_count.total {
"All articles downloaded successfully".green().to_string()
} else if download_count.failed == download_count.total {
"All articles failed to download".red().to_string()
} else if download_count.partial == download_count.total {
"All articles partially failed to download"
.yellow()
.to_string()
} else if download_count.partial == 0 {
format!(
"{} {} downloaded successfully, {} {} failed",
download_count.successful,
get_noun(download_count.successful),
download_count.failed,
get_noun(download_count.failed)
)
.yellow()
.to_string()
} else if download_count.successful == 0
&& download_count.partial > 0
&& download_count.failed > 0
{
format!(
"{} {} partially failed to download, {} {} failed",
download_count.partial,
get_noun(download_count.partial),
download_count.failed,
get_noun(download_count.failed)
)
.yellow()
.to_string()
} else if download_count.failed == 0
&& download_count.successful > 0
&& download_count.partial > 0
{
format!(
"{} {} downloaded successfully, {} {} partially failed to download",
download_count.successful,
get_noun(download_count.successful),
download_count.partial,
get_noun(download_count.partial)
)
.yellow()
.to_string()
} else {
format!(
"{} {} downloaded successfully, {} {} partially failed to download, {} {} failed",
download_count.successful,
get_noun(download_count.successful),
download_count.partial,
get_noun(download_count.partial),
download_count.failed,
get_noun(download_count.failed)
)
.yellow()
.to_string()
}
}
struct DownloadCount {
total: usize,
successful: usize,
partial: usize,
failed: usize,
}
impl DownloadCount {
fn new(total: usize, successful: usize, partial: usize, failed: usize) -> Self {
Self {
total,
successful,
partial,
failed,
}
}
}
pub fn init_logger(app_config: &AppConfig) {
match UserDirs::new() {
Some(user_dirs) => {
let home_dir = user_dirs.home_dir();
let paperoni_dir = home_dir.join(".paperoni");
let log_dir = paperoni_dir.join("logs");
let log_spec = LogSpecBuilder::new()
.module("paperoni", app_config.log_level())
.build();
let formatted_timestamp = app_config.start_time().format("%Y-%m-%d_%H-%M-%S");
let mut logger = flexi_logger::Logger::with(log_spec);
if app_config.is_logging_to_file() && (!paperoni_dir.is_dir() || !log_dir.is_dir()) {
match std::fs::create_dir_all(&log_dir) {
Ok(_) => (),
Err(e) => {
eprintln!("Unable to create paperoni directories on home directory for logging purposes\n{}",e);
std::process::exit(1);
}
};
}
if app_config.is_logging_to_file() {
logger = logger
.directory(log_dir)
.discriminant(formatted_timestamp.to_string())
.suppress_timestamp()
.log_to_file();
}
match logger.start() {
Ok(_) => (),
Err(e) => eprintln!("Unable to start logger!\n{}", e),
}
}
None => eprintln!("Unable to get user directories for logging purposes"),
};
}
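init_logger is meant to run once, early, so that the verbosity and optional file sink chosen on the CLI apply to the whole run. The call site is outside this diff, but the wiring presumably looks roughly like this sketch:

// Illustrative only: assumes init_logger is invoked right after CLI parsing.
fn main() {
    let app_config = cli::cli_init();
    logs::init_logger(&app_config);
    download(app_config);
}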
#[cfg(test)]
mod tests {
use super::{short_summary, DownloadCount};
use colored::*;
#[test]
fn test_short_summary() {
assert_eq!(
short_summary(DownloadCount::new(1, 1, 0, 0)),
"Article downloaded successfully".green().to_string()
);
assert_eq!(
short_summary(DownloadCount::new(1, 0, 0, 1)),
"Article failed to download".red().to_string()
);
assert_eq!(
short_summary(DownloadCount::new(10, 10, 0, 0)),
"All articles downloaded successfully".green().to_string()
);
assert_eq!(
short_summary(DownloadCount::new(10, 0, 0, 10)),
"All articles failed to download".red().to_string()
);
assert_eq!(
short_summary(DownloadCount::new(10, 8, 0, 2)),
"8 articles downloaded successfully, 2 articles failed"
.yellow()
.to_string()
);
assert_eq!(
short_summary(DownloadCount::new(10, 1, 0, 9)),
"1 article downloaded successfully, 9 articles failed"
.yellow()
.to_string()
);
assert_eq!(
short_summary(DownloadCount::new(7, 6, 0, 1)),
"6 articles downloaded successfully, 1 article failed"
.yellow()
.to_string()
);
assert_eq!(
short_summary(DownloadCount::new(7, 4, 2, 1)),
"4 articles downloaded successfully, 2 articles partially failed to download, 1 article failed"
.yellow()
.to_string()
);
assert_eq!(
short_summary(DownloadCount::new(12, 6, 6, 0)),
"6 articles downloaded successfully, 6 articles partially failed to download"
.yellow()
.to_string()
);
assert_eq!(
short_summary(DownloadCount::new(5, 0, 4, 1)),
"4 articles partially failed to download, 1 article failed"
.yellow()
.to_string()
);
assert_eq!(
short_summary(DownloadCount::new(4, 0, 4, 0)),
"All articles partially failed to download"
.yellow()
.to_string()
);
}
#[test]
#[should_panic(
expected = "initial_count must be equal to the sum of failed and successful count"
)]
fn test_short_summary_panics_on_invalid_input() {
short_summary(DownloadCount::new(0, 12, 0, 43));
}
}

View file

@@ -3,21 +3,28 @@ extern crate lazy_static;
use async_std::stream; use async_std::stream;
use async_std::task; use async_std::task;
use comfy_table::presets::{UTF8_FULL, UTF8_HORIZONTAL_BORDERS_ONLY};
use comfy_table::{ContentArrangement, Table};
use futures::stream::StreamExt; use futures::stream::StreamExt;
use indicatif::{ProgressBar, ProgressStyle};
use log::{debug, warn};
use url::Url; use url::Url;
mod cli; mod cli;
mod epub; mod epub;
mod errors;
mod extractor; mod extractor;
/// This module is responsible for async HTTP calls for downloading /// This module is responsible for async HTTP calls for downloading
/// the HTML content and images /// the HTML content and images
mod http; mod http;
mod logs;
mod moz_readability; mod moz_readability;
use cli::AppConfig; use cli::AppConfig;
use epub::generate_epubs; use epub::generate_epubs;
use extractor::Extractor; use extractor::Extractor;
use http::{download_images, fetch_url}; use http::{download_images, fetch_html};
use logs::display_summary;
fn main() { fn main() {
let app_config = cli::cli_init(); let app_config = cli::cli_init();
@@ -28,29 +35,92 @@ fn main() {
} }
fn download(app_config: AppConfig) { fn download(app_config: AppConfig) {
let mut errors = Vec::new();
let mut partial_download_count: usize = 0;
let bar = if app_config.can_disable_progress_bar() {
ProgressBar::hidden()
} else {
let enabled_bar = ProgressBar::new(app_config.urls().len() as u64);
let style = ProgressStyle::default_bar().template(
"{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} link {pos}/{len:7} {msg:.yellow/white}",
);
enabled_bar.set_style(style);
enabled_bar.enable_steady_tick(500);
enabled_bar
};
let articles = task::block_on(async { let articles = task::block_on(async {
let urls_iter = app_config.urls().iter().map(|url| fetch_url(url)); let urls_iter = app_config.urls().iter().map(|url| fetch_html(url));
let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn()); let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn());
let mut articles = Vec::new(); let mut articles = Vec::new();
while let Some(fetch_result) = responses.next().await { while let Some(fetch_result) = responses.next().await {
match fetch_result { match fetch_result {
Ok((url, html)) => { Ok((url, html)) => {
println!("Extracting"); debug!("Extracting {}", &url);
let mut extractor = Extractor::from_html(&html); let mut extractor = Extractor::from_html(&html, &url);
extractor.extract_content(&url); bar.set_message("Extracting...");
match extractor.extract_content() {
if extractor.article().is_some() { Ok(_) => {
extractor.extract_img_urls(); extractor.extract_img_urls();
download_images(&mut extractor, &Url::parse(&url).unwrap()) if let Err(img_errors) =
.await download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar)
.expect("Unable to download images"); .await
articles.push(extractor); {
partial_download_count += 1;
warn!(
"{} image{} failed to download for {}",
img_errors.len(),
if img_errors.len() > 1 { "s" } else { "" },
url
);
for img_error in img_errors {
warn!(
"{}\n\t\tReason {}",
img_error.url().as_ref().unwrap(),
img_error
);
}
}
articles.push(extractor);
}
Err(mut e) => {
e.set_article_source(&url);
errors.push(e);
}
} }
} }
Err(e) => eprintln!("{}", e), Err(e) => errors.push(e),
} }
bar.inc(1);
} }
articles articles
}); });
generate_epubs(articles, app_config.merged()); bar.finish_with_message("Downloaded articles");
let mut succesful_articles_table = Table::new();
succesful_articles_table
.load_preset(UTF8_FULL)
.load_preset(UTF8_HORIZONTAL_BORDERS_ONLY)
.set_content_arrangement(ContentArrangement::Dynamic);
match generate_epubs(articles, &app_config, &mut succesful_articles_table) {
Ok(_) => (),
Err(gen_epub_errors) => {
errors.extend(gen_epub_errors);
}
};
let has_errors = !errors.is_empty();
display_summary(
app_config.urls().len(),
succesful_articles_table,
partial_download_count,
errors,
);
if app_config.is_logging_to_file() {
println!(
"Log written to paperoni_{}.log\n",
app_config.start_time().format("%Y-%m-%d_%H-%M-%S")
);
}
if has_errors {
std::process::exit(1);
}
} }

View file

@@ -7,8 +7,11 @@ use kuchiki::{
traits::*, traits::*,
NodeData, NodeRef, NodeData, NodeRef,
}; };
use log::info;
use url::Url; use url::Url;
use crate::errors::{ErrorKind, PaperoniError};
const DEFAULT_CHAR_THRESHOLD: usize = 500; const DEFAULT_CHAR_THRESHOLD: usize = 500;
const FLAG_STRIP_UNLIKELYS: u32 = 0x1; const FLAG_STRIP_UNLIKELYS: u32 = 0x1;
const FLAG_WEIGHT_CLASSES: u32 = 0x2; const FLAG_WEIGHT_CLASSES: u32 = 0x2;
@@ -76,14 +79,15 @@ impl Readability {
metadata: MetaData::new(), metadata: MetaData::new(),
} }
} }
pub fn parse(&mut self, url: &str) { pub fn parse(&mut self, url: &str) -> Result<(), PaperoniError> {
self.unwrap_no_script_tags(); self.unwrap_no_script_tags();
self.remove_scripts(); self.remove_scripts();
self.prep_document(); self.prep_document();
self.metadata = self.get_article_metadata(); self.metadata = self.get_article_metadata();
self.article_title = self.metadata.title.clone(); self.article_title = self.metadata.title.clone();
self.grab_article(); self.grab_article()?;
self.post_process_content(url); self.post_process_content(url);
Ok(())
} }
/// Recursively check if node is image, or if node contains exactly only one image /// Recursively check if node is image, or if node contains exactly only one image
@@ -426,8 +430,7 @@ impl Readability {
let mut matches = None; let mut matches = None;
if let Some(property) = node_attr.get("property") { if let Some(property) = node_attr.get("property") {
matches = regexes::PROPERTY_REGEX.captures(property); matches = regexes::PROPERTY_REGEX.captures(property);
if matches.is_some() { if let Some(captures) = &matches {
let captures = matches.as_ref().unwrap();
for capture in captures.iter() { for capture in captures.iter() {
let mut name = capture.unwrap().as_str().to_lowercase(); let mut name = capture.unwrap().as_str().to_lowercase();
name = regexes::REPLACE_WHITESPACE_REGEX name = regexes::REPLACE_WHITESPACE_REGEX
@@ -561,7 +564,7 @@ impl Readability {
.root_node .root_node
.select_first("title") .select_first("title")
.map(|title| title.text_contents().trim().to_string()) .map(|title| title.text_contents().trim().to_string())
.expect("This file has no <title> tag to extract a title from"); .unwrap_or("".to_string());
let orig_title = cur_title.clone(); let orig_title = cur_title.clone();
let mut title_had_hierarchical_separators = false; let mut title_had_hierarchical_separators = false;
let word_count = |s: &str| -> usize { s.split_whitespace().count() }; let word_count = |s: &str| -> usize { s.split_whitespace().count() };
@@ -595,8 +598,8 @@ impl Readability {
} }
} else if cur_title.len() > 150 || cur_title.len() < 15 { } else if cur_title.len() > 150 || cur_title.len() < 15 {
let mut h1_nodes = self.root_node.select("h1").unwrap(); let mut h1_nodes = self.root_node.select("h1").unwrap();
let (_, h1_count) = h1_nodes.size_hint(); let h1_count = self.root_node.select("h1").unwrap().count();
if Some(1) == h1_count { if h1_count == 1 {
cur_title = Self::get_inner_text(h1_nodes.next().unwrap().as_node(), None); cur_title = Self::get_inner_text(h1_nodes.next().unwrap().as_node(), None);
} }
} }
@@ -799,6 +802,7 @@ impl Readability {
state = State::ReadProp; state = State::ReadProp;
decl.1 = Some(token.trim().to_string()); decl.1 = Some(token.trim().to_string());
tokens.push(decl.clone()); tokens.push(decl.clone());
decl = (None, None);
token.clear(); token.clear();
} else { } else {
token.push(c); token.push(c);
@@ -819,11 +823,18 @@ impl Readability {
} }
} }
if !token.is_empty() { if !token.is_empty() {
decl.1 = Some(token.trim().to_string()); match state {
tokens.push(decl); State::ReadVal => {
decl.1 = Some(token.trim().to_string());
tokens.push(decl);
}
_ => (),
}
} }
tokens tokens
.into_iter() .into_iter()
.filter(|tok_pair| tok_pair.0.is_some() && tok_pair.1.is_some())
.map(|tok_pair| (tok_pair.0.unwrap(), tok_pair.1.unwrap())) .map(|tok_pair| (tok_pair.0.unwrap(), tok_pair.1.unwrap()))
.collect() .collect()
} }
@@ -1576,16 +1587,14 @@ impl Readability {
/// Using a variety of metrics (content score, classname, element types), find the content that is most likely to be the stuff /// Using a variety of metrics (content score, classname, element types), find the content that is most likely to be the stuff
/// a user wants to read. Then return it wrapped up in a div. /// a user wants to read. Then return it wrapped up in a div.
fn grab_article(&mut self) { fn grab_article(&mut self) -> Result<(), PaperoniError> {
println!("Grabbing article"); info!("Grabbing article {:?}", self.metadata.title);
// var doc = this._doc; // var doc = this._doc;
// var isPaging = (page !== null ? true: false); // var isPaging = (page !== null ? true: false);
// page = page ? page : this._doc.body; // page = page ? page : this._doc.body;
let page = self.root_node.select_first("body"); let page = self.root_node.select_first("body");
if page.is_err() { if page.is_err() {
// TODO:Have error logging for this return Err(ErrorKind::ReadabilityError("Document has no <body>".into()).into());
println!("Document has no <body>");
return;
} }
let page = page.unwrap(); let page = page.unwrap();
let mut attempts: Vec<ExtractAttempt> = Vec::new(); let mut attempts: Vec<ExtractAttempt> = Vec::new();
@@ -2075,8 +2084,10 @@ impl Readability {
attempts.push(ExtractAttempt::new(article_content.clone(), text_length)); attempts.push(ExtractAttempt::new(article_content.clone(), text_length));
attempts.sort_by(|a, b| b.length.partial_cmp(&a.length).unwrap()); attempts.sort_by(|a, b| b.length.partial_cmp(&a.length).unwrap());
if attempts.first().as_ref().unwrap().length == 0 { if attempts.first().as_ref().unwrap().length == 0 {
println!("Unable to extract content"); return Err(ErrorKind::ReadabilityError(
break; "Unable to extract content".into(),
)
.into());
} }
article_content = attempts[0].article.clone(); article_content = attempts[0].article.clone();
parse_successful = true; parse_successful = true;
@@ -2102,7 +2113,8 @@ impl Readability {
false false
}); });
self.article_node = Some(article_content); self.article_node = Some(article_content);
return; info!("Successfully grabbed article {:?}", self.metadata.title);
return Ok(());
} }
} }
} }
@@ -2460,12 +2472,24 @@ mod test {
css_map.insert("align-items".to_string(), "center".to_string()); css_map.insert("align-items".to_string(), "center".to_string());
css_map.insert("border".to_string(), "2px solid black".to_string()); css_map.insert("border".to_string(), "2px solid black".to_string());
let css_str_to_vec = Readability::inline_css_str_to_map(css_str); let css_str_to_map = Readability::inline_css_str_to_map(css_str);
assert_eq!(css_map, css_str_to_vec); assert_eq!(css_map, css_str_to_map);
let mut css_map = HashMap::new(); let mut css_map = HashMap::new();
css_map.insert("color".to_string(), "red".to_string()); css_map.insert("color".to_string(), "red".to_string());
css_map.insert("background-image".to_string(), "url('data:image/jpeg;base64,/wgARCAALABQDASIAAhEBAxEB/8QAFwABAQEBAAAAAAAAAAAAAAAAAgADBP/')".to_string()); css_map.insert("background-image".to_string(), "url('data:image/jpeg;base64,/wgARCAALABQDASIAAhEBAxEB/8QAFwABAQEBAAAAAAAAAAAAAAAAAgADBP/')".to_string());
assert_eq!(css_map, Readability::inline_css_str_to_map("color: red;background-image: url('data:image/jpeg;base64,/wgARCAALABQDASIAAhEBAxEB/8QAFwABAQEBAAAAAAAAAAAAAAAAAgADBP/')")); assert_eq!(css_map, Readability::inline_css_str_to_map("color: red;background-image: url('data:image/jpeg;base64,/wgARCAALABQDASIAAhEBAxEB/8QAFwABAQEBAAAAAAAAAAAAAAAAAgADBP/')"));
let empty_map = HashMap::new();
assert_eq!(empty_map, Readability::inline_css_str_to_map(" \n \t \r"));
assert_eq!(empty_map, Readability::inline_css_str_to_map("color"));
let mut css_map = HashMap::new();
css_map.insert("color".to_string(), "red".to_string());
css_map.insert("height".to_string(), "300px".to_string());
assert_eq!(
css_map,
Readability::inline_css_str_to_map("color: red;height: 300px;width")
);
} }
#[test] #[test]