diff --git a/.gitignore b/.gitignore index 3ae8faf..8e42494 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ /target -*.epub \ No newline at end of file +*.epub +*.log \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 8466dbf..ca5456c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -126,12 +126,15 @@ dependencies = [ [[package]] name = "async-global-executor" -version = "1.4.3" +version = "2.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73079b49cd26b8fd5a15f68fc7707fc78698dc2a3d61430f2a7a9430230dfa04" +checksum = "9586ec52317f36de58453159d48351bc244bc24ced3effc1fce22f3d48664af6" dependencies = [ + "async-channel", "async-executor", "async-io", + "async-mutex", + "blocking", "futures-lite", "num_cpus", "once_cell", @@ -147,7 +150,7 @@ dependencies = [ "fastrand", "futures-lite", "libc", - "log 0.4.11", + "log 0.4.14", "nb-connect", "once_cell", "parking", @@ -157,6 +160,15 @@ dependencies = [ "winapi", ] +[[package]] +name = "async-lock" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6a8ea61bf9947a1007c5cada31e647dbc77b103c679858150003ba697ea798b" +dependencies = [ + "event-listener", +] + [[package]] name = "async-mutex" version = "1.4.0" @@ -168,14 +180,14 @@ dependencies = [ [[package]] name = "async-std" -version = "1.7.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7e82538bc65a25dbdff70e4c5439d52f068048ab97cdea0acd73f131594caa1" +checksum = "d9f06685bad74e0570f5213741bea82158279a4103d988e57bfada11ad230341" dependencies = [ + "async-channel", "async-global-executor", "async-io", - "async-mutex", - "blocking", + "async-lock", "crossbeam-utils", "futures-channel", "futures-core", @@ -183,11 +195,11 @@ dependencies = [ "futures-lite", "gloo-timers", "kv-log-macro", - "log 0.4.11", + "log 0.4.14", "memchr", "num_cpus", "once_cell", - "pin-project-lite 0.1.11", + "pin-project-lite 0.2.4", "pin-utils", 
"slab", "wasm-bindgen-futures", @@ -394,6 +406,28 @@ dependencies = [ "vec_map", ] +[[package]] +name = "colored" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3616f750b84d8f0de8a58bda93e08e2a81ad3f523089b05f1dffecab48c6cbd" +dependencies = [ + "atty", + "lazy_static", + "winapi", +] + +[[package]] +name = "comfy-table" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17b99e9022e080d384b58d8eaf5976b42a311ff7a9669f8200eb2453c0b2b81a" +dependencies = [ + "crossterm", + "strum", + "strum_macros", +] + [[package]] name = "concurrent-queue" version = "1.2.2" @@ -403,6 +437,21 @@ dependencies = [ "cache-padded", ] +[[package]] +name = "console" +version = "0.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3993e6445baa160675931ec041a5e03ca84b9c6e32a056150d3aa2bdda0a1f45" +dependencies = [ + "encode_unicode", + "lazy_static", + "libc", + "regex", + "terminal_size", + "unicode-width", + "winapi", +] + [[package]] name = "const_fn" version = "0.4.3" @@ -453,6 +502,31 @@ dependencies = [ "lazy_static", ] +[[package]] +name = "crossterm" +version = "0.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c36c10130df424b2f3552fcc2ddcd9b28a27b1e54b358b45874f88d1ca6888c" +dependencies = [ + "bitflags", + "crossterm_winapi", + "lazy_static", + "libc", + "mio", + "parking_lot", + "signal-hook", + "winapi", +] + +[[package]] +name = "crossterm_winapi" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0da8964ace4d3e4a044fd027919b2237000b24315a37c916f61809f1ff2140b9" +dependencies = [ + "winapi", +] + [[package]] name = "crypto-mac" version = "0.10.0" @@ -490,6 +564,16 @@ dependencies = [ "syn", ] +[[package]] +name = "ctor" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7fbaabec2c953050352311293be5c6aba8e141ba19d6811862b232d6fd020484" +dependencies = [ + "quote", + "syn", +] + [[package]] name = "ctr" version = "0.6.0" @@ -530,6 +614,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "dashmap" +version = "4.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e77a43b28d0668df09411cb0bc9a8c2adc40f9a048afe863e05fd43251e8e39c" +dependencies = [ + "cfg-if 1.0.0", + "num_cpus", +] + [[package]] name = "data-encoding" version = "2.3.1" @@ -556,6 +650,26 @@ dependencies = [ "generic-array", ] +[[package]] +name = "directories" +version = "3.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e69600ff1703123957937708eb27f7a564e48885c537782722ed0ba3189ce1d7" +dependencies = [ + "dirs-sys", +] + +[[package]] +name = "dirs-sys" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03d86534ed367a67548dc68113a0f5db55432fdfbb6e6f9d77704397d95d5780" +dependencies = [ + "libc", + "redox_users", + "winapi", +] + [[package]] name = "discard" version = "1.0.4" @@ -577,6 +691,12 @@ dependencies = [ "dtoa", ] +[[package]] +name = "encode_unicode" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" + [[package]] name = "encoding_rs" version = "0.8.26" @@ -640,6 +760,22 @@ dependencies = [ "miniz_oxide 0.3.7", ] +[[package]] +name = "flexi_logger" +version = "0.17.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ab94b6ac8eb69f1496a6993f26f785b5fd6d99b7416023eb2a6175c0b242b1" +dependencies = [ + "atty", + "chrono", + "glob", + "lazy_static", + "log 0.4.14", + "regex", + "thiserror", + "yansi", +] + [[package]] name = "flume" version = "0.9.2" @@ -685,9 +821,9 @@ dependencies = [ [[package]] name = "futures" -version = "0.3.12" +version = "0.3.14" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "da9052a1a50244d8d5aa9bf55cbc2fb6f357c86cc52e46c62ed390a7180cf150" +checksum = "a9d5813545e459ad3ca1bff9915e9ad7f1a47dc6a91b627ce321d5863b7dd253" dependencies = [ "futures-channel", "futures-core", @@ -700,9 +836,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.12" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2d31b7ec7efab6eefc7c57233bb10b847986139d88cc2f5a02a1ae6871a1846" +checksum = "ce79c6a52a299137a6013061e0cf0e688fce5d7f1bc60125f520912fdb29ec25" dependencies = [ "futures-core", "futures-sink", @@ -710,15 +846,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.12" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79e5145dde8da7d1b3892dad07a9c98fc04bc39892b1ecc9692cf53e2b780a65" +checksum = "098cd1c6dda6ca01650f1a37a794245eb73181d0d4d4e955e2f3c37db7af1815" [[package]] name = "futures-executor" -version = "0.3.12" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9e59fdc009a4b3096bf94f740a0f2424c082521f20a9b08c5c07c48d90fd9b9" +checksum = "10f6cb7042eda00f0049b1d2080aa4b93442997ee507eb3828e8bd7577f94c9d" dependencies = [ "futures-core", "futures-task", @@ -727,9 +863,9 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.12" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28be053525281ad8259d47e4de5de657b25e7bac113458555bb4b70bc6870500" +checksum = "365a1a1fb30ea1c03a830fdb2158f5236833ac81fa0ad12fe35b29cddc35cb04" [[package]] name = "futures-lite" @@ -748,9 +884,9 @@ dependencies = [ [[package]] name = "futures-macro" -version = "0.3.12" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c287d25add322d9f9abdcdc5927ca398917996600182178774032e9f8258fedd" +checksum = 
"668c6733a182cd7deb4f1de7ba3bf2120823835b3bcfbeacf7d2c4a773c1bb8b" dependencies = [ "proc-macro-hack", "proc-macro2", @@ -760,24 +896,21 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.12" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "caf5c69029bda2e743fddd0582d1083951d65cc9539aebf8812f36c3491342d6" +checksum = "5c5629433c555de3d82861a7a4e3794a4c40040390907cfbfd7143a92a426c23" [[package]] name = "futures-task" -version = "0.3.12" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13de07eb8ea81ae445aca7b69f5f7bf15d7bf4912d8ca37d6645c77ae8a58d86" -dependencies = [ - "once_cell", -] +checksum = "ba7aa51095076f3ba6d9a1f702f74bd05ec65f555d70d2033d55ba8d69f581bc" [[package]] name = "futures-util" -version = "0.3.12" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "632a8cd0f2a4b3fdea1657f08bde063848c3bd00f9bbf6e256b8be78802e624b" +checksum = "3c144ad54d60f23927f0a6b6d816e4271278b64f005ad65e4e35291d2de9c025" dependencies = [ "futures-channel", "futures-core", @@ -823,6 +956,17 @@ dependencies = [ "wasi 0.9.0+wasi-snapshot-preview1", ] +[[package]] +name = "getrandom" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c9495705279e7140bf035dde1f6e750c162df8b625267cd52cc44e0b156732c8" +dependencies = [ + "cfg-if 1.0.0", + "libc", + "wasi 0.10.0+wasi-snapshot-preview1", +] + [[package]] name = "ghash" version = "0.3.0" @@ -838,6 +982,12 @@ version = "0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f6503fe142514ca4799d4c26297c4248239fe8838d827db6bd6065c6ed29a6ce" +[[package]] +name = "glob" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" + [[package]] name = "gloo-timers" version = "0.2.1" @@ -851,6 +1001,15 @@ 
dependencies = [ "web-sys", ] +[[package]] +name = "heck" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87cbf45460356b7deeb5e3415b5563308c0a9b057c85e12b06ad551f98d0a6ac" +dependencies = [ + "unicode-segmentation", +] + [[package]] name = "hermit-abi" version = "0.1.17" @@ -895,7 +1054,7 @@ version = "0.25.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "aafcf38a1a36118242d29b92e1b08ef84e67e4a5ed06e0a80be20e6a32bfed6b" dependencies = [ - "log 0.4.11", + "log 0.4.14", "mac", "markup5ever", "proc-macro2", @@ -916,15 +1075,17 @@ dependencies = [ [[package]] name = "http-client" -version = "6.2.0" +version = "6.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "010092b71b94ee49293995625ce7a607778b8b4099c8088fa84fd66bd3e0f21c" +checksum = "5566ecc26bc6b04e773e680d66141fced78e091ad818e420d726c152b05a64ff" dependencies = [ "async-std", "async-trait", + "cfg-if 1.0.0", + "dashmap", "http-types", "isahc", - "log 0.4.11", + "log 0.4.14", ] [[package]] @@ -960,6 +1121,18 @@ dependencies = [ "unicode-normalization", ] +[[package]] +name = "indicatif" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7baab56125e25686df467fe470785512329883aab42696d661247aca2a2896e4" +dependencies = [ + "console", + "lazy_static", + "number_prefix", + "regex", +] + [[package]] name = "infer" version = "0.2.3" @@ -988,7 +1161,7 @@ dependencies = [ "flume", "futures-lite", "http", - "log 0.4.11", + "log 0.4.14", "once_cell", "slab", "sluice", @@ -1031,7 +1204,7 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0de8b303297635ad57c9f5059fd9cee7a47f8e8daa09df0fcd07dd39fb22977f" dependencies = [ - "log 0.4.11", + "log 0.4.14", ] [[package]] @@ -1042,9 +1215,9 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.80" +version = 
"0.2.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d58d1b70b004888f764dfbf6a26a3b0342a1632d33968e4a179d8011c760614" +checksum = "9385f66bf6105b241aa65a61cb923ef20efc665cb9f9bb50ac2f0c4b7f378d41" [[package]] name = "libnghttp2-sys" @@ -1083,16 +1256,17 @@ version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e19e8d5c34a3e0e2223db8e060f9e8264aeeb5c5fc64a4ee9965c062211c024b" dependencies = [ - "log 0.4.11", + "log 0.4.14", ] [[package]] name = "log" -version = "0.4.11" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fabed175da42fed1fa0746b0ea71f412aa9d35e76e95e59b192c64b9dc2bf8b" +checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" dependencies = [ - "cfg-if 0.1.10", + "cfg-if 1.0.0", + "value-bag", ] [[package]] @@ -1107,7 +1281,7 @@ version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "aae38d669396ca9b707bfc3db254bc382ddb94f57cc5c235f34623a669a01dab" dependencies = [ - "log 0.4.11", + "log 0.4.14", "phf", "phf_codegen", "serde", @@ -1171,6 +1345,28 @@ dependencies = [ "autocfg", ] +[[package]] +name = "mio" +version = "0.7.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf80d3e903b34e0bd7282b218398aec54e082c840d9baf8339e0080a0c542956" +dependencies = [ + "libc", + "log 0.4.14", + "miow", + "ntapi", + "winapi", +] + +[[package]] +name = "miow" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9f1c5b025cda876f66ef43a113f91ebc9f4ccef34843000e0adf6ebbab84e21" +dependencies = [ + "winapi", +] + [[package]] name = "mustache" version = "0.9.0" @@ -1203,6 +1399,15 @@ version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb" +[[package]] +name = "ntapi" +version = "0.3.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f6bb902e437b6d86e03cce10a7e2af662292c5dfef23b65899ea3ac9354ad44" +dependencies = [ + "winapi", +] + [[package]] name = "num-integer" version = "0.1.44" @@ -1232,6 +1437,12 @@ dependencies = [ "libc", ] +[[package]] +name = "number_prefix" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17b02fc0ff9a9e4b35b3342880f48e896ebf69f2967921fe8646bf5b7125956a" + [[package]] name = "object" version = "0.22.0" @@ -1271,18 +1482,26 @@ dependencies = [ [[package]] name = "paperoni" -version = "0.3.0-alpha1" +version = "0.4.0-alpha1" dependencies = [ "async-std", + "chrono", "clap", + "colored", + "comfy-table", + "directories", "epub-builder", + "flexi_logger", "futures", "html5ever", + "indicatif", "kuchiki", "lazy_static", + "log 0.4.14", "md5", "regex", "surf", + "thiserror", "url", ] @@ -1292,6 +1511,31 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "427c3892f9e783d91cc128285287e70a59e206ca452770ece88a76f7a3eddd72" +[[package]] +name = "parking_lot" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d7744ac029df22dca6284efe4e898991d28e3085c706c972bcd7da4a27a15eb" +dependencies = [ + "instant", + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa7a782938e745763fe6907fc6ba86946d72f49fe7e21de074e08128a99fb018" +dependencies = [ + "cfg-if 1.0.0", + "instant", + "libc", + "redox_syscall 0.2.6", + "smallvec", + "winapi", +] + [[package]] name = "percent-encoding" version = "2.1.0" @@ -1404,7 +1648,7 @@ checksum = "a2a7bc6b2a29e632e45451c941832803a18cce6781db04de8a04696cdca8bde4" dependencies = [ "cfg-if 0.1.10", "libc", - "log 0.4.11", + "log 0.4.14", "wepoll-sys", "winapi", ] @@ -1480,7 +1724,7 @@ version = "0.7.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" dependencies = [ - "getrandom", + "getrandom 0.1.15", "libc", "rand_chacha", "rand_core 0.5.1", @@ -1519,7 +1763,7 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" dependencies = [ - "getrandom", + "getrandom 0.1.15", ] [[package]] @@ -1556,22 +1800,40 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "41cc0f7e4d5d4544e8861606a285bb08d3e70712ccc7d2b84d7c0ccfaf4b05ce" [[package]] -name = "regex" -version = "1.4.2" +name = "redox_syscall" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38cf2c13ed4745de91a5eb834e11c00bcc3709e773173b2ce4c56c9fbde04b9c" +checksum = "8270314b5ccceb518e7e578952f0b72b88222d02e8f77f5ecf7abbb673539041" +dependencies = [ + "bitflags", +] + +[[package]] +name = "redox_users" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "528532f3d801c87aec9def2add9ca802fe569e44a544afe633765267840abe64" +dependencies = [ + "getrandom 0.2.2", + "redox_syscall 0.2.6", +] + +[[package]] +name = "regex" +version = "1.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "957056ecddbeba1b26965114e191d2e8589ce74db242b6ea25fc4062427a5c19" dependencies = [ "aho-corasick", "memchr", "regex-syntax", - "thread_local", ] [[package]] name = "regex-syntax" -version = "0.6.21" +version = "0.6.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b181ba2dcf07aaccad5448e8ead58db5b742cf85dfe035e2227f137a539a189" +checksum = "24d5f089152e60f62d28b835fbff2cd2e8dc0baf1ac13343bef92ab7eed84548" [[package]] name = "remove_dir_all" @@ -1629,7 +1891,7 @@ dependencies = [ "cssparser", "derive_more", "fxhash", - "log 0.4.11", + "log 0.4.14", "matches", "phf", 
"phf_codegen", @@ -1738,6 +2000,26 @@ dependencies = [ "opaque-debug", ] +[[package]] +name = "signal-hook" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e31d442c16f047a671b5a71e2161d6e68814012b7f5379d269ebd915fac2729" +dependencies = [ + "libc", + "mio", + "signal-hook-registry", +] + +[[package]] +name = "signal-hook-registry" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16f1d0fef1604ba8f7a073c7e701f213e056707210e9020af4528e0101ce11a6" +dependencies = [ + "libc", +] + [[package]] name = "siphasher" version = "0.3.3" @@ -1763,9 +2045,9 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.5.0" +version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7acad6f34eb9e8a259d3283d1e8c1d34d7415943d4895f65cc73813c7396fc85" +checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e" [[package]] name = "socket2" @@ -1775,7 +2057,7 @@ checksum = "2c29947abdee2a218277abeca306f25789c938e500ea5a9d4b12a5a504466902" dependencies = [ "cfg-if 1.0.0", "libc", - "redox_syscall", + "redox_syscall 0.1.57", "winapi", ] @@ -1883,6 +2165,24 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" +[[package]] +name = "strum" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7318c509b5ba57f18533982607f24070a55d353e90d4cae30c467cdb2ad5ac5c" + +[[package]] +name = "strum_macros" +version = "0.20.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee8bc6b87a5112aeeab1f4a9f7ab634fe6cbefc4850006df31267f4cfb9e3149" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "subtle" version = "2.3.0" @@ -1891,21 +2191,21 @@ checksum = "343f3f510c2915908f155e94f17220b19ccfacf2a64a2a5d8004f2c3e311e7fd" [[package]] 
name = "surf" -version = "2.1.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7189c787d96fe18fef704950de76d590022d9d70858a4a201e1f07a0666882ea" +checksum = "2a154d33ca6b5e1fe6fd1c760e5a5cc1202425f6cca2e13229f16a69009f6328" dependencies = [ "async-std", "async-trait", - "cfg-if 0.1.10", + "cfg-if 1.0.0", "encoding_rs", "futures-util", "http-client", "http-types", - "log 0.4.11", + "log 0.4.14", "mime_guess", "once_cell", - "pin-project-lite 0.1.11", + "pin-project-lite 0.2.4", "serde", "serde_json", "web-sys", @@ -1943,6 +2243,16 @@ dependencies = [ "utf-8", ] +[[package]] +name = "terminal_size" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86ca8ced750734db02076f44132d802af0b33b09942331f4459dde8636fd2406" +dependencies = [ + "libc", + "winapi", +] + [[package]] name = "textwrap" version = "0.11.0" @@ -1960,33 +2270,24 @@ checksum = "8eaa81235c7058867fa8c0e7314f33dcce9c215f535d1913822a2b3f5e289f3c" [[package]] name = "thiserror" -version = "1.0.22" +version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e9ae34b84616eedaaf1e9dd6026dbe00dcafa92aa0c8077cb69df1fcfe5e53e" +checksum = "e0f4a65597094d4483ddaed134f409b2cb7c1beccf25201a9f73c719254fa98e" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.22" +version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ba20f23e85b10754cd195504aebf6a27e2e6cbe28c17778a0c930724628dd56" +checksum = "7765189610d8241a44529806d6fd1f2e0a08734313a35d5b3a556f92b381f3c0" dependencies = [ "proc-macro2", "quote", "syn", ] -[[package]] -name = "thread_local" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14" -dependencies = [ - "lazy_static", -] - [[package]] name = "time" version = "0.1.44" @@ -2058,7 +2359,7 
@@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b0987850db3733619253fe60e17cb59b82d37c7e6c0236bb81e4d6b87c879f27" dependencies = [ "cfg-if 0.1.10", - "log 0.4.11", + "log 0.4.14", "pin-project-lite 0.1.11", "tracing-attributes", "tracing-core", @@ -2127,6 +2428,12 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-segmentation" +version = "1.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb0d2e7be6ae3a5fa87eed5fb451aff96f2573d2694942e40543ae0bbe19c796" + [[package]] name = "unicode-width" version = "0.1.8" @@ -2151,9 +2458,9 @@ dependencies = [ [[package]] name = "url" -version = "2.2.0" +version = "2.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5909f2b0817350449ed73e8bcd81c8c3c8d9a7a5d8acba4b27db277f1868976e" +checksum = "9ccd964113622c8e9322cfac19eb1004a07e636c545f325da085d5cdde6f1f8b" dependencies = [ "form_urlencoded", "idna", @@ -2183,6 +2490,15 @@ dependencies = [ "rand 0.7.3", ] +[[package]] +name = "value-bag" +version = "1.0.0-alpha.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b676010e055c99033117c2343b33a40a30b91fecd6c49055ac9cd2d6c305ab1" +dependencies = [ + "ctor", +] + [[package]] name = "vcpkg" version = "0.2.10" @@ -2243,7 +2559,7 @@ checksum = "f22b422e2a757c35a73774860af8e112bff612ce6cb604224e8e47641a9e4f68" dependencies = [ "bumpalo", "lazy_static", - "log 0.4.11", + "log 0.4.14", "proc-macro2", "quote", "syn", @@ -2332,6 +2648,12 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "yansi" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fc79f4a1e39857fc00c3f662cbf2651c771f00e9c15fe2abc341806bd46bd71" + [[package]] name = "zip" version = "0.5.8" diff --git a/Cargo.toml b/Cargo.toml index 05660ed..3fbd83c 100644 
--- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ description = "A web article downloader" homepage = "https://github.com/hipstermojo/paperoni" repository = "https://github.com/hipstermojo/paperoni" name = "paperoni" -version = "0.3.0-alpha1" +version = "0.4.0-alpha1" authors = ["Kenneth Gitere "] edition = "2018" license = "MIT" @@ -12,14 +12,23 @@ readme = "README.md" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -async-std = "1.7.0" +async-std = "1.9.0" +# atty = "0.2.14" +chrono = "0.4.19" clap = "2.33.3" +colored = "2.0.0" +comfy-table = "2.1.0" +directories = "3.0.2" epub-builder = "0.4.8" -futures = "0.3.12" +flexi_logger = "0.17.1" +futures = "0.3.14" html5ever = "0.25.1" +indicatif = "0.15.0" kuchiki = "0.8.1" lazy_static = "1.4.0" +log = "0.4.14" md5 = "0.7.0" -regex = "1.4.2" -surf = "2.1.0" -url = "2.2.0" +regex = "1.4.5" +surf = "2.2.0" +thiserror = "1.0.24" +url = "2.2.1" diff --git a/README.md b/README.md index 96e15c5..99c9771 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,10 @@ +![crates.io](https://img.shields.io/crates/v/paperoni.svg) +

Salami not included

-Paperoni is a web article downloader written in Rust. The downloaded articles are then exported as EPUB files. +Paperoni is a CLI tool made in Rust for downloading web articles as EPUBs. > This project is in an alpha release so it might crash when you use it. Please open an [issue on Github](https://github.com/hipstermojo/paperoni/issues/new) if it does crash. @@ -17,7 +19,7 @@ Check the [releases](https://github.com/hipstermojo/paperoni/releases) page for Paperoni is published on [crates.io](https://crates.io). If you have [cargo](https://github.com/rust-lang/cargo) installed, then run: ```sh -cargo install paperoni --version 0.3.0-alpha1 +cargo install paperoni --version 0.4.0-alpha1 ``` _Paperoni is still in alpha so the `version` flag has to be passed._ @@ -37,6 +39,27 @@ cargo run -- # pass your url here ## Usage +``` +USAGE: + paperoni [OPTIONS] [urls]... + +OPTIONS: + -f, --file Input file containing links + -h, --help Prints help information + --log-to-file Enables logging of events to a file located in .paperoni/logs with a default log level + of debug. Use -v to specify the logging level + --max_conn The maximum number of concurrent HTTP connections when downloading articles. Default is + 8 + --merge Merge multiple articles into a single epub + -V, --version Prints version information + -v Enables logging of events and set the verbosity level. Use -h to read on its usage + +ARGS: + ... Urls of web articles +``` + +To download a single article, pass in its URL: + ```sh paperoni https://en.wikipedia.org/wiki/Pepperoni ``` @@ -68,10 +91,23 @@ into a single epub using the `merge` flag and specifying the output file. paperoni -f links.txt --merge out.epub ``` +### Logging events + +Logging is disabled by default. This can be activated by either using the `-v` flag or `--log-to-file` flag. If the `--log-to-file` flag is passed the logs are sent to a file in the default Paperoni directory `.paperoni/logs` which is in your home directory.
The `-v` flag configures the verbosity levels such that: + +``` +-v Logs errors only +-vv Logs warnings and errors +-vvv Logs info messages, warnings and errors +-vvvv Logs debug messages and all of the above +``` + +If only the `-v` flag is passed, the progress bars are disabled. If both `-v` and `--log-to-file` are passed then the progress bars will still be shown. + ## How it works The URL passed to Paperoni is fetched and the returned HTML response is passed to the extractor. -This extractor retrieves a possible article using a port of the [Mozilla Readability algorithm](https://github.com/mozilla/readability). This article is then saved in an EPUB. +This extractor retrieves a possible article using a [custom port](https://github.com/hipstermojo/paperoni/blob/master/src/moz_readability/mod.rs) of the [Mozilla Readability algorithm](https://github.com/mozilla/readability). This article is then saved in an EPUB. > The port of the algorithm is still unstable as well so it is not fully compatible with all the websites that can be extracted using Readability. @@ -82,3 +118,5 @@ This program is still in alpha so a number of things won't work: - Websites that only run with JavaScript cannot be extracted. - Website articles that cannot be extracted by Readability cannot be extracted by Paperoni either. - Code snippets on Medium articles that are lazy loaded will not appear in the EPUB. + +There are also web pages it won't work on in general, such as Twitter and Reddit threads. 
diff --git a/src/cli.rs b/src/cli.rs index 9815e08..19ce379 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -1,6 +1,10 @@ -use std::{fs::File, io::Read}; +use std::{fs::File, io::Read, path::Path}; +use chrono::{DateTime, Local}; use clap::{App, AppSettings, Arg}; +use flexi_logger::LevelFilter as LogLevel; + +use crate::logs::init_logger; pub fn cli_init() -> AppConfig { let app = App::new("paperoni") @@ -8,12 +12,9 @@ pub fn cli_init() -> AppConfig { AppSettings::ArgRequiredElseHelp, AppSettings::UnifiedHelpMessage, ]) - .version("0.3.0-alpha1") + .version(clap::crate_version!()) .about( - " -Paperoni is an article downloader. -It takes a url and downloads the article content from it and saves it to an epub. - ", + "Paperoni is a CLI tool made in Rust for downloading web articles as EPUBs", ) .arg( Arg::with_name("urls") @@ -38,8 +39,29 @@ It takes a url and downloads the article content from it and saves it to an epub .long("max_conn") .help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8") .long_help("The maximum number of concurrent HTTP connections when downloading articles. Default is 8.\nNOTE: It is advised to use as few connections as needed i.e between 1 and 50. Using more connections can end up overloading your network card with too many concurrent requests.") - .takes_value(true)); + .takes_value(true)) + .arg( + Arg::with_name("verbosity") + .short("v") + .multiple(true) + .help("Enables logging of events and set the verbosity level. Use --help to read on its usage") + .long_help( +"This takes upto 4 levels of verbosity in the following order. + - Error (-v) + - Warn (-vv) + - Info (-vvv) + - Debug (-vvvv) + When this flag is passed, it disables the progress bars and logs to stderr. + If you would like to send the logs to a file (and enable progress bars), pass the log-to-file flag." 
+ ) + .takes_value(false)) + .arg( + Arg::with_name("log-to-file") + .long("log-to-file") + .help("Enables logging of events to a file located in .paperoni/logs with a default log level of debug. Use -v to specify the logging level") + .takes_value(false)); let arg_matches = app.get_matches(); + let mut urls: Vec = match arg_matches.value_of("file") { Some(file_name) => { if let Ok(mut file) = File::open(file_name) { @@ -76,14 +98,51 @@ It takes a url and downloads the article content from it and saves it to an epub let mut app_config = AppConfig::new(max_conn); app_config.set_urls(urls); + if let Some(name) = arg_matches.value_of("output_name") { - let file_name = if name.ends_with(".epub") && name.len() > 5 { + let file_path = Path::new(name); + if file_path.is_dir() { + eprintln!("{:?} is a directory", name); + std::process::exit(1); + } + + let file_name = if file_path.extension().is_some() { name.to_owned() } else { name.to_owned() + ".epub" }; - app_config.set_merged(file_name); + + match std::fs::File::create(&file_name) { + Ok(_) => (), + Err(e) => { + eprintln!("Unable to create file {:?}\n{}", file_path, e); + std::process::exit(1) + } + } + app_config.merged = Some(file_name); } + + if arg_matches.is_present("verbosity") { + if !arg_matches.is_present("log-to-file") { + app_config.can_disable_progress_bar = true; + } + let log_levels: [LogLevel; 5] = [ + LogLevel::Off, + LogLevel::Error, + LogLevel::Warn, + LogLevel::Info, + LogLevel::Debug, + ]; + let level = arg_matches.occurrences_of("verbosity").clamp(0, 4) as usize; + app_config.log_level = log_levels[level]; + } + if arg_matches.is_present("log-to-file") { + app_config.log_level = LogLevel::Debug; + app_config.is_logging_to_file = true; + } + + init_logger(&app_config); + app_config } @@ -91,6 +150,10 @@ pub struct AppConfig { urls: Vec, max_conn: usize, merged: Option, + log_level: LogLevel, + can_disable_progress_bar: bool, + start_time: DateTime, + is_logging_to_file: bool, } impl AppConfig { @@ 
-99,6 +162,10 @@ impl AppConfig { urls: vec![], max_conn, merged: None, + log_level: LogLevel::Off, + can_disable_progress_bar: false, + start_time: Local::now(), + is_logging_to_file: false, } } @@ -106,10 +173,6 @@ impl AppConfig { self.urls.extend(urls); } - fn set_merged(&mut self, name: String) { - self.merged = Some(name); - } - pub fn urls(&self) -> &Vec { &self.urls } @@ -120,4 +183,20 @@ impl AppConfig { pub fn merged(&self) -> Option<&String> { self.merged.as_ref() } + + pub fn log_level(&self) -> LogLevel { + self.log_level + } + + pub fn can_disable_progress_bar(&self) -> bool { + self.can_disable_progress_bar + } + + pub fn start_time(&self) -> &DateTime { + &self.start_time + } + + pub fn is_logging_to_file(&self) -> bool { + self.is_logging_to_file + } } diff --git a/src/epub.rs b/src/epub.rs index e6e0376..75f2b9e 100644 --- a/src/epub.rs +++ b/src/epub.rs @@ -1,32 +1,159 @@ use std::fs::File; +use comfy_table::{Attribute, Cell, CellAlignment, Color, ContentArrangement, Table}; use epub_builder::{EpubBuilder, EpubContent, ZipLibrary}; +use indicatif::{ProgressBar, ProgressStyle}; +use log::{debug, info}; -use crate::extractor::{self, Extractor}; +use crate::{ + cli::AppConfig, + errors::PaperoniError, + extractor::{self, Extractor}, +}; -pub fn generate_epubs(articles: Vec, merged: Option<&String>) { - match merged { +pub fn generate_epubs( + articles: Vec, + app_config: &AppConfig, + successful_articles_table: &mut Table, +) -> Result<(), Vec> { + let bar = if app_config.can_disable_progress_bar() { + ProgressBar::hidden() + } else { + let enabled_bar = ProgressBar::new(articles.len() as u64); + let style = ProgressStyle::default_bar().template( + "{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} epub {pos}/{len:7} {msg:.green}", + ); + enabled_bar.set_style(style); + if !articles.is_empty() { + enabled_bar.set_message("Generating epubs"); + } + enabled_bar + }; + + let mut errors: Vec = Vec::new(); + + match app_config.merged() { 
Some(name) => { - let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap(); + successful_articles_table.set_header(vec![Cell::new("Table of Contents") + .add_attribute(Attribute::Bold) + .set_alignment(CellAlignment::Center) + .fg(Color::Green)]); + + let mut epub = match EpubBuilder::new(match ZipLibrary::new() { + Ok(zip_library) => zip_library, + Err(err) => { + let mut paperoni_err: PaperoniError = err.into(); + paperoni_err.set_article_source(name); + errors.push(paperoni_err); + return Err(errors); + } + }) { + Ok(epub) => epub, + Err(err) => { + let mut paperoni_err: PaperoniError = err.into(); + paperoni_err.set_article_source(name); + errors.push(paperoni_err); + return Err(errors); + } + }; + debug!("Creating {:?}", name); epub.inline_toc(); - epub = articles + articles .iter() .enumerate() - .fold(epub, |mut epub, (idx, article)| { + .fold(&mut epub, |epub, (idx, article)| { + let mut article_result = || -> Result<(), PaperoniError> { + let mut html_buf = Vec::new(); + extractor::serialize_to_xhtml(article.article(), &mut html_buf)?; + let html_str = std::str::from_utf8(&html_buf)?; + epub.metadata("title", replace_metadata_value(name))?; + let section_name = article.metadata().title(); + epub.add_content( + EpubContent::new(format!("article_{}.xhtml", idx), html_str.as_bytes()) + .title(replace_metadata_value(section_name)), + )?; + info!("Adding images for {:?}", name); + article.img_urls.iter().for_each(|img| { + // TODO: Add error handling and return errors as a vec + let mut file_path = std::env::temp_dir(); + file_path.push(&img.0); + + let img_buf = File::open(&file_path).expect("Can't read file"); + epub.add_resource( + file_path.file_name().unwrap(), + img_buf, + img.1.as_ref().unwrap(), + ) + .unwrap(); + }); + info!("Added images for {:?}", name); + Ok(()) + }; + if let Err(mut error) = article_result() { + error.set_article_source(&article.url); + errors.push(error); + } + bar.inc(1); + 
successful_articles_table.add_row(vec![article.metadata().title()]); + epub + }); + let appendix = generate_appendix(articles.iter().collect()); + if let Err(err) = epub.add_content( + EpubContent::new("appendix.xhtml", appendix.as_bytes()) + .title(replace_metadata_value("Article Sources")), + ) { + let mut paperoni_err: PaperoniError = err.into(); + paperoni_err.set_article_source(name); + errors.push(paperoni_err); + return Err(errors); + } + + let mut out_file = File::create(&name).unwrap(); + match epub.generate(&mut out_file) { + Ok(_) => (), + Err(err) => { + let mut paperoni_err: PaperoniError = err.into(); + paperoni_err.set_article_source(name); + errors.push(paperoni_err); + return Err(errors); + } + } + + bar.finish_with_message("Generated epub\n"); + debug!("Created {:?}", name); + println!("Created {:?}", name); + } + None => { + successful_articles_table + .set_header(vec![Cell::new("Downloaded articles") + .add_attribute(Attribute::Bold) + .set_alignment(CellAlignment::Center) + .fg(Color::Green)]) + .set_content_arrangement(ContentArrangement::Dynamic); + + for article in &articles { + let mut result = || -> Result<(), PaperoniError> { + let mut epub = EpubBuilder::new(ZipLibrary::new()?)?; + let file_name = format!( + "{}.epub", + article + .metadata() + .title() + .replace("/", " ") + .replace("\\", " ") + ); + debug!("Creating {:?}", file_name); + let mut out_file = File::create(&file_name).unwrap(); let mut html_buf = Vec::new(); - extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf) + extractor::serialize_to_xhtml(article.article(), &mut html_buf) .expect("Unable to serialize to xhtml"); let html_str = std::str::from_utf8(&html_buf).unwrap(); - epub.metadata("title", replace_metadata_value(name)) - .unwrap(); - let section_name = article.metadata().title(); - epub.add_content( - EpubContent::new(format!("article_{}.xhtml", idx), html_str.as_bytes()) - .title(replace_metadata_value(section_name)), - ) - .unwrap(); - - 
article.img_urls.iter().for_each(|img| { + if let Some(author) = article.metadata().byline() { + epub.metadata("author", replace_metadata_value(author))?; + } + epub.metadata("title", replace_metadata_value(article.metadata().title()))?; + epub.add_content(EpubContent::new("index.xhtml", html_str.as_bytes()))?; + for img in &article.img_urls { let mut file_path = std::env::temp_dir(); file_path.push(&img.0); @@ -35,52 +162,35 @@ pub fn generate_epubs(articles: Vec, merged: Option<&String>) { file_path.file_name().unwrap(), img_buf, img.1.as_ref().unwrap(), - ) - .unwrap(); - }); - epub - }); - let mut out_file = File::create(&name).unwrap(); - epub.generate(&mut out_file).unwrap(); - println!("Created {:?}", name); - } - None => { - for article in articles { - let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap(); - let file_name = format!( - "{}.epub", - article - .metadata() - .title() - .replace("/", " ") - .replace("\\", " ") - ); - let mut out_file = File::create(&file_name).unwrap(); - let mut html_buf = Vec::new(); - extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf) - .expect("Unable to serialize to xhtml"); - let html_str = std::str::from_utf8(&html_buf).unwrap(); - if let Some(author) = article.metadata().byline() { - epub.metadata("author", replace_metadata_value(author)) - .unwrap(); - } - epub.metadata("title", replace_metadata_value(article.metadata().title())) - .unwrap(); - epub.add_content(EpubContent::new("index.xhtml", html_str.as_bytes())) - .unwrap(); - for img in article.img_urls { - let mut file_path = std::env::temp_dir(); - file_path.push(&img.0); + )?; + } + let appendix = generate_appendix(vec![&article]); + epub.add_content( + EpubContent::new("appendix.xhtml", appendix.as_bytes()) + .title(replace_metadata_value("Article Source")), + )?; + epub.generate(&mut out_file)?; + bar.inc(1); - let img_buf = File::open(&file_path).expect("Can't read file"); - epub.add_resource(file_path.file_name().unwrap(), 
img_buf, img.1.unwrap()) - .unwrap(); + successful_articles_table.add_row(vec![article.metadata().title()]); + + debug!("Created {:?}", file_name); + Ok(()) + }; + if let Err(mut error) = result() { + error.set_article_source(&article.url); + errors.push(error); } - epub.generate(&mut out_file).unwrap(); - println!("Created {:?}", file_name); } + bar.finish_with_message("Generated epubs\n"); } } + + if errors.is_empty() { + Ok(()) + } else { + Err(errors) + } } /// Replaces characters that have to be escaped before adding to the epub's metadata @@ -91,6 +201,37 @@ fn replace_metadata_value(value: &str) -> String { .replace(">", ">") } +//TODO: The type signature of the argument should change as it requires that merged articles create an entirely new Vec of references +fn generate_appendix(articles: Vec<&Extractor>) -> String { + let link_tags: String = articles + .iter() + .map(|article| { + let article_name = if !article.metadata().title().is_empty() { + article.metadata().title() + } else { + &article.url + }; + format!( + "{}

", + replace_metadata_value(&article.url), + replace_metadata_value(article_name) + ) + }) + .collect(); + let template = format!( + r#" + + + +

Appendix

Article sources

+ {} + +"#, + link_tags + ); + template +} + #[cfg(test)] mod test { use super::replace_metadata_value; diff --git a/src/errors.rs b/src/errors.rs new file mode 100644 index 0000000..84d1535 --- /dev/null +++ b/src/errors.rs @@ -0,0 +1,126 @@ +use thiserror::Error; + +#[derive(Error, Debug)] +pub enum ErrorKind { + #[error("[EpubError]: {0}")] + EpubError(String), + #[error("[HTTPError]: {0}")] + HTTPError(String), + #[error("[IOError]: {0}")] + IOError(String), + #[error("[UTF8Error]: {0}")] + UTF8Error(String), + #[error("[ReadabilityError]: {0}")] + ReadabilityError(String), +} + +#[derive(Error, Debug)] +#[error("{kind}")] +/// Used to represent errors from downloading images. Errors from here are used solely for debugging +/// as they are considered recoverable. +pub struct ImgError { + kind: ErrorKind, + url: Option, +} + +impl ImgError { + pub fn with_kind(kind: ErrorKind) -> Self { + ImgError { url: None, kind } + } + + pub fn set_url(&mut self, url: &str) { + self.url = Some(url.to_string()); + } + + pub fn url(&self) -> &Option { + &self.url + } +} + +impl From for ImgError { + fn from(kind: ErrorKind) -> Self { + ImgError::with_kind(kind) + } +} + +impl From for ImgError { + fn from(err: surf::Error) -> Self { + ImgError::with_kind(ErrorKind::HTTPError(err.to_string())) + } +} + +impl From for ImgError { + fn from(err: url::ParseError) -> Self { + ImgError::with_kind(ErrorKind::HTTPError(err.to_string())) + } +} + +impl From for ImgError { + fn from(err: std::io::Error) -> Self { + ImgError::with_kind(ErrorKind::IOError(err.to_string())) + } +} + +#[derive(Error, Debug)] +#[error("{kind}")] +pub struct PaperoniError { + article_source: Option, + kind: ErrorKind, +} + +impl PaperoniError { + pub fn with_kind(kind: ErrorKind) -> Self { + PaperoniError { + article_source: None, + kind, + } + } + + pub fn kind(&self) -> &ErrorKind { + &self.kind + } + + pub fn article_source(&self) -> &Option { + &self.article_source + } + + pub fn set_article_source(&mut 
self, article_source: &str) { + self.article_source = Some(article_source.to_owned()); + } +} + +impl From for PaperoniError { + fn from(kind: ErrorKind) -> Self { + PaperoniError::with_kind(kind) + } +} + +impl From for PaperoniError { + fn from(err: epub_builder::Error) -> Self { + PaperoniError::with_kind(ErrorKind::EpubError(err.description().to_owned())) + } +} + +impl From for PaperoniError { + fn from(err: surf::Error) -> Self { + PaperoniError::with_kind(ErrorKind::HTTPError(err.to_string())) + } +} + +impl From for PaperoniError { + fn from(err: url::ParseError) -> Self { + PaperoniError::with_kind(ErrorKind::HTTPError(err.to_string())) + } +} + +impl From for PaperoniError { + fn from(err: std::io::Error) -> Self { + PaperoniError::with_kind(ErrorKind::IOError(err.to_string())) + } +} + +impl From for PaperoniError { + fn from(err: std::str::Utf8Error) -> Self { + PaperoniError::with_kind(ErrorKind::UTF8Error(err.to_string())) + } +} diff --git a/src/extractor.rs b/src/extractor.rs index 0fcc5e8..2cf3f25 100644 --- a/src/extractor.rs +++ b/src/extractor.rs @@ -2,6 +2,7 @@ use std::collections::HashMap; use kuchiki::{traits::*, NodeRef}; +use crate::errors::PaperoniError; use crate::moz_readability::{MetaData, Readability}; pub type ResourceInfo = (String, Option); @@ -14,22 +15,24 @@ pub struct Extractor { article: Option, pub img_urls: Vec, readability: Readability, + pub url: String, } impl Extractor { /// Create a new instance of an HTML extractor given an HTML string - pub fn from_html(html_str: &str) -> Self { + pub fn from_html(html_str: &str, url: &str) -> Self { Extractor { article: None, img_urls: Vec::new(), readability: Readability::new(html_str), + url: url.to_string(), } } /// Locates and extracts the HTML in a document which is determined to be /// the source of the content - pub fn extract_content(&mut self, url: &str) { - self.readability.parse(url); + pub fn extract_content(&mut self) -> Result<(), PaperoniError> { + 
self.readability.parse(&self.url)?; if let Some(article_node_ref) = &self.readability.article_node { let template = r#" @@ -44,6 +47,7 @@ impl Extractor { body.as_node().append(article_node_ref.clone()); self.article = Some(doc); } + Ok(()) } /// Traverses the DOM tree of the content and retrieves the IMG URLs @@ -61,8 +65,11 @@ impl Extractor { } } - pub fn article(&self) -> Option<&NodeRef> { - self.article.as_ref() + /// Returns the extracted article [NodeRef]. It should only be called *AFTER* calling parse + pub fn article(&self) -> &NodeRef { + self.article.as_ref().expect( + "Article node doesn't exist. This may be because the document has not been parsed", + ) } pub fn metadata(&self) -> &MetaData { @@ -75,7 +82,7 @@ impl Extractor { pub fn serialize_to_xhtml( node_ref: &NodeRef, mut w: &mut W, -) -> Result<(), Box> { +) -> Result<(), PaperoniError> { let mut escape_map = HashMap::new(); escape_map.insert("<", "<"); escape_map.insert(">", ">"); @@ -96,6 +103,7 @@ pub fn serialize_to_xhtml( let attrs_str = attrs .map .iter() + .filter(|(k, _)| &k.local != "\"") .map(|(k, v)| { format!( "{}=\"{}\"", @@ -156,8 +164,10 @@ mod test { #[test] fn test_extract_img_urls() { - let mut extractor = Extractor::from_html(TEST_HTML); - extractor.extract_content("http://example.com/"); + let mut extractor = Extractor::from_html(TEST_HTML, "http://example.com/"); + extractor + .extract_content() + .expect("Article extraction failed unexpectedly"); extractor.extract_img_urls(); assert!(extractor.img_urls.len() > 0); diff --git a/src/http.rs b/src/http.rs index faf9428..bb457b1 100644 --- a/src/http.rs +++ b/src/http.rs @@ -1,65 +1,90 @@ use async_std::io::prelude::*; use async_std::{fs::File, stream}; use futures::StreamExt; +use indicatif::ProgressBar; +use log::{debug, info}; use url::Url; +use crate::errors::{ErrorKind, ImgError, PaperoniError}; use crate::extractor::Extractor; - type HTMLResource = (String, String); -pub async fn fetch_url( - url: &str, -) -> Result> { 
+pub async fn fetch_html(url: &str) -> Result { let client = surf::Client::new(); - println!("Fetching..."); + debug!("Fetching {}", url); - let mut redirect_count: u8 = 0; - let base_url = Url::parse(&url)?; - let mut url = base_url.clone(); - while redirect_count < 5 { - redirect_count += 1; - let req = surf::get(&url); - let mut res = client.send(req).await?; - if res.status().is_redirection() { - if let Some(location) = res.header(surf::http::headers::LOCATION) { - match Url::parse(location.last().as_str()) { - Ok(valid_url) => url = valid_url, - Err(e) => match e { - url::ParseError::RelativeUrlWithoutBase => { - url = base_url.join(location.last().as_str())? + let process_request = async { + let mut redirect_count: u8 = 0; + let base_url = Url::parse(&url)?; + let mut url = base_url.clone(); + while redirect_count < 5 { + redirect_count += 1; + let req = surf::get(&url); + let mut res = client.send(req).await?; + if res.status().is_redirection() { + if let Some(location) = res.header(surf::http::headers::LOCATION) { + match Url::parse(location.last().as_str()) { + Ok(valid_url) => { + info!("Redirecting {} to {}", url, valid_url); + url = valid_url } - e => return Err(e.into()), - }, - }; - } - } else if res.status().is_success() { - if let Some(mime) = res.content_type() { - if mime.essence() == "text/html" { - return Ok((url.to_string(), res.body_string().await?)); + Err(e) => match e { + url::ParseError::RelativeUrlWithoutBase => { + match base_url.join(location.last().as_str()) { + Ok(joined_url) => { + info!("Redirecting {} to {}", url, joined_url); + url = joined_url; + } + Err(e) => return Err(e.into()), + } + } + e => return Err(e.into()), + }, + }; + } + } else if res.status().is_success() { + if let Some(mime) = res.content_type() { + if mime.essence() == "text/html" { + debug!("Successfully fetched {}", url); + return Ok((url.to_string(), res.body_string().await?)); + } else { + let msg = format!( + "Invalid HTTP response. 
Received {} instead of text/html", + mime.essence() + ); + + return Err(ErrorKind::HTTPError(msg).into()); + } } else { - return Err(format!( - "Invalid HTTP response. Received {} instead of text/html", - mime.essence() - ) - .into()); + return Err(ErrorKind::HTTPError("Unknown HTTP response".to_owned()).into()); } } else { - return Err("Unknown HTTP response".into()); + let msg = format!("Request failed: HTTP {}", res.status()); + return Err(ErrorKind::HTTPError(msg).into()); } - } else { - return Err(format!("Request failed: HTTP {}", res.status()).into()); } - } - Err("Unable to fetch HTML".into()) + Err(ErrorKind::HTTPError("Unable to fetch HTML".to_owned()).into()) + }; + + process_request.await.map_err(|mut error: PaperoniError| { + error.set_article_source(url); + error + }) } pub async fn download_images( extractor: &mut Extractor, article_origin: &Url, -) -> async_std::io::Result<()> { + bar: &ProgressBar, +) -> Result<(), Vec> { if extractor.img_urls.len() > 0 { - println!("Downloading images..."); + debug!( + "Downloading {} images for {}", + extractor.img_urls.len(), + article_origin + ); } + let img_count = extractor.img_urls.len(); let imgs_req_iter = extractor .img_urls @@ -67,43 +92,73 @@ pub async fn download_images( .map(|(url, _)| { ( url, - surf::Client::new().get(get_absolute_url(&url, article_origin)), + surf::Client::new() + .with(surf::middleware::Redirect::default()) + .get(get_absolute_url(&url, article_origin)), ) }) - .map(|(url, req)| async move { - let mut img_response = req.await.expect("Unable to retrieve image"); - let img_content: Vec = img_response.body_bytes().await.unwrap(); - let img_mime = img_response - .content_type() - .map(|mime| mime.essence().to_string()); - let img_ext = img_response - .content_type() - .map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string()) - .unwrap(); + .enumerate() + .map(|(img_idx, (url, req))| async move { + bar.set_message(format!("Downloading images [{}/{}]", img_idx + 1, 
img_count).as_str()); + match req.await { + Ok(mut img_response) => { + let process_response = async { + let img_content: Vec = match img_response.body_bytes().await { + Ok(bytes) => bytes, + Err(e) => return Err(e.into()), + }; + let img_mime = img_response + .content_type() + .map(|mime| mime.essence().to_string()); + let img_ext = match img_response + .content_type() + .map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string()) + { + Some(mime_str) => mime_str, + None => { + return Err(ErrorKind::HTTPError( + "Image has no Content-Type".to_owned(), + ) + .into()) + } + }; - let mut img_path = std::env::temp_dir(); - img_path.push(format!("{}.{}", hash_url(&url), &img_ext)); - let mut img_file = File::create(&img_path) - .await - .expect("Unable to create file"); - img_file - .write_all(&img_content) - .await - .expect("Unable to save to file"); + let mut img_path = std::env::temp_dir(); + img_path.push(format!("{}.{}", hash_url(&url), &img_ext)); + let mut img_file = match File::create(&img_path).await { + Ok(file) => file, + Err(e) => return Err(e.into()), + }; + match img_file.write_all(&img_content).await { + Ok(_) => (), + Err(e) => return Err(e.into()), + } - ( - url, - img_path - .file_name() - .map(|os_str_name| { - os_str_name - .to_str() - .expect("Unable to get image file name") - .to_string() + Ok(( + url, + img_path + .file_name() + .map(|os_str_name| { + os_str_name + .to_str() + .expect("Unable to get image file name") + .to_string() + }) + .unwrap(), + img_mime, + )) + }; + process_response.await.map_err(|mut e: ImgError| { + e.set_url(url); + e }) - .unwrap(), - img_mime, - ) + } + Err(e) => { + let mut img_err: ImgError = e.into(); + img_err.set_url(url); + Err(img_err) + } + } }); // A utility closure used when update the value of an image source after downloading is successful @@ -112,8 +167,6 @@ pub async fn download_images( let (img_url, img_path, img_mime) = img_item; let img_ref = extractor .article() - .as_mut() - .expect("Unable to 
get mutable ref") .select_first(&format!("img[src='{}']", img_url)) .expect("Image node does not exist"); let mut img_node = img_ref.attributes.borrow_mut(); @@ -124,14 +177,24 @@ pub async fn download_images( (img_path, img_mime) }; - extractor.img_urls = stream::from_iter(imgs_req_iter) + let imgs_req_iter = stream::from_iter(imgs_req_iter) .buffered(10) - .collect::>() - .await - .into_iter() - .map(replace_existing_img_src) - .collect(); - Ok(()) + .collect::>>() + .await; + let mut errors = Vec::new(); + let mut replaced_imgs = Vec::new(); + for img_req_result in imgs_req_iter { + match img_req_result { + Ok(img_req) => replaced_imgs.push(replace_existing_img_src(img_req)), + Err(e) => errors.push(e), + } + } + extractor.img_urls = replaced_imgs; + if errors.is_empty() { + Ok(()) + } else { + Err(errors) + } } /// Handles getting the extension from a given MIME subtype. diff --git a/src/logs.rs b/src/logs.rs new file mode 100644 index 0000000..87b5d1b --- /dev/null +++ b/src/logs.rs @@ -0,0 +1,260 @@ +use colored::*; +use comfy_table::presets::UTF8_HORIZONTAL_BORDERS_ONLY; +use comfy_table::{Cell, CellAlignment, ContentArrangement, Table}; +use directories::UserDirs; +use flexi_logger::LogSpecBuilder; +use log::error; + +use crate::{cli::AppConfig, errors::PaperoniError}; + +pub fn display_summary( + initial_article_count: usize, + succesful_articles_table: Table, + partial_downloads_count: usize, + errors: Vec, +) { + let successfully_downloaded_count = + initial_article_count - partial_downloads_count - errors.len(); + + println!( + "{}", + short_summary(DownloadCount::new( + initial_article_count, + successfully_downloaded_count, + partial_downloads_count, + errors.len() + )) + .bold() + ); + + if successfully_downloaded_count > 0 { + println!("{}", succesful_articles_table); + } + if !errors.is_empty() { + println!("\n{}", "Failed article downloads".bright_red().bold()); + let mut table_failed = Table::new(); + table_failed + 
.load_preset(UTF8_HORIZONTAL_BORDERS_ONLY) + .set_header(vec![ + Cell::new("Link").set_alignment(CellAlignment::Center), + Cell::new("Reason").set_alignment(CellAlignment::Center), + ]) + .set_content_arrangement(ContentArrangement::Dynamic); + + for error in errors { + let error_source = error + .article_source() + .clone() + .unwrap_or_else(|| "".to_string()); + table_failed.add_row(vec![&error_source, &format!("{}", error.kind())]); + error!("{}\n - {}", error, error_source); + } + println!("{}", table_failed); + } +} + +/// Returns a string summary of the total number of failed and successful article downloads +fn short_summary(download_count: DownloadCount) -> String { + // TODO: Refactor this + if download_count.total + != download_count.successful + download_count.failed + download_count.partial + { + panic!("initial_count must be equal to the sum of failed and successful count") + } + let get_noun = |count: usize| if count == 1 { "article" } else { "articles" }; + if download_count.successful == download_count.total && download_count.successful == 1 { + "Article downloaded successfully".green().to_string() + } else if download_count.total == download_count.failed && download_count.failed == 1 { + "Article failed to download".red().to_string() + } else if download_count.total == download_count.partial && download_count.partial == 1 { + "Article partially failed to download".yellow().to_string() + } else if download_count.successful == download_count.total { + "All articles downloaded successfully".green().to_string() + } else if download_count.failed == download_count.total { + "All articles failed to download".red().to_string() + } else if download_count.partial == download_count.total { + "All articles partially failed to download" + .yellow() + .to_string() + } else if download_count.partial == 0 { + format!( + "{} {} downloaded successfully, {} {} failed", + download_count.successful, + get_noun(download_count.successful), + download_count.failed, + 
get_noun(download_count.failed) + ) + .yellow() + .to_string() + } else if download_count.successful == 0 + && download_count.partial > 0 + && download_count.failed > 0 + { + format!( + "{} {} partially failed to download, {} {} failed", + download_count.partial, + get_noun(download_count.partial), + download_count.failed, + get_noun(download_count.failed) + ) + .yellow() + .to_string() + } else if download_count.failed == 0 + && download_count.successful > 0 + && download_count.partial > 0 + { + format!( + "{} {} downloaded successfully, {} {} partially failed to download", + download_count.successful, + get_noun(download_count.successful), + download_count.partial, + get_noun(download_count.partial) + ) + .yellow() + .to_string() + } else { + format!( + "{} {} downloaded successfully, {} {} partially failed to download, {} {} failed", + download_count.successful, + get_noun(download_count.successful), + download_count.partial, + get_noun(download_count.partial), + download_count.failed, + get_noun(download_count.failed) + ) + .yellow() + .to_string() + } +} + +struct DownloadCount { + total: usize, + successful: usize, + partial: usize, + failed: usize, +} +impl DownloadCount { + fn new(total: usize, successful: usize, partial: usize, failed: usize) -> Self { + Self { + total, + successful, + partial, + failed, + } + } +} + +pub fn init_logger(app_config: &AppConfig) { + match UserDirs::new() { + Some(user_dirs) => { + let home_dir = user_dirs.home_dir(); + let paperoni_dir = home_dir.join(".paperoni"); + let log_dir = paperoni_dir.join("logs"); + + let log_spec = LogSpecBuilder::new() + .module("paperoni", app_config.log_level()) + .build(); + let formatted_timestamp = app_config.start_time().format("%Y-%m-%d_%H-%M-%S"); + let mut logger = flexi_logger::Logger::with(log_spec); + + if app_config.is_logging_to_file() && (!paperoni_dir.is_dir() || !log_dir.is_dir()) { + match std::fs::create_dir_all(&log_dir) { + Ok(_) => (), + Err(e) => { + eprintln!("Unable to 
create paperoni directories on home directory for logging purposes\n{}",e); + std::process::exit(1); + } + }; + } + + if app_config.is_logging_to_file() { + logger = logger + .directory(log_dir) + .discriminant(formatted_timestamp.to_string()) + .suppress_timestamp() + .log_to_file(); + } + + match logger.start() { + Ok(_) => (), + Err(e) => eprintln!("Unable to start logger!\n{}", e), + } + } + None => eprintln!("Unable to get user directories for logging purposes"), + }; +} + +#[cfg(test)] +mod tests { + use super::{short_summary, DownloadCount}; + use colored::*; + #[test] + fn test_short_summary() { + assert_eq!( + short_summary(DownloadCount::new(1, 1, 0, 0)), + "Article downloaded successfully".green().to_string() + ); + assert_eq!( + short_summary(DownloadCount::new(1, 0, 0, 1)), + "Article failed to download".red().to_string() + ); + assert_eq!( + short_summary(DownloadCount::new(10, 10, 0, 0)), + "All articles downloaded successfully".green().to_string() + ); + assert_eq!( + short_summary(DownloadCount::new(10, 0, 0, 10)), + "All articles failed to download".red().to_string() + ); + assert_eq!( + short_summary(DownloadCount::new(10, 8, 0, 2)), + "8 articles downloaded successfully, 2 articles failed" + .yellow() + .to_string() + ); + assert_eq!( + short_summary(DownloadCount::new(10, 1, 0, 9)), + "1 article downloaded successfully, 9 articles failed" + .yellow() + .to_string() + ); + assert_eq!( + short_summary(DownloadCount::new(7, 6, 0, 1)), + "6 articles downloaded successfully, 1 article failed" + .yellow() + .to_string() + ); + assert_eq!( + short_summary(DownloadCount::new(7, 4, 2, 1)), + "4 articles downloaded successfully, 2 articles partially failed to download, 1 article failed" + .yellow() + .to_string() + ); + assert_eq!( + short_summary(DownloadCount::new(12, 6, 6, 0)), + "6 articles downloaded successfully, 6 articles partially failed to download" + .yellow() + .to_string() + ); + assert_eq!( + short_summary(DownloadCount::new(5, 0, 4, 1)), + 
"4 articles partially failed to download, 1 article failed" + .yellow() + .to_string() + ); + assert_eq!( + short_summary(DownloadCount::new(4, 0, 4, 0)), + "All articles partially failed to download" + .yellow() + .to_string() + ); + } + + #[test] + #[should_panic( + expected = "initial_count must be equal to the sum of failed and successful count" + )] + fn test_short_summary_panics_on_invalid_input() { + short_summary(DownloadCount::new(0, 12, 0, 43)); + } +} diff --git a/src/main.rs b/src/main.rs index 0467712..0f8b34a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -3,21 +3,28 @@ extern crate lazy_static; use async_std::stream; use async_std::task; +use comfy_table::presets::{UTF8_FULL, UTF8_HORIZONTAL_BORDERS_ONLY}; +use comfy_table::{ContentArrangement, Table}; use futures::stream::StreamExt; +use indicatif::{ProgressBar, ProgressStyle}; +use log::{debug, warn}; use url::Url; mod cli; mod epub; +mod errors; mod extractor; /// This module is responsible for async HTTP calls for downloading /// the HTML content and images mod http; +mod logs; mod moz_readability; use cli::AppConfig; use epub::generate_epubs; use extractor::Extractor; -use http::{download_images, fetch_url}; +use http::{download_images, fetch_html}; +use logs::display_summary; fn main() { let app_config = cli::cli_init(); @@ -28,29 +35,92 @@ fn main() { } fn download(app_config: AppConfig) { + let mut errors = Vec::new(); + let mut partial_download_count: usize = 0; + let bar = if app_config.can_disable_progress_bar() { + ProgressBar::hidden() + } else { + let enabled_bar = ProgressBar::new(app_config.urls().len() as u64); + let style = ProgressStyle::default_bar().template( + "{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} link {pos}/{len:7} {msg:.yellow/white}", + ); + enabled_bar.set_style(style); + enabled_bar.enable_steady_tick(500); + enabled_bar + }; let articles = task::block_on(async { - let urls_iter = app_config.urls().iter().map(|url| fetch_url(url)); + let urls_iter = 
app_config.urls().iter().map(|url| fetch_html(url)); let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn()); let mut articles = Vec::new(); while let Some(fetch_result) = responses.next().await { match fetch_result { Ok((url, html)) => { - println!("Extracting"); - let mut extractor = Extractor::from_html(&html); - extractor.extract_content(&url); - - if extractor.article().is_some() { - extractor.extract_img_urls(); - download_images(&mut extractor, &Url::parse(&url).unwrap()) - .await - .expect("Unable to download images"); - articles.push(extractor); + debug!("Extracting {}", &url); + let mut extractor = Extractor::from_html(&html, &url); + bar.set_message("Extracting..."); + match extractor.extract_content() { + Ok(_) => { + extractor.extract_img_urls(); + if let Err(img_errors) = + download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar) + .await + { + partial_download_count += 1; + warn!( + "{} image{} failed to download for {}", + img_errors.len(), + if img_errors.len() > 1 { "s" } else { "" }, + url + ); + for img_error in img_errors { + warn!( + "{}\n\t\tReason {}", + img_error.url().as_ref().unwrap(), + img_error + ); + } + } + articles.push(extractor); + } + Err(mut e) => { + e.set_article_source(&url); + errors.push(e); + } } } - Err(e) => eprintln!("{}", e), + Err(e) => errors.push(e), } + bar.inc(1); } articles }); - generate_epubs(articles, app_config.merged()); + bar.finish_with_message("Downloaded articles"); + + let mut succesful_articles_table = Table::new(); + succesful_articles_table + .load_preset(UTF8_FULL) + .load_preset(UTF8_HORIZONTAL_BORDERS_ONLY) + .set_content_arrangement(ContentArrangement::Dynamic); + match generate_epubs(articles, &app_config, &mut succesful_articles_table) { + Ok(_) => (), + Err(gen_epub_errors) => { + errors.extend(gen_epub_errors); + } + }; + let has_errors = !errors.is_empty(); + display_summary( + app_config.urls().len(), + succesful_articles_table, + partial_download_count, + 
errors, + ); + if app_config.is_logging_to_file() { + println!( + "Log written to paperoni_{}.log\n", + app_config.start_time().format("%Y-%m-%d_%H-%M-%S") + ); + } + if has_errors { + std::process::exit(1); + } } diff --git a/src/moz_readability/mod.rs b/src/moz_readability/mod.rs index 9b25b79..38236d3 100644 --- a/src/moz_readability/mod.rs +++ b/src/moz_readability/mod.rs @@ -7,8 +7,11 @@ use kuchiki::{ traits::*, NodeData, NodeRef, }; +use log::info; use url::Url; +use crate::errors::{ErrorKind, PaperoniError}; + const DEFAULT_CHAR_THRESHOLD: usize = 500; const FLAG_STRIP_UNLIKELYS: u32 = 0x1; const FLAG_WEIGHT_CLASSES: u32 = 0x2; @@ -76,14 +79,15 @@ impl Readability { metadata: MetaData::new(), } } - pub fn parse(&mut self, url: &str) { + pub fn parse(&mut self, url: &str) -> Result<(), PaperoniError> { self.unwrap_no_script_tags(); self.remove_scripts(); self.prep_document(); self.metadata = self.get_article_metadata(); self.article_title = self.metadata.title.clone(); - self.grab_article(); + self.grab_article()?; self.post_process_content(url); + Ok(()) } /// Recursively check if node is image, or if node contains exactly only one image @@ -426,8 +430,7 @@ impl Readability { let mut matches = None; if let Some(property) = node_attr.get("property") { matches = regexes::PROPERTY_REGEX.captures(property); - if matches.is_some() { - let captures = matches.as_ref().unwrap(); + if let Some(captures) = &matches { for capture in captures.iter() { let mut name = capture.unwrap().as_str().to_lowercase(); name = regexes::REPLACE_WHITESPACE_REGEX @@ -561,7 +564,7 @@ impl Readability { .root_node .select_first("title") .map(|title| title.text_contents().trim().to_string()) - .expect("This file has no tag to extract a title from"); + .unwrap_or("".to_string()); let orig_title = cur_title.clone(); let mut title_had_hierarchical_separators = false; let word_count = |s: &str| -> usize { s.split_whitespace().count() }; @@ -595,8 +598,8 @@ impl Readability { } } else if 
cur_title.len() > 150 || cur_title.len() < 15 { let mut h1_nodes = self.root_node.select("h1").unwrap(); - let (_, h1_count) = h1_nodes.size_hint(); - if Some(1) == h1_count { + let h1_count = self.root_node.select("h1").unwrap().count(); + if h1_count == 1 { cur_title = Self::get_inner_text(h1_nodes.next().unwrap().as_node(), None); } } @@ -799,6 +802,7 @@ impl Readability { state = State::ReadProp; decl.1 = Some(token.trim().to_string()); tokens.push(decl.clone()); + decl = (None, None); token.clear(); } else { token.push(c); @@ -819,11 +823,18 @@ impl Readability { } } if !token.is_empty() { - decl.1 = Some(token.trim().to_string()); - tokens.push(decl); + match state { + State::ReadVal => { + decl.1 = Some(token.trim().to_string()); + tokens.push(decl); + } + _ => (), + } } + tokens .into_iter() + .filter(|tok_pair| tok_pair.0.is_some() && tok_pair.1.is_some()) .map(|tok_pair| (tok_pair.0.unwrap(), tok_pair.1.unwrap())) .collect() } @@ -1576,16 +1587,14 @@ impl Readability { /// Using a variety of metrics (content score, classname, element types), find the content that is most likely to be the stuff /// a user wants to read. Then return it wrapped up in a div. - fn grab_article(&mut self) { - println!("Grabbing article"); + fn grab_article(&mut self) -> Result<(), PaperoniError> { + info!("Grabbing article {:?}", self.metadata.title); // var doc = this._doc; // var isPaging = (page !== null ? true: false); // page = page ? 
page : this._doc.body; let page = self.root_node.select_first("body"); if page.is_err() { - // TODO:Have error logging for this - println!("Document has no <body>"); - return; + return Err(ErrorKind::ReadabilityError("Document has no <body>".into()).into()); } let page = page.unwrap(); let mut attempts: Vec<ExtractAttempt> = Vec::new(); @@ -2075,8 +2084,10 @@ impl Readability { attempts.push(ExtractAttempt::new(article_content.clone(), text_length)); attempts.sort_by(|a, b| b.length.partial_cmp(&a.length).unwrap()); if attempts.first().as_ref().unwrap().length == 0 { - println!("Unable to extract content"); - break; + return Err(ErrorKind::ReadabilityError( + "Unable to extract content".into(), + ) + .into()); } article_content = attempts[0].article.clone(); parse_successful = true; @@ -2102,7 +2113,8 @@ impl Readability { false }); self.article_node = Some(article_content); - return; + info!("Successfully grabbed article {:?}", self.metadata.title); + return Ok(()); } } } @@ -2460,12 +2472,24 @@ mod test { css_map.insert("align-items".to_string(), "center".to_string()); css_map.insert("border".to_string(), "2px solid black".to_string()); - let css_str_to_vec = Readability::inline_css_str_to_map(css_str); - assert_eq!(css_map, css_str_to_vec); + let css_str_to_map = Readability::inline_css_str_to_map(css_str); + assert_eq!(css_map, css_str_to_map); let mut css_map = HashMap::new(); css_map.insert("color".to_string(), "red".to_string()); css_map.insert("background-image".to_string(), "url('data:image/jpeg;base64,/wgARCAALABQDASIAAhEBAxEB/8QAFwABAQEBAAAAAAAAAAAAAAAAAgADBP/')".to_string()); assert_eq!(css_map, Readability::inline_css_str_to_map("color: red;background-image: url('data:image/jpeg;base64,/wgARCAALABQDASIAAhEBAxEB/8QAFwABAQEBAAAAAAAAAAAAAAAAAgADBP/')")); + + let empty_map = HashMap::new(); + assert_eq!(empty_map, Readability::inline_css_str_to_map(" \n \t \r")); + assert_eq!(empty_map, Readability::inline_css_str_to_map("color")); + + let mut css_map = 
HashMap::new(); + css_map.insert("color".to_string(), "red".to_string()); + css_map.insert("height".to_string(), "300px".to_string()); + assert_eq!( + css_map, + Readability::inline_css_str_to_map("color: red;height: 300px;width") + ); } #[test]