diff --git a/.gitignore b/.gitignore index ea8c4bf..3ae8faf 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ /target +*.epub \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index f48fe72..45e53ef 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -15,6 +15,15 @@ dependencies = [ "memchr", ] +[[package]] +name = "ansi_term" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b" +dependencies = [ + "winapi 0.3.8", +] + [[package]] name = "async-std" version = "1.5.0" @@ -50,6 +59,17 @@ dependencies = [ "winapi 0.3.8", ] +[[package]] +name = "atty" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi", + "libc", + "winapi 0.3.8", +] + [[package]] name = "autocfg" version = "0.1.7" @@ -155,6 +175,21 @@ dependencies = [ "time", ] +[[package]] +name = "clap" +version = "2.33.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bdfa80d47f954d53a35a64987ca1422f495b8d6483c0fe9f7117b36c2a792129" +dependencies = [ + "ansi_term", + "atty", + "bitflags", + "strsim", + "textwrap", + "unicode-width", + "vec_map", +] + [[package]] name = "cloudabi" version = "0.0.3" @@ -523,6 +558,15 @@ dependencies = [ "wasi", ] +[[package]] +name = "heck" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20564e78d53d2bb135c343b3f47714a56af2061f1c928fdb541dc7b9fdd94205" +dependencies = [ + "unicode-segmentation", +] + [[package]] name = "hermit-abi" version = "0.1.12" @@ -660,9 +704,9 @@ dependencies = [ [[package]] name = "kuchiki" -version = "0.8.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1beeffc5ae5ab0def2cb85e26063a8e6b4f579b0adec3805bf87472086948956" +checksum = "1ea8e9c6e031377cff82ee3001dc8026cdf431ed4e2e6b51f98ab8c73484a358" dependencies = [ "cssparser", "html5ever 0.25.1", @@ -966,12 +1010,16 @@ dependencies = [ [[package]] name = "paperoni" -version = "0.1.0" +version = "0.1.0-alpha1" dependencies = [ "async-std", "epub-builder", + "html5ever 0.25.1", "kuchiki", + "lazy_static 1.4.0", "md5", + "regex", + "structopt", "surf", "url", ] @@ -1110,6 +1158,32 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" +[[package]] +name = "proc-macro-error" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "98e9e4b82e0ef281812565ea4751049f1bdcdfccda7d3f459f2e138a40c08678" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f5444ead4e9935abd7f27dc51f7e852a0569ac888096d5ec2499470794e2e53" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "syn-mid", + "version_check", +] + [[package]] name = "proc-macro-hack" version = "0.5.15" @@ -1321,9 +1395,9 @@ checksum = "2439c63f3f6139d1b57529d16bc3b8bb855230c8efcc5d3a896c8bea7c3b1e84" [[package]] name = "regex" -version = "1.3.7" +version = "1.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6020f034922e3194c711b82a627453881bc4682166cabb07134a10c26ba7692" +checksum = 
"9c3780fcf44b193bc4d09f36d2a3c87b251da4a046c87795a0d35f4f927ad8e6" dependencies = [ "aho-corasick", "memchr", @@ -1333,9 +1407,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.6.17" +version = "0.6.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fe5bd57d1d7414c6b5ed48563a2c855d995ff777729dcd91c369ec7fea395ae" +checksum = "26412eb97c6b088a6997e05f69403a802a92d520de2f8e63c2b65f9e0f47c4e8" [[package]] name = "remove_dir_all" @@ -1577,6 +1651,36 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b1884d1bc09741d466d9b14e6d37ac89d6909cbcac41dd9ae982d4d063bbedfc" +[[package]] +name = "strsim" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" + +[[package]] +name = "structopt" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "863246aaf5ddd0d6928dfeb1a9ca65f505599e4e1b399935ef7e75107516b4ef" +dependencies = [ + "clap", + "lazy_static 1.4.0", + "structopt-derive", +] + +[[package]] +name = "structopt-derive" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d239ca4b13aee7a2142e6795cbd69e457665ff8037aed33b3effdc430d2f927a" +dependencies = [ + "heck", + "proc-macro-error", + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "surf" version = "1.0.3" @@ -1610,6 +1714,17 @@ dependencies = [ "unicode-xid", ] +[[package]] +name = "syn-mid" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7be3539f6c128a931cf19dcee741c1af532c7fd387baa739c03dd2e96479338a" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "tempdir" version = "0.3.7" @@ -1631,6 +1746,15 @@ dependencies = [ "utf-8", ] +[[package]] +name = "textwrap" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" +dependencies = [ + "unicode-width", +] + [[package]] name = "thin-slice" version = "0.1.1" @@ -1694,6 +1818,12 @@ dependencies = [ "smallvec", ] +[[package]] +name = "unicode-segmentation" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e83e153d1053cbb5a118eeff7fd5be06ed99153f00dbcd8ae310c5fb2b22edc0" + [[package]] name = "unicode-width" version = "0.1.7" @@ -1747,6 +1877,12 @@ version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fc439f2794e98976c88a2a2dafce96b930fe8010b0a256b3c2199a773933168" +[[package]] +name = "vec_map" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" + [[package]] name = "version_check" version = "0.9.1" diff --git a/Cargo.toml b/Cargo.toml index 867a34a..a630c09 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,9 @@ [package] +description = "A web article downloader" +homepage = "https://github.com/hipstermojo/paperoni" +repository = "https://github.com/hipstermojo/paperoni" name = "paperoni" -version = "0.1.0" +version = "0.1.0-alpha1" authors = ["Kenneth Gitere "] edition = "2018" license = "MIT" @@ -10,7 +13,11 @@ license = "MIT" [dependencies] async-std = "1.5.0" epub-builder = "0.4.5" -kuchiki = "0.8.0" +html5ever = "0.25.1" +kuchiki = "0.8.1" +lazy_static = "1.3.9" md5 = "0.7.0" +regex = "1.3.9" surf = "1.0.3" 
+structopt = { version = "0.3" } url = "2.1.1" \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..fa57d8f --- /dev/null +++ b/README.md @@ -0,0 +1,53 @@ +

+<p align="center"><img src="./paperoni-dark.png"></p>
+
+<p align="center"><i>Salami not included</i></p>
+
+Paperoni is a web article downloader written in Rust. The downloaded articles are then exported as EPUB files.
+
+> This project is in an alpha release, so it is still quite unstable.
+
+## Usage
+
+```sh
+paperoni https://en.wikipedia.org/wiki/Pepperoni
+```
+
+Paperoni also supports passing multiple links as arguments. If you are on a Unix-like OS, you can simply do something like this:
+
+```sh
+cat links.txt | xargs paperoni
+```
+
+## How it works
+
+The URL passed to Paperoni is fetched and the returned HTML response is passed to the extractor.
+The extractor retrieves a possible article using a port of the [Mozilla Readability algorithm](https://github.com/mozilla/readability). This article is then saved as an EPUB.
+
+> The port of the algorithm is itself still unstable, so it is not yet compatible with all the websites that Readability can extract.
+
+## How it (currently) doesn't work
+
+This program is still in alpha, so a number of things currently break:
+
+- Links with redirects will crash the program, as it has no redirect logic.
+- Websites that require JavaScript to render cannot be extracted.
+- Articles that cannot be extracted by Readability cannot be extracted by Paperoni either.
+
+## Running locally
+
+### Precompiled binaries
+
+Check the [releases](https://github.com/hipstermojo/paperoni/releases) page for precompiled binaries. Currently there are only builds for Debian and Arch.
+
+### Building from source
+
+This project uses `async/.await`, so it must be compiled with Rust 1.39 or newer. Preferably use the latest stable version of Rust.
+
+```sh
+git clone https://github.com/hipstermojo/paperoni.git
+cd paperoni
+## You can build and install paperoni locally
+cargo install --path .
+## or use it from within the project
+cargo run -- # pass your url here
+```
diff --git a/paperoni-dark.png b/paperoni-dark.png
new file mode 100644
index 0000000..8339a48
Binary files /dev/null and b/paperoni-dark.png differ
diff --git a/src/cli.rs b/src/cli.rs
new file mode 100644
index 0000000..e0e12db
--- /dev/null
+++ b/src/cli.rs
@@ -0,0 +1,13 @@
+use structopt::StructOpt;
+
+#[derive(Debug, StructOpt)]
+#[structopt(name = "paperoni")]
+/// Paperoni is an article downloader.
+///
+/// It takes a url and downloads the article content from it and
+/// saves it to an epub.
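+///
+/// Multiple urls can be passed; each article is downloaded and written
+/// to its own epub file named after the article title.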
+pub struct Opts { + // #[structopt(conflicts_with("links"))] + /// Url of a web article + pub urls: Vec, +} diff --git a/src/extractor.rs b/src/extractor.rs index 2355939..93ab5bb 100644 --- a/src/extractor.rs +++ b/src/extractor.rs @@ -1,90 +1,53 @@ use async_std::fs::File; use async_std::io::prelude::*; use async_std::task; -use kuchiki::{traits::*, ElementData, NodeDataRef, NodeRef}; +use kuchiki::{traits::*, NodeRef}; use url::Url; +use crate::moz_readability::{MetaData, Readability}; + pub type ResourceInfo = (String, Option); pub struct Extractor { - pub root_node: NodeRef, - pub content: Option>, + article: Option, pub img_urls: Vec, + readability: Readability, } impl Extractor { /// Create a new instance of an HTML extractor given an HTML string pub fn from_html(html_str: &str) -> Self { Extractor { - content: None, + article: None, img_urls: Vec::new(), - root_node: kuchiki::parse_html().one(html_str), + readability: Readability::new(html_str), } } - /// Extract the value of an attribute - fn extract_attr_val U, U>( - &self, - css_selector: &str, - attr_target: &str, - mapper: T, - ) -> Option { - self.root_node - .select_first(css_selector) - .ok() - .and_then(|data| data.attributes.borrow().get(attr_target).map(mapper)) - } - - /// Extract the text of a DOM node given its CSS selector - fn extract_inner_text(&self, css_selector: &str) -> Option { - let node_ref = self.root_node.select_first(css_selector).ok()?; - extract_text_from_node(node_ref.as_node()) - } - /// Locates and extracts the HTML in a document which is determined to be /// the source of the content - pub fn extract_content(&mut self) { - // Extract the useful parts of the head section - let author: Option = - self.extract_attr_val("meta[name='author']", "content", |author| { - author.to_string() - }); - - let description = - self.extract_attr_val("meta[name='description']", "content", |description| { - description.to_string() - }); - - let tags = self.extract_attr_val("meta[name='keywords']", "content", |tags| { - tags.split(",") - .map(|tag| tag.trim().to_string()) - .collect::>() - }); - - let title = self.extract_inner_text("title").unwrap_or("".to_string()); - let lang = self - .extract_attr_val("html", "lang", |lang| lang.to_string()) - .unwrap_or("en".to_string()); - - let meta_attrs = MetaAttr::new(author, description, lang, tags, title); - - // Extract the article - - let article_ref = self.root_node.select_first("article").unwrap(); - - for node_ref in article_ref.as_node().descendants() { - match node_ref.data() { - kuchiki::NodeData::Element(..) | kuchiki::NodeData::Text(..) 
=> (), - _ => node_ref.detach(), - } + pub fn extract_content(&mut self, url: &str) { + self.readability.parse(url); + if let Some(article_node_ref) = &self.readability.article_node { + let template = r#" + + + + + + + "#; + let doc = kuchiki::parse_html().one(template); + let body = doc.select_first("body").unwrap(); + body.as_node().append(article_node_ref.clone()); + self.article = Some(doc); } - self.content = Some(article_ref); } /// Traverses the DOM tree of the content and retrieves the IMG URLs fn extract_img_urls(&mut self) { - if let Some(content_ref) = &self.content { - for img_ref in content_ref.as_node().select("img").unwrap() { + if let Some(content_ref) = &self.readability.article_node { + for img_ref in content_ref.select("img").unwrap() { img_ref.as_node().as_element().map(|img_elem| { img_elem.attributes.borrow().get("src").map(|img_url| { if !img_url.is_empty() { @@ -101,10 +64,10 @@ impl Extractor { self.extract_img_urls(); println!("Downloading images to res/"); for img_url in &self.img_urls { - let mut img_url = img_url.0.clone(); - get_absolute_url(&mut img_url, article_origin); - async_download_tasks.push(task::spawn(async { - let mut img_response = surf::get(&img_url).await.expect("Unable to retrieve file"); + let img_url = img_url.0.clone(); + let abs_url = get_absolute_url(&img_url, article_origin); + async_download_tasks.push(task::spawn(async move { + let mut img_response = surf::get(&abs_url).await.expect("Unable to retrieve file"); let img_content: Vec = img_response.body_bytes().await.unwrap(); let img_mime = img_response .header("Content-Type") @@ -114,7 +77,7 @@ impl Extractor { .and_then(map_mime_type_to_ext) .unwrap(); - let img_path = format!("res/{}{}", hash_url(&img_url), &img_ext); + let img_path = format!("res/{}{}", hash_url(&abs_url), &img_ext); let mut img_file = File::create(&img_path) .await .expect("Unable to create file"); @@ -133,10 +96,10 @@ impl Extractor { let (img_url, img_path, img_mime) = async_task.await; // Update the image sources let img_ref = self - .content + .readability + .article_node .as_mut() .expect("Unable to get mutable ref") - .as_node() .select_first(&format!("img[src='{}']", img_url)) .expect("Image node does not exist"); let mut img_node = img_ref.attributes.borrow_mut(); @@ -145,11 +108,14 @@ impl Extractor { } Ok(()) } -} -fn extract_text_from_node(node: &NodeRef) -> Option { - node.first_child() - .map(|child_ref| child_ref.text_contents()) + pub fn article(&self) -> Option<&NodeRef> { + self.article.as_ref() + } + + pub fn metadata(&self) -> &MetaData { + &self.readability.metadata + } } /// Utility for hashing URLs. 
This is used to help store files locally with unique values @@ -174,10 +140,11 @@ fn map_mime_type_to_ext(mime_type: &str) -> Option { .map(|format| String::from(".") + format) } -fn get_absolute_url(url: &mut String, request_url: &Url) { +fn get_absolute_url(url: &str, request_url: &Url) -> String { if Url::parse(url).is_ok() { + url.to_owned() } else if url.starts_with("/") { - *url = Url::parse(&format!( + Url::parse(&format!( "{}://{}", request_url.scheme(), request_url.host_str().unwrap() @@ -185,36 +152,9 @@ fn get_absolute_url(url: &mut String, request_url: &Url) { .unwrap() .join(url) .unwrap() - .into_string(); + .into_string() } else { - *url = request_url.join(url).unwrap().into_string(); - } -} - -#[derive(Debug)] -pub struct MetaAttr { - author: Option, - description: Option, - language: String, - tags: Option>, - title: String, -} - -impl MetaAttr { - pub fn new( - author: Option, - description: Option, - language: String, - tags: Option>, - title: String, - ) -> Self { - MetaAttr { - author, - description, - language, - tags, - title, - } + request_url.join(url).unwrap().into_string() } } @@ -249,86 +189,17 @@ mod test { "#; - #[test] - fn test_extract_attr_val() { - let extractor = Extractor::from_html(TEST_HTML); - let ext_author = - extractor.extract_attr_val("meta[name='author']", "content", |val| val.to_string()); - assert!(ext_author.is_some()); - assert_eq!("Paperoni", &ext_author.unwrap()); - let ext_author = - extractor.extract_attr_val("meta[name='invalid-name']", "content", |val| { - val.to_string() - }); - assert!(ext_author.is_none()); - let lang_attr = extractor.extract_attr_val("html", "lang", |lang| lang.to_string()); - assert!(lang_attr.is_some()); - assert_eq!("en".to_string(), lang_attr.unwrap()); - } - - #[test] - fn test_extract_inner_text() { - let extractor = Extractor::from_html(TEST_HTML); - let title_text = extractor.extract_inner_text("title"); - assert!(title_text.is_some()); - assert_eq!("Testing Paperoni".to_string(), title_text.unwrap()); - - let title_text = extractor.extract_inner_text("titln"); - assert!(title_text.is_none()); - } - #[test] - fn test_extract_text() { - let extractor = Extractor::from_html(TEST_HTML); - let h1_node = extractor.root_node.select_first("h1").unwrap(); - let h1_text = extract_text_from_node(h1_node.as_node()); - assert!(h1_text.is_some()); - assert_eq!("Testing Paperoni".to_string(), h1_text.unwrap()); - } - - #[test] - fn test_extract_content() { - let extracted_html: String = r#" -
-            <article>
-                <h1>Starting out</h1>
-                <p>Some Lorem Ipsum text here</p>
-                <p>Observe this picture</p>
-                <img src="./img.jpg" alt="Random image">
-            </article>
- "# - .lines() - .map(|line| line.trim()) - .collect(); - - let mut extractor = Extractor::from_html( - &TEST_HTML - .lines() - .map(|line| line.trim()) - .collect::(), - ); - - extractor.extract_content(); - let mut output = Vec::new(); - assert!(extractor.content.is_some()); - - extractor - .content - .unwrap() - .as_node() - .serialize(&mut output) - .expect("Unable to serialize output HTML"); - let output = std::str::from_utf8(&output).unwrap(); - - assert_eq!(extracted_html, output); - } - #[test] fn test_extract_img_urls() { let mut extractor = Extractor::from_html(TEST_HTML); - extractor.extract_content(); + extractor.extract_content("http://example.com/"); extractor.extract_img_urls(); assert!(extractor.img_urls.len() > 0); - assert_eq!(vec![("./img.jpg".to_string(), None)], extractor.img_urls); + assert_eq!( + vec![("http://example.com/img.jpg".to_string(), None)], + extractor.img_urls + ); } #[test] @@ -353,27 +224,4 @@ mod test { exts ); } - - #[test] - fn test_get_absolute_url() { - let mut absolute_url = "https://example.image.com/images/1.jpg".to_owned(); - let mut relative_url = "../../images/2.jpg".to_owned(); - let mut relative_from_host_url = "/images/3.jpg".to_owned(); - let host_url = Url::parse("https://example.image.com/blog/how-to-test-resolvers/").unwrap(); - get_absolute_url(&mut absolute_url, &host_url); - assert_eq!("https://example.image.com/images/1.jpg", absolute_url); - get_absolute_url(&mut relative_url, &host_url); - assert_eq!("https://example.image.com/images/2.jpg", relative_url); - relative_url = "2-1.jpg".to_owned(); - get_absolute_url(&mut relative_url, &host_url); - assert_eq!( - "https://example.image.com/blog/how-to-test-resolvers/2-1.jpg", - relative_url - ); - get_absolute_url(&mut relative_from_host_url, &host_url); - assert_eq!( - "https://example.image.com/images/3.jpg", - relative_from_host_url - ); - } } diff --git a/src/main.rs b/src/main.rs index d790f9b..78ba0e2 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,64 +1,90 @@ +#[macro_use] +extern crate lazy_static; + use std::fs::File; use async_std::{fs::create_dir, fs::remove_dir_all, task}; use epub_builder::{EpubBuilder, EpubContent, ZipLibrary}; +use structopt::StructOpt; use url::Url; +mod cli; mod extractor; +mod moz_readability; use extractor::Extractor; fn main() { - task::block_on(async { - let urls = vec![ - "https://saveandrun.com/posts/2020-01-24-generating-mazes-with-haskell-part-1.html", - "https://saveandrun.com/posts/2020-04-05-querying-pacman-with-datalog.html", - "https://blog.hipstermojo.xyz/posts/redis-orm-preface/", - "https://vuejsdevelopers.com/2020/03/31/vue-js-form-composition-api/?utm_campaign=xl5&utm_medium=article&utm_source=vuejsnews#adding-validators", - "https://medium.com/typeforms-engineering-blog/the-beginners-guide-to-oauth-dancing-4b8f3666de10", - "https://dev.to/steelwolf180/full-stack-development-in-django-3768" - ]; - let html = fetch_url(urls[4]).await; - let mut extractor = Extractor::from_html(&html); - println!("Extracting"); - extractor.extract_content(); - create_dir("res/") - .await - .expect("Unable to create res/ output folder"); - extractor - .download_images(&Url::parse(urls[5]).unwrap()) - .await - .expect("Unable to download images"); - let mut out_file = File::create("out.epub").unwrap(); - let mut html_buf = Vec::new(); - extractor - .content - .unwrap() - .as_node() - .serialize(&mut html_buf) - .expect("Unable to serialize"); - let html_buf = std::str::from_utf8(&html_buf).unwrap(); - let mut epub = 
EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap(); - epub.add_content(EpubContent::new("code.xhtml", html_buf.as_bytes())) - .unwrap(); - for img in extractor.img_urls { - let file_path = format!("{}", &img.0); - - let img_buf = File::open(file_path).expect("Can't read file"); - epub.add_resource(img.0, img_buf, img.1.unwrap()).unwrap(); - } - epub.generate(&mut out_file).unwrap(); - println!("Cleaning up"); - remove_dir_all("res/").await.unwrap(); - }) + let opt = cli::Opts::from_args(); + if !opt.urls.is_empty() { + println!("Downloading single article"); + download(opt.urls); + } } -async fn fetch_url(url: &str) -> String { +type HTMLResource = (String, String); + +async fn fetch_url(url: &str) -> HTMLResource { let client = surf::Client::new(); println!("Fetching..."); // TODO: Add middleware for following redirects - client - .get(url) - .recv_string() - .await - .expect("Unable to fetch URL") + ( + url.to_string(), + client + .get(url) + .recv_string() + .await + .expect("Unable to fetch URL"), + ) +} + +fn download(urls: Vec) { + let mut async_url_tasks = Vec::with_capacity(urls.len()); + for url in urls { + async_url_tasks.push(task::spawn(async move { fetch_url(&url).await })); + } + task::block_on(async { + for url_task in async_url_tasks { + let (url, html) = url_task.await; + println!("Extracting"); + let mut extractor = Extractor::from_html(&html); + extractor.extract_content(&url); + if extractor.article().is_some() { + create_dir("res/") + .await + .expect("Unable to create res/ output folder"); + extractor + .download_images(&Url::parse(&url).unwrap()) + .await + .expect("Unable to download images"); + let mut out_file = + File::create(format!("{}.epub", extractor.metadata().title())).unwrap(); + let mut html_buf = Vec::new(); + extractor + .article() + .unwrap() + .serialize(&mut html_buf) + .expect("Unable to serialize"); + let html_buf = std::str::from_utf8(&html_buf).unwrap(); + let html_buf = moz_readability::regexes::REPLACE_SELF_CLOSING_REGEX + .replace_all(html_buf, "$tag/>"); + let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap(); + if let Some(author) = extractor.metadata().byline() { + epub.metadata("author", author).unwrap(); + } + epub.metadata("title", extractor.metadata().title()) + .unwrap(); + epub.add_content(EpubContent::new("code.xhtml", html_buf.as_bytes())) + .unwrap(); + for img in extractor.img_urls { + let file_path = format!("{}", &img.0); + + let img_buf = File::open(file_path).expect("Can't read file"); + epub.add_resource(img.0, img_buf, img.1.unwrap()).unwrap(); + } + epub.generate(&mut out_file).unwrap(); + println!("Cleaning up"); + remove_dir_all("res/").await.unwrap(); + } + } + }) } diff --git a/src/moz_readability/mod.rs b/src/moz_readability/mod.rs new file mode 100644 index 0000000..2f3f7ae --- /dev/null +++ b/src/moz_readability/mod.rs @@ -0,0 +1,3912 @@ +use std::collections::{BTreeMap, HashMap, HashSet}; +use std::str::FromStr; + +use html5ever::{LocalName, Namespace, QualName}; +use kuchiki::{ + iter::{Descendants, Elements, Select}, + traits::*, + NodeData, NodeRef, +}; +use url::Url; + +const DEFAULT_CHAR_THRESHOLD: usize = 500; +const FLAG_STRIP_UNLIKELYS: u32 = 0x1; +const FLAG_WEIGHT_CLASSES: u32 = 0x2; +const FLAG_CLEAN_CONDITIONALLY: u32 = 0x4; +const READABILITY_SCORE: &'static str = "readability-score"; +const HTML_NS: &'static str = "http://www.w3.org/1999/xhtml"; +// TODO: Change to HashSet +const PHRASING_ELEMS: [&str; 39] = [ + "abbr", "audio", "b", "bdo", "br", "button", "cite", "code", "data", 
"datalist", "dfn", "em", + "embed", "i", "img", "input", "kbd", "label", "mark", "math", "meter", "noscript", "object", + "output", "progress", "q", "ruby", "samp", "script", "select", "small", "span", "strong", + "sub", "sup", "textarea", "time", "var", "wbr", +]; +// TODO: Change to HashSet +const DEFAULT_TAGS_TO_SCORE: [&str; 9] = + ["section", "h2", "h3", "h4", "h5", "h6", "p", "td", "pre"]; +// TODO: Change to HashSet +const ALTER_TO_DIV_EXCEPTIONS: [&str; 4] = ["div", "article", "section", "p"]; +const PRESENTATIONAL_ATTRIBUTES: [&str; 12] = [ + "align", + "background", + "bgcolor", + "border", + "cellpadding", + "cellspacing", + "frame", + "hspace", + "rules", + "style", + "valign", + "vspace", +]; + +const DATA_TABLE_DESCENDANTS: [&str; 5] = ["col", "colgroup", "tfoot", "thead", "th"]; +// TODO: Change to HashSet +const DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [&str; 5] = ["table", "th", "td", "hr", "pre"]; + +pub mod regexes; + +pub struct Readability { + root_node: NodeRef, + byline: Option, + article_title: String, + pub article_node: Option, + article_dir: Option, + flags: u32, + pub metadata: MetaData, +} + +#[derive(Debug, PartialEq)] +struct SizeInfo { + rows: usize, + columns: usize, +} + +impl Readability { + pub fn new(html_str: &str) -> Self { + Self { + root_node: kuchiki::parse_html().one(html_str), + byline: None, + article_title: "".into(), + article_node: None, + article_dir: None, + flags: FLAG_STRIP_UNLIKELYS | FLAG_WEIGHT_CLASSES | FLAG_CLEAN_CONDITIONALLY, + metadata: MetaData::new(), + } + } + pub fn parse(&mut self, url: &str) { + self.unwrap_no_script_tags(); + self.remove_scripts(); + self.prep_document(); + self.metadata = self.get_article_metadata(); + self.article_title = self.metadata.title.clone(); + self.grab_article(); + self.post_process_content(url); + } + + /// Recursively check if node is image, or if node contains exactly only one image + /// whether as a direct child or as its descendants. + fn is_single_image(node_ref: &NodeRef) -> bool { + if let Some(element) = node_ref.as_element() { + if &element.name.local == "img" { + return true; + } + } + + if node_ref.children().filter(Self::has_content).count() != 1 + || !node_ref.text_contents().trim().is_empty() + { + return false; + } + + return Readability::is_single_image( + &node_ref + .children() + .filter(Self::has_content) + .next() + .expect("Unable to get first child which should exist"), + ); + } + + fn has_content(node_ref: &NodeRef) -> bool { + match node_ref.data() { + NodeData::Text(text) => !text.borrow().trim().is_empty(), + _ => true, + } + } + + /// Find all