diff --git a/Cargo.lock b/Cargo.lock index fafdfb7..45e53ef 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -704,9 +704,9 @@ dependencies = [ [[package]] name = "kuchiki" -version = "0.8.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1beeffc5ae5ab0def2cb85e26063a8e6b4f579b0adec3805bf87472086948956" +checksum = "1ea8e9c6e031377cff82ee3001dc8026cdf431ed4e2e6b51f98ab8c73484a358" dependencies = [ "cssparser", "html5ever 0.25.1", @@ -1010,12 +1010,15 @@ dependencies = [ [[package]] name = "paperoni" -version = "0.1.0" +version = "0.1.0-alpha1" dependencies = [ "async-std", "epub-builder", + "html5ever 0.25.1", "kuchiki", + "lazy_static 1.4.0", "md5", + "regex", "structopt", "surf", "url", @@ -1392,9 +1395,9 @@ checksum = "2439c63f3f6139d1b57529d16bc3b8bb855230c8efcc5d3a896c8bea7c3b1e84" [[package]] name = "regex" -version = "1.3.7" +version = "1.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6020f034922e3194c711b82a627453881bc4682166cabb07134a10c26ba7692" +checksum = "9c3780fcf44b193bc4d09f36d2a3c87b251da4a046c87795a0d35f4f927ad8e6" dependencies = [ "aho-corasick", "memchr", @@ -1404,9 +1407,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.6.17" +version = "0.6.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fe5bd57d1d7414c6b5ed48563a2c855d995ff777729dcd91c369ec7fea395ae" +checksum = "26412eb97c6b088a6997e05f69403a802a92d520de2f8e63c2b65f9e0f47c4e8" [[package]] name = "remove_dir_all" diff --git a/Cargo.toml b/Cargo.toml index 801a3a8..a630c09 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,9 @@ [package] +description = "A web article downloader" +homepage = "https://github.com/hipstermojo/paperoni" +repository = "https://github.com/hipstermojo/paperoni" name = "paperoni" -version = "0.1.0" +version = "0.1.0-alpha1" authors = ["Kenneth Gitere "] edition = "2018" license = "MIT" @@ -10,8 +13,11 @@ license = "MIT" [dependencies] async-std = "1.5.0" epub-builder = "0.4.5" -kuchiki = "0.8.0" +html5ever = "0.25.1" +kuchiki = "0.8.1" +lazy_static = "1.3.9" md5 = "0.7.0" +regex = "1.3.9" surf = "1.0.3" structopt = { version = "0.3" } url = "2.1.1" \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..fa57d8f --- /dev/null +++ b/README.md @@ -0,0 +1,53 @@ +

+<p align="center"><img src="./paperoni-dark.png"></p>
+
+<p align="center"><i>Salami not included</i></p>
+ +Paperoni is a web article downloader written in Rust. The downloaded articles are then exported as EPUB files. + +> This project is in an alpha release so it is pretty unstable. + +## Usage + +```sh +paperoni https://en.wikipedia.org/wiki/Pepperoni +``` + +Paperoni also supports passing multiple links as arguments. If you are on a Unix-like OS, you can simply do something like this: + +```sh +cat links.txt | xargs paperoni +``` + +## How it works + +The URL passed to Paperoni is fetched and the returned HTML response is passed to the extractor. +This extractor retrieves a possible article using a port of the [Mozilla Readability algorithm](https://github.com/mozilla/readability). This article is then saved in an EPUB. + +> The port of the algorithm is still unstable as well so it is not fully compatible with all the websites that can be extracted using Readability. + +## How it (currently) doesn't work + +This program is still in alpha so a number of things currently break: + +- Links with redirects will crash the program as it has no redirect logic. +- Websites that only run with JavaScript cannot be extracted. +- Website articles that cannot be extracted by Readability cannot be extracted by Paperoni either. + +## Running locally + +### Precompiled binaries + +Check the [releases](https://github.com/hipstermojo/paperoni/releases) page for precompiled binaries. Currently there are only builds for Debian and Arch. + +### Building from source + +This project uses `async/.await` so it should be compiled using a minimum Rust version of 1.33. Preferrably use the latest version of Rust. + +```sh +git clone https://github.com/hipstermojo/paperoni.git +cd paperoni +## You can build and install paperoni locally +cargo install --path . +## or use it from within the project +cargo run -- # pass your url here +``` diff --git a/paperoni-dark.png b/paperoni-dark.png new file mode 100644 index 0000000..8339a48 Binary files /dev/null and b/paperoni-dark.png differ diff --git a/src/cli.rs b/src/cli.rs index ba0273d..e0e12db 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -9,5 +9,5 @@ use structopt::StructOpt; pub struct Opts { // #[structopt(conflicts_with("links"))] /// Url of a web article - pub url: Option, + pub urls: Vec, } diff --git a/src/extractor.rs b/src/extractor.rs index b393f5b..93ab5bb 100644 --- a/src/extractor.rs +++ b/src/extractor.rs @@ -1,90 +1,53 @@ use async_std::fs::File; use async_std::io::prelude::*; use async_std::task; -use kuchiki::{traits::*, ElementData, NodeDataRef, NodeRef}; +use kuchiki::{traits::*, NodeRef}; use url::Url; +use crate::moz_readability::{MetaData, Readability}; + pub type ResourceInfo = (String, Option); pub struct Extractor { - pub root_node: NodeRef, - pub content: Option>, + article: Option, pub img_urls: Vec, + readability: Readability, } impl Extractor { /// Create a new instance of an HTML extractor given an HTML string pub fn from_html(html_str: &str) -> Self { Extractor { - content: None, + article: None, img_urls: Vec::new(), - root_node: kuchiki::parse_html().one(html_str), + readability: Readability::new(html_str), } } - /// Extract the value of an attribute - fn extract_attr_val U, U>( - &self, - css_selector: &str, - attr_target: &str, - mapper: T, - ) -> Option { - self.root_node - .select_first(css_selector) - .ok() - .and_then(|data| data.attributes.borrow().get(attr_target).map(mapper)) - } - - /// Extract the text of a DOM node given its CSS selector - fn extract_inner_text(&self, css_selector: &str) -> Option { - let node_ref = 
self.root_node.select_first(css_selector).ok()?; - extract_text_from_node(node_ref.as_node()) - } - /// Locates and extracts the HTML in a document which is determined to be /// the source of the content - pub fn extract_content(&mut self) { - // Extract the useful parts of the head section - let author: Option = - self.extract_attr_val("meta[name='author']", "content", |author| { - author.to_string() - }); - - let description = - self.extract_attr_val("meta[name='description']", "content", |description| { - description.to_string() - }); - - let tags = self.extract_attr_val("meta[name='keywords']", "content", |tags| { - tags.split(",") - .map(|tag| tag.trim().to_string()) - .collect::>() - }); - - let title = self.extract_inner_text("title").unwrap_or("".to_string()); - let lang = self - .extract_attr_val("html", "lang", |lang| lang.to_string()) - .unwrap_or("en".to_string()); - - let meta_attrs = MetaAttr::new(author, description, lang, tags, title); - - // Extract the article - - let article_ref = self.root_node.select_first("article").unwrap(); - - for node_ref in article_ref.as_node().descendants() { - match node_ref.data() { - kuchiki::NodeData::Element(..) | kuchiki::NodeData::Text(..) => (), - _ => node_ref.detach(), - } + pub fn extract_content(&mut self, url: &str) { + self.readability.parse(url); + if let Some(article_node_ref) = &self.readability.article_node { + let template = r#" + + + + + + + "#; + let doc = kuchiki::parse_html().one(template); + let body = doc.select_first("body").unwrap(); + body.as_node().append(article_node_ref.clone()); + self.article = Some(doc); } - self.content = Some(article_ref); } /// Traverses the DOM tree of the content and retrieves the IMG URLs fn extract_img_urls(&mut self) { - if let Some(content_ref) = &self.content { - for img_ref in content_ref.as_node().select("img").unwrap() { + if let Some(content_ref) = &self.readability.article_node { + for img_ref in content_ref.select("img").unwrap() { img_ref.as_node().as_element().map(|img_elem| { img_elem.attributes.borrow().get("src").map(|img_url| { if !img_url.is_empty() { @@ -133,10 +96,10 @@ impl Extractor { let (img_url, img_path, img_mime) = async_task.await; // Update the image sources let img_ref = self - .content + .readability + .article_node .as_mut() .expect("Unable to get mutable ref") - .as_node() .select_first(&format!("img[src='{}']", img_url)) .expect("Image node does not exist"); let mut img_node = img_ref.attributes.borrow_mut(); @@ -145,11 +108,14 @@ impl Extractor { } Ok(()) } -} -fn extract_text_from_node(node: &NodeRef) -> Option { - node.first_child() - .map(|child_ref| child_ref.text_contents()) + pub fn article(&self) -> Option<&NodeRef> { + self.article.as_ref() + } + + pub fn metadata(&self) -> &MetaData { + &self.readability.metadata + } } /// Utility for hashing URLs. 
This is used to help store files locally with unique values @@ -192,33 +158,6 @@ fn get_absolute_url(url: &str, request_url: &Url) -> String { } } -#[derive(Debug)] -pub struct MetaAttr { - author: Option, - description: Option, - language: String, - tags: Option>, - title: String, -} - -impl MetaAttr { - pub fn new( - author: Option, - description: Option, - language: String, - tags: Option>, - title: String, - ) -> Self { - MetaAttr { - author, - description, - language, - tags, - title, - } - } -} - #[cfg(test)] mod test { use super::*; @@ -250,86 +189,17 @@ mod test { "#; - #[test] - fn test_extract_attr_val() { - let extractor = Extractor::from_html(TEST_HTML); - let ext_author = - extractor.extract_attr_val("meta[name='author']", "content", |val| val.to_string()); - assert!(ext_author.is_some()); - assert_eq!("Paperoni", &ext_author.unwrap()); - let ext_author = - extractor.extract_attr_val("meta[name='invalid-name']", "content", |val| { - val.to_string() - }); - assert!(ext_author.is_none()); - let lang_attr = extractor.extract_attr_val("html", "lang", |lang| lang.to_string()); - assert!(lang_attr.is_some()); - assert_eq!("en".to_string(), lang_attr.unwrap()); - } - - #[test] - fn test_extract_inner_text() { - let extractor = Extractor::from_html(TEST_HTML); - let title_text = extractor.extract_inner_text("title"); - assert!(title_text.is_some()); - assert_eq!("Testing Paperoni".to_string(), title_text.unwrap()); - - let title_text = extractor.extract_inner_text("titln"); - assert!(title_text.is_none()); - } - #[test] - fn test_extract_text() { - let extractor = Extractor::from_html(TEST_HTML); - let h1_node = extractor.root_node.select_first("h1").unwrap(); - let h1_text = extract_text_from_node(h1_node.as_node()); - assert!(h1_text.is_some()); - assert_eq!("Testing Paperoni".to_string(), h1_text.unwrap()); - } - - #[test] - fn test_extract_content() { - let extracted_html: String = r#" -
-            <article>
-                <h1>Starting out</h1>
-                <p>Some Lorem Ipsum text here</p>
-                <p>Observe this picture</p>
-                <img src="./img.jpg" alt="Random image">
-            </article>
- "# - .lines() - .map(|line| line.trim()) - .collect(); - - let mut extractor = Extractor::from_html( - &TEST_HTML - .lines() - .map(|line| line.trim()) - .collect::(), - ); - - extractor.extract_content(); - let mut output = Vec::new(); - assert!(extractor.content.is_some()); - - extractor - .content - .unwrap() - .as_node() - .serialize(&mut output) - .expect("Unable to serialize output HTML"); - let output = std::str::from_utf8(&output).unwrap(); - - assert_eq!(extracted_html, output); - } - #[test] fn test_extract_img_urls() { let mut extractor = Extractor::from_html(TEST_HTML); - extractor.extract_content(); + extractor.extract_content("http://example.com/"); extractor.extract_img_urls(); assert!(extractor.img_urls.len() > 0); - assert_eq!(vec![("./img.jpg".to_string(), None)], extractor.img_urls); + assert_eq!( + vec![("http://example.com/img.jpg".to_string(), None)], + extractor.img_urls + ); } #[test] @@ -354,24 +224,4 @@ mod test { exts ); } - - #[test] - fn test_get_absolute_url() { - let absolute_url = "https://example.image.com/images/1.jpg"; - let relative_url = "../../images/2.jpg"; - let relative_from_host_url = "/images/3.jpg"; - let host_url = Url::parse("https://example.image.com/blog/how-to-test-resolvers/").unwrap(); - let abs_url = get_absolute_url(&absolute_url, &host_url); - assert_eq!("https://example.image.com/images/1.jpg", abs_url); - let abs_url = get_absolute_url(&relative_url, &host_url); - assert_eq!("https://example.image.com/images/2.jpg", abs_url); - let relative_url = "2-1.jpg"; - let abs_url = get_absolute_url(&relative_url, &host_url); - assert_eq!( - "https://example.image.com/blog/how-to-test-resolvers/2-1.jpg", - abs_url - ); - let abs_url = get_absolute_url(&relative_from_host_url, &host_url); - assert_eq!("https://example.image.com/images/3.jpg", abs_url); - } } diff --git a/src/main.rs b/src/main.rs index 6f15e9e..78ba0e2 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,3 +1,6 @@ +#[macro_use] +extern crate lazy_static; + use std::fs::File; use async_std::{fs::create_dir, fs::remove_dir_all, task}; @@ -7,60 +10,81 @@ use url::Url; mod cli; mod extractor; +mod moz_readability; use extractor::Extractor; fn main() { let opt = cli::Opts::from_args(); - if let Some(url) = opt.url { + if !opt.urls.is_empty() { println!("Downloading single article"); - download(url) + download(opt.urls); } } -async fn fetch_url(url: &str) -> String { +type HTMLResource = (String, String); + +async fn fetch_url(url: &str) -> HTMLResource { let client = surf::Client::new(); println!("Fetching..."); // TODO: Add middleware for following redirects - client - .get(url) - .recv_string() - .await - .expect("Unable to fetch URL") + ( + url.to_string(), + client + .get(url) + .recv_string() + .await + .expect("Unable to fetch URL"), + ) } -fn download(url: String) { +fn download(urls: Vec) { + let mut async_url_tasks = Vec::with_capacity(urls.len()); + for url in urls { + async_url_tasks.push(task::spawn(async move { fetch_url(&url).await })); + } task::block_on(async { - let html = fetch_url(&url).await; - let mut extractor = Extractor::from_html(&html); - println!("Extracting"); - extractor.extract_content(); - create_dir("res/") - .await - .expect("Unable to create res/ output folder"); - extractor - .download_images(&Url::parse(&url).unwrap()) - .await - .expect("Unable to download images"); - let mut out_file = File::create("out.epub").unwrap(); - let mut html_buf = Vec::new(); - extractor - .content - .unwrap() - .as_node() - .serialize(&mut html_buf) - .expect("Unable to 
serialize"); - let html_buf = std::str::from_utf8(&html_buf).unwrap(); - let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap(); - epub.add_content(EpubContent::new("code.xhtml", html_buf.as_bytes())) - .unwrap(); - for img in extractor.img_urls { - let file_path = format!("{}", &img.0); + for url_task in async_url_tasks { + let (url, html) = url_task.await; + println!("Extracting"); + let mut extractor = Extractor::from_html(&html); + extractor.extract_content(&url); + if extractor.article().is_some() { + create_dir("res/") + .await + .expect("Unable to create res/ output folder"); + extractor + .download_images(&Url::parse(&url).unwrap()) + .await + .expect("Unable to download images"); + let mut out_file = + File::create(format!("{}.epub", extractor.metadata().title())).unwrap(); + let mut html_buf = Vec::new(); + extractor + .article() + .unwrap() + .serialize(&mut html_buf) + .expect("Unable to serialize"); + let html_buf = std::str::from_utf8(&html_buf).unwrap(); + let html_buf = moz_readability::regexes::REPLACE_SELF_CLOSING_REGEX + .replace_all(html_buf, "$tag/>"); + let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap(); + if let Some(author) = extractor.metadata().byline() { + epub.metadata("author", author).unwrap(); + } + epub.metadata("title", extractor.metadata().title()) + .unwrap(); + epub.add_content(EpubContent::new("code.xhtml", html_buf.as_bytes())) + .unwrap(); + for img in extractor.img_urls { + let file_path = format!("{}", &img.0); - let img_buf = File::open(file_path).expect("Can't read file"); - epub.add_resource(img.0, img_buf, img.1.unwrap()).unwrap(); + let img_buf = File::open(file_path).expect("Can't read file"); + epub.add_resource(img.0, img_buf, img.1.unwrap()).unwrap(); + } + epub.generate(&mut out_file).unwrap(); + println!("Cleaning up"); + remove_dir_all("res/").await.unwrap(); + } } - epub.generate(&mut out_file).unwrap(); - println!("Cleaning up"); - remove_dir_all("res/").await.unwrap(); }) } diff --git a/src/moz_readability/mod.rs b/src/moz_readability/mod.rs new file mode 100644 index 0000000..2f3f7ae --- /dev/null +++ b/src/moz_readability/mod.rs @@ -0,0 +1,3912 @@ +use std::collections::{BTreeMap, HashMap, HashSet}; +use std::str::FromStr; + +use html5ever::{LocalName, Namespace, QualName}; +use kuchiki::{ + iter::{Descendants, Elements, Select}, + traits::*, + NodeData, NodeRef, +}; +use url::Url; + +const DEFAULT_CHAR_THRESHOLD: usize = 500; +const FLAG_STRIP_UNLIKELYS: u32 = 0x1; +const FLAG_WEIGHT_CLASSES: u32 = 0x2; +const FLAG_CLEAN_CONDITIONALLY: u32 = 0x4; +const READABILITY_SCORE: &'static str = "readability-score"; +const HTML_NS: &'static str = "http://www.w3.org/1999/xhtml"; +// TODO: Change to HashSet +const PHRASING_ELEMS: [&str; 39] = [ + "abbr", "audio", "b", "bdo", "br", "button", "cite", "code", "data", "datalist", "dfn", "em", + "embed", "i", "img", "input", "kbd", "label", "mark", "math", "meter", "noscript", "object", + "output", "progress", "q", "ruby", "samp", "script", "select", "small", "span", "strong", + "sub", "sup", "textarea", "time", "var", "wbr", +]; +// TODO: Change to HashSet +const DEFAULT_TAGS_TO_SCORE: [&str; 9] = + ["section", "h2", "h3", "h4", "h5", "h6", "p", "td", "pre"]; +// TODO: Change to HashSet +const ALTER_TO_DIV_EXCEPTIONS: [&str; 4] = ["div", "article", "section", "p"]; +const PRESENTATIONAL_ATTRIBUTES: [&str; 12] = [ + "align", + "background", + "bgcolor", + "border", + "cellpadding", + "cellspacing", + "frame", + "hspace", + "rules", + "style", + "valign", + "vspace", 
+]; + +const DATA_TABLE_DESCENDANTS: [&str; 5] = ["col", "colgroup", "tfoot", "thead", "th"]; +// TODO: Change to HashSet +const DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [&str; 5] = ["table", "th", "td", "hr", "pre"]; + +pub mod regexes; + +pub struct Readability { + root_node: NodeRef, + byline: Option, + article_title: String, + pub article_node: Option, + article_dir: Option, + flags: u32, + pub metadata: MetaData, +} + +#[derive(Debug, PartialEq)] +struct SizeInfo { + rows: usize, + columns: usize, +} + +impl Readability { + pub fn new(html_str: &str) -> Self { + Self { + root_node: kuchiki::parse_html().one(html_str), + byline: None, + article_title: "".into(), + article_node: None, + article_dir: None, + flags: FLAG_STRIP_UNLIKELYS | FLAG_WEIGHT_CLASSES | FLAG_CLEAN_CONDITIONALLY, + metadata: MetaData::new(), + } + } + pub fn parse(&mut self, url: &str) { + self.unwrap_no_script_tags(); + self.remove_scripts(); + self.prep_document(); + self.metadata = self.get_article_metadata(); + self.article_title = self.metadata.title.clone(); + self.grab_article(); + self.post_process_content(url); + } + + /// Recursively check if node is image, or if node contains exactly only one image + /// whether as a direct child or as its descendants. + fn is_single_image(node_ref: &NodeRef) -> bool { + if let Some(element) = node_ref.as_element() { + if &element.name.local == "img" { + return true; + } + } + + if node_ref.children().filter(Self::has_content).count() != 1 + || !node_ref.text_contents().trim().is_empty() + { + return false; + } + + return Readability::is_single_image( + &node_ref + .children() + .filter(Self::has_content) + .next() + .expect("Unable to get first child which should exist"), + ); + } + + fn has_content(node_ref: &NodeRef) -> bool { + match node_ref.data() { + NodeData::Text(text) => !text.borrow().trim().is_empty(), + _ => true, + } + } + + /// Find all