diff --git a/.gitignore b/.gitignore index ea8c4bf..3ae8faf 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ /target +*.epub \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index f48fe72..45e53ef 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -15,6 +15,15 @@ dependencies = [ "memchr", ] +[[package]] +name = "ansi_term" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b" +dependencies = [ + "winapi 0.3.8", +] + [[package]] name = "async-std" version = "1.5.0" @@ -50,6 +59,17 @@ dependencies = [ "winapi 0.3.8", ] +[[package]] +name = "atty" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi", + "libc", + "winapi 0.3.8", +] + [[package]] name = "autocfg" version = "0.1.7" @@ -155,6 +175,21 @@ dependencies = [ "time", ] +[[package]] +name = "clap" +version = "2.33.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bdfa80d47f954d53a35a64987ca1422f495b8d6483c0fe9f7117b36c2a792129" +dependencies = [ + "ansi_term", + "atty", + "bitflags", + "strsim", + "textwrap", + "unicode-width", + "vec_map", +] + [[package]] name = "cloudabi" version = "0.0.3" @@ -523,6 +558,15 @@ dependencies = [ "wasi", ] +[[package]] +name = "heck" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20564e78d53d2bb135c343b3f47714a56af2061f1c928fdb541dc7b9fdd94205" +dependencies = [ + "unicode-segmentation", +] + [[package]] name = "hermit-abi" version = "0.1.12" @@ -660,9 +704,9 @@ dependencies = [ [[package]] name = "kuchiki" -version = "0.8.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1beeffc5ae5ab0def2cb85e26063a8e6b4f579b0adec3805bf87472086948956" +checksum = "1ea8e9c6e031377cff82ee3001dc8026cdf431ed4e2e6b51f98ab8c73484a358" dependencies = [ "cssparser", "html5ever 0.25.1", @@ -966,12 +1010,16 @@ dependencies = [ [[package]] name = "paperoni" -version = "0.1.0" +version = "0.1.0-alpha1" dependencies = [ "async-std", "epub-builder", + "html5ever 0.25.1", "kuchiki", + "lazy_static 1.4.0", "md5", + "regex", + "structopt", "surf", "url", ] @@ -1110,6 +1158,32 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" +[[package]] +name = "proc-macro-error" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "98e9e4b82e0ef281812565ea4751049f1bdcdfccda7d3f459f2e138a40c08678" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f5444ead4e9935abd7f27dc51f7e852a0569ac888096d5ec2499470794e2e53" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "syn-mid", + "version_check", +] + [[package]] name = "proc-macro-hack" version = "0.5.15" @@ -1321,9 +1395,9 @@ checksum = "2439c63f3f6139d1b57529d16bc3b8bb855230c8efcc5d3a896c8bea7c3b1e84" [[package]] name = "regex" -version = "1.3.7" +version = "1.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6020f034922e3194c711b82a627453881bc4682166cabb07134a10c26ba7692" +checksum = 
"9c3780fcf44b193bc4d09f36d2a3c87b251da4a046c87795a0d35f4f927ad8e6" dependencies = [ "aho-corasick", "memchr", @@ -1333,9 +1407,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.6.17" +version = "0.6.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fe5bd57d1d7414c6b5ed48563a2c855d995ff777729dcd91c369ec7fea395ae" +checksum = "26412eb97c6b088a6997e05f69403a802a92d520de2f8e63c2b65f9e0f47c4e8" [[package]] name = "remove_dir_all" @@ -1577,6 +1651,36 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b1884d1bc09741d466d9b14e6d37ac89d6909cbcac41dd9ae982d4d063bbedfc" +[[package]] +name = "strsim" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" + +[[package]] +name = "structopt" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "863246aaf5ddd0d6928dfeb1a9ca65f505599e4e1b399935ef7e75107516b4ef" +dependencies = [ + "clap", + "lazy_static 1.4.0", + "structopt-derive", +] + +[[package]] +name = "structopt-derive" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d239ca4b13aee7a2142e6795cbd69e457665ff8037aed33b3effdc430d2f927a" +dependencies = [ + "heck", + "proc-macro-error", + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "surf" version = "1.0.3" @@ -1610,6 +1714,17 @@ dependencies = [ "unicode-xid", ] +[[package]] +name = "syn-mid" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7be3539f6c128a931cf19dcee741c1af532c7fd387baa739c03dd2e96479338a" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "tempdir" version = "0.3.7" @@ -1631,6 +1746,15 @@ dependencies = [ "utf-8", ] +[[package]] +name = "textwrap" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" +dependencies = [ + "unicode-width", +] + [[package]] name = "thin-slice" version = "0.1.1" @@ -1694,6 +1818,12 @@ dependencies = [ "smallvec", ] +[[package]] +name = "unicode-segmentation" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e83e153d1053cbb5a118eeff7fd5be06ed99153f00dbcd8ae310c5fb2b22edc0" + [[package]] name = "unicode-width" version = "0.1.7" @@ -1747,6 +1877,12 @@ version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fc439f2794e98976c88a2a2dafce96b930fe8010b0a256b3c2199a773933168" +[[package]] +name = "vec_map" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" + [[package]] name = "version_check" version = "0.9.1" diff --git a/Cargo.toml b/Cargo.toml index 867a34a..a630c09 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,9 @@ [package] +description = "A web article downloader" +homepage = "https://github.com/hipstermojo/paperoni" +repository = "https://github.com/hipstermojo/paperoni" name = "paperoni" -version = "0.1.0" +version = "0.1.0-alpha1" authors = ["Kenneth Gitere "] edition = "2018" license = "MIT" @@ -10,7 +13,11 @@ license = "MIT" [dependencies] async-std = "1.5.0" epub-builder = "0.4.5" -kuchiki = "0.8.0" +html5ever = "0.25.1" +kuchiki = "0.8.1" +lazy_static = "1.3.9" md5 = "0.7.0" +regex = "1.3.9" surf = "1.0.3" 
+structopt = { version = "0.3" } url = "2.1.1" \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..fa57d8f --- /dev/null +++ b/README.md @@ -0,0 +1,53 @@ +

+<p align="center"><img src="./paperoni-dark.png"></p>
+
+<p align="center"><i>Salami not included</i></p>
+
+Paperoni is a web article downloader written in Rust. The downloaded articles are then exported as EPUB files.
+
+> This project is in an alpha release, so it is still quite unstable.
+
+## Usage
+
+```sh
+paperoni https://en.wikipedia.org/wiki/Pepperoni
+```
+
+Paperoni also supports passing multiple links as arguments. If you are on a Unix-like OS, you can simply do something like this:
+
+```sh
+cat links.txt | xargs paperoni
+```
+
+## How it works
+
+The URL passed to Paperoni is fetched and the returned HTML response is passed to the extractor.
+The extractor retrieves a possible article using a port of the [Mozilla Readability algorithm](https://github.com/mozilla/readability). This article is then saved as an EPUB.
+
+> The port of the algorithm is itself still unstable, so it is not yet compatible with all the websites that Readability can extract.
+
+## How it (currently) doesn't work
+
+This program is still in alpha, so a number of things currently break:
+
+- Links with redirects will crash the program, as it has no redirect logic.
+- Websites that require JavaScript to render cannot be extracted.
+- Articles that cannot be extracted by Readability cannot be extracted by Paperoni either.
+
+## Running locally
+
+### Precompiled binaries
+
+Check the [releases](https://github.com/hipstermojo/paperoni/releases) page for precompiled binaries. Currently there are only builds for Debian and Arch.
+
+### Building from source
+
+This project uses `async/.await`, so it must be compiled with Rust 1.39 or newer. Preferably use the latest stable version of Rust.
+
+```sh
+git clone https://github.com/hipstermojo/paperoni.git
+cd paperoni
+## You can build and install paperoni locally
+cargo install --path .
+## or use it from within the project
+cargo run -- # pass your url here
+```
diff --git a/paperoni-dark.png b/paperoni-dark.png
new file mode 100644
index 0000000..8339a48
Binary files /dev/null and b/paperoni-dark.png differ
diff --git a/src/cli.rs b/src/cli.rs
new file mode 100644
index 0000000..e0e12db
--- /dev/null
+++ b/src/cli.rs
@@ -0,0 +1,13 @@
+use structopt::StructOpt;
+
+#[derive(Debug, StructOpt)]
+#[structopt(name = "paperoni")]
+/// Paperoni is an article downloader.
+///
+/// It takes a url and downloads the article content from it and
+/// saves it to an epub.
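+///
+/// Multiple urls can be passed; each article is downloaded and written
+/// to its own epub file named after the article title.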
+pub struct Opts { + // #[structopt(conflicts_with("links"))] + /// Url of a web article + pub urls: Vec, +} diff --git a/src/extractor.rs b/src/extractor.rs index 2355939..93ab5bb 100644 --- a/src/extractor.rs +++ b/src/extractor.rs @@ -1,90 +1,53 @@ use async_std::fs::File; use async_std::io::prelude::*; use async_std::task; -use kuchiki::{traits::*, ElementData, NodeDataRef, NodeRef}; +use kuchiki::{traits::*, NodeRef}; use url::Url; +use crate::moz_readability::{MetaData, Readability}; + pub type ResourceInfo = (String, Option); pub struct Extractor { - pub root_node: NodeRef, - pub content: Option>, + article: Option, pub img_urls: Vec, + readability: Readability, } impl Extractor { /// Create a new instance of an HTML extractor given an HTML string pub fn from_html(html_str: &str) -> Self { Extractor { - content: None, + article: None, img_urls: Vec::new(), - root_node: kuchiki::parse_html().one(html_str), + readability: Readability::new(html_str), } } - /// Extract the value of an attribute - fn extract_attr_val U, U>( - &self, - css_selector: &str, - attr_target: &str, - mapper: T, - ) -> Option { - self.root_node - .select_first(css_selector) - .ok() - .and_then(|data| data.attributes.borrow().get(attr_target).map(mapper)) - } - - /// Extract the text of a DOM node given its CSS selector - fn extract_inner_text(&self, css_selector: &str) -> Option { - let node_ref = self.root_node.select_first(css_selector).ok()?; - extract_text_from_node(node_ref.as_node()) - } - /// Locates and extracts the HTML in a document which is determined to be /// the source of the content - pub fn extract_content(&mut self) { - // Extract the useful parts of the head section - let author: Option = - self.extract_attr_val("meta[name='author']", "content", |author| { - author.to_string() - }); - - let description = - self.extract_attr_val("meta[name='description']", "content", |description| { - description.to_string() - }); - - let tags = self.extract_attr_val("meta[name='keywords']", "content", |tags| { - tags.split(",") - .map(|tag| tag.trim().to_string()) - .collect::>() - }); - - let title = self.extract_inner_text("title").unwrap_or("".to_string()); - let lang = self - .extract_attr_val("html", "lang", |lang| lang.to_string()) - .unwrap_or("en".to_string()); - - let meta_attrs = MetaAttr::new(author, description, lang, tags, title); - - // Extract the article - - let article_ref = self.root_node.select_first("article").unwrap(); - - for node_ref in article_ref.as_node().descendants() { - match node_ref.data() { - kuchiki::NodeData::Element(..) | kuchiki::NodeData::Text(..) 
=> (), - _ => node_ref.detach(), - } + pub fn extract_content(&mut self, url: &str) { + self.readability.parse(url); + if let Some(article_node_ref) = &self.readability.article_node { + let template = r#" + + + + + + + "#; + let doc = kuchiki::parse_html().one(template); + let body = doc.select_first("body").unwrap(); + body.as_node().append(article_node_ref.clone()); + self.article = Some(doc); } - self.content = Some(article_ref); } /// Traverses the DOM tree of the content and retrieves the IMG URLs fn extract_img_urls(&mut self) { - if let Some(content_ref) = &self.content { - for img_ref in content_ref.as_node().select("img").unwrap() { + if let Some(content_ref) = &self.readability.article_node { + for img_ref in content_ref.select("img").unwrap() { img_ref.as_node().as_element().map(|img_elem| { img_elem.attributes.borrow().get("src").map(|img_url| { if !img_url.is_empty() { @@ -101,10 +64,10 @@ impl Extractor { self.extract_img_urls(); println!("Downloading images to res/"); for img_url in &self.img_urls { - let mut img_url = img_url.0.clone(); - get_absolute_url(&mut img_url, article_origin); - async_download_tasks.push(task::spawn(async { - let mut img_response = surf::get(&img_url).await.expect("Unable to retrieve file"); + let img_url = img_url.0.clone(); + let abs_url = get_absolute_url(&img_url, article_origin); + async_download_tasks.push(task::spawn(async move { + let mut img_response = surf::get(&abs_url).await.expect("Unable to retrieve file"); let img_content: Vec = img_response.body_bytes().await.unwrap(); let img_mime = img_response .header("Content-Type") @@ -114,7 +77,7 @@ impl Extractor { .and_then(map_mime_type_to_ext) .unwrap(); - let img_path = format!("res/{}{}", hash_url(&img_url), &img_ext); + let img_path = format!("res/{}{}", hash_url(&abs_url), &img_ext); let mut img_file = File::create(&img_path) .await .expect("Unable to create file"); @@ -133,10 +96,10 @@ impl Extractor { let (img_url, img_path, img_mime) = async_task.await; // Update the image sources let img_ref = self - .content + .readability + .article_node .as_mut() .expect("Unable to get mutable ref") - .as_node() .select_first(&format!("img[src='{}']", img_url)) .expect("Image node does not exist"); let mut img_node = img_ref.attributes.borrow_mut(); @@ -145,11 +108,14 @@ impl Extractor { } Ok(()) } -} -fn extract_text_from_node(node: &NodeRef) -> Option { - node.first_child() - .map(|child_ref| child_ref.text_contents()) + pub fn article(&self) -> Option<&NodeRef> { + self.article.as_ref() + } + + pub fn metadata(&self) -> &MetaData { + &self.readability.metadata + } } /// Utility for hashing URLs. 
This is used to help store files locally with unique values @@ -174,10 +140,11 @@ fn map_mime_type_to_ext(mime_type: &str) -> Option { .map(|format| String::from(".") + format) } -fn get_absolute_url(url: &mut String, request_url: &Url) { +fn get_absolute_url(url: &str, request_url: &Url) -> String { if Url::parse(url).is_ok() { + url.to_owned() } else if url.starts_with("/") { - *url = Url::parse(&format!( + Url::parse(&format!( "{}://{}", request_url.scheme(), request_url.host_str().unwrap() @@ -185,36 +152,9 @@ fn get_absolute_url(url: &mut String, request_url: &Url) { .unwrap() .join(url) .unwrap() - .into_string(); + .into_string() } else { - *url = request_url.join(url).unwrap().into_string(); - } -} - -#[derive(Debug)] -pub struct MetaAttr { - author: Option, - description: Option, - language: String, - tags: Option>, - title: String, -} - -impl MetaAttr { - pub fn new( - author: Option, - description: Option, - language: String, - tags: Option>, - title: String, - ) -> Self { - MetaAttr { - author, - description, - language, - tags, - title, - } + request_url.join(url).unwrap().into_string() } } @@ -249,86 +189,17 @@ mod test { "#; - #[test] - fn test_extract_attr_val() { - let extractor = Extractor::from_html(TEST_HTML); - let ext_author = - extractor.extract_attr_val("meta[name='author']", "content", |val| val.to_string()); - assert!(ext_author.is_some()); - assert_eq!("Paperoni", &ext_author.unwrap()); - let ext_author = - extractor.extract_attr_val("meta[name='invalid-name']", "content", |val| { - val.to_string() - }); - assert!(ext_author.is_none()); - let lang_attr = extractor.extract_attr_val("html", "lang", |lang| lang.to_string()); - assert!(lang_attr.is_some()); - assert_eq!("en".to_string(), lang_attr.unwrap()); - } - - #[test] - fn test_extract_inner_text() { - let extractor = Extractor::from_html(TEST_HTML); - let title_text = extractor.extract_inner_text("title"); - assert!(title_text.is_some()); - assert_eq!("Testing Paperoni".to_string(), title_text.unwrap()); - - let title_text = extractor.extract_inner_text("titln"); - assert!(title_text.is_none()); - } - #[test] - fn test_extract_text() { - let extractor = Extractor::from_html(TEST_HTML); - let h1_node = extractor.root_node.select_first("h1").unwrap(); - let h1_text = extract_text_from_node(h1_node.as_node()); - assert!(h1_text.is_some()); - assert_eq!("Testing Paperoni".to_string(), h1_text.unwrap()); - } - - #[test] - fn test_extract_content() { - let extracted_html: String = r#" -
-            <article>
-                <h1>Starting out</h1>
-                <p>Some Lorem Ipsum text here</p>
-                <p>Observe this picture</p>
-                <img src="./img.jpg" alt="Random image">
-            </article>
- "# - .lines() - .map(|line| line.trim()) - .collect(); - - let mut extractor = Extractor::from_html( - &TEST_HTML - .lines() - .map(|line| line.trim()) - .collect::(), - ); - - extractor.extract_content(); - let mut output = Vec::new(); - assert!(extractor.content.is_some()); - - extractor - .content - .unwrap() - .as_node() - .serialize(&mut output) - .expect("Unable to serialize output HTML"); - let output = std::str::from_utf8(&output).unwrap(); - - assert_eq!(extracted_html, output); - } - #[test] fn test_extract_img_urls() { let mut extractor = Extractor::from_html(TEST_HTML); - extractor.extract_content(); + extractor.extract_content("http://example.com/"); extractor.extract_img_urls(); assert!(extractor.img_urls.len() > 0); - assert_eq!(vec![("./img.jpg".to_string(), None)], extractor.img_urls); + assert_eq!( + vec![("http://example.com/img.jpg".to_string(), None)], + extractor.img_urls + ); } #[test] @@ -353,27 +224,4 @@ mod test { exts ); } - - #[test] - fn test_get_absolute_url() { - let mut absolute_url = "https://example.image.com/images/1.jpg".to_owned(); - let mut relative_url = "../../images/2.jpg".to_owned(); - let mut relative_from_host_url = "/images/3.jpg".to_owned(); - let host_url = Url::parse("https://example.image.com/blog/how-to-test-resolvers/").unwrap(); - get_absolute_url(&mut absolute_url, &host_url); - assert_eq!("https://example.image.com/images/1.jpg", absolute_url); - get_absolute_url(&mut relative_url, &host_url); - assert_eq!("https://example.image.com/images/2.jpg", relative_url); - relative_url = "2-1.jpg".to_owned(); - get_absolute_url(&mut relative_url, &host_url); - assert_eq!( - "https://example.image.com/blog/how-to-test-resolvers/2-1.jpg", - relative_url - ); - get_absolute_url(&mut relative_from_host_url, &host_url); - assert_eq!( - "https://example.image.com/images/3.jpg", - relative_from_host_url - ); - } } diff --git a/src/main.rs b/src/main.rs index d790f9b..78ba0e2 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,64 +1,90 @@ +#[macro_use] +extern crate lazy_static; + use std::fs::File; use async_std::{fs::create_dir, fs::remove_dir_all, task}; use epub_builder::{EpubBuilder, EpubContent, ZipLibrary}; +use structopt::StructOpt; use url::Url; +mod cli; mod extractor; +mod moz_readability; use extractor::Extractor; fn main() { - task::block_on(async { - let urls = vec![ - "https://saveandrun.com/posts/2020-01-24-generating-mazes-with-haskell-part-1.html", - "https://saveandrun.com/posts/2020-04-05-querying-pacman-with-datalog.html", - "https://blog.hipstermojo.xyz/posts/redis-orm-preface/", - "https://vuejsdevelopers.com/2020/03/31/vue-js-form-composition-api/?utm_campaign=xl5&utm_medium=article&utm_source=vuejsnews#adding-validators", - "https://medium.com/typeforms-engineering-blog/the-beginners-guide-to-oauth-dancing-4b8f3666de10", - "https://dev.to/steelwolf180/full-stack-development-in-django-3768" - ]; - let html = fetch_url(urls[4]).await; - let mut extractor = Extractor::from_html(&html); - println!("Extracting"); - extractor.extract_content(); - create_dir("res/") - .await - .expect("Unable to create res/ output folder"); - extractor - .download_images(&Url::parse(urls[5]).unwrap()) - .await - .expect("Unable to download images"); - let mut out_file = File::create("out.epub").unwrap(); - let mut html_buf = Vec::new(); - extractor - .content - .unwrap() - .as_node() - .serialize(&mut html_buf) - .expect("Unable to serialize"); - let html_buf = std::str::from_utf8(&html_buf).unwrap(); - let mut epub = 
EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap(); - epub.add_content(EpubContent::new("code.xhtml", html_buf.as_bytes())) - .unwrap(); - for img in extractor.img_urls { - let file_path = format!("{}", &img.0); - - let img_buf = File::open(file_path).expect("Can't read file"); - epub.add_resource(img.0, img_buf, img.1.unwrap()).unwrap(); - } - epub.generate(&mut out_file).unwrap(); - println!("Cleaning up"); - remove_dir_all("res/").await.unwrap(); - }) + let opt = cli::Opts::from_args(); + if !opt.urls.is_empty() { + println!("Downloading single article"); + download(opt.urls); + } } -async fn fetch_url(url: &str) -> String { +type HTMLResource = (String, String); + +async fn fetch_url(url: &str) -> HTMLResource { let client = surf::Client::new(); println!("Fetching..."); // TODO: Add middleware for following redirects - client - .get(url) - .recv_string() - .await - .expect("Unable to fetch URL") + ( + url.to_string(), + client + .get(url) + .recv_string() + .await + .expect("Unable to fetch URL"), + ) +} + +fn download(urls: Vec) { + let mut async_url_tasks = Vec::with_capacity(urls.len()); + for url in urls { + async_url_tasks.push(task::spawn(async move { fetch_url(&url).await })); + } + task::block_on(async { + for url_task in async_url_tasks { + let (url, html) = url_task.await; + println!("Extracting"); + let mut extractor = Extractor::from_html(&html); + extractor.extract_content(&url); + if extractor.article().is_some() { + create_dir("res/") + .await + .expect("Unable to create res/ output folder"); + extractor + .download_images(&Url::parse(&url).unwrap()) + .await + .expect("Unable to download images"); + let mut out_file = + File::create(format!("{}.epub", extractor.metadata().title())).unwrap(); + let mut html_buf = Vec::new(); + extractor + .article() + .unwrap() + .serialize(&mut html_buf) + .expect("Unable to serialize"); + let html_buf = std::str::from_utf8(&html_buf).unwrap(); + let html_buf = moz_readability::regexes::REPLACE_SELF_CLOSING_REGEX + .replace_all(html_buf, "$tag/>"); + let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap(); + if let Some(author) = extractor.metadata().byline() { + epub.metadata("author", author).unwrap(); + } + epub.metadata("title", extractor.metadata().title()) + .unwrap(); + epub.add_content(EpubContent::new("code.xhtml", html_buf.as_bytes())) + .unwrap(); + for img in extractor.img_urls { + let file_path = format!("{}", &img.0); + + let img_buf = File::open(file_path).expect("Can't read file"); + epub.add_resource(img.0, img_buf, img.1.unwrap()).unwrap(); + } + epub.generate(&mut out_file).unwrap(); + println!("Cleaning up"); + remove_dir_all("res/").await.unwrap(); + } + } + }) } diff --git a/src/moz_readability/mod.rs b/src/moz_readability/mod.rs new file mode 100644 index 0000000..2f3f7ae --- /dev/null +++ b/src/moz_readability/mod.rs @@ -0,0 +1,3912 @@ +use std::collections::{BTreeMap, HashMap, HashSet}; +use std::str::FromStr; + +use html5ever::{LocalName, Namespace, QualName}; +use kuchiki::{ + iter::{Descendants, Elements, Select}, + traits::*, + NodeData, NodeRef, +}; +use url::Url; + +const DEFAULT_CHAR_THRESHOLD: usize = 500; +const FLAG_STRIP_UNLIKELYS: u32 = 0x1; +const FLAG_WEIGHT_CLASSES: u32 = 0x2; +const FLAG_CLEAN_CONDITIONALLY: u32 = 0x4; +const READABILITY_SCORE: &'static str = "readability-score"; +const HTML_NS: &'static str = "http://www.w3.org/1999/xhtml"; +// TODO: Change to HashSet +const PHRASING_ELEMS: [&str; 39] = [ + "abbr", "audio", "b", "bdo", "br", "button", "cite", "code", "data", 
"datalist", "dfn", "em", + "embed", "i", "img", "input", "kbd", "label", "mark", "math", "meter", "noscript", "object", + "output", "progress", "q", "ruby", "samp", "script", "select", "small", "span", "strong", + "sub", "sup", "textarea", "time", "var", "wbr", +]; +// TODO: Change to HashSet +const DEFAULT_TAGS_TO_SCORE: [&str; 9] = + ["section", "h2", "h3", "h4", "h5", "h6", "p", "td", "pre"]; +// TODO: Change to HashSet +const ALTER_TO_DIV_EXCEPTIONS: [&str; 4] = ["div", "article", "section", "p"]; +const PRESENTATIONAL_ATTRIBUTES: [&str; 12] = [ + "align", + "background", + "bgcolor", + "border", + "cellpadding", + "cellspacing", + "frame", + "hspace", + "rules", + "style", + "valign", + "vspace", +]; + +const DATA_TABLE_DESCENDANTS: [&str; 5] = ["col", "colgroup", "tfoot", "thead", "th"]; +// TODO: Change to HashSet +const DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [&str; 5] = ["table", "th", "td", "hr", "pre"]; + +pub mod regexes; + +pub struct Readability { + root_node: NodeRef, + byline: Option, + article_title: String, + pub article_node: Option, + article_dir: Option, + flags: u32, + pub metadata: MetaData, +} + +#[derive(Debug, PartialEq)] +struct SizeInfo { + rows: usize, + columns: usize, +} + +impl Readability { + pub fn new(html_str: &str) -> Self { + Self { + root_node: kuchiki::parse_html().one(html_str), + byline: None, + article_title: "".into(), + article_node: None, + article_dir: None, + flags: FLAG_STRIP_UNLIKELYS | FLAG_WEIGHT_CLASSES | FLAG_CLEAN_CONDITIONALLY, + metadata: MetaData::new(), + } + } + pub fn parse(&mut self, url: &str) { + self.unwrap_no_script_tags(); + self.remove_scripts(); + self.prep_document(); + self.metadata = self.get_article_metadata(); + self.article_title = self.metadata.title.clone(); + self.grab_article(); + self.post_process_content(url); + } + + /// Recursively check if node is image, or if node contains exactly only one image + /// whether as a direct child or as its descendants. + fn is_single_image(node_ref: &NodeRef) -> bool { + if let Some(element) = node_ref.as_element() { + if &element.name.local == "img" { + return true; + } + } + + if node_ref.children().filter(Self::has_content).count() != 1 + || !node_ref.text_contents().trim().is_empty() + { + return false; + } + + return Readability::is_single_image( + &node_ref + .children() + .filter(Self::has_content) + .next() + .expect("Unable to get first child which should exist"), + ); + } + + fn has_content(node_ref: &NodeRef) -> bool { + match node_ref.data() { + NodeData::Text(text) => !text.borrow().trim().is_empty(), + _ => true, + } + } + + /// Find all