Factor out text extraction into extractor module

2020-05-01 16:17:59 +03:00 · 2020-05-01 16:17:59 +03:00 · b5336e078d
commit b5336e078d
parent 4527fb07d9
2 changed files with 189 additions and 89 deletions
--- a/src/extractor.rs
+++ b/src/extractor.rs
@ -0,0 +1,181 @@
+use std::fs::File;
+
+use kuchiki::{traits::*, NodeRef};
+
+/// Wraps a parsed HTML document and extracts metadata and article content from it.
+pub struct Extractor {
+    /// Root of the parsed document tree; every selector query starts from here.
+    pub root_node: NodeRef,
+}
+
+impl Extractor {
+    /// Create a new instance of an HTML extractor given an HTML string
+    pub fn from_html(html_str: &str) -> Self {
+        Extractor {
+            root_node: kuchiki::parse_html().one(html_str),
+        }
+    }
+
+    /// Extract the value of an attribute.
+    ///
+    /// Finds the first element matching `css_selector`, reads its `attr_target`
+    /// attribute and returns `mapper` applied to the attribute value.
+    ///
+    /// Returns `None` when no element can be selected with `css_selector` or
+    /// when the selected element does not carry the attribute.
+    fn extract_attr_val<T: Fn(&str) -> U, U>(
+        &self,
+        css_selector: &str,
+        attr_target: &str,
+        mapper: T,
+    ) -> Option<U> {
+        self.root_node
+            .select_first(css_selector)
+            .ok()
+            .and_then(|data| data.attributes.borrow().get(attr_target).map(mapper))
+    }
+
+    /// Returns the text of the first child of the first element matching
+    /// `css_selector`, or `None` when nothing is selected or the element has
+    /// no children.
+    fn extract_inner_text(&self, css_selector: &str) -> Option<String> {
+        let node_ref = self.root_node.select_first(css_selector).ok()?;
+        extract_text_from_node(node_ref.as_node())
+    }
+
+    /// Collects the document metadata and writes the article body to `out.html`.
+    ///
+    /// The metadata (author, description, keywords, title, language) is printed
+    /// with `dbg!`. The element children of the first `<article>` are then
+    /// serialized into `out.html` in the current working directory, after every
+    /// descendant that is neither an element nor a text node has been removed.
+    ///
+    /// When the document has no `<article>` element a message is written to
+    /// stderr and nothing is saved.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `out.html` cannot be created or written.
+    pub fn extract_content(&self) {
+        // Extract the useful parts of the head section
+        let author: Option<String> =
+            self.extract_attr_val("meta[name='author']", "content", |author| {
+                author.to_string()
+            });
+
+        let description =
+            self.extract_attr_val("meta[name='description']", "content", |description| {
+                description.to_string()
+            });
+
+        // Blank entries (e.g. from a trailing or doubled comma) are not tags.
+        let tags = self.extract_attr_val("meta[name='keywords']", "content", |tags| {
+            tags.split(',')
+                .map(str::trim)
+                .filter(|tag| !tag.is_empty())
+                .map(str::to_string)
+                .collect::<Vec<String>>()
+        });
+
+        let title = self.extract_inner_text("title").unwrap_or_default();
+        // Fall back to English when the root element declares no language.
+        let lang = self
+            .extract_attr_val("html", "lang", |lang| lang.to_string())
+            .unwrap_or_else(|| "en".to_string());
+
+        let meta_attrs = MetaAttr::new(author, description, lang, tags, title);
+        dbg!(meta_attrs);
+
+        let article_ref = match self.root_node.select_first("article") {
+            Ok(article_ref) => article_ref,
+            Err(_) => {
+                eprintln!("No <article> element found, nothing to save");
+                return;
+            }
+        };
+
+        // Snapshot the nodes to drop before touching the tree: a detached node
+        // loses its sibling and parent links, so detaching while the traversal
+        // is still running can end it before every node has been visited.
+        let discarded: Vec<NodeRef> = article_ref
+            .as_node()
+            .descendants()
+            .filter(|node_ref| match node_ref.data() {
+                kuchiki::NodeData::Element(..) | kuchiki::NodeData::Text(..) => false,
+                _ => true,
+            })
+            .collect();
+        for node_ref in discarded {
+            node_ref.detach();
+        }
+
+        let out_file = File::create("out.html").expect("Can't make file");
+        // Serialization emits many small writes; buffer them.
+        let mut writer = std::io::BufWriter::new(out_file);
+        println!("Saving to file");
+        for node_ref in article_ref.as_node().children() {
+            // Only element children are saved; top-level text is skipped.
+            if let kuchiki::NodeData::Element(_) = node_ref.data() {
+                node_ref
+                    .serialize(&mut writer)
+                    .expect("Serialization failed");
+            }
+        }
+        // Dropping a BufWriter ignores flush errors, so flush explicitly.
+        std::io::Write::flush(&mut writer).expect("Serialization failed");
+    }
+}
+/// Returns the text content of `node`'s first child, or `None` when `node`
+/// has no children.
+fn extract_text_from_node(node: &NodeRef) -> Option<String> {
+    let first_child = node.first_child()?;
+    Some(first_child.text_contents())
+}
+
+/// Descriptive metadata about a document: who wrote it, what it is about,
+/// which language it is in, how it is tagged and what it is called.
+#[derive(Debug)]
+pub struct MetaAttr {
+    /// Author name, when one is known.
+    author: Option<String>,
+    /// Short description, when one is known.
+    description: Option<String>,
+    /// Language identifier of the document.
+    language: String,
+    /// Keyword tags, when any are known.
+    tags: Option<Vec<String>>,
+    /// Document title.
+    title: String,
+}
+
+impl MetaAttr {
+    /// Bundles the given values into a `MetaAttr`; every argument is stored
+    /// unchanged in the field of the same name.
+    pub fn new(
+        author: Option<String>,
+        description: Option<String>,
+        language: String,
+        tags: Option<Vec<String>>,
+        title: String,
+    ) -> Self {
+        Self {
+            author,
+            description,
+            language,
+            tags,
+            title,
+        }
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    /// Small document with head metadata, a header `<h1>`, an `<article>`
+    /// and a footer, shared by all tests in this module.
+    const TEST_HTML: &str = r#"
+        <!doctype html>
+        <html lang="en">
+            <head>
+                <meta charset="utf-8">
+                <meta name="description" content="A sample document">
+                <meta name="keywords" content="test,Rust">
+                <meta name="author" content="Paperoni">                
+                <title>Testing Paperoni</title>
+            </head>
+            <body>
+                <header>
+                <!-- Unimportant information -->
+                    <h1>Testing Paperoni</h1>
+                </header>
+                <article>
+                    <h1>Starting out</h1>
+                    <p>Some Lorem Ipsum text here</p>
+                    <p>Observe this picture</p>
+                    <img src="./img.jpg" alt="Random image">
+                </article>
+                <footer>
+                    <p>Made in HTML</p>
+                </footer>
+            </body>
+        </html>
+        "#;
+
+    /// An attribute value is returned when both the element and the attribute
+    /// exist, and `None` when the selector matches nothing.
+    #[test]
+    fn test_extract_attr_val() {
+        let extractor = Extractor::from_html(TEST_HTML);
+
+        let author =
+            extractor.extract_attr_val("meta[name='author']", "content", |val| val.to_string());
+        assert_eq!(Some("Paperoni".to_string()), author);
+
+        let missing = extractor
+            .extract_attr_val("meta[name='invalid-name']", "content", |val| val.to_string());
+        assert!(missing.is_none());
+
+        let language = extractor.extract_attr_val("html", "lang", |lang| lang.to_string());
+        assert_eq!(Some("en".to_string()), language);
+    }
+
+    /// Inner text is returned for an existing element and `None` for a
+    /// selector that matches nothing.
+    #[test]
+    fn test_extract_inner_text() {
+        let extractor = Extractor::from_html(TEST_HTML);
+
+        let found = extractor.extract_inner_text("title");
+        assert_eq!(Some("Testing Paperoni".to_string()), found);
+
+        let not_found = extractor.extract_inner_text("titln");
+        assert!(not_found.is_none());
+    }
+
+    /// The first `<h1>` in the document is the one inside `<header>`.
+    #[test]
+    fn test_extract_text() {
+        let extractor = Extractor::from_html(TEST_HTML);
+        let first_heading = extractor.root_node.select_first("h1").unwrap();
+        let heading_text = extract_text_from_node(first_heading.as_node());
+        assert_eq!(Some("Testing Paperoni".to_string()), heading_text);
+    }
+}
--- a/src/main.rs
+++ b/src/main.rs
@ -1,6 +1,8 @@
 use async_std::task;
-use kuchiki::traits::*;

+mod extractor;
+
+use extractor::Extractor;
 fn main() {
    task::block_on(async {
        let urls = vec![
@ -9,10 +11,12 @@ fn main() {
            "https://saveandrun.com/posts/2020-01-08-working-with-git.html",
            "https://blog.hipstermojo.xyz/posts/redis-orm-preface/",
            "https://vuejsdevelopers.com/2020/03/31/vue-js-form-composition-api/?utm_campaign=xl5&utm_medium=article&utm_source=vuejsnews#adding-validators",
-            "https://medium.com/typeforms-engineering-blog/the-beginners-guide-to-oauth-dancing-4b8f3666de10"
+            "https://medium.com/typeforms-engineering-blog/the-beginners-guide-to-oauth-dancing-4b8f3666de10",
+            "https://dev.to/steelwolf180/full-stack-development-in-django-3768"
        ];
-        let html = fetch_url(urls[0]).await;
-        extract_content(html);
+        let html = fetch_url(urls[4]).await;
+        let extractor = Extractor::from_html(&html);
+        extractor.extract_content();
    });
 }

@ -25,88 +29,3 @@ async fn fetch_url(url: &str) -> String {
        .await
        .expect("Unable to fetch URL")
 }
-
-fn extract_content(html_str: String) {
-    let document = kuchiki::parse_html().one(html_str);
-    let author: Option<String> =
-        document
-            .select_first("meta[name='author']")
-            .ok()
-            .and_then(|data| {
-                data.attributes
-                    .borrow()
-                    .get("content")
-                    .map(|name| name.to_string())
-            });
-    let description = document
-        .select_first("meta[name='description']")
-        .ok()
-        .and_then(|data| {
-            data.attributes
-                .borrow()
-                .get("content")
-                .map(|description| description.to_string())
-        });
-    let tags = document
-        .select_first("meta[name='keywords']")
-        .ok()
-        .and_then(|data| {
-            data.attributes.borrow().get("content").map(|tags| {
-                tags.split(",")
-                    .map(|tag_str| tag_str.trim().to_string())
-                    .collect::<Vec<String>>()
-            })
-        });
-    let title = if let Some(title_node) = document.select_first("title").ok() {
-        title_node
-            .as_node()
-            .first_child()
-            .and_then(|text_node| {
-                text_node
-                    .as_text()
-                    .map(|text_ref| text_ref.borrow().to_string())
-            })
-            .unwrap_or("".to_string())
-    } else {
-        "".to_string()
-    };
-    let lang = document
-        .select_first("html")
-        .ok()
-        .and_then(|data| {
-            data.attributes
-                .borrow()
-                .get("lang")
-                .map(|val| val.to_string())
-        })
-        .unwrap_or("en".to_string());
-    let meta_attrs = MetaAttr::new(author, description, lang, tags, title);
-    dbg!(meta_attrs);
-}
-
-#[derive(Debug)]
-struct MetaAttr {
-    author: Option<String>,
-    description: Option<String>,
-    language: String,
-    tags: Option<Vec<String>>,
-    title: String,
-}
-
-impl MetaAttr {
-    pub fn new(
-        author: Option<String>,
-        description: Option<String>,
-        language: String,
-        tags: Option<Vec<String>>,
-        title: String,
-    ) -> Self {
-        MetaAttr {
-            author,
-            description,
-            language,
-            tags,
-            title,
-        }
-    }
-}