Add test for extract content

2020-05-01 20:42:41 +03:00 · 2020-05-01 20:42:41 +03:00 · 529704d227
commit 529704d227
parent b5336e078d
2 changed files with 32 additions and 9 deletions
--- a/src/extractor.rs
+++ b/src/extractor.rs
@@ -1,4 +1,4 @@
-use std::fs::File;
+use std::io::Write;

 use kuchiki::{traits::*, NodeRef};

@@ -32,7 +32,7 @@ impl Extractor {
        extract_text_from_node(node_ref.as_node())
    }

-    pub fn extract_content(&self) {
+    pub fn extract_content<W: Write>(&self, writer: &mut W) {
        // Extract the useful parts of the head section
        let author: Option<String> =
            self.extract_attr_val("meta[name='author']", "content", |author| {
@@ -58,8 +58,10 @@ impl Extractor {
        let meta_attrs = MetaAttr::new(author, description, lang, tags, title);
        dbg!(meta_attrs);

+        // Extract the article
+
        let article_ref = self.root_node.select_first("article").unwrap();
-        let mut out_file = File::create("out.html").expect("Can't make file");
+
        for node_ref in article_ref.as_node().descendants() {
            match node_ref.data() {
                kuchiki::NodeData::Element(..) | kuchiki::NodeData::Text(..) => (),
@@ -70,9 +72,7 @@ impl Extractor {
        for node_ref in article_ref.as_node().children() {
            match node_ref.data() {
                kuchiki::NodeData::Element(_) => {
-                    node_ref
-                        .serialize(&mut out_file)
-                        .expect("Serialization failed");
+                    node_ref.serialize(writer).expect("Serialization failed");
                }

                _ => (),
@@ -178,4 +178,24 @@ mod test {
        assert!(h1_text.is_some());
        assert_eq!("Testing Paperoni".to_string(), h1_text.unwrap());
    }
+
+    #[test]
+    fn test_extract_content() {
+        let extracted_html: String = r#"
+            <h1>Starting out</h1>
+            <p>Some Lorem Ipsum text here</p>
+            <p>Observe this picture</p>
+            <img alt="Random image" src="./img.jpg">
+        "#
+        .lines()
+        .map(|line| line.trim())
+        .collect();
+
+        let extractor = Extractor::from_html(TEST_HTML);
+        let mut output_string = Vec::new();
+        extractor.extract_content(&mut output_string);
+        let output_string = std::str::from_utf8(&output_string).unwrap();
+        assert!(output_string.len() > 0);
+        assert_eq!(extracted_html, output_string);
+    }
 }
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,3 +1,5 @@
+use std::fs::File;
+
 use async_std::task;

 mod extractor;
@@ -8,15 +10,16 @@ fn main() {
        let urls = vec![
            "https://saveandrun.com/posts/2020-01-24-generating-mazes-with-haskell-part-1.html",
            "https://saveandrun.com/posts/2020-04-05-querying-pacman-with-datalog.html",
-            "https://saveandrun.com/posts/2020-01-08-working-with-git.html",
            "https://blog.hipstermojo.xyz/posts/redis-orm-preface/",
            "https://vuejsdevelopers.com/2020/03/31/vue-js-form-composition-api/?utm_campaign=xl5&utm_medium=article&utm_source=vuejsnews#adding-validators",
            "https://medium.com/typeforms-engineering-blog/the-beginners-guide-to-oauth-dancing-4b8f3666de10",
            "https://dev.to/steelwolf180/full-stack-development-in-django-3768"
        ];
-        let html = fetch_url(urls[4]).await;
+        let html = fetch_url(urls[6]).await;
        let extractor = Extractor::from_html(&html);
-        extractor.extract_content();
+        println!("Extracting");
+        let mut out_file = File::create("out.html").expect("Can't make file");
+        extractor.extract_content(&mut out_file);
    });
 }