Change signature of extract_content to copy the reference to article DOM node instead of writing to file
parent 529704d227
commit f24e72e70f
2 changed files with 38 additions and 23 deletions
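In short, extract_content no longer takes a writer: it stores a reference to the located article node on the extractor, and serialization becomes the caller's job. Below is a minimal sketch of the new call pattern pieced together from the diff that follows; the demo function name and HTML string are placeholders, and since content is a private field this would only compile inside the extractor's own module.

// Sketch of the new usage, assuming the crate's Extractor is in scope.
fn demo() {
    // Placeholder HTML; any document containing an <article> works.
    let html = "<html><body><article><p>Hello</p></article></body></html>";

    // The extractor must now be mutable: extract_content() stores the
    // located article node in self.content instead of taking a writer.
    let mut extractor = Extractor::from_html(html);
    extractor.extract_content();

    // Serializing is now up to the caller (a Vec<u8> here, a File in main.rs).
    let mut out = Vec::new();
    if let Some(article) = extractor.content {
        article
            .as_node()
            .serialize(&mut out)
            .expect("Unable to serialize output HTML");
    }
}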
@@ -1,15 +1,15 @@
-use std::io::Write;
-
-use kuchiki::{traits::*, NodeRef};
+use kuchiki::{traits::*, ElementData, NodeDataRef, NodeRef};

 pub struct Extractor {
     pub root_node: NodeRef,
+    content: Option<NodeDataRef<ElementData>>,
 }

 impl Extractor {
     /// Create a new instance of an HTML extractor given an HTML string
     pub fn from_html(html_str: &str) -> Self {
         Extractor {
+            content: None,
             root_node: kuchiki::parse_html().one(html_str),
         }
     }
@@ -27,12 +27,15 @@ impl Extractor {
             .and_then(|data| data.attributes.borrow().get(attr_target).map(mapper))
     }

     /// Extract the text of a DOM node given its CSS selector
     fn extract_inner_text(&self, css_selector: &str) -> Option<String> {
         let node_ref = self.root_node.select_first(css_selector).ok()?;
         extract_text_from_node(node_ref.as_node())
     }

-    pub fn extract_content<W: Write>(&self, writer: &mut W) {
+    /// Locates and extracts the HTML in a document which is determined to be
+    /// the source of the content
+    pub fn extract_content(&mut self) {
         // Extract the useful parts of the head section
         let author: Option<String> =
             self.extract_attr_val("meta[name='author']", "content", |author| {
@@ -68,11 +71,7 @@ impl Extractor {
                 _ => node_ref.detach(),
             }
         }
-        println!("Saving to file");
-        for node_ref in article_ref.as_node().children() {
-            match node_ref.data() {
-                kuchiki::NodeData::Element(_) => {
-                    node_ref.serialize(writer).expect("Serialization failed");
-                }
-
-                _ => (),
+        self.content = Some(article_ref);
@@ -182,20 +181,37 @@ mod test {
     #[test]
     fn test_extract_content() {
         let extracted_html: String = r#"
            <article>
                <h1>Starting out</h1>
                <p>Some Lorem Ipsum text here</p>
                <p>Observe this picture</p>
                <img alt="Random image" src="./img.jpg">
            </article>
        "#
         .lines()
         .map(|line| line.trim())
         .collect();

-        let extractor = Extractor::from_html(TEST_HTML);
-        let mut output_string = Vec::new();
-        extractor.extract_content(&mut output_string);
-        let output_string = std::str::from_utf8(&output_string).unwrap();
-        assert!(output_string.len() > 0);
-        assert_eq!(extracted_html, output_string);
+        let mut extractor = Extractor::from_html(
+            &TEST_HTML
+                .lines()
+                .map(|line| line.trim())
+                .collect::<String>(),
+        );
+
+        extractor.extract_content();
+        let mut output = Vec::new();
+        assert!(extractor.content.is_some());
+
+        extractor
+            .content
+            .unwrap()
+            .as_node()
+            .serialize(&mut output)
+            .expect("Unable to serialize output HTML");
+        let output = std::str::from_utf8(&output).unwrap();
+
+        assert_eq!(extracted_html, output);
     }
 }

@@ -16,10 +16,9 @@ fn main() {
             "https://dev.to/steelwolf180/full-stack-development-in-django-3768"
         ];
         let html = fetch_url(urls[6]).await;
-        let extractor = Extractor::from_html(&html);
+        let mut extractor = Extractor::from_html(&html);
         println!("Extracting");
-        let mut out_file = File::create("out.html").expect("Can't make file");
-        extractor.extract_content(&mut out_file);
+        extractor.extract_content();
     });
 }