diff --git a/src/extractor.rs b/src/extractor.rs
index 9797e45..2f05b21 100644
--- a/src/extractor.rs
+++ b/src/extractor.rs
@@ -1,4 +1,4 @@
-use std::fs::File;
+use std::io::Write;
 
 use kuchiki::{traits::*, NodeRef};
 
@@ -32,7 +32,7 @@ impl Extractor {
         extract_text_from_node(node_ref.as_node())
     }
 
-    pub fn extract_content(&self) {
+    pub fn extract_content<W: Write>(&self, writer: &mut W) {
         // Extract the useful parts of the head section
         let author: Option<String> =
             self.extract_attr_val("meta[name='author']", "content", |author| {
@@ -58,8 +58,10 @@ impl Extractor {
         let meta_attrs = MetaAttr::new(author, description, lang, tags, title);
         dbg!(meta_attrs);
 
+        // Extract the article
+        let article_ref = self.root_node.select_first("article").unwrap();
-        let mut out_file = File::create("out.html").expect("Can't make file");
+
         for node_ref in article_ref.as_node().descendants() {
             match node_ref.data() {
                 kuchiki::NodeData::Element(..) | kuchiki::NodeData::Text(..) => (),
@@ -70,9 +72,7 @@ impl Extractor {
         for node_ref in article_ref.as_node().children() {
             match node_ref.data() {
                 kuchiki::NodeData::Element(_) => {
-                    node_ref
-                        .serialize(&mut out_file)
-                        .expect("Serialization failed");
+                    node_ref.serialize(writer).expect("Serialization failed");
                 }
                 _ => (),
@@ -178,4 +178,24 @@ mod test {
         assert!(h1_text.is_some());
         assert_eq!("Testing Paperoni".to_string(), h1_text.unwrap());
     }
+
+    #[test]
+    fn test_extract_content() {
+        let extracted_html: String = r#"
+            <h1>Starting out</h1>
+            <p>Some Lorem Ipsum text here</p>
+            <p>Observe this picture</p>
+            <img src="./img.jpg" alt="Random image">
+        "#
+        .lines()
+        .map(|line| line.trim())
+        .collect();
+
+        let extractor = Extractor::from_html(TEST_HTML);
+        let mut output_string = Vec::new();
+        extractor.extract_content(&mut output_string);
+        let output_string = std::str::from_utf8(&output_string).unwrap();
+        assert!(output_string.len() > 0);
+        assert_eq!(extracted_html, output_string);
+    }
 }
diff --git a/src/main.rs b/src/main.rs
index baf9943..a23e29d 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,3 +1,5 @@
+use std::fs::File;
+
 use async_std::task;
 
 mod extractor;
@@ -8,15 +10,16 @@ fn main() {
         let urls = vec![
             "https://saveandrun.com/posts/2020-01-24-generating-mazes-with-haskell-part-1.html",
             "https://saveandrun.com/posts/2020-04-05-querying-pacman-with-datalog.html",
-            "https://saveandrun.com/posts/2020-01-08-working-with-git.html",
             "https://blog.hipstermojo.xyz/posts/redis-orm-preface/",
             "https://vuejsdevelopers.com/2020/03/31/vue-js-form-composition-api/?utm_campaign=xl5&utm_medium=article&utm_source=vuejsnews#adding-validators",
             "https://medium.com/typeforms-engineering-blog/the-beginners-guide-to-oauth-dancing-4b8f3666de10",
             "https://dev.to/steelwolf180/full-stack-development-in-django-3768"
         ];
 
-        let html = fetch_url(urls[4]).await;
+        let html = fetch_url(urls[6]).await;
         let extractor = Extractor::from_html(&html);
-        extractor.extract_content();
+        println!("Extracting");
+        let mut out_file = File::create("out.html").expect("Can't make file");
+        extractor.extract_content(&mut out_file);
     });
 }
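
Note (not part of the patch): making extract_content generic over std::io::Write is what lets the new test serialize into an in-memory Vec<u8> while main.rs keeps writing to out.html. Below is a minimal sketch of the two call shapes, assuming the crate's module layout (crate::extractor::Extractor) and a hypothetical write_outputs helper that does not exist in the repo:

    // Sketch only: any `Write` implementor can be passed as the sink.
    use std::fs::File;

    use crate::extractor::Extractor;

    fn write_outputs(extractor: &Extractor) {
        // In-memory buffer, as used by test_extract_content
        let mut buf: Vec<u8> = Vec::new();
        extractor.extract_content(&mut buf);

        // File on disk, as used in main()
        let mut out_file = File::create("out.html").expect("Can't make file");
        extractor.extract_content(&mut out_file);
    }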