From b5336e078d1b7d555e52dd8f88fbcf28bcaff94b Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Fri, 1 May 2020 16:17:59 +0300 Subject: [PATCH] Factor out text extraction into extractor module --- src/extractor.rs | 181 +++++++++++++++++++++++++++++++++++++++++++++++ src/main.rs | 97 +++---------------------- 2 files changed, 189 insertions(+), 89 deletions(-) create mode 100644 src/extractor.rs diff --git a/src/extractor.rs b/src/extractor.rs new file mode 100644 index 0000000..9797e45 --- /dev/null +++ b/src/extractor.rs @@ -0,0 +1,181 @@ +use std::fs::File; + +use kuchiki::{traits::*, NodeRef}; + +pub struct Extractor { + pub root_node: NodeRef, +} + +impl Extractor { + /// Create a new instance of an HTML extractor given an HTML string + pub fn from_html(html_str: &str) -> Self { + Extractor { + root_node: kuchiki::parse_html().one(html_str), + } + } + + /// Extract the value of an attribute + fn extract_attr_val U, U>( + &self, + css_selector: &str, + attr_target: &str, + mapper: T, + ) -> Option { + self.root_node + .select_first(css_selector) + .ok() + .and_then(|data| data.attributes.borrow().get(attr_target).map(mapper)) + } + + fn extract_inner_text(&self, css_selector: &str) -> Option { + let node_ref = self.root_node.select_first(css_selector).ok()?; + extract_text_from_node(node_ref.as_node()) + } + + pub fn extract_content(&self) { + // Extract the useful parts of the head section + let author: Option = + self.extract_attr_val("meta[name='author']", "content", |author| { + author.to_string() + }); + + let description = + self.extract_attr_val("meta[name='description']", "content", |description| { + description.to_string() + }); + + let tags = self.extract_attr_val("meta[name='keywords']", "content", |tags| { + tags.split(",") + .map(|tag| tag.trim().to_string()) + .collect::>() + }); + + let title = self.extract_inner_text("title").unwrap_or("".to_string()); + let lang = self + .extract_attr_val("html", "lang", |lang| lang.to_string()) + .unwrap_or("en".to_string()); + + let meta_attrs = MetaAttr::new(author, description, lang, tags, title); + dbg!(meta_attrs); + + let article_ref = self.root_node.select_first("article").unwrap(); + let mut out_file = File::create("out.html").expect("Can't make file"); + for node_ref in article_ref.as_node().descendants() { + match node_ref.data() { + kuchiki::NodeData::Element(..) | kuchiki::NodeData::Text(..) => (), + _ => node_ref.detach(), + } + } + println!("Saving to file"); + for node_ref in article_ref.as_node().children() { + match node_ref.data() { + kuchiki::NodeData::Element(_) => { + node_ref + .serialize(&mut out_file) + .expect("Serialization failed"); + } + + _ => (), + } + } + } +} +fn extract_text_from_node(node: &NodeRef) -> Option { + node.first_child() + .map(|child_ref| child_ref.text_contents()) +} + +#[derive(Debug)] +pub struct MetaAttr { + author: Option, + description: Option, + language: String, + tags: Option>, + title: String, +} + +impl MetaAttr { + pub fn new( + author: Option, + description: Option, + language: String, + tags: Option>, + title: String, + ) -> Self { + MetaAttr { + author, + description, + language, + tags, + title, + } + } +} + +#[cfg(test)] +mod test { + use super::*; + const TEST_HTML: &'static str = r#" + + + + + + + + Testing Paperoni + + +
+ +

Testing Paperoni

+
+
+

Starting out

+

Some Lorem Ipsum text here

+

Observe this picture

+ Random image +
+
+

Made in HTML

+
+ + + "#; + + #[test] + fn test_extract_attr_val() { + let extractor = Extractor::from_html(TEST_HTML); + let ext_author = + extractor.extract_attr_val("meta[name='author']", "content", |val| val.to_string()); + assert!(ext_author.is_some()); + assert_eq!("Paperoni", &ext_author.unwrap()); + let ext_author = + extractor.extract_attr_val("meta[name='invalid-name']", "content", |val| { + val.to_string() + }); + assert!(ext_author.is_none()); + let lang_attr = extractor.extract_attr_val("html", "lang", |lang| lang.to_string()); + assert!(lang_attr.is_some()); + assert_eq!("en".to_string(), lang_attr.unwrap()); + } + + #[test] + fn test_extract_inner_text() { + let extractor = Extractor::from_html(TEST_HTML); + let title_text = extractor.extract_inner_text("title"); + assert!(title_text.is_some()); + assert_eq!("Testing Paperoni".to_string(), title_text.unwrap()); + + let title_text = extractor.extract_inner_text("titln"); + assert!(title_text.is_none()); + } + #[test] + fn test_extract_text() { + let extractor = Extractor::from_html(TEST_HTML); + let h1_node = extractor.root_node.select_first("h1").unwrap(); + let h1_text = extract_text_from_node(h1_node.as_node()); + assert!(h1_text.is_some()); + assert_eq!("Testing Paperoni".to_string(), h1_text.unwrap()); + } +} diff --git a/src/main.rs b/src/main.rs index 03861bc..baf9943 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,6 +1,8 @@ use async_std::task; -use kuchiki::traits::*; +mod extractor; + +use extractor::Extractor; fn main() { task::block_on(async { let urls = vec![ @@ -9,10 +11,12 @@ fn main() { "https://saveandrun.com/posts/2020-01-08-working-with-git.html", "https://blog.hipstermojo.xyz/posts/redis-orm-preface/", "https://vuejsdevelopers.com/2020/03/31/vue-js-form-composition-api/?utm_campaign=xl5&utm_medium=article&utm_source=vuejsnews#adding-validators", - "https://medium.com/typeforms-engineering-blog/the-beginners-guide-to-oauth-dancing-4b8f3666de10" + "https://medium.com/typeforms-engineering-blog/the-beginners-guide-to-oauth-dancing-4b8f3666de10", + "https://dev.to/steelwolf180/full-stack-development-in-django-3768" ]; - let html = fetch_url(urls[0]).await; - extract_content(html); + let html = fetch_url(urls[4]).await; + let extractor = Extractor::from_html(&html); + extractor.extract_content(); }); } @@ -25,88 +29,3 @@ async fn fetch_url(url: &str) -> String { .await .expect("Unable to fetch URL") } - -fn extract_content(html_str: String) { - let document = kuchiki::parse_html().one(html_str); - let author: Option = - document - .select_first("meta[name='author']") - .ok() - .and_then(|data| { - data.attributes - .borrow() - .get("content") - .map(|name| name.to_string()) - }); - let description = document - .select_first("meta[name='description']") - .ok() - .and_then(|data| { - data.attributes - .borrow() - .get("content") - .map(|description| description.to_string()) - }); - let tags = document - .select_first("meta[name='keywords']") - .ok() - .and_then(|data| { - data.attributes.borrow().get("content").map(|tags| { - tags.split(",") - .map(|tag_str| tag_str.trim().to_string()) - .collect::>() - }) - }); - let title = if let Some(title_node) = document.select_first("title").ok() { - title_node - .as_node() - .first_child() - .and_then(|text_node| { - text_node - .as_text() - .map(|text_ref| text_ref.borrow().to_string()) - }) - .unwrap_or("".to_string()) - } else { - "".to_string() - }; - let lang = document - .select_first("html") - .ok() - .and_then(|data| { - data.attributes - .borrow() - .get("lang") - .map(|val| val.to_string()) - }) - .unwrap_or("en".to_string()); - let meta_attrs = MetaAttr::new(author, description, lang, tags, title); - dbg!(meta_attrs); -} - -#[derive(Debug)] -struct MetaAttr { - author: Option, - description: Option, - language: String, - tags: Option>, - title: String, -} - -impl MetaAttr { - pub fn new( - author: Option, - description: Option, - language: String, - tags: Option>, - title: String, - ) -> Self { - MetaAttr { - author, - description, - language, - tags, - title, - } - } -}