From f24e72e70f12313560ca0620fc9ef367f2e25f0d Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Sat, 2 May 2020 14:51:53 +0300 Subject: [PATCH] Change signature of `extract_content` to copy the reference to article DOM node instead of writing to file --- src/extractor.rs | 56 +++++++++++++++++++++++++++++++----------------- src/main.rs | 5 ++--- 2 files changed, 38 insertions(+), 23 deletions(-) diff --git a/src/extractor.rs b/src/extractor.rs index 2f05b21..8b1971d 100644 --- a/src/extractor.rs +++ b/src/extractor.rs @@ -1,15 +1,15 @@ -use std::io::Write; - -use kuchiki::{traits::*, NodeRef}; +use kuchiki::{traits::*, ElementData, NodeDataRef, NodeRef}; pub struct Extractor { pub root_node: NodeRef, + content: Option<NodeDataRef<ElementData>>, } impl Extractor { /// Create a new instance of an HTML extractor given an HTML string pub fn from_html(html_str: &str) -> Self { Extractor { + content: None, root_node: kuchiki::parse_html().one(html_str), } } @@ -27,12 +27,15 @@ impl Extractor { .and_then(|data| data.attributes.borrow().get(attr_target).map(mapper)) } + /// Extract the text of a DOM node given its CSS selector fn extract_inner_text(&self, css_selector: &str) -> Option<String> { let node_ref = self.root_node.select_first(css_selector).ok()?; extract_text_from_node(node_ref.as_node()) } - pub fn extract_content<W: Write>(&self, writer: &mut W) { + /// Locates and extracts the HTML in a document which is determined to be + /// the source of the content + pub fn extract_content(&mut self) { // Extract the useful parts of the head section let author: Option<String> = self.extract_attr_val("meta[name='author']", "content", |author| { @@ -68,12 +71,8 @@ impl Extractor { _ => node_ref.detach(), } } - println!("Saving to file"); - for node_ref in article_ref.as_node().children() { - match node_ref.data() { - kuchiki::NodeData::Element(_) => { - node_ref.serialize(writer).expect("Serialization failed"); - } + self.content = Some(article_ref); + } _ => (), } @@ -182,20 +181,37 @@ mod test { #[test] fn 
test_extract_content() { let extracted_html: String = r#" -

Starting out

-

Some Lorem Ipsum text here

-

Observe this picture

- Random image +
+

Starting out

+

Some Lorem Ipsum text here

+

Observe this picture

+ Random image +
"# .lines() .map(|line| line.trim()) .collect(); - let extractor = Extractor::from_html(TEST_HTML); - let mut output_string = Vec::new(); - extractor.extract_content(&mut output_string); - let output_string = std::str::from_utf8(&output_string).unwrap(); - assert!(output_string.len() > 0); - assert_eq!(extracted_html, output_string); + let mut extractor = Extractor::from_html( + &TEST_HTML + .lines() + .map(|line| line.trim()) + .collect::<String>(), + ); + + extractor.extract_content(); + let mut output = Vec::new(); + assert!(extractor.content.is_some()); + + extractor + .content + .unwrap() + .as_node() + .serialize(&mut output) + .expect("Unable to serialize output HTML"); + let output = std::str::from_utf8(&output).unwrap(); + + assert_eq!(extracted_html, output); + } } } diff --git a/src/main.rs b/src/main.rs index a23e29d..dd8620a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -16,10 +16,9 @@ fn main() { "https://dev.to/steelwolf180/full-stack-development-in-django-3768" ]; let html = fetch_url(urls[6]).await; - let extractor = Extractor::from_html(&html); + let mut extractor = Extractor::from_html(&html); println!("Extracting"); - let mut out_file = File::create("out.html").expect("Can't make file"); - extractor.extract_content(&mut out_file); + extractor.extract_content(); }); }