Change signature of extract_content to copy the reference to the article DOM node instead of writing to file
This commit is contained in:
Kenneth Gitere 2020-05-02 14:51:53 +03:00
parent 529704d227
commit f24e72e70f
2 changed files with 38 additions and 23 deletions

View file

@ -1,15 +1,15 @@
use std::io::Write; use kuchiki::{traits::*, ElementData, NodeDataRef, NodeRef};
use kuchiki::{traits::*, NodeRef};
pub struct Extractor { pub struct Extractor {
pub root_node: NodeRef, pub root_node: NodeRef,
content: Option<NodeDataRef<ElementData>>,
} }
impl Extractor { impl Extractor {
/// Create a new instance of an HTML extractor given an HTML string /// Create a new instance of an HTML extractor given an HTML string
pub fn from_html(html_str: &str) -> Self { pub fn from_html(html_str: &str) -> Self {
Extractor { Extractor {
content: None,
root_node: kuchiki::parse_html().one(html_str), root_node: kuchiki::parse_html().one(html_str),
} }
} }
@ -27,12 +27,15 @@ impl Extractor {
.and_then(|data| data.attributes.borrow().get(attr_target).map(mapper)) .and_then(|data| data.attributes.borrow().get(attr_target).map(mapper))
} }
/// Extract the text of a DOM node given its CSS selector
fn extract_inner_text(&self, css_selector: &str) -> Option<String> { fn extract_inner_text(&self, css_selector: &str) -> Option<String> {
let node_ref = self.root_node.select_first(css_selector).ok()?; let node_ref = self.root_node.select_first(css_selector).ok()?;
extract_text_from_node(node_ref.as_node()) extract_text_from_node(node_ref.as_node())
} }
pub fn extract_content<W: Write>(&self, writer: &mut W) { /// Locates and extracts the HTML in a document which is determined to be
/// the source of the content
pub fn extract_content(&mut self) {
// Extract the useful parts of the head section // Extract the useful parts of the head section
let author: Option<String> = let author: Option<String> =
self.extract_attr_val("meta[name='author']", "content", |author| { self.extract_attr_val("meta[name='author']", "content", |author| {
@ -68,11 +71,7 @@ impl Extractor {
_ => node_ref.detach(), _ => node_ref.detach(),
} }
} }
println!("Saving to file"); self.content = Some(article_ref);
for node_ref in article_ref.as_node().children() {
match node_ref.data() {
kuchiki::NodeData::Element(_) => {
node_ref.serialize(writer).expect("Serialization failed");
} }
_ => (), _ => (),
@ -182,20 +181,37 @@ mod test {
#[test] #[test]
fn test_extract_content() { fn test_extract_content() {
let extracted_html: String = r#" let extracted_html: String = r#"
<article>
<h1>Starting out</h1> <h1>Starting out</h1>
<p>Some Lorem Ipsum text here</p> <p>Some Lorem Ipsum text here</p>
<p>Observe this picture</p> <p>Observe this picture</p>
<img alt="Random image" src="./img.jpg"> <img alt="Random image" src="./img.jpg">
</article>
"# "#
.lines() .lines()
.map(|line| line.trim()) .map(|line| line.trim())
.collect(); .collect();
let extractor = Extractor::from_html(TEST_HTML); let mut extractor = Extractor::from_html(
let mut output_string = Vec::new(); &TEST_HTML
extractor.extract_content(&mut output_string); .lines()
let output_string = std::str::from_utf8(&output_string).unwrap(); .map(|line| line.trim())
assert!(output_string.len() > 0); .collect::<String>(),
assert_eq!(extracted_html, output_string); );
extractor.extract_content();
let mut output = Vec::new();
assert!(extractor.content.is_some());
extractor
.content
.unwrap()
.as_node()
.serialize(&mut output)
.expect("Unable to serialize output HTML");
let output = std::str::from_utf8(&output).unwrap();
assert_eq!(extracted_html, output);
}
} }
} }

View file

@ -16,10 +16,9 @@ fn main() {
"https://dev.to/steelwolf180/full-stack-development-in-django-3768" "https://dev.to/steelwolf180/full-stack-development-in-django-3768"
]; ];
let html = fetch_url(urls[6]).await; let html = fetch_url(urls[6]).await;
let extractor = Extractor::from_html(&html); let mut extractor = Extractor::from_html(&html);
println!("Extracting"); println!("Extracting");
let mut out_file = File::create("out.html").expect("Can't make file"); extractor.extract_content();
extractor.extract_content(&mut out_file);
}); });
} }