Change signature of extract_content to copy the reference to the article DOM node instead of writing to file
This commit is contained in:
Kenneth Gitere 2020-05-02 14:51:53 +03:00
parent 529704d227
commit f24e72e70f
2 changed files with 38 additions and 23 deletions

View file

@ -1,15 +1,15 @@
use std::io::Write;
use kuchiki::{traits::*, NodeRef};
use kuchiki::{traits::*, ElementData, NodeDataRef, NodeRef};
/// HTML extractor wrapping a parsed document and, once located, its article node.
pub struct Extractor {
/// Node returned by parsing the input HTML string in `from_html`.
pub root_node: NodeRef,
/// Article element stored by `extract_content`; `None` on a freshly built extractor.
content: Option<NodeDataRef<ElementData>>,
}
impl Extractor {
/// Create a new instance of an HTML extractor given an HTML string
pub fn from_html(html_str: &str) -> Self {
Extractor {
content: None,
root_node: kuchiki::parse_html().one(html_str),
}
}
@ -27,12 +27,15 @@ impl Extractor {
.and_then(|data| data.attributes.borrow().get(attr_target).map(mapper))
}
/// Look up the first node matching `css_selector` under `root_node` and hand
/// it to `extract_text_from_node`.
///
/// Yields `None` when `select_first` returns an error; otherwise the result
/// of `extract_text_from_node` is returned unchanged.
fn extract_inner_text(&self, css_selector: &str) -> Option<String> {
    self.root_node
        .select_first(css_selector)
        .ok()
        .and_then(|matched| extract_text_from_node(matched.as_node()))
}
pub fn extract_content<W: Write>(&self, writer: &mut W) {
/// Locates and extracts the HTML in a document which is determined to be
/// the source of the content
pub fn extract_content(&mut self) {
// Extract the useful parts of the head section
let author: Option<String> =
self.extract_attr_val("meta[name='author']", "content", |author| {
@ -68,12 +71,8 @@ impl Extractor {
_ => node_ref.detach(),
}
}
println!("Saving to file");
for node_ref in article_ref.as_node().children() {
match node_ref.data() {
kuchiki::NodeData::Element(_) => {
node_ref.serialize(writer).expect("Serialization failed");
}
self.content = Some(article_ref);
}
_ => (),
}
@ -182,20 +181,37 @@ mod test {
#[test]
fn test_extract_content() {
let extracted_html: String = r#"
<h1>Starting out</h1>
<p>Some Lorem Ipsum text here</p>
<p>Observe this picture</p>
<img alt="Random image" src="./img.jpg">
<article>
<h1>Starting out</h1>
<p>Some Lorem Ipsum text here</p>
<p>Observe this picture</p>
<img alt="Random image" src="./img.jpg">
</article>
"#
.lines()
.map(|line| line.trim())
.collect();
let extractor = Extractor::from_html(TEST_HTML);
let mut output_string = Vec::new();
extractor.extract_content(&mut output_string);
let output_string = std::str::from_utf8(&output_string).unwrap();
assert!(output_string.len() > 0);
assert_eq!(extracted_html, output_string);
let mut extractor = Extractor::from_html(
&TEST_HTML
.lines()
.map(|line| line.trim())
.collect::<String>(),
);
extractor.extract_content();
let mut output = Vec::new();
assert!(extractor.content.is_some());
extractor
.content
.unwrap()
.as_node()
.serialize(&mut output)
.expect("Unable to serialize output HTML");
let output = std::str::from_utf8(&output).unwrap();
assert_eq!(extracted_html, output);
}
}
}

View file

@ -16,10 +16,9 @@ fn main() {
"https://dev.to/steelwolf180/full-stack-development-in-django-3768"
];
let html = fetch_url(urls[6]).await;
let extractor = Extractor::from_html(&html);
let mut extractor = Extractor::from_html(&html);
println!("Extracting");
let mut out_file = File::create("out.html").expect("Can't make file");
extractor.extract_content(&mut out_file);
extractor.extract_content();
});
}