Add test for extract content

This commit is contained in:
Kenneth Gitere 2020-05-01 20:42:41 +03:00
parent b5336e078d
commit 529704d227
2 changed files with 32 additions and 9 deletions

View file

@ -1,4 +1,4 @@
use std::fs::File; use std::io::Write;
use kuchiki::{traits::*, NodeRef}; use kuchiki::{traits::*, NodeRef};
@ -32,7 +32,7 @@ impl Extractor {
extract_text_from_node(node_ref.as_node()) extract_text_from_node(node_ref.as_node())
} }
pub fn extract_content(&self) { pub fn extract_content<W: Write>(&self, writer: &mut W) {
// Extract the useful parts of the head section // Extract the useful parts of the head section
let author: Option<String> = let author: Option<String> =
self.extract_attr_val("meta[name='author']", "content", |author| { self.extract_attr_val("meta[name='author']", "content", |author| {
@ -58,8 +58,10 @@ impl Extractor {
let meta_attrs = MetaAttr::new(author, description, lang, tags, title); let meta_attrs = MetaAttr::new(author, description, lang, tags, title);
dbg!(meta_attrs); dbg!(meta_attrs);
// Extract the article
let article_ref = self.root_node.select_first("article").unwrap(); let article_ref = self.root_node.select_first("article").unwrap();
let mut out_file = File::create("out.html").expect("Can't make file");
for node_ref in article_ref.as_node().descendants() { for node_ref in article_ref.as_node().descendants() {
match node_ref.data() { match node_ref.data() {
kuchiki::NodeData::Element(..) | kuchiki::NodeData::Text(..) => (), kuchiki::NodeData::Element(..) | kuchiki::NodeData::Text(..) => (),
@ -70,9 +72,7 @@ impl Extractor {
for node_ref in article_ref.as_node().children() { for node_ref in article_ref.as_node().children() {
match node_ref.data() { match node_ref.data() {
kuchiki::NodeData::Element(_) => { kuchiki::NodeData::Element(_) => {
node_ref node_ref.serialize(writer).expect("Serialization failed");
.serialize(&mut out_file)
.expect("Serialization failed");
} }
_ => (), _ => (),
@ -178,4 +178,24 @@ mod test {
assert!(h1_text.is_some()); assert!(h1_text.is_some());
assert_eq!("Testing Paperoni".to_string(), h1_text.unwrap()); assert_eq!("Testing Paperoni".to_string(), h1_text.unwrap());
} }
/// Checks that `extract_content` writes the article markup into the writer it
/// is handed, using an in-memory byte buffer rather than a file on disk.
#[test]
fn test_extract_content() {
    // Expected markup, laid out one element per line for readability. Every
    // line is trimmed and the pieces are concatenated, so the indentation and
    // line breaks of this literal are not part of the expected value.
    let extracted_html: String = r#"
        <h1>Starting out</h1>
        <p>Some Lorem Ipsum text here</p>
        <p>Observe this picture</p>
        <img alt="Random image" src="./img.jpg">
    "#
    .lines()
    .map(str::trim)
    .collect();

    let extractor = Extractor::from_html(TEST_HTML);

    // `Vec<u8>` implements `Write`, so it can stand in for the output file.
    let mut buffer: Vec<u8> = Vec::new();
    extractor.extract_content(&mut buffer);

    let output_string =
        String::from_utf8(buffer).expect("extracted content should be valid UTF-8");
    assert!(
        !output_string.is_empty(),
        "extract_content wrote nothing to the writer"
    );
    assert_eq!(extracted_html, output_string);
}
} }

View file

@ -1,3 +1,5 @@
use std::fs::File;
use async_std::task; use async_std::task;
mod extractor; mod extractor;
@ -8,15 +10,16 @@ fn main() {
let urls = vec![ let urls = vec![
"https://saveandrun.com/posts/2020-01-24-generating-mazes-with-haskell-part-1.html", "https://saveandrun.com/posts/2020-01-24-generating-mazes-with-haskell-part-1.html",
"https://saveandrun.com/posts/2020-04-05-querying-pacman-with-datalog.html", "https://saveandrun.com/posts/2020-04-05-querying-pacman-with-datalog.html",
"https://saveandrun.com/posts/2020-01-08-working-with-git.html",
"https://blog.hipstermojo.xyz/posts/redis-orm-preface/", "https://blog.hipstermojo.xyz/posts/redis-orm-preface/",
"https://vuejsdevelopers.com/2020/03/31/vue-js-form-composition-api/?utm_campaign=xl5&utm_medium=article&utm_source=vuejsnews#adding-validators", "https://vuejsdevelopers.com/2020/03/31/vue-js-form-composition-api/?utm_campaign=xl5&utm_medium=article&utm_source=vuejsnews#adding-validators",
"https://medium.com/typeforms-engineering-blog/the-beginners-guide-to-oauth-dancing-4b8f3666de10", "https://medium.com/typeforms-engineering-blog/the-beginners-guide-to-oauth-dancing-4b8f3666de10",
"https://dev.to/steelwolf180/full-stack-development-in-django-3768" "https://dev.to/steelwolf180/full-stack-development-in-django-3768"
]; ];
let html = fetch_url(urls[4]).await; let html = fetch_url(urls[6]).await;
let extractor = Extractor::from_html(&html); let extractor = Extractor::from_html(&html);
extractor.extract_content(); println!("Extracting");
let mut out_file = File::create("out.html").expect("Can't make file");
extractor.extract_content(&mut out_file);
}); });
} }