Add test for extract content
This commit is contained in:
parent
b5336e078d
commit
529704d227
2 changed files with 32 additions and 9 deletions
|
@@ -1,4 +1,4 @@
|
||||||
use std::fs::File;
|
use std::io::Write;
|
||||||
|
|
||||||
use kuchiki::{traits::*, NodeRef};
|
use kuchiki::{traits::*, NodeRef};
|
||||||
|
|
||||||
|
@@ -32,7 +32,7 @@ impl Extractor {
|
||||||
extract_text_from_node(node_ref.as_node())
|
extract_text_from_node(node_ref.as_node())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn extract_content(&self) {
|
pub fn extract_content<W: Write>(&self, writer: &mut W) {
|
||||||
// Extract the useful parts of the head section
|
// Extract the useful parts of the head section
|
||||||
let author: Option<String> =
|
let author: Option<String> =
|
||||||
self.extract_attr_val("meta[name='author']", "content", |author| {
|
self.extract_attr_val("meta[name='author']", "content", |author| {
|
||||||
|
@@ -58,8 +58,10 @@ impl Extractor {
|
||||||
let meta_attrs = MetaAttr::new(author, description, lang, tags, title);
|
let meta_attrs = MetaAttr::new(author, description, lang, tags, title);
|
||||||
dbg!(meta_attrs);
|
dbg!(meta_attrs);
|
||||||
|
|
||||||
|
// Extract the article
|
||||||
|
|
||||||
let article_ref = self.root_node.select_first("article").unwrap();
|
let article_ref = self.root_node.select_first("article").unwrap();
|
||||||
let mut out_file = File::create("out.html").expect("Can't make file");
|
|
||||||
for node_ref in article_ref.as_node().descendants() {
|
for node_ref in article_ref.as_node().descendants() {
|
||||||
match node_ref.data() {
|
match node_ref.data() {
|
||||||
kuchiki::NodeData::Element(..) | kuchiki::NodeData::Text(..) => (),
|
kuchiki::NodeData::Element(..) | kuchiki::NodeData::Text(..) => (),
|
||||||
|
@@ -70,9 +72,7 @@ impl Extractor {
|
||||||
for node_ref in article_ref.as_node().children() {
|
for node_ref in article_ref.as_node().children() {
|
||||||
match node_ref.data() {
|
match node_ref.data() {
|
||||||
kuchiki::NodeData::Element(_) => {
|
kuchiki::NodeData::Element(_) => {
|
||||||
node_ref
|
node_ref.serialize(writer).expect("Serialization failed");
|
||||||
.serialize(&mut out_file)
|
|
||||||
.expect("Serialization failed");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
_ => (),
|
_ => (),
|
||||||
|
@@ -178,4 +178,24 @@ mod test {
|
||||||
assert!(h1_text.is_some());
|
assert!(h1_text.is_some());
|
||||||
assert_eq!("Testing Paperoni".to_string(), h1_text.unwrap());
|
assert_eq!("Testing Paperoni".to_string(), h1_text.unwrap());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_extract_content() {
|
||||||
|
let extracted_html: String = r#"
|
||||||
|
<h1>Starting out</h1>
|
||||||
|
<p>Some Lorem Ipsum text here</p>
|
||||||
|
<p>Observe this picture</p>
|
||||||
|
<img alt="Random image" src="./img.jpg">
|
||||||
|
"#
|
||||||
|
.lines()
|
||||||
|
.map(|line| line.trim())
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let extractor = Extractor::from_html(TEST_HTML);
|
||||||
|
let mut output_string = Vec::new();
|
||||||
|
extractor.extract_content(&mut output_string);
|
||||||
|
let output_string = std::str::from_utf8(&output_string).unwrap();
|
||||||
|
assert!(output_string.len() > 0);
|
||||||
|
assert_eq!(extracted_html, output_string);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@@ -1,3 +1,5 @@
|
||||||
|
use std::fs::File;
|
||||||
|
|
||||||
use async_std::task;
|
use async_std::task;
|
||||||
|
|
||||||
mod extractor;
|
mod extractor;
|
||||||
|
@@ -8,15 +10,16 @@ fn main() {
|
||||||
let urls = vec![
|
let urls = vec![
|
||||||
"https://saveandrun.com/posts/2020-01-24-generating-mazes-with-haskell-part-1.html",
|
"https://saveandrun.com/posts/2020-01-24-generating-mazes-with-haskell-part-1.html",
|
||||||
"https://saveandrun.com/posts/2020-04-05-querying-pacman-with-datalog.html",
|
"https://saveandrun.com/posts/2020-04-05-querying-pacman-with-datalog.html",
|
||||||
"https://saveandrun.com/posts/2020-01-08-working-with-git.html",
|
|
||||||
"https://blog.hipstermojo.xyz/posts/redis-orm-preface/",
|
"https://blog.hipstermojo.xyz/posts/redis-orm-preface/",
|
||||||
"https://vuejsdevelopers.com/2020/03/31/vue-js-form-composition-api/?utm_campaign=xl5&utm_medium=article&utm_source=vuejsnews#adding-validators",
|
"https://vuejsdevelopers.com/2020/03/31/vue-js-form-composition-api/?utm_campaign=xl5&utm_medium=article&utm_source=vuejsnews#adding-validators",
|
||||||
"https://medium.com/typeforms-engineering-blog/the-beginners-guide-to-oauth-dancing-4b8f3666de10",
|
"https://medium.com/typeforms-engineering-blog/the-beginners-guide-to-oauth-dancing-4b8f3666de10",
|
||||||
"https://dev.to/steelwolf180/full-stack-development-in-django-3768"
|
"https://dev.to/steelwolf180/full-stack-development-in-django-3768"
|
||||||
];
|
];
|
||||||
let html = fetch_url(urls[4]).await;
|
let html = fetch_url(urls[6]).await;
|
||||||
let extractor = Extractor::from_html(&html);
|
let extractor = Extractor::from_html(&html);
|
||||||
extractor.extract_content();
|
println!("Extracting");
|
||||||
|
let mut out_file = File::create("out.html").expect("Can't make file");
|
||||||
|
extractor.extract_content(&mut out_file);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Reference in a new issue