use itertools::Itertools;
use kuchiki::{traits::*, NodeRef};

use crate::errors::PaperoniError;
use crate::moz_readability::{MetaData, Readability};

/// A resource URL paired with an optional companion value.
///
/// Within this file the second element is only ever produced as `None`
/// (see [`Extractor::extract_img_urls`]).
// NOTE(review): `Option` carries no generic argument in this copy of the file;
// the parameter looks lost in extraction. Confirm against the upstream source.
pub type ResourceInfo = (String, Option);

/// Extracts the main article content and its image URLs from an HTML document.
///
/// Intended call order: [`Extractor::from_html`], then
/// [`Extractor::extract_content`], then optionally
/// [`Extractor::extract_img_urls`].
// NOTE(review): `Option` and `Vec` below have no generic arguments in this copy
// of the file. From their uses, `article` presumably holds a `NodeRef` and
// `img_urls` a list of `ResourceInfo`. Confirm against the upstream source.
pub struct Extractor {
    /// Document wrapping the extracted article; `None` until
    /// `extract_content` finds an article node.
    article: Option,
    /// Image URLs collected by `extract_img_urls`; empty until then.
    pub img_urls: Vec,
    /// Readability parser built from the input HTML.
    readability: Readability,
    /// URL handed to `Readability::parse` when extracting content.
    pub url: String,
}

impl Extractor {
    /// Creates an extractor for the given HTML string.
    ///
    /// `html_str` is passed to `Readability::new`; `url` is stored as an owned
    /// `String`. No parsing of the article happens here: `article` starts as
    /// `None` and `img_urls` starts empty.
    pub fn from_html(html_str: &str, url: &str) -> Self {
        Extractor {
            article: None,
            img_urls: Vec::new(),
            readability: Readability::new(html_str),
            url: url.to_string(),
        }
    }

    /// Locates the node judged to be the document's main content and stores it
    /// inside a freshly parsed wrapper document.
    ///
    /// When the readability pass yields no article node, `self.article` is left
    /// untouched and the call still returns `Ok(())`.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `Readability::parse`.
    ///
    /// # Panics
    ///
    /// Panics if the wrapper document has no `body` element to append to.
    pub fn extract_content(&mut self) -> Result<(), PaperoniError> {
        self.readability.parse(&self.url)?;
        if let Some(article_node_ref) = &self.readability.article_node {
            // NOTE(review): the template literal is a single space in this copy
            // of the file; any markup it once held looks lost. Confirm upstream.
            let template = r#" "#;
            let doc = kuchiki::parse_html().one(template);
            // NOTE(review): presumably the HTML parser always synthesizes a
            // `body`, which would make this `unwrap` unreachable. Confirm.
            let body = doc.select_first("body").unwrap();
            body.as_node().append(article_node_ref.clone());
            self.article = Some(doc);
        }
        Ok(())
    }

    /// Walks the extracted article and records the URL of every `img` element.
    ///
    /// Replaces `self.img_urls` with the de-duplicated `src` values, each paired
    /// with `None`. Does nothing when no article has been extracted yet.
    pub fn extract_img_urls(&mut self) {
        if let Some(content_ref) = &self.article {
            self.img_urls = content_ref
                .select("img")
                // The selector is a fixed literal, so a failure here would be a
                // programming error, not an input problem.
                .unwrap()
                .filter_map(|img_ref| {
                    let attrs = img_ref.attributes.borrow();
                    attrs
                        .get("src")
                        // Skip images whose `src` is empty or starts with
                        // `data:image`.
                        .filter(|val| !(val.is_empty() || val.starts_with("data:image")))
                        .map(ToString::to_string)
                })
                // Keep a single entry per distinct URL.
                .unique()
                .map(|val| (val, None))
                .collect();
        }
    }

    /// Returns the document holding the extracted article.
    ///
    /// # Panics
    ///
    /// Panics if no article is stored, i.e. `extract_content` has not been
    /// called or it found no article node.
    pub fn article(&self) -> &NodeRef {
        self.article.as_ref().expect(
            "Article node doesn't exist. This may be because the document has not been parsed",
        )
    }

    /// Returns the metadata held by the underlying readability parser.
    pub fn metadata(&self) -> &MetaData {
        &self.readability.metadata
    }
}

#[cfg(test)]
mod test {
    use super::*;

    // Fixture document for the extraction test below.
    // NOTE(review): this copy of the fixture contains no HTML tags (and so no
    // `img` element), yet the test expects one image URL. The markup looks
    // lost in extraction. Confirm against the upstream source.
    const TEST_HTML: &'static str = r#" Testing Paperoni

Testing Paperoni

Starting out

Some Lorem Ipsum text here

Observe this picture

Random image
"#;

    /// Extracting content and then image URLs should yield exactly one
    /// absolute image URL for the fixture.
    #[test]
    fn test_extract_img_urls() {
        let mut extractor = Extractor::from_html(TEST_HTML, "http://example.com/");
        extractor
            .extract_content()
            .expect("Article extraction failed unexpectedly");
        extractor.extract_img_urls();

        assert!(extractor.img_urls.len() > 0);
        // NOTE(review): the expected URL is absolute, so relative `src` values
        // are presumably resolved against the page URL during
        // `Readability::parse`. Confirm.
        assert_eq!(
            vec![("http://example.com/img.jpg".to_string(), None)],
            extractor.img_urls
        );
    }
}