use std::collections::{BTreeMap, HashMap};

use crate::extractor::MetaAttr;

use html5ever::{LocalName, Namespace, QualName};
use kuchiki::{
    iter::{Descendants, Elements, Select},
    traits::*,
    NodeData, NodeRef,
};
use regex::Regex;

/// Namespace URI of XHTML.
const HTML_NS: &str = "http://www.w3.org/1999/xhtml";

/// Tag names of the elements this module treats as phrasing (inline-level) content.
const PHRASING_ELEMS: [&str; 39] = [
    "abbr", "audio", "b", "bdo", "br", "button", "cite", "code", "data", "datalist", "dfn", "em",
    "embed", "i", "img", "input", "kbd", "label", "mark", "math", "meter", "noscript", "object",
    "output", "progress", "q", "ruby", "samp", "script", "select", "small", "span", "strong",
    "sub", "sup", "textarea", "time", "var", "wbr",
];

/// Holds a parsed HTML document together with the metadata gathered while processing it.
pub struct Readability {
    /// Root of the DOM tree produced from the input HTML.
    root_node: NodeRef,
    /// Author line of the article; `None` until one has been found.
    // NOTE(review): the type parameter was missing in the reviewed source (`Option` alone does
    // not compile); `String` is assumed here - confirm against the code that assigns `byline`.
    byline: Option<String>,
    /// Title of the article; empty until one has been set.
    article_title: String,
}

/// A pair of row and column counts.
// NOTE(review): presumably describes the dimensions of a table - confirm against its users.
#[derive(Debug, PartialEq)]
struct SizeInfo {
    rows: usize,
    columns: usize,
}

impl Readability {
    /// Parses `html_str` into a DOM tree and wraps it in a new `Readability` with no byline and
    /// an empty article title.
    pub fn new(html_str: &str) -> Self {
        Self {
            root_node: kuchiki::parse_html().one(html_str),
            byline: None,
            article_title: String::new(),
        }
    }

    /// Runs the processing steps on the document in order: unwrapping `noscript` tags, removing
    /// scripts and then preparing the document.
    pub fn parse(&mut self) {
        self.unwrap_no_script_tags();
        self.remove_scripts();
        self.prep_document();
        // TODO: Add implementation for get_article_metadata
    }

    /// Recursively check if node is image, or if node contains exactly only one image
    /// whether as a direct child or as its descendants.
    ///
    /// Returns `false` as soon as a node on the way down has no child with content, more than
    /// one such child, or any non-whitespace text.
    fn is_single_image(node_ref: &NodeRef) -> bool {
        if let Some(element) = node_ref.as_element() {
            if &element.name.local == "img" {
                return true;
            }
        }

        // Walk the children once: pulling two items is enough to tell "exactly one child with
        // content" apart from "none" and "several", and it hands us that child directly.
        let mut content_children = node_ref.children().filter(Self::has_content);
        match (content_children.next(), content_children.next()) {
            (Some(only_child), None) if node_ref.text_contents().trim().is_empty() => {
                Self::is_single_image(&only_child)
            }
            _ => false,
        }
    }

    /// Returns `false` only for text nodes that are empty or whitespace-only; every other kind
    /// of node counts as content.
    fn has_content(node_ref: &NodeRef) -> bool {
        match node_ref.data() {
            NodeData::Text(text) => !text.borrow().trim().is_empty(),
            _ => true,
        }
    }

    /// Find all