use std::collections::{BTreeMap, HashMap, HashSet}; use std::str::FromStr; use html5ever::{LocalName, Namespace, QualName}; use kuchiki::{ iter::{Descendants, Elements, Select}, traits::*, NodeData, NodeRef, }; use url::Url; const SHARE_ELEMENT_THRESHOLD: usize = 500; const READABILITY_SCORE: &'static str = "readability-score"; const HTML_NS: &'static str = "http://www.w3.org/1999/xhtml"; // TODO: Change to HashSet const PHRASING_ELEMS: [&str; 39] = [ "abbr", "audio", "b", "bdo", "br", "button", "cite", "code", "data", "datalist", "dfn", "em", "embed", "i", "img", "input", "kbd", "label", "mark", "math", "meter", "noscript", "object", "output", "progress", "q", "ruby", "samp", "script", "select", "small", "span", "strong", "sub", "sup", "textarea", "time", "var", "wbr", ]; // TODO: Change to HashSet const DEFAULT_TAGS_TO_SCORE: [&str; 9] = ["section", "h2", "h3", "h4", "h5", "h6", "p", "td", "pre"]; // TODO: Change to HashSet const ALTER_TO_DIV_EXCEPTIONS: [&str; 4] = ["div", "article", "section", "p"]; const PRESENTATIONAL_ATTRIBUTES: [&str; 12] = [ "align", "background", "bgcolor", "border", "cellpadding", "cellspacing", "frame", "hspace", "rules", "style", "valign", "vspace", ]; const DATA_TABLE_DESCENDANTS: [&str; 5] = ["col", "colgroup", "tfoot", "thead", "th"]; // TODO: Change to HashSet const DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [&str; 5] = ["table", "th", "td", "hr", "pre"]; mod regexes; pub struct Readability { root_node: NodeRef, byline: Option, article_title: String, pub article_node: Option, article_dir: Option, } #[derive(Debug, PartialEq)] struct SizeInfo { rows: usize, columns: usize, } impl Readability { pub fn new(html_str: &str) -> Self { Self { root_node: kuchiki::parse_html().one(html_str), byline: None, article_title: "".into(), article_node: None, article_dir: None, } } pub fn parse(&mut self, url: &str) { self.unwrap_no_script_tags(); self.remove_scripts(); self.prep_document(); let meta_data = self.get_article_metadata(); self.article_title = meta_data.title.clone(); self.grab_article(); self.post_process_content(url); } /// Recursively check if node is image, or if node contains exactly only one image /// whether as a direct child or as its descendants. fn is_single_image(node_ref: &NodeRef) -> bool { if let Some(element) = node_ref.as_element() { if &element.name.local == "img" { return true; } } if node_ref.children().filter(Self::has_content).count() != 1 || !node_ref.text_contents().trim().is_empty() { return false; } return Readability::is_single_image( &node_ref .children() .filter(Self::has_content) .next() .expect("Unable to get first child which should exist"), ); } fn has_content(node_ref: &NodeRef) -> bool { match node_ref.data() { NodeData::Text(text) => !text.borrow().trim().is_empty(), _ => true, } } /// Find all