From e777426c1b08e31883a533b9b2fc83cc112a829f Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Thu, 30 Dec 2021 07:58:19 +0300 Subject: [PATCH] feat: add reinsertion of title as

requested in #22 --- src/extractor.rs | 20 +++++++++++++++++++- src/http.rs | 2 +- src/moz_readability/mod.rs | 2 +- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/src/extractor.rs b/src/extractor.rs index fcd13a4..516ab0a 100644 --- a/src/extractor.rs +++ b/src/extractor.rs @@ -1,8 +1,11 @@ +use std::collections::BTreeMap; + +use html5ever::{LocalName, Namespace, QualName}; use itertools::Itertools; use kuchiki::{traits::*, NodeRef}; use crate::errors::PaperoniError; -use crate::moz_readability::{MetaData, Readability}; +use crate::moz_readability::{MetaData, Readability, HTML_NS}; /// A tuple of the url and an Option of the resource's MIME type pub type ResourceInfo = (String, Option); @@ -29,6 +32,7 @@ impl Article { /// the source of the content pub fn extract_content(&mut self) -> Result<(), PaperoniError> { self.readability.parse(&self.url)?; + self.reinsert_title_heading(); if let Some(article_node_ref) = &self.readability.article_node { let template = r#" @@ -74,6 +78,20 @@ impl Article { ) } + fn reinsert_title_heading(&mut self) { + if let Some(article_node_ref) = &self.readability.article_node { + if let Ok(article_root_ref) = article_node_ref.select_first("div#readability-page-1") { + let article_root_elem = article_root_ref.as_node(); + let h1_elem = NodeRef::new_element( + QualName::new(None, Namespace::from(HTML_NS), LocalName::from("h1")), + BTreeMap::new(), + ); + h1_elem.append(NodeRef::new_text(self.readability.metadata.title())); + article_root_elem.prepend(h1_elem); + }; + } + } + pub fn metadata(&self) -> &MetaData { &self.readability.metadata } diff --git a/src/http.rs b/src/http.rs index b3140d9..38162b8 100644 --- a/src/http.rs +++ b/src/http.rs @@ -132,7 +132,7 @@ struct PaperteerResponse { // This also means that extracting and downloading imgs should be handled externally async fn fetch_html_from_paperteer( urls: Vec, - app_config: &AppConfig, + _app_config: &AppConfig, bar: &ProgressBar, partial_downloads: &mut Vec, errors: &mut Vec, diff --git a/src/moz_readability/mod.rs b/src/moz_readability/mod.rs index 0f4fc66..a1f22d2 100644 --- a/src/moz_readability/mod.rs +++ b/src/moz_readability/mod.rs @@ -17,7 +17,7 @@ const FLAG_STRIP_UNLIKELYS: u32 = 0x1; const FLAG_WEIGHT_CLASSES: u32 = 0x2; const FLAG_CLEAN_CONDITIONALLY: u32 = 0x4; const READABILITY_SCORE: &'static str = "readability-score"; -const HTML_NS: &'static str = "http://www.w3.org/1999/xhtml"; +pub const HTML_NS: &'static str = "http://www.w3.org/1999/xhtml"; // TODO: Change to HashSet const PHRASING_ELEMS: [&str; 39] = [ "abbr", "audio", "b", "bdo", "br", "button", "cite", "code", "data", "datalist", "dfn", "em",