feat: add reinsertion of title as <h1> requested in #22

This commit is contained in:
Kenneth Gitere 2021-12-30 07:58:19 +03:00
parent 3bf0719c8e
commit e777426c1b
3 changed files with 21 additions and 3 deletions

View file

@ -1,8 +1,11 @@
use std::collections::BTreeMap;
use html5ever::{LocalName, Namespace, QualName};
use itertools::Itertools; use itertools::Itertools;
use kuchiki::{traits::*, NodeRef}; use kuchiki::{traits::*, NodeRef};
use crate::errors::PaperoniError; use crate::errors::PaperoniError;
use crate::moz_readability::{MetaData, Readability}; use crate::moz_readability::{MetaData, Readability, HTML_NS};
/// A tuple of the url and an Option of the resource's MIME type /// A tuple of the url and an Option of the resource's MIME type
pub type ResourceInfo = (String, Option<String>); pub type ResourceInfo = (String, Option<String>);
@ -29,6 +32,7 @@ impl Article {
/// the source of the content /// the source of the content
pub fn extract_content(&mut self) -> Result<(), PaperoniError> { pub fn extract_content(&mut self) -> Result<(), PaperoniError> {
self.readability.parse(&self.url)?; self.readability.parse(&self.url)?;
self.reinsert_title_heading();
if let Some(article_node_ref) = &self.readability.article_node { if let Some(article_node_ref) = &self.readability.article_node {
let template = r#" let template = r#"
<!DOCTYPE html> <!DOCTYPE html>
@ -74,6 +78,20 @@ impl Article {
) )
} }
/// Puts the article title back at the top of the extracted content as an `<h1>`.
///
/// The heading text is taken from `self.readability.metadata.title()`. Nothing is
/// changed when there is no article node, or when `select_first` finds no
/// `div#readability-page-1` for that node.
fn reinsert_title_heading(&mut self) {
    // Resolve the container that receives the heading, bailing out quietly if
    // either lookup step comes up empty.
    let page_root = match &self.readability.article_node {
        Some(article) => match article.select_first("div#readability-page-1") {
            Ok(found) => found,
            Err(_) => return,
        },
        None => return,
    };

    // Build `<h1>` in the HTML namespace with no attributes, then give it the
    // title as its only text child.
    let heading_name = QualName::new(None, Namespace::from(HTML_NS), LocalName::from("h1"));
    let heading = NodeRef::new_element(heading_name, BTreeMap::new());
    heading.append(NodeRef::new_text(self.readability.metadata.title()));

    // Insert the heading as the first child of the container.
    page_root.as_node().prepend(heading);
}
pub fn metadata(&self) -> &MetaData { pub fn metadata(&self) -> &MetaData {
&self.readability.metadata &self.readability.metadata
} }

View file

@ -132,7 +132,7 @@ struct PaperteerResponse {
// This also means that extracting and downloading imgs should be handled externally // This also means that extracting and downloading imgs should be handled externally
async fn fetch_html_from_paperteer( async fn fetch_html_from_paperteer(
urls: Vec<String>, urls: Vec<String>,
app_config: &AppConfig, _app_config: &AppConfig,
bar: &ProgressBar, bar: &ProgressBar,
partial_downloads: &mut Vec<PartialDownload>, partial_downloads: &mut Vec<PartialDownload>,
errors: &mut Vec<PaperoniError>, errors: &mut Vec<PaperoniError>,

View file

@ -17,7 +17,7 @@ const FLAG_STRIP_UNLIKELYS: u32 = 0x1;
const FLAG_WEIGHT_CLASSES: u32 = 0x2; const FLAG_WEIGHT_CLASSES: u32 = 0x2;
const FLAG_CLEAN_CONDITIONALLY: u32 = 0x4; const FLAG_CLEAN_CONDITIONALLY: u32 = 0x4;
const READABILITY_SCORE: &'static str = "readability-score"; const READABILITY_SCORE: &'static str = "readability-score";
const HTML_NS: &'static str = "http://www.w3.org/1999/xhtml"; pub const HTML_NS: &'static str = "http://www.w3.org/1999/xhtml";
// TODO: Change to HashSet // TODO: Change to HashSet
const PHRASING_ELEMS: [&str; 39] = [ const PHRASING_ELEMS: [&str; 39] = [
"abbr", "audio", "b", "bdo", "br", "button", "cite", "code", "data", "datalist", "dfn", "em", "abbr", "audio", "b", "bdo", "br", "button", "cite", "code", "data", "datalist", "dfn", "em",