feat: add reinsertion of title as <h1> requested in #22
This commit is contained in:
parent
3bf0719c8e
commit
e777426c1b
3 changed files with 21 additions and 3 deletions
|
@ -1,8 +1,11 @@
|
|||
use std::collections::BTreeMap;
|
||||
|
||||
use html5ever::{LocalName, Namespace, QualName};
|
||||
use itertools::Itertools;
|
||||
use kuchiki::{traits::*, NodeRef};
|
||||
|
||||
use crate::errors::PaperoniError;
|
||||
use crate::moz_readability::{MetaData, Readability};
|
||||
use crate::moz_readability::{MetaData, Readability, HTML_NS};
|
||||
|
||||
/// A tuple of the url and an Option of the resource's MIME type
|
||||
pub type ResourceInfo = (String, Option<String>);
|
||||
|
@ -29,6 +32,7 @@ impl Article {
|
|||
/// the source of the content
|
||||
pub fn extract_content(&mut self) -> Result<(), PaperoniError> {
|
||||
self.readability.parse(&self.url)?;
|
||||
self.reinsert_title_heading();
|
||||
if let Some(article_node_ref) = &self.readability.article_node {
|
||||
let template = r#"
|
||||
<!DOCTYPE html>
|
||||
|
@ -74,6 +78,20 @@ impl Article {
|
|||
)
|
||||
}
|
||||
|
||||
fn reinsert_title_heading(&mut self) {
|
||||
if let Some(article_node_ref) = &self.readability.article_node {
|
||||
if let Ok(article_root_ref) = article_node_ref.select_first("div#readability-page-1") {
|
||||
let article_root_elem = article_root_ref.as_node();
|
||||
let h1_elem = NodeRef::new_element(
|
||||
QualName::new(None, Namespace::from(HTML_NS), LocalName::from("h1")),
|
||||
BTreeMap::new(),
|
||||
);
|
||||
h1_elem.append(NodeRef::new_text(self.readability.metadata.title()));
|
||||
article_root_elem.prepend(h1_elem);
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/// Borrows the `metadata` field of this article's `readability` state.
pub fn metadata(&self) -> &MetaData {
    let readability = &self.readability;
    &readability.metadata
}
|
||||
|
|
|
@ -132,7 +132,7 @@ struct PaperteerResponse {
|
|||
// This also means that extracting and downloading imgs should be handled externally
|
||||
async fn fetch_html_from_paperteer(
|
||||
urls: Vec<String>,
|
||||
app_config: &AppConfig,
|
||||
_app_config: &AppConfig,
|
||||
bar: &ProgressBar,
|
||||
partial_downloads: &mut Vec<PartialDownload>,
|
||||
errors: &mut Vec<PaperoniError>,
|
||||
|
|
|
@ -17,7 +17,7 @@ const FLAG_STRIP_UNLIKELYS: u32 = 0x1;
|
|||
const FLAG_WEIGHT_CLASSES: u32 = 0x2;
|
||||
const FLAG_CLEAN_CONDITIONALLY: u32 = 0x4;
|
||||
const READABILITY_SCORE: &'static str = "readability-score";
|
||||
const HTML_NS: &'static str = "http://www.w3.org/1999/xhtml";
|
||||
pub const HTML_NS: &'static str = "http://www.w3.org/1999/xhtml";
|
||||
// TODO: Change to HashSet
|
||||
const PHRASING_ELEMS: [&str; 39] = [
|
||||
"abbr", "audio", "b", "bdo", "br", "button", "cite", "code", "data", "datalist", "dfn", "em",
|
||||
|
|
Reference in a new issue