feat: add reinsertion of title as <h1> requested in #22
This commit is contained in:
parent
3bf0719c8e
commit
e777426c1b
3 changed files with 21 additions and 3 deletions
|
@ -1,8 +1,11 @@
|
||||||
|
use std::collections::BTreeMap;
|
||||||
|
|
||||||
|
use html5ever::{LocalName, Namespace, QualName};
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
use kuchiki::{traits::*, NodeRef};
|
use kuchiki::{traits::*, NodeRef};
|
||||||
|
|
||||||
use crate::errors::PaperoniError;
|
use crate::errors::PaperoniError;
|
||||||
use crate::moz_readability::{MetaData, Readability};
|
use crate::moz_readability::{MetaData, Readability, HTML_NS};
|
||||||
|
|
||||||
/// A tuple of the url and an Option of the resource's MIME type
|
/// A tuple of the url and an Option of the resource's MIME type
|
||||||
pub type ResourceInfo = (String, Option<String>);
|
pub type ResourceInfo = (String, Option<String>);
|
||||||
|
@ -29,6 +32,7 @@ impl Article {
|
||||||
/// the source of the content
|
/// the source of the content
|
||||||
pub fn extract_content(&mut self) -> Result<(), PaperoniError> {
|
pub fn extract_content(&mut self) -> Result<(), PaperoniError> {
|
||||||
self.readability.parse(&self.url)?;
|
self.readability.parse(&self.url)?;
|
||||||
|
self.reinsert_title_heading();
|
||||||
if let Some(article_node_ref) = &self.readability.article_node {
|
if let Some(article_node_ref) = &self.readability.article_node {
|
||||||
let template = r#"
|
let template = r#"
|
||||||
<!DOCTYPE html>
|
<!DOCTYPE html>
|
||||||
|
@ -74,6 +78,20 @@ impl Article {
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn reinsert_title_heading(&mut self) {
|
||||||
|
if let Some(article_node_ref) = &self.readability.article_node {
|
||||||
|
if let Ok(article_root_ref) = article_node_ref.select_first("div#readability-page-1") {
|
||||||
|
let article_root_elem = article_root_ref.as_node();
|
||||||
|
let h1_elem = NodeRef::new_element(
|
||||||
|
QualName::new(None, Namespace::from(HTML_NS), LocalName::from("h1")),
|
||||||
|
BTreeMap::new(),
|
||||||
|
);
|
||||||
|
h1_elem.append(NodeRef::new_text(self.readability.metadata.title()));
|
||||||
|
article_root_elem.prepend(h1_elem);
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub fn metadata(&self) -> &MetaData {
|
pub fn metadata(&self) -> &MetaData {
|
||||||
&self.readability.metadata
|
&self.readability.metadata
|
||||||
}
|
}
|
||||||
|
|
|
@ -132,7 +132,7 @@ struct PaperteerResponse {
|
||||||
// This also means that extracting and downloading imgs should be handled externally
|
// This also means that extracting and downloading imgs should be handled externally
|
||||||
async fn fetch_html_from_paperteer(
|
async fn fetch_html_from_paperteer(
|
||||||
urls: Vec<String>,
|
urls: Vec<String>,
|
||||||
app_config: &AppConfig,
|
_app_config: &AppConfig,
|
||||||
bar: &ProgressBar,
|
bar: &ProgressBar,
|
||||||
partial_downloads: &mut Vec<PartialDownload>,
|
partial_downloads: &mut Vec<PartialDownload>,
|
||||||
errors: &mut Vec<PaperoniError>,
|
errors: &mut Vec<PaperoniError>,
|
||||||
|
|
|
@ -17,7 +17,7 @@ const FLAG_STRIP_UNLIKELYS: u32 = 0x1;
|
||||||
const FLAG_WEIGHT_CLASSES: u32 = 0x2;
|
const FLAG_WEIGHT_CLASSES: u32 = 0x2;
|
||||||
const FLAG_CLEAN_CONDITIONALLY: u32 = 0x4;
|
const FLAG_CLEAN_CONDITIONALLY: u32 = 0x4;
|
||||||
const READABILITY_SCORE: &'static str = "readability-score";
|
const READABILITY_SCORE: &'static str = "readability-score";
|
||||||
const HTML_NS: &'static str = "http://www.w3.org/1999/xhtml";
|
pub const HTML_NS: &'static str = "http://www.w3.org/1999/xhtml";
|
||||||
// TODO: Change to HashSet
|
// TODO: Change to HashSet
|
||||||
const PHRASING_ELEMS: [&str; 39] = [
|
const PHRASING_ELEMS: [&str; 39] = [
|
||||||
"abbr", "audio", "b", "bdo", "br", "button", "cite", "code", "data", "datalist", "dfn", "em",
|
"abbr", "audio", "b", "bdo", "br", "button", "cite", "code", "data", "datalist", "dfn", "em",
|
||||||
|
|
Reference in a new issue