Add custom serializer for XHTML
This commit is contained in:
parent
37cb4e1fd2
commit
5f99bddc10
3 changed files with 58 additions and 11 deletions
|
@ -1,3 +1,5 @@
|
||||||
|
use std::collections::HashMap;
|
||||||
|
|
||||||
use async_std::fs::File;
|
use async_std::fs::File;
|
||||||
use async_std::io::prelude::*;
|
use async_std::io::prelude::*;
|
||||||
use async_std::task;
|
use async_std::task;
|
||||||
|
@ -8,6 +10,10 @@ use crate::moz_readability::{MetaData, Readability};
|
||||||
|
|
||||||
pub type ResourceInfo = (String, Option<String>);
|
pub type ResourceInfo = (String, Option<String>);
|
||||||
|
|
||||||
|
lazy_static! {
|
||||||
|
/// Matches a single `&`, `<` or `>` and captures it as group 1.
/// `serialize_to_xhtml` uses the captured character to look up its replacement
/// when escaping text nodes and attribute values.
static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r"(&|<|>)").unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
pub struct Extractor {
|
pub struct Extractor {
|
||||||
article: Option<NodeRef>,
|
article: Option<NodeRef>,
|
||||||
pub img_urls: Vec<ResourceInfo>,
|
pub img_urls: Vec<ResourceInfo>,
|
||||||
|
@ -165,6 +171,56 @@ fn get_absolute_url(url: &str, request_url: &Url) -> String {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Serializes a NodeRef to a string that is XHTML compatible
|
||||||
|
/// The only DOM nodes serialized are Text and Element nodes
|
||||||
|
pub fn serialize_to_xhtml<W: std::io::Write>(
|
||||||
|
node_ref: &NodeRef,
|
||||||
|
mut w: &mut W,
|
||||||
|
) -> Result<(), Box<dyn std::error::Error>> {
|
||||||
|
let mut escape_map = HashMap::new();
|
||||||
|
escape_map.insert("<", "<");
|
||||||
|
escape_map.insert(">", ">");
|
||||||
|
escape_map.insert("&", "&");
|
||||||
|
for edge in node_ref.traverse_inclusive() {
|
||||||
|
match edge {
|
||||||
|
kuchiki::iter::NodeEdge::Start(n) => match n.data() {
|
||||||
|
kuchiki::NodeData::Text(rc_text) => {
|
||||||
|
let text = rc_text.borrow();
|
||||||
|
let esc_text = ESC_SEQ_REGEX
|
||||||
|
.replace_all(&text, |captures: ®ex::Captures| escape_map[&captures[1]]);
|
||||||
|
write!(&mut w, "{}", esc_text)?;
|
||||||
|
}
|
||||||
|
kuchiki::NodeData::Element(elem_data) => {
|
||||||
|
let attrs = elem_data.attributes.borrow();
|
||||||
|
let attrs_str = attrs
|
||||||
|
.map
|
||||||
|
.iter()
|
||||||
|
.map(|(k, v)| {
|
||||||
|
format!(
|
||||||
|
"{}=\"{}\"",
|
||||||
|
k.local,
|
||||||
|
ESC_SEQ_REGEX
|
||||||
|
.replace_all(&v.value, |captures: ®ex::Captures| {
|
||||||
|
escape_map[&captures[1]]
|
||||||
|
})
|
||||||
|
)
|
||||||
|
})
|
||||||
|
.fold("".to_string(), |acc, val| acc + " " + &val);
|
||||||
|
write!(&mut w, "<{}{}>", &elem_data.name.local, attrs_str)?;
|
||||||
|
}
|
||||||
|
_ => (),
|
||||||
|
},
|
||||||
|
kuchiki::iter::NodeEdge::End(n) => match n.data() {
|
||||||
|
kuchiki::NodeData::Element(elem_data) => {
|
||||||
|
write!(&mut w, "</{}>", &elem_data.name.local)?;
|
||||||
|
}
|
||||||
|
_ => (),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod test {
|
mod test {
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|
|
@ -56,14 +56,9 @@ fn download(urls: Vec<String>) {
|
||||||
let file_name = format!("{}.epub", extractor.metadata().title());
|
let file_name = format!("{}.epub", extractor.metadata().title());
|
||||||
let mut out_file = File::create(&file_name).unwrap();
|
let mut out_file = File::create(&file_name).unwrap();
|
||||||
let mut html_buf = Vec::new();
|
let mut html_buf = Vec::new();
|
||||||
extractor
|
extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf)
|
||||||
.article()
|
.expect("Unable to serialize to xhtml");
|
||||||
.unwrap()
|
|
||||||
.serialize(&mut html_buf)
|
|
||||||
.expect("Unable to serialize");
|
|
||||||
let html_buf = std::str::from_utf8(&html_buf).unwrap();
|
let html_buf = std::str::from_utf8(&html_buf).unwrap();
|
||||||
let html_buf = moz_readability::regexes::REPLACE_SELF_CLOSING_REGEX
|
|
||||||
.replace_all(html_buf, "$tag/>");
|
|
||||||
let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
|
let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
|
||||||
if let Some(author) = extractor.metadata().byline() {
|
if let Some(author) = extractor.metadata().byline() {
|
||||||
epub.metadata("author", author.replace("&", "&"))
|
epub.metadata("author", author.replace("&", "&"))
|
||||||
|
|
|
@ -132,8 +132,4 @@ lazy_static! {
|
||||||
pub static ref REPLACE_END_SEPARATOR_REGEX: Regex =
|
pub static ref REPLACE_END_SEPARATOR_REGEX: Regex =
|
||||||
Regex::new(r"(?i)[^\|\-\\/>»]*[\|\-\\/>»](?P<end>.*)").unwrap();
|
Regex::new(r"(?i)[^\|\-\\/>»]*[\|\-\\/>»](?P<end>.*)").unwrap();
|
||||||
pub static ref REPLACE_MULTI_SEPARATOR_REGEX: Regex = Regex::new(r"[\|\-\\/>»]+").unwrap();
|
pub static ref REPLACE_MULTI_SEPARATOR_REGEX: Regex = Regex::new(r"[\|\-\\/>»]+").unwrap();
|
||||||
pub static ref REPLACE_SELF_CLOSING_REGEX: Regex = Regex::new(
|
|
||||||
r#"(?P<tag><(?:area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr)(?: [a-z\-]+=["'][\sa-zA-Z0-9\./\-_#]+["']|[a-z\-]+)*)>"#
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
}
|
}
|
||||||
|
|
Reference in a new issue