Add template for epub output
Change the output format to name the file after the article title. Add getters in MetaData.
This commit is contained in:
parent
703de7e3bf
commit
db11e78d8c
3 changed files with 42 additions and 6 deletions
|
@ -1,14 +1,15 @@
|
||||||
use async_std::fs::File;
|
use async_std::fs::File;
|
||||||
use async_std::io::prelude::*;
|
use async_std::io::prelude::*;
|
||||||
use async_std::task;
|
use async_std::task;
|
||||||
use kuchiki::NodeRef;
|
use kuchiki::{traits::*, NodeRef};
|
||||||
use url::Url;
|
use url::Url;
|
||||||
|
|
||||||
use super::moz_readability::Readability;
|
use crate::moz_readability::{MetaData, Readability};
|
||||||
|
|
||||||
pub type ResourceInfo = (String, Option<String>);
|
pub type ResourceInfo = (String, Option<String>);
|
||||||
|
|
||||||
pub struct Extractor {
|
pub struct Extractor {
|
||||||
|
article: Option<NodeRef>,
|
||||||
pub img_urls: Vec<ResourceInfo>,
|
pub img_urls: Vec<ResourceInfo>,
|
||||||
readability: Readability,
|
readability: Readability,
|
||||||
}
|
}
|
||||||
|
@ -17,6 +18,7 @@ impl Extractor {
|
||||||
/// Create a new instance of an HTML extractor given an HTML string
|
/// Create a new instance of an HTML extractor given an HTML string
|
||||||
pub fn from_html(html_str: &str) -> Self {
|
pub fn from_html(html_str: &str) -> Self {
|
||||||
Extractor {
|
Extractor {
|
||||||
|
article: None,
|
||||||
img_urls: Vec::new(),
|
img_urls: Vec::new(),
|
||||||
readability: Readability::new(html_str),
|
readability: Readability::new(html_str),
|
||||||
}
|
}
|
||||||
|
@ -26,6 +28,20 @@ impl Extractor {
|
||||||
/// the source of the content
|
/// the source of the content
|
||||||
pub fn extract_content(&mut self, url: &str) {
|
pub fn extract_content(&mut self, url: &str) {
|
||||||
self.readability.parse(url);
|
self.readability.parse(url);
|
||||||
|
if let Some(article_node_ref) = &self.readability.article_node {
|
||||||
|
let template = r#"
|
||||||
|
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
|
||||||
|
<head>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"#;
|
||||||
|
let doc = kuchiki::parse_html().one(template);
|
||||||
|
let body = doc.select_first("body").unwrap();
|
||||||
|
body.as_node().append(article_node_ref.clone());
|
||||||
|
self.article = Some(doc);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Traverses the DOM tree of the content and retrieves the IMG URLs
|
/// Traverses the DOM tree of the content and retrieves the IMG URLs
|
||||||
|
@ -94,7 +110,11 @@ impl Extractor {
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn article(&self) -> Option<&NodeRef> {
|
pub fn article(&self) -> Option<&NodeRef> {
|
||||||
self.readability.article_node.as_ref()
|
self.article.as_ref()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn metadata(&self) -> &MetaData {
|
||||||
|
&self.readability.metadata
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -46,7 +46,8 @@ fn download(url: String) {
|
||||||
.download_images(&Url::parse(&url).unwrap())
|
.download_images(&Url::parse(&url).unwrap())
|
||||||
.await
|
.await
|
||||||
.expect("Unable to download images");
|
.expect("Unable to download images");
|
||||||
let mut out_file = File::create("out.epub").unwrap();
|
let mut out_file =
|
||||||
|
File::create(format!("{}.epub", extractor.metadata().title())).unwrap();
|
||||||
let mut html_buf = Vec::new();
|
let mut html_buf = Vec::new();
|
||||||
extractor
|
extractor
|
||||||
.article()
|
.article()
|
||||||
|
@ -55,6 +56,11 @@ fn download(url: String) {
|
||||||
.expect("Unable to serialize");
|
.expect("Unable to serialize");
|
||||||
let html_buf = std::str::from_utf8(&html_buf).unwrap();
|
let html_buf = std::str::from_utf8(&html_buf).unwrap();
|
||||||
let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
|
let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
|
||||||
|
if let Some(author) = extractor.metadata().byline() {
|
||||||
|
epub.metadata("author", author).unwrap();
|
||||||
|
}
|
||||||
|
epub.metadata("title", extractor.metadata().title())
|
||||||
|
.unwrap();
|
||||||
epub.add_content(EpubContent::new("code.xhtml", html_buf.as_bytes()))
|
epub.add_content(EpubContent::new("code.xhtml", html_buf.as_bytes()))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
for img in extractor.img_urls {
|
for img in extractor.img_urls {
|
||||||
|
|
|
@ -55,6 +55,7 @@ pub struct Readability {
|
||||||
pub article_node: Option<NodeRef>,
|
pub article_node: Option<NodeRef>,
|
||||||
article_dir: Option<String>,
|
article_dir: Option<String>,
|
||||||
flags: u32,
|
flags: u32,
|
||||||
|
pub metadata: MetaData,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, PartialEq)]
|
#[derive(Debug, PartialEq)]
|
||||||
|
@ -72,14 +73,15 @@ impl Readability {
|
||||||
article_node: None,
|
article_node: None,
|
||||||
article_dir: None,
|
article_dir: None,
|
||||||
flags: FLAG_STRIP_UNLIKELYS | FLAG_WEIGHT_CLASSES | FLAG_CLEAN_CONDITIONALLY,
|
flags: FLAG_STRIP_UNLIKELYS | FLAG_WEIGHT_CLASSES | FLAG_CLEAN_CONDITIONALLY,
|
||||||
|
metadata: MetaData::new(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
pub fn parse(&mut self, url: &str) {
|
pub fn parse(&mut self, url: &str) {
|
||||||
self.unwrap_no_script_tags();
|
self.unwrap_no_script_tags();
|
||||||
self.remove_scripts();
|
self.remove_scripts();
|
||||||
self.prep_document();
|
self.prep_document();
|
||||||
let meta_data = self.get_article_metadata();
|
self.metadata = self.get_article_metadata();
|
||||||
self.article_title = meta_data.title.clone();
|
self.article_title = self.metadata.title.clone();
|
||||||
self.grab_article();
|
self.grab_article();
|
||||||
self.post_process_content(url);
|
self.post_process_content(url);
|
||||||
}
|
}
|
||||||
|
@ -2077,6 +2079,14 @@ impl MetaData {
|
||||||
title: "".into(),
|
title: "".into(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn title(&self) -> &str {
|
||||||
|
&self.title
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn byline(&self) -> Option<&String> {
|
||||||
|
self.byline.as_ref()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
|
|
Reference in a new issue