Add template for epub output

Change the output file name to use the article title
Add getters in MetaData
This commit is contained in:
Kenneth Gitere 2020-10-22 13:55:02 +03:00
parent 703de7e3bf
commit db11e78d8c
3 changed files with 42 additions and 6 deletions

View file

@ -1,14 +1,15 @@
use async_std::fs::File; use async_std::fs::File;
use async_std::io::prelude::*; use async_std::io::prelude::*;
use async_std::task; use async_std::task;
use kuchiki::NodeRef; use kuchiki::{traits::*, NodeRef};
use url::Url; use url::Url;
use super::moz_readability::Readability; use crate::moz_readability::{MetaData, Readability};
/// Pair of a `String` and an optional `String`; used as the element type of
/// `Extractor::img_urls`.
/// NOTE(review): the meaning of each component is not visible in this view — confirm
/// against the code that fills `img_urls`.
pub type ResourceInfo = (String, Option<String>); pub type ResourceInfo = (String, Option<String>);
/// HTML extractor: owns a `Readability` parser plus the article document and the
/// image resource list derived from it.
pub struct Extractor { pub struct Extractor {
/// Document wrapping the extracted article. Initialised to `None` by `from_html` and
/// set by `extract_content` when the parser holds an article node.
article: Option<NodeRef>,
/// Image resources collected for the content.
/// NOTE(review): the code that fills this list is outside this view — confirm.
pub img_urls: Vec<ResourceInfo>, pub img_urls: Vec<ResourceInfo>,
/// Parser constructed from the HTML string passed to `from_html`.
readability: Readability, readability: Readability,
} }
@ -17,6 +18,7 @@ impl Extractor {
/// Create a new instance of an HTML extractor given an HTML string /// Create a new instance of an HTML extractor given an HTML string
pub fn from_html(html_str: &str) -> Self { pub fn from_html(html_str: &str) -> Self {
Extractor { Extractor {
article: None,
img_urls: Vec::new(), img_urls: Vec::new(),
readability: Readability::new(html_str), readability: Readability::new(html_str),
} }
@ -26,6 +28,20 @@ impl Extractor {
/// the source of the content /// the source of the content
pub fn extract_content(&mut self, url: &str) { pub fn extract_content(&mut self, url: &str) {
// Parse the document first; the article node read below is taken from the parser afterwards.
self.readability.parse(url); self.readability.parse(url);
// Only build an output document when the parser holds an article node;
// otherwise `self.article` is left untouched.
if let Some(article_node_ref) = &self.readability.article_node {
// Minimal skeleton with the XHTML and EPUB namespaces declared on <html>;
// the article is placed inside its <body>.
let template = r#"
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
<head>
</head>
<body>
</body>
</html>
"#;
let doc = kuchiki::parse_html().one(template);
// `unwrap` panics if no <body> element is found in the parsed template.
let body = doc.select_first("body").unwrap();
// Attach the article node under the template's <body>.
// NOTE(review): `clone()` copies the `NodeRef` handle; confirm whether `append`
// detaches the node from the tree it currently lives in.
body.as_node().append(article_node_ref.clone());
self.article = Some(doc);
}
} }
/// Traverses the DOM tree of the content and retrieves the IMG URLs /// Traverses the DOM tree of the content and retrieves the IMG URLs
@ -94,7 +110,11 @@ impl Extractor {
} }
/// Returns a reference to the stored article node, or `None` when there is none.
/// In this revision the value comes from `self.article`; the previous revision
/// returned `self.readability.article_node` instead.
pub fn article(&self) -> Option<&NodeRef> { pub fn article(&self) -> Option<&NodeRef> {
self.readability.article_node.as_ref() self.article.as_ref()
}
/// Returns a reference to the `MetaData` held by the underlying `Readability` instance.
pub fn metadata(&self) -> &MetaData {
&self.readability.metadata
} }
} }

View file

@ -46,7 +46,8 @@ fn download(url: String) {
.download_images(&Url::parse(&url).unwrap()) .download_images(&Url::parse(&url).unwrap())
.await .await
.expect("Unable to download images"); .expect("Unable to download images");
let mut out_file = File::create("out.epub").unwrap(); let mut out_file =
File::create(format!("{}.epub", extractor.metadata().title())).unwrap();
let mut html_buf = Vec::new(); let mut html_buf = Vec::new();
extractor extractor
.article() .article()
@ -55,6 +56,11 @@ fn download(url: String) {
.expect("Unable to serialize"); .expect("Unable to serialize");
let html_buf = std::str::from_utf8(&html_buf).unwrap(); let html_buf = std::str::from_utf8(&html_buf).unwrap();
let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap(); let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
if let Some(author) = extractor.metadata().byline() {
epub.metadata("author", author).unwrap();
}
epub.metadata("title", extractor.metadata().title())
.unwrap();
epub.add_content(EpubContent::new("code.xhtml", html_buf.as_bytes())) epub.add_content(EpubContent::new("code.xhtml", html_buf.as_bytes()))
.unwrap(); .unwrap();
for img in extractor.img_urls { for img in extractor.img_urls {

View file

@ -55,6 +55,7 @@ pub struct Readability {
pub article_node: Option<NodeRef>, pub article_node: Option<NodeRef>,
article_dir: Option<String>, article_dir: Option<String>,
flags: u32, flags: u32,
pub metadata: MetaData,
} }
#[derive(Debug, PartialEq)] #[derive(Debug, PartialEq)]
@ -72,14 +73,15 @@ impl Readability {
article_node: None, article_node: None,
article_dir: None, article_dir: None,
flags: FLAG_STRIP_UNLIKELYS | FLAG_WEIGHT_CLASSES | FLAG_CLEAN_CONDITIONALLY, flags: FLAG_STRIP_UNLIKELYS | FLAG_WEIGHT_CLASSES | FLAG_CLEAN_CONDITIONALLY,
metadata: MetaData::new(),
} }
} }
/// Runs the parsing steps in a fixed order: `unwrap_no_script_tags`, `remove_scripts`,
/// `prep_document`, metadata collection, `grab_article`, then `post_process_content`.
/// `url` is only forwarded to `post_process_content`.
pub fn parse(&mut self, url: &str) { pub fn parse(&mut self, url: &str) {
self.unwrap_no_script_tags(); self.unwrap_no_script_tags();
self.remove_scripts(); self.remove_scripts();
self.prep_document(); self.prep_document();
// This revision stores the collected metadata on `self.metadata`; the previous
// revision kept it in a local `meta_data` binding.
let meta_data = self.get_article_metadata(); self.metadata = self.get_article_metadata();
// The article title is a clone of the metadata's title.
self.article_title = meta_data.title.clone(); self.article_title = self.metadata.title.clone();
self.grab_article(); self.grab_article();
self.post_process_content(url); self.post_process_content(url);
} }
@ -2077,6 +2079,14 @@ impl MetaData {
title: "".into(), title: "".into(),
} }
} }
/// Returns the title stored in this metadata as a string slice.
pub fn title(&self) -> &str {
    self.title.as_str()
}
/// Returns a reference to the byline, or `None` when no byline is set.
pub fn byline(&self) -> Option<&String> {
    match &self.byline {
        Some(author) => Some(author),
        None => None,
    }
}
} }
#[cfg(test)] #[cfg(test)]