From db11e78d8cf568d8abf11b74d8964bd140655e55 Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Thu, 22 Oct 2020 13:55:02 +0300 Subject: [PATCH] Add template for epub output Change output format to name file with the title name Add getters in MetaData --- src/extractor.rs | 26 +++++++++++++++++++++++--- src/main.rs | 8 +++++++- src/moz_readability/mod.rs | 14 ++++++++++++-- 3 files changed, 42 insertions(+), 6 deletions(-) diff --git a/src/extractor.rs b/src/extractor.rs index 679f6ba..93ab5bb 100644 --- a/src/extractor.rs +++ b/src/extractor.rs @@ -1,14 +1,15 @@ use async_std::fs::File; use async_std::io::prelude::*; use async_std::task; -use kuchiki::NodeRef; +use kuchiki::{traits::*, NodeRef}; use url::Url; -use super::moz_readability::Readability; +use crate::moz_readability::{MetaData, Readability}; pub type ResourceInfo = (String, Option); pub struct Extractor { + article: Option, pub img_urls: Vec, readability: Readability, } @@ -17,6 +18,7 @@ impl Extractor { /// Create a new instance of an HTML extractor given an HTML string pub fn from_html(html_str: &str) -> Self { Extractor { + article: None, img_urls: Vec::new(), readability: Readability::new(html_str), } @@ -26,6 +28,20 @@ impl Extractor { /// the source of the content pub fn extract_content(&mut self, url: &str) { self.readability.parse(url); + if let Some(article_node_ref) = &self.readability.article_node { + let template = r#" + + + + + + + "#; + let doc = kuchiki::parse_html().one(template); + let body = doc.select_first("body").unwrap(); + body.as_node().append(article_node_ref.clone()); + self.article = Some(doc); + } } /// Traverses the DOM tree of the content and retrieves the IMG URLs @@ -94,7 +110,11 @@ impl Extractor { } pub fn article(&self) -> Option<&NodeRef> { - self.readability.article_node.as_ref() + self.article.as_ref() + } + + pub fn metadata(&self) -> &MetaData { + &self.readability.metadata } } diff --git a/src/main.rs b/src/main.rs index 4c81896..1ea3d62 100644 --- a/src/main.rs +++ b/src/main.rs @@ -46,7 +46,8 @@ fn download(url: String) { .download_images(&Url::parse(&url).unwrap()) .await .expect("Unable to download images"); - let mut out_file = File::create("out.epub").unwrap(); + let mut out_file = + File::create(format!("{}.epub", extractor.metadata().title())).unwrap(); let mut html_buf = Vec::new(); extractor .article() @@ -55,6 +56,11 @@ fn download(url: String) { .expect("Unable to serialize"); let html_buf = std::str::from_utf8(&html_buf).unwrap(); let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap(); + if let Some(author) = extractor.metadata().byline() { + epub.metadata("author", author).unwrap(); + } + epub.metadata("title", extractor.metadata().title()) + .unwrap(); epub.add_content(EpubContent::new("code.xhtml", html_buf.as_bytes())) .unwrap(); for img in extractor.img_urls { diff --git a/src/moz_readability/mod.rs b/src/moz_readability/mod.rs index 99755f3..af1f84e 100644 --- a/src/moz_readability/mod.rs +++ b/src/moz_readability/mod.rs @@ -55,6 +55,7 @@ pub struct Readability { pub article_node: Option, article_dir: Option, flags: u32, + pub metadata: MetaData, } #[derive(Debug, PartialEq)] @@ -72,14 +73,15 @@ impl Readability { article_node: None, article_dir: None, flags: FLAG_STRIP_UNLIKELYS | FLAG_WEIGHT_CLASSES | FLAG_CLEAN_CONDITIONALLY, + metadata: MetaData::new(), } } pub fn parse(&mut self, url: &str) { self.unwrap_no_script_tags(); self.remove_scripts(); self.prep_document(); - let meta_data = self.get_article_metadata(); - self.article_title = meta_data.title.clone(); + self.metadata = self.get_article_metadata(); + self.article_title = self.metadata.title.clone(); self.grab_article(); self.post_process_content(url); } @@ -2077,6 +2079,14 @@ impl MetaData { title: "".into(), } } + + pub fn title(&self) -> &str { + &self.title + } + + pub fn byline(&self) -> Option<&String> { + self.byline.as_ref() + } } #[cfg(test)]