Add template for epub output
Change the output format to name the file after the article title. Add getters in MetaData.
This commit is contained in:
parent
703de7e3bf
commit
db11e78d8c
3 changed files with 42 additions and 6 deletions
|
@ -1,14 +1,15 @@
|
|||
use async_std::fs::File;
|
||||
use async_std::io::prelude::*;
|
||||
use async_std::task;
|
||||
use kuchiki::NodeRef;
|
||||
use kuchiki::{traits::*, NodeRef};
|
||||
use url::Url;
|
||||
|
||||
use super::moz_readability::Readability;
|
||||
use crate::moz_readability::{MetaData, Readability};
|
||||
|
||||
pub type ResourceInfo = (String, Option<String>);
|
||||
|
||||
/// HTML extractor that wraps a `Readability` parser and holds the document
/// built from its result.
pub struct Extractor {
|
||||
/// Document assembled by `extract_content`; `from_html` initialises it to `None`.
article: Option<NodeRef>,
|
||||
/// Collected image resources as `ResourceInfo` pairs (`(String, Option<String>)`).
/// NOTE(review): the meaning of each tuple element is not visible here — confirm.
pub img_urls: Vec<ResourceInfo>,
|
||||
/// Readability parser constructed from the input HTML string.
readability: Readability,
|
||||
}
|
||||
|
@ -17,6 +18,7 @@ impl Extractor {
|
|||
/// Create a new instance of an HTML extractor given an HTML string
|
||||
pub fn from_html(html_str: &str) -> Self {
|
||||
Extractor {
|
||||
article: None,
|
||||
img_urls: Vec::new(),
|
||||
readability: Readability::new(html_str),
|
||||
}
|
||||
|
@ -26,6 +28,20 @@ impl Extractor {
|
|||
/// the source of the content
|
||||
pub fn extract_content(&mut self, url: &str) {
|
||||
// Run the readability pass over the document, forwarding `url` to it.
self.readability.parse(url);
|
||||
// Only build an output document when the parser produced an article node.
if let Some(article_node_ref) = &self.readability.article_node {
|
||||
// Skeleton document declaring the XHTML and EPUB namespaces; the article
// node is appended to its <body> below.
let template = r#"
|
||||
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
|
||||
<head>
|
||||
</head>
|
||||
<body>
|
||||
</body>
|
||||
</html>
|
||||
"#;
|
||||
let doc = kuchiki::parse_html().one(template);
|
||||
// NOTE(review): the unwrap assumes the parsed template always yields a <body> — confirm.
let body = doc.select_first("body").unwrap();
|
||||
// NOTE(review): `clone()` presumably copies the node handle, not the subtree — confirm against kuchiki.
body.as_node().append(article_node_ref.clone());
|
||||
// Keep the wrapped document in `self.article`.
self.article = Some(doc);
|
||||
}
|
||||
}
|
||||
|
||||
/// Traverses the DOM tree of the content and retrieves the IMG URLs
|
||||
|
@ -94,7 +110,11 @@ impl Extractor {
|
|||
}
|
||||
|
||||
/// Returns the document stored in `self.article`.
///
/// The value is `None` until `extract_content` has stored a document there.
pub fn article(&self) -> Option<&NodeRef> {
    // The template-wrapped document is the single source of truth; the raw
    // readability node is no longer handed out directly.
    self.article.as_ref()
}
|
||||
|
||||
/// Borrows the metadata held by the underlying `Readability` parser.
pub fn metadata(&self) -> &MetaData {
    let parser = &self.readability;
    &parser.metadata
}
|
||||
}
|
||||
|
||||
|
|
|
@ -46,7 +46,8 @@ fn download(url: String) {
|
|||
.download_images(&Url::parse(&url).unwrap())
|
||||
.await
|
||||
.expect("Unable to download images");
|
||||
let mut out_file = File::create("out.epub").unwrap();
|
||||
let mut out_file =
|
||||
File::create(format!("{}.epub", extractor.metadata().title())).unwrap();
|
||||
let mut html_buf = Vec::new();
|
||||
extractor
|
||||
.article()
|
||||
|
@ -55,6 +56,11 @@ fn download(url: String) {
|
|||
.expect("Unable to serialize");
|
||||
let html_buf = std::str::from_utf8(&html_buf).unwrap();
|
||||
let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
|
||||
if let Some(author) = extractor.metadata().byline() {
|
||||
epub.metadata("author", author).unwrap();
|
||||
}
|
||||
epub.metadata("title", extractor.metadata().title())
|
||||
.unwrap();
|
||||
epub.add_content(EpubContent::new("code.xhtml", html_buf.as_bytes()))
|
||||
.unwrap();
|
||||
for img in extractor.img_urls {
|
||||
|
|
|
@ -55,6 +55,7 @@ pub struct Readability {
|
|||
pub article_node: Option<NodeRef>,
|
||||
article_dir: Option<String>,
|
||||
flags: u32,
|
||||
pub metadata: MetaData,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
|
@ -72,14 +73,15 @@ impl Readability {
|
|||
article_node: None,
|
||||
article_dir: None,
|
||||
flags: FLAG_STRIP_UNLIKELYS | FLAG_WEIGHT_CLASSES | FLAG_CLEAN_CONDITIONALLY,
|
||||
metadata: MetaData::new(),
|
||||
}
|
||||
}
|
||||
pub fn parse(&mut self, url: &str) {
|
||||
self.unwrap_no_script_tags();
|
||||
self.remove_scripts();
|
||||
self.prep_document();
|
||||
let meta_data = self.get_article_metadata();
|
||||
self.article_title = meta_data.title.clone();
|
||||
self.metadata = self.get_article_metadata();
|
||||
self.article_title = self.metadata.title.clone();
|
||||
self.grab_article();
|
||||
self.post_process_content(url);
|
||||
}
|
||||
|
@ -2077,6 +2079,14 @@ impl MetaData {
|
|||
title: "".into(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn title(&self) -> &str {
|
||||
&self.title
|
||||
}
|
||||
|
||||
pub fn byline(&self) -> Option<&String> {
|
||||
self.byline.as_ref()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
|
Loading…
Reference in a new issue