refactor: rename Extractor to Article
This commit is contained in:
parent
eac28da798
commit
e6f901eb5a
4 changed files with 38 additions and 38 deletions
14
src/epub.rs
14
src/epub.rs
|
@ -8,7 +8,7 @@ use indicatif::{ProgressBar, ProgressStyle};
|
||||||
use kuchiki::NodeRef;
|
use kuchiki::NodeRef;
|
||||||
use log::{debug, error, info};
|
use log::{debug, error, info};
|
||||||
|
|
||||||
use crate::{cli::AppConfig, errors::PaperoniError, extractor::Extractor};
|
use crate::{cli::AppConfig, errors::PaperoniError, extractor::Article};
|
||||||
|
|
||||||
lazy_static! {
|
lazy_static! {
|
||||||
static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r#"(&|<|>|'|")"#).unwrap();
|
static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r#"(&|<|>|'|")"#).unwrap();
|
||||||
|
@ -16,7 +16,7 @@ lazy_static! {
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn generate_epubs(
|
pub fn generate_epubs(
|
||||||
articles: Vec<Extractor>,
|
articles: Vec<Article>,
|
||||||
app_config: &AppConfig,
|
app_config: &AppConfig,
|
||||||
successful_articles_table: &mut Table,
|
successful_articles_table: &mut Table,
|
||||||
) -> Result<(), Vec<PaperoniError>> {
|
) -> Result<(), Vec<PaperoniError>> {
|
||||||
|
@ -88,9 +88,9 @@ pub fn generate_epubs(
|
||||||
let content_url = format!("article_{}.xhtml", idx);
|
let content_url = format!("article_{}.xhtml", idx);
|
||||||
let mut xhtml_buf = Vec::new();
|
let mut xhtml_buf = Vec::new();
|
||||||
let header_level_tocs =
|
let header_level_tocs =
|
||||||
get_header_level_toc_vec(&content_url, article.article());
|
get_header_level_toc_vec(&content_url, article.node_ref());
|
||||||
|
|
||||||
serialize_to_xhtml(article.article(), &mut xhtml_buf)?;
|
serialize_to_xhtml(article.node_ref(), &mut xhtml_buf)?;
|
||||||
let xhtml_str = std::str::from_utf8(&xhtml_buf)?;
|
let xhtml_str = std::str::from_utf8(&xhtml_buf)?;
|
||||||
let section_name = article.metadata().title();
|
let section_name = article.metadata().title();
|
||||||
let mut content = EpubContent::new(&content_url, xhtml_str.as_bytes())
|
let mut content = EpubContent::new(&content_url, xhtml_str.as_bytes())
|
||||||
|
@ -179,8 +179,8 @@ pub fn generate_epubs(
|
||||||
let mut out_file = File::create(&file_name).unwrap();
|
let mut out_file = File::create(&file_name).unwrap();
|
||||||
let mut xhtml_buf = Vec::new();
|
let mut xhtml_buf = Vec::new();
|
||||||
let header_level_tocs =
|
let header_level_tocs =
|
||||||
get_header_level_toc_vec("index.xhtml", article.article());
|
get_header_level_toc_vec("index.xhtml", article.node_ref());
|
||||||
serialize_to_xhtml(article.article(), &mut xhtml_buf)
|
serialize_to_xhtml(article.node_ref(), &mut xhtml_buf)
|
||||||
.expect("Unable to serialize to xhtml");
|
.expect("Unable to serialize to xhtml");
|
||||||
let xhtml_str = std::str::from_utf8(&xhtml_buf).unwrap();
|
let xhtml_str = std::str::from_utf8(&xhtml_buf).unwrap();
|
||||||
|
|
||||||
|
@ -269,7 +269,7 @@ fn add_stylesheets<T: epub_builder::Zip>(
|
||||||
}
|
}
|
||||||
|
|
||||||
//TODO: The type signature of the argument should change as it requires that merged articles create an entirely new Vec of references
|
//TODO: The type signature of the argument should change as it requires that merged articles create an entirely new Vec of references
|
||||||
fn generate_appendix(articles: Vec<&Extractor>) -> String {
|
fn generate_appendix(articles: Vec<&Article>) -> String {
|
||||||
let link_tags: String = articles
|
let link_tags: String = articles
|
||||||
.iter()
|
.iter()
|
||||||
.map(|article| {
|
.map(|article| {
|
||||||
|
|
|
@ -6,18 +6,18 @@ use crate::moz_readability::{MetaData, Readability};
|
||||||
|
|
||||||
pub type ResourceInfo = (String, Option<String>);
|
pub type ResourceInfo = (String, Option<String>);
|
||||||
|
|
||||||
pub struct Extractor {
|
pub struct Article {
|
||||||
article: Option<NodeRef>,
|
node_ref_opt: Option<NodeRef>,
|
||||||
pub img_urls: Vec<ResourceInfo>,
|
pub img_urls: Vec<ResourceInfo>,
|
||||||
readability: Readability,
|
readability: Readability,
|
||||||
pub url: String,
|
pub url: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Extractor {
|
impl Article {
|
||||||
/// Create a new instance of an HTML extractor given an HTML string
|
/// Create a new instance of an HTML extractor given an HTML string
|
||||||
pub fn from_html(html_str: &str, url: &str) -> Self {
|
pub fn from_html(html_str: &str, url: &str) -> Self {
|
||||||
Extractor {
|
Self {
|
||||||
article: None,
|
node_ref_opt: None,
|
||||||
img_urls: Vec::new(),
|
img_urls: Vec::new(),
|
||||||
readability: Readability::new(html_str),
|
readability: Readability::new(html_str),
|
||||||
url: url.to_string(),
|
url: url.to_string(),
|
||||||
|
@ -42,14 +42,14 @@ impl Extractor {
|
||||||
let doc = kuchiki::parse_html().one(template);
|
let doc = kuchiki::parse_html().one(template);
|
||||||
let body = doc.select_first("body").unwrap();
|
let body = doc.select_first("body").unwrap();
|
||||||
body.as_node().append(article_node_ref.clone());
|
body.as_node().append(article_node_ref.clone());
|
||||||
self.article = Some(doc);
|
self.node_ref_opt = Some(doc);
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Traverses the DOM tree of the content and retrieves the IMG URLs
|
/// Traverses the DOM tree of the content and retrieves the IMG URLs
|
||||||
pub fn extract_img_urls(&mut self) {
|
pub fn extract_img_urls(&mut self) {
|
||||||
if let Some(content_ref) = &self.article {
|
if let Some(content_ref) = &self.node_ref_opt {
|
||||||
self.img_urls = content_ref
|
self.img_urls = content_ref
|
||||||
.select("img")
|
.select("img")
|
||||||
.unwrap()
|
.unwrap()
|
||||||
|
@ -67,8 +67,8 @@ impl Extractor {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the extracted article [NodeRef]. It should only be called *AFTER* calling parse
|
/// Returns the extracted article [NodeRef]. It should only be called *AFTER* calling parse
|
||||||
pub fn article(&self) -> &NodeRef {
|
pub fn node_ref(&self) -> &NodeRef {
|
||||||
self.article.as_ref().expect(
|
self.node_ref_opt.as_ref().expect(
|
||||||
"Article node doesn't exist. This may be because the document has not been parsed",
|
"Article node doesn't exist. This may be because the document has not been parsed",
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
@ -112,16 +112,16 @@ mod test {
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_extract_img_urls() {
|
fn test_extract_img_urls() {
|
||||||
let mut extractor = Extractor::from_html(TEST_HTML, "http://example.com/");
|
let mut article = Article::from_html(TEST_HTML, "http://example.com/");
|
||||||
extractor
|
article
|
||||||
.extract_content()
|
.extract_content()
|
||||||
.expect("Article extraction failed unexpectedly");
|
.expect("Article extraction failed unexpectedly");
|
||||||
extractor.extract_img_urls();
|
article.extract_img_urls();
|
||||||
|
|
||||||
assert!(extractor.img_urls.len() > 0);
|
assert!(article.img_urls.len() > 0);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
vec![("http://example.com/img.jpg".to_string(), None)],
|
vec![("http://example.com/img.jpg".to_string(), None)],
|
||||||
extractor.img_urls
|
article.img_urls
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
24
src/html.rs
24
src/html.rs
|
@ -14,7 +14,7 @@ use log::{debug, error, info};
|
||||||
use crate::{
|
use crate::{
|
||||||
cli::{self, AppConfig},
|
cli::{self, AppConfig},
|
||||||
errors::PaperoniError,
|
errors::PaperoniError,
|
||||||
extractor::Extractor,
|
extractor::Article,
|
||||||
moz_readability::MetaData,
|
moz_readability::MetaData,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -29,7 +29,7 @@ const BASE_HTML_TEMPLATE: &str = r#"<!DOCTYPE html>
|
||||||
</html>"#;
|
</html>"#;
|
||||||
|
|
||||||
pub fn generate_html_exports(
|
pub fn generate_html_exports(
|
||||||
articles: Vec<Extractor>,
|
articles: Vec<Article>,
|
||||||
app_config: &AppConfig,
|
app_config: &AppConfig,
|
||||||
successful_articles_table: &mut Table,
|
successful_articles_table: &mut Table,
|
||||||
) -> Result<(), Vec<PaperoniError>> {
|
) -> Result<(), Vec<PaperoniError>> {
|
||||||
|
@ -80,7 +80,7 @@ pub fn generate_html_exports(
|
||||||
|
|
||||||
for (idx, article) in articles.iter().enumerate() {
|
for (idx, article) in articles.iter().enumerate() {
|
||||||
let article_elem = article
|
let article_elem = article
|
||||||
.article()
|
.node_ref()
|
||||||
.select_first("div[id=\"readability-page-1\"]")
|
.select_first("div[id=\"readability-page-1\"]")
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
|
@ -226,16 +226,16 @@ pub fn generate_html_exports(
|
||||||
elem_attrs.insert("charset", "UTF-8".into());
|
elem_attrs.insert("charset", "UTF-8".into());
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Ok(head_elem) = article.article().select_first("head") {
|
if let Ok(head_elem) = article.node_ref().select_first("head") {
|
||||||
let head_elem_node = head_elem.as_node();
|
let head_elem_node = head_elem.as_node();
|
||||||
head_elem_node.append(utf8_encoding);
|
head_elem_node.append(utf8_encoding);
|
||||||
};
|
};
|
||||||
|
|
||||||
insert_title_elem(article.article(), article.metadata().title());
|
insert_title_elem(article.node_ref(), article.metadata().title());
|
||||||
insert_appendix(article.article(), vec![(article.metadata(), &article.url)]);
|
insert_appendix(article.node_ref(), vec![(article.metadata(), &article.url)]);
|
||||||
inline_css(article.article(), app_config);
|
inline_css(article.node_ref(), app_config);
|
||||||
|
|
||||||
article.article().serialize(&mut out_file)?;
|
article.node_ref().serialize(&mut out_file)?;
|
||||||
Ok(())
|
Ok(())
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -269,7 +269,7 @@ fn create_qualname(name: &str) -> QualName {
|
||||||
|
|
||||||
/// Updates the src attribute of `<img>` elements with a base64 encoded string of the image data
|
/// Updates the src attribute of `<img>` elements with a base64 encoded string of the image data
|
||||||
fn update_imgs_base64(
|
fn update_imgs_base64(
|
||||||
article: &Extractor,
|
article: &Article,
|
||||||
img_url: &str,
|
img_url: &str,
|
||||||
mime_type: &str,
|
mime_type: &str,
|
||||||
) -> Result<(), std::io::Error> {
|
) -> Result<(), std::io::Error> {
|
||||||
|
@ -279,7 +279,7 @@ fn update_imgs_base64(
|
||||||
let img_base64_str = format!("data:image:{};base64,{}", mime_type, encode(img_bytes));
|
let img_base64_str = format!("data:image:{};base64,{}", mime_type, encode(img_bytes));
|
||||||
|
|
||||||
let img_elems = article
|
let img_elems = article
|
||||||
.article()
|
.node_ref()
|
||||||
.select(&format!("img[src=\"{}\"]", img_url))
|
.select(&format!("img[src=\"{}\"]", img_url))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
for img_elem in img_elems {
|
for img_elem in img_elems {
|
||||||
|
@ -292,14 +292,14 @@ fn update_imgs_base64(
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Updates the src attribute of `<img>` elements to the new `imgs_dir_path` and copies the image to the new file location
|
/// Updates the src attribute of `<img>` elements to the new `imgs_dir_path` and copies the image to the new file location
|
||||||
fn update_img_urls(article: &Extractor, imgs_dir_path: &Path) -> Result<(), std::io::Error> {
|
fn update_img_urls(article: &Article, imgs_dir_path: &Path) -> Result<(), std::io::Error> {
|
||||||
let temp_dir = std::env::temp_dir();
|
let temp_dir = std::env::temp_dir();
|
||||||
for (img_url, _) in &article.img_urls {
|
for (img_url, _) in &article.img_urls {
|
||||||
let (from, to) = (temp_dir.join(img_url), imgs_dir_path.join(img_url));
|
let (from, to) = (temp_dir.join(img_url), imgs_dir_path.join(img_url));
|
||||||
info!("Copying {:?} to {:?}", from, to);
|
info!("Copying {:?} to {:?}", from, to);
|
||||||
fs::copy(from, to)?;
|
fs::copy(from, to)?;
|
||||||
let img_elems = article
|
let img_elems = article
|
||||||
.article()
|
.node_ref()
|
||||||
.select(&format!("img[src=\"{}\"]", img_url))
|
.select(&format!("img[src=\"{}\"]", img_url))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
for img_elem in img_elems {
|
for img_elem in img_elems {
|
||||||
|
|
10
src/http.rs
10
src/http.rs
|
@ -9,7 +9,7 @@ use url::Url;
|
||||||
|
|
||||||
use crate::cli::AppConfig;
|
use crate::cli::AppConfig;
|
||||||
use crate::errors::{ErrorKind, ImgError, PaperoniError};
|
use crate::errors::{ErrorKind, ImgError, PaperoniError};
|
||||||
use crate::extractor::Extractor;
|
use crate::extractor::Article;
|
||||||
type HTMLResource = (String, String);
|
type HTMLResource = (String, String);
|
||||||
|
|
||||||
pub fn download(
|
pub fn download(
|
||||||
|
@ -17,7 +17,7 @@ pub fn download(
|
||||||
bar: &ProgressBar,
|
bar: &ProgressBar,
|
||||||
partial_downloads: &mut Vec<PartialDownload>,
|
partial_downloads: &mut Vec<PartialDownload>,
|
||||||
errors: &mut Vec<PaperoniError>,
|
errors: &mut Vec<PaperoniError>,
|
||||||
) -> Vec<Extractor> {
|
) -> Vec<Article> {
|
||||||
task::block_on(async {
|
task::block_on(async {
|
||||||
let urls_iter = app_config.urls.iter().map(|url| fetch_html(url));
|
let urls_iter = app_config.urls.iter().map(|url| fetch_html(url));
|
||||||
let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn);
|
let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn);
|
||||||
|
@ -26,7 +26,7 @@ pub fn download(
|
||||||
match fetch_result {
|
match fetch_result {
|
||||||
Ok((url, html)) => {
|
Ok((url, html)) => {
|
||||||
debug!("Extracting {}", &url);
|
debug!("Extracting {}", &url);
|
||||||
let mut extractor = Extractor::from_html(&html, &url);
|
let mut extractor = Article::from_html(&html, &url);
|
||||||
bar.set_message("Extracting...");
|
bar.set_message("Extracting...");
|
||||||
match extractor.extract_content() {
|
match extractor.extract_content() {
|
||||||
Ok(_) => {
|
Ok(_) => {
|
||||||
|
@ -185,7 +185,7 @@ async fn process_img_response<'a>(
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn download_images(
|
pub async fn download_images(
|
||||||
extractor: &mut Extractor,
|
extractor: &mut Article,
|
||||||
article_origin: &Url,
|
article_origin: &Url,
|
||||||
bar: &ProgressBar,
|
bar: &ProgressBar,
|
||||||
) -> Result<(), Vec<ImgError>> {
|
) -> Result<(), Vec<ImgError>> {
|
||||||
|
@ -237,7 +237,7 @@ pub async fn download_images(
|
||||||
let replace_existing_img_src = |img_item: ImgItem| -> (String, Option<String>) {
|
let replace_existing_img_src = |img_item: ImgItem| -> (String, Option<String>) {
|
||||||
let (img_url, img_path, img_mime) = img_item;
|
let (img_url, img_path, img_mime) = img_item;
|
||||||
let img_ref = extractor
|
let img_ref = extractor
|
||||||
.article()
|
.node_ref()
|
||||||
.select_first(&format!("img[src='{}']", img_url))
|
.select_first(&format!("img[src='{}']", img_url))
|
||||||
.expect("Image node does not exist");
|
.expect("Image node does not exist");
|
||||||
let mut img_node = img_ref.attributes.borrow_mut();
|
let mut img_node = img_ref.attributes.borrow_mut();
|
||||||
|
|
Reference in a new issue