refactor: rename Extractor to Article

Kenneth Gitere 2021-07-24 12:43:40 +03:00
parent eac28da798
commit e6f901eb5a
4 changed files with 38 additions and 38 deletions


@@ -8,7 +8,7 @@ use indicatif::{ProgressBar, ProgressStyle};
 use kuchiki::NodeRef;
 use log::{debug, error, info};
-use crate::{cli::AppConfig, errors::PaperoniError, extractor::Extractor};
+use crate::{cli::AppConfig, errors::PaperoniError, extractor::Article};
 lazy_static! {
     static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r#"(&|<|>|'|")"#).unwrap();
@@ -16,7 +16,7 @@ lazy_static! {
 }
 pub fn generate_epubs(
-    articles: Vec<Extractor>,
+    articles: Vec<Article>,
     app_config: &AppConfig,
     successful_articles_table: &mut Table,
 ) -> Result<(), Vec<PaperoniError>> {
@@ -88,9 +88,9 @@ pub fn generate_epubs(
                 let content_url = format!("article_{}.xhtml", idx);
                 let mut xhtml_buf = Vec::new();
                 let header_level_tocs =
-                    get_header_level_toc_vec(&content_url, article.article());
-                serialize_to_xhtml(article.article(), &mut xhtml_buf)?;
+                    get_header_level_toc_vec(&content_url, article.node_ref());
+                serialize_to_xhtml(article.node_ref(), &mut xhtml_buf)?;
                 let xhtml_str = std::str::from_utf8(&xhtml_buf)?;
                 let section_name = article.metadata().title();
                 let mut content = EpubContent::new(&content_url, xhtml_str.as_bytes())
@@ -179,8 +179,8 @@ pub fn generate_epubs(
                 let mut out_file = File::create(&file_name).unwrap();
                 let mut xhtml_buf = Vec::new();
                 let header_level_tocs =
-                    get_header_level_toc_vec("index.xhtml", article.article());
-                serialize_to_xhtml(article.article(), &mut xhtml_buf)
+                    get_header_level_toc_vec("index.xhtml", article.node_ref());
+                serialize_to_xhtml(article.node_ref(), &mut xhtml_buf)
                     .expect("Unable to serialize to xhtml");
                 let xhtml_str = std::str::from_utf8(&xhtml_buf).unwrap();
@@ -269,7 +269,7 @@ fn add_stylesheets<T: epub_builder::Zip>(
 }
 //TODO: The type signature of the argument should change as it requires that merged articles create an entirely new Vec of references
-fn generate_appendix(articles: Vec<&Extractor>) -> String {
+fn generate_appendix(articles: Vec<&Article>) -> String {
     let link_tags: String = articles
         .iter()
         .map(|article| {


@@ -6,18 +6,18 @@ use crate::moz_readability::{MetaData, Readability};
 pub type ResourceInfo = (String, Option<String>);
-pub struct Extractor {
-    article: Option<NodeRef>,
+pub struct Article {
+    node_ref_opt: Option<NodeRef>,
     pub img_urls: Vec<ResourceInfo>,
     readability: Readability,
     pub url: String,
 }
-impl Extractor {
+impl Article {
     /// Create a new instance of an HTML extractor given an HTML string
     pub fn from_html(html_str: &str, url: &str) -> Self {
-        Extractor {
-            article: None,
+        Self {
+            node_ref_opt: None,
             img_urls: Vec::new(),
             readability: Readability::new(html_str),
             url: url.to_string(),
@@ -42,14 +42,14 @@ impl Extractor {
             let doc = kuchiki::parse_html().one(template);
             let body = doc.select_first("body").unwrap();
             body.as_node().append(article_node_ref.clone());
-            self.article = Some(doc);
+            self.node_ref_opt = Some(doc);
         }
         Ok(())
     }
     /// Traverses the DOM tree of the content and retrieves the IMG URLs
     pub fn extract_img_urls(&mut self) {
-        if let Some(content_ref) = &self.article {
+        if let Some(content_ref) = &self.node_ref_opt {
             self.img_urls = content_ref
                 .select("img")
                 .unwrap()
@@ -67,8 +67,8 @@ impl Extractor {
     }
     /// Returns the extracted article [NodeRef]. It should only be called *AFTER* calling parse
-    pub fn article(&self) -> &NodeRef {
-        self.article.as_ref().expect(
+    pub fn node_ref(&self) -> &NodeRef {
+        self.node_ref_opt.as_ref().expect(
             "Article node doesn't exist. This may be because the document has not been parsed",
         )
     }
@@ -112,16 +112,16 @@ mod test {
     #[test]
     fn test_extract_img_urls() {
-        let mut extractor = Extractor::from_html(TEST_HTML, "http://example.com/");
-        extractor
+        let mut article = Article::from_html(TEST_HTML, "http://example.com/");
+        article
             .extract_content()
             .expect("Article extraction failed unexpectedly");
-        extractor.extract_img_urls();
-        assert!(extractor.img_urls.len() > 0);
+        article.extract_img_urls();
+        assert!(article.img_urls.len() > 0);
         assert_eq!(
             vec![("http://example.com/img.jpg".to_string(), None)],
-            extractor.img_urls
+            article.img_urls
         );
     }
 }
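
For reference, a minimal usage sketch of the renamed type, based only on the calls visible in this diff (the function name and the HTML/URL inputs below are illustrative placeholders, not part of the commit):

    use crate::extractor::Article;

    fn example_usage() {
        // Placeholder input; any fetched HTML document and its source URL would do.
        let html = "<html><body><p>Hello</p></body></html>";
        let mut article = Article::from_html(html, "http://example.com/");

        // Run content extraction, then collect the image URLs found in the parsed article.
        article
            .extract_content()
            .expect("Article extraction failed unexpectedly");
        article.extract_img_urls();

        // node_ref() replaces the old article() accessor and panics if called
        // before extract_content() has populated the parsed document.
        let _content = article.node_ref();
        println!("found {} image(s)", article.img_urls.len());
    }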


@@ -14,7 +14,7 @@ use log::{debug, error, info};
 use crate::{
     cli::{self, AppConfig},
     errors::PaperoniError,
-    extractor::Extractor,
+    extractor::Article,
     moz_readability::MetaData,
 };
@@ -29,7 +29,7 @@ const BASE_HTML_TEMPLATE: &str = r#"<!DOCTYPE html>
 </html>"#;
 pub fn generate_html_exports(
-    articles: Vec<Extractor>,
+    articles: Vec<Article>,
     app_config: &AppConfig,
     successful_articles_table: &mut Table,
 ) -> Result<(), Vec<PaperoniError>> {
@@ -80,7 +80,7 @@ pub fn generate_html_exports(
     for (idx, article) in articles.iter().enumerate() {
         let article_elem = article
-            .article()
+            .node_ref()
             .select_first("div[id=\"readability-page-1\"]")
             .unwrap();
@@ -226,16 +226,16 @@ pub fn generate_html_exports(
             elem_attrs.insert("charset", "UTF-8".into());
         }
-        if let Ok(head_elem) = article.article().select_first("head") {
+        if let Ok(head_elem) = article.node_ref().select_first("head") {
             let head_elem_node = head_elem.as_node();
             head_elem_node.append(utf8_encoding);
         };
-        insert_title_elem(article.article(), article.metadata().title());
-        insert_appendix(article.article(), vec![(article.metadata(), &article.url)]);
-        inline_css(article.article(), app_config);
-        article.article().serialize(&mut out_file)?;
+        insert_title_elem(article.node_ref(), article.metadata().title());
+        insert_appendix(article.node_ref(), vec![(article.metadata(), &article.url)]);
+        inline_css(article.node_ref(), app_config);
+        article.node_ref().serialize(&mut out_file)?;
         Ok(())
     };
@@ -269,7 +269,7 @@ fn create_qualname(name: &str) -> QualName {
 /// Updates the src attribute of `<img>` elements with a base64 encoded string of the image data
 fn update_imgs_base64(
-    article: &Extractor,
+    article: &Article,
     img_url: &str,
     mime_type: &str,
 ) -> Result<(), std::io::Error> {
@@ -279,7 +279,7 @@ fn update_imgs_base64(
     let img_base64_str = format!("data:image:{};base64,{}", mime_type, encode(img_bytes));
     let img_elems = article
-        .article()
+        .node_ref()
         .select(&format!("img[src=\"{}\"]", img_url))
         .unwrap();
     for img_elem in img_elems {
@@ -292,14 +292,14 @@ fn update_imgs_base64(
 }
 /// Updates the src attribute of `<img>` elements to the new `imgs_dir_path` and copies the image to the new file location
-fn update_img_urls(article: &Extractor, imgs_dir_path: &Path) -> Result<(), std::io::Error> {
+fn update_img_urls(article: &Article, imgs_dir_path: &Path) -> Result<(), std::io::Error> {
     let temp_dir = std::env::temp_dir();
     for (img_url, _) in &article.img_urls {
         let (from, to) = (temp_dir.join(img_url), imgs_dir_path.join(img_url));
         info!("Copying {:?} to {:?}", from, to);
         fs::copy(from, to)?;
         let img_elems = article
-            .article()
+            .node_ref()
             .select(&format!("img[src=\"{}\"]", img_url))
             .unwrap();
         for img_elem in img_elems {


@@ -9,7 +9,7 @@ use url::Url;
 use crate::cli::AppConfig;
 use crate::errors::{ErrorKind, ImgError, PaperoniError};
-use crate::extractor::Extractor;
+use crate::extractor::Article;
 type HTMLResource = (String, String);
 pub fn download(
@@ -17,7 +17,7 @@ pub fn download(
     bar: &ProgressBar,
     partial_downloads: &mut Vec<PartialDownload>,
     errors: &mut Vec<PaperoniError>,
-) -> Vec<Extractor> {
+) -> Vec<Article> {
     task::block_on(async {
         let urls_iter = app_config.urls.iter().map(|url| fetch_html(url));
         let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn);
@@ -26,7 +26,7 @@ pub fn download(
             match fetch_result {
                 Ok((url, html)) => {
                     debug!("Extracting {}", &url);
-                    let mut extractor = Extractor::from_html(&html, &url);
+                    let mut extractor = Article::from_html(&html, &url);
                     bar.set_message("Extracting...");
                     match extractor.extract_content() {
                         Ok(_) => {
@@ -185,7 +185,7 @@ async fn process_img_response<'a>(
 }
 pub async fn download_images(
-    extractor: &mut Extractor,
+    extractor: &mut Article,
     article_origin: &Url,
     bar: &ProgressBar,
 ) -> Result<(), Vec<ImgError>> {
@@ -237,7 +237,7 @@ pub async fn download_images(
     let replace_existing_img_src = |img_item: ImgItem| -> (String, Option<String>) {
         let (img_url, img_path, img_mime) = img_item;
         let img_ref = extractor
-            .article()
+            .node_ref()
             .select_first(&format!("img[src='{}']", img_url))
             .expect("Image node does not exist");
         let mut img_node = img_ref.attributes.borrow_mut();