From e6f901eb5a8bd2bdfc95fe4424dddb2b3c6d8ff9 Mon Sep 17 00:00:00 2001
From: Kenneth Gitere
Date: Sat, 24 Jul 2021 12:43:40 +0300
Subject: [PATCH] refactor: rename `Extractor` to `Article`

---
 src/epub.rs      | 14 +++++++-------
 src/extractor.rs | 28 ++++++++++++++--------------
 src/html.rs      | 24 ++++++++++++------------
 src/http.rs      | 10 +++++-----
 4 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/src/epub.rs b/src/epub.rs
index 8c280f1..d589ff4 100644
--- a/src/epub.rs
+++ b/src/epub.rs
@@ -8,7 +8,7 @@ use indicatif::{ProgressBar, ProgressStyle};
 use kuchiki::NodeRef;
 use log::{debug, error, info};

-use crate::{cli::AppConfig, errors::PaperoniError, extractor::Extractor};
+use crate::{cli::AppConfig, errors::PaperoniError, extractor::Article};

 lazy_static! {
     static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r#"(&|<|>|'|")"#).unwrap();
 }

 pub fn generate_epubs(
-    articles: Vec<Extractor>,
+    articles: Vec<Article>,
     app_config: &AppConfig,
     successful_articles_table: &mut Table,
 ) -> Result<(), Vec<PaperoniError>> {
@@ -88,9 +88,9 @@ pub fn generate_epubs(
                     let content_url = format!("article_{}.xhtml", idx);
                     let mut xhtml_buf = Vec::new();
                     let header_level_tocs =
-                        get_header_level_toc_vec(&content_url, article.article());
+                        get_header_level_toc_vec(&content_url, article.node_ref());

-                    serialize_to_xhtml(article.article(), &mut xhtml_buf)?;
+                    serialize_to_xhtml(article.node_ref(), &mut xhtml_buf)?;
                     let xhtml_str = std::str::from_utf8(&xhtml_buf)?;
                     let section_name = article.metadata().title();
                     let mut content = EpubContent::new(&content_url, xhtml_str.as_bytes())
@@ -179,8 +179,8 @@ pub fn generate_epubs(
                 let mut out_file = File::create(&file_name).unwrap();
                 let mut xhtml_buf = Vec::new();
                 let header_level_tocs =
-                    get_header_level_toc_vec("index.xhtml", article.article());
-                serialize_to_xhtml(article.article(), &mut xhtml_buf)
+                    get_header_level_toc_vec("index.xhtml", article.node_ref());
+                serialize_to_xhtml(article.node_ref(), &mut xhtml_buf)
                     .expect("Unable to serialize to xhtml");
                 let xhtml_str = std::str::from_utf8(&xhtml_buf).unwrap();

@@ -269,7 +269,7 @@ fn add_stylesheets(
 }

 //TODO: The type signature of the argument should change as it requires that merged articles create an entirely new Vec of references
-fn generate_appendix(articles: Vec<&Extractor>) -> String {
+fn generate_appendix(articles: Vec<&Article>) -> String {
     let link_tags: String = articles
         .iter()
         .map(|article| {
diff --git a/src/extractor.rs b/src/extractor.rs
index 9df5168..b16373a 100644
--- a/src/extractor.rs
+++ b/src/extractor.rs
@@ -6,18 +6,18 @@ use crate::moz_readability::{MetaData, Readability};
 pub type ResourceInfo = (String, Option<String>);

-pub struct Extractor {
-    article: Option<NodeRef>,
+pub struct Article {
+    node_ref_opt: Option<NodeRef>,
     pub img_urls: Vec<ResourceInfo>,
     readability: Readability,
     pub url: String,
 }

-impl Extractor {
+impl Article {
     /// Create a new instance of an HTML extractor given an HTML string
     pub fn from_html(html_str: &str, url: &str) -> Self {
-        Extractor {
-            article: None,
+        Self {
+            node_ref_opt: None,
             img_urls: Vec::new(),
             readability: Readability::new(html_str),
             url: url.to_string(),
         }
@@ -42,14 +42,14 @@ impl Extractor {
             let doc = kuchiki::parse_html().one(template);
             let body = doc.select_first("body").unwrap();
             body.as_node().append(article_node_ref.clone());
-            self.article = Some(doc);
+            self.node_ref_opt = Some(doc);
         }
         Ok(())
     }

     /// Traverses the DOM tree of the content and retrieves the IMG URLs
     pub fn extract_img_urls(&mut self) {
-        if let Some(content_ref) = &self.article {
+        if let Some(content_ref) = &self.node_ref_opt {
             self.img_urls = content_ref
                 .select("img")
                 .unwrap()
@@ -67,8 +67,8 @@ impl Extractor {
     }

     /// Returns the extracted article [NodeRef]. It should only be called *AFTER* calling parse
-    pub fn article(&self) -> &NodeRef {
-        self.article.as_ref().expect(
+    pub fn node_ref(&self) -> &NodeRef {
+        self.node_ref_opt.as_ref().expect(
             "Article node doesn't exist. This may be because the document has not been parsed",
         )
     }
@@ -112,16 +112,16 @@ mod test {

     #[test]
     fn test_extract_img_urls() {
-        let mut extractor = Extractor::from_html(TEST_HTML, "http://example.com/");
-        extractor
+        let mut article = Article::from_html(TEST_HTML, "http://example.com/");
+        article
             .extract_content()
             .expect("Article extraction failed unexpectedly");
-        extractor.extract_img_urls();
+        article.extract_img_urls();

-        assert!(extractor.img_urls.len() > 0);
+        assert!(article.img_urls.len() > 0);
         assert_eq!(
             vec![("http://example.com/img.jpg".to_string(), None)],
-            extractor.img_urls
+            article.img_urls
         );
     }
 }
diff --git a/src/html.rs b/src/html.rs
index a26fe85..7b761d2 100644
--- a/src/html.rs
+++ b/src/html.rs
@@ -14,7 +14,7 @@ use log::{debug, error, info};

 use crate::{
     cli::{self, AppConfig},
     errors::PaperoniError,
-    extractor::Extractor,
+    extractor::Article,
     moz_readability::MetaData,
 };
@@ -29,7 +29,7 @@ const BASE_HTML_TEMPLATE: &str = r#"
 "#;

 pub fn generate_html_exports(
-    articles: Vec<Extractor>,
+    articles: Vec<Article>,
     app_config: &AppConfig,
     successful_articles_table: &mut Table,
 ) -> Result<(), Vec<PaperoniError>> {
@@ -80,7 +80,7 @@ pub fn generate_html_exports(

     for (idx, article) in articles.iter().enumerate() {
         let article_elem = article
-            .article()
+            .node_ref()
             .select_first("div[id=\"readability-page-1\"]")
             .unwrap();

@@ -226,16 +226,16 @@ pub fn generate_html_exports(
             elem_attrs.insert("charset", "UTF-8".into());
         }

-        if let Ok(head_elem) = article.article().select_first("head") {
+        if let Ok(head_elem) = article.node_ref().select_first("head") {
             let head_elem_node = head_elem.as_node();
             head_elem_node.append(utf8_encoding);
         };

-        insert_title_elem(article.article(), article.metadata().title());
-        insert_appendix(article.article(), vec![(article.metadata(), &article.url)]);
-        inline_css(article.article(), app_config);
+        insert_title_elem(article.node_ref(), article.metadata().title());
+        insert_appendix(article.node_ref(), vec![(article.metadata(), &article.url)]);
+        inline_css(article.node_ref(), app_config);

-        article.article().serialize(&mut out_file)?;
+        article.node_ref().serialize(&mut out_file)?;

         Ok(())
     };
@@ -269,7 +269,7 @@ fn create_qualname(name: &str) -> QualName {

 /// Updates the src attribute of `<img>` elements with a base64 encoded string of the image data
 fn update_imgs_base64(
-    article: &Extractor,
+    article: &Article,
     img_url: &str,
     mime_type: &str,
 ) -> Result<(), std::io::Error> {
@@ -279,7 +279,7 @@ fn update_imgs_base64(
     let img_base64_str = format!("data:image:{};base64,{}", mime_type, encode(img_bytes));

     let img_elems = article
-        .article()
+        .node_ref()
         .select(&format!("img[src=\"{}\"]", img_url))
         .unwrap();
     for img_elem in img_elems {
@@ -292,14 +292,14 @@ fn update_imgs_base64(
 }

 /// Updates the src attribute of `<img>` elements to the new `imgs_dir_path` and copies the image to the new file location
-fn update_img_urls(article: &Extractor, imgs_dir_path: &Path) -> Result<(), std::io::Error> {
+fn update_img_urls(article: &Article, imgs_dir_path: &Path) -> Result<(), std::io::Error> {
     let temp_dir = std::env::temp_dir();
     for (img_url, _) in &article.img_urls {
         let (from, to) = (temp_dir.join(img_url), imgs_dir_path.join(img_url));
         info!("Copying {:?} to {:?}", from, to);
         fs::copy(from, to)?;
         let img_elems = article
-            .article()
+            .node_ref()
             .select(&format!("img[src=\"{}\"]", img_url))
             .unwrap();
         for img_elem in img_elems {
diff --git a/src/http.rs b/src/http.rs
index 8707977..15cdb3c 100644
--- a/src/http.rs
+++ b/src/http.rs
@@ -9,7 +9,7 @@ use url::Url;

 use crate::cli::AppConfig;
 use crate::errors::{ErrorKind, ImgError, PaperoniError};
-use crate::extractor::Extractor;
+use crate::extractor::Article;

 type HTMLResource = (String, String);

 pub fn download(
     app_config: &AppConfig,
     bar: &ProgressBar,
     partial_downloads: &mut Vec,
     errors: &mut Vec,
-) -> Vec<Extractor> {
+) -> Vec<Article> {
     task::block_on(async {
         let urls_iter = app_config.urls.iter().map(|url| fetch_html(url));
         let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn);
@@ -26,7 +26,7 @@ pub fn download(
             match fetch_result {
                 Ok((url, html)) => {
                     debug!("Extracting {}", &url);
-                    let mut extractor = Extractor::from_html(&html, &url);
+                    let mut extractor = Article::from_html(&html, &url);
                     bar.set_message("Extracting...");
                     match extractor.extract_content() {
                         Ok(_) => {
@@ -185,7 +185,7 @@ async fn process_img_response<'a>(
 }

 pub async fn download_images(
-    extractor: &mut Extractor,
+    extractor: &mut Article,
     article_origin: &Url,
     bar: &ProgressBar,
 ) -> Result<(), Vec> {
@@ -237,7 +237,7 @@ pub async fn download_images(
     let replace_existing_img_src = |img_item: ImgItem| -> (String, Option<String>) {
         let (img_url, img_path, img_mime) = img_item;
         let img_ref = extractor
-            .article()
+            .node_ref()
             .select_first(&format!("img[src='{}']", img_url))
             .expect("Image node does not exist");
         let mut img_node = img_ref.attributes.borrow_mut();
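
For reference, a minimal sketch of how the renamed `Article` type is driven after this refactor, pieced together from the test and call sites shown above. The HTML string and URL are placeholder values, and error handling is reduced to `expect`; this is illustrative code inside the crate, not part of the patch itself.

    // Assumed context: code living inside the paperoni crate after this patch.
    use crate::extractor::Article;

    let html = r#"<html><body><p>Hello <img src="img.jpg"></p></body></html>"#;
    let mut article = Article::from_html(html, "http://example.com/");
    article.extract_content().expect("extraction failed"); // parse the readable content
    article.extract_img_urls();                            // fills `article.img_urls`
    let _doc = article.node_ref();                         // panics if extract_content() was never called
    println!(
        "{} image(s) found in \"{}\"",
        article.img_urls.len(),
        article.metadata().title()
    );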