refactor: rename Extractor to Article

parent eac28da798
commit e6f901eb5a

4 changed files with 38 additions and 38 deletions
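
For context, a minimal sketch of how a call site changes with this rename. It is illustrative only: the wrapper function `extract_one` is hypothetical, while `Article`, `from_html`, `extract_content`, `extract_img_urls`, and `node_ref` are the names this commit introduces or keeps.

    // Sketch of a caller inside the paperoni crate (not a real call site).
    use crate::extractor::Article; // was: use crate::extractor::Extractor;

    fn extract_one(html: &str, url: &str) -> Article {
        // The constructor keeps its name; only the type is renamed.
        let mut article = Article::from_html(html, url);
        article
            .extract_content()
            .expect("Article extraction failed unexpectedly");
        article.extract_img_urls();
        // The old accessor `article()` is now `node_ref()`, backed by the
        // renamed field `node_ref_opt: Option<NodeRef>`.
        let _content_node = article.node_ref();
        article
    }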
src/epub.rs (14 lines changed)

@@ -8,7 +8,7 @@ use indicatif::{ProgressBar, ProgressStyle};
 use kuchiki::NodeRef;
 use log::{debug, error, info};
 
-use crate::{cli::AppConfig, errors::PaperoniError, extractor::Extractor};
+use crate::{cli::AppConfig, errors::PaperoniError, extractor::Article};
 
 lazy_static! {
     static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r#"(&|<|>|'|")"#).unwrap();
@@ -16,7 +16,7 @@ lazy_static! {
 }
 
 pub fn generate_epubs(
-    articles: Vec<Extractor>,
+    articles: Vec<Article>,
     app_config: &AppConfig,
     successful_articles_table: &mut Table,
 ) -> Result<(), Vec<PaperoniError>> {
@@ -88,9 +88,9 @@ pub fn generate_epubs(
                 let content_url = format!("article_{}.xhtml", idx);
                 let mut xhtml_buf = Vec::new();
                 let header_level_tocs =
-                    get_header_level_toc_vec(&content_url, article.article());
+                    get_header_level_toc_vec(&content_url, article.node_ref());
 
-                serialize_to_xhtml(article.article(), &mut xhtml_buf)?;
+                serialize_to_xhtml(article.node_ref(), &mut xhtml_buf)?;
                 let xhtml_str = std::str::from_utf8(&xhtml_buf)?;
                 let section_name = article.metadata().title();
                 let mut content = EpubContent::new(&content_url, xhtml_str.as_bytes())
@@ -179,8 +179,8 @@ pub fn generate_epubs(
             let mut out_file = File::create(&file_name).unwrap();
             let mut xhtml_buf = Vec::new();
             let header_level_tocs =
-                get_header_level_toc_vec("index.xhtml", article.article());
-            serialize_to_xhtml(article.article(), &mut xhtml_buf)
+                get_header_level_toc_vec("index.xhtml", article.node_ref());
+            serialize_to_xhtml(article.node_ref(), &mut xhtml_buf)
                 .expect("Unable to serialize to xhtml");
             let xhtml_str = std::str::from_utf8(&xhtml_buf).unwrap();
 
@@ -269,7 +269,7 @@ fn add_stylesheets<T: epub_builder::Zip>(
 }
 
 //TODO: The type signature of the argument should change as it requires that merged articles create an entirely new Vec of references
-fn generate_appendix(articles: Vec<&Extractor>) -> String {
+fn generate_appendix(articles: Vec<&Article>) -> String {
     let link_tags: String = articles
         .iter()
         .map(|article| {
src/extractor.rs (28 lines changed)

@@ -6,18 +6,18 @@ use crate::moz_readability::{MetaData, Readability};
 
 pub type ResourceInfo = (String, Option<String>);
 
-pub struct Extractor {
-    article: Option<NodeRef>,
+pub struct Article {
+    node_ref_opt: Option<NodeRef>,
     pub img_urls: Vec<ResourceInfo>,
     readability: Readability,
     pub url: String,
 }
 
-impl Extractor {
+impl Article {
     /// Create a new instance of an HTML extractor given an HTML string
     pub fn from_html(html_str: &str, url: &str) -> Self {
-        Extractor {
-            article: None,
+        Self {
+            node_ref_opt: None,
             img_urls: Vec::new(),
             readability: Readability::new(html_str),
             url: url.to_string(),
@@ -42,14 +42,14 @@ impl Extractor {
             let doc = kuchiki::parse_html().one(template);
             let body = doc.select_first("body").unwrap();
             body.as_node().append(article_node_ref.clone());
-            self.article = Some(doc);
+            self.node_ref_opt = Some(doc);
         }
         Ok(())
     }
 
     /// Traverses the DOM tree of the content and retrieves the IMG URLs
     pub fn extract_img_urls(&mut self) {
-        if let Some(content_ref) = &self.article {
+        if let Some(content_ref) = &self.node_ref_opt {
             self.img_urls = content_ref
                 .select("img")
                 .unwrap()
@@ -67,8 +67,8 @@ impl Extractor {
     }
 
     /// Returns the extracted article [NodeRef]. It should only be called *AFTER* calling parse
-    pub fn article(&self) -> &NodeRef {
-        self.article.as_ref().expect(
+    pub fn node_ref(&self) -> &NodeRef {
+        self.node_ref_opt.as_ref().expect(
             "Article node doesn't exist. This may be because the document has not been parsed",
         )
     }
@@ -112,16 +112,16 @@ mod test {
 
     #[test]
     fn test_extract_img_urls() {
-        let mut extractor = Extractor::from_html(TEST_HTML, "http://example.com/");
-        extractor
+        let mut article = Article::from_html(TEST_HTML, "http://example.com/");
+        article
             .extract_content()
             .expect("Article extraction failed unexpectedly");
-        extractor.extract_img_urls();
+        article.extract_img_urls();
 
-        assert!(extractor.img_urls.len() > 0);
+        assert!(article.img_urls.len() > 0);
         assert_eq!(
             vec![("http://example.com/img.jpg".to_string(), None)],
-            extractor.img_urls
+            article.img_urls
         );
     }
 }
src/html.rs (24 lines changed)

@@ -14,7 +14,7 @@ use log::{debug, error, info};
 use crate::{
     cli::{self, AppConfig},
     errors::PaperoniError,
-    extractor::Extractor,
+    extractor::Article,
     moz_readability::MetaData,
 };
 
@@ -29,7 +29,7 @@ const BASE_HTML_TEMPLATE: &str = r#"<!DOCTYPE html>
 </html>"#;
 
 pub fn generate_html_exports(
-    articles: Vec<Extractor>,
+    articles: Vec<Article>,
     app_config: &AppConfig,
     successful_articles_table: &mut Table,
 ) -> Result<(), Vec<PaperoniError>> {
@@ -80,7 +80,7 @@ pub fn generate_html_exports(
 
     for (idx, article) in articles.iter().enumerate() {
         let article_elem = article
-            .article()
+            .node_ref()
             .select_first("div[id=\"readability-page-1\"]")
             .unwrap();
 
@@ -226,16 +226,16 @@ pub fn generate_html_exports(
             elem_attrs.insert("charset", "UTF-8".into());
         }
 
-        if let Ok(head_elem) = article.article().select_first("head") {
+        if let Ok(head_elem) = article.node_ref().select_first("head") {
            let head_elem_node = head_elem.as_node();
            head_elem_node.append(utf8_encoding);
         };
 
-        insert_title_elem(article.article(), article.metadata().title());
-        insert_appendix(article.article(), vec![(article.metadata(), &article.url)]);
-        inline_css(article.article(), app_config);
+        insert_title_elem(article.node_ref(), article.metadata().title());
+        insert_appendix(article.node_ref(), vec![(article.metadata(), &article.url)]);
+        inline_css(article.node_ref(), app_config);
 
-        article.article().serialize(&mut out_file)?;
+        article.node_ref().serialize(&mut out_file)?;
         Ok(())
     };
 
@@ -269,7 +269,7 @@ fn create_qualname(name: &str) -> QualName {
 
 /// Updates the src attribute of `<img>` elements with a base64 encoded string of the image data
 fn update_imgs_base64(
-    article: &Extractor,
+    article: &Article,
     img_url: &str,
     mime_type: &str,
 ) -> Result<(), std::io::Error> {
@@ -279,7 +279,7 @@ fn update_imgs_base64(
     let img_base64_str = format!("data:image:{};base64,{}", mime_type, encode(img_bytes));
 
     let img_elems = article
-        .article()
+        .node_ref()
         .select(&format!("img[src=\"{}\"]", img_url))
         .unwrap();
     for img_elem in img_elems {
@@ -292,7 +292,7 @@ fn update_imgs_base64(
 }
 
 /// Updates the src attribute of `<img>` elements to the new `imgs_dir_path` and copies the image to the new file location
-fn update_img_urls(article: &Extractor, imgs_dir_path: &Path) -> Result<(), std::io::Error> {
+fn update_img_urls(article: &Article, imgs_dir_path: &Path) -> Result<(), std::io::Error> {
     let temp_dir = std::env::temp_dir();
     for (img_url, _) in &article.img_urls {
         let (from, to) = (temp_dir.join(img_url), imgs_dir_path.join(img_url));
         info!("Copying {:?} to {:?}", from, to);
         fs::copy(from, to)?;
         let img_elems = article
-            .article()
+            .node_ref()
             .select(&format!("img[src=\"{}\"]", img_url))
             .unwrap();
         for img_elem in img_elems {
src/http.rs (10 lines changed)

@@ -9,7 +9,7 @@ use url::Url;
 
 use crate::cli::AppConfig;
 use crate::errors::{ErrorKind, ImgError, PaperoniError};
-use crate::extractor::Extractor;
+use crate::extractor::Article;
 type HTMLResource = (String, String);
 
 pub fn download(
@@ -17,7 +17,7 @@ pub fn download(
     bar: &ProgressBar,
     partial_downloads: &mut Vec<PartialDownload>,
     errors: &mut Vec<PaperoniError>,
-) -> Vec<Extractor> {
+) -> Vec<Article> {
     task::block_on(async {
         let urls_iter = app_config.urls.iter().map(|url| fetch_html(url));
         let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn);
@@ -26,7 +26,7 @@ pub fn download(
             match fetch_result {
                 Ok((url, html)) => {
                     debug!("Extracting {}", &url);
-                    let mut extractor = Extractor::from_html(&html, &url);
+                    let mut extractor = Article::from_html(&html, &url);
                     bar.set_message("Extracting...");
                     match extractor.extract_content() {
                         Ok(_) => {
@@ -185,7 +185,7 @@ async fn process_img_response<'a>(
 }
 
 pub async fn download_images(
-    extractor: &mut Extractor,
+    extractor: &mut Article,
     article_origin: &Url,
     bar: &ProgressBar,
 ) -> Result<(), Vec<ImgError>> {
@@ -237,7 +237,7 @@ pub async fn download_images(
     let replace_existing_img_src = |img_item: ImgItem| -> (String, Option<String>) {
         let (img_url, img_path, img_mime) = img_item;
         let img_ref = extractor
-            .article()
+            .node_ref()
             .select_first(&format!("img[src='{}']", img_url))
             .expect("Image node does not exist");
         let mut img_node = img_ref.attributes.borrow_mut();