refactor: rename Extractor to Article
parent eac28da798
commit e6f901eb5a
4 changed files with 38 additions and 38 deletions
src/epub.rs (14 changed lines)

@@ -8,7 +8,7 @@ use indicatif::{ProgressBar, ProgressStyle};
 use kuchiki::NodeRef;
 use log::{debug, error, info};
 
-use crate::{cli::AppConfig, errors::PaperoniError, extractor::Extractor};
+use crate::{cli::AppConfig, errors::PaperoniError, extractor::Article};
 
 lazy_static! {
     static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r#"(&|<|>|'|")"#).unwrap();
@@ -16,7 +16,7 @@ lazy_static! {
 }
 
 pub fn generate_epubs(
-    articles: Vec<Extractor>,
+    articles: Vec<Article>,
     app_config: &AppConfig,
     successful_articles_table: &mut Table,
 ) -> Result<(), Vec<PaperoniError>> {
@@ -88,9 +88,9 @@ pub fn generate_epubs(
                 let content_url = format!("article_{}.xhtml", idx);
                 let mut xhtml_buf = Vec::new();
                 let header_level_tocs =
-                    get_header_level_toc_vec(&content_url, article.article());
+                    get_header_level_toc_vec(&content_url, article.node_ref());
 
-                serialize_to_xhtml(article.article(), &mut xhtml_buf)?;
+                serialize_to_xhtml(article.node_ref(), &mut xhtml_buf)?;
                 let xhtml_str = std::str::from_utf8(&xhtml_buf)?;
                 let section_name = article.metadata().title();
                 let mut content = EpubContent::new(&content_url, xhtml_str.as_bytes())
@@ -179,8 +179,8 @@ pub fn generate_epubs(
                 let mut out_file = File::create(&file_name).unwrap();
                 let mut xhtml_buf = Vec::new();
                 let header_level_tocs =
-                    get_header_level_toc_vec("index.xhtml", article.article());
-                serialize_to_xhtml(article.article(), &mut xhtml_buf)
+                    get_header_level_toc_vec("index.xhtml", article.node_ref());
+                serialize_to_xhtml(article.node_ref(), &mut xhtml_buf)
                     .expect("Unable to serialize to xhtml");
                 let xhtml_str = std::str::from_utf8(&xhtml_buf).unwrap();
 
@@ -269,7 +269,7 @@ fn add_stylesheets<T: epub_builder::Zip>(
 }
 
 //TODO: The type signature of the argument should change as it requires that merged articles create an entirely new Vec of references
-fn generate_appendix(articles: Vec<&Extractor>) -> String {
+fn generate_appendix(articles: Vec<&Article>) -> String {
     let link_tags: String = articles
         .iter()
         .map(|article| {
src/extractor.rs (28 changed lines)

@@ -6,18 +6,18 @@ use crate::moz_readability::{MetaData, Readability};
 
 pub type ResourceInfo = (String, Option<String>);
 
-pub struct Extractor {
-    article: Option<NodeRef>,
+pub struct Article {
+    node_ref_opt: Option<NodeRef>,
     pub img_urls: Vec<ResourceInfo>,
     readability: Readability,
     pub url: String,
 }
 
-impl Extractor {
+impl Article {
     /// Create a new instance of an HTML extractor given an HTML string
     pub fn from_html(html_str: &str, url: &str) -> Self {
-        Extractor {
-            article: None,
+        Self {
+            node_ref_opt: None,
             img_urls: Vec::new(),
             readability: Readability::new(html_str),
             url: url.to_string(),
@@ -42,14 +42,14 @@ impl Extractor {
             let doc = kuchiki::parse_html().one(template);
             let body = doc.select_first("body").unwrap();
             body.as_node().append(article_node_ref.clone());
-            self.article = Some(doc);
+            self.node_ref_opt = Some(doc);
         }
         Ok(())
     }
 
     /// Traverses the DOM tree of the content and retrieves the IMG URLs
     pub fn extract_img_urls(&mut self) {
-        if let Some(content_ref) = &self.article {
+        if let Some(content_ref) = &self.node_ref_opt {
             self.img_urls = content_ref
                 .select("img")
                 .unwrap()
@@ -67,8 +67,8 @@ impl Extractor {
     }
 
     /// Returns the extracted article [NodeRef]. It should only be called *AFTER* calling parse
-    pub fn article(&self) -> &NodeRef {
-        self.article.as_ref().expect(
+    pub fn node_ref(&self) -> &NodeRef {
+        self.node_ref_opt.as_ref().expect(
             "Article node doesn't exist. This may be because the document has not been parsed",
         )
     }
@@ -112,16 +112,16 @@ mod test {
 
     #[test]
     fn test_extract_img_urls() {
-        let mut extractor = Extractor::from_html(TEST_HTML, "http://example.com/");
-        extractor
+        let mut article = Article::from_html(TEST_HTML, "http://example.com/");
+        article
             .extract_content()
             .expect("Article extraction failed unexpectedly");
-        extractor.extract_img_urls();
+        article.extract_img_urls();
 
-        assert!(extractor.img_urls.len() > 0);
+        assert!(article.img_urls.len() > 0);
         assert_eq!(
             vec![("http://example.com/img.jpg".to_string(), None)],
-            extractor.img_urls
+            article.img_urls
         );
     }
 }
src/html.rs (24 changed lines)

@@ -14,7 +14,7 @@ use log::{debug, error, info};
 use crate::{
     cli::{self, AppConfig},
     errors::PaperoniError,
-    extractor::Extractor,
+    extractor::Article,
     moz_readability::MetaData,
 };
 
@@ -29,7 +29,7 @@ const BASE_HTML_TEMPLATE: &str = r#"<!DOCTYPE html>
 </html>"#;
 
 pub fn generate_html_exports(
-    articles: Vec<Extractor>,
+    articles: Vec<Article>,
     app_config: &AppConfig,
     successful_articles_table: &mut Table,
 ) -> Result<(), Vec<PaperoniError>> {
@@ -80,7 +80,7 @@ pub fn generate_html_exports(
 
     for (idx, article) in articles.iter().enumerate() {
         let article_elem = article
-            .article()
+            .node_ref()
             .select_first("div[id=\"readability-page-1\"]")
             .unwrap();
 
@@ -226,16 +226,16 @@ pub fn generate_html_exports(
             elem_attrs.insert("charset", "UTF-8".into());
         }
 
-        if let Ok(head_elem) = article.article().select_first("head") {
+        if let Ok(head_elem) = article.node_ref().select_first("head") {
             let head_elem_node = head_elem.as_node();
             head_elem_node.append(utf8_encoding);
         };
 
-        insert_title_elem(article.article(), article.metadata().title());
-        insert_appendix(article.article(), vec![(article.metadata(), &article.url)]);
-        inline_css(article.article(), app_config);
+        insert_title_elem(article.node_ref(), article.metadata().title());
+        insert_appendix(article.node_ref(), vec![(article.metadata(), &article.url)]);
+        inline_css(article.node_ref(), app_config);
 
-        article.article().serialize(&mut out_file)?;
+        article.node_ref().serialize(&mut out_file)?;
         Ok(())
     };
 
@@ -269,7 +269,7 @@ fn create_qualname(name: &str) -> QualName {
 
 /// Updates the src attribute of `<img>` elements with a base64 encoded string of the image data
 fn update_imgs_base64(
-    article: &Extractor,
+    article: &Article,
     img_url: &str,
     mime_type: &str,
 ) -> Result<(), std::io::Error> {
@@ -279,7 +279,7 @@ fn update_imgs_base64(
     let img_base64_str = format!("data:image:{};base64,{}", mime_type, encode(img_bytes));
 
     let img_elems = article
-        .article()
+        .node_ref()
         .select(&format!("img[src=\"{}\"]", img_url))
         .unwrap();
     for img_elem in img_elems {
@@ -292,14 +292,14 @@ fn update_imgs_base64(
 }
 
 /// Updates the src attribute of `<img>` elements to the new `imgs_dir_path` and copies the image to the new file location
-fn update_img_urls(article: &Extractor, imgs_dir_path: &Path) -> Result<(), std::io::Error> {
+fn update_img_urls(article: &Article, imgs_dir_path: &Path) -> Result<(), std::io::Error> {
     let temp_dir = std::env::temp_dir();
     for (img_url, _) in &article.img_urls {
         let (from, to) = (temp_dir.join(img_url), imgs_dir_path.join(img_url));
         info!("Copying {:?} to {:?}", from, to);
         fs::copy(from, to)?;
         let img_elems = article
-            .article()
+            .node_ref()
            .select(&format!("img[src=\"{}\"]", img_url))
            .unwrap();
         for img_elem in img_elems {
src/http.rs (10 changed lines)

@@ -9,7 +9,7 @@ use url::Url;
 
 use crate::cli::AppConfig;
 use crate::errors::{ErrorKind, ImgError, PaperoniError};
-use crate::extractor::Extractor;
+use crate::extractor::Article;
 type HTMLResource = (String, String);
 
 pub fn download(
@@ -17,7 +17,7 @@ pub fn download(
     bar: &ProgressBar,
     partial_downloads: &mut Vec<PartialDownload>,
     errors: &mut Vec<PaperoniError>,
-) -> Vec<Extractor> {
+) -> Vec<Article> {
     task::block_on(async {
         let urls_iter = app_config.urls.iter().map(|url| fetch_html(url));
         let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn);
@@ -26,7 +26,7 @@ pub fn download(
             match fetch_result {
                 Ok((url, html)) => {
                     debug!("Extracting {}", &url);
-                    let mut extractor = Extractor::from_html(&html, &url);
+                    let mut extractor = Article::from_html(&html, &url);
                     bar.set_message("Extracting...");
                     match extractor.extract_content() {
                         Ok(_) => {
@@ -185,7 +185,7 @@ async fn process_img_response<'a>(
 }
 
 pub async fn download_images(
-    extractor: &mut Extractor,
+    extractor: &mut Article,
     article_origin: &Url,
     bar: &ProgressBar,
 ) -> Result<(), Vec<ImgError>> {
@@ -237,7 +237,7 @@ pub async fn download_images(
     let replace_existing_img_src = |img_item: ImgItem| -> (String, Option<String>) {
         let (img_url, img_path, img_mime) = img_item;
         let img_ref = extractor
-            .article()
+            .node_ref()
             .select_first(&format!("img[src='{}']", img_url))
             .expect("Image node does not exist");
         let mut img_node = img_ref.attributes.borrow_mut();
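
For orientation after the rename, here is a minimal sketch of how the renamed type is used inside the crate, assembled from the call sites in this diff. The extract_demo function and its parameters are hypothetical; only the type and method names (Article::from_html, extract_content, extract_img_urls, node_ref, and the img_urls field) come from the changed files.

use kuchiki::NodeRef;

use crate::extractor::Article;

// Hypothetical helper mirroring the flow in src/http.rs and the test in src/extractor.rs.
fn extract_demo(html: &str, url: &str) -> (NodeRef, Vec<(String, Option<String>)>) {
    // Formerly `Extractor::from_html`.
    let mut article = Article::from_html(html, url);

    // Parse the readable content; `node_ref()` panics if this has not run.
    article
        .extract_content()
        .expect("Article extraction failed unexpectedly");

    // Collect <img> URLs from the parsed DOM.
    article.extract_img_urls();

    // `node_ref()` replaces the old `article()` accessor.
    (article.node_ref().clone(), article.img_urls.clone())
}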