Refactor grab_article to return a Result

- Add a `ReadabilityError` variant to the `ErrorKind` enum
- Refactor the `article` getter in Extractor to return a `&NodeRef`. This
  relies on the assumption that the article has already been parsed;
  if it has not, the getter panics.
This commit is contained in:
Kenneth Gitere 2021-04-21 19:07:08 +03:00
parent ae1ddb9386
commit dbac7c3b69
6 changed files with 46 additions and 31 deletions

View file

@ -61,7 +61,7 @@ pub fn generate_epubs(
.fold(&mut epub, |epub, (idx, article)| {
let mut article_result = || -> Result<(), PaperoniError> {
let mut html_buf = Vec::new();
extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)?;
extractor::serialize_to_xhtml(article.article(), &mut html_buf)?;
let html_str = std::str::from_utf8(&html_buf)?;
epub.metadata("title", replace_metadata_value(name))?;
let section_name = article.metadata().title();
@ -129,7 +129,7 @@ pub fn generate_epubs(
);
let mut out_file = File::create(&file_name).unwrap();
let mut html_buf = Vec::new();
extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)
extractor::serialize_to_xhtml(article.article(), &mut html_buf)
.expect("Unable to serialize to xhtml");
let html_str = std::str::from_utf8(&html_buf).unwrap();
if let Some(author) = article.metadata().byline() {

View file

@ -10,6 +10,8 @@ pub enum ErrorKind {
IOError(String),
#[error("[UTF8Error]: {0}")]
UTF8Error(String),
#[error("[ReadabilityError]: {0}")]
ReadabilityError(String),
}
#[derive(Error, Debug)]

View file

@ -31,8 +31,8 @@ impl Extractor {
/// Locates and extracts the HTML in a document which is determined to be
/// the source of the content
pub fn extract_content(&mut self) {
self.readability.parse(&self.url);
pub fn extract_content(&mut self) -> Result<(), PaperoniError> {
self.readability.parse(&self.url)?;
if let Some(article_node_ref) = &self.readability.article_node {
let template = r#"
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
@ -47,6 +47,7 @@ impl Extractor {
body.as_node().append(article_node_ref.clone());
self.article = Some(doc);
}
Ok(())
}
/// Traverses the DOM tree of the content and retrieves the IMG URLs
@ -64,8 +65,11 @@ impl Extractor {
}
}
pub fn article(&self) -> Option<&NodeRef> {
self.article.as_ref()
/// Returns the extracted article [NodeRef]. It should only be called *AFTER* calling parse
pub fn article(&self) -> &NodeRef {
self.article.as_ref().expect(
"Article node doesn't exist. This may be because the document has not been parsed",
)
}
pub fn metadata(&self) -> &MetaData {
@ -160,7 +164,9 @@ mod test {
#[test]
fn test_extract_img_urls() {
let mut extractor = Extractor::from_html(TEST_HTML, "http://example.com/");
extractor.extract_content();
extractor
.extract_content()
.expect("Article extraction failed unexpectedly");
extractor.extract_img_urls();
assert!(extractor.img_urls.len() > 0);

View file

@ -141,8 +141,6 @@ pub async fn download_images(
let (img_url, img_path, img_mime) = img_item;
let img_ref = extractor
.article()
.as_mut()
.expect("Unable to get mutable ref")
.select_first(&format!("img[src='{}']", img_url))
.expect("Image node does not exist");
let mut img_node = img_ref.attributes.borrow_mut();

View file

@ -49,21 +49,26 @@ fn download(app_config: AppConfig) {
// println!("Extracting");
let mut extractor = Extractor::from_html(&html, &url);
bar.set_message("Extracting...");
extractor.extract_content();
if extractor.article().is_some() {
extractor.extract_img_urls();
if let Err(img_errors) =
download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar).await
{
eprintln!(
"{} image{} failed to download for {}",
img_errors.len(),
if img_errors.len() > 1 { "s" } else { "" },
url
);
match extractor.extract_content() {
Ok(_) => {
extractor.extract_img_urls();
if let Err(img_errors) =
download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar)
.await
{
eprintln!(
"{} image{} failed to download for {}",
img_errors.len(),
if img_errors.len() > 1 { "s" } else { "" },
url
);
}
articles.push(extractor);
}
Err(mut e) => {
e.set_article_source(&url);
errors.push(e);
}
articles.push(extractor);
}
}
Err(e) => errors.push(e),

View file

@ -9,6 +9,8 @@ use kuchiki::{
};
use url::Url;
use crate::errors::{ErrorKind, PaperoniError};
const DEFAULT_CHAR_THRESHOLD: usize = 500;
const FLAG_STRIP_UNLIKELYS: u32 = 0x1;
const FLAG_WEIGHT_CLASSES: u32 = 0x2;
@ -76,14 +78,15 @@ impl Readability {
metadata: MetaData::new(),
}
}
pub fn parse(&mut self, url: &str) {
pub fn parse(&mut self, url: &str) -> Result<(), PaperoniError> {
self.unwrap_no_script_tags();
self.remove_scripts();
self.prep_document();
self.metadata = self.get_article_metadata();
self.article_title = self.metadata.title.clone();
self.grab_article();
self.grab_article()?;
self.post_process_content(url);
Ok(())
}
/// Recursively check if node is image, or if node contains exactly only one image
@ -1584,7 +1587,7 @@ impl Readability {
/// Using a variety of metrics (content score, classname, element types), find the content that is most likely to be the stuff
/// a user wants to read. Then return it wrapped up in a div.
fn grab_article(&mut self) {
fn grab_article(&mut self) -> Result<(), PaperoniError> {
// TODO: Add logging for this
// println!("Grabbing article");
// var doc = this._doc;
@ -1593,8 +1596,7 @@ impl Readability {
let page = self.root_node.select_first("body");
if page.is_err() {
// TODO:Have error logging for this
println!("Document has no <body>");
return;
return Err(ErrorKind::ReadabilityError("Document has no <body>".into()).into());
}
let page = page.unwrap();
let mut attempts: Vec<ExtractAttempt> = Vec::new();
@ -2084,8 +2086,10 @@ impl Readability {
attempts.push(ExtractAttempt::new(article_content.clone(), text_length));
attempts.sort_by(|a, b| b.length.partial_cmp(&a.length).unwrap());
if attempts.first().as_ref().unwrap().length == 0 {
println!("Unable to extract content");
break;
return Err(ErrorKind::ReadabilityError(
"Unable to extract content".into(),
)
.into());
}
article_content = attempts[0].article.clone();
parse_successful = true;
@ -2111,7 +2115,7 @@ impl Readability {
false
});
self.article_node = Some(article_content);
return;
return Ok(());
}
}
}