Refactor `grab_article` to return a `Result`
- Add a `ReadabilityError` variant to `ErrorKind`
- Refactor the `article` getter in `Extractor` to return a `&NodeRef`. This relies on the assumption that the article has already been parsed and will otherwise panic.
This commit is contained in:
parent
ae1ddb9386
commit
dbac7c3b69
6 changed files with 46 additions and 31 deletions
|
@ -61,7 +61,7 @@ pub fn generate_epubs(
|
|||
.fold(&mut epub, |epub, (idx, article)| {
|
||||
let mut article_result = || -> Result<(), PaperoniError> {
|
||||
let mut html_buf = Vec::new();
|
||||
extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)?;
|
||||
extractor::serialize_to_xhtml(article.article(), &mut html_buf)?;
|
||||
let html_str = std::str::from_utf8(&html_buf)?;
|
||||
epub.metadata("title", replace_metadata_value(name))?;
|
||||
let section_name = article.metadata().title();
|
||||
|
@ -129,7 +129,7 @@ pub fn generate_epubs(
|
|||
);
|
||||
let mut out_file = File::create(&file_name).unwrap();
|
||||
let mut html_buf = Vec::new();
|
||||
extractor::serialize_to_xhtml(article.article().unwrap(), &mut html_buf)
|
||||
extractor::serialize_to_xhtml(article.article(), &mut html_buf)
|
||||
.expect("Unable to serialize to xhtml");
|
||||
let html_str = std::str::from_utf8(&html_buf).unwrap();
|
||||
if let Some(author) = article.metadata().byline() {
|
||||
|
|
|
@ -10,6 +10,8 @@ pub enum ErrorKind {
|
|||
IOError(String),
|
||||
#[error("[UTF8Error]: {0}")]
|
||||
UTF8Error(String),
|
||||
#[error("[ReadabilityError]: {0}")]
|
||||
ReadabilityError(String),
|
||||
}
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
|
|
|
@ -31,8 +31,8 @@ impl Extractor {
|
|||
|
||||
/// Locates and extracts the HTML in a document which is determined to be
|
||||
/// the source of the content
|
||||
pub fn extract_content(&mut self) {
|
||||
self.readability.parse(&self.url);
|
||||
pub fn extract_content(&mut self) -> Result<(), PaperoniError> {
|
||||
self.readability.parse(&self.url)?;
|
||||
if let Some(article_node_ref) = &self.readability.article_node {
|
||||
let template = r#"
|
||||
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
|
||||
|
@ -47,6 +47,7 @@ impl Extractor {
|
|||
body.as_node().append(article_node_ref.clone());
|
||||
self.article = Some(doc);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Traverses the DOM tree of the content and retrieves the IMG URLs
|
||||
|
@ -64,8 +65,11 @@ impl Extractor {
|
|||
}
|
||||
}
|
||||
|
||||
pub fn article(&self) -> Option<&NodeRef> {
|
||||
self.article.as_ref()
|
||||
/// Returns the extracted article [NodeRef]. It should only be called *AFTER* calling parse
|
||||
pub fn article(&self) -> &NodeRef {
|
||||
self.article.as_ref().expect(
|
||||
"Article node doesn't exist. This may be because the document has not been parsed",
|
||||
)
|
||||
}
|
||||
|
||||
pub fn metadata(&self) -> &MetaData {
|
||||
|
@ -160,7 +164,9 @@ mod test {
|
|||
#[test]
|
||||
fn test_extract_img_urls() {
|
||||
let mut extractor = Extractor::from_html(TEST_HTML, "http://example.com/");
|
||||
extractor.extract_content();
|
||||
extractor
|
||||
.extract_content()
|
||||
.expect("Article extraction failed unexpectedly");
|
||||
extractor.extract_img_urls();
|
||||
|
||||
assert!(extractor.img_urls.len() > 0);
|
||||
|
|
|
@ -141,8 +141,6 @@ pub async fn download_images(
|
|||
let (img_url, img_path, img_mime) = img_item;
|
||||
let img_ref = extractor
|
||||
.article()
|
||||
.as_mut()
|
||||
.expect("Unable to get mutable ref")
|
||||
.select_first(&format!("img[src='{}']", img_url))
|
||||
.expect("Image node does not exist");
|
||||
let mut img_node = img_ref.attributes.borrow_mut();
|
||||
|
|
33
src/main.rs
33
src/main.rs
|
@ -49,21 +49,26 @@ fn download(app_config: AppConfig) {
|
|||
// println!("Extracting");
|
||||
let mut extractor = Extractor::from_html(&html, &url);
|
||||
bar.set_message("Extracting...");
|
||||
extractor.extract_content();
|
||||
|
||||
if extractor.article().is_some() {
|
||||
extractor.extract_img_urls();
|
||||
if let Err(img_errors) =
|
||||
download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar).await
|
||||
{
|
||||
eprintln!(
|
||||
"{} image{} failed to download for {}",
|
||||
img_errors.len(),
|
||||
if img_errors.len() > 1 { "s" } else { "" },
|
||||
url
|
||||
);
|
||||
match extractor.extract_content() {
|
||||
Ok(_) => {
|
||||
extractor.extract_img_urls();
|
||||
if let Err(img_errors) =
|
||||
download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar)
|
||||
.await
|
||||
{
|
||||
eprintln!(
|
||||
"{} image{} failed to download for {}",
|
||||
img_errors.len(),
|
||||
if img_errors.len() > 1 { "s" } else { "" },
|
||||
url
|
||||
);
|
||||
}
|
||||
articles.push(extractor);
|
||||
}
|
||||
Err(mut e) => {
|
||||
e.set_article_source(&url);
|
||||
errors.push(e);
|
||||
}
|
||||
articles.push(extractor);
|
||||
}
|
||||
}
|
||||
Err(e) => errors.push(e),
|
||||
|
|
|
@ -9,6 +9,8 @@ use kuchiki::{
|
|||
};
|
||||
use url::Url;
|
||||
|
||||
use crate::errors::{ErrorKind, PaperoniError};
|
||||
|
||||
const DEFAULT_CHAR_THRESHOLD: usize = 500;
|
||||
const FLAG_STRIP_UNLIKELYS: u32 = 0x1;
|
||||
const FLAG_WEIGHT_CLASSES: u32 = 0x2;
|
||||
|
@ -76,14 +78,15 @@ impl Readability {
|
|||
metadata: MetaData::new(),
|
||||
}
|
||||
}
|
||||
pub fn parse(&mut self, url: &str) {
|
||||
pub fn parse(&mut self, url: &str) -> Result<(), PaperoniError> {
|
||||
self.unwrap_no_script_tags();
|
||||
self.remove_scripts();
|
||||
self.prep_document();
|
||||
self.metadata = self.get_article_metadata();
|
||||
self.article_title = self.metadata.title.clone();
|
||||
self.grab_article();
|
||||
self.grab_article()?;
|
||||
self.post_process_content(url);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Recursively check if node is image, or if node contains exactly only one image
|
||||
|
@ -1584,7 +1587,7 @@ impl Readability {
|
|||
|
||||
/// Using a variety of metrics (content score, classname, element types), find the content that is most likely to be the stuff
|
||||
/// a user wants to read. Then return it wrapped up in a div.
|
||||
fn grab_article(&mut self) {
|
||||
fn grab_article(&mut self) -> Result<(), PaperoniError> {
|
||||
// TODO: Add logging for this
|
||||
// println!("Grabbing article");
|
||||
// var doc = this._doc;
|
||||
|
@ -1593,8 +1596,7 @@ impl Readability {
|
|||
let page = self.root_node.select_first("body");
|
||||
if page.is_err() {
|
||||
// TODO:Have error logging for this
|
||||
println!("Document has no <body>");
|
||||
return;
|
||||
return Err(ErrorKind::ReadabilityError("Document has no <body>".into()).into());
|
||||
}
|
||||
let page = page.unwrap();
|
||||
let mut attempts: Vec<ExtractAttempt> = Vec::new();
|
||||
|
@ -2084,8 +2086,10 @@ impl Readability {
|
|||
attempts.push(ExtractAttempt::new(article_content.clone(), text_length));
|
||||
attempts.sort_by(|a, b| b.length.partial_cmp(&a.length).unwrap());
|
||||
if attempts.first().as_ref().unwrap().length == 0 {
|
||||
println!("Unable to extract content");
|
||||
break;
|
||||
return Err(ErrorKind::ReadabilityError(
|
||||
"Unable to extract content".into(),
|
||||
)
|
||||
.into());
|
||||
}
|
||||
article_content = attempts[0].article.clone();
|
||||
parse_successful = true;
|
||||
|
@ -2111,7 +2115,7 @@ impl Readability {
|
|||
false
|
||||
});
|
||||
self.article_node = Some(article_content);
|
||||
return;
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Reference in a new issue