Add url field in Extractor struct
This commit is contained in:
parent
b217448601
commit
60fb30e8a2
2 changed files with 9 additions and 7 deletions
|
@ -15,22 +15,24 @@ pub struct Extractor {
|
||||||
article: Option<NodeRef>,
|
article: Option<NodeRef>,
|
||||||
pub img_urls: Vec<ResourceInfo>,
|
pub img_urls: Vec<ResourceInfo>,
|
||||||
readability: Readability,
|
readability: Readability,
|
||||||
|
pub url: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Extractor {
|
impl Extractor {
|
||||||
/// Create a new instance of an HTML extractor given an HTML string
|
/// Create a new instance of an HTML extractor given an HTML string
|
||||||
pub fn from_html(html_str: &str) -> Self {
|
pub fn from_html(html_str: &str, url: &str) -> Self {
|
||||||
Extractor {
|
Extractor {
|
||||||
article: None,
|
article: None,
|
||||||
img_urls: Vec::new(),
|
img_urls: Vec::new(),
|
||||||
readability: Readability::new(html_str),
|
readability: Readability::new(html_str),
|
||||||
|
url: url.to_string(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Locates and extracts the HTML in a document which is determined to be
|
/// Locates and extracts the HTML in a document which is determined to be
|
||||||
/// the source of the content
|
/// the source of the content
|
||||||
pub fn extract_content(&mut self, url: &str) {
|
pub fn extract_content(&mut self) {
|
||||||
self.readability.parse(url);
|
self.readability.parse(&self.url);
|
||||||
if let Some(article_node_ref) = &self.readability.article_node {
|
if let Some(article_node_ref) = &self.readability.article_node {
|
||||||
let template = r#"
|
let template = r#"
|
||||||
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
|
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
|
||||||
|
@ -157,8 +159,8 @@ mod test {
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_extract_img_urls() {
|
fn test_extract_img_urls() {
|
||||||
let mut extractor = Extractor::from_html(TEST_HTML);
|
let mut extractor = Extractor::from_html(TEST_HTML, "http://example.com/");
|
||||||
extractor.extract_content("http://example.com/");
|
extractor.extract_content();
|
||||||
extractor.extract_img_urls();
|
extractor.extract_img_urls();
|
||||||
|
|
||||||
assert!(extractor.img_urls.len() > 0);
|
assert!(extractor.img_urls.len() > 0);
|
||||||
|
|
|
@ -44,9 +44,9 @@ fn download(app_config: AppConfig) {
|
||||||
match fetch_result {
|
match fetch_result {
|
||||||
Ok((url, html)) => {
|
Ok((url, html)) => {
|
||||||
// println!("Extracting");
|
// println!("Extracting");
|
||||||
let mut extractor = Extractor::from_html(&html);
|
let mut extractor = Extractor::from_html(&html, &url);
|
||||||
bar.set_message("Extracting...");
|
bar.set_message("Extracting...");
|
||||||
extractor.extract_content(&url);
|
extractor.extract_content();
|
||||||
|
|
||||||
if extractor.article().is_some() {
|
if extractor.article().is_some() {
|
||||||
extractor.extract_img_urls();
|
extractor.extract_img_urls();
|
||||||
|
|
Reference in a new issue