Add url field to Extractor struct

2021-04-20 21:06:54 +03:00 · 2021-04-20 21:06:54 +03:00 · 60fb30e8a2
commit 60fb30e8a2
parent b217448601
2 changed files with 9 additions and 7 deletions
--- a/src/extractor.rs
+++ b/src/extractor.rs
@@ -15,22 +15,24 @@ pub struct Extractor {
    article: Option<NodeRef>,
    pub img_urls: Vec<ResourceInfo>,
    readability: Readability,
+    pub url: String,
 }

 impl Extractor {
    /// Create a new instance of an HTML extractor given an HTML string
-    pub fn from_html(html_str: &str) -> Self {
+    pub fn from_html(html_str: &str, url: &str) -> Self {
        Extractor {
            article: None,
            img_urls: Vec::new(),
            readability: Readability::new(html_str),
+            url: url.to_string(),
        }
    }

    /// Locates and extracts the HTML in a document which is determined to be
    /// the source of the content
-    pub fn extract_content(&mut self, url: &str) {
-        self.readability.parse(url);
+    pub fn extract_content(&mut self) {
+        self.readability.parse(&self.url);
        if let Some(article_node_ref) = &self.readability.article_node {
            let template = r#"
            <html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
@@ -157,8 +159,8 @@ mod test {

    #[test]
    fn test_extract_img_urls() {
-        let mut extractor = Extractor::from_html(TEST_HTML);
-        extractor.extract_content("http://example.com/");
+        let mut extractor = Extractor::from_html(TEST_HTML, "http://example.com/");
+        extractor.extract_content();
        extractor.extract_img_urls();

        assert!(extractor.img_urls.len() > 0);
--- a/src/main.rs
+++ b/src/main.rs
@@ -44,9 +44,9 @@ fn download(app_config: AppConfig) {
            match fetch_result {
                Ok((url, html)) => {
                    // println!("Extracting");
-                    let mut extractor = Extractor::from_html(&html);
+                    let mut extractor = Extractor::from_html(&html, &url);
                    bar.set_message("Extracting...");
-                    extractor.extract_content(&url);
+                    extractor.extract_content();

                    if extractor.article().is_some() {
                        extractor.extract_img_urls();