dev: update packages and disable paperteer features

feat: add reinsertion of title as <h1> requested in #22
feat: add `fetch_html_from_puppeteer` fn
2022-02-01 20:16:29 +03:00 · 2021-12-30 07:58:19 +03:00 · 2021-10-18 10:03:09 +03:00
6 changed files with 531 additions and 390 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,25 +13,26 @@ readme = "README.md"

 [dependencies]
 # atty = "0.2.14"
-async-std = "1.9.0"
+async-std = "1.10.0"
 base64 = "0.13.0"
 chrono = "0.4.19"
-clap = { version = "2.33.3", features = ["yaml"] }
+clap = { version = "2.34.0", features = ["yaml"] }
 colored = "2.0.0"
 comfy-table = "3.0.0"
 derive_builder = "0.10.2"
 directories = "3.0.2"
-epub-builder = "0.4.8"
-flexi_logger = "0.18.0"
-futures = "0.3.15"
+epub-builder = "0.4.10"
+flexi_logger = "0.22.2"
+futures = "0.3.19"
 html5ever = "0.25.1"
 indicatif = "0.16.2"
-itertools = "0.10.1"
+itertools = "0.10.3"
 kuchiki = "0.8.1"
 lazy_static = "1.4.0"
 log = "0.4.14"
 md5 = "0.7.0"
 regex = "1.5.4"
-surf = "2.2.0"
-thiserror = "1.0.25"
+serde = "1.0.136"
+surf = "2.3.2"
+thiserror = "1.0.30"
 url = "2.2.2"
--- a/2
+++ b/2
@@ -1 +1 @@
-1.52.1
+1.57.0
--- a/src/extractor.rs
+++ b/src/extractor.rs
@@ -1,8 +1,11 @@
+use std::collections::BTreeMap;
+
+use html5ever::{LocalName, Namespace, QualName};
 use itertools::Itertools;
 use kuchiki::{traits::*, NodeRef};

 use crate::errors::PaperoniError;
-use crate::moz_readability::{MetaData, Readability};
+use crate::moz_readability::{MetaData, Readability, HTML_NS};

 /// A tuple of the url and an Option of the resource's MIME type
 pub type ResourceInfo = (String, Option<String>);
@@ -29,6 +32,7 @@ impl Article {
    /// the source of the content
    pub fn extract_content(&mut self) -> Result<(), PaperoniError> {
        self.readability.parse(&self.url)?;
+        self.reinsert_title_heading();
        if let Some(article_node_ref) = &self.readability.article_node {
            let template = r#"
            <!DOCTYPE html>
@@ -74,6 +78,20 @@ impl Article {
        )
    }

+    fn reinsert_title_heading(&mut self) {
+        if let Some(article_node_ref) = &self.readability.article_node {
+            if let Ok(article_root_ref) = article_node_ref.select_first("div#readability-page-1") {
+                let article_root_elem = article_root_ref.as_node();
+                let h1_elem = NodeRef::new_element(
+                    QualName::new(None, Namespace::from(HTML_NS), LocalName::from("h1")),
+                    BTreeMap::new(),
+                );
+                h1_elem.append(NodeRef::new_text(self.readability.metadata.title()));
+                article_root_elem.prepend(h1_elem);
+            };
+        }
+    }
+
    pub fn metadata(&self) -> &MetaData {
        &self.readability.metadata
    }
--- a/src/http.rs
+++ b/src/http.rs
@@ -5,6 +5,7 @@ use futures::StreamExt;
 use indicatif::ProgressBar;
 use log::warn;
 use log::{debug, info};
+use serde::{Deserialize, Serialize};
 use url::Url;

 use crate::cli::AppConfig;
@@ -22,9 +23,54 @@ pub fn download(
        let urls_iter = app_config.urls.iter().map(|url| fetch_html(url));
        let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn);
        let mut articles = Vec::new();
+        // Collect all urls that couldn't extract here
+        // let mut retry_with_paperteer: Vec<String> = Vec::new();
        while let Some(fetch_result) = responses.next().await {
            match fetch_result {
                Ok((url, html)) => {
+                    match extract_and_download_imgs(
+                        &url,
+                        html,
+                        bar,
+                        partial_downloads,
+                        &mut articles,
+                    )
+                    .await
+                    {
+                        Ok(_) => bar.inc(1),
+
+                        // All errors are pushed into here since they're readability issues.
+                        Err(e) => errors.push(e),
+                    }
+
+                    // Outside the stream, make a new one to retry with paperteer
+                }
+                Err(e) => errors.push(e),
+            }
+        }
+        // if !retry_with_paperteer.is_empty() {
+        //     fetch_html_from_paperteer(
+        //         retry_with_paperteer,
+        //         app_config,
+        //         bar,
+        //         partial_downloads,
+        //         errors,
+        //         &mut articles,
+        //     )
+        //     .await
+        //     .unwrap();
+        // }
+        articles
+    })
+}
+
+async fn extract_and_download_imgs<'a>(
+    url: &str,
+    html: String,
+    bar: &ProgressBar,
+    partial_downloads: &mut Vec<PartialDownload>,
+    articles: &mut Vec<Article>,
+) -> Result<(), PaperoniError> {
    debug!("Extracting {}", &url);
    let mut extractor = Article::from_html(&html, &url);
    bar.set_message("Extracting...");
@@ -32,16 +78,14 @@ pub fn download(
        Ok(_) => {
            extractor.extract_img_urls();
            if let Err(img_errors) =
-                                download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar)
-                                    .await
+                download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar).await
            {
-                                partial_downloads
-                                    .push(PartialDownload::new(&url, extractor.metadata().title()));
+                partial_downloads.push(PartialDownload::new(&url, extractor.metadata().title()));
                warn!(
                    "{} image{} failed to download for {}",
                    img_errors.len(),
                    if img_errors.len() > 1 { "s" } else { "" },
-                                    url
+                    &url
                );
                for img_error in img_errors {
                    warn!(
@@ -52,19 +96,87 @@ pub fn download(
                }
            }
            articles.push(extractor);
+            Ok(())
        }
        Err(mut e) => {
            e.set_article_source(&url);
-                            errors.push(e);
+            Err(e)
        }
    }
+}
+
+#[derive(Serialize, Deserialize)]
+struct PaperteerBody {
+    urls: Vec<String>,
+}
+
+impl PaperteerBody {
+    fn new(urls: Vec<String>) -> Self {
+        PaperteerBody { urls }
    }
+}
+
+#[derive(Serialize, Deserialize)]
+struct PaperteerItem {
+    url: String,
+    response: String,
+    html: String,
+}
+
+#[derive(Serialize, Deserialize)]
+struct PaperteerResponse {
+    data: Vec<PaperteerItem>,
+}
+
+// TODO: Change signature to simply take a vec of urls and return a vec of urls with either html or an error
+// This also means that extracting and downloading imgs should be handled externally
+async fn _fetch_html_from_paperteer(
+    urls: Vec<String>,
+    _app_config: &AppConfig,
+    bar: &ProgressBar,
+    partial_downloads: &mut Vec<PartialDownload>,
+    errors: &mut Vec<PaperoniError>,
+    articles: &mut Vec<Article>,
+) -> Result<(), ()> {
+    // Get the paperteer url
+    let render_endpoint = "/api/render";
+    let paperteer_url = url::Url::parse("http://localhost:3000")
+        .unwrap()
+        .join(render_endpoint)
+        .unwrap();
+
+    // Build request body with urls
+    let urls_str = urls.into_iter().map(|url| url.to_string()).collect();
+    let body = PaperteerBody::new(urls_str);
+
+    // Send to the paperteer url
+    let mut res = surf::post(paperteer_url)
+        .body(surf::Body::from_json(&body).unwrap())
+        .await
+        .unwrap();
+
+    // Receive the json response
+    // TODO: Check for body response
+    let PaperteerResponse { data } = res.body_json().await.unwrap();
+
+    // For each url, extract the article and images
+    for item in data {
+        let PaperteerItem {
+            html,
+            url,
+            response,
+        } = item;
+        if response == "ok" {
+            // Run the extract and download fn
+            match extract_and_download_imgs(&url, html, bar, partial_downloads, articles).await {
+                Ok(_) => bar.inc(1),
                Err(e) => errors.push(e),
            }
-            bar.inc(1);
+        } else {
+            errors.push(crate::errors::ErrorKind::HTTPError("Paperteer failed".into()).into());
        }
-        articles
-    })
+    }
+    Ok(())
 }

 pub async fn fetch_html(url: &str) -> Result<HTMLResource, PaperoniError> {
--- a/src/moz_readability/mod.rs
+++ b/src/moz_readability/mod.rs
@@ -17,7 +17,7 @@ const FLAG_STRIP_UNLIKELYS: u32 = 0x1;
 const FLAG_WEIGHT_CLASSES: u32 = 0x2;
 const FLAG_CLEAN_CONDITIONALLY: u32 = 0x4;
 const READABILITY_SCORE: &'static str = "readability-score";
-const HTML_NS: &'static str = "http://www.w3.org/1999/xhtml";
+pub const HTML_NS: &'static str = "http://www.w3.org/1999/xhtml";
 // TODO: Change to HashSet
 const PHRASING_ELEMS: [&str; 39] = [
    "abbr", "audio", "b", "bdo", "br", "button", "cite", "code", "data", "datalist", "dfn", "em",
Author	SHA1	Message	Date
Kenneth Gitere	abaa7d37df	dev: update packages and disable paperteer features	2022-02-01 20:16:29 +03:00
Kenneth Gitere	e777426c1b	feat: add reinsertion of title as <h1> requested in #22	2021-12-30 07:58:19 +03:00
Kenneth Gitere	3bf0719c8e	feat: add `fetch_html_from_puppeteer` fn	2021-10-18 10:03:09 +03:00
@@ -1 +1 @@
-1.52.1
+1.57.0