From 3bf0719c8e4b45eb1f22541c7c6c0d888ebbfa1c Mon Sep 17 00:00:00 2001
From: Kenneth Gitere
Date: Mon, 18 Oct 2021 09:56:19 +0300
Subject: [PATCH] feat: add `fetch_html_from_puppeteer` fn

---
 Cargo.lock  |   9 +--
 Cargo.toml  |   1 +
 src/http.rs | 178 ++++++++++++++++++++++++++++++++++++++++++----------
 3 files changed, 151 insertions(+), 37 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 6b176cd..6936461 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1573,6 +1573,7 @@ dependencies = [
  "log 0.4.14",
  "md5",
  "regex",
+ "serde",
  "surf",
  "thiserror",
  "url",
@@ -2020,18 +2021,18 @@ checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"

 [[package]]
 name = "serde"
-version = "1.0.125"
+version = "1.0.130"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "558dc50e1a5a5fa7112ca2ce4effcb321b0300c0d4ccf0776a9f60cd89031171"
+checksum = "f12d06de37cf59146fbdecab66aa99f9fe4f78722e3607577a5375d66bd0c913"
 dependencies = [
  "serde_derive",
 ]

 [[package]]
 name = "serde_derive"
-version = "1.0.125"
+version = "1.0.130"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b093b7a2bb58203b5da3056c05b4ec1fed827dcfdb37347a8841695263b3d06d"
+checksum = "d7bc1a1ab1961464eae040d96713baa5a724a8152c1222492465b54322ec508b"
 dependencies = [
  "proc-macro2",
  "quote",
diff --git a/Cargo.toml b/Cargo.toml
index 5761beb..b40409f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -32,6 +32,7 @@ lazy_static = "1.4.0"
 log = "0.4.14"
 md5 = "0.7.0"
 regex = "1.5.4"
+serde = "1.0.130"
 surf = "2.2.0"
 thiserror = "1.0.25"
 url = "2.2.2"
diff --git a/src/http.rs b/src/http.rs
index 1a1206d..b3140d9 100644
--- a/src/http.rs
+++ b/src/http.rs
@@ -5,6 +5,7 @@ use futures::StreamExt;
 use indicatif::ProgressBar;
 use log::warn;
 use log::{debug, info};
+use serde::{Deserialize, Serialize};
 use url::Url;

 use crate::cli::AppConfig;
@@ -22,51 +23,162 @@ pub fn download(
         let urls_iter = app_config.urls.iter().map(|url| fetch_html(url));
         let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn);
         let mut articles = Vec::new();
+        // Collect all URLs that failed extraction here so they can be retried
+        let mut retry_with_paperteer: Vec<String> = Vec::new();
         while let Some(fetch_result) = responses.next().await {
             match fetch_result {
                 Ok((url, html)) => {
-                    debug!("Extracting {}", &url);
-                    let mut extractor = Article::from_html(&html, &url);
-                    bar.set_message("Extracting...");
-                    match extractor.extract_content() {
-                        Ok(_) => {
-                            extractor.extract_img_urls();
-                            if let Err(img_errors) =
-                                download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar)
-                                    .await
-                            {
-                                partial_downloads
-                                    .push(PartialDownload::new(&url, extractor.metadata().title()));
-                                warn!(
-                                    "{} image{} failed to download for {}",
-                                    img_errors.len(),
-                                    if img_errors.len() > 1 { "s" } else { "" },
-                                    url
-                                );
-                                for img_error in img_errors {
-                                    warn!(
-                                        "{}\n\t\tReason {}",
-                                        img_error.url().as_ref().unwrap(),
-                                        img_error
-                                    );
-                                }
-                            }
-                            articles.push(extractor);
-                        }
-                        Err(mut e) => {
-                            e.set_article_source(&url);
-                            errors.push(e);
-                        }
+                    match extract_and_download_imgs(
+                        &url,
+                        html,
+                        bar,
+                        partial_downloads,
+                        &mut articles,
+                    )
+                    .await
+                    {
+                        Ok(_) => bar.inc(1),
+
+                        // All errors are pushed here since they are readability issues.
+                        Err(_) => retry_with_paperteer.push(url),
                     }
+
+                    // Outside the stream, a new request is made to retry these URLs with paperteer
                 }
                 Err(e) => errors.push(e),
             }
-            bar.inc(1);
+        }
+        if !retry_with_paperteer.is_empty() {
+            fetch_html_from_paperteer(
+                retry_with_paperteer,
+                app_config,
+                bar,
+                partial_downloads,
+                errors,
+                &mut articles,
+            )
+            .await
+            .unwrap();
         }
         articles
     })
 }

+async fn extract_and_download_imgs<'a>(
+    url: &str,
+    html: String,
+    bar: &ProgressBar,
+    partial_downloads: &mut Vec<PartialDownload>,
+    articles: &mut Vec<Article>,
+) -> Result<(), PaperoniError> {
+    debug!("Extracting {}", &url);
+    let mut extractor = Article::from_html(&html, &url);
+    bar.set_message("Extracting...");
+    match extractor.extract_content() {
+        Ok(_) => {
+            extractor.extract_img_urls();
+            if let Err(img_errors) =
+                download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar).await
+            {
+                partial_downloads.push(PartialDownload::new(&url, extractor.metadata().title()));
+                warn!(
+                    "{} image{} failed to download for {}",
+                    img_errors.len(),
+                    if img_errors.len() > 1 { "s" } else { "" },
+                    &url
+                );
+                for img_error in img_errors {
+                    warn!(
+                        "{}\n\t\tReason {}",
+                        img_error.url().as_ref().unwrap(),
+                        img_error
+                    );
+                }
+            }
+            articles.push(extractor);
+            Ok(())
+        }
+        Err(mut e) => {
+            e.set_article_source(&url);
+            Err(e)
+        }
+    }
+}
+
+#[derive(Serialize, Deserialize)]
+struct PaperteerBody {
+    urls: Vec<String>,
+}
+
+impl PaperteerBody {
+    fn new(urls: Vec<String>) -> Self {
+        PaperteerBody { urls }
+    }
+}
+
+#[derive(Serialize, Deserialize)]
+struct PaperteerItem {
+    url: String,
+    response: String,
+    html: String,
+}
+
+#[derive(Serialize, Deserialize)]
+struct PaperteerResponse {
+    data: Vec<PaperteerItem>,
+}
+
+// TODO: Change signature to simply take a vec of urls and return a vec of urls with either html or an error
+// This also means that extracting and downloading imgs should be handled externally
+async fn fetch_html_from_paperteer(
+    urls: Vec<String>,
+    app_config: &AppConfig,
+    bar: &ProgressBar,
+    partial_downloads: &mut Vec<PartialDownload>,
+    errors: &mut Vec<PaperoniError>,
+    articles: &mut Vec<Article>,
+) -> Result<(), ()> {
+    // Get the paperteer url
+    let render_endpoint = "/api/render";
+    let paperteer_url = url::Url::parse("http://localhost:3000")
+        .unwrap()
+        .join(render_endpoint)
+        .unwrap();
+
+    // Build request body with urls
+    let urls_str = urls.into_iter().map(|url| url.to_string()).collect();
+    let body = PaperteerBody::new(urls_str);
+
+    // Send to the paperteer url
+    let mut res = surf::post(paperteer_url)
+        .body(surf::Body::from_json(&body).unwrap())
+        .await
+        .unwrap();
+
+    // Receive the json response
+    // TODO: Check for body response
+    let PaperteerResponse { data } = res.body_json().await.unwrap();
+
+    // For each url, extract the article and images
+    for item in data {
+        let PaperteerItem {
+            html,
+            url,
+            response,
+        } = item;
+        if response == "ok" {
+            // Run the extract and download fn
+            match extract_and_download_imgs(&url, html, bar, partial_downloads, articles).await {
+                Ok(_) => bar.inc(1),
+                Err(e) => errors.push(e),
+            }
+        } else {
+            errors.push(crate::errors::ErrorKind::HTTPError("Paperteer failed".into()).into());
+        }
+    }
+    Ok(())
+}
+
 pub async fn fetch_html(url: &str) -> Result {
     let client = surf::Client::new();
     debug!("Fetching {}", url);