feat: add fetch_html_from_puppeteer fn

Kenneth Gitere 2021-10-18 09:56:19 +03:00
parent 796a34a34c
commit 3bf0719c8e
3 changed files with 151 additions and 37 deletions
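The source changes further down add a fallback path: URLs whose readability extraction fails are collected and POSTed to a local rendering service at http://localhost:3000/api/render. The service itself is not part of this commit, so its wire format is only implied by the new PaperteerBody, PaperteerItem and PaperteerResponse structs. The sketch below illustrates that implied JSON exchange; the RenderRequest/RenderItem/RenderResponse names, the serde_json calls and the sample values are illustrative assumptions, not code from this commit.

// Hypothetical sketch of the request/response shapes implied by the structs
// added in this commit. serde_json and the sample values are assumptions
// for illustration only.
use serde::{Deserialize, Serialize};

#[derive(Serialize)]
struct RenderRequest {
    // mirrors PaperteerBody: the body POSTed to /api/render
    urls: Vec<String>,
}

#[derive(Deserialize)]
struct RenderItem {
    url: String,
    // checked against "ok" in fetch_html_from_paperteer
    response: String,
    html: String,
}

#[derive(Deserialize)]
struct RenderResponse {
    // mirrors PaperteerResponse: one item per requested URL
    data: Vec<RenderItem>,
}

fn main() {
    let request = RenderRequest {
        urls: vec!["https://example.com/article".to_string()],
    };
    // => {"urls":["https://example.com/article"]}
    println!("{}", serde_json::to_string(&request).unwrap());

    // Expected shape of the JSON the service returns
    let raw = r#"{"data":[{"url":"https://example.com/article","response":"ok","html":"<html>...</html>"}]}"#;
    let parsed: RenderResponse = serde_json::from_str(raw).unwrap();
    assert_eq!(parsed.data[0].response, "ok");
    println!(
        "{} bytes of rendered HTML for {}",
        parsed.data[0].html.len(),
        parsed.data[0].url
    );
}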

Cargo.lock (generated, 9 lines changed)

@@ -1573,6 +1573,7 @@ dependencies = [
  "log 0.4.14",
  "md5",
  "regex",
+ "serde",
  "surf",
  "thiserror",
  "url",
@@ -2020,18 +2021,18 @@ checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"

 [[package]]
 name = "serde"
-version = "1.0.125"
+version = "1.0.130"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "558dc50e1a5a5fa7112ca2ce4effcb321b0300c0d4ccf0776a9f60cd89031171"
+checksum = "f12d06de37cf59146fbdecab66aa99f9fe4f78722e3607577a5375d66bd0c913"
 dependencies = [
  "serde_derive",
 ]

 [[package]]
 name = "serde_derive"
-version = "1.0.125"
+version = "1.0.130"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b093b7a2bb58203b5da3056c05b4ec1fed827dcfdb37347a8841695263b3d06d"
+checksum = "d7bc1a1ab1961464eae040d96713baa5a724a8152c1222492465b54322ec508b"
 dependencies = [
  "proc-macro2",
  "quote",

Cargo.toml

@@ -32,6 +32,7 @@ lazy_static = "1.4.0"
 log = "0.4.14"
 md5 = "0.7.0"
 regex = "1.5.4"
+serde = "1.0.130"
 surf = "2.2.0"
 thiserror = "1.0.25"
 url = "2.2.2"

src/http.rs

@@ -5,6 +5,7 @@ use futures::StreamExt;
 use indicatif::ProgressBar;
 use log::warn;
 use log::{debug, info};
+use serde::{Deserialize, Serialize};
 use url::Url;

 use crate::cli::AppConfig;
@@ -22,51 +23,162 @@ pub fn download(
         let urls_iter = app_config.urls.iter().map(|url| fetch_html(url));
         let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn);
         let mut articles = Vec::new();
+        // Collect all urls that couldn't extract here
+        let mut retry_with_paperteer: Vec<String> = Vec::new();
         while let Some(fetch_result) = responses.next().await {
             match fetch_result {
                 Ok((url, html)) => {
-                    debug!("Extracting {}", &url);
-                    let mut extractor = Article::from_html(&html, &url);
-                    bar.set_message("Extracting...");
-                    match extractor.extract_content() {
-                        Ok(_) => {
-                            extractor.extract_img_urls();
-                            if let Err(img_errors) =
-                                download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar)
-                                    .await
-                            {
-                                partial_downloads
-                                    .push(PartialDownload::new(&url, extractor.metadata().title()));
-                                warn!(
-                                    "{} image{} failed to download for {}",
-                                    img_errors.len(),
-                                    if img_errors.len() > 1 { "s" } else { "" },
-                                    url
-                                );
-                                for img_error in img_errors {
-                                    warn!(
-                                        "{}\n\t\tReason {}",
-                                        img_error.url().as_ref().unwrap(),
-                                        img_error
-                                    );
-                                }
-                            }
-                            articles.push(extractor);
-                        }
-                        Err(mut e) => {
-                            e.set_article_source(&url);
-                            errors.push(e);
-                        }
-                    }
+                    match extract_and_download_imgs(
+                        &url,
+                        html,
+                        bar,
+                        partial_downloads,
+                        &mut articles,
+                    )
+                    .await
+                    {
+                        Ok(_) => bar.inc(1),
+                        // All errors are pushed into here since they're readability issues.
+                        Err(_) => retry_with_paperteer.push(url),
+                    }
+                    // Outside the stream, make a new one to retry with paperteer
                 }
                 Err(e) => errors.push(e),
             }
-            bar.inc(1);
         }
+        if !retry_with_paperteer.is_empty() {
+            fetch_html_from_paperteer(
+                retry_with_paperteer,
+                app_config,
+                bar,
+                partial_downloads,
+                errors,
+                &mut articles,
+            )
+            .await
+            .unwrap();
+        }
         articles
     })
 }
+
+async fn extract_and_download_imgs<'a>(
+    url: &str,
+    html: String,
+    bar: &ProgressBar,
+    partial_downloads: &mut Vec<PartialDownload>,
+    articles: &mut Vec<Article>,
+) -> Result<(), PaperoniError> {
+    debug!("Extracting {}", &url);
+    let mut extractor = Article::from_html(&html, &url);
+    bar.set_message("Extracting...");
+    match extractor.extract_content() {
+        Ok(_) => {
+            extractor.extract_img_urls();
+            if let Err(img_errors) =
+                download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar).await
+            {
+                partial_downloads.push(PartialDownload::new(&url, extractor.metadata().title()));
+                warn!(
+                    "{} image{} failed to download for {}",
+                    img_errors.len(),
+                    if img_errors.len() > 1 { "s" } else { "" },
+                    &url
+                );
+                for img_error in img_errors {
+                    warn!(
+                        "{}\n\t\tReason {}",
+                        img_error.url().as_ref().unwrap(),
+                        img_error
+                    );
+                }
+            }
+            articles.push(extractor);
+            Ok(())
+        }
+        Err(mut e) => {
+            e.set_article_source(&url);
+            Err(e)
+        }
+    }
+}
+
+#[derive(Serialize, Deserialize)]
+struct PaperteerBody {
+    urls: Vec<String>,
+}
+
+impl PaperteerBody {
+    fn new(urls: Vec<String>) -> Self {
+        PaperteerBody { urls }
+    }
+}
+
+#[derive(Serialize, Deserialize)]
+struct PaperteerItem {
+    url: String,
+    response: String,
+    html: String,
+}
+
+#[derive(Serialize, Deserialize)]
+struct PaperteerResponse {
+    data: Vec<PaperteerItem>,
+}
+
+// TODO: Change signature to simply take a vec of urls and return a vec of urls with either html or an error
+// This also means that extracting and downloading imgs should be handled externally
+async fn fetch_html_from_paperteer(
+    urls: Vec<String>,
+    app_config: &AppConfig,
+    bar: &ProgressBar,
+    partial_downloads: &mut Vec<PartialDownload>,
+    errors: &mut Vec<PaperoniError>,
+    articles: &mut Vec<Article>,
+) -> Result<(), ()> {
+    // Get the paperteer url
+    let render_endpoint = "/api/render";
+    let paperteer_url = url::Url::parse("http://localhost:3000")
+        .unwrap()
+        .join(render_endpoint)
+        .unwrap();
+    // Build request body with urls
+    let urls_str = urls.into_iter().map(|url| url.to_string()).collect();
+    let body = PaperteerBody::new(urls_str);
+    // Send to the paperteer url
+    let mut res = surf::post(paperteer_url)
+        .body(surf::Body::from_json(&body).unwrap())
+        .await
+        .unwrap();
+    // Receive the json response
+    // TODO: Check for body response
+    let PaperteerResponse { data } = res.body_json().await.unwrap();
+    // For each url, extract the article and images
+    for item in data {
+        let PaperteerItem {
+            html,
+            url,
+            response,
+        } = item;
+        if response == "ok" {
+            // Run the extract and download fn
+            match extract_and_download_imgs(&url, html, bar, partial_downloads, articles).await {
+                Ok(_) => bar.inc(1),
+                Err(e) => errors.push(e),
+            }
+        } else {
+            errors.push(crate::errors::ErrorKind::HTTPError("Paperteer failed".into()).into());
+        }
+    }
+    Ok(())
+}

 pub async fn fetch_html(url: &str) -> Result<HTMLResource, PaperoniError> {
     let client = surf::Client::new();
     debug!("Fetching {}", url);