feat: add fetch_html_from_puppeteer
fn
This commit is contained in:
parent
796a34a34c
commit
3bf0719c8e
3 changed files with 151 additions and 37 deletions
9
Cargo.lock
generated
9
Cargo.lock
generated
|
@ -1573,6 +1573,7 @@ dependencies = [
|
||||||
"log 0.4.14",
|
"log 0.4.14",
|
||||||
"md5",
|
"md5",
|
||||||
"regex",
|
"regex",
|
||||||
|
"serde",
|
||||||
"surf",
|
"surf",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
"url",
|
"url",
|
||||||
|
@ -2020,18 +2021,18 @@ checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "serde"
|
name = "serde"
|
||||||
version = "1.0.125"
|
version = "1.0.130"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "558dc50e1a5a5fa7112ca2ce4effcb321b0300c0d4ccf0776a9f60cd89031171"
|
checksum = "f12d06de37cf59146fbdecab66aa99f9fe4f78722e3607577a5375d66bd0c913"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"serde_derive",
|
"serde_derive",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "serde_derive"
|
name = "serde_derive"
|
||||||
version = "1.0.125"
|
version = "1.0.130"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "b093b7a2bb58203b5da3056c05b4ec1fed827dcfdb37347a8841695263b3d06d"
|
checksum = "d7bc1a1ab1961464eae040d96713baa5a724a8152c1222492465b54322ec508b"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
|
|
|
@ -32,6 +32,7 @@ lazy_static = "1.4.0"
|
||||||
log = "0.4.14"
|
log = "0.4.14"
|
||||||
md5 = "0.7.0"
|
md5 = "0.7.0"
|
||||||
regex = "1.5.4"
|
regex = "1.5.4"
|
||||||
|
serde = "1.0.130"
|
||||||
surf = "2.2.0"
|
surf = "2.2.0"
|
||||||
thiserror = "1.0.25"
|
thiserror = "1.0.25"
|
||||||
url = "2.2.2"
|
url = "2.2.2"
|
||||||
|
|
130
src/http.rs
130
src/http.rs
|
@ -5,6 +5,7 @@ use futures::StreamExt;
|
||||||
use indicatif::ProgressBar;
|
use indicatif::ProgressBar;
|
||||||
use log::warn;
|
use log::warn;
|
||||||
use log::{debug, info};
|
use log::{debug, info};
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
use url::Url;
|
use url::Url;
|
||||||
|
|
||||||
use crate::cli::AppConfig;
|
use crate::cli::AppConfig;
|
||||||
|
@ -22,9 +23,54 @@ pub fn download(
|
||||||
let urls_iter = app_config.urls.iter().map(|url| fetch_html(url));
|
let urls_iter = app_config.urls.iter().map(|url| fetch_html(url));
|
||||||
let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn);
|
let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn);
|
||||||
let mut articles = Vec::new();
|
let mut articles = Vec::new();
|
||||||
|
// Collect all urls whose articles couldn't be extracted here
|
||||||
|
let mut retry_with_paperteer: Vec<String> = Vec::new();
|
||||||
while let Some(fetch_result) = responses.next().await {
|
while let Some(fetch_result) = responses.next().await {
|
||||||
match fetch_result {
|
match fetch_result {
|
||||||
Ok((url, html)) => {
|
Ok((url, html)) => {
|
||||||
|
match extract_and_download_imgs(
|
||||||
|
&url,
|
||||||
|
html,
|
||||||
|
bar,
|
||||||
|
partial_downloads,
|
||||||
|
&mut articles,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(_) => bar.inc(1),
|
||||||
|
|
||||||
|
// All errors are pushed into here since they're readability issues.
|
||||||
|
Err(_) => retry_with_paperteer.push(url),
|
||||||
|
}
|
||||||
|
|
||||||
|
// Outside the stream, make a new one to retry with paperteer
|
||||||
|
}
|
||||||
|
Err(e) => errors.push(e),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !retry_with_paperteer.is_empty() {
|
||||||
|
fetch_html_from_paperteer(
|
||||||
|
retry_with_paperteer,
|
||||||
|
app_config,
|
||||||
|
bar,
|
||||||
|
partial_downloads,
|
||||||
|
errors,
|
||||||
|
&mut articles,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
}
|
||||||
|
articles
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn extract_and_download_imgs<'a>(
|
||||||
|
url: &str,
|
||||||
|
html: String,
|
||||||
|
bar: &ProgressBar,
|
||||||
|
partial_downloads: &mut Vec<PartialDownload>,
|
||||||
|
articles: &mut Vec<Article>,
|
||||||
|
) -> Result<(), PaperoniError> {
|
||||||
debug!("Extracting {}", &url);
|
debug!("Extracting {}", &url);
|
||||||
let mut extractor = Article::from_html(&html, &url);
|
let mut extractor = Article::from_html(&html, &url);
|
||||||
bar.set_message("Extracting...");
|
bar.set_message("Extracting...");
|
||||||
|
@ -32,16 +78,14 @@ pub fn download(
|
||||||
Ok(_) => {
|
Ok(_) => {
|
||||||
extractor.extract_img_urls();
|
extractor.extract_img_urls();
|
||||||
if let Err(img_errors) =
|
if let Err(img_errors) =
|
||||||
download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar)
|
download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar).await
|
||||||
.await
|
|
||||||
{
|
{
|
||||||
partial_downloads
|
partial_downloads.push(PartialDownload::new(&url, extractor.metadata().title()));
|
||||||
.push(PartialDownload::new(&url, extractor.metadata().title()));
|
|
||||||
warn!(
|
warn!(
|
||||||
"{} image{} failed to download for {}",
|
"{} image{} failed to download for {}",
|
||||||
img_errors.len(),
|
img_errors.len(),
|
||||||
if img_errors.len() > 1 { "s" } else { "" },
|
if img_errors.len() > 1 { "s" } else { "" },
|
||||||
url
|
&url
|
||||||
);
|
);
|
||||||
for img_error in img_errors {
|
for img_error in img_errors {
|
||||||
warn!(
|
warn!(
|
||||||
|
@ -52,19 +96,87 @@ pub fn download(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
articles.push(extractor);
|
articles.push(extractor);
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
Err(mut e) => {
|
Err(mut e) => {
|
||||||
e.set_article_source(&url);
|
e.set_article_source(&url);
|
||||||
errors.push(e);
|
Err(e)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize)]
|
||||||
|
struct PaperteerBody {
|
||||||
|
urls: Vec<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PaperteerBody {
|
||||||
|
fn new(urls: Vec<String>) -> Self {
|
||||||
|
PaperteerBody { urls }
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize)]
|
||||||
|
struct PaperteerItem {
|
||||||
|
url: String,
|
||||||
|
response: String,
|
||||||
|
html: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize)]
|
||||||
|
struct PaperteerResponse {
|
||||||
|
data: Vec<PaperteerItem>,
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: Change signature to simply take a vec of urls and return a vec of urls with either html or an error
|
||||||
|
// This also means that extracting and downloading imgs should be handled externally
|
||||||
|
async fn fetch_html_from_paperteer(
|
||||||
|
urls: Vec<String>,
|
||||||
|
app_config: &AppConfig,
|
||||||
|
bar: &ProgressBar,
|
||||||
|
partial_downloads: &mut Vec<PartialDownload>,
|
||||||
|
errors: &mut Vec<PaperoniError>,
|
||||||
|
articles: &mut Vec<Article>,
|
||||||
|
) -> Result<(), ()> {
|
||||||
|
// Get the paperteer url
|
||||||
|
let render_endpoint = "/api/render";
|
||||||
|
let paperteer_url = url::Url::parse("http://localhost:3000")
|
||||||
|
.unwrap()
|
||||||
|
.join(render_endpoint)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
// Build request body with urls
|
||||||
|
let urls_str = urls.into_iter().map(|url| url.to_string()).collect();
|
||||||
|
let body = PaperteerBody::new(urls_str);
|
||||||
|
|
||||||
|
// Send to the paperteer url
|
||||||
|
let mut res = surf::post(paperteer_url)
|
||||||
|
.body(surf::Body::from_json(&body).unwrap())
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
// Receive the json response
|
||||||
|
// TODO: Check for body response
|
||||||
|
let PaperteerResponse { data } = res.body_json().await.unwrap();
|
||||||
|
|
||||||
|
// For each url, extract the article and images
|
||||||
|
for item in data {
|
||||||
|
let PaperteerItem {
|
||||||
|
html,
|
||||||
|
url,
|
||||||
|
response,
|
||||||
|
} = item;
|
||||||
|
if response == "ok" {
|
||||||
|
// Run the extract and download fn
|
||||||
|
match extract_and_download_imgs(&url, html, bar, partial_downloads, articles).await {
|
||||||
|
Ok(_) => bar.inc(1),
|
||||||
Err(e) => errors.push(e),
|
Err(e) => errors.push(e),
|
||||||
}
|
}
|
||||||
bar.inc(1);
|
} else {
|
||||||
|
errors.push(crate::errors::ErrorKind::HTTPError("Paperteer failed".into()).into());
|
||||||
}
|
}
|
||||||
articles
|
}
|
||||||
})
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn fetch_html(url: &str) -> Result<HTMLResource, PaperoniError> {
|
pub async fn fetch_html(url: &str) -> Result<HTMLResource, PaperoniError> {
|
||||||
|
|
Reference in a new issue