Compare commits
3 commits
Author | SHA1 | Date | |
---|---|---|---|
|
abaa7d37df | ||
|
e777426c1b | ||
|
3bf0719c8e |
6 changed files with 531 additions and 390 deletions
702
Cargo.lock
generated
702
Cargo.lock
generated
File diff suppressed because it is too large
Load diff
17
Cargo.toml
17
Cargo.toml
|
@ -13,25 +13,26 @@ readme = "README.md"
|
|||
|
||||
[dependencies]
|
||||
# atty = "0.2.14"
|
||||
async-std = "1.9.0"
|
||||
async-std = "1.10.0"
|
||||
base64 = "0.13.0"
|
||||
chrono = "0.4.19"
|
||||
clap = { version = "2.33.3", features = ["yaml"] }
|
||||
clap = { version = "2.34.0", features = ["yaml"] }
|
||||
colored = "2.0.0"
|
||||
comfy-table = "3.0.0"
|
||||
derive_builder = "0.10.2"
|
||||
directories = "3.0.2"
|
||||
epub-builder = "0.4.8"
|
||||
flexi_logger = "0.18.0"
|
||||
futures = "0.3.15"
|
||||
epub-builder = "0.4.10"
|
||||
flexi_logger = "0.22.2"
|
||||
futures = "0.3.19"
|
||||
html5ever = "0.25.1"
|
||||
indicatif = "0.16.2"
|
||||
itertools = "0.10.1"
|
||||
itertools = "0.10.3"
|
||||
kuchiki = "0.8.1"
|
||||
lazy_static = "1.4.0"
|
||||
log = "0.4.14"
|
||||
md5 = "0.7.0"
|
||||
regex = "1.5.4"
|
||||
surf = "2.2.0"
|
||||
thiserror = "1.0.25"
|
||||
serde = "1.0.136"
|
||||
surf = "2.3.2"
|
||||
thiserror = "1.0.30"
|
||||
url = "2.2.2"
|
||||
|
|
|
@ -1 +1 @@
|
|||
1.52.1
|
||||
1.57.0
|
||||
|
|
|
@ -1,8 +1,11 @@
|
|||
use std::collections::BTreeMap;
|
||||
|
||||
use html5ever::{LocalName, Namespace, QualName};
|
||||
use itertools::Itertools;
|
||||
use kuchiki::{traits::*, NodeRef};
|
||||
|
||||
use crate::errors::PaperoniError;
|
||||
use crate::moz_readability::{MetaData, Readability};
|
||||
use crate::moz_readability::{MetaData, Readability, HTML_NS};
|
||||
|
||||
/// A tuple of the url and an Option of the resource's MIME type
|
||||
pub type ResourceInfo = (String, Option<String>);
|
||||
|
@ -29,6 +32,7 @@ impl Article {
|
|||
/// the source of the content
|
||||
pub fn extract_content(&mut self) -> Result<(), PaperoniError> {
|
||||
self.readability.parse(&self.url)?;
|
||||
self.reinsert_title_heading();
|
||||
if let Some(article_node_ref) = &self.readability.article_node {
|
||||
let template = r#"
|
||||
<!DOCTYPE html>
|
||||
|
@ -74,6 +78,20 @@ impl Article {
|
|||
)
|
||||
}
|
||||
|
||||
fn reinsert_title_heading(&mut self) {
|
||||
if let Some(article_node_ref) = &self.readability.article_node {
|
||||
if let Ok(article_root_ref) = article_node_ref.select_first("div#readability-page-1") {
|
||||
let article_root_elem = article_root_ref.as_node();
|
||||
let h1_elem = NodeRef::new_element(
|
||||
QualName::new(None, Namespace::from(HTML_NS), LocalName::from("h1")),
|
||||
BTreeMap::new(),
|
||||
);
|
||||
h1_elem.append(NodeRef::new_text(self.readability.metadata.title()));
|
||||
article_root_elem.prepend(h1_elem);
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
pub fn metadata(&self) -> &MetaData {
|
||||
&self.readability.metadata
|
||||
}
|
||||
|
|
130
src/http.rs
130
src/http.rs
|
@ -5,6 +5,7 @@ use futures::StreamExt;
|
|||
use indicatif::ProgressBar;
|
||||
use log::warn;
|
||||
use log::{debug, info};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use url::Url;
|
||||
|
||||
use crate::cli::AppConfig;
|
||||
|
@ -22,9 +23,54 @@ pub fn download(
|
|||
let urls_iter = app_config.urls.iter().map(|url| fetch_html(url));
|
||||
let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn);
|
||||
let mut articles = Vec::new();
|
||||
// Collect all urls that couldn't extract here
|
||||
// let mut retry_with_paperteer: Vec<String> = Vec::new();
|
||||
while let Some(fetch_result) = responses.next().await {
|
||||
match fetch_result {
|
||||
Ok((url, html)) => {
|
||||
match extract_and_download_imgs(
|
||||
&url,
|
||||
html,
|
||||
bar,
|
||||
partial_downloads,
|
||||
&mut articles,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(_) => bar.inc(1),
|
||||
|
||||
// All errors are pushed into here since they're readability issues.
|
||||
Err(e) => errors.push(e),
|
||||
}
|
||||
|
||||
// Outside the stream, make a new one to retry with paperteer
|
||||
}
|
||||
Err(e) => errors.push(e),
|
||||
}
|
||||
}
|
||||
// if !retry_with_paperteer.is_empty() {
|
||||
// fetch_html_from_paperteer(
|
||||
// retry_with_paperteer,
|
||||
// app_config,
|
||||
// bar,
|
||||
// partial_downloads,
|
||||
// errors,
|
||||
// &mut articles,
|
||||
// )
|
||||
// .await
|
||||
// .unwrap();
|
||||
// }
|
||||
articles
|
||||
})
|
||||
}
|
||||
|
||||
async fn extract_and_download_imgs<'a>(
|
||||
url: &str,
|
||||
html: String,
|
||||
bar: &ProgressBar,
|
||||
partial_downloads: &mut Vec<PartialDownload>,
|
||||
articles: &mut Vec<Article>,
|
||||
) -> Result<(), PaperoniError> {
|
||||
debug!("Extracting {}", &url);
|
||||
let mut extractor = Article::from_html(&html, &url);
|
||||
bar.set_message("Extracting...");
|
||||
|
@ -32,16 +78,14 @@ pub fn download(
|
|||
Ok(_) => {
|
||||
extractor.extract_img_urls();
|
||||
if let Err(img_errors) =
|
||||
download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar)
|
||||
.await
|
||||
download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar).await
|
||||
{
|
||||
partial_downloads
|
||||
.push(PartialDownload::new(&url, extractor.metadata().title()));
|
||||
partial_downloads.push(PartialDownload::new(&url, extractor.metadata().title()));
|
||||
warn!(
|
||||
"{} image{} failed to download for {}",
|
||||
img_errors.len(),
|
||||
if img_errors.len() > 1 { "s" } else { "" },
|
||||
url
|
||||
&url
|
||||
);
|
||||
for img_error in img_errors {
|
||||
warn!(
|
||||
|
@ -52,19 +96,87 @@ pub fn download(
|
|||
}
|
||||
}
|
||||
articles.push(extractor);
|
||||
Ok(())
|
||||
}
|
||||
Err(mut e) => {
|
||||
e.set_article_source(&url);
|
||||
errors.push(e);
|
||||
Err(e)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
struct PaperteerBody {
|
||||
urls: Vec<String>,
|
||||
}
|
||||
|
||||
impl PaperteerBody {
|
||||
fn new(urls: Vec<String>) -> Self {
|
||||
PaperteerBody { urls }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
struct PaperteerItem {
|
||||
url: String,
|
||||
response: String,
|
||||
html: String,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
struct PaperteerResponse {
|
||||
data: Vec<PaperteerItem>,
|
||||
}
|
||||
|
||||
// TODO: Change signature to simply take a vec of urls and return a vec of urls with either html or an error
|
||||
// This also means that extracting and downloading imgs should be handled externally
|
||||
async fn _fetch_html_from_paperteer(
|
||||
urls: Vec<String>,
|
||||
_app_config: &AppConfig,
|
||||
bar: &ProgressBar,
|
||||
partial_downloads: &mut Vec<PartialDownload>,
|
||||
errors: &mut Vec<PaperoniError>,
|
||||
articles: &mut Vec<Article>,
|
||||
) -> Result<(), ()> {
|
||||
// Get the paperteer url
|
||||
let render_endpoint = "/api/render";
|
||||
let paperteer_url = url::Url::parse("http://localhost:3000")
|
||||
.unwrap()
|
||||
.join(render_endpoint)
|
||||
.unwrap();
|
||||
|
||||
// Build request body with urls
|
||||
let urls_str = urls.into_iter().map(|url| url.to_string()).collect();
|
||||
let body = PaperteerBody::new(urls_str);
|
||||
|
||||
// Send to the paperteer url
|
||||
let mut res = surf::post(paperteer_url)
|
||||
.body(surf::Body::from_json(&body).unwrap())
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Receive the json response
|
||||
// TODO: Check for body response
|
||||
let PaperteerResponse { data } = res.body_json().await.unwrap();
|
||||
|
||||
// For each url, extract the article and images
|
||||
for item in data {
|
||||
let PaperteerItem {
|
||||
html,
|
||||
url,
|
||||
response,
|
||||
} = item;
|
||||
if response == "ok" {
|
||||
// Run the extract and download fn
|
||||
match extract_and_download_imgs(&url, html, bar, partial_downloads, articles).await {
|
||||
Ok(_) => bar.inc(1),
|
||||
Err(e) => errors.push(e),
|
||||
}
|
||||
bar.inc(1);
|
||||
} else {
|
||||
errors.push(crate::errors::ErrorKind::HTTPError("Paperteer failed".into()).into());
|
||||
}
|
||||
articles
|
||||
})
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn fetch_html(url: &str) -> Result<HTMLResource, PaperoniError> {
|
||||
|
|
|
@ -17,7 +17,7 @@ const FLAG_STRIP_UNLIKELYS: u32 = 0x1;
|
|||
const FLAG_WEIGHT_CLASSES: u32 = 0x2;
|
||||
const FLAG_CLEAN_CONDITIONALLY: u32 = 0x4;
|
||||
const READABILITY_SCORE: &'static str = "readability-score";
|
||||
const HTML_NS: &'static str = "http://www.w3.org/1999/xhtml";
|
||||
pub const HTML_NS: &'static str = "http://www.w3.org/1999/xhtml";
|
||||
// TODO: Change to HashSet
|
||||
const PHRASING_ELEMS: [&str; 39] = [
|
||||
"abbr", "audio", "b", "bdo", "br", "button", "cite", "code", "data", "datalist", "dfn", "em",
|
||||
|
|
Loading…
Reference in a new issue