Compare commits

..

3 commits
main ... dev

Author SHA1 Message Date
Kenneth Gitere
abaa7d37df dev: update packages and disable paperteer features 2022-02-01 20:16:29 +03:00
Kenneth Gitere
e777426c1b feat: add reinsertion of title as <h1> requested in #22 2021-12-30 07:58:19 +03:00
Kenneth Gitere
3bf0719c8e feat: add fetch_html_from_puppeteer fn 2021-10-18 10:03:09 +03:00
13 changed files with 888 additions and 864 deletions

View file

@ -1,20 +0,0 @@
steps:
build:
when:
- event: cron
- event: push
branch: main
- event: pull_request
image: docker.io/rust:latest
commands:
- apt-get install libssl-dev
- cargo build
test:
when:
- event: cron
- event: push
branch: main
- event: pull_request
image: docker.io/rust:latest
commands:
- cargo test

1425
Cargo.lock generated

File diff suppressed because it is too large Load diff

View file

@ -13,27 +13,26 @@ readme = "README.md"
[dependencies] [dependencies]
# atty = "0.2.14" # atty = "0.2.14"
async-std = "1.12.0" async-std = "1.10.0"
base64 = "0.22.0" base64 = "0.13.0"
chrono = "0.4.38" chrono = "0.4.19"
clap = { version = "2.34.0", features = ["yaml"] } clap = { version = "2.34.0", features = ["yaml"] }
colored = "2.1.0" colored = "2.0.0"
comfy-table = "7.1.1" comfy-table = "3.0.0"
derive_builder = "0.20.0" derive_builder = "0.10.2"
directories = "5.0.1" directories = "3.0.2"
epub-builder = "0.7.4" epub-builder = "0.4.10"
eyre = "0.6.12" flexi_logger = "0.22.2"
flexi_logger = "0.29.0" futures = "0.3.19"
futures = "0.3.30"
html5ever = "0.25.1" html5ever = "0.25.1"
indicatif = "0.17.8" indicatif = "0.16.2"
itertools = "0.13.0" itertools = "0.10.3"
kuchiki = "0.8.1" kuchiki = "0.8.1"
lazy_static = "1.4.0" lazy_static = "1.4.0"
log = "0.4.21" log = "0.4.14"
md5 = "0.7.0" md5 = "0.7.0"
openssl-sys = "0.9.102" regex = "1.5.4"
regex = "1.11.0" serde = "1.0.136"
surf = "2.3.2" surf = "2.3.2"
thiserror = "1.0.59" thiserror = "1.0.30"
url = "2.5.0" url = "2.2.2"

View file

@ -1,10 +0,0 @@
{
"$schema": "https://docs.renovatebot.com/renovate-schema.json",
"extends": ["config:recommended"],
"packageRules": [
{
"matchManagers": ["cargo"],
"rangeStrategy": "replace"
}
]
}

1
rust-toolchain Normal file
View file

@ -0,0 +1 @@
1.57.0

View file

@ -30,11 +30,7 @@ pub fn generate_epubs(
let enabled_bar = ProgressBar::new(articles.len() as u64); let enabled_bar = ProgressBar::new(articles.len() as u64);
let style = ProgressStyle::default_bar().template( let style = ProgressStyle::default_bar().template(
"{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} epub {pos}/{len:7} {msg:.green}", "{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} epub {pos}/{len:7} {msg:.green}",
).map_err(|e| { );
let mut paperoni_err: PaperoniError = e.into();
paperoni_err.set_article_source("progress bar");
vec![paperoni_err]
})?;
enabled_bar.set_style(style); enabled_bar.set_style(style);
if !articles.is_empty() { if !articles.is_empty() {
enabled_bar.set_message("Generating epubs"); enabled_bar.set_message("Generating epubs");
@ -253,7 +249,10 @@ fn replace_escaped_characters(value: &str) -> String {
.replace(">", "&gt;") .replace(">", "&gt;")
} }
fn add_stylesheets(epub: &mut EpubBuilder<ZipLibrary>, app_config: &AppConfig) -> eyre::Result<()> { fn add_stylesheets<T: epub_builder::Zip>(
epub: &mut EpubBuilder<T>,
app_config: &AppConfig,
) -> Result<(), epub_builder::Error> {
let body_stylesheet: &[u8] = include_bytes!("./assets/body.min.css"); let body_stylesheet: &[u8] = include_bytes!("./assets/body.min.css");
let header_stylesheet: &[u8] = include_bytes!("./assets/headers.min.css"); let header_stylesheet: &[u8] = include_bytes!("./assets/headers.min.css");
match app_config.css_config { match app_config.css_config {

View file

@ -15,8 +15,6 @@ pub enum ErrorKind {
UTF8Error(String), UTF8Error(String),
#[error("[ReadabilityError]: {0}")] #[error("[ReadabilityError]: {0}")]
ReadabilityError(String), ReadabilityError(String),
#[error("[TemplateError]: {0}")]
TemplateError(String),
} }
#[derive(Error, Debug)] #[derive(Error, Debug)]
@ -100,9 +98,9 @@ impl From<ErrorKind> for PaperoniError {
} }
} }
impl From<eyre::Error> for PaperoniError { impl From<epub_builder::Error> for PaperoniError {
fn from(err: eyre::Error) -> Self { fn from(err: epub_builder::Error) -> Self {
PaperoniError::with_kind(ErrorKind::EpubError(err.to_string())) PaperoniError::with_kind(ErrorKind::EpubError(err.description().to_owned()))
} }
} }
@ -130,12 +128,6 @@ impl From<std::str::Utf8Error> for PaperoniError {
} }
} }
impl From<indicatif::style::TemplateError> for PaperoniError {
fn from(err: indicatif::style::TemplateError) -> Self {
PaperoniError::with_kind(ErrorKind::TemplateError(err.to_string()))
}
}
#[derive(Debug, Error)] #[derive(Debug, Error)]
pub enum LogError { pub enum LogError {
#[error(transparent)] #[error(transparent)]

View file

@ -1,8 +1,11 @@
use std::collections::BTreeMap;
use html5ever::{LocalName, Namespace, QualName};
use itertools::Itertools; use itertools::Itertools;
use kuchiki::{traits::*, NodeRef}; use kuchiki::{traits::*, NodeRef};
use crate::errors::PaperoniError; use crate::errors::PaperoniError;
use crate::moz_readability::{MetaData, Readability}; use crate::moz_readability::{MetaData, Readability, HTML_NS};
/// A tuple of the url and an Option of the resource's MIME type /// A tuple of the url and an Option of the resource's MIME type
pub type ResourceInfo = (String, Option<String>); pub type ResourceInfo = (String, Option<String>);
@ -29,6 +32,7 @@ impl Article {
/// the source of the content /// the source of the content
pub fn extract_content(&mut self) -> Result<(), PaperoniError> { pub fn extract_content(&mut self) -> Result<(), PaperoniError> {
self.readability.parse(&self.url)?; self.readability.parse(&self.url)?;
self.reinsert_title_heading();
if let Some(article_node_ref) = &self.readability.article_node { if let Some(article_node_ref) = &self.readability.article_node {
let template = r#" let template = r#"
<!DOCTYPE html> <!DOCTYPE html>
@ -74,6 +78,20 @@ impl Article {
) )
} }
/// Puts the article title back at the top of the extracted content as an `<h1>`.
///
/// Looks up `div#readability-page-1` inside the extracted article node and
/// prepends a new `<h1>` whose text is the title stored in the readability
/// metadata. Does nothing when there is no article node or when the selector
/// does not match.
fn reinsert_title_heading(&mut self) {
    // Nothing to do until an article node has been produced.
    let article = match &self.readability.article_node {
        Some(node) => node,
        None => return,
    };
    // The heading is only inserted into the readability page container.
    let page_root = match article.select_first("div#readability-page-1") {
        Ok(found) => found,
        Err(_) => return,
    };

    let h1_name = QualName::new(None, Namespace::from(HTML_NS), LocalName::from("h1"));
    let heading = NodeRef::new_element(h1_name, BTreeMap::new());
    heading.append(NodeRef::new_text(self.readability.metadata.title()));

    // Prepend so the title becomes the first child of the container.
    page_root.as_node().prepend(heading);
}
pub fn metadata(&self) -> &MetaData { pub fn metadata(&self) -> &MetaData {
&self.readability.metadata &self.readability.metadata
} }

View file

@ -4,8 +4,7 @@ use std::{
path::Path, path::Path,
}; };
use base64::prelude::*; use base64::encode;
use comfy_table::{Attribute, Cell, CellAlignment, Color, ContentArrangement, Table}; use comfy_table::{Attribute, Cell, CellAlignment, Color, ContentArrangement, Table};
use html5ever::{LocalName, Namespace, QualName}; use html5ever::{LocalName, Namespace, QualName};
use indicatif::{ProgressBar, ProgressStyle}; use indicatif::{ProgressBar, ProgressStyle};
@ -44,11 +43,7 @@ pub fn generate_html_exports(
let enabled_bar = ProgressBar::new(articles.len() as u64); let enabled_bar = ProgressBar::new(articles.len() as u64);
let style = ProgressStyle::default_bar().template( let style = ProgressStyle::default_bar().template(
"{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} html {pos}/{len:7} {msg:.green}", "{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} html {pos}/{len:7} {msg:.green}",
).map_err(|e| { );
let mut paperoni_err: PaperoniError = e.into();
paperoni_err.set_article_source("progress bar");
vec![paperoni_err]
})?;
enabled_bar.set_style(style); enabled_bar.set_style(style);
if !articles.is_empty() { if !articles.is_empty() {
enabled_bar.set_message("Generating html files"); enabled_bar.set_message("Generating html files");
@ -272,7 +267,7 @@ fn update_imgs_base64(article: &Article) -> Result<(), std::io::Error> {
let img_base64_str = format!( let img_base64_str = format!(
"data:image:{};base64,{}", "data:image:{};base64,{}",
mime_type.as_deref().unwrap_or("image/*"), mime_type.as_deref().unwrap_or("image/*"),
BASE64_STANDARD.encode(img_bytes) encode(img_bytes)
); );
let img_elems = article let img_elems = article

View file

@ -5,6 +5,7 @@ use futures::StreamExt;
use indicatif::ProgressBar; use indicatif::ProgressBar;
use log::warn; use log::warn;
use log::{debug, info}; use log::{debug, info};
use serde::{Deserialize, Serialize};
use url::Url; use url::Url;
use crate::cli::AppConfig; use crate::cli::AppConfig;
@ -22,9 +23,54 @@ pub fn download(
let urls_iter = app_config.urls.iter().map(|url| fetch_html(url)); let urls_iter = app_config.urls.iter().map(|url| fetch_html(url));
let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn); let mut responses = stream::from_iter(urls_iter).buffered(app_config.max_conn);
let mut articles = Vec::new(); let mut articles = Vec::new();
// Collect all urls that couldn't extract here
// let mut retry_with_paperteer: Vec<String> = Vec::new();
while let Some(fetch_result) = responses.next().await { while let Some(fetch_result) = responses.next().await {
match fetch_result { match fetch_result {
Ok((url, html)) => { Ok((url, html)) => {
match extract_and_download_imgs(
&url,
html,
bar,
partial_downloads,
&mut articles,
)
.await
{
Ok(_) => bar.inc(1),
// All errors are pushed into here since they're readability issues.
Err(e) => errors.push(e),
}
// Outside the stream, make a new one to retry with paperteer
}
Err(e) => errors.push(e),
}
}
// if !retry_with_paperteer.is_empty() {
// fetch_html_from_paperteer(
// retry_with_paperteer,
// app_config,
// bar,
// partial_downloads,
// errors,
// &mut articles,
// )
// .await
// .unwrap();
// }
articles
})
}
async fn extract_and_download_imgs<'a>(
url: &str,
html: String,
bar: &ProgressBar,
partial_downloads: &mut Vec<PartialDownload>,
articles: &mut Vec<Article>,
) -> Result<(), PaperoniError> {
debug!("Extracting {}", &url); debug!("Extracting {}", &url);
let mut extractor = Article::from_html(&html, &url); let mut extractor = Article::from_html(&html, &url);
bar.set_message("Extracting..."); bar.set_message("Extracting...");
@ -32,16 +78,14 @@ pub fn download(
Ok(_) => { Ok(_) => {
extractor.extract_img_urls(); extractor.extract_img_urls();
if let Err(img_errors) = if let Err(img_errors) =
download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar) download_images(&mut extractor, &Url::parse(&url).unwrap(), &bar).await
.await
{ {
partial_downloads partial_downloads.push(PartialDownload::new(&url, extractor.metadata().title()));
.push(PartialDownload::new(&url, extractor.metadata().title()));
warn!( warn!(
"{} image{} failed to download for {}", "{} image{} failed to download for {}",
img_errors.len(), img_errors.len(),
if img_errors.len() > 1 { "s" } else { "" }, if img_errors.len() > 1 { "s" } else { "" },
url &url
); );
for img_error in img_errors { for img_error in img_errors {
warn!( warn!(
@ -52,19 +96,87 @@ pub fn download(
} }
} }
articles.push(extractor); articles.push(extractor);
Ok(())
} }
Err(mut e) => { Err(mut e) => {
e.set_article_source(&url); e.set_article_source(&url);
errors.push(e); Err(e)
} }
} }
} }
/// JSON payload listing the URLs that paperteer is asked to render.
#[derive(Serialize, Deserialize)]
struct PaperteerBody {
    /// URLs to be rendered.
    urls: Vec<String>,
}

impl PaperteerBody {
    /// Wraps the given list of URLs in a request body.
    fn new(urls: Vec<String>) -> Self {
        Self { urls }
    }
}
/// A single entry of the paperteer reply: the outcome for one URL.
#[derive(Serialize, Deserialize)]
struct PaperteerItem {
    /// URL this entry refers to; used as the article source when extracting.
    url: String,
    /// Status string reported for this URL; the caller treats `"ok"` as success.
    response: String,
    /// HTML returned for the URL, handed to article extraction on success.
    html: String,
}
/// Top-level shape of the JSON reply decoded from the paperteer render request.
#[derive(Serialize, Deserialize)]
struct PaperteerResponse {
    /// Per-URL results; presumably one entry per requested URL — TODO confirm
    /// against the paperteer service.
    data: Vec<PaperteerItem>,
}
// TODO: Change signature to simply take a vec of urls and return a vec of urls with either html or an error
// This also means that extracting and downloading imgs should be handled externally
async fn _fetch_html_from_paperteer(
urls: Vec<String>,
_app_config: &AppConfig,
bar: &ProgressBar,
partial_downloads: &mut Vec<PartialDownload>,
errors: &mut Vec<PaperoniError>,
articles: &mut Vec<Article>,
) -> Result<(), ()> {
// Get the paperteer url
let render_endpoint = "/api/render";
let paperteer_url = url::Url::parse("http://localhost:3000")
.unwrap()
.join(render_endpoint)
.unwrap();
// Build request body with urls
let urls_str = urls.into_iter().map(|url| url.to_string()).collect();
let body = PaperteerBody::new(urls_str);
// Send to the paperteer url
let mut res = surf::post(paperteer_url)
.body(surf::Body::from_json(&body).unwrap())
.await
.unwrap();
// Receive the json response
// TODO: Check for body response
let PaperteerResponse { data } = res.body_json().await.unwrap();
// For each url, extract the article and images
for item in data {
let PaperteerItem {
html,
url,
response,
} = item;
if response == "ok" {
// Run the extract and download fn
match extract_and_download_imgs(&url, html, bar, partial_downloads, articles).await {
Ok(_) => bar.inc(1),
Err(e) => errors.push(e), Err(e) => errors.push(e),
} }
bar.inc(1); } else {
errors.push(crate::errors::ErrorKind::HTTPError("Paperteer failed".into()).into());
} }
articles }
}) Ok(())
} }
pub async fn fetch_html(url: &str) -> Result<HTMLResource, PaperoniError> { pub async fn fetch_html(url: &str) -> Result<HTMLResource, PaperoniError> {

View file

@ -2,7 +2,7 @@ use std::fs;
use chrono::{DateTime, Local}; use chrono::{DateTime, Local};
use colored::*; use colored::*;
use comfy_table::presets::UTF8_HORIZONTAL_ONLY; use comfy_table::presets::UTF8_HORIZONTAL_BORDERS_ONLY;
use comfy_table::{Cell, CellAlignment, ContentArrangement, Table}; use comfy_table::{Cell, CellAlignment, ContentArrangement, Table};
use flexi_logger::{FileSpec, LevelFilter}; use flexi_logger::{FileSpec, LevelFilter};
use log::error; use log::error;
@ -38,7 +38,7 @@ pub fn display_summary(
println!("\n{}", "Partially failed downloads".yellow().bold()); println!("\n{}", "Partially failed downloads".yellow().bold());
let mut table_partial = Table::new(); let mut table_partial = Table::new();
table_partial table_partial
.load_preset(UTF8_HORIZONTAL_ONLY) .load_preset(UTF8_HORIZONTAL_BORDERS_ONLY)
.set_header(vec![ .set_header(vec![
Cell::new("Link").set_alignment(CellAlignment::Center), Cell::new("Link").set_alignment(CellAlignment::Center),
Cell::new("Title").set_alignment(CellAlignment::Center), Cell::new("Title").set_alignment(CellAlignment::Center),
@ -55,7 +55,7 @@ pub fn display_summary(
println!("\n{}", "Failed article downloads".bright_red().bold()); println!("\n{}", "Failed article downloads".bright_red().bold());
let mut table_failed = Table::new(); let mut table_failed = Table::new();
table_failed table_failed
.load_preset(UTF8_HORIZONTAL_ONLY) .load_preset(UTF8_HORIZONTAL_BORDERS_ONLY)
.set_header(vec![ .set_header(vec![
Cell::new("Link").set_alignment(CellAlignment::Center), Cell::new("Link").set_alignment(CellAlignment::Center),
Cell::new("Reason").set_alignment(CellAlignment::Center), Cell::new("Reason").set_alignment(CellAlignment::Center),

View file

@ -2,10 +2,9 @@
extern crate lazy_static; extern crate lazy_static;
use std::process::exit; use std::process::exit;
use std::time::Duration;
use colored::Colorize; use colored::Colorize;
use comfy_table::presets::{UTF8_FULL, UTF8_HORIZONTAL_ONLY}; use comfy_table::presets::{UTF8_FULL, UTF8_HORIZONTAL_BORDERS_ONLY};
use comfy_table::{ContentArrangement, Table}; use comfy_table::{ContentArrangement, Table};
use http::download; use http::download;
use indicatif::{ProgressBar, ProgressStyle}; use indicatif::{ProgressBar, ProgressStyle};
@ -26,8 +25,6 @@ use epub::generate_epubs;
use html::generate_html_exports; use html::generate_html_exports;
use logs::display_summary; use logs::display_summary;
use crate::errors::PaperoniError;
fn main() { fn main() {
let app_config = match cli::AppConfig::init_with_cli() { let app_config = match cli::AppConfig::init_with_cli() {
Ok(app_config) => app_config, Ok(app_config) => app_config,
@ -61,15 +58,9 @@ fn run(app_config: AppConfig) {
let enabled_bar = ProgressBar::new(app_config.urls.len() as u64); let enabled_bar = ProgressBar::new(app_config.urls.len() as u64);
let style = ProgressStyle::default_bar().template( let style = ProgressStyle::default_bar().template(
"{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} link {pos}/{len:7} {msg:.yellow/white}", "{spinner:.cyan} [{elapsed_precise}] {bar:40.white} {:>8} link {pos}/{len:7} {msg:.yellow/white}",
).map_err(|e| { );
let mut paperoni_err: PaperoniError = e.into();
paperoni_err.set_article_source("progress bar");
vec![paperoni_err]
});
if let Ok(style) = style {
enabled_bar.set_style(style); enabled_bar.set_style(style);
} enabled_bar.enable_steady_tick(500);
enabled_bar.enable_steady_tick(Duration::from_millis(500));
enabled_bar enabled_bar
}; };
@ -79,7 +70,7 @@ fn run(app_config: AppConfig) {
let mut successful_articles_table = Table::new(); let mut successful_articles_table = Table::new();
successful_articles_table successful_articles_table
.load_preset(UTF8_FULL) .load_preset(UTF8_FULL)
.load_preset(UTF8_HORIZONTAL_ONLY) .load_preset(UTF8_HORIZONTAL_BORDERS_ONLY)
.set_content_arrangement(ContentArrangement::Dynamic); .set_content_arrangement(ContentArrangement::Dynamic);
match app_config.export_type { match app_config.export_type {

View file

@ -17,7 +17,7 @@ const FLAG_STRIP_UNLIKELYS: u32 = 0x1;
const FLAG_WEIGHT_CLASSES: u32 = 0x2; const FLAG_WEIGHT_CLASSES: u32 = 0x2;
const FLAG_CLEAN_CONDITIONALLY: u32 = 0x4; const FLAG_CLEAN_CONDITIONALLY: u32 = 0x4;
const READABILITY_SCORE: &'static str = "readability-score"; const READABILITY_SCORE: &'static str = "readability-score";
const HTML_NS: &'static str = "http://www.w3.org/1999/xhtml"; pub const HTML_NS: &'static str = "http://www.w3.org/1999/xhtml";
// TODO: Change to HashSet // TODO: Change to HashSet
const PHRASING_ELEMS: [&str; 39] = [ const PHRASING_ELEMS: [&str; 39] = [
"abbr", "audio", "b", "bdo", "br", "button", "cite", "code", "data", "datalist", "dfn", "em", "abbr", "audio", "b", "bdo", "br", "button", "cite", "code", "data", "datalist", "dfn", "em",