Add http and epub modules
parent 08f847531f
commit b402472ba6

5 changed files with 293 additions and 249 deletions
57  src/cli.rs
@@ -1,7 +1,9 @@
+use std::{fs::File, io::Read};
+
 use clap::{App, AppSettings, Arg};
 
-pub fn cli_init() -> App<'static, 'static> {
-    App::new("paperoni")
+pub fn cli_init() -> AppConfig {
+    let app = App::new("paperoni")
         .settings(&[
             AppSettings::ArgRequiredElseHelp,
             AppSettings::UnifiedHelpMessage,
@@ -24,5 +26,54 @@ It takes a url and downloads the article content from it and saves it to an epub
                 .long("file")
                 .help("Input file containing links")
                 .takes_value(true),
-        )
+        );
+    let arg_matches = app.get_matches();
+    let mut urls: Vec<String> = match arg_matches.value_of("file") {
+        Some(file_name) => {
+            if let Ok(mut file) = File::open(file_name) {
+                let mut content = String::new();
+                match file.read_to_string(&mut content) {
+                    Ok(_) => content
+                        .lines()
+                        .filter(|line| !line.is_empty())
+                        .map(|line| line.to_owned())
+                        .collect(),
+                    Err(_) => vec![],
+                }
+            } else {
+                println!("Unable to open file: {}", file_name);
+                vec![]
+            }
+        }
+        None => vec![],
+    };
+
+    if let Some(vals) = arg_matches.values_of("urls") {
+        urls.extend(
+            vals.filter(|val| !val.is_empty())
+                .map(|val| val.to_string()),
+        );
+    }
+
+    let mut app_config = AppConfig::new();
+    app_config.set_urls(urls);
+    app_config
+}
+
+pub struct AppConfig {
+    urls: Vec<String>,
+}
+
+impl AppConfig {
+    fn new() -> Self {
+        Self { urls: vec![] }
+    }
+
+    fn set_urls(&mut self, urls: Vec<String>) {
+        self.urls.extend(urls);
+    }
+
+    pub fn urls(&self) -> &Vec<String> {
+        &self.urls
+    }
 }
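The notable change here is that cli_init() now consumes the clap matches itself and hands back a plain AppConfig, so callers never touch clap types. A minimal sketch of the new call site (hypothetical, but it mirrors what main.rs does further down):

    // Hypothetical consumer of the new cli API.
    let app_config = cli::cli_init(); // parses argv, merges --file links with positional urls
    for url in app_config.urls() {
        println!("queued: {}", url);
    }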
40  src/epub.rs (new file)
@@ -0,0 +1,40 @@
+use std::fs::File;
+
+use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
+
+use crate::extractor::{self, Extractor};
+
+pub fn generate_epub(extractor: Extractor) {
+    let file_name = format!(
+        "{}.epub",
+        extractor
+            .metadata()
+            .title()
+            .replace("/", " ")
+            .replace("\\", " ")
+    );
+    let mut out_file = File::create(&file_name).unwrap();
+    let mut html_buf = Vec::new();
+    extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf)
+        .expect("Unable to serialize to xhtml");
+    let html_buf = std::str::from_utf8(&html_buf).unwrap();
+    let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
+    if let Some(author) = extractor.metadata().byline() {
+        epub.metadata("author", author.replace("&", "&amp;"))
+            .unwrap();
+    }
+    epub.metadata("title", extractor.metadata().title().replace("&", "&amp;"))
+        .unwrap();
+    epub.add_content(EpubContent::new("index.xhtml", html_buf.as_bytes()))
+        .unwrap();
+    for img in extractor.img_urls {
+        let mut file_path = std::env::temp_dir();
+        file_path.push(&img.0);
+
+        let img_buf = File::open(&file_path).expect("Can't read file");
+        epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap())
+            .unwrap();
+    }
+    epub.generate(&mut out_file).unwrap();
+    println!("Created {:?}", file_name);
+}
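generate_epub takes the Extractor by value because it is the final consumer: it serializes the article node to XHTML, registers each downloaded image from the temp dir as a resource, and writes "<title>.epub" to the working directory. A hedged usage sketch (assumes download_images from http.rs has already run, since the img_urls tuples must point at files in the temp dir):

    // Sketch: extraction and image download must happen before this call.
    download_images(&mut extractor, &article_url).await?;
    generate_epub(extractor); // prints `Created "<title>.epub"` on success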
131  src/extractor.rs
@@ -1,10 +1,6 @@
 use std::collections::HashMap;
 
-use async_std::fs::File;
-use async_std::io::prelude::*;
-use async_std::task;
 use kuchiki::{traits::*, NodeRef};
-use url::Url;
 
 use crate::moz_readability::{MetaData, Readability};
 
@@ -51,8 +47,8 @@ impl Extractor {
     }
 
     /// Traverses the DOM tree of the content and retrieves the IMG URLs
-    fn extract_img_urls(&mut self) {
-        if let Some(content_ref) = &self.readability.article_node {
+    pub fn extract_img_urls(&mut self) {
+        if let Some(content_ref) = &self.article {
             for img_ref in content_ref.select("img").unwrap() {
                 img_ref.as_node().as_element().map(|img_elem| {
                     img_elem.attributes.borrow().get("src").map(|img_url| {
@@ -65,80 +61,6 @@ impl Extractor {
         }
     }
 
-    pub async fn download_images(&mut self, article_origin: &Url) -> async_std::io::Result<()> {
-        let mut async_download_tasks = Vec::with_capacity(self.img_urls.len());
-        self.extract_img_urls();
-        if self.img_urls.len() > 0 {
-            println!("Downloading images...");
-        }
-        for img_url in &self.img_urls {
-            let img_url = img_url.0.clone();
-            let abs_url = get_absolute_url(&img_url, article_origin);
-
-            async_download_tasks.push(task::spawn(async move {
-                let mut img_response = surf::Client::new()
-                    // The middleware has been temporarily commented out because it happens
-                    // to affect downloading images when there is no redirecting
-                    // .with(surf::middleware::Redirect::default())
-                    .get(&abs_url)
-                    .await
-                    .expect("Unable to retrieve file");
-                let img_content: Vec<u8> = img_response.body_bytes().await.unwrap();
-                let img_mime = img_response
-                    .content_type()
-                    .map(|mime| mime.essence().to_string());
-                let img_ext = img_response
-                    .content_type()
-                    .map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
-                    .unwrap();
-                let mut img_path = std::env::temp_dir();
-                img_path.push(format!("{}.{}", hash_url(&abs_url), &img_ext));
-                let mut img_file = File::create(&img_path)
-                    .await
-                    .expect("Unable to create file");
-                img_file
-                    .write_all(&img_content)
-                    .await
-                    .expect("Unable to save to file");
-
-                (
-                    img_url,
-                    img_path
-                        .file_name()
-                        .map(|os_str_name| {
-                            os_str_name
-                                .to_str()
-                                .expect("Unable to get image file name")
-                                .to_string()
-                        })
-                        .unwrap(),
-                    img_mime,
-                )
-            }));
-        }
-
-        self.img_urls.clear();
-
-        for async_task in async_download_tasks {
-            let (img_url, img_path, img_mime) = async_task.await;
-            // Update the image sources
-            let img_ref = self
-                .readability
-                .article_node
-                .as_mut()
-                .expect("Unable to get mutable ref")
-                .select_first(&format!("img[src='{}']", img_url))
-                .expect("Image node does not exist");
-            let mut img_node = img_ref.attributes.borrow_mut();
-            *img_node.get_mut("src").unwrap() = img_path.clone();
-            // srcset is removed because readers such as Foliate then fail to display
-            // the image already downloaded and stored in src
-            img_node.remove("srcset");
-            self.img_urls.push((img_path, img_mime));
-        }
-        Ok(())
-    }
-
     pub fn article(&self) -> Option<&NodeRef> {
         self.article.as_ref()
     }
@@ -148,40 +70,6 @@ impl Extractor {
     }
 }
-
-/// Utility for hashing URLs. This is used to help store files locally with unique values
-fn hash_url(url: &str) -> String {
-    format!("{:x}", md5::compute(url.as_bytes()))
-}
-
-/// Handles getting the extension from a given MIME subtype.
-fn map_mime_subtype_to_ext(subtype: &str) -> &str {
-    if subtype == ("svg+xml") {
-        return "svg";
-    } else if subtype == "x-icon" {
-        "ico"
-    } else {
-        subtype
-    }
-}
-
-fn get_absolute_url(url: &str, request_url: &Url) -> String {
-    if Url::parse(url).is_ok() {
-        url.to_owned()
-    } else if url.starts_with("/") {
-        Url::parse(&format!(
-            "{}://{}",
-            request_url.scheme(),
-            request_url.host_str().unwrap()
-        ))
-        .unwrap()
-        .join(url)
-        .unwrap()
-        .into_string()
-    } else {
-        request_url.join(url).unwrap().into_string()
-    }
-}
 
 /// Serializes a NodeRef to a string that is XHTML compatible
 /// The only DOM nodes serialized are Text and Element nodes
 pub fn serialize_to_xhtml<W: std::io::Write>(
@@ -278,19 +166,4 @@ mod test {
             extractor.img_urls
         );
     }
-
-    #[test]
-    fn test_map_mime_type_to_ext() {
-        let mime_subtypes = vec![
-            "apng", "bmp", "gif", "x-icon", "jpeg", "png", "svg+xml", "tiff", "webp",
-        ];
-        let exts = mime_subtypes
-            .into_iter()
-            .map(|mime_type| map_mime_subtype_to_ext(mime_type))
-            .collect::<Vec<_>>();
-        assert_eq!(
-            vec!["apng", "bmp", "gif", "ico", "jpeg", "png", "svg", "tiff", "webp"],
-            exts
-        );
-    }
 }
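With the networking gone, the extractor is a pure HTML-to-article transform; extract_img_urls had to become pub so that http.rs can call it. serialize_to_xhtml still writes into any std::io::Write, so serializing into an in-memory buffer looks like this (lifted from epub.rs above):

    let mut html_buf = Vec::new();
    extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf)
        .expect("Unable to serialize to xhtml");
    let xhtml = std::str::from_utf8(&html_buf).unwrap(); // UTF-8 XHTML string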
183  src/http.rs (new file)
@@ -0,0 +1,183 @@
+use async_std::fs::File;
+use async_std::io::prelude::*;
+use async_std::task;
+use url::Url;
+
+use crate::extractor::Extractor;
+
+type HTMLResource = (String, String);
+
+pub async fn fetch_url(
+    url: &str,
+) -> Result<HTMLResource, Box<dyn std::error::Error + Send + Sync>> {
+    let client = surf::Client::new();
+    println!("Fetching...");
+
+    let mut redirect_count: u8 = 0;
+    let base_url = Url::parse(&url)?;
+    let mut url = base_url.clone();
+    while redirect_count < 5 {
+        redirect_count += 1;
+        let req = surf::get(&url);
+        let mut res = client.send(req).await?;
+        if res.status().is_redirection() {
+            if let Some(location) = res.header(surf::http::headers::LOCATION) {
+                match Url::parse(location.last().as_str()) {
+                    Ok(valid_url) => url = valid_url,
+                    Err(e) => match e {
+                        url::ParseError::RelativeUrlWithoutBase => {
+                            url = base_url.join(location.last().as_str())?
+                        }
+                        e => return Err(e.into()),
+                    },
+                };
+            }
+        } else if res.status().is_success() {
+            if let Some(mime) = res.content_type() {
+                if mime.essence() == "text/html" {
+                    return Ok((url.to_string(), res.body_string().await?));
+                } else {
+                    return Err(format!(
+                        "Invalid HTTP response. Received {} instead of text/html",
+                        mime.essence()
+                    )
+                    .into());
+                }
+            } else {
+                return Err("Unknown HTTP response".into());
+            }
+        } else {
+            return Err(format!("Request failed: HTTP {}", res.status()).into());
+        }
+    }
+    Err("Unable to fetch HTML".into())
+}
+
+pub async fn download_images(
+    extractor: &mut Extractor,
+    article_origin: &Url,
+) -> async_std::io::Result<()> {
+    extractor.extract_img_urls();
+    let mut async_download_tasks = Vec::with_capacity(extractor.img_urls.len());
+    if extractor.img_urls.len() > 0 {
+        println!("Downloading images...");
+    }
+    for img_url in &extractor.img_urls {
+        let img_url = img_url.0.clone();
+        let abs_url = get_absolute_url(&img_url, article_origin);
+
+        async_download_tasks.push(task::spawn(async move {
+            let mut img_response = surf::Client::new()
+                // The middleware has been temporarily commented out because it happens
+                // to affect downloading images when there is no redirecting
+                // .with(surf::middleware::Redirect::default())
+                .get(&abs_url)
+                .await
+                .expect("Unable to retrieve file");
+            let img_content: Vec<u8> = img_response.body_bytes().await.unwrap();
+            let img_mime = img_response
+                .content_type()
+                .map(|mime| mime.essence().to_string());
+            let img_ext = img_response
+                .content_type()
+                .map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
+                .unwrap();
+            let mut img_path = std::env::temp_dir();
+            img_path.push(format!("{}.{}", hash_url(&abs_url), &img_ext));
+            let mut img_file = File::create(&img_path)
+                .await
+                .expect("Unable to create file");
+            img_file
+                .write_all(&img_content)
+                .await
+                .expect("Unable to save to file");
+
+            (
+                img_url,
+                img_path
+                    .file_name()
+                    .map(|os_str_name| {
+                        os_str_name
+                            .to_str()
+                            .expect("Unable to get image file name")
+                            .to_string()
+                    })
+                    .unwrap(),
+                img_mime,
+            )
+        }));
+    }
+
+    extractor.img_urls.clear();
+
+    for async_task in async_download_tasks {
+        let (img_url, img_path, img_mime) = async_task.await;
+        // Update the image sources
+        let img_ref = extractor
+            .article()
+            .as_mut()
+            .expect("Unable to get mutable ref")
+            .select_first(&format!("img[src='{}']", img_url))
+            .expect("Image node does not exist");
+        let mut img_node = img_ref.attributes.borrow_mut();
+        *img_node.get_mut("src").unwrap() = img_path.clone();
+        // srcset is removed because readers such as Foliate then fail to display
+        // the image already downloaded and stored in src
+        img_node.remove("srcset");
+        extractor.img_urls.push((img_path, img_mime));
+    }
+    Ok(())
+}
+
+/// Handles getting the extension from a given MIME subtype.
+fn map_mime_subtype_to_ext(subtype: &str) -> &str {
+    if subtype == ("svg+xml") {
+        return "svg";
+    } else if subtype == "x-icon" {
+        "ico"
+    } else {
+        subtype
+    }
+}
+
+/// Utility for hashing URLs. This is used to help store files locally with unique values
+fn hash_url(url: &str) -> String {
+    format!("{:x}", md5::compute(url.as_bytes()))
+}
+
+fn get_absolute_url(url: &str, request_url: &Url) -> String {
+    if Url::parse(url).is_ok() {
+        url.to_owned()
+    } else if url.starts_with("/") {
+        Url::parse(&format!(
+            "{}://{}",
+            request_url.scheme(),
+            request_url.host_str().unwrap()
+        ))
+        .unwrap()
+        .join(url)
+        .unwrap()
+        .into_string()
+    } else {
+        request_url.join(url).unwrap().into_string()
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    #[test]
+    fn test_map_mime_type_to_ext() {
+        let mime_subtypes = vec![
+            "apng", "bmp", "gif", "x-icon", "jpeg", "png", "svg+xml", "tiff", "webp",
+        ];
+        let exts = mime_subtypes
+            .into_iter()
+            .map(|mime_type| map_mime_subtype_to_ext(mime_type))
+            .collect::<Vec<_>>();
+        assert_eq!(
+            vec!["apng", "bmp", "gif", "ico", "jpeg", "png", "svg", "tiff", "webp"],
+            exts
+        );
+    }
+}
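fetch_url follows redirects by hand (capped at five), resolving relative Location headers against the original URL, and only accepts a text/html body. Since it is async, a sketch of driving it from a synchronous context with async-std (the URL is illustrative):

    use async_std::task;

    task::block_on(async {
        match fetch_url("https://example.com/article").await {
            // HTMLResource is (final url after redirects, html body)
            Ok((url, html)) => println!("fetched {} ({} bytes)", url, html.len()),
            Err(e) => println!("{}", e),
        }
    });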
131  src/main.rs
@@ -1,96 +1,27 @@
 #[macro_use]
 extern crate lazy_static;
 
-use std::{fs::File, io::Read};
-
 use async_std::task;
-use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
 use url::Url;
 
 mod cli;
+mod epub;
 mod extractor;
+/// This module is responsible for async HTTP calls for downloading
+/// the HTML content and images
+mod http;
 mod moz_readability;
 
+use epub::generate_epub;
+use http::{download_images, fetch_url};
+
 use extractor::Extractor;
 fn main() {
-    let app = cli::cli_init();
-    let arg_matches = app.get_matches();
-    let mut urls: Vec<String> = match arg_matches.value_of("file") {
-        Some(file_name) => {
-            if let Ok(mut file) = File::open(file_name) {
-                let mut content = String::new();
-                match file.read_to_string(&mut content) {
-                    Ok(_) => content
-                        .lines()
-                        .filter(|line| !line.is_empty())
-                        .map(|line| line.to_owned())
-                        .collect(),
-                    Err(_) => vec![],
-                }
-            } else {
-                println!("Unable to open file: {}", file_name);
-                vec![]
-            }
-        }
-        None => vec![],
-    };
+    let app_config = cli::cli_init();
 
-    if let Some(vals) = arg_matches.values_of("urls") {
-        urls.extend(
-            vals.filter(|val| !val.is_empty())
-                .map(|val| val.to_string()),
-        );
+    if !app_config.urls().is_empty() {
+        download(app_config.urls().clone());
     }
-
-    if !urls.is_empty() {
-        download(urls);
-    }
-}
-
-type HTMLResource = (String, String);
-
-async fn fetch_url(url: &str) -> Result<HTMLResource, Box<dyn std::error::Error + Send + Sync>> {
-    let client = surf::Client::new();
-    println!("Fetching...");
-
-    let mut redirect_count: u8 = 0;
-    let base_url = Url::parse(&url)?;
-    let mut url = base_url.clone();
-    while redirect_count < 5 {
-        redirect_count += 1;
-        let req = surf::get(&url);
-        let mut res = client.send(req).await?;
-        if res.status().is_redirection() {
-            if let Some(location) = res.header(surf::http::headers::LOCATION) {
-                match Url::parse(location.last().as_str()) {
-                    Ok(valid_url) => url = valid_url,
-                    Err(e) => match e {
-                        url::ParseError::RelativeUrlWithoutBase => {
-                            url = base_url.join(location.last().as_str())?
-                        }
-                        e => return Err(e.into()),
-                    },
-                };
-            }
-        } else if res.status().is_success() {
-            if let Some(mime) = res.content_type() {
-                if mime.essence() == "text/html" {
-                    return Ok((url.to_string(), res.body_string().await?));
-                } else {
-                    return Err(format!(
-                        "Invalid HTTP response. Received {} instead of text/html",
-                        mime.essence()
-                    )
-                    .into());
-                }
-            } else {
-                return Err("Unknown HTTP response".into());
-            }
-        } else {
-            return Err(format!("Request failed: HTTP {}", res.status()).into());
-        }
-    }
-    Err("Unable to fetch HTML".into())
 }
 
 fn download(urls: Vec<String>) {
@@ -98,6 +29,7 @@ fn download(urls: Vec<String>) {
     for url in urls {
        async_url_tasks.push(task::spawn(async move { fetch_url(&url).await }));
     }
+
     task::block_on(async {
         for url_task in async_url_tasks {
             match url_task.await {
@@ -105,47 +37,12 @@ fn download(urls: Vec<String>) {
                     println!("Extracting");
                     let mut extractor = Extractor::from_html(&html);
                     extractor.extract_content(&url);
 
                     if extractor.article().is_some() {
-                        extractor
-                            .download_images(&Url::parse(&url).unwrap())
+                        download_images(&mut extractor, &Url::parse(&url).unwrap())
                             .await
                             .expect("Unable to download images");
-                        let file_name = format!(
-                            "{}.epub",
-                            extractor
-                                .metadata()
-                                .title()
-                                .replace("/", " ")
-                                .replace("\\", " ")
-                        );
-                        let mut out_file = File::create(&file_name).unwrap();
-                        let mut html_buf = Vec::new();
-                        extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf)
-                            .expect("Unable to serialize to xhtml");
-                        let html_buf = std::str::from_utf8(&html_buf).unwrap();
-                        let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
-                        if let Some(author) = extractor.metadata().byline() {
-                            epub.metadata("author", author.replace("&", "&amp;"))
-                                .unwrap();
-                        }
-                        epub.metadata("title", extractor.metadata().title().replace("&", "&amp;"))
-                            .unwrap();
-                        epub.add_content(EpubContent::new("index.xhtml", html_buf.as_bytes()))
-                            .unwrap();
-                        for img in extractor.img_urls {
-                            let mut file_path = std::env::temp_dir();
-                            file_path.push(&img.0);
-
-                            let img_buf = File::open(&file_path).expect("Can't read file");
-                            epub.add_resource(
-                                file_path.file_name().unwrap(),
-                                img_buf,
-                                img.1.unwrap(),
-                            )
-                            .unwrap();
-                        }
-                        epub.generate(&mut out_file).unwrap();
-                        println!("Created {:?}", file_name);
+                        generate_epub(extractor);
                     }
                 }
                 Err(e) => println!("{}", e),
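After this commit the pipeline reads top to bottom: cli_init builds the URL list, fetch_url grabs the HTML, Extractor pulls the article, download_images rewrites the img src attributes, and generate_epub packages it all. An illustrative invocation (flag name per the clap definitions in src/cli.rs; the URL and file name are placeholders):

    paperoni https://example.com/article
    paperoni --file links.txt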