#[macro_use]
extern crate lazy_static;

use std::{fs::File, io::Read};

use async_std::task;
use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
use url::Url;

mod cli;
mod extractor;
mod moz_readability;

use extractor::Extractor;
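
/// Entry point: collects target URLs from the `file` argument (one URL per
/// line) and from the positional `urls` values, then hands them to `download`.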
fn main() {
    let app = cli::cli_init();
    let arg_matches = app.get_matches();

    let mut urls: Vec<String> = match arg_matches.value_of("file") {
        Some(file_name) => {
            if let Ok(mut file) = File::open(file_name) {
                let mut content = String::new();
                match file.read_to_string(&mut content) {
                    Ok(_) => content
                        .lines()
                        .filter(|line| !line.is_empty())
                        .map(|line| line.to_owned())
                        .collect(),
                    Err(_) => vec![],
                }
            } else {
                println!("Unable to open file: {}", file_name);
                vec![]
            }
        }
        None => vec![],
    };
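
    // URLs passed directly on the command line are appended to any read from the file.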
    if let Some(vals) = arg_matches.values_of("urls") {
        urls.extend(
            vals.filter(|val| !val.is_empty())
                .map(|val| val.to_string()),
        );
    }

    if !urls.is_empty() {
        download(urls);
    }
}
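
/// A fetched page: the final URL (after any redirects) paired with the HTML body.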
type HTMLResource = (String, String);
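
/// Fetch a page, following redirects (at most five requests in total), and
/// return the resolved URL together with its HTML body. Non-HTML content
/// types and error statuses are reported as `Err` values.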
async fn fetch_url(url: &str) -> Result<HTMLResource, Box<dyn std::error::Error + Send + Sync>> {
    let client = surf::Client::new();
    println!("Fetching...");

    let mut redirect_count: u8 = 0;
    let base_url = Url::parse(&url)?;
    let mut url = base_url.clone();
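    // Retry on redirects, but give up after five requests.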
    while redirect_count < 5 {
        redirect_count += 1;
        let req = surf::get(&url);
        let mut res = client.send(req).await?;
        if res.status().is_redirection() {
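            // Follow the Location header; a relative redirect is resolved
            // against the URL we started from.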
            if let Some(location) = res.header(surf::http::headers::LOCATION) {
                match Url::parse(location.last().as_str()) {
                    Ok(valid_url) => url = valid_url,
                    Err(e) => match e {
                        url::ParseError::RelativeUrlWithoutBase => {
                            url = base_url.join(location.last().as_str())?
                        }
                        e => return Err(e.into()),
                    },
                };
            }
        } else if res.status().is_success() {
            if let Some(mime) = res.content_type() {
                if mime.essence() == "text/html" {
                    return Ok((url.to_string(), res.body_string().await?));
                } else {
                    return Err(format!(
                        "Invalid HTTP response. Received {} instead of text/html",
                        mime.essence()
                    )
                    .into());
                }
            } else {
                return Err("Unknown HTTP response".into());
            }
        } else {
            return Err(format!("Request failed: HTTP {}", res.status()).into());
        }
    }
    Err("Unable to fetch HTML".into())
}
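
/// Fetch all URLs concurrently, extract the readable article from each page,
/// and write each article out as an EPUB named after its title.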
fn download(urls: Vec<String>) {
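    // Spawn one task per URL so the fetches run concurrently.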
    let mut async_url_tasks = Vec::with_capacity(urls.len());
    for url in urls {
        async_url_tasks.push(task::spawn(async move { fetch_url(&url).await }));
    }
    task::block_on(async {
        for url_task in async_url_tasks {
            match url_task.await {
                Ok((url, html)) => {
                    println!("Extracting");
                    let mut extractor = Extractor::from_html(&html);
                    extractor.extract_content(&url);
                    if extractor.article().is_some() {
                        extractor
                            .download_images(&Url::parse(&url).unwrap())
                            .await
                            .expect("Unable to download images");
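                        // Name the EPUB after the article title, replacing
                        // path separators that would break file creation.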
                        let file_name = format!(
                            "{}.epub",
                            extractor
                                .metadata()
                                .title()
                                .replace("/", " ")
                                .replace("\\", " ")
                        );
                        let mut out_file = File::create(&file_name).unwrap();
                        let mut html_buf = Vec::new();
                        extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf)
                            .expect("Unable to serialize to xhtml");
                        let html_buf = std::str::from_utf8(&html_buf).unwrap();
                        let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
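                        // EPUB metadata is XML, so bare ampersands must be escaped.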
                        if let Some(author) = extractor.metadata().byline() {
                            epub.metadata("author", author.replace("&", "&amp;"))
                                .unwrap();
                        }
                        epub.metadata("title", extractor.metadata().title().replace("&", "&amp;"))
                            .unwrap();
                        epub.add_content(EpubContent::new("index.xhtml", html_buf.as_bytes()))
                            .unwrap();
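                        // Bundle each image (saved under the system temp
                        // directory) into the EPUB as a resource.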
                        for img in extractor.img_urls {
                            let mut file_path = std::env::temp_dir();
                            file_path.push(&img.0);

                            let img_buf = File::open(&file_path).expect("Can't read file");
                            epub.add_resource(
                                file_path.file_name().unwrap(),
                                img_buf,
                                img.1.unwrap(),
                            )
                            .unwrap();
                        }
                        epub.generate(&mut out_file).unwrap();
                        println!("Created {:?}", file_name);
                    }
                }
                Err(e) => println!("{}", e),
            }
        }
    })
}