commit
c82071a871
6 changed files with 98 additions and 60 deletions
2
Cargo.lock
generated
2
Cargo.lock
generated
|
@@ -1242,7 +1242,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "paperoni"
|
name = "paperoni"
|
||||||
version = "0.2.1-alpha1"
|
version = "0.2.2-alpha1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-std",
|
"async-std",
|
||||||
"clap",
|
"clap",
|
||||||
|
|
|
@@ -3,7 +3,7 @@ description = "A web article downloader"
|
||||||
homepage = "https://github.com/hipstermojo/paperoni"
|
homepage = "https://github.com/hipstermojo/paperoni"
|
||||||
repository = "https://github.com/hipstermojo/paperoni"
|
repository = "https://github.com/hipstermojo/paperoni"
|
||||||
name = "paperoni"
|
name = "paperoni"
|
||||||
version = "0.2.1-alpha1"
|
version = "0.2.2-alpha1"
|
||||||
authors = ["Kenneth Gitere <gitere81@gmail.com>"]
|
authors = ["Kenneth Gitere <gitere81@gmail.com>"]
|
||||||
edition = "2018"
|
edition = "2018"
|
||||||
license = "MIT"
|
license = "MIT"
|
||||||
|
@@ -21,4 +21,4 @@ lazy_static = "1.4.0"
|
||||||
md5 = "0.7.0"
|
md5 = "0.7.0"
|
||||||
regex = "1.4.2"
|
regex = "1.4.2"
|
||||||
surf = "2.1.0"
|
surf = "2.1.0"
|
||||||
url = "2.2.0"
|
url = "2.2.0"
|
||||||
|
|
|
@@ -6,7 +6,7 @@ pub fn cli_init() -> App<'static, 'static> {
|
||||||
AppSettings::ArgRequiredElseHelp,
|
AppSettings::ArgRequiredElseHelp,
|
||||||
AppSettings::UnifiedHelpMessage,
|
AppSettings::UnifiedHelpMessage,
|
||||||
])
|
])
|
||||||
.version("0.2.1-alpha1")
|
.version("0.2.2-alpha1")
|
||||||
.about(
|
.about(
|
||||||
"
|
"
|
||||||
Paperoni is an article downloader.
|
Paperoni is an article downloader.
|
||||||
|
|
|
@@ -68,7 +68,9 @@ impl Extractor {
|
||||||
pub async fn download_images(&mut self, article_origin: &Url) -> async_std::io::Result<()> {
|
pub async fn download_images(&mut self, article_origin: &Url) -> async_std::io::Result<()> {
|
||||||
let mut async_download_tasks = Vec::with_capacity(self.img_urls.len());
|
let mut async_download_tasks = Vec::with_capacity(self.img_urls.len());
|
||||||
self.extract_img_urls();
|
self.extract_img_urls();
|
||||||
println!("Downloading images...");
|
if self.img_urls.len() > 0 {
|
||||||
|
println!("Downloading images...");
|
||||||
|
}
|
||||||
for img_url in &self.img_urls {
|
for img_url in &self.img_urls {
|
||||||
let img_url = img_url.0.clone();
|
let img_url = img_url.0.clone();
|
||||||
let abs_url = get_absolute_url(&img_url, article_origin);
|
let abs_url = get_absolute_url(&img_url, article_origin);
|
||||||
|
@@ -129,6 +131,9 @@ impl Extractor {
|
||||||
.expect("Image node does not exist");
|
.expect("Image node does not exist");
|
||||||
let mut img_node = img_ref.attributes.borrow_mut();
|
let mut img_node = img_ref.attributes.borrow_mut();
|
||||||
*img_node.get_mut("src").unwrap() = img_path.clone();
|
*img_node.get_mut("src").unwrap() = img_path.clone();
|
||||||
|
// srcset is removed because readers such as Foliate then fail to display
|
||||||
|
// the image already downloaded and stored in src
|
||||||
|
img_node.remove("srcset");
|
||||||
self.img_urls.push((img_path, img_mime));
|
self.img_urls.push((img_path, img_mime));
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
|
|
138
src/main.rs
138
src/main.rs
|
@@ -23,70 +23,106 @@ fn main() {
|
||||||
|
|
||||||
type HTMLResource = (String, String);
|
type HTMLResource = (String, String);
|
||||||
|
|
||||||
async fn fetch_url(url: &str) -> Result<HTMLResource, Box<dyn std::error::Error>> {
|
async fn fetch_url(url: &str) -> Result<HTMLResource, Box<dyn std::error::Error + Send + Sync>> {
|
||||||
let client = surf::Client::new();
|
let client = surf::Client::new();
|
||||||
println!("Fetching...");
|
println!("Fetching...");
|
||||||
let mut res = client
|
|
||||||
.with(surf::middleware::Redirect::default())
|
let mut redirect_count: u8 = 0;
|
||||||
.get(url)
|
let base_url = Url::parse(&url)?;
|
||||||
.send()
|
let mut url = base_url.clone();
|
||||||
.await
|
while redirect_count < 5 {
|
||||||
.expect(&format!("Unable to fetch {}", url));
|
redirect_count += 1;
|
||||||
if res.status() == 200 {
|
let req = surf::get(&url);
|
||||||
Ok((url.to_string(), res.body_string().await?))
|
let mut res = client.send(req).await?;
|
||||||
} else {
|
if res.status().is_redirection() {
|
||||||
Err("Request failed to return HTTP 200".into())
|
if let Some(location) = res.header(surf::http::headers::LOCATION) {
|
||||||
|
match Url::parse(location.last().as_str()) {
|
||||||
|
Ok(valid_url) => url = valid_url,
|
||||||
|
Err(e) => match e {
|
||||||
|
url::ParseError::RelativeUrlWithoutBase => {
|
||||||
|
url = base_url.join(location.last().as_str())?
|
||||||
|
}
|
||||||
|
e => return Err(e.into()),
|
||||||
|
},
|
||||||
|
};
|
||||||
|
}
|
||||||
|
} else if res.status().is_success() {
|
||||||
|
if let Some(mime) = res.content_type() {
|
||||||
|
if mime.essence() == "text/html" {
|
||||||
|
return Ok((url.to_string(), res.body_string().await?));
|
||||||
|
} else {
|
||||||
|
return Err(format!(
|
||||||
|
"Invalid HTTP response. Received {} instead of text/html",
|
||||||
|
mime.essence()
|
||||||
|
)
|
||||||
|
.into());
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return Err("Unknown HTTP response".into());
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return Err(format!("Request failed: HTTP {}", res.status()).into());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Err("Unable to fetch HTML".into())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn download(urls: Vec<String>) {
|
fn download(urls: Vec<String>) {
|
||||||
let mut async_url_tasks = Vec::with_capacity(urls.len());
|
let mut async_url_tasks = Vec::with_capacity(urls.len());
|
||||||
for url in urls {
|
for url in urls {
|
||||||
async_url_tasks.push(task::spawn(async move { fetch_url(&url).await.unwrap() }));
|
async_url_tasks.push(task::spawn(async move { fetch_url(&url).await }));
|
||||||
}
|
}
|
||||||
task::block_on(async {
|
task::block_on(async {
|
||||||
for url_task in async_url_tasks {
|
for url_task in async_url_tasks {
|
||||||
let (url, html) = url_task.await;
|
match url_task.await {
|
||||||
println!("Extracting");
|
Ok((url, html)) => {
|
||||||
let mut extractor = Extractor::from_html(&html);
|
println!("Extracting");
|
||||||
extractor.extract_content(&url);
|
let mut extractor = Extractor::from_html(&html);
|
||||||
if extractor.article().is_some() {
|
extractor.extract_content(&url);
|
||||||
extractor
|
if extractor.article().is_some() {
|
||||||
.download_images(&Url::parse(&url).unwrap())
|
extractor
|
||||||
.await
|
.download_images(&Url::parse(&url).unwrap())
|
||||||
.expect("Unable to download images");
|
.await
|
||||||
let file_name = format!(
|
.expect("Unable to download images");
|
||||||
"{}.epub",
|
let file_name = format!(
|
||||||
extractor
|
"{}.epub",
|
||||||
.metadata()
|
extractor
|
||||||
.title()
|
.metadata()
|
||||||
.replace("/", " ")
|
.title()
|
||||||
.replace("\\", " ")
|
.replace("/", " ")
|
||||||
);
|
.replace("\\", " ")
|
||||||
let mut out_file = File::create(&file_name).unwrap();
|
);
|
||||||
let mut html_buf = Vec::new();
|
let mut out_file = File::create(&file_name).unwrap();
|
||||||
extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf)
|
let mut html_buf = Vec::new();
|
||||||
.expect("Unable to serialize to xhtml");
|
extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf)
|
||||||
let html_buf = std::str::from_utf8(&html_buf).unwrap();
|
.expect("Unable to serialize to xhtml");
|
||||||
let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
|
let html_buf = std::str::from_utf8(&html_buf).unwrap();
|
||||||
if let Some(author) = extractor.metadata().byline() {
|
let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
|
||||||
epub.metadata("author", author.replace("&", "&amp;"))
|
if let Some(author) = extractor.metadata().byline() {
|
||||||
.unwrap();
|
epub.metadata("author", author.replace("&", "&amp;"))
|
||||||
}
|
.unwrap();
|
||||||
epub.metadata("title", extractor.metadata().title().replace("&", "&amp;"))
|
}
|
||||||
.unwrap();
|
epub.metadata("title", extractor.metadata().title().replace("&", "&amp;"))
|
||||||
epub.add_content(EpubContent::new("index.xhtml", html_buf.as_bytes()))
|
.unwrap();
|
||||||
.unwrap();
|
epub.add_content(EpubContent::new("index.xhtml", html_buf.as_bytes()))
|
||||||
for img in extractor.img_urls {
|
.unwrap();
|
||||||
let mut file_path = std::env::temp_dir();
|
for img in extractor.img_urls {
|
||||||
file_path.push(&img.0);
|
let mut file_path = std::env::temp_dir();
|
||||||
|
file_path.push(&img.0);
|
||||||
|
|
||||||
let img_buf = File::open(&file_path).expect("Can't read file");
|
let img_buf = File::open(&file_path).expect("Can't read file");
|
||||||
epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap())
|
epub.add_resource(
|
||||||
.unwrap();
|
file_path.file_name().unwrap(),
|
||||||
|
img_buf,
|
||||||
|
img.1.unwrap(),
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
}
|
||||||
|
epub.generate(&mut out_file).unwrap();
|
||||||
|
println!("Created {:?}", file_name);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
epub.generate(&mut out_file).unwrap();
|
Err(e) => println!("{}", e),
|
||||||
println!("Created {:?}", file_name);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
|
@@ -193,10 +193,7 @@ impl Readability {
|
||||||
.borrow_mut()
|
.borrow_mut()
|
||||||
.insert(attr_name, prev_value.value.clone());
|
.insert(attr_name, prev_value.value.clone());
|
||||||
}
|
}
|
||||||
// WARN: This assumes `next_element` returns an element node!!
|
prev_elem.insert_after(new_img.as_node().clone());
|
||||||
let inner_node_child =
|
|
||||||
Self::next_element(inner_node_ref.first_child(), true);
|
|
||||||
prev_elem.insert_after(inner_node_child.unwrap());
|
|
||||||
prev_elem.detach();
|
prev_elem.detach();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Reference in a new issue