Merge pull request #5 from hipstermojo/dev

Merge 0.2.2-alpha-1
commit c82071a871
Kenneth Gitere, 2021-01-24 18:00:50 +03:00, committed by GitHub
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 98 additions and 60 deletions

Cargo.lock (generated)

@@ -1242,7 +1242,7 @@ dependencies = [
 
 [[package]]
 name = "paperoni"
-version = "0.2.1-alpha1"
+version = "0.2.2-alpha1"
 dependencies = [
  "async-std",
  "clap",

Cargo.toml

@@ -3,7 +3,7 @@ description = "A web article downloader"
 homepage = "https://github.com/hipstermojo/paperoni"
 repository = "https://github.com/hipstermojo/paperoni"
 name = "paperoni"
-version = "0.2.1-alpha1"
+version = "0.2.2-alpha1"
 authors = ["Kenneth Gitere <gitere81@gmail.com>"]
 edition = "2018"
 license = "MIT"
@@ -21,4 +21,4 @@ lazy_static = "1.4.0"
 md5 = "0.7.0"
 regex = "1.4.2"
 surf = "2.1.0"
-url = "2.2.0"
+url = "2.2.0"

src/cli.rs

@@ -6,7 +6,7 @@ pub fn cli_init() -> App<'static, 'static> {
             AppSettings::ArgRequiredElseHelp,
             AppSettings::UnifiedHelpMessage,
         ])
-        .version("0.2.1-alpha1")
+        .version("0.2.2-alpha1")
         .about(
             "
 Paperoni is an article downloader.

src/extractor.rs

@@ -68,7 +68,9 @@ impl Extractor {
     pub async fn download_images(&mut self, article_origin: &Url) -> async_std::io::Result<()> {
         let mut async_download_tasks = Vec::with_capacity(self.img_urls.len());
         self.extract_img_urls();
-        println!("Downloading images...");
+        if self.img_urls.len() > 0 {
+            println!("Downloading images...");
+        }
         for img_url in &self.img_urls {
             let img_url = img_url.0.clone();
             let abs_url = get_absolute_url(&img_url, article_origin);
@@ -129,6 +131,9 @@ impl Extractor {
             .expect("Image node does not exist");
         let mut img_node = img_ref.attributes.borrow_mut();
         *img_node.get_mut("src").unwrap() = img_path.clone();
+        // srcset is removed because readers such as Foliate then fail to display
+        // the image already downloaded and stored in src
+        img_node.remove("srcset");
         self.img_urls.push((img_path, img_mime));
     }
     Ok(())

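For context on the srcset fix: once the extractor rewrites src to point at the downloaded local file, a leftover srcset still wins in renderers that honour it, so readers like Foliate try (and fail) to load the remote candidates. A minimal sketch of the removal in isolation, using the same kuchiki API the extractor already uses (the fixture HTML below is made up):

    use kuchiki::traits::TendrilSink;

    fn main() {
        // Made-up fixture: a local src plus a stale remote srcset.
        let html = r#"<img src="downloaded.png" srcset="remote-1x.png 1x, remote-2x.png 2x">"#;
        let doc = kuchiki::parse_html().one(html);
        let img = doc.select_first("img").unwrap();
        let mut attrs = img.attributes.borrow_mut();
        // With srcset gone, readers fall back to the locally rewritten src.
        attrs.remove("srcset");
        assert!(attrs.get("srcset").is_none());
        assert_eq!(attrs.get("src"), Some("downloaded.png"));
    }
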
src/main.rs

@@ -23,70 +23,106 @@ fn main() {
 
 type HTMLResource = (String, String);
 
-async fn fetch_url(url: &str) -> Result<HTMLResource, Box<dyn std::error::Error>> {
+async fn fetch_url(url: &str) -> Result<HTMLResource, Box<dyn std::error::Error + Send + Sync>> {
     let client = surf::Client::new();
     println!("Fetching...");
-    let mut res = client
-        .with(surf::middleware::Redirect::default())
-        .get(url)
-        .send()
-        .await
-        .expect(&format!("Unable to fetch {}", url));
-    if res.status() == 200 {
-        Ok((url.to_string(), res.body_string().await?))
-    } else {
-        Err("Request failed to return HTTP 200".into())
+
+    let mut redirect_count: u8 = 0;
+    let base_url = Url::parse(&url)?;
+    let mut url = base_url.clone();
+    while redirect_count < 5 {
+        redirect_count += 1;
+        let req = surf::get(&url);
+        let mut res = client.send(req).await?;
+        if res.status().is_redirection() {
+            if let Some(location) = res.header(surf::http::headers::LOCATION) {
+                match Url::parse(location.last().as_str()) {
+                    Ok(valid_url) => url = valid_url,
+                    Err(e) => match e {
+                        url::ParseError::RelativeUrlWithoutBase => {
+                            url = base_url.join(location.last().as_str())?
+                        }
+                        e => return Err(e.into()),
+                    },
+                };
+            }
+        } else if res.status().is_success() {
+            if let Some(mime) = res.content_type() {
+                if mime.essence() == "text/html" {
+                    return Ok((url.to_string(), res.body_string().await?));
+                } else {
+                    return Err(format!(
+                        "Invalid HTTP response. Received {} instead of text/html",
+                        mime.essence()
+                    )
+                    .into());
+                }
+            } else {
+                return Err("Unknown HTTP response".into());
+            }
+        } else {
+            return Err(format!("Request failed: HTTP {}", res.status()).into());
+        }
     }
+    Err("Unable to fetch HTML".into())
 }
 
 fn download(urls: Vec<String>) {
     let mut async_url_tasks = Vec::with_capacity(urls.len());
     for url in urls {
-        async_url_tasks.push(task::spawn(async move { fetch_url(&url).await.unwrap() }));
+        async_url_tasks.push(task::spawn(async move { fetch_url(&url).await }));
     }
     task::block_on(async {
         for url_task in async_url_tasks {
-            let (url, html) = url_task.await;
-            println!("Extracting");
-            let mut extractor = Extractor::from_html(&html);
-            extractor.extract_content(&url);
-            if extractor.article().is_some() {
-                extractor
-                    .download_images(&Url::parse(&url).unwrap())
-                    .await
-                    .expect("Unable to download images");
-                let file_name = format!(
-                    "{}.epub",
-                    extractor
-                        .metadata()
-                        .title()
-                        .replace("/", " ")
-                        .replace("\\", " ")
-                );
-                let mut out_file = File::create(&file_name).unwrap();
-                let mut html_buf = Vec::new();
-                extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf)
-                    .expect("Unable to serialize to xhtml");
-                let html_buf = std::str::from_utf8(&html_buf).unwrap();
-                let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
-                if let Some(author) = extractor.metadata().byline() {
-                    epub.metadata("author", author.replace("&", "&amp;"))
-                        .unwrap();
-                }
-                epub.metadata("title", extractor.metadata().title().replace("&", "&amp;"))
-                    .unwrap();
-                epub.add_content(EpubContent::new("index.xhtml", html_buf.as_bytes()))
-                    .unwrap();
-                for img in extractor.img_urls {
-                    let mut file_path = std::env::temp_dir();
-                    file_path.push(&img.0);
-                    let img_buf = File::open(&file_path).expect("Can't read file");
-                    epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap())
-                        .unwrap();
-                }
-                epub.generate(&mut out_file).unwrap();
-                println!("Created {:?}", file_name);
-            }
+            match url_task.await {
+                Ok((url, html)) => {
+                    println!("Extracting");
+                    let mut extractor = Extractor::from_html(&html);
+                    extractor.extract_content(&url);
+                    if extractor.article().is_some() {
+                        extractor
+                            .download_images(&Url::parse(&url).unwrap())
+                            .await
+                            .expect("Unable to download images");
+                        let file_name = format!(
+                            "{}.epub",
+                            extractor
+                                .metadata()
+                                .title()
+                                .replace("/", " ")
+                                .replace("\\", " ")
+                        );
+                        let mut out_file = File::create(&file_name).unwrap();
+                        let mut html_buf = Vec::new();
+                        extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf)
+                            .expect("Unable to serialize to xhtml");
+                        let html_buf = std::str::from_utf8(&html_buf).unwrap();
+                        let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
+                        if let Some(author) = extractor.metadata().byline() {
+                            epub.metadata("author", author.replace("&", "&amp;"))
+                                .unwrap();
+                        }
+                        epub.metadata("title", extractor.metadata().title().replace("&", "&amp;"))
+                            .unwrap();
+                        epub.add_content(EpubContent::new("index.xhtml", html_buf.as_bytes()))
+                            .unwrap();
+                        for img in extractor.img_urls {
+                            let mut file_path = std::env::temp_dir();
+                            file_path.push(&img.0);
+                            let img_buf = File::open(&file_path).expect("Can't read file");
+                            epub.add_resource(
+                                file_path.file_name().unwrap(),
+                                img_buf,
+                                img.1.unwrap(),
+                            )
+                            .unwrap();
+                        }
+                        epub.generate(&mut out_file).unwrap();
+                        println!("Created {:?}", file_name);
+                    }
+                }
+                Err(e) => println!("{}", e),
+            }
         }
     })

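The rewritten fetch_url drops surf's Redirect middleware for a manual loop capped at five hops, so the final URL (not the original) is what gets reported, and it returns errors instead of unwrapping inside the spawned task. A minimal sketch (not from this commit; might_fail is a made-up stand-in for fetch_url) of why the error type had to gain `+ Send + Sync`: async_std::task::spawn only accepts tasks whose output is Send, which a plain Box<dyn std::error::Error> is not.

    use async_std::task;

    // Hypothetical stand-in for fetch_url with the same Send + Sync error type.
    async fn might_fail(flag: bool) -> Result<String, Box<dyn std::error::Error + Send + Sync>> {
        if flag {
            Ok("fetched".into())
        } else {
            Err("simulated network failure".into())
        }
    }

    fn main() {
        let handles: Vec<_> = [true, false]
            .iter()
            .map(|&flag| task::spawn(async move { might_fail(flag).await }))
            .collect();
        task::block_on(async {
            for handle in handles {
                match handle.await {
                    // Mirrors download(): success is processed, failure is printed.
                    Ok(body) => println!("{}", body),
                    Err(e) => println!("{}", e),
                }
            }
        });
    }
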
src/moz_readability/mod.rs

@@ -193,10 +193,7 @@ impl Readability {
                     .borrow_mut()
                     .insert(attr_name, prev_value.value.clone());
             }
-            // WARN: This assumes `next_element` returns an element node!!
-            let inner_node_child =
-                Self::next_element(inner_node_ref.first_child(), true);
-            prev_elem.insert_after(inner_node_child.unwrap());
+            prev_elem.insert_after(new_img.as_node().clone());
             prev_elem.detach();
         }
     }
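
The removed lines searched the replacement subtree for a first element child and assumed one existed (hence the WARN and the unwrap); inserting the already-built new_img node directly avoids that fragile lookup. A minimal sketch (not from this commit; the fixture and ids are made up) of the same insert-after-then-detach pattern with kuchiki:

    use kuchiki::traits::TendrilSink;

    fn main() {
        let doc = kuchiki::parse_html()
            .one(r#"<p><span id="old">placeholder</span><img id="new" src="a.png"></p>"#);
        let prev_elem = doc.select_first("#old").unwrap().as_node().clone();
        let new_img = doc.select_first("#new").unwrap().as_node().clone();
        // Insert the replacement right after the node being swapped out,
        // then detach the old node; no child search, no unwrap on a lookup.
        prev_elem.insert_after(new_img);
        prev_elem.detach();
        let mut out = Vec::new();
        doc.serialize(&mut out).unwrap();
        println!("{}", String::from_utf8(out).unwrap());
    }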