diff --git a/Cargo.lock b/Cargo.lock index e1d51de..3b7d384 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1242,7 +1242,7 @@ dependencies = [ [[package]] name = "paperoni" -version = "0.2.1-alpha1" +version = "0.2.2-alpha1" dependencies = [ "async-std", "clap", diff --git a/Cargo.toml b/Cargo.toml index c4c64ac..01bbe6b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ description = "A web article downloader" homepage = "https://github.com/hipstermojo/paperoni" repository = "https://github.com/hipstermojo/paperoni" name = "paperoni" -version = "0.2.1-alpha1" +version = "0.2.2-alpha1" authors = ["Kenneth Gitere <gitere81@gmail.com>"] edition = "2018" license = "MIT" @@ -21,4 +21,4 @@ lazy_static = "1.4.0" md5 = "0.7.0" regex = "1.4.2" surf = "2.1.0" -url = "2.2.0" \ No newline at end of file +url = "2.2.0" diff --git a/src/cli.rs b/src/cli.rs index 92b56f4..474223b 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -6,7 +6,7 @@ pub fn cli_init() -> App<'static, 'static> { AppSettings::ArgRequiredElseHelp, AppSettings::UnifiedHelpMessage, ]) - .version("0.2.1-alpha1") + .version("0.2.2-alpha1") .about( " Paperoni is an article downloader. 
diff --git a/src/extractor.rs b/src/extractor.rs index 9294ae6..2b90e3b 100644 --- a/src/extractor.rs +++ b/src/extractor.rs @@ -68,7 +68,9 @@ impl Extractor { pub async fn download_images(&mut self, article_origin: &Url) -> async_std::io::Result<()> { let mut async_download_tasks = Vec::with_capacity(self.img_urls.len()); self.extract_img_urls(); - println!("Downloading images..."); + if self.img_urls.len() > 0 { + println!("Downloading images..."); + } for img_url in &self.img_urls { let img_url = img_url.0.clone(); let abs_url = get_absolute_url(&img_url, article_origin); @@ -129,6 +131,9 @@ impl Extractor { .expect("Image node does not exist"); let mut img_node = img_ref.attributes.borrow_mut(); *img_node.get_mut("src").unwrap() = img_path.clone(); + // srcset is removed because readers such as Foliate then fail to display + // the image already downloaded and stored in src + img_node.remove("srcset"); self.img_urls.push((img_path, img_mime)); } Ok(()) diff --git a/src/main.rs b/src/main.rs index bf14ee3..4e403b6 100644 --- a/src/main.rs +++ b/src/main.rs @@ -23,70 +23,106 @@ fn main() { type HTMLResource = (String, String); -async fn fetch_url(url: &str) -> Result<HTMLResource, Box<dyn std::error::Error>> { +async fn fetch_url(url: &str) -> Result<HTMLResource, Box<dyn std::error::Error + Send + Sync>> { let client = surf::Client::new(); println!("Fetching..."); - let mut res = client - .with(surf::middleware::Redirect::default()) - .get(url) - .send() - .await - .expect(&format!("Unable to fetch {}", url)); - if res.status() == 200 { - Ok((url.to_string(), res.body_string().await?)) - } else { - Err("Request failed to return HTTP 200".into()) + + let mut redirect_count: u8 = 0; + let base_url = Url::parse(&url)?; + let mut url = base_url.clone(); + while redirect_count < 5 { + redirect_count += 1; + let req = surf::get(&url); + let mut res = client.send(req).await?; + if res.status().is_redirection() { + if let Some(location) = res.header(surf::http::headers::LOCATION) { + match Url::parse(location.last().as_str()) { + Ok(valid_url) => url = valid_url, 
+ Err(e) => match e { + url::ParseError::RelativeUrlWithoutBase => { + url = base_url.join(location.last().as_str())? + } + e => return Err(e.into()), + }, + }; + } + } else if res.status().is_success() { + if let Some(mime) = res.content_type() { + if mime.essence() == "text/html" { + return Ok((url.to_string(), res.body_string().await?)); + } else { + return Err(format!( + "Invalid HTTP response. Received {} instead of text/html", + mime.essence() + ) + .into()); + } + } else { + return Err("Unknown HTTP response".into()); + } + } else { + return Err(format!("Request failed: HTTP {}", res.status()).into()); + } } + Err("Unable to fetch HTML".into()) } fn download(urls: Vec<String>) { let mut async_url_tasks = Vec::with_capacity(urls.len()); for url in urls { - async_url_tasks.push(task::spawn(async move { fetch_url(&url).await.unwrap() })); + async_url_tasks.push(task::spawn(async move { fetch_url(&url).await })); } task::block_on(async { for url_task in async_url_tasks { - let (url, html) = url_task.await; - println!("Extracting"); - let mut extractor = Extractor::from_html(&html); - extractor.extract_content(&url); - if extractor.article().is_some() { - extractor - .download_images(&Url::parse(&url).unwrap()) - .await - .expect("Unable to download images"); - let file_name = format!( - "{}.epub", - extractor - .metadata() - .title() - .replace("/", " ") - .replace("\\", " ") - ); - let mut out_file = File::create(&file_name).unwrap(); - let mut html_buf = Vec::new(); - extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf) - .expect("Unable to serialize to xhtml"); - let html_buf = std::str::from_utf8(&html_buf).unwrap(); - let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap(); - if let Some(author) = extractor.metadata().byline() { - epub.metadata("author", author.replace("&", "&amp;")) - .unwrap(); - } - epub.metadata("title", extractor.metadata().title().replace("&", "&amp;")) - .unwrap(); - epub.add_content(EpubContent::new("index.xhtml", 
html_buf.as_bytes())) - .unwrap(); - for img in extractor.img_urls { - let mut file_path = std::env::temp_dir(); - file_path.push(&img.0); + match url_task.await { + Ok((url, html)) => { + println!("Extracting"); + let mut extractor = Extractor::from_html(&html); + extractor.extract_content(&url); + if extractor.article().is_some() { + extractor + .download_images(&Url::parse(&url).unwrap()) + .await + .expect("Unable to download images"); + let file_name = format!( + "{}.epub", + extractor + .metadata() + .title() + .replace("/", " ") + .replace("\\", " ") + ); + let mut out_file = File::create(&file_name).unwrap(); + let mut html_buf = Vec::new(); + extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf) + .expect("Unable to serialize to xhtml"); + let html_buf = std::str::from_utf8(&html_buf).unwrap(); + let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap(); + if let Some(author) = extractor.metadata().byline() { + epub.metadata("author", author.replace("&", "&amp;")) + .unwrap(); + } + epub.metadata("title", extractor.metadata().title().replace("&", "&amp;")) + .unwrap(); + epub.add_content(EpubContent::new("index.xhtml", html_buf.as_bytes())) + .unwrap(); + for img in extractor.img_urls { + let mut file_path = std::env::temp_dir(); + file_path.push(&img.0); - let img_buf = File::open(&file_path).expect("Can't read file"); - epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap()) - .unwrap(); + let img_buf = File::open(&file_path).expect("Can't read file"); + epub.add_resource( + file_path.file_name().unwrap(), + img_buf, + img.1.unwrap(), + ) + .unwrap(); + } + epub.generate(&mut out_file).unwrap(); + println!("Created {:?}", file_name); + } } - epub.generate(&mut out_file).unwrap(); - println!("Created {:?}", file_name); + Err(e) => println!("{}", e), } } }) diff --git a/src/moz_readability/mod.rs b/src/moz_readability/mod.rs index a252d57..7986c2b 100644 --- a/src/moz_readability/mod.rs +++ 
b/src/moz_readability/mod.rs @@ -193,10 +193,7 @@ impl Readability { .borrow_mut() .insert(attr_name, prev_value.value.clone()); } - // WARN: This assumes `next_element` returns an element node!! - let inner_node_child = - Self::next_element(inner_node_ref.first_child(), true); - prev_elem.insert_after(inner_node_child.unwrap()); + prev_elem.insert_after(new_img.as_node().clone()); prev_elem.detach(); } }