Merge pull request #5 from hipstermojo/dev

Merge 0.2.2-alpha-1
2021-01-24 18:00:50 +03:00 · 2021-01-24 18:00:50 +03:00 · c82071a871
commit c82071a871
parent ca1f9e2800 b98c0a69a6
6 changed files with 98 additions and 60 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1242,7 +1242,7 @@ dependencies = [

 [[package]]
 name = "paperoni"
-version = "0.2.1-alpha1"
+version = "0.2.2-alpha1"
 dependencies = [
 "async-std",
 "clap",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,7 +3,7 @@ description = "A web article downloader"
 homepage = "https://github.com/hipstermojo/paperoni"
 repository = "https://github.com/hipstermojo/paperoni"
 name = "paperoni"
-version = "0.2.1-alpha1"
+version = "0.2.2-alpha1"
 authors = ["Kenneth Gitere <gitere81@gmail.com>"]
 edition = "2018"
 license = "MIT"
@@ -21,4 +21,4 @@ lazy_static = "1.4.0"
 md5 = "0.7.0"
 regex = "1.4.2"
 surf = "2.1.0"
-url = "2.2.0"
+url = "2.2.0"
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -6,7 +6,7 @@ pub fn cli_init() -> App<'static, 'static> {
            AppSettings::ArgRequiredElseHelp,
            AppSettings::UnifiedHelpMessage,
        ])
-        .version("0.2.1-alpha1")
+        .version("0.2.2-alpha1")
        .about(
            "
 Paperoni is an article downloader.
--- a/src/extractor.rs
+++ b/src/extractor.rs
@@ -68,7 +68,9 @@ impl Extractor {
    pub async fn download_images(&mut self, article_origin: &Url) -> async_std::io::Result<()> {
        let mut async_download_tasks = Vec::with_capacity(self.img_urls.len());
        self.extract_img_urls();
-        println!("Downloading images...");
+        if self.img_urls.len() > 0 {
+            println!("Downloading images...");
+        }
        for img_url in &self.img_urls {
            let img_url = img_url.0.clone();
            let abs_url = get_absolute_url(&img_url, article_origin);
@@ -129,6 +131,9 @@ impl Extractor {
                .expect("Image node does not exist");
            let mut img_node = img_ref.attributes.borrow_mut();
            *img_node.get_mut("src").unwrap() = img_path.clone();
+            // srcset is removed because readers such as Foliate then fail to display
+            // the image already downloaded and stored in src
+            img_node.remove("srcset");
            self.img_urls.push((img_path, img_mime));
        }
        Ok(())
--- a/src/main.rs
+++ b/src/main.rs
@@ -23,70 +23,106 @@ fn main() {

 type HTMLResource = (String, String);

-async fn fetch_url(url: &str) -> Result<HTMLResource, Box<dyn std::error::Error>> {
+async fn fetch_url(url: &str) -> Result<HTMLResource, Box<dyn std::error::Error + Send + Sync>> {
    let client = surf::Client::new();
    println!("Fetching...");
-    let mut res = client
-        .with(surf::middleware::Redirect::default())
-        .get(url)
-        .send()
-        .await
-        .expect(&format!("Unable to fetch {}", url));
-    if res.status() == 200 {
-        Ok((url.to_string(), res.body_string().await?))
-    } else {
-        Err("Request failed to return HTTP 200".into())
+
+    let mut redirect_count: u8 = 0;
+    let base_url = Url::parse(&url)?;
+    let mut url = base_url.clone();
+    while redirect_count < 5 {
+        redirect_count += 1;
+        let req = surf::get(&url);
+        let mut res = client.send(req).await?;
+        if res.status().is_redirection() {
+            if let Some(location) = res.header(surf::http::headers::LOCATION) {
+                match Url::parse(location.last().as_str()) {
+                    Ok(valid_url) => url = valid_url,
+                    Err(e) => match e {
+                        url::ParseError::RelativeUrlWithoutBase => {
+                            url = base_url.join(location.last().as_str())?
+                        }
+                        e => return Err(e.into()),
+                    },
+                };
+            }
+        } else if res.status().is_success() {
+            if let Some(mime) = res.content_type() {
+                if mime.essence() == "text/html" {
+                    return Ok((url.to_string(), res.body_string().await?));
+                } else {
+                    return Err(format!(
+                        "Invalid HTTP response. Received {} instead of text/html",
+                        mime.essence()
+                    )
+                    .into());
+                }
+            } else {
+                return Err("Unknown HTTP response".into());
+            }
+        } else {
+            return Err(format!("Request failed: HTTP {}", res.status()).into());
+        }
    }
+    Err("Unable to fetch HTML".into())
 }

 fn download(urls: Vec<String>) {
    let mut async_url_tasks = Vec::with_capacity(urls.len());
    for url in urls {
-        async_url_tasks.push(task::spawn(async move { fetch_url(&url).await.unwrap() }));
+        async_url_tasks.push(task::spawn(async move { fetch_url(&url).await }));
    }
    task::block_on(async {
        for url_task in async_url_tasks {
-            let (url, html) = url_task.await;
-            println!("Extracting");
-            let mut extractor = Extractor::from_html(&html);
-            extractor.extract_content(&url);
-            if extractor.article().is_some() {
-                extractor
-                    .download_images(&Url::parse(&url).unwrap())
-                    .await
-                    .expect("Unable to download images");
-                let file_name = format!(
-                    "{}.epub",
-                    extractor
-                        .metadata()
-                        .title()
-                        .replace("/", " ")
-                        .replace("\\", " ")
-                );
-                let mut out_file = File::create(&file_name).unwrap();
-                let mut html_buf = Vec::new();
-                extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf)
-                    .expect("Unable to serialize to xhtml");
-                let html_buf = std::str::from_utf8(&html_buf).unwrap();
-                let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
-                if let Some(author) = extractor.metadata().byline() {
-                    epub.metadata("author", author.replace("&", "&amp;"))
-                        .unwrap();
-                }
-                epub.metadata("title", extractor.metadata().title().replace("&", "&amp;"))
-                    .unwrap();
-                epub.add_content(EpubContent::new("index.xhtml", html_buf.as_bytes()))
-                    .unwrap();
-                for img in extractor.img_urls {
-                    let mut file_path = std::env::temp_dir();
-                    file_path.push(&img.0);
+            match url_task.await {
+                Ok((url, html)) => {
+                    println!("Extracting");
+                    let mut extractor = Extractor::from_html(&html);
+                    extractor.extract_content(&url);
+                    if extractor.article().is_some() {
+                        extractor
+                            .download_images(&Url::parse(&url).unwrap())
+                            .await
+                            .expect("Unable to download images");
+                        let file_name = format!(
+                            "{}.epub",
+                            extractor
+                                .metadata()
+                                .title()
+                                .replace("/", " ")
+                                .replace("\\", " ")
+                        );
+                        let mut out_file = File::create(&file_name).unwrap();
+                        let mut html_buf = Vec::new();
+                        extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf)
+                            .expect("Unable to serialize to xhtml");
+                        let html_buf = std::str::from_utf8(&html_buf).unwrap();
+                        let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
+                        if let Some(author) = extractor.metadata().byline() {
+                            epub.metadata("author", author.replace("&", "&amp;"))
+                                .unwrap();
+                        }
+                        epub.metadata("title", extractor.metadata().title().replace("&", "&amp;"))
+                            .unwrap();
+                        epub.add_content(EpubContent::new("index.xhtml", html_buf.as_bytes()))
+                            .unwrap();
+                        for img in extractor.img_urls {
+                            let mut file_path = std::env::temp_dir();
+                            file_path.push(&img.0);

-                    let img_buf = File::open(&file_path).expect("Can't read file");
-                    epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap())
-                        .unwrap();
+                            let img_buf = File::open(&file_path).expect("Can't read file");
+                            epub.add_resource(
+                                file_path.file_name().unwrap(),
+                                img_buf,
+                                img.1.unwrap(),
+                            )
+                            .unwrap();
+                        }
+                        epub.generate(&mut out_file).unwrap();
+                        println!("Created {:?}", file_name);
+                    }
                }
-                epub.generate(&mut out_file).unwrap();
-                println!("Created {:?}", file_name);
+                Err(e) => println!("{}", e),
            }
        }
    })
--- a/src/moz_readability/mod.rs
+++ b/src/moz_readability/mod.rs
@@ -193,10 +193,7 @@ impl Readability {
                                .borrow_mut()
                                .insert(attr_name, prev_value.value.clone());
                        }
-                        // WARN: This assumes `next_element` returns an element node!!
-                        let inner_node_child =
-                            Self::next_element(inner_node_ref.first_child(), true);
-                        prev_elem.insert_after(inner_node_child.unwrap());
+                        prev_elem.insert_after(new_img.as_node().clone());
                        prev_elem.detach();
                    }
                }