Refactor fetch_url

This adds:
- More validation of responses to ensure the HTML response is valid.
- Better handling of redirecting URLs, which allows fetching links that are proxied to Medium.
2021-01-24 17:49:42 +03:00 · 2021-01-24 17:49:42 +03:00 · 21c3ffd922
commit 21c3ffd922
parent 1dc7b3432b
1 changed file with 87 additions and 51 deletions
--- a/src/main.rs
+++ b/src/main.rs
@@ -23,30 +23,59 @@ fn main() {

 type HTMLResource = (String, String);

-async fn fetch_url(url: &str) -> Result<HTMLResource, Box<dyn std::error::Error>> {
+async fn fetch_url(url: &str) -> Result<HTMLResource, Box<dyn std::error::Error + Send + Sync>> {
    let client = surf::Client::new();
    println!("Fetching...");
-    let mut res = client
-        .with(surf::middleware::Redirect::default())
-        .get(url)
-        .send()
-        .await
-        .expect(&format!("Unable to fetch {}", url));
-    if res.status() == 200 {
-        Ok((url.to_string(), res.body_string().await?))
-    } else {
-        Err("Request failed to return HTTP 200".into())
+
+    let mut redirect_count: u8 = 0;
+    let base_url = Url::parse(&url)?;
+    let mut url = base_url.clone();
+    while redirect_count < 5 {
+        redirect_count += 1;
+        let req = surf::get(&url);
+        let mut res = client.send(req).await?;
+        if res.status().is_redirection() {
+            if let Some(location) = res.header(surf::http::headers::LOCATION) {
+                match Url::parse(location.last().as_str()) {
+                    Ok(valid_url) => url = valid_url,
+                    Err(e) => match e {
+                        url::ParseError::RelativeUrlWithoutBase => {
+                            url = base_url.join(location.last().as_str())?
                        }
+                        e => return Err(e.into()),
+                    },
+                };
+            }
+        } else if res.status().is_success() {
+            if let Some(mime) = res.content_type() {
+                if mime.essence() == "text/html" {
+                    return Ok((url.to_string(), res.body_string().await?));
+                } else {
+                    return Err(format!(
+                        "Invalid HTTP response. Received {} instead of text/html",
+                        mime.essence()
+                    )
+                    .into());
+                }
+            } else {
+                return Err("Unknown HTTP response".into());
+            }
+        } else {
+            return Err(format!("Request failed: HTTP {}", res.status()).into());
+        }
+    }
+    Err("Unable to fetch HTML".into())
 }

 fn download(urls: Vec<String>) {
    let mut async_url_tasks = Vec::with_capacity(urls.len());
    for url in urls {
-        async_url_tasks.push(task::spawn(async move { fetch_url(&url).await.unwrap() }));
+        async_url_tasks.push(task::spawn(async move { fetch_url(&url).await }));
    }
    task::block_on(async {
        for url_task in async_url_tasks {
-            let (url, html) = url_task.await;
+            match url_task.await {
+                Ok((url, html)) => {
                    println!("Extracting");
                    let mut extractor = Extractor::from_html(&html);
                    extractor.extract_content(&url);
@@ -82,12 +111,19 @@ fn download(urls: Vec<String>) {
                            file_path.push(&img.0);

                            let img_buf = File::open(&file_path).expect("Can't read file");
-                    epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap())
+                            epub.add_resource(
+                                file_path.file_name().unwrap(),
+                                img_buf,
+                                img.1.unwrap(),
+                            )
                            .unwrap();
                        }
                        epub.generate(&mut out_file).unwrap();
                        println!("Created {:?}", file_name);
                    }
                }
+                Err(e) => println!("{}", e),
+            }
+        }
    })
 }