From 21c3ffd922ddb14efd5bc273a98d073e6f08be7f Mon Sep 17 00:00:00 2001 From: Kenneth Gitere Date: Sun, 24 Jan 2021 17:49:42 +0300 Subject: [PATCH] Refactor fetch_url This adds: - More validation of responses to ensure the HTML response is valid. - Better handling of redirecting URLs which allows for fetching of links proxied to Medium. --- src/main.rs | 138 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 87 insertions(+), 51 deletions(-) diff --git a/src/main.rs b/src/main.rs index bf14ee3..4e403b6 100644 --- a/src/main.rs +++ b/src/main.rs @@ -23,70 +23,106 @@ fn main() { type HTMLResource = (String, String); -async fn fetch_url(url: &str) -> Result> { +async fn fetch_url(url: &str) -> Result> { let client = surf::Client::new(); println!("Fetching..."); - let mut res = client - .with(surf::middleware::Redirect::default()) - .get(url) - .send() - .await - .expect(&format!("Unable to fetch {}", url)); - if res.status() == 200 { - Ok((url.to_string(), res.body_string().await?)) - } else { - Err("Request failed to return HTTP 200".into()) + + let mut redirect_count: u8 = 0; + let base_url = Url::parse(&url)?; + let mut url = base_url.clone(); + while redirect_count < 5 { + redirect_count += 1; + let req = surf::get(&url); + let mut res = client.send(req).await?; + if res.status().is_redirection() { + if let Some(location) = res.header(surf::http::headers::LOCATION) { + match Url::parse(location.last().as_str()) { + Ok(valid_url) => url = valid_url, + Err(e) => match e { + url::ParseError::RelativeUrlWithoutBase => { + url = base_url.join(location.last().as_str())? + } + e => return Err(e.into()), + }, + }; + } + } else if res.status().is_success() { + if let Some(mime) = res.content_type() { + if mime.essence() == "text/html" { + return Ok((url.to_string(), res.body_string().await?)); + } else { + return Err(format!( + "Invalid HTTP response. Received {} instead of text/html", + mime.essence() + ) + .into()); + } + } else { + return Err("Unknown HTTP response".into()); + } + } else { + return Err(format!("Request failed: HTTP {}", res.status()).into()); + } } + Err("Unable to fetch HTML".into()) } fn download(urls: Vec) { let mut async_url_tasks = Vec::with_capacity(urls.len()); for url in urls { - async_url_tasks.push(task::spawn(async move { fetch_url(&url).await.unwrap() })); + async_url_tasks.push(task::spawn(async move { fetch_url(&url).await })); } task::block_on(async { for url_task in async_url_tasks { - let (url, html) = url_task.await; - println!("Extracting"); - let mut extractor = Extractor::from_html(&html); - extractor.extract_content(&url); - if extractor.article().is_some() { - extractor - .download_images(&Url::parse(&url).unwrap()) - .await - .expect("Unable to download images"); - let file_name = format!( - "{}.epub", - extractor - .metadata() - .title() - .replace("/", " ") - .replace("\\", " ") - ); - let mut out_file = File::create(&file_name).unwrap(); - let mut html_buf = Vec::new(); - extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf) - .expect("Unable to serialize to xhtml"); - let html_buf = std::str::from_utf8(&html_buf).unwrap(); - let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap(); - if let Some(author) = extractor.metadata().byline() { - epub.metadata("author", author.replace("&", "&")) - .unwrap(); - } - epub.metadata("title", extractor.metadata().title().replace("&", "&")) - .unwrap(); - epub.add_content(EpubContent::new("index.xhtml", html_buf.as_bytes())) - .unwrap(); - for img in extractor.img_urls { - let mut file_path = std::env::temp_dir(); - file_path.push(&img.0); + match url_task.await { + Ok((url, html)) => { + println!("Extracting"); + let mut extractor = Extractor::from_html(&html); + extractor.extract_content(&url); + if extractor.article().is_some() { + extractor + .download_images(&Url::parse(&url).unwrap()) + .await + .expect("Unable to download images"); + let file_name = format!( + "{}.epub", + extractor + .metadata() + .title() + .replace("/", " ") + .replace("\\", " ") + ); + let mut out_file = File::create(&file_name).unwrap(); + let mut html_buf = Vec::new(); + extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf) + .expect("Unable to serialize to xhtml"); + let html_buf = std::str::from_utf8(&html_buf).unwrap(); + let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap(); + if let Some(author) = extractor.metadata().byline() { + epub.metadata("author", author.replace("&", "&")) + .unwrap(); + } + epub.metadata("title", extractor.metadata().title().replace("&", "&")) + .unwrap(); + epub.add_content(EpubContent::new("index.xhtml", html_buf.as_bytes())) + .unwrap(); + for img in extractor.img_urls { + let mut file_path = std::env::temp_dir(); + file_path.push(&img.0); - let img_buf = File::open(&file_path).expect("Can't read file"); - epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap()) - .unwrap(); + let img_buf = File::open(&file_path).expect("Can't read file"); + epub.add_resource( + file_path.file_name().unwrap(), + img_buf, + img.1.unwrap(), + ) + .unwrap(); + } + epub.generate(&mut out_file).unwrap(); + println!("Created {:?}", file_name); + } } - epub.generate(&mut out_file).unwrap(); - println!("Created {:?}", file_name); + Err(e) => println!("{}", e), } } })