Refactor fetch_url

This adds:
- Stricter validation of responses, so that only successful responses
  with a `text/html` content type are accepted.
- Better handling of redirects, which allows fetching links proxied to
  Medium (the resolution logic is sketched below).
Kenneth Gitere 2021-01-24 17:49:42 +03:00
parent 1dc7b3432b
commit 21c3ffd922
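
The interesting part of the new redirect handling is resolving the
`Location` header: an absolute URL replaces the current one, while a
relative URL must be joined onto the URL that issued the redirect.
A minimal standalone sketch of that pattern with the `url` crate
(`resolve_location` and the example URLs are illustrative, not code
from this commit):

    use url::{ParseError, Url};

    // Illustrative helper: resolve a Location header value against
    // the URL that issued the redirect.
    fn resolve_location(base: &Url, location: &str) -> Result<Url, ParseError> {
        match Url::parse(location) {
            // An absolute redirect target is used as-is.
            Ok(absolute) => Ok(absolute),
            // A relative target is joined onto the base URL.
            Err(ParseError::RelativeUrlWithoutBase) => base.join(location),
            Err(e) => Err(e),
        }
    }

    fn main() -> Result<(), ParseError> {
        let base = Url::parse("https://medium.com/some-post")?;
        assert_eq!(
            resolve_location(&base, "/m/global-identity")?.as_str(),
            "https://medium.com/m/global-identity"
        );
        assert_eq!(
            resolve_location(&base, "https://example.com/post")?.as_str(),
            "https://example.com/post"
        );
        Ok(())
    }

The loop in the commit also caps the number of redirects at five, so a
misbehaving server cannot bounce the client around indefinitely.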

@@ -23,30 +23,59 @@ fn main() {
 type HTMLResource = (String, String);
 
-async fn fetch_url(url: &str) -> Result<HTMLResource, Box<dyn std::error::Error>> {
+async fn fetch_url(url: &str) -> Result<HTMLResource, Box<dyn std::error::Error + Send + Sync>> {
     let client = surf::Client::new();
     println!("Fetching...");
-    let mut res = client
-        .with(surf::middleware::Redirect::default())
-        .get(url)
-        .send()
-        .await
-        .expect(&format!("Unable to fetch {}", url));
-    if res.status() == 200 {
-        Ok((url.to_string(), res.body_string().await?))
-    } else {
-        Err("Request failed to return HTTP 200".into())
+
+    let mut redirect_count: u8 = 0;
+    let base_url = Url::parse(&url)?;
+    let mut url = base_url.clone();
+    while redirect_count < 5 {
+        redirect_count += 1;
+        let req = surf::get(&url);
+        let mut res = client.send(req).await?;
+        if res.status().is_redirection() {
+            if let Some(location) = res.header(surf::http::headers::LOCATION) {
+                match Url::parse(location.last().as_str()) {
+                    Ok(valid_url) => url = valid_url,
+                    Err(e) => match e {
+                        url::ParseError::RelativeUrlWithoutBase => {
+                            url = base_url.join(location.last().as_str())?
+                        }
+                        e => return Err(e.into()),
+                    },
+                };
+            }
+        } else if res.status().is_success() {
+            if let Some(mime) = res.content_type() {
+                if mime.essence() == "text/html" {
+                    return Ok((url.to_string(), res.body_string().await?));
+                } else {
+                    return Err(format!(
+                        "Invalid HTTP response. Received {} instead of text/html",
+                        mime.essence()
+                    )
+                    .into());
+                }
+            } else {
+                return Err("Unknown HTTP response".into());
+            }
+        } else {
+            return Err(format!("Request failed: HTTP {}", res.status()).into());
+        }
     }
+    Err("Unable to fetch HTML".into())
 }
 
 fn download(urls: Vec<String>) {
     let mut async_url_tasks = Vec::with_capacity(urls.len());
     for url in urls {
-        async_url_tasks.push(task::spawn(async move { fetch_url(&url).await.unwrap() }));
+        async_url_tasks.push(task::spawn(async move { fetch_url(&url).await }));
     }
     task::block_on(async {
         for url_task in async_url_tasks {
-            let (url, html) = url_task.await;
-            println!("Extracting");
-            let mut extractor = Extractor::from_html(&html);
-            extractor.extract_content(&url);
+            match url_task.await {
+                Ok((url, html)) => {
+                    println!("Extracting");
+                    let mut extractor = Extractor::from_html(&html);
+                    extractor.extract_content(&url);
@@ -82,12 +111,19 @@ fn download(urls: Vec<String>) {
                         file_path.push(&img.0);
                         let img_buf = File::open(&file_path).expect("Can't read file");
-                        epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap())
-                            .unwrap();
+                        epub.add_resource(
+                            file_path.file_name().unwrap(),
+                            img_buf,
+                            img.1.unwrap(),
+                        )
+                        .unwrap();
                     }
                     epub.generate(&mut out_file).unwrap();
                     println!("Created {:?}", file_name);
                 }
+                Err(e) => println!("{}", e),
+            }
         }
     })
 }
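
One detail the commit message doesn't spell out: dropping the `unwrap()`
inside `task::spawn` means the spawned future now yields a `Result`, and
`async_std::task::spawn` requires the task's output to be `Send`. A plain
`Box<dyn std::error::Error>` is not, which is presumably why the signature
gains `+ Send + Sync` (std also provides `From<&str>` for
`Box<dyn Error + Send + Sync>`, so the existing `"...".into()` error
conversions keep compiling). A minimal sketch of the pattern, with a
hypothetical `might_fail` standing in for `fetch_url`:

    use async_std::task;
    use std::error::Error;

    // Hypothetical stand-in for fetch_url. With plain `Box<dyn Error>`
    // the output would not be `Send` and `task::spawn` would reject it.
    async fn might_fail(ok: bool) -> Result<String, Box<dyn Error + Send + Sync>> {
        if ok {
            Ok("<html></html>".to_string())
        } else {
            Err("request failed".into())
        }
    }

    fn main() {
        let handle = task::spawn(async { might_fail(true).await });
        // The task's Result is returned to the caller, instead of
        // panicking inside the task via unwrap().
        println!("{:?}", task::block_on(handle));
    }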