Merge branch 'dev' of github.com:hipstermojo/paperoni into dev
This commit is contained in:
commit
5ccbe1a17a
2 changed files with 79 additions and 63 deletions
131
src/http.rs
131
src/http.rs
|
@ -72,6 +72,60 @@ pub async fn fetch_html(url: &str) -> Result<HTMLResource, PaperoniError> {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// The result of processing one image response: the borrowed image URL, the
/// name of the file the image was saved as, and the MIME essence of the image
/// (if any).
type ImgItem<'a> = (&'a str, String, Option<String>);
|
||||||
|
|
||||||
|
async fn process_img_response<'a>(
|
||||||
|
img_response: &mut surf::Response,
|
||||||
|
url: &'a str,
|
||||||
|
) -> Result<ImgItem<'a>, ImgError> {
|
||||||
|
if !img_response.status().is_success() {
|
||||||
|
let kind = ErrorKind::HTTPError(format!(
|
||||||
|
"Non-success HTTP status code ({})",
|
||||||
|
img_response.status()
|
||||||
|
));
|
||||||
|
return Err(ImgError::with_kind(kind));
|
||||||
|
}
|
||||||
|
let img_content: Vec<u8> = match img_response.body_bytes().await {
|
||||||
|
Ok(bytes) => bytes,
|
||||||
|
Err(e) => return Err(e.into()),
|
||||||
|
};
|
||||||
|
let img_mime = img_response
|
||||||
|
.content_type()
|
||||||
|
.map(|mime| mime.essence().to_string());
|
||||||
|
let img_ext = match img_response
|
||||||
|
.content_type()
|
||||||
|
.map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
|
||||||
|
{
|
||||||
|
Some(mime_str) => mime_str,
|
||||||
|
None => return Err(ErrorKind::HTTPError("Image has no Content-Type".to_owned()).into()),
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut img_path = std::env::temp_dir();
|
||||||
|
img_path.push(format!("{}.{}", hash_url(url), &img_ext));
|
||||||
|
let mut img_file = match File::create(&img_path).await {
|
||||||
|
Ok(file) => file,
|
||||||
|
Err(e) => return Err(e.into()),
|
||||||
|
};
|
||||||
|
match img_file.write_all(&img_content).await {
|
||||||
|
Ok(_) => (),
|
||||||
|
Err(e) => return Err(e.into()),
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok((
|
||||||
|
url,
|
||||||
|
img_path
|
||||||
|
.file_name()
|
||||||
|
.map(|os_str_name| {
|
||||||
|
os_str_name
|
||||||
|
.to_str()
|
||||||
|
.expect("Unable to get image file name")
|
||||||
|
.to_string()
|
||||||
|
})
|
||||||
|
.unwrap(),
|
||||||
|
img_mime,
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
pub async fn download_images(
|
pub async fn download_images(
|
||||||
extractor: &mut Extractor,
|
extractor: &mut Extractor,
|
||||||
article_origin: &Url,
|
article_origin: &Url,
|
||||||
|
@ -102,53 +156,9 @@ pub async fn download_images(
|
||||||
bar.set_message(format!("Downloading images [{}/{}]", img_idx + 1, img_count).as_str());
|
bar.set_message(format!("Downloading images [{}/{}]", img_idx + 1, img_count).as_str());
|
||||||
match req.await {
|
match req.await {
|
||||||
Ok(mut img_response) => {
|
Ok(mut img_response) => {
|
||||||
let process_response = async {
|
let process_response =
|
||||||
let img_content: Vec<u8> = match img_response.body_bytes().await {
|
process_img_response(&mut img_response, url.as_ref()).await;
|
||||||
Ok(bytes) => bytes,
|
process_response.map_err(|mut e: ImgError| {
|
||||||
Err(e) => return Err(e.into()),
|
|
||||||
};
|
|
||||||
let img_mime = img_response
|
|
||||||
.content_type()
|
|
||||||
.map(|mime| mime.essence().to_string());
|
|
||||||
let img_ext = match img_response
|
|
||||||
.content_type()
|
|
||||||
.map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
|
|
||||||
{
|
|
||||||
Some(mime_str) => mime_str,
|
|
||||||
None => {
|
|
||||||
return Err(ErrorKind::HTTPError(
|
|
||||||
"Image has no Content-Type".to_owned(),
|
|
||||||
)
|
|
||||||
.into())
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut img_path = std::env::temp_dir();
|
|
||||||
img_path.push(format!("{}.{}", hash_url(&url), &img_ext));
|
|
||||||
let mut img_file = match File::create(&img_path).await {
|
|
||||||
Ok(file) => file,
|
|
||||||
Err(e) => return Err(e.into()),
|
|
||||||
};
|
|
||||||
match img_file.write_all(&img_content).await {
|
|
||||||
Ok(_) => (),
|
|
||||||
Err(e) => return Err(e.into()),
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok((
|
|
||||||
url,
|
|
||||||
img_path
|
|
||||||
.file_name()
|
|
||||||
.map(|os_str_name| {
|
|
||||||
os_str_name
|
|
||||||
.to_str()
|
|
||||||
.expect("Unable to get image file name")
|
|
||||||
.to_string()
|
|
||||||
})
|
|
||||||
.unwrap(),
|
|
||||||
img_mime,
|
|
||||||
))
|
|
||||||
};
|
|
||||||
process_response.await.map_err(|mut e: ImgError| {
|
|
||||||
e.set_url(url);
|
e.set_url(url);
|
||||||
e
|
e
|
||||||
})
|
})
|
||||||
|
@ -162,20 +172,19 @@ pub async fn download_images(
|
||||||
});
|
});
|
||||||
|
|
||||||
// A utility closure used to update the value of an image source after it is downloaded successfully
|
// A utility closure used to update the value of an image source after it is downloaded successfully
|
||||||
let replace_existing_img_src =
|
let replace_existing_img_src = |img_item: ImgItem| -> (String, Option<String>) {
|
||||||
|img_item: (&String, String, Option<String>)| -> (String, Option<String>) {
|
let (img_url, img_path, img_mime) = img_item;
|
||||||
let (img_url, img_path, img_mime) = img_item;
|
let img_ref = extractor
|
||||||
let img_ref = extractor
|
.article()
|
||||||
.article()
|
.select_first(&format!("img[src='{}']", img_url))
|
||||||
.select_first(&format!("img[src='{}']", img_url))
|
.expect("Image node does not exist");
|
||||||
.expect("Image node does not exist");
|
let mut img_node = img_ref.attributes.borrow_mut();
|
||||||
let mut img_node = img_ref.attributes.borrow_mut();
|
*img_node.get_mut("src").unwrap() = img_path.clone();
|
||||||
*img_node.get_mut("src").unwrap() = img_path.clone();
|
// srcset is removed because readers such as Foliate then fail to display
|
||||||
// srcset is removed because readers such as Foliate then fail to display
|
// the image already downloaded and stored in src
|
||||||
// the image already downloaded and stored in src
|
img_node.remove("srcset");
|
||||||
img_node.remove("srcset");
|
(img_path, img_mime)
|
||||||
(img_path, img_mime)
|
};
|
||||||
};
|
|
||||||
|
|
||||||
let imgs_req_iter = stream::from_iter(imgs_req_iter)
|
let imgs_req_iter = stream::from_iter(imgs_req_iter)
|
||||||
.buffered(10)
|
.buffered(10)
|
||||||
|
|
|
@ -1248,8 +1248,7 @@ impl Readability {
|
||||||
let srcset = node_attr.get("srcset");
|
let srcset = node_attr.get("srcset");
|
||||||
let class = node_attr.get("class");
|
let class = node_attr.get("class");
|
||||||
if (src.is_some() || srcset.is_some())
|
if (src.is_some() || srcset.is_some())
|
||||||
&& class.is_some()
|
&& class.and_then(|classname| classname.find("lazy")).is_none()
|
||||||
&& !class.unwrap().contains("lazy")
|
|
||||||
{
|
{
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -3160,6 +3159,7 @@ characters. For that reason, this <p> tag could not be a byline because it's too
|
||||||
<source media="(min-width:465px)" srcset="img_white_flower.jpg">
|
<source media="(min-width:465px)" srcset="img_white_flower.jpg">
|
||||||
<img src="img_orange_flowers.jpg" alt="Flowers" style="width:auto;">
|
<img src="img_orange_flowers.jpg" alt="Flowers" style="width:auto;">
|
||||||
</picture>
|
</picture>
|
||||||
|
<img id="no-lazy-class" src="https://image.url/" data-attrs="{"src":"https://other.url/1.png","alt":""}"/>
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
"#;
|
"#;
|
||||||
|
@ -3189,6 +3189,13 @@ characters. For that reason, this <p> tag could not be a byline because it's too
|
||||||
lazy_loaded_attrs.get("data-src"),
|
lazy_loaded_attrs.get("data-src"),
|
||||||
lazy_loaded_attrs.get("src")
|
lazy_loaded_attrs.get("src")
|
||||||
);
|
);
|
||||||
|
|
||||||
|
let no_lazy_class = doc.root_node.select_first("#no-lazy-class").unwrap();
|
||||||
|
let no_lazy_class_attrs = no_lazy_class.attributes.borrow();
|
||||||
|
assert_eq!(
|
||||||
|
no_lazy_class_attrs.get("src").unwrap(),
|
||||||
|
"https://image.url/"
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|
Reference in a new issue