commit
3c7dc9a416
8 changed files with 1360 additions and 950 deletions
1962
Cargo.lock
generated
1962
Cargo.lock
generated
File diff suppressed because it is too large
Load diff
17
Cargo.toml
17
Cargo.toml
|
@ -3,21 +3,22 @@ description = "A web article downloader"
|
||||||
homepage = "https://github.com/hipstermojo/paperoni"
|
homepage = "https://github.com/hipstermojo/paperoni"
|
||||||
repository = "https://github.com/hipstermojo/paperoni"
|
repository = "https://github.com/hipstermojo/paperoni"
|
||||||
name = "paperoni"
|
name = "paperoni"
|
||||||
version = "0.1.0-alpha1"
|
version = "0.2.0-alpha1"
|
||||||
authors = ["Kenneth Gitere <gitere81@gmail.com>"]
|
authors = ["Kenneth Gitere <gitere81@gmail.com>"]
|
||||||
edition = "2018"
|
edition = "2018"
|
||||||
license = "MIT"
|
license = "MIT"
|
||||||
|
readme = "README.md"
|
||||||
|
|
||||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
async-std = "1.5.0"
|
async-std = "1.7.0"
|
||||||
epub-builder = "0.4.5"
|
clap = "2.33.3"
|
||||||
|
epub-builder = "0.4.8"
|
||||||
html5ever = "0.25.1"
|
html5ever = "0.25.1"
|
||||||
kuchiki = "0.8.1"
|
kuchiki = "0.8.1"
|
||||||
lazy_static = "1.3.9"
|
lazy_static = "1.4.0"
|
||||||
md5 = "0.7.0"
|
md5 = "0.7.0"
|
||||||
regex = "1.3.9"
|
regex = "1.4.2"
|
||||||
surf = "1.0.3"
|
surf = "2.1.0"
|
||||||
structopt = { version = "0.3" }
|
url = "2.2.0"
|
||||||
url = "2.1.1"
|
|
|
@ -1,4 +1,4 @@
|
||||||
<p align="center"><img src="./paperoni-dark.png" width="400"></p>
|
<p align="center"><img src="./paperoni-dark.png"></p>
|
||||||
|
|
||||||
<p align="center"><i>Salami not included</i></p>
|
<p align="center"><i>Salami not included</i></p>
|
||||||
|
|
||||||
|
@ -29,7 +29,7 @@ This extractor retrieves a possible article using a port of the [Mozilla Readabi
|
||||||
|
|
||||||
This program is still in alpha so a number of things currently break:
|
This program is still in alpha so a number of things currently break:
|
||||||
|
|
||||||
- Links with redirects will crash the program as it has no redirect logic.
|
- Certain links with redirects can't be extracted. Such links include urls that are proxying Medium.
|
||||||
- Websites that only run with JavaScript cannot be extracted.
|
- Websites that only run with JavaScript cannot be extracted.
|
||||||
- Website articles that cannot be extracted by Readability cannot be extracted by Paperoni either.
|
- Website articles that cannot be extracted by Readability cannot be extracted by Paperoni either.
|
||||||
|
|
||||||
|
|
30
src/cli.rs
30
src/cli.rs
|
@ -1,13 +1,21 @@
|
||||||
use structopt::StructOpt;
|
use clap::{App, AppSettings, Arg};
|
||||||
|
|
||||||
#[derive(Debug, StructOpt)]
|
pub fn cli_init() -> App<'static, 'static> {
|
||||||
#[structopt(name = "paperoni")]
|
App::new("paperoni")
|
||||||
/// Paperoni is an article downloader.
|
.settings(&[
|
||||||
///
|
AppSettings::ArgRequiredElseHelp,
|
||||||
/// It takes a url and downloads the article content from it and
|
AppSettings::UnifiedHelpMessage,
|
||||||
/// saves it to an epub.
|
])
|
||||||
pub struct Opts {
|
.version("0.1.0-alpha1")
|
||||||
// #[structopt(conflicts_with("links"))]
|
.about(
|
||||||
/// Url of a web article
|
"
|
||||||
pub urls: Vec<String>,
|
Paperoni is an article downloader.
|
||||||
|
It takes a url and downloads the article content from it and saves it to an epub.
|
||||||
|
",
|
||||||
|
)
|
||||||
|
.arg(
|
||||||
|
Arg::with_name("urls")
|
||||||
|
.help("Urls of web articles")
|
||||||
|
.multiple(true),
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
133
src/extractor.rs
133
src/extractor.rs
|
@ -1,3 +1,5 @@
|
||||||
|
use std::collections::HashMap;
|
||||||
|
|
||||||
use async_std::fs::File;
|
use async_std::fs::File;
|
||||||
use async_std::io::prelude::*;
|
use async_std::io::prelude::*;
|
||||||
use async_std::task;
|
use async_std::task;
|
||||||
|
@ -8,6 +10,10 @@ use crate::moz_readability::{MetaData, Readability};
|
||||||
|
|
||||||
pub type ResourceInfo = (String, Option<String>);
|
pub type ResourceInfo = (String, Option<String>);
|
||||||
|
|
||||||
|
lazy_static! {
|
||||||
|
static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r"(&|<|>)").unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
pub struct Extractor {
|
pub struct Extractor {
|
||||||
article: Option<NodeRef>,
|
article: Option<NodeRef>,
|
||||||
pub img_urls: Vec<ResourceInfo>,
|
pub img_urls: Vec<ResourceInfo>,
|
||||||
|
@ -62,22 +68,27 @@ impl Extractor {
|
||||||
pub async fn download_images(&mut self, article_origin: &Url) -> async_std::io::Result<()> {
|
pub async fn download_images(&mut self, article_origin: &Url) -> async_std::io::Result<()> {
|
||||||
let mut async_download_tasks = Vec::with_capacity(self.img_urls.len());
|
let mut async_download_tasks = Vec::with_capacity(self.img_urls.len());
|
||||||
self.extract_img_urls();
|
self.extract_img_urls();
|
||||||
println!("Downloading images to res/");
|
println!("Downloading images...");
|
||||||
for img_url in &self.img_urls {
|
for img_url in &self.img_urls {
|
||||||
let img_url = img_url.0.clone();
|
let img_url = img_url.0.clone();
|
||||||
let abs_url = get_absolute_url(&img_url, article_origin);
|
let abs_url = get_absolute_url(&img_url, article_origin);
|
||||||
|
|
||||||
async_download_tasks.push(task::spawn(async move {
|
async_download_tasks.push(task::spawn(async move {
|
||||||
let mut img_response = surf::get(&abs_url).await.expect("Unable to retrieve file");
|
let mut img_response = surf::Client::new()
|
||||||
|
.with(surf::middleware::Redirect::default())
|
||||||
|
.get(&abs_url)
|
||||||
|
.await
|
||||||
|
.expect("Unable to retrieve file");
|
||||||
let img_content: Vec<u8> = img_response.body_bytes().await.unwrap();
|
let img_content: Vec<u8> = img_response.body_bytes().await.unwrap();
|
||||||
let img_mime = img_response
|
let img_mime = img_response
|
||||||
.header("Content-Type")
|
.content_type()
|
||||||
.map(|header| header.to_string());
|
.map(|mime| mime.essence().to_string());
|
||||||
let img_ext = img_response
|
let img_ext = img_response
|
||||||
.header("Content-Type")
|
.content_type()
|
||||||
.and_then(map_mime_type_to_ext)
|
.map(|mime| map_mime_subtype_to_ext(mime.subtype()).to_string())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
let mut img_path = std::env::temp_dir();
|
||||||
let img_path = format!("res/{}{}", hash_url(&abs_url), &img_ext);
|
img_path.push(format!("{}.{}", hash_url(&abs_url), &img_ext));
|
||||||
let mut img_file = File::create(&img_path)
|
let mut img_file = File::create(&img_path)
|
||||||
.await
|
.await
|
||||||
.expect("Unable to create file");
|
.expect("Unable to create file");
|
||||||
|
@ -86,7 +97,19 @@ impl Extractor {
|
||||||
.await
|
.await
|
||||||
.expect("Unable to save to file");
|
.expect("Unable to save to file");
|
||||||
|
|
||||||
(img_url, img_path, img_mime)
|
(
|
||||||
|
img_url,
|
||||||
|
img_path
|
||||||
|
.file_name()
|
||||||
|
.map(|os_str_name| {
|
||||||
|
os_str_name
|
||||||
|
.to_str()
|
||||||
|
.expect("Unable to get image file name")
|
||||||
|
.to_string()
|
||||||
|
})
|
||||||
|
.unwrap(),
|
||||||
|
img_mime,
|
||||||
|
)
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -123,21 +146,15 @@ fn hash_url(url: &str) -> String {
|
||||||
format!("{:x}", md5::compute(url.as_bytes()))
|
format!("{:x}", md5::compute(url.as_bytes()))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Handles getting the extension from a given MIME type. The extension starts with a dot
|
/// Handles getting the extension from a given MIME subtype.
|
||||||
fn map_mime_type_to_ext(mime_type: &str) -> Option<String> {
|
fn map_mime_subtype_to_ext(subtype: &str) -> &str {
|
||||||
mime_type
|
if subtype == ("svg+xml") {
|
||||||
.split("/")
|
return "svg";
|
||||||
.last()
|
} else if subtype == "x-icon" {
|
||||||
.map(|format| {
|
"ico"
|
||||||
if format == ("svg+xml") {
|
} else {
|
||||||
return "svg";
|
subtype
|
||||||
} else if format == "x-icon" {
|
}
|
||||||
"ico"
|
|
||||||
} else {
|
|
||||||
format
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.map(|format| String::from(".") + format)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_absolute_url(url: &str, request_url: &Url) -> String {
|
fn get_absolute_url(url: &str, request_url: &Url) -> String {
|
||||||
|
@ -158,6 +175,56 @@ fn get_absolute_url(url: &str, request_url: &Url) -> String {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Serializes a NodeRef to a string that is XHTML compatible
|
||||||
|
/// The only DOM nodes serialized are Text and Element nodes
|
||||||
|
pub fn serialize_to_xhtml<W: std::io::Write>(
|
||||||
|
node_ref: &NodeRef,
|
||||||
|
mut w: &mut W,
|
||||||
|
) -> Result<(), Box<dyn std::error::Error>> {
|
||||||
|
let mut escape_map = HashMap::new();
|
||||||
|
escape_map.insert("<", "&lt;");
|
||||||
|
escape_map.insert(">", "&gt;");
|
||||||
|
escape_map.insert("&", "&amp;");
|
||||||
|
for edge in node_ref.traverse_inclusive() {
|
||||||
|
match edge {
|
||||||
|
kuchiki::iter::NodeEdge::Start(n) => match n.data() {
|
||||||
|
kuchiki::NodeData::Text(rc_text) => {
|
||||||
|
let text = rc_text.borrow();
|
||||||
|
let esc_text = ESC_SEQ_REGEX
|
||||||
|
.replace_all(&text, |captures: &regex::Captures| escape_map[&captures[1]]);
|
||||||
|
write!(&mut w, "{}", esc_text)?;
|
||||||
|
}
|
||||||
|
kuchiki::NodeData::Element(elem_data) => {
|
||||||
|
let attrs = elem_data.attributes.borrow();
|
||||||
|
let attrs_str = attrs
|
||||||
|
.map
|
||||||
|
.iter()
|
||||||
|
.map(|(k, v)| {
|
||||||
|
format!(
|
||||||
|
"{}=\"{}\"",
|
||||||
|
k.local,
|
||||||
|
ESC_SEQ_REGEX
|
||||||
|
.replace_all(&v.value, |captures: &regex::Captures| {
|
||||||
|
escape_map[&captures[1]]
|
||||||
|
})
|
||||||
|
)
|
||||||
|
})
|
||||||
|
.fold("".to_string(), |acc, val| acc + " " + &val);
|
||||||
|
write!(&mut w, "<{}{}>", &elem_data.name.local, attrs_str)?;
|
||||||
|
}
|
||||||
|
_ => (),
|
||||||
|
},
|
||||||
|
kuchiki::iter::NodeEdge::End(n) => match n.data() {
|
||||||
|
kuchiki::NodeData::Element(elem_data) => {
|
||||||
|
write!(&mut w, "</{}>", &elem_data.name.local)?;
|
||||||
|
}
|
||||||
|
_ => (),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod test {
|
mod test {
|
||||||
use super::*;
|
use super::*;
|
||||||
|
@ -204,23 +271,15 @@ mod test {
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_map_mime_type_to_ext() {
|
fn test_map_mime_type_to_ext() {
|
||||||
let mime_types = vec![
|
let mime_subtypes = vec![
|
||||||
"image/apng",
|
"apng", "bmp", "gif", "x-icon", "jpeg", "png", "svg+xml", "tiff", "webp",
|
||||||
"image/bmp",
|
|
||||||
"image/gif",
|
|
||||||
"image/x-icon",
|
|
||||||
"image/jpeg",
|
|
||||||
"image/png",
|
|
||||||
"image/svg+xml",
|
|
||||||
"image/tiff",
|
|
||||||
"image/webp",
|
|
||||||
];
|
];
|
||||||
let exts = mime_types
|
let exts = mime_subtypes
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|mime_type| map_mime_type_to_ext(mime_type).unwrap())
|
.map(|mime_type| map_mime_subtype_to_ext(mime_type))
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
vec![".apng", ".bmp", ".gif", ".ico", ".jpeg", ".png", ".svg", ".tiff", ".webp"],
|
vec!["apng", "bmp", "gif", "ico", "jpeg", "png", "svg", "tiff", "webp"],
|
||||||
exts
|
exts
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
68
src/main.rs
68
src/main.rs
|
@ -3,9 +3,8 @@ extern crate lazy_static;
|
||||||
|
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
|
|
||||||
use async_std::{fs::create_dir, fs::remove_dir_all, task};
|
use async_std::task;
|
||||||
use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
|
use epub_builder::{EpubBuilder, EpubContent, ZipLibrary};
|
||||||
use structopt::StructOpt;
|
|
||||||
use url::Url;
|
use url::Url;
|
||||||
|
|
||||||
mod cli;
|
mod cli;
|
||||||
|
@ -14,33 +13,36 @@ mod moz_readability;
|
||||||
|
|
||||||
use extractor::Extractor;
|
use extractor::Extractor;
|
||||||
fn main() {
|
fn main() {
|
||||||
let opt = cli::Opts::from_args();
|
let app = cli::cli_init();
|
||||||
if !opt.urls.is_empty() {
|
let arg_matches = app.get_matches();
|
||||||
println!("Downloading single article");
|
if let Some(vals) = arg_matches.values_of("urls") {
|
||||||
download(opt.urls);
|
let urls = vals.map(|val| val.to_string()).collect::<Vec<_>>();
|
||||||
|
download(urls);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
type HTMLResource = (String, String);
|
type HTMLResource = (String, String);
|
||||||
|
|
||||||
async fn fetch_url(url: &str) -> HTMLResource {
|
async fn fetch_url(url: &str) -> Result<HTMLResource, Box<dyn std::error::Error>> {
|
||||||
let client = surf::Client::new();
|
let client = surf::Client::new();
|
||||||
println!("Fetching...");
|
println!("Fetching...");
|
||||||
// TODO: Add middleware for following redirects
|
let mut res = client
|
||||||
(
|
.with(surf::middleware::Redirect::default())
|
||||||
url.to_string(),
|
.get(url)
|
||||||
client
|
.send()
|
||||||
.get(url)
|
.await
|
||||||
.recv_string()
|
.expect(&format!("Unable to fetch {}", url));
|
||||||
.await
|
if res.status() == 200 {
|
||||||
.expect("Unable to fetch URL"),
|
Ok((url.to_string(), res.body_string().await?))
|
||||||
)
|
} else {
|
||||||
|
Err("Request failed to return HTTP 200".into())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn download(urls: Vec<String>) {
|
fn download(urls: Vec<String>) {
|
||||||
let mut async_url_tasks = Vec::with_capacity(urls.len());
|
let mut async_url_tasks = Vec::with_capacity(urls.len());
|
||||||
for url in urls {
|
for url in urls {
|
||||||
async_url_tasks.push(task::spawn(async move { fetch_url(&url).await }));
|
async_url_tasks.push(task::spawn(async move { fetch_url(&url).await.unwrap() }));
|
||||||
}
|
}
|
||||||
task::block_on(async {
|
task::block_on(async {
|
||||||
for url_task in async_url_tasks {
|
for url_task in async_url_tasks {
|
||||||
|
@ -49,41 +51,35 @@ fn download(urls: Vec<String>) {
|
||||||
let mut extractor = Extractor::from_html(&html);
|
let mut extractor = Extractor::from_html(&html);
|
||||||
extractor.extract_content(&url);
|
extractor.extract_content(&url);
|
||||||
if extractor.article().is_some() {
|
if extractor.article().is_some() {
|
||||||
create_dir("res/")
|
|
||||||
.await
|
|
||||||
.expect("Unable to create res/ output folder");
|
|
||||||
extractor
|
extractor
|
||||||
.download_images(&Url::parse(&url).unwrap())
|
.download_images(&Url::parse(&url).unwrap())
|
||||||
.await
|
.await
|
||||||
.expect("Unable to download images");
|
.expect("Unable to download images");
|
||||||
let mut out_file =
|
let file_name = format!("{}.epub", extractor.metadata().title());
|
||||||
File::create(format!("{}.epub", extractor.metadata().title())).unwrap();
|
let mut out_file = File::create(&file_name).unwrap();
|
||||||
let mut html_buf = Vec::new();
|
let mut html_buf = Vec::new();
|
||||||
extractor
|
extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf)
|
||||||
.article()
|
.expect("Unable to serialize to xhtml");
|
||||||
.unwrap()
|
|
||||||
.serialize(&mut html_buf)
|
|
||||||
.expect("Unable to serialize");
|
|
||||||
let html_buf = std::str::from_utf8(&html_buf).unwrap();
|
let html_buf = std::str::from_utf8(&html_buf).unwrap();
|
||||||
let html_buf = moz_readability::regexes::REPLACE_SELF_CLOSING_REGEX
|
|
||||||
.replace_all(html_buf, "$tag/>");
|
|
||||||
let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
|
let mut epub = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
|
||||||
if let Some(author) = extractor.metadata().byline() {
|
if let Some(author) = extractor.metadata().byline() {
|
||||||
epub.metadata("author", author).unwrap();
|
epub.metadata("author", author.replace("&", "&amp;"))
|
||||||
|
.unwrap();
|
||||||
}
|
}
|
||||||
epub.metadata("title", extractor.metadata().title())
|
epub.metadata("title", extractor.metadata().title().replace("&", "&amp;"))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
epub.add_content(EpubContent::new("code.xhtml", html_buf.as_bytes()))
|
epub.add_content(EpubContent::new("code.xhtml", html_buf.as_bytes()))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
for img in extractor.img_urls {
|
for img in extractor.img_urls {
|
||||||
let file_path = format!("{}", &img.0);
|
let mut file_path = std::env::temp_dir();
|
||||||
|
file_path.push(&img.0);
|
||||||
|
|
||||||
let img_buf = File::open(file_path).expect("Can't read file");
|
let img_buf = File::open(&file_path).expect("Can't read file");
|
||||||
epub.add_resource(img.0, img_buf, img.1.unwrap()).unwrap();
|
epub.add_resource(file_path.file_name().unwrap(), img_buf, img.1.unwrap())
|
||||||
|
.unwrap();
|
||||||
}
|
}
|
||||||
epub.generate(&mut out_file).unwrap();
|
epub.generate(&mut out_file).unwrap();
|
||||||
println!("Cleaning up");
|
println!("Created {:?}", file_name);
|
||||||
remove_dir_all("res/").await.unwrap();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
|
@ -653,7 +653,12 @@ impl Readability {
|
||||||
})
|
})
|
||||||
.map(|node_ref| {
|
.map(|node_ref| {
|
||||||
let node_attrs = node_ref.attributes.borrow();
|
let node_attrs = node_ref.attributes.borrow();
|
||||||
Url::parse(node_attrs.get("href").unwrap()).unwrap()
|
let href = node_attrs.get("href").unwrap();
|
||||||
|
if href.trim() == "/" {
|
||||||
|
document_uri.join("/").unwrap()
|
||||||
|
} else {
|
||||||
|
Url::parse(href).unwrap()
|
||||||
|
}
|
||||||
})
|
})
|
||||||
.next()
|
.next()
|
||||||
.unwrap_or(document_uri.clone());
|
.unwrap_or(document_uri.clone());
|
||||||
|
@ -758,14 +763,66 @@ impl Readability {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Converts an inline CSS string to a [HashMap] of property and value(s)
|
/// Converts an inline CSS string to a [HashMap] of property and value(s)
|
||||||
fn inline_css_str_to_map(css_str: &str) -> HashMap<&str, &str> {
|
fn inline_css_str_to_map(css_str: &str) -> HashMap<String, String> {
|
||||||
css_str
|
enum State {
|
||||||
.split(";")
|
ReadProp,
|
||||||
.filter(|split_str| !split_str.trim().is_empty())
|
ReadVal,
|
||||||
.map(|str_pair| {
|
ReadQuot,
|
||||||
let mut vals = str_pair.split(":");
|
ReadDquot,
|
||||||
(vals.next().unwrap().trim(), vals.next().unwrap().trim())
|
}
|
||||||
})
|
let mut decl: (Option<String>, Option<String>) = (None, None);
|
||||||
|
let mut chars = css_str.chars();
|
||||||
|
let mut state = State::ReadProp;
|
||||||
|
let mut token = String::new();
|
||||||
|
let mut tokens = vec![];
|
||||||
|
while let Some(c) = chars.next() {
|
||||||
|
match state {
|
||||||
|
State::ReadProp => {
|
||||||
|
if c != ':' {
|
||||||
|
token.push(c);
|
||||||
|
} else {
|
||||||
|
state = State::ReadVal;
|
||||||
|
decl.0 = Some(token.trim().to_string());
|
||||||
|
token.clear();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
State::ReadVal => {
|
||||||
|
if c == '\'' {
|
||||||
|
state = State::ReadQuot;
|
||||||
|
token.push(c);
|
||||||
|
} else if c == '"' {
|
||||||
|
state = State::ReadDquot;
|
||||||
|
token.push(c);
|
||||||
|
} else if c == ';' {
|
||||||
|
state = State::ReadProp;
|
||||||
|
decl.1 = Some(token.trim().to_string());
|
||||||
|
tokens.push(decl.clone());
|
||||||
|
token.clear();
|
||||||
|
} else {
|
||||||
|
token.push(c);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
State::ReadQuot => {
|
||||||
|
token.push(c);
|
||||||
|
if c == '\'' {
|
||||||
|
state = State::ReadVal;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
State::ReadDquot => {
|
||||||
|
token.push(c);
|
||||||
|
if c == '"' {
|
||||||
|
state = State::ReadVal;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !token.is_empty() {
|
||||||
|
decl.1 = Some(token.trim().to_string());
|
||||||
|
tokens.push(decl);
|
||||||
|
}
|
||||||
|
tokens
|
||||||
|
.into_iter()
|
||||||
|
.map(|tok_pair| (tok_pair.0.unwrap(), tok_pair.1.unwrap()))
|
||||||
.collect()
|
.collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2394,18 +2451,19 @@ mod test {
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
let css_str = "display: flex; height: 200px; width: 250px; justify-content: center; align-items: center; border: 2px solid black";
|
let css_str = "display: flex; height: 200px; width: 250px; justify-content: center; align-items: center; border: 2px solid black";
|
||||||
let mut css_map = HashMap::new();
|
let mut css_map = HashMap::new();
|
||||||
css_map.insert("display", "flex");
|
css_map.insert("display".to_string(), "flex".to_string());
|
||||||
css_map.insert("height", "200px");
|
css_map.insert("height".to_string(), "200px".to_string());
|
||||||
css_map.insert("width", "250px");
|
css_map.insert("width".to_string(), "250px".to_string());
|
||||||
css_map.insert("justify-content", "center");
|
css_map.insert("justify-content".to_string(), "center".to_string());
|
||||||
css_map.insert("align-items", "center");
|
css_map.insert("align-items".to_string(), "center".to_string());
|
||||||
css_map.insert("border", "2px solid black");
|
css_map.insert("border".to_string(), "2px solid black".to_string());
|
||||||
|
|
||||||
let css_str_to_vec = Readability::inline_css_str_to_map(css_str);
|
let css_str_to_vec = Readability::inline_css_str_to_map(css_str);
|
||||||
assert_eq!(css_map, css_str_to_vec);
|
assert_eq!(css_map, css_str_to_vec);
|
||||||
let mut css_map = HashMap::new();
|
let mut css_map = HashMap::new();
|
||||||
css_map.insert("color", "red");
|
css_map.insert("color".to_string(), "red".to_string());
|
||||||
assert_eq!(css_map, Readability::inline_css_str_to_map("color: red;"));
|
css_map.insert("background-image".to_string(), "url('')".to_string());
|
||||||
|
assert_eq!(css_map, Readability::inline_css_str_to_map("color: red;background-image: url('')"));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|
|
@ -132,8 +132,4 @@ lazy_static! {
|
||||||
pub static ref REPLACE_END_SEPARATOR_REGEX: Regex =
|
pub static ref REPLACE_END_SEPARATOR_REGEX: Regex =
|
||||||
Regex::new(r"(?i)[^\|\-\\/>»]*[\|\-\\/>»](?P<end>.*)").unwrap();
|
Regex::new(r"(?i)[^\|\-\\/>»]*[\|\-\\/>»](?P<end>.*)").unwrap();
|
||||||
pub static ref REPLACE_MULTI_SEPARATOR_REGEX: Regex = Regex::new(r"[\|\-\\/>»]+").unwrap();
|
pub static ref REPLACE_MULTI_SEPARATOR_REGEX: Regex = Regex::new(r"[\|\-\\/>»]+").unwrap();
|
||||||
pub static ref REPLACE_SELF_CLOSING_REGEX: Regex = Regex::new(
|
|
||||||
r#"(?P<tag><(?:area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr)(?: [a-z\-]+=["'][\sa-zA-Z0-9\./\-_#]+["']|[a-z\-]+)*)>"#
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue