Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use memchr to search for characters to escape #664

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 42 additions & 37 deletions src/escapei.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
//! Manage xml character escapes

use memchr::memchr2_iter;
use memchr::{memchr2_iter, memchr3_iter};
use std::borrow::Cow;
use std::ops::Range;

use crate::utils::MergeIter;
#[cfg(test)]
use pretty_assertions::assert_eq;

Expand Down Expand Up @@ -72,7 +73,14 @@ impl std::error::Error for EscapeError {}
/// | `'` | `&apos;`
/// | `"` | `&quot;`
pub fn escape(raw: &str) -> Cow<str> {
_escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&' | b'\'' | b'\"'))
let bytes = raw.as_bytes();
_escape(
raw,
MergeIter::new(
memchr3_iter(b'<', b'>', b'&', bytes),
memchr2_iter(b'\'', b'"', bytes),
),
)
}

/// Escapes an `&str` and replaces xml special characters (`<`, `>`, `&`)
Expand All @@ -89,53 +97,53 @@ pub fn escape(raw: &str) -> Cow<str> {
/// | `>` | `&gt;`
/// | `&` | `&amp;`
pub fn partial_escape(raw: &str) -> Cow<str> {
_escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&'))
_escape(raw, memchr3_iter(b'<', b'>', b'&', raw.as_bytes()))
}

/// Escapes an `&str` and replaces a subset of xml special characters (`<`, `>`,
/// `&`, `'`, `"`) with their corresponding xml escaped value.
pub(crate) fn _escape<F: Fn(u8) -> bool>(raw: &str, escape_chars: F) -> Cow<str> {
pub(crate) fn _escape<It>(raw: &str, escapes: It) -> Cow<str>
where
It: Iterator<Item = usize>,
{
let bytes = raw.as_bytes();
let mut escaped = None;
let mut iter = bytes.iter();
let mut pos = 0;
while let Some(i) = iter.position(|&b| escape_chars(b)) {
if escaped.is_none() {
escaped = Some(Vec::with_capacity(raw.len()));
}
let escaped = escaped.as_mut().expect("initialized");
let new_pos = pos + i;
escaped.extend_from_slice(&bytes[pos..new_pos]);
match bytes[new_pos] {
b'<' => escaped.extend_from_slice(b"&lt;"),
b'>' => escaped.extend_from_slice(b"&gt;"),
b'\'' => escaped.extend_from_slice(b"&apos;"),
b'&' => escaped.extend_from_slice(b"&amp;"),
b'"' => escaped.extend_from_slice(b"&quot;"),
let mut last_pos = 0;
for i in escapes {
// Once an escape is found, the escaped string is guaranteed to be longer than the raw string,
// so reserve a little extra space; if only a few escapes are found we may not need to resize at all.
let escaped = escaped.get_or_insert_with(|| String::with_capacity(raw.len() + 64));
let byte = bytes[i];
// SAFETY: the `escapes` iterator should only return indexes of bytes we know how to escape.
// If one of those bytes is found, it _must_ be a complete one-byte character, so `i` must be
// on a character boundary.
// `last_pos` is only ever 0 or `i + 1`, and because all supported chars are one byte long,
// `last_pos` is always on a char boundary as well.
escaped.push_str(&raw[last_pos..i]);
match byte {
b'<' => escaped.push_str("&lt;"),
b'>' => escaped.push_str("&gt;"),
b'\'' => escaped.push_str("&apos;"),
b'&' => escaped.push_str("&amp;"),
b'"' => escaped.push_str("&quot;"),

// This set of escapes handles characters that should be escaped
// in elements of xs:lists, because those characters work as
// delimiters of list elements
b'\t' => escaped.extend_from_slice(b"&#9;"),
b'\n' => escaped.extend_from_slice(b"&#10;"),
b'\r' => escaped.extend_from_slice(b"&#13;"),
b' ' => escaped.extend_from_slice(b"&#32;"),
b'\t' => escaped.push_str("&#9;"),
b'\n' => escaped.push_str("&#10;"),
b'\r' => escaped.push_str("&#13;"),
b' ' => escaped.push_str("&#32;"),
_ => unreachable!(
"Only '<', '>','\', '&', '\"', '\\t', '\\r', '\\n', and ' ' are escaped"
),
}
pos = new_pos + 1;
last_pos = i + 1;
}

if let Some(mut escaped) = escaped {
if let Some(raw) = bytes.get(pos..) {
escaped.extend_from_slice(raw);
}
// SAFETY: we operate on UTF-8 input and search for one-byte chars only,
// so all slices that were put into `escaped` are valid UTF-8 encoded strings
// TODO: Can be replaced with `unsafe { String::from_utf8_unchecked() }`
// if unsafe code will be allowed
Cow::Owned(String::from_utf8(escaped).unwrap())
escaped.push_str(&raw[last_pos..]);
Cow::Owned(escaped)
} else {
Cow::Borrowed(raw)
}
Expand Down Expand Up @@ -175,17 +183,14 @@ where
match iter.next() {
Some(end) if bytes[end] == b';' => {
// append valid data
if unescaped.is_none() {
unescaped = Some(String::with_capacity(raw.len()));
}
let unescaped = unescaped.as_mut().expect("initialized");
let unescaped = unescaped.get_or_insert_with(|| String::with_capacity(raw.len()));
unescaped.push_str(&raw[last_end..start]);

// search for character correctness
let pat = &raw[start + 1..end];
if let Some(entity) = pat.strip_prefix('#') {
let codepoint = parse_number(entity, start..end)?;
unescaped.push_str(codepoint.encode_utf8(&mut [0u8; 4]));
unescaped.push(codepoint);
} else if let Some(value) = named_entity(pat) {
unescaped.push_str(value);
} else if let Some(value) = resolve_entity(pat) {
Expand Down
237 changes: 138 additions & 99 deletions src/se/simple_type.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
use crate::errors::serialize::DeError;
use crate::escapei::_escape;
use crate::se::{Indent, QuoteLevel};
use crate::utils::MergeIter;
use memchr::{memchr2_iter, memchr3_iter, memchr_iter};
use serde::ser::{
Impossible, Serialize, SerializeSeq, SerializeTuple, SerializeTupleStruct, Serializer,
};
Expand All @@ -29,67 +31,96 @@ fn escape_item(value: &str, target: QuoteTarget, level: QuoteLevel) -> Cow<str>
use QuoteLevel::*;
use QuoteTarget::*;

let bytes = value.as_bytes();

match (target, level) {
(_, Full) => _escape(value, |ch| match ch {
// Spaces used as delimiters of list items, cannot be used in the item
b' ' | b'\r' | b'\n' | b'\t' => true,
// Required characters to escape
b'&' | b'<' | b'>' | b'\'' | b'\"' => true,
_ => false,
}),
(_, Full) => _escape(
value,
// ' ', '\r', '\n', '\t': Spaces used as delimiters of list items, cannot be used in the item
// '&', '<', '>', '\'', '"': Required characters to escape
MergeIter::new(
MergeIter::new(
memchr3_iter(b' ', b'\r', b'\n', bytes),
memchr3_iter(b'\t', b'&', b'<', bytes),
),
memchr3_iter(b'>', b'\'', b'"', bytes),
),
),
//----------------------------------------------------------------------
(Text, Partial) => _escape(value, |ch| match ch {
// Spaces used as delimiters of list items, cannot be used in the item
b' ' | b'\r' | b'\n' | b'\t' => true,
// Required characters to escape
b'&' | b'<' | b'>' => true,
_ => false,
}),
(Text, Minimal) => _escape(value, |ch| match ch {
// Spaces used as delimiters of list items, cannot be used in the item
b' ' | b'\r' | b'\n' | b'\t' => true,
// Required characters to escape
b'&' | b'<' => true,
_ => false,
}),
(Text, Partial) => _escape(
value,
// ' ', '\r', '\n', '\t': Spaces used as delimiters of list items, cannot be used in the item
// '&', '<', '>': Required characters to escape
MergeIter::new(
MergeIter::new(
memchr3_iter(b' ', b'\r', b'\n', bytes),
memchr3_iter(b'\t', b'&', b'<', bytes),
),
memchr_iter(b'>', bytes),
),
),
(Text, Minimal) => _escape(
value,
// ' ', '\r', '\n', '\t': Spaces used as delimiters of list items, cannot be used in the item
// '&', '<': Required characters to escape
MergeIter::new(
memchr3_iter(b' ', b'\r', b'\n', bytes),
memchr3_iter(b'\t', b'&', b'<', bytes),
),
),
//----------------------------------------------------------------------
(DoubleQAttr, Partial) => _escape(value, |ch| match ch {
// Spaces used as delimiters of list items, cannot be used in the item
b' ' | b'\r' | b'\n' | b'\t' => true,
// Required characters to escape
b'&' | b'<' | b'>' => true,
// Double quoted attribute should escape quote
b'"' => true,
_ => false,
}),
(DoubleQAttr, Minimal) => _escape(value, |ch| match ch {
// Spaces used as delimiters of list items, cannot be used in the item
b' ' | b'\r' | b'\n' | b'\t' => true,
// Required characters to escape
b'&' | b'<' => true,
// Double quoted attribute should escape quote
b'"' => true,
_ => false,
}),
(DoubleQAttr, Partial) => _escape(
value,
// ' ', '\r', '\n', '\t': Spaces used as delimiters of list items, cannot be used in the item
// '&', '<', '>': Required characters to escape
// '"': Double quoted attribute should escape quote
MergeIter::new(
MergeIter::new(
memchr3_iter(b' ', b'\r', b'\n', bytes),
memchr3_iter(b'\t', b'&', b'<', bytes),
),
memchr2_iter(b'>', b'"', bytes),
),
),
(DoubleQAttr, Minimal) => _escape(
value,
// ' ', '\r', '\n', '\t': Spaces used as delimiters of list items, cannot be used in the item
// '&', '<': Required characters to escape
// '"': Double quoted attribute should escape quote
MergeIter::new(
MergeIter::new(
memchr3_iter(b' ', b'\r', b'\n', bytes),
memchr3_iter(b'\t', b'&', b'<', bytes),
),
memchr_iter(b'"', bytes),
),
),
//----------------------------------------------------------------------
(SingleQAttr, Partial) => _escape(value, |ch| match ch {
// Spaces used as delimiters of list items
b' ' | b'\r' | b'\n' | b'\t' => true,
// Required characters to escape
b'&' | b'<' | b'>' => true,
// Single quoted attribute should escape quote
b'\'' => true,
_ => false,
}),
(SingleQAttr, Minimal) => _escape(value, |ch| match ch {
// Spaces used as delimiters of list items
b' ' | b'\r' | b'\n' | b'\t' => true,
// Required characters to escape
b'&' | b'<' => true,
// Single quoted attribute should escape quote
b'\'' => true,
_ => false,
}),
(SingleQAttr, Partial) => _escape(
value,
// ' ', '\r', '\n', '\t': Spaces used as delimiters of list items, cannot be used in the item
// '&', '<', '>': Required characters to escape
// '\'': Single quoted attribute should escape quote
MergeIter::new(
MergeIter::new(
memchr3_iter(b' ', b'\r', b'\n', bytes),
memchr3_iter(b'\t', b'&', b'<', bytes),
),
memchr2_iter(b'>', b'\'', bytes),
),
),
(SingleQAttr, Minimal) => _escape(
value,
// ' ', '\r', '\n', '\t': Spaces used as delimiters of list items, cannot be used in the item
// '&', '<': Required characters to escape
// '\'': Single quoted attribute should escape quote
MergeIter::new(
MergeIter::new(
memchr3_iter(b' ', b'\r', b'\n', bytes),
memchr3_iter(b'\t', b'&', b'<', bytes),
),
memchr_iter(b'\'', bytes),
),
),
}
}

Expand All @@ -98,53 +129,61 @@ fn escape_list(value: &str, target: QuoteTarget, level: QuoteLevel) -> Cow<str>
use QuoteLevel::*;
use QuoteTarget::*;

let bytes = value.as_bytes();

match (target, level) {
(_, Full) => _escape(value, |ch| match ch {
// Required characters to escape
b'&' | b'<' | b'>' | b'\'' | b'\"' => true,
_ => false,
}),
(_, Full) => _escape(
value,
// '&', '<', '>', '\'', '"': Required characters to escape
MergeIter::new(
memchr3_iter(b'&', b'<', b'>', bytes),
memchr2_iter(b'\'', b'"', bytes),
),
),
//----------------------------------------------------------------------
(Text, Partial) => _escape(value, |ch| match ch {
// Required characters to escape
b'&' | b'<' | b'>' => true,
_ => false,
}),
(Text, Minimal) => _escape(value, |ch| match ch {
// Required characters to escape
b'&' | b'<' => true,
_ => false,
}),
(Text, Partial) => _escape(
value,
// '&', '<', '>': Required characters to escape
memchr3_iter(b'&', b'<', b'>', bytes),
),
(Text, Minimal) => _escape(
value,
// '&', '<': Required characters to escape
memchr2_iter(b'&', b'<', bytes),
),
//----------------------------------------------------------------------
(DoubleQAttr, Partial) => _escape(value, |ch| match ch {
(DoubleQAttr, Partial) => _escape(
value,
// '&', '<', '>': Required characters to escape
// '"': Double quoted attribute should escape quote
MergeIter::new(
memchr3_iter(b'&', b'<', b'>', bytes),
memchr_iter(b'"', bytes),
),
),
(DoubleQAttr, Minimal) => _escape(
value,
// '&', '<': Required characters to escape
// '"': Double quoted attribute should escape quote
// Required characters to escape
b'&' | b'<' | b'>' => true,
// Double quoted attribute should escape quote
b'"' => true,
_ => false,
}),
(DoubleQAttr, Minimal) => _escape(value, |ch| match ch {
// Required characters to escape
b'&' | b'<' => true,
// Double quoted attribute should escape quote
b'"' => true,
_ => false,
}),
memchr3_iter(b'&', b'<', b'"', bytes),
),
//----------------------------------------------------------------------
(SingleQAttr, Partial) => _escape(value, |ch| match ch {
// Required characters to escape
b'&' | b'<' | b'>' => true,
// Single quoted attribute should escape quote
b'\'' => true,
_ => false,
}),
(SingleQAttr, Minimal) => _escape(value, |ch| match ch {
// Required characters to escape
b'&' | b'<' => true,
// Single quoted attribute should escape quote
b'\'' => true,
_ => false,
}),
(SingleQAttr, Partial) => _escape(
value,
// '&', '<', '>': Required characters to escape
// '\'': Single quoted attribute should escape quote
MergeIter::new(
memchr3_iter(b'&', b'<', b'>', bytes),
memchr_iter(b'\'', bytes),
),
),
(SingleQAttr, Minimal) => _escape(
value,
// '&', '<': Required characters to escape
// '\'': Single quoted attribute should escape quote
memchr3_iter(b'&', b'<', b'\'', bytes),
),
}
}

Expand Down
Loading