From 7974cda6ed27e75cd8809a3e3844e2426fb556a6 Mon Sep 17 00:00:00 2001 From: Samuel Colvin Date: Wed, 27 Mar 2024 09:46:27 -0600 Subject: [PATCH] fast path for ascii strings --- src/py_string_cache.rs | 54 ++++++++++++++++++++++++++++++------------ src/python.rs | 6 ++--- src/string_decoder.rs | 13 ++++++++++ 3 files changed, 55 insertions(+), 18 deletions(-) diff --git a/src/py_string_cache.rs b/src/py_string_cache.rs index 92b70860..4791ae8e 100644 --- a/src/py_string_cache.rs +++ b/src/py_string_cache.rs @@ -2,10 +2,13 @@ use std::cell::RefCell; use ahash::random_state::RandomState; use pyo3::exceptions::{PyTypeError, PyValueError}; +use pyo3::ffi; use pyo3::prelude::*; use pyo3::sync::{GILOnceCell, GILProtected}; use pyo3::types::{PyBool, PyString}; +use crate::string_decoder::StrType; + #[derive(Debug, Clone, Copy)] pub enum StringCacheMode { All, @@ -45,9 +48,9 @@ impl From for StringCacheMode { } pub trait StringMaybeCache { - fn get_key<'py>(py: Python<'py>, json_str: &str) -> Bound<'py, PyString>; + fn get_key<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString>; - fn get_value<'py>(py: Python<'py>, json_str: &str) -> Bound<'py, PyString> { + fn get_value<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> { Self::get_key(py, json_str) } } @@ -55,7 +58,7 @@ pub trait StringMaybeCache { pub struct StringCacheAll; impl StringMaybeCache for StringCacheAll { - fn get_key<'py>(py: Python<'py>, json_str: &str) -> Bound<'py, PyString> { + fn get_key<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> { cached_py_string(py, json_str) } } @@ -63,20 +66,20 @@ impl StringMaybeCache for StringCacheAll { pub struct StringCacheKeys; impl StringMaybeCache for StringCacheKeys { - fn get_key<'py>(py: Python<'py>, json_str: &str) -> Bound<'py, PyString> { + fn get_key<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> { cached_py_string(py, json_str) } - fn get_value<'py>(py: Python<'py>, json_str: &str) -> Bound<'py, PyString> { - PyString::new_bound(py, json_str) + fn get_value<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> { + pystring_unicode_known(py, json_str) } } pub struct StringNoCache; impl StringMaybeCache for StringNoCache { - fn get_key<'py>(py: Python<'py>, json_str: &str) -> Bound<'py, PyString> { - PyString::new_bound(py, json_str) + fn get_key<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> { + pystring_unicode_known(py, json_str) } } @@ -98,12 +101,18 @@ pub fn cache_clear(py: Python) { get_string_cache!(py).borrow_mut().clear() } -pub fn cached_py_string<'py>(py: Python<'py>, raw_str: &str) -> Bound<'py, PyString> { +static EMPTY_STRING: GILOnceCell> = GILOnceCell::new(); + +pub fn cached_py_string<'py>(py: Python<'py>, raw_str: StrType) -> Bound<'py, PyString> { // from tests, 0 and 1 character strings are faster not cached - if (2..64).contains(&raw_str.len()) { + let len = raw_str.s.len(); + if len == 0 { + let s = EMPTY_STRING.get_or_init(py, || unsafe { pystring_unicode(py, "") }.into_py(py)); + s.clone_ref(py).into_bound(py) + } else if (2..64).contains(&len) { get_string_cache!(py).borrow_mut().get_or_insert(py, raw_str) } else { - PyString::new_bound(py, raw_str) + pystring_unicode_known(py, raw_str) } } @@ -135,13 +144,13 @@ impl Default for PyStringCache { impl PyStringCache { /// Lookup the cache for an entry with the given string. If it exists, return it. /// If it is not set or has a different string, insert it and return it. - fn get_or_insert<'py>(&mut self, py: Python<'py>, s: &str) -> Bound<'py, PyString> { - let hash = self.hash_builder.hash_one(s); + fn get_or_insert<'py>(&mut self, py: Python<'py>, raw_str: StrType) -> Bound<'py, PyString> { + let hash = self.hash_builder.hash_one(raw_str.s); let hash_index = hash as usize % CAPACITY; let set_entry = |entry: &mut Entry| { - let py_str = PyString::new_bound(py, s); + let py_str = pystring_unicode_known(py, raw_str); *entry = Some((hash, py_str.to_owned().unbind())); py_str }; @@ -153,7 +162,7 @@ impl PyStringCache { // to avoid a string comparison, we first compare the hashes if *entry_hash == hash { // if the hashes match, we compare the strings to be absolutely sure - as a hashmap would do - if py_str_ob.bind(py).to_str().ok() == Some(s) { + if py_str_ob.bind(py).to_str().ok() == Some(raw_str.s) { // the strings matched, return the cached string object return py_str_ob.bind(py).to_owned(); } @@ -183,3 +192,18 @@ impl PyStringCache { self.entries.fill(None); } } + +pub fn pystring_unicode_known<'py>(py: Python<'py>, raw_str: StrType) -> Bound<'py, PyString> { + if raw_str.known_ascii { + unsafe { pystring_unicode(py, raw_str.s) } + } else { + PyString::new_bound(py, raw_str.s) + } +} +pub unsafe fn pystring_unicode<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> { + let ptr = ffi::PyUnicode_New(s.len() as isize, 127); + let data_ptr = ptr.cast::().offset(1) as *mut u8; + core::ptr::copy_nonoverlapping(s.as_ptr(), data_ptr, s.len()); + core::ptr::write(data_ptr.add(s.len()), 0); + Bound::from_owned_ptr(py, ptr).downcast_into_unchecked() +} diff --git a/src/python.rs b/src/python.rs index 23bf0c67..dbd25e16 100644 --- a/src/python.rs +++ b/src/python.rs @@ -100,7 +100,7 @@ impl<'j> PythonParser<'j> { } Peek::String => { let s = self.parser.consume_string::(&mut self.tape)?; - Ok(StringCache::get_value(py, s.as_str()).into_any()) + Ok(StringCache::get_value(py, s.as_str_type()).into_any()) } Peek::Array => { let list = if let Some(peek_first) = tri!(self.parser.array_first(), PyList::empty_bound(py)) { @@ -175,12 +175,12 @@ impl<'j> PythonParser<'j> { } }; if let Some(first_key) = self.parser.object_first::(&mut self.tape)? { - let first_key = StringCache::get_key(py, first_key.as_str()); + let first_key = StringCache::get_key(py, first_key.as_str_type()); let peek = self.parser.peek()?; let first_value = self._check_take_value::(py, peek)?; set_item(first_key, first_value); while let Some(key) = self.parser.object_step::(&mut self.tape)? { - let key = StringCache::get_key(py, key.as_str()); + let key = StringCache::get_key(py, key.as_str_type()); let peek = self.parser.peek()?; let value = self._check_take_value::(py, peek)?; set_item(key, value); diff --git a/src/string_decoder.rs b/src/string_decoder.rs index bc102084..342a84d2 100644 --- a/src/string_decoder.rs +++ b/src/string_decoder.rs @@ -47,6 +47,12 @@ impl<'t, 'j> From> for Cow<'j, str> { } } +#[derive(Debug, Clone, Copy)] +pub struct StrType<'a> { + pub s: &'a str, + pub known_ascii: bool, +} + impl<'t, 'j> StringOutput<'t, 'j> { pub fn as_str(&self) -> &'t str { match self { @@ -54,6 +60,13 @@ impl<'t, 'j> StringOutput<'t, 'j> { Self::Data(s) => s, } } + + pub fn as_str_type(&self) -> StrType<'t> { + match self { + Self::Tape(s) => StrType { s, known_ascii: false }, + Self::Data(s) => StrType { s, known_ascii: true }, + } + } } // taken serde-rs/json but altered