Skip to content

Commit

Permalink
use bytecount::num_chars
Browse files Browse the repository at this point in the history
  • Loading branch information
samuelcolvin committed Mar 27, 2024
1 parent 7974cda commit 569fe11
Show file tree
Hide file tree
Showing 7 changed files with 53 additions and 45 deletions.
7 changes: 7 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ ahash = "0.8.0"
smallvec = "1.11.0"
pyo3 = { version = "0.21.0-beta.0", default-features=false, features = ["num-bigint"], optional = true }
lexical-parse-float = { version = "0.8.5", features = ["format"] }
bytecount = { version = "0.6.7", default_features = false, features = ["runtime-dispatch-simd"] }

[features]
python = ["dep:pyo3"]
Expand Down
2 changes: 1 addition & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,6 @@ pub use parse::Peek;
pub use value::{JsonArray, JsonObject, JsonValue};

#[cfg(feature = "python")]
pub use py_string_cache::{cache_clear, cache_usage, cached_py_string, StringCacheMode};
pub use py_string_cache::{cache_clear, cache_usage, cached_py_string, pystring_fast_new, StringCacheMode};
#[cfg(feature = "python")]
pub use python::{map_json_error, python_parse};
49 changes: 21 additions & 28 deletions src/py_string_cache.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@ use pyo3::prelude::*;
use pyo3::sync::{GILOnceCell, GILProtected};
use pyo3::types::{PyBool, PyString};

use crate::string_decoder::StrType;

#[derive(Debug, Clone, Copy)]
pub enum StringCacheMode {
All,
Expand Down Expand Up @@ -48,38 +46,38 @@ impl From<bool> for StringCacheMode {
}

pub trait StringMaybeCache {
fn get_key<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString>;
fn get_key<'py>(py: Python<'py>, json_str: &str) -> Bound<'py, PyString>;

fn get_value<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> {
fn get_value<'py>(py: Python<'py>, json_str: &str) -> Bound<'py, PyString> {
Self::get_key(py, json_str)
}
}

pub struct StringCacheAll;

impl StringMaybeCache for StringCacheAll {
fn get_key<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> {
fn get_key<'py>(py: Python<'py>, json_str: &str) -> Bound<'py, PyString> {
cached_py_string(py, json_str)
}
}

pub struct StringCacheKeys;

impl StringMaybeCache for StringCacheKeys {
fn get_key<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> {
fn get_key<'py>(py: Python<'py>, json_str: &str) -> Bound<'py, PyString> {
cached_py_string(py, json_str)
}

fn get_value<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> {
pystring_unicode_known(py, json_str)
fn get_value<'py>(py: Python<'py>, json_str: &str) -> Bound<'py, PyString> {
pystring_fast_new(py, json_str)
}
}

pub struct StringNoCache;

impl StringMaybeCache for StringNoCache {
fn get_key<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> {
pystring_unicode_known(py, json_str)
fn get_key<'py>(py: Python<'py>, json_str: &str) -> Bound<'py, PyString> {
pystring_fast_new(py, json_str)
}
}

Expand All @@ -101,18 +99,12 @@ pub fn cache_clear(py: Python) {
get_string_cache!(py).borrow_mut().clear()
}

static EMPTY_STRING: GILOnceCell<Py<PyString>> = GILOnceCell::new();

pub fn cached_py_string<'py>(py: Python<'py>, raw_str: StrType) -> Bound<'py, PyString> {
pub fn cached_py_string<'py>(py: Python<'py>, raw_str: &str) -> Bound<'py, PyString> {
// from tests, 0 and 1 character strings are faster not cached
let len = raw_str.s.len();
if len == 0 {
let s = EMPTY_STRING.get_or_init(py, || unsafe { pystring_unicode(py, "") }.into_py(py));
s.clone_ref(py).into_bound(py)
} else if (2..64).contains(&len) {
if (2..64).contains(&raw_str.len()) {
get_string_cache!(py).borrow_mut().get_or_insert(py, raw_str)
} else {
pystring_unicode_known(py, raw_str)
pystring_fast_new(py, raw_str)
}
}

Expand Down Expand Up @@ -144,13 +136,13 @@ impl Default for PyStringCache {
impl PyStringCache {
/// Lookup the cache for an entry with the given string. If it exists, return it.
/// If it is not set or has a different string, insert it and return it.
fn get_or_insert<'py>(&mut self, py: Python<'py>, raw_str: StrType) -> Bound<'py, PyString> {
let hash = self.hash_builder.hash_one(raw_str.s);
fn get_or_insert<'py>(&mut self, py: Python<'py>, raw_str: &str) -> Bound<'py, PyString> {
let hash = self.hash_builder.hash_one(raw_str);

let hash_index = hash as usize % CAPACITY;

let set_entry = |entry: &mut Entry| {
let py_str = pystring_unicode_known(py, raw_str);
let py_str = pystring_fast_new(py, raw_str);
*entry = Some((hash, py_str.to_owned().unbind()));
py_str
};
Expand All @@ -162,7 +154,7 @@ impl PyStringCache {
// to avoid a string comparison, we first compare the hashes
if *entry_hash == hash {
// if the hashes match, we compare the strings to be absolutely sure - as a hashmap would do
if py_str_ob.bind(py).to_str().ok() == Some(raw_str.s) {
if py_str_ob.bind(py).to_str().ok() == Some(raw_str) {
// the strings matched, return the cached string object
return py_str_ob.bind(py).to_owned();
}
Expand Down Expand Up @@ -193,14 +185,15 @@ impl PyStringCache {
}
}

pub fn pystring_unicode_known<'py>(py: Python<'py>, raw_str: StrType) -> Bound<'py, PyString> {
if raw_str.known_ascii {
unsafe { pystring_unicode(py, raw_str.s) }
pub fn pystring_fast_new<'py>(py: Python<'py>, raw_str: &str) -> Bound<'py, PyString> {
if bytecount::num_chars(raw_str.as_bytes()) == raw_str.len() {
unsafe { pystring_unicode_ascii(py, raw_str) }
} else {
PyString::new_bound(py, raw_str.s)
PyString::new_bound(py, raw_str)
}
}
pub unsafe fn pystring_unicode<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> {

unsafe fn pystring_unicode_ascii<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> {
let ptr = ffi::PyUnicode_New(s.len() as isize, 127);
let data_ptr = ptr.cast::<ffi::PyASCIIObject>().offset(1) as *mut u8;
core::ptr::copy_nonoverlapping(s.as_ptr(), data_ptr, s.len());
Expand Down
6 changes: 3 additions & 3 deletions src/python.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ impl<'j> PythonParser<'j> {
}
Peek::String => {
let s = self.parser.consume_string::<StringDecoder>(&mut self.tape)?;
Ok(StringCache::get_value(py, s.as_str_type()).into_any())
Ok(StringCache::get_value(py, s.as_str()).into_any())
}
Peek::Array => {
let list = if let Some(peek_first) = tri!(self.parser.array_first(), PyList::empty_bound(py)) {
Expand Down Expand Up @@ -175,12 +175,12 @@ impl<'j> PythonParser<'j> {
}
};
if let Some(first_key) = self.parser.object_first::<StringDecoder>(&mut self.tape)? {
let first_key = StringCache::get_key(py, first_key.as_str_type());
let first_key = StringCache::get_key(py, first_key.as_str());
let peek = self.parser.peek()?;
let first_value = self._check_take_value::<StringCache>(py, peek)?;
set_item(first_key, first_value);
while let Some(key) = self.parser.object_step::<StringDecoder>(&mut self.tape)? {
let key = StringCache::get_key(py, key.as_str_type());
let key = StringCache::get_key(py, key.as_str());
let peek = self.parser.peek()?;
let value = self._check_take_value::<StringCache>(py, peek)?;
set_item(key, value);
Expand Down
13 changes: 0 additions & 13 deletions src/string_decoder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,26 +47,13 @@ impl<'t, 'j> From<StringOutput<'t, 'j>> for Cow<'j, str> {
}
}

#[derive(Debug, Clone, Copy)]
pub struct StrType<'a> {
pub s: &'a str,
pub known_ascii: bool,
}

impl<'t, 'j> StringOutput<'t, 'j> {
pub fn as_str(&self) -> &'t str {
match self {
Self::Tape(s) => s,
Self::Data(s) => s,
}
}

pub fn as_str_type(&self) -> StrType<'t> {
match self {
Self::Tape(s) => StrType { s, known_ascii: false },
Self::Data(s) => StrType { s, known_ascii: true },
}
}
}

// taken serde-rs/json but altered
Expand Down
20 changes: 20 additions & 0 deletions tests/python.rs
Original file line number Diff line number Diff line change
Expand Up @@ -256,3 +256,23 @@ fn test_cache_into() {
);
})
}

/// Parses a JSON array holding two non-ASCII strings and checks that both
/// characters come back unchanged.
///
/// "💩" (U+1F4A9) takes four bytes in UTF-8 and "£" (U+00A3) takes two, so for
/// each string the character count differs from the byte length; the ASCII
/// shortcut in `pystring_fast_new` therefore must not be taken for either.
#[test]
fn test_unicode() {
let json = r#"["💩", "£"]"#;
Python::with_gil(|py| {
// empty the string cache first — presumably so entries left by other tests cannot affect this one
cache_clear(py);
// NOTE(review): `StringCacheMode::None` presumably selects the no-cache strategy; the meaning of
// the two `false` arguments is not visible here — confirm against `python_parse`.
let obj = python_parse(py, json.as_bytes(), false, StringCacheMode::None, false).unwrap();
// the expected text is the parsed list rendered as a string, with both characters unescaped
assert_eq!(obj.to_string(), "['💩', '£']");
})
}

/// Same input as `test_unicode`, but parsed with `StringCacheMode::All`.
///
/// The two strings are four and two bytes long, which falls inside the
/// `2..64` byte-length window that `cached_py_string` sends to the cache.
#[test]
fn test_unicode_cache() {
let json = r#"["💩", "£"]"#;
Python::with_gil(|py| {
// empty the string cache first — presumably so both strings are inserted fresh rather than
// served from entries left by other tests
cache_clear(py);
// NOTE(review): `StringCacheMode::All` presumably routes values as well as keys through
// `cached_py_string`; the meaning of the two `false` arguments is not visible here — confirm
// against `python_parse`.
let obj = python_parse(py, json.as_bytes(), false, StringCacheMode::All, false).unwrap();
// the expected text is the parsed list rendered as a string, with both characters unescaped
assert_eq!(obj.to_string(), "['💩', '£']");
})
}

0 comments on commit 569fe11

Please sign in to comment.