From 0b71ca08d0e1c49e4c9c262f662b3b8860af7080 Mon Sep 17 00:00:00 2001 From: Samuel Colvin Date: Wed, 27 Mar 2024 17:39:50 -0600 Subject: [PATCH] separate argument --- src/lib.rs | 2 +- src/py_string_cache.rs | 51 ++++++++++++++++++++---------------------- src/python.rs | 6 ++--- src/string_decoder.rs | 32 +++++++++++--------------- tests/python.rs | 40 ++++++++++++++++++++++++++++++++- 5 files changed, 80 insertions(+), 51 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index dc3080f0..7f943ad6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -20,6 +20,6 @@ pub use parse::Peek; pub use value::{JsonArray, JsonObject, JsonValue}; #[cfg(feature = "python")] -pub use py_string_cache::{cache_clear, cache_usage, cached_py_string, StringCacheMode}; +pub use py_string_cache::{cache_clear, cache_usage, cached_py_string, pystring_fast_new, StringCacheMode}; #[cfg(feature = "python")] pub use python::{map_json_error, python_parse}; diff --git a/src/py_string_cache.rs b/src/py_string_cache.rs index 6e8fec40..4d1c90df 100644 --- a/src/py_string_cache.rs +++ b/src/py_string_cache.rs @@ -7,8 +7,6 @@ use pyo3::prelude::*; use pyo3::sync::{GILOnceCell, GILProtected}; use pyo3::types::{PyBool, PyString}; -use crate::string_decoder::StrType; - #[derive(Debug, Clone, Copy)] pub enum StringCacheMode { All, @@ -48,38 +46,38 @@ impl From for StringCacheMode { } pub trait StringMaybeCache { - fn get_key<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString>; + fn get_key<'py>(py: Python<'py>, json_str: &str, ascii_only: bool) -> Bound<'py, PyString>; - fn get_value<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> { - Self::get_key(py, json_str) + fn get_value<'py>(py: Python<'py>, json_str: &str, ascii_only: bool) -> Bound<'py, PyString> { + Self::get_key(py, json_str, ascii_only) } } pub struct StringCacheAll; impl StringMaybeCache for StringCacheAll { - fn get_key<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> { - cached_py_string(py, json_str) + fn get_key<'py>(py: Python<'py>, json_str: &str, ascii_only: bool) -> Bound<'py, PyString> { + cached_py_string(py, json_str, ascii_only) } } pub struct StringCacheKeys; impl StringMaybeCache for StringCacheKeys { - fn get_key<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> { - cached_py_string(py, json_str) + fn get_key<'py>(py: Python<'py>, json_str: &str, ascii_only: bool) -> Bound<'py, PyString> { + cached_py_string(py, json_str, ascii_only) } - fn get_value<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> { - pystring_unicode_known(py, json_str) + fn get_value<'py>(py: Python<'py>, json_str: &str, ascii_only: bool) -> Bound<'py, PyString> { + pystring_fast_new(py, json_str, ascii_only) } } pub struct StringNoCache; impl StringMaybeCache for StringNoCache { - fn get_key<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> { - pystring_unicode_known(py, json_str) + fn get_key<'py>(py: Python<'py>, json_str: &str, ascii_only: bool) -> Bound<'py, PyString> { + pystring_fast_new(py, json_str, ascii_only) } } @@ -101,13 +99,12 @@ pub fn cache_clear(py: Python) { get_string_cache!(py).borrow_mut().clear() } -pub fn cached_py_string<'py>(py: Python<'py>, raw_str: StrType) -> Bound<'py, PyString> { +pub fn cached_py_string<'py>(py: Python<'py>, s: &str, ascii_only: bool) -> Bound<'py, PyString> { // from tests, 0 and 1 character strings are faster not cached - let len = raw_str.s.len(); - if (2..64).contains(&len) { - get_string_cache!(py).borrow_mut().get_or_insert(py, raw_str) + if (2..64).contains(&s.len()) { + get_string_cache!(py).borrow_mut().get_or_insert(py, s, ascii_only) } else { - pystring_unicode_known(py, raw_str) + pystring_fast_new(py, s, ascii_only) } } @@ -139,13 +136,13 @@ impl Default for PyStringCache { impl PyStringCache { /// Lookup the cache for an entry with the given string. If it exists, return it. /// If it is not set or has a different string, insert it and return it. - fn get_or_insert<'py>(&mut self, py: Python<'py>, raw_str: StrType) -> Bound<'py, PyString> { - let hash = self.hash_builder.hash_one(raw_str.s); + fn get_or_insert<'py>(&mut self, py: Python<'py>, s: &str, ascii_only: bool) -> Bound<'py, PyString> { + let hash = self.hash_builder.hash_one(s); let hash_index = hash as usize % CAPACITY; let set_entry = |entry: &mut Entry| { - let py_str = pystring_unicode_known(py, raw_str); + let py_str = pystring_fast_new(py, s, ascii_only); *entry = Some((hash, py_str.to_owned().unbind())); py_str }; @@ -157,7 +154,7 @@ impl PyStringCache { // to avoid a string comparison, we first compare the hashes if *entry_hash == hash { // if the hashes match, we compare the strings to be absolutely sure - as a hashmap would do - if py_str_ob.bind(py).to_str().ok() == Some(raw_str.s) { + if py_str_ob.bind(py).to_str().ok() == Some(s) { // the strings matched, return the cached string object return py_str_ob.bind(py).to_owned(); } @@ -188,15 +185,15 @@ impl PyStringCache { } } -pub fn pystring_unicode_known<'py>(py: Python<'py>, raw_str: StrType) -> Bound<'py, PyString> { - if raw_str.known_ascii { - unsafe { pystring_unicode(py, raw_str.s) } +pub fn pystring_fast_new<'py>(py: Python<'py>, s: &str, ascii_only: bool) -> Bound<'py, PyString> { + if ascii_only { + unsafe { pystring_ascii_new(py, s) } } else { - PyString::new_bound(py, raw_str.s) + PyString::new_bound(py, s) } } -pub unsafe fn pystring_unicode<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> { +unsafe fn pystring_ascii_new<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> { let ptr = ffi::PyUnicode_New(s.len() as isize, 127); let data_ptr = ptr.cast::().offset(1) as *mut u8; core::ptr::copy_nonoverlapping(s.as_ptr(), data_ptr, s.len()); diff --git a/src/python.rs b/src/python.rs index caea115e..3cc88a38 100644 --- a/src/python.rs +++ b/src/python.rs @@ -85,7 +85,7 @@ impl<'j> PythonParser<'j> { } Peek::String => { let s = self.parser.consume_string::(&mut self.tape)?; - Ok(StringCache::get_value(py, s.as_str_type()).into_any()) + Ok(StringCache::get_value(py, s.as_str(), s.ascii_only()).into_any()) } Peek::Array => { let peek_first = match self.parser.array_first() { @@ -162,12 +162,12 @@ impl<'j> PythonParser<'j> { } }; if let Some(first_key) = self.parser.object_first::(&mut self.tape)? { - let first_key = StringCache::get_key(py, first_key.as_str_type()); + let first_key = StringCache::get_key(py, first_key.as_str(), first_key.ascii_only()); let peek = self.parser.peek()?; let first_value = self._check_take_value::(py, peek)?; set_item(first_key, first_value); while let Some(key) = self.parser.object_step::(&mut self.tape)? { - let key = StringCache::get_key(py, key.as_str_type()); + let key = StringCache::get_key(py, key.as_str(), key.ascii_only()); let peek = self.parser.peek()?; let value = self._check_take_value::(py, peek)?; set_item(key, value); diff --git a/src/string_decoder.rs b/src/string_decoder.rs index 342a84d2..cbf1623f 100644 --- a/src/string_decoder.rs +++ b/src/string_decoder.rs @@ -25,15 +25,15 @@ pub enum StringOutput<'t, 'j> where 'j: 't, { - Tape(&'t str), - Data(&'j str), + Tape(&'t str, bool), + Data(&'j str, bool), } impl From> for String { fn from(val: StringOutput) -> Self { match val { - StringOutput::Tape(s) => s.to_owned(), - StringOutput::Data(s) => s.to_owned(), + StringOutput::Tape(s, _) => s.to_owned(), + StringOutput::Data(s, _) => s.to_owned(), } } } @@ -41,30 +41,24 @@ impl From> for String { impl<'t, 'j> From> for Cow<'j, str> { fn from(val: StringOutput<'t, 'j>) -> Self { match val { - StringOutput::Tape(s) => s.to_owned().into(), - StringOutput::Data(s) => s.into(), + StringOutput::Tape(s, _) => s.to_owned().into(), + StringOutput::Data(s, _) => s.into(), } } } -#[derive(Debug, Clone, Copy)] -pub struct StrType<'a> { - pub s: &'a str, - pub known_ascii: bool, -} - impl<'t, 'j> StringOutput<'t, 'j> { pub fn as_str(&self) -> &'t str { match self { - Self::Tape(s) => s, - Self::Data(s) => s, + Self::Tape(s, _) => s, + Self::Data(s, _) => s, } } - pub fn as_str_type(&self) -> StrType<'t> { + pub fn ascii_only(&self) -> bool { match self { - Self::Tape(s) => StrType { s, known_ascii: false }, - Self::Data(s) => StrType { s, known_ascii: true }, + Self::Tape(_, ascii_only) => *ascii_only, + Self::Data(_, ascii_only) => *ascii_only, } } } @@ -156,7 +150,7 @@ where CharType::Quote => { let s = to_str(&data[start..index], ascii_only, start)?; index += 1; - return Ok((StringOutput::Data(s), index)); + return Ok((StringOutput::Data(s, ascii_only), index)); } CharType::Backslash => return decode_to_tape(data, index, tape, start, ascii_only), CharType::ControlChar => return json_err!(ControlCharacterWhileParsingString, index), @@ -217,7 +211,7 @@ fn decode_to_tape<'t, 'j>( tape.extend_from_slice(&data[last_escape..index]); index += 1; let s = to_str(tape, ascii_only, start)?; - return Ok((StringOutput::Tape(s), index)); + return Ok((StringOutput::Tape(s, ascii_only), index)); } CharType::Backslash => on_backslash!(), CharType::ControlChar => return json_err!(ControlCharacterWhileParsingString, index), diff --git a/tests/python.rs b/tests/python.rs index 3920dc56..97ba7540 100644 --- a/tests/python.rs +++ b/tests/python.rs @@ -2,7 +2,7 @@ use pyo3::prelude::*; use pyo3::types::{PyDict, PyList, PyString}; use pyo3::ToPyObject; -use jiter::{cache_clear, cache_usage, map_json_error, python_parse, JsonValue, StringCacheMode}; +use jiter::{cache_clear, cache_usage, map_json_error, pystring_fast_new, python_parse, JsonValue, StringCacheMode}; #[test] fn test_to_py_object_numeric() { @@ -269,3 +269,41 @@ fn test_cache_into() { ); }) } + +#[test] +fn test_unicode() { + let json = r#"{"💩": "£"}"#; + Python::with_gil(|py| { + cache_clear(py); + let obj = python_parse(py, json.as_bytes(), false, StringCacheMode::None, false).unwrap(); + assert_eq!(obj.to_string(), "{'💩': '£'}"); + }) +} + +#[test] +fn test_unicode_cache() { + let json = r#"{"💩": "£"}"#; + Python::with_gil(|py| { + cache_clear(py); + let obj = python_parse(py, json.as_bytes(), false, StringCacheMode::All, false).unwrap(); + assert_eq!(obj.to_string(), "{'💩': '£'}"); + }) +} + +#[test] +fn test_pystring_fast_new_non_ascii() { + let json = "£100 💩"; + Python::with_gil(|py| { + let s = pystring_fast_new(py, json, false); + assert_eq!(s.to_string(), "£100 💩"); + }) +} + +#[test] +fn test_pystring_fast_new_ascii() { + let json = "100abc"; + Python::with_gil(|py| { + let s = pystring_fast_new(py, json, true); + assert_eq!(s.to_string(), "100abc"); + }) +}