Skip to content

Commit

Permalink
separate argument
Browse files Browse the repository at this point in the history
  • Loading branch information
samuelcolvin committed Mar 28, 2024
1 parent f31a6aa commit 0b71ca0
Show file tree
Hide file tree
Showing 5 changed files with 80 additions and 51 deletions.
2 changes: 1 addition & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,6 @@ pub use parse::Peek;
pub use value::{JsonArray, JsonObject, JsonValue};

#[cfg(feature = "python")]
pub use py_string_cache::{cache_clear, cache_usage, cached_py_string, StringCacheMode};
pub use py_string_cache::{cache_clear, cache_usage, cached_py_string, pystring_fast_new, StringCacheMode};
#[cfg(feature = "python")]
pub use python::{map_json_error, python_parse};
51 changes: 24 additions & 27 deletions src/py_string_cache.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@ use pyo3::prelude::*;
use pyo3::sync::{GILOnceCell, GILProtected};
use pyo3::types::{PyBool, PyString};

use crate::string_decoder::StrType;

#[derive(Debug, Clone, Copy)]
pub enum StringCacheMode {
All,
Expand Down Expand Up @@ -48,38 +46,38 @@ impl From<bool> for StringCacheMode {
}

pub trait StringMaybeCache {
fn get_key<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString>;
fn get_key<'py>(py: Python<'py>, json_str: &str, ascii_only: bool) -> Bound<'py, PyString>;

fn get_value<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> {
Self::get_key(py, json_str)
fn get_value<'py>(py: Python<'py>, json_str: &str, ascii_only: bool) -> Bound<'py, PyString> {
Self::get_key(py, json_str, ascii_only)
}
}

/// Marker type implementing `StringMaybeCache` so that keys are resolved through
/// `cached_py_string`; values use the trait's default `get_value`, which delegates to `get_key`.
pub struct StringCacheAll;

impl StringMaybeCache for StringCacheAll {
fn get_key<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> {
cached_py_string(py, json_str)
fn get_key<'py>(py: Python<'py>, json_str: &str, ascii_only: bool) -> Bound<'py, PyString> {
cached_py_string(py, json_str, ascii_only)
}
}

/// Marker type implementing `StringMaybeCache` so that keys are resolved through
/// `cached_py_string`, while values are built as fresh Python strings without consulting the cache.
pub struct StringCacheKeys;

impl StringMaybeCache for StringCacheKeys {
fn get_key<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> {
cached_py_string(py, json_str)
fn get_key<'py>(py: Python<'py>, json_str: &str, ascii_only: bool) -> Bound<'py, PyString> {
cached_py_string(py, json_str, ascii_only)
}

fn get_value<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> {
pystring_unicode_known(py, json_str)
fn get_value<'py>(py: Python<'py>, json_str: &str, ascii_only: bool) -> Bound<'py, PyString> {
pystring_fast_new(py, json_str, ascii_only)
}
}

/// Marker type implementing `StringMaybeCache` so that keys are built as fresh Python strings
/// without consulting the cache; values use the trait's default `get_value`, which delegates to `get_key`.
pub struct StringNoCache;

impl StringMaybeCache for StringNoCache {
fn get_key<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> {
pystring_unicode_known(py, json_str)
fn get_key<'py>(py: Python<'py>, json_str: &str, ascii_only: bool) -> Bound<'py, PyString> {
pystring_fast_new(py, json_str, ascii_only)
}
}

Expand All @@ -101,13 +99,12 @@ pub fn cache_clear(py: Python) {
get_string_cache!(py).borrow_mut().clear()
}

pub fn cached_py_string<'py>(py: Python<'py>, raw_str: StrType) -> Bound<'py, PyString> {
pub fn cached_py_string<'py>(py: Python<'py>, s: &str, ascii_only: bool) -> Bound<'py, PyString> {
// from tests, 0 and 1 character strings are faster not cached
let len = raw_str.s.len();
if (2..64).contains(&len) {
get_string_cache!(py).borrow_mut().get_or_insert(py, raw_str)
if (2..64).contains(&s.len()) {
get_string_cache!(py).borrow_mut().get_or_insert(py, s, ascii_only)
} else {
pystring_unicode_known(py, raw_str)
pystring_fast_new(py, s, ascii_only)
}
}

Expand Down Expand Up @@ -139,13 +136,13 @@ impl Default for PyStringCache {
impl PyStringCache {
/// Lookup the cache for an entry with the given string. If it exists, return it.
/// If it is not set or has a different string, insert it and return it.
fn get_or_insert<'py>(&mut self, py: Python<'py>, raw_str: StrType) -> Bound<'py, PyString> {
let hash = self.hash_builder.hash_one(raw_str.s);
fn get_or_insert<'py>(&mut self, py: Python<'py>, s: &str, ascii_only: bool) -> Bound<'py, PyString> {
let hash = self.hash_builder.hash_one(s);

let hash_index = hash as usize % CAPACITY;

let set_entry = |entry: &mut Entry| {
let py_str = pystring_unicode_known(py, raw_str);
let py_str = pystring_fast_new(py, s, ascii_only);
*entry = Some((hash, py_str.to_owned().unbind()));
py_str
};
Expand All @@ -157,7 +154,7 @@ impl PyStringCache {
// to avoid a string comparison, we first compare the hashes
if *entry_hash == hash {
// if the hashes match, we compare the strings to be absolutely sure - as a hashmap would do
if py_str_ob.bind(py).to_str().ok() == Some(raw_str.s) {
if py_str_ob.bind(py).to_str().ok() == Some(s) {
// the strings matched, return the cached string object
return py_str_ob.bind(py).to_owned();
}
Expand Down Expand Up @@ -188,15 +185,15 @@ impl PyStringCache {
}
}

pub fn pystring_unicode_known<'py>(py: Python<'py>, raw_str: StrType) -> Bound<'py, PyString> {
if raw_str.known_ascii {
unsafe { pystring_unicode(py, raw_str.s) }
pub fn pystring_fast_new<'py>(py: Python<'py>, s: &str, ascii_only: bool) -> Bound<'py, PyString> {
if ascii_only {
unsafe { pystring_ascii_new(py, s) }
} else {
PyString::new_bound(py, raw_str.s)
PyString::new_bound(py, s)
}
}

pub unsafe fn pystring_unicode<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> {
unsafe fn pystring_ascii_new<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> {
let ptr = ffi::PyUnicode_New(s.len() as isize, 127);
let data_ptr = ptr.cast::<ffi::PyASCIIObject>().offset(1) as *mut u8;
core::ptr::copy_nonoverlapping(s.as_ptr(), data_ptr, s.len());
Expand Down
6 changes: 3 additions & 3 deletions src/python.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ impl<'j> PythonParser<'j> {
}
Peek::String => {
let s = self.parser.consume_string::<StringDecoder>(&mut self.tape)?;
Ok(StringCache::get_value(py, s.as_str_type()).into_any())
Ok(StringCache::get_value(py, s.as_str(), s.ascii_only()).into_any())
}
Peek::Array => {
let peek_first = match self.parser.array_first() {
Expand Down Expand Up @@ -162,12 +162,12 @@ impl<'j> PythonParser<'j> {
}
};
if let Some(first_key) = self.parser.object_first::<StringDecoder>(&mut self.tape)? {
let first_key = StringCache::get_key(py, first_key.as_str_type());
let first_key = StringCache::get_key(py, first_key.as_str(), first_key.ascii_only());
let peek = self.parser.peek()?;
let first_value = self._check_take_value::<StringCache>(py, peek)?;
set_item(first_key, first_value);
while let Some(key) = self.parser.object_step::<StringDecoder>(&mut self.tape)? {
let key = StringCache::get_key(py, key.as_str_type());
let key = StringCache::get_key(py, key.as_str(), key.ascii_only());
let peek = self.parser.peek()?;
let value = self._check_take_value::<StringCache>(py, peek)?;
set_item(key, value);
Expand Down
32 changes: 13 additions & 19 deletions src/string_decoder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,46 +25,40 @@ pub enum StringOutput<'t, 'j>
where
'j: 't,
{
Tape(&'t str),
Data(&'j str),
Tape(&'t str, bool),
Data(&'j str, bool),
}

impl From<StringOutput<'_, '_>> for String {
fn from(val: StringOutput) -> Self {
match val {
StringOutput::Tape(s) => s.to_owned(),
StringOutput::Data(s) => s.to_owned(),
StringOutput::Tape(s, _) => s.to_owned(),
StringOutput::Data(s, _) => s.to_owned(),
}
}
}

impl<'t, 'j> From<StringOutput<'t, 'j>> for Cow<'j, str> {
fn from(val: StringOutput<'t, 'j>) -> Self {
match val {
StringOutput::Tape(s) => s.to_owned().into(),
StringOutput::Data(s) => s.into(),
StringOutput::Tape(s, _) => s.to_owned().into(),
StringOutput::Data(s, _) => s.into(),
}
}
}

/// Borrowed string slice bundled with a flag used to choose between the ASCII-only
/// and the general Python string constructors.
#[derive(Debug, Clone, Copy)]
pub struct StrType<'a> {
// The borrowed string data.
pub s: &'a str,
// NOTE(review): presumably `true` only when every byte of `s` is ASCII — confirm against the decoder.
pub known_ascii: bool,
}

impl<'t, 'j> StringOutput<'t, 'j> {
pub fn as_str(&self) -> &'t str {
match self {
Self::Tape(s) => s,
Self::Data(s) => s,
Self::Tape(s, _) => s,
Self::Data(s, _) => s,
}
}

pub fn as_str_type(&self) -> StrType<'t> {
pub fn ascii_only(&self) -> bool {
match self {
Self::Tape(s) => StrType { s, known_ascii: false },
Self::Data(s) => StrType { s, known_ascii: true },
Self::Tape(_, ascii_only) => *ascii_only,
Self::Data(_, ascii_only) => *ascii_only,
}
}
}
Expand Down Expand Up @@ -156,7 +150,7 @@ where
CharType::Quote => {
let s = to_str(&data[start..index], ascii_only, start)?;
index += 1;
return Ok((StringOutput::Data(s), index));
return Ok((StringOutput::Data(s, ascii_only), index));
}
CharType::Backslash => return decode_to_tape(data, index, tape, start, ascii_only),
CharType::ControlChar => return json_err!(ControlCharacterWhileParsingString, index),
Expand Down Expand Up @@ -217,7 +211,7 @@ fn decode_to_tape<'t, 'j>(
tape.extend_from_slice(&data[last_escape..index]);
index += 1;
let s = to_str(tape, ascii_only, start)?;
return Ok((StringOutput::Tape(s), index));
return Ok((StringOutput::Tape(s, ascii_only), index));
}
CharType::Backslash => on_backslash!(),
CharType::ControlChar => return json_err!(ControlCharacterWhileParsingString, index),
Expand Down
40 changes: 39 additions & 1 deletion tests/python.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use pyo3::prelude::*;
use pyo3::types::{PyDict, PyList, PyString};
use pyo3::ToPyObject;

use jiter::{cache_clear, cache_usage, map_json_error, python_parse, JsonValue, StringCacheMode};
use jiter::{cache_clear, cache_usage, map_json_error, pystring_fast_new, python_parse, JsonValue, StringCacheMode};

#[test]
fn test_to_py_object_numeric() {
Expand Down Expand Up @@ -269,3 +269,41 @@ fn test_cache_into() {
);
})
}

/// Parses an object with a non-ASCII key and value using `StringCacheMode::None`
/// and checks the string form of the resulting Python object.
#[test]
fn test_unicode() {
    let raw: &[u8] = r#"{"💩": "£"}"#.as_bytes();
    Python::with_gil(|py| {
        // start from an empty cache so earlier tests cannot influence the outcome
        cache_clear(py);
        let parsed = python_parse(py, raw, false, StringCacheMode::None, false).unwrap();
        let rendered = parsed.to_string();
        assert_eq!(rendered, "{'💩': '£'}");
    })
}

/// Parses an object with a non-ASCII key and value using `StringCacheMode::All`
/// and checks the string form of the resulting Python object.
#[test]
fn test_unicode_cache() {
    let raw: &[u8] = r#"{"💩": "£"}"#.as_bytes();
    Python::with_gil(|py| {
        // start from an empty cache so the strings are inserted by this parse
        cache_clear(py);
        let parsed = python_parse(py, raw, false, StringCacheMode::All, false).unwrap();
        let rendered = parsed.to_string();
        assert_eq!(rendered, "{'💩': '£'}");
    })
}

/// `pystring_fast_new` called with `ascii_only = false` must reproduce multi-byte text unchanged.
#[test]
fn test_pystring_fast_new_non_ascii() {
    let text = "£100 💩";
    Python::with_gil(|py| {
        let rendered = pystring_fast_new(py, text, false).to_string();
        assert_eq!(rendered, "£100 💩");
    })
}

/// `pystring_fast_new` called with `ascii_only = true` must reproduce pure-ASCII text unchanged.
#[test]
fn test_pystring_fast_new_ascii() {
    let text = "100abc";
    Python::with_gil(|py| {
        let rendered = pystring_fast_new(py, text, true).to_string();
        assert_eq!(rendered, "100abc");
    })
}

0 comments on commit 0b71ca0

Please sign in to comment.