From 4e94e3758c2c44f11d5932adcfdce034ced9e8a7 Mon Sep 17 00:00:00 2001 From: "liuqiang.06" Date: Tue, 20 Aug 2024 11:30:40 +0800 Subject: [PATCH 1/3] feat: support stream deserialize --- src/input.rs | 2 +- src/lazyvalue/iterator.rs | 38 +++++------ src/lib.rs | 1 + src/serde/de.rs | 136 +++++++++++++++++++++++++++++++++++--- src/serde/mod.rs | 2 +- 5 files changed, 147 insertions(+), 32 deletions(-) diff --git a/src/input.rs b/src/input.rs index 759a158..72ed623 100644 --- a/src/input.rs +++ b/src/input.rs @@ -9,7 +9,7 @@ use crate::{parser::as_str, util::private::Sealed}; #[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)] pub enum JsonSlice<'de> { Raw(&'de [u8]), - FastStr(FastStr), + FastStr(FastStr), // note: FastStr maybe inlined and in the stack. } impl<'de> JsonSlice<'de> { diff --git a/src/lazyvalue/iterator.rs b/src/lazyvalue/iterator.rs index 3cb533a..390df9d 100644 --- a/src/lazyvalue/iterator.rs +++ b/src/lazyvalue/iterator.rs @@ -34,7 +34,14 @@ use crate::{ /// } /// } /// ``` -pub struct ObjectJsonIter<'de>(ObjectInner<'de>); +pub struct ObjectJsonIter<'de> { + json: JsonSlice<'de>, + parser: Option>>, + strbuf: Vec, + first: bool, + ending: bool, + check: bool, +} /// A lazied iterator for JSON array text. It will parse the JSON when iterating. 
/// @@ -63,18 +70,7 @@ pub struct ObjectJsonIter<'de>(ObjectInner<'de>); /// } /// } /// ``` -pub struct ArrayJsonIter<'de>(ArrayInner<'de>); - -struct ObjectInner<'de> { - json: JsonSlice<'de>, - parser: Option>>, - strbuf: Vec, - first: bool, - ending: bool, - check: bool, -} - -struct ArrayInner<'de> { +pub struct ArrayJsonIter<'de> { json: JsonSlice<'de>, parser: Option>>, first: bool, @@ -82,7 +78,7 @@ struct ArrayInner<'de> { check: bool, } -impl<'de> ObjectInner<'de> { +impl<'de> ObjectJsonIter<'de> { fn new(json: JsonSlice<'de>, check: bool) -> Self { Self { json, @@ -131,7 +127,7 @@ impl<'de> ObjectInner<'de> { } } -impl<'de> ArrayInner<'de> { +impl<'de> ArrayJsonIter<'de> { fn new(json: JsonSlice<'de>, check: bool) -> Self { Self { json, @@ -219,7 +215,7 @@ impl<'de> ArrayInner<'de> { /// } /// ``` pub fn to_object_iter<'de, I: JsonInput<'de>>(json: I) -> ObjectJsonIter<'de> { - ObjectJsonIter(ObjectInner::new(json.to_json_slice(), true)) + ObjectJsonIter::new(json.to_json_slice(), true) } /// Traverse the JSON array text through a lazy iterator. The JSON parsing will doing when @@ -256,7 +252,7 @@ pub fn to_object_iter<'de, I: JsonInput<'de>>(json: I) -> ObjectJsonIter<'de> { /// } /// ``` pub fn to_array_iter<'de, I: JsonInput<'de>>(json: I) -> ArrayJsonIter<'de> { - ArrayJsonIter(ArrayInner::new(json.to_json_slice(), true)) + ArrayJsonIter::new(json.to_json_slice(), true) } /// Traverse the JSON text through a lazy object iterator. The JSON parsing will doing when @@ -293,7 +289,7 @@ pub fn to_array_iter<'de, I: JsonInput<'de>>(json: I) -> ArrayJsonIter<'de> { /// } /// ``` pub unsafe fn to_object_iter_unchecked<'de, I: JsonInput<'de>>(json: I) -> ObjectJsonIter<'de> { - ObjectJsonIter(ObjectInner::new(json.to_json_slice(), false)) + ObjectJsonIter::new(json.to_json_slice(), false) } /// Traverse the JSON text through a lazy object iterator. 
The JSON parsing will doing when @@ -327,14 +323,14 @@ pub unsafe fn to_object_iter_unchecked<'de, I: JsonInput<'de>>(json: I) -> Objec /// } /// ``` pub unsafe fn to_array_iter_unchecked<'de, I: JsonInput<'de>>(json: I) -> ArrayJsonIter<'de> { - ArrayJsonIter(ArrayInner::new(json.to_json_slice(), false)) + ArrayJsonIter::new(json.to_json_slice(), false) } impl<'de> Iterator for ObjectJsonIter<'de> { type Item = Result<(FastStr, LazyValue<'de>)>; fn next(&mut self) -> Option { - self.0.next_entry_impl(self.0.check) + self.next_entry_impl(self.check) } } @@ -342,7 +338,7 @@ impl<'de> Iterator for ArrayJsonIter<'de> { type Item = Result>; fn next(&mut self) -> Option { - self.0.next_elem_impl(self.0.check) + self.next_elem_impl(self.check) } } diff --git a/src/lib.rs b/src/lib.rs index d3563e9..3e70f0a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -42,6 +42,7 @@ pub use crate::pointer::{JsonPointer, PointerNode, PointerTree}; pub use crate::serde::{ from_slice, from_slice_unchecked, from_str, to_string, to_string_pretty, to_vec, to_vec_pretty, to_writer, to_writer_pretty, Deserializer, JsonNumberTrait, Number, RawNumber, Serializer, + StreamDeserializer, }; #[doc(inline)] pub use crate::value::{ diff --git a/src/serde/de.rs b/src/serde/de.rs index b2f23fe..aa70df6 100644 --- a/src/serde/de.rs +++ b/src/serde/de.rs @@ -1,7 +1,7 @@ //! Deserialize JSON data to a Rust data structure. // The code is cloned from [serde_json](https://github.com/serde-rs/json) and modified necessary parts. -use std::{mem::ManuallyDrop, ptr::slice_from_raw_parts}; +use std::{marker::PhantomData, mem::ManuallyDrop, ptr::slice_from_raw_parts}; use serde::{ de::{self, Expected, Unexpected}, @@ -18,6 +18,7 @@ use crate::{ reader::{Read, Reader, Reference}, util::num::ParserNumber, value::node::Value, + JsonInput, }; const MAX_ALLOWED_DEPTH: u8 = u8::MAX; @@ -31,6 +32,131 @@ pub struct Deserializer { remaining_depth: u8, } +// some functions only used for struct visitors. 
+impl<'de, R: Reader<'de>> Deserializer { + /// Create a new deserializer. + pub fn new(read: R) -> Self { + Self { + parser: Parser::new(read), + scratch: Vec::new(), + remaining_depth: MAX_ALLOWED_DEPTH, + } + } + + /// Deserialize a JSON stream to a Rust data structure. + /// + /// It can be used repeatedly and we do not check trailing chars after deserialization. + /// + /// # Example + /// + /// ``` + /// # use sonic_rs::{prelude::*, Value}; + /// + /// use sonic_rs::Deserializer; + /// + /// let multiple_json = r#"{"a": 123, "b": "foo"} true [1, 2, 3] wrong chars"#; + /// + /// let mut deserializer = Deserializer::from_json(multiple_json); + /// + /// let val: Value = deserializer.deserialize().unwrap(); + /// assert_eq!(val["a"].as_i64().unwrap(), 123); + /// assert_eq!(val["b"].as_str().unwrap(), "foo"); + /// + /// let val: bool = deserializer.deserialize().unwrap(); + /// assert_eq!(val, true); + /// + /// let val: Vec = deserializer.deserialize().unwrap(); + /// assert_eq!(val, &[1, 2, 3]); + /// + /// // encounter the wrong chars in json + /// assert!(deserializer.deserialize::().is_err()); + /// ``` + pub fn deserialize(&mut self) -> Result + where + T: de::Deserialize<'de>, + { + de::Deserialize::deserialize(self) + } + + /// Convert Deserializer to a [`StreamDeserializer`]. + pub fn into_stream(self) -> StreamDeserializer<'de, T, R> { + StreamDeserializer { + de: self, + data: PhantomData, + lifetime: PhantomData, + is_ending: false, + } + } +} + +impl<'de> Deserializer> { + /// Create a new deserializer from a json input [`JsonInput`]. + pub fn from_json>(input: I) -> Self { + Self::new(Read::from(input)) + } + + /// Create a new deserializer from a string. + #[allow(clippy::should_implement_trait)] + pub fn from_str(s: &'de str) -> Self { + Self::new(Read::from(s)) + } + + /// Create a new deserializer from a byte slice.
+ pub fn from_slice(s: &'de [u8]) -> Self { + Self::new(Read::from(s)) + } +} + +/// An iterator that deserializes a json stream into multiple `T` values. +/// +/// # Example +/// +/// ``` +/// use sonic_rs::{prelude::*, Deserializer, Value}; +/// +/// let multiple_json = r#"{"a": 123, "b": "foo"} true [1, 2, 3] wrong chars"#; +/// +/// let mut stream = Deserializer::from_json(multiple_json).into_stream::(); +/// +/// let val = stream.next().unwrap().unwrap(); +/// assert_eq!(val["a"].as_i64().unwrap(), 123); +/// assert_eq!(val["b"].as_str().unwrap(), "foo"); +/// +/// let val = stream.next().unwrap().unwrap(); +/// assert_eq!(val, true); +/// +/// let val = stream.next().unwrap().unwrap(); +/// assert_eq!(val, &[1, 2, 3]); +/// +/// // encounter the wrong chars in json +/// assert!(stream.next().unwrap().is_err()); +/// ``` +pub struct StreamDeserializer<'de, T, R> { + de: Deserializer, + data: PhantomData, + lifetime: PhantomData<&'de R>, + is_ending: bool, +} + +impl<'de, T, R> Iterator for StreamDeserializer<'de, T, R> +where + T: de::Deserialize<'de>, + R: Reader<'de>, +{ + type Item = Result; + + fn next(&mut self) -> Option { + if self.is_ending { + return None; + } + let val: Result = self.de.deserialize(); + if val.is_err() { + self.is_ending = true; + } + Some(val) + } +} + // We only use our own error type; no need for From conversions provided by the // standard library's try! macro. This reduces lines of LLVM IR by 4%. macro_rules! tri { @@ -100,14 +226,6 @@ macro_rules! impl_deserialize_number { // some functions only used for struct visitors. 
impl<'de, R: Reader<'de>> Deserializer { - pub fn new(read: R) -> Self { - Self { - parser: Parser::new(read), - scratch: Vec::new(), - remaining_depth: MAX_ALLOWED_DEPTH, - } - } - pub(crate) fn deserialize_number(&mut self, visitor: V) -> Result where V: de::Visitor<'de>, diff --git a/src/serde/mod.rs b/src/serde/mod.rs index 51d1e7c..7448b18 100644 --- a/src/serde/mod.rs +++ b/src/serde/mod.rs @@ -7,7 +7,7 @@ pub(crate) mod ser; pub(crate) use self::de::tri; pub use self::{ - de::{from_slice, from_slice_unchecked, from_str, Deserializer}, + de::{from_slice, from_slice_unchecked, from_str, Deserializer, StreamDeserializer}, number::{JsonNumberTrait, Number}, rawnumber::RawNumber, ser::{ From 522e129e280c1cf41e081a1dda68d43c6f6dda80 Mon Sep 17 00:00:00 2001 From: "liuqiang.06" Date: Tue, 20 Aug 2024 17:07:01 +0800 Subject: [PATCH 2/3] chore: remove some dependencies --- .gitignore | 2 + Cargo.toml | 6 ++- benches/value_operator.rs | 83 +++++++++++++++++++++++++++++++++++++++ src/parser.rs | 7 ++-- src/util/arch/mod.rs | 13 ------ src/util/mod.rs | 6 +-- src/util/string.rs | 18 ++++++--- src/value/node.rs | 9 +---- src/value/object.rs | 26 ++++++++++++ 9 files changed, 135 insertions(+), 35 deletions(-) create mode 100644 benches/value_operator.rs diff --git a/.gitignore b/.gitignore index d2a4c4f..62a1f85 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,5 @@ rust- *.data opensource_git_commit.log Cargo.lock +*.profraw +*.profdata \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index f23c673..51e6f91 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,7 +15,6 @@ categories = ["encoding", "parser-implementations"] [dependencies] cfg-if = "1.0" -arrayref = "0.3" serde = { version = "1.0", features = ["rc", "derive"] } itoa = "1.0" ryu = "1.0" @@ -26,7 +25,6 @@ bytes = "1.4" thiserror = "1.0" simdutf8 = "0.1" parking_lot = "0.12" -page_size = "0.6" [target.'cfg(not(target_env = "msvc"))'.dev-dependencies] @@ -78,6 +76,10 @@ harness = false name = 
"get_from" harness = false +[[bench]] +name = "value_operator" +harness = false + [features] default = [] diff --git a/benches/value_operator.rs b/benches/value_operator.rs new file mode 100644 index 0000000..4e899cb --- /dev/null +++ b/benches/value_operator.rs @@ -0,0 +1,83 @@ +#[macro_use] +extern crate criterion; +use std::io::Read; + +use criterion::{criterion_group, BatchSize, Criterion}; +use sonic_rs::JsonValueTrait; + +fn bench_get(c: &mut Criterion) { + let core_ids = core_affinity::get_core_ids().unwrap(); + core_affinity::set_for_current(core_ids[0]); + + let mut data = Vec::new(); + let root = env!("CARGO_MANIFEST_DIR").to_owned(); + std::fs::File::open(root + concat!("/benches/testdata/twitter.json")) + .unwrap() + .read_to_end(&mut data) + .unwrap(); + + let sonic_value: sonic_rs::Value = sonic_rs::from_slice(&data).unwrap(); + let serde_value: serde_json::Value = serde_json::from_slice(&data).unwrap(); + + assert_eq!( + sonic_value["statuses"][4]["entities"]["media"][0]["source_status_id_str"].as_str(), + Some("439430848190742528") + ); + assert_eq!( + serde_value["statuses"][4]["entities"]["media"][0]["source_status_id_str"].as_str(), + Some("439430848190742528") + ); + + let mut group = c.benchmark_group("value"); + group.bench_with_input("sonic-rs::value_get", &sonic_value, |b, data| { + b.iter_batched( + || data, + |value| { + let _ = + value["statuses"][4]["entities"]["media"][0]["source_status_id_str"].as_str(); + }, + BatchSize::SmallInput, + ) + }); + + group.bench_with_input("serde_json::value_get", &serde_value, |b, data| { + b.iter_batched( + || data, + |value| { + let _ = + value["statuses"][4]["entities"]["media"][0]["source_status_id_str"].as_str(); + }, + BatchSize::SmallInput, + ) + }); + + group.bench_with_input("sonic_rs::value_new", &sonic_value, |b, data| { + b.iter_batched( + || data, + |_value| { + let mut value = sonic_rs::Array::new(); + for i in 0..100 { + value.push(i); + } + }, + BatchSize::SmallInput, + ) + }); + + 
group.bench_with_input("serde_json::value_new", &serde_value, |b, data| { + b.iter_batched( + || data, + |_value| { + let mut value = serde_json::Value::Array(Vec::new()); + let array = &mut value.as_array_mut().unwrap(); + for i in 0..100 { + array.push(serde_json::Value::from(i as f64)); + } + }, + BatchSize::SmallInput, + ) + }); +} + +criterion_group!(benches, bench_get); +criterion_main!(benches); diff --git a/src/parser.rs b/src/parser.rs index 70c3a2f..71c3e76 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -5,7 +5,6 @@ use std::{ str::from_utf8_unchecked, }; -use arrayref::array_ref; use faststr::FastStr; use serde::de::{self, Expected, Unexpected}; use smallvec::SmallVec; @@ -1217,7 +1216,7 @@ where let reader = &mut self.read; while let Some(chunk) = reader.peek_n(64) { - let input = array_ref![chunk, 0, 64]; + let input = unsafe { &*(chunk.as_ptr() as *const [_; 64]) }; if let Some(count) = skip_container_loop( input, &mut prev_instring, @@ -1291,7 +1290,7 @@ where // then we use simd to accelerate skipping space while let Some(chunk) = reader.peek_n(64) { - let chunk = array_ref![chunk, 0, 64]; + let chunk = unsafe { &*(chunk.as_ptr() as *const [_; 64]) }; let bitmap = unsafe { get_nonspace_bits(chunk) }; if bitmap != 0 { self.nospace_bits = bitmap; @@ -1351,7 +1350,7 @@ where // then we use simd to accelerate skipping space while let Some(chunk) = reader.peek_n(64) { - let chunk = array_ref![chunk, 0, 64]; + let chunk = unsafe { &*(chunk.as_ptr() as *const [_; 64]) }; let bitmap = unsafe { get_nonspace_bits(chunk) }; if bitmap != 0 { self.nospace_bits = bitmap; diff --git a/src/util/arch/mod.rs b/src/util/arch/mod.rs index 598a7ce..c740702 100644 --- a/src/util/arch/mod.rs +++ b/src/util/arch/mod.rs @@ -12,19 +12,6 @@ cfg_if::cfg_if! { } } -#[inline] -pub fn page_size() -> usize { - cfg_if::cfg_if! 
{ - // fast path for most common arch - if #[cfg(any(target_os = "linux", target_os = "macos"))] { - 4096 - } else { - // slow path for portability - ::page_size::get() - } - } -} - #[cfg(test)] mod test { use super::*; diff --git a/src/util/mod.rs b/src/util/mod.rs index 2faa7c1..b31c7e4 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -1,5 +1,6 @@ pub(crate) mod arc; pub(crate) mod arch; + pub(crate) mod num; pub(crate) mod private; pub(crate) mod range; @@ -9,10 +10,9 @@ pub(crate) mod taggedptr; pub(crate) mod unicode; pub(crate) mod utf8; -pub(crate) mod mock; - #[allow(non_camel_case_types)] #[allow(unused_imports)] pub(crate) mod simd; -// TODO: move into separate crate +#[cfg(test)] +pub(crate) mod mock; diff --git a/src/util/string.rs b/src/util/string.rs index 8a9290a..a571441 100644 --- a/src/util/string.rs +++ b/src/util/string.rs @@ -12,7 +12,6 @@ use crate::{ self, ControlCharacterWhileParsingString, InvalidEscape, InvalidUnicodeCodePoint, }, util::{ - arch::page_size, simd::{BitMask, Mask, Simd}, unicode::handle_unicode_codepoint_mut, }, @@ -496,9 +495,18 @@ unsafe fn escape_unchecked(src: &mut *const u8, nb: &mut usize, dst: &mut *mut u } #[inline(always)] -fn cross_page(ptr: *const u8, step: usize) -> bool { - let page_size = page_size(); - ((ptr as usize & (page_size - 1)) + step) > page_size +fn check_cross_page(ptr: *const u8, step: usize) -> bool { + #[cfg(any(target_os = "linux", target_os = "macos"))] + { + let page_size = 4096; + ((ptr as usize & (page_size - 1)) + step) > page_size + } + + #[cfg(not(any(target_os = "linux", target_os = "macos")))] + { + // the page boundary is not checked in fallback envs; always report a possible page cross + true + } } #[inline(always)] @@ -565,7 +573,7 @@ pub fn format_string(value: &str, dst: &mut [MaybeUninit], need_quote: bool) let mut temp: [u8; LANES] = [0u8; LANES]; while nb > 0 { - v = if cross_page(sptr, LANES) { + v = if check_cross_page(sptr, LANES) { std::ptr::copy_nonoverlapping(sptr, temp[..].as_mut_ptr(), nb);
load(temp[..].as_ptr()) } else { diff --git a/src/value/node.rs b/src/value/node.rs index 0c50657..c3933df 100644 --- a/src/value/node.rs +++ b/src/value/node.rs @@ -190,10 +190,6 @@ impl Value { self.meta.tag() < STRING && !self.is_static() } - pub(crate) fn is_shared(&self) -> bool { - !self.is_root() && !self.is_static() - } - pub(crate) fn unmark_root(&mut self) { let tag = self.meta.tag(); if tag >= STRING { @@ -1493,10 +1489,7 @@ pub(crate) enum ValueState<'a> { #[inline] pub(crate) fn replace_value(dst: &mut Value, mut src: Value) -> Value { match dst.state() { - ValueState::Static(dst) => { - let old = std::mem::replace(dst, src); - return old; - } + ValueState::Static(dst) => return std::mem::replace(dst, src), ValueState::Shared(_) | ValueState::Inlined(_) => {} ValueState::Root(dst) => return std::mem::replace(dst, src), } diff --git a/src/value/object.rs b/src/value/object.rs index 579e780..8799824 100644 --- a/src/value/object.rs +++ b/src/value/object.rs @@ -293,12 +293,38 @@ impl Object { } /// Returns an immutable iterator over the key-value pairs of the object. + /// + /// # Examples + /// ``` + /// use sonic_rs::object; + /// + /// let obj = object! {"a": 1, "b": true, "c": null}; + /// + /// for (key, value) in obj.iter() { + /// println!("{}: {}", key, value); + /// } + /// ``` #[inline] pub fn iter(&self) -> Iter<'_> { Iter(self.0.iter::()) } /// Returns an mutable iterator over the key-value pairs of the object. + /// + /// # Examples + /// ``` + /// use sonic_rs::{object, Value}; + /// + /// let mut obj = object! 
{"a": 1, "b": true, "c": null}; + /// + /// for (key, value) in obj.iter_mut() { + /// *value = Value::from(key); + /// } + /// + /// assert_eq!(obj["a"], "a"); + /// assert_eq!(obj["b"], "b"); + /// assert_eq!(obj["c"], "c"); + /// ``` #[inline] pub fn iter_mut(&mut self) -> IterMut<'_> { IterMut(self.0.iter_mut::()) From a48d53b8d81a6312a365988fcf986029711d832e Mon Sep 17 00:00:00 2001 From: "liuqiang.06" Date: Tue, 20 Aug 2024 17:10:10 +0800 Subject: [PATCH 3/3] bump 0.3.11 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 51e6f91..e2aac58 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sonic-rs" -version = "0.3.10" +version = "0.3.11" authors = ["Volo Team "] edition = "2021" description = "Sonic-rs is a fast Rust JSON library based on SIMD"