Skip to content

Commit

Permalink
fix: fix value and add benchmarks
Browse files Browse the repository at this point in the history
  • Loading branch information
liuq19 committed Nov 4, 2024
1 parent 565b650 commit f03773b
Show file tree
Hide file tree
Showing 15 changed files with 134 additions and 46 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,6 @@ rust-
opensource_git_commit.log
Cargo.lock
*.profraw
*.profdata
*.profdata
*.svg
*.diff
15 changes: 9 additions & 6 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ categories = ["encoding", "parser-implementations"]
description = "Sonic-rs is a fast Rust JSON library based on SIMD"
documentation = "https://docs.rs/sonic-rs"
edition = "2021"
exclude = ["benchmarks", "assets"]
keywords = ["json", "simd", "serde", "serialization"]
license = "Apache-2.0"
name = "sonic-rs"
Expand All @@ -25,15 +26,14 @@ simdutf8 = "0.1"
thiserror = "1.0"

[dev-dependencies]
bytes = { version = "1.4", features = ["serde"] }
chrono = { version = "0.4", features = ["serde"] }
encoding_rs = "0.8"
paste = "1.0"
schema = { path = "./benchmarks/benches/schema" }
serde_bytes = "0.11"
serde_derive = "1.0"
serde_json = { version = "1.0", features = ["float_roundtrip", "raw_value"] }
# This schema is used in benches and was copied from https://github.com/serde-rs/json-benchmark
bytes = { version = "1.4", features = ["serde"] }
chrono = { version = "0.4", features = ["serde"] }
paste = "1.0"
schema = { path = "./benchmarks/benches/schema" }
serde_bytes = "0.11"

[profile.release]
codegen-units = 1
Expand All @@ -55,3 +55,6 @@ arbitrary_precision = []

# Sort the keys when serializing `sonic_rs::Value`.
sort_keys = []

# Record the raw text of numbers and strings when parsing JSON into `sonic_rs::Value`, and use that raw text when serializing the value.
use_raw = []
31 changes: 31 additions & 0 deletions benchmarks/benches/common.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
/// Parser options exercised by the sonic-rs benchmarks.
///
/// Each flag, when set, makes `do_sonic_rs_from_slice` call the matching
/// builder method on the deserializer before parsing.
#[derive(Clone, Copy, Debug)]
struct SonicConfig {
    /// When `true`, the deserializer is built with `use_raw()`.
    use_raw: bool,
    /// When `true`, the deserializer is built with `use_rawnumber()`.
    use_rawnum: bool,
}

/// Baseline configuration: both flags are off.
static SONIC_DEFAULT_CFG: SonicConfig = SonicConfig {
    use_raw: false,
    use_rawnum: false,
};

/// Configuration with only `use_rawnum` switched on.
static SONIC_USE_RAWNUM_CFG: SonicConfig = SonicConfig {
    use_raw: false,
    use_rawnum: true,
};

/// Configuration with only `use_raw` switched on.
static SONIC_USE_RAW_CFG: SonicConfig = SonicConfig {
    use_raw: true,
    use_rawnum: false,
};

/// Deserializes `data` into a `sonic_rs::Value` with a deserializer set up
/// according to `cfg`.
///
/// `use_rawnumber()` is applied first when `cfg.use_rawnum` is set, then
/// `use_raw()` when `cfg.use_raw` is set. Whatever the deserializer returns
/// is passed back unchanged.
fn do_sonic_rs_from_slice(data: &[u8], cfg: SonicConfig) -> sonic_rs::Result<sonic_rs::Value> {
    let reader = sonic_rs::Read::from(data);
    let base = sonic_rs::Deserializer::new(reader);

    // Apply the optional builder steps in the same order as the flags are checked.
    let with_rawnum = if cfg.use_rawnum {
        base.use_rawnumber()
    } else {
        base
    };
    let mut deserializer = if cfg.use_raw {
        with_rawnum.use_raw()
    } else {
        with_rawnum
    };

    sonic_rs::Deserialize::deserialize(&mut deserializer)
}
24 changes: 21 additions & 3 deletions benchmarks/benches/deserialize_value.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ use std::{fs::File, io::Read, str::from_utf8_unchecked};

use criterion::{criterion_group, BatchSize, Criterion, SamplingMode, Throughput};

include!("./common.rs");

#[cfg(not(target_env = "msvc"))]
#[global_allocator]
static GLOBAL: jemallocator::Jemalloc = jemallocator::Jemalloc;
Expand All @@ -26,8 +28,8 @@ fn serde_from_str(data: &[u8]) {
let _: serde_json::Value = serde_json::from_str(data).unwrap();
}

fn sonic_rs_from_slice(data: &[u8]) {
let _: sonic_rs::Value = sonic_rs::from_slice(data).unwrap();
fn sonic_rs_from_slice(data: &[u8], cfg: SonicConfig) {
let _: sonic_rs::Value = do_sonic_rs_from_slice(data, cfg).unwrap();
}

fn sonic_rs_from_slice_unchecked(data: &[u8]) {
Expand Down Expand Up @@ -78,7 +80,23 @@ macro_rules! bench_file {
group.bench_with_input("sonic_rs_dom::from_slice", &vec, |b, data| {
b.iter_batched(
|| data,
|bytes| sonic_rs_from_slice(&bytes),
|bytes| sonic_rs_from_slice(&bytes, SONIC_DEFAULT_CFG),
BatchSize::SmallInput,
)
});

group.bench_with_input("sonic_rs_dom::from_slice_use_raw", &vec, |b, data| {
b.iter_batched(
|| data,
|bytes| sonic_rs_from_slice(&bytes, SONIC_USE_RAW_CFG),
BatchSize::SmallInput,
)
});

group.bench_with_input("sonic_rs_dom::from_slice_use_rawnum", &vec, |b, data| {
b.iter_batched(
|| data,
|bytes| sonic_rs_from_slice(&bytes, SONIC_USE_RAWNUM_CFG),
BatchSize::SmallInput,
)
});
Expand Down
23 changes: 22 additions & 1 deletion benchmarks/benches/serialize_value.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ use std::{fs::File, io::Read};

use criterion::{criterion_group, BatchSize, Criterion, SamplingMode, Throughput};

include!("./common.rs");

#[cfg(not(target_env = "msvc"))]
#[global_allocator]
static GLOBAL: jemallocator::Jemalloc = jemallocator::Jemalloc;
Expand Down Expand Up @@ -71,7 +73,7 @@ macro_rules! bench_file {
let mut group = c.benchmark_group(stringify!($name));
group.sampling_mode(SamplingMode::Flat);

let value: sonic_rs::Value = sonic_rs::from_slice(&data).unwrap();
let value: sonic_rs::Value = do_sonic_rs_from_slice(&data, SONIC_DEFAULT_CFG).unwrap();
group.bench_with_input("sonic_rs::to_string", &value, |b, data| {
b.iter_batched(
|| data,
Expand All @@ -80,6 +82,25 @@ macro_rules! bench_file {
)
});

let value: sonic_rs::Value = do_sonic_rs_from_slice(&data, SONIC_USE_RAW_CFG).unwrap();
group.bench_with_input("sonic_rs::to_string_use_raw", &value, |b, data| {
b.iter_batched(
|| data,
|val| sonic_rs_to_string(&val),
BatchSize::SmallInput,
)
});

let value: sonic_rs::Value =
do_sonic_rs_from_slice(&data, SONIC_USE_RAWNUM_CFG).unwrap();
group.bench_with_input("sonic_rs::to_string_use_rawnum", &value, |b, data| {
b.iter_batched(
|| data,
|val| sonic_rs_to_string(&val),
BatchSize::SmallInput,
)
});

let value: serde_json::Value = serde_json::from_slice(&data).unwrap();
group.bench_with_input("serde_json::to_string", &value, |b, data| {
b.iter_batched(|| data, |val| serde_to_string(&val), BatchSize::SmallInput)
Expand Down
2 changes: 1 addition & 1 deletion fuzz/fuzz_targets/from_slice.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ fuzz_target!(|data: &[u8]| {
let sv2: Value = from_str(&sout).unwrap();
let eq = compare_value(&jv2, &sv2);

// compre use raw
// compare use raw
fuzz_use_raw(data, &sv);

if jv.is_object() && eq {
Expand Down
7 changes: 7 additions & 0 deletions scripts/sanitize.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,13 @@ for san in address leak; do
echo "Running tests with $san"
RUSTFLAGS="-Zsanitizer=${san}" RUSTDOCFLAGS="-Zsanitizer=${san}" cargo +nightly test --target x86_64-unknown-linux-gnu -- --test-threads=1
RUSTFLAGS="-Zsanitizer=${san}" RUSTDOCFLAGS="-Zsanitizer=${san}" cargo +nightly test --doc --package sonic-rs --target x86_64-unknown-linux-gnu -- --show-output --test-threads=1

RUSTFLAGS="-Zsanitizer=${san}" RUSTDOCFLAGS="-Zsanitizer=${san}" cargo +nightly test --features arbitrary_precision --target x86_64-unknown-linux-gnu -- --test-threads=1
RUSTFLAGS="-Zsanitizer=${san}" RUSTDOCFLAGS="-Zsanitizer=${san}" cargo +nightly test --features arbitrary_precision --doc --package sonic-rs --target x86_64-unknown-linux-gnu -- --show-output --test-threads=1


RUSTFLAGS="-Zsanitizer=${san}" RUSTDOCFLAGS="-Zsanitizer=${san}" cargo +nightly test --features use_raw --target x86_64-unknown-linux-gnu -- --test-threads=1
RUSTFLAGS="-Zsanitizer=${san}" RUSTDOCFLAGS="-Zsanitizer=${san}" cargo +nightly test --features use_raw --doc --package sonic-rs --target x86_64-unknown-linux-gnu -- --show-output --test-threads=1
done


4 changes: 2 additions & 2 deletions src/lazyvalue/iterator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ impl<'de> ArrayJsonIter<'de> {
/// iterating.
///
/// The item of the iterator is a key-value pair: ([FastStr][`faststr::FastStr`],
/// [Result<LazyValue>][`crate::LazyValue`]).
/// [`Result<LazyValue>`][`crate::LazyValue`]).
///
/// # Errors
///
Expand Down Expand Up @@ -246,7 +246,7 @@ pub fn to_array_iter<'de, I: JsonInput<'de>>(json: I) -> ArrayJsonIter<'de> {
/// iterating.
///
/// The item of the iterator is a key-value pair: ([FastStr][`faststr::FastStr`],
/// [Result<LazyValue>][`crate::LazyValue`]).
/// [`Result<LazyValue>`][`crate::LazyValue`]).
///
/// # Errors
///
Expand Down
17 changes: 14 additions & 3 deletions src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,7 @@ where
let raw = as_str(&self.read.as_u8_slice()[start - 1..end]);
let alloc = vis.allocator().unwrap();
let raw = RawStr::new_in(alloc, raw);
let s = &*(alloc.alloc_str(s) as *mut str);
check_visit!(self, vis.visit_raw_str(s, raw))
},
}
Expand All @@ -264,6 +265,16 @@ where
}
}

/// Returns the reader's current index, or an `EofWhileParsing` error when
/// that index is greater than the length of the input slice.
///
/// NOTE(review): presumably the index can only exceed the length when the
/// string scan ran into padding past the end of the input; confirm against
/// the reader implementation.
fn check_string_eof_inpadding(&self) -> Result<usize> {
    let input_len = self.read.as_u8_slice().len();
    let pos = self.read.index();
    if pos <= input_len {
        return Ok(pos);
    }
    perr!(self, EofWhileParsing)
}

#[inline(always)]
fn parse_string_inplace<V: JsonVisitor<'de>>(&mut self, vis: &mut V) -> Result<()> {
if !self.cfg.use_raw {
Expand All @@ -275,7 +286,7 @@ where
let start = self.read.cur_ptr();
match self.skip_string_unchecked()? {
ParseStatus::HasEscaped => {
let end = self.read.index();
let end = self.check_string_eof_inpadding()?;
let raw = as_str(&self.read.as_u8_slice()[start_idx - 1..end]);
let alloc = vis.allocator().unwrap();
let raw = RawStr::new_in(alloc, raw);
Expand All @@ -285,8 +296,8 @@ where
check_visit!(self, vis.visit_raw_str(s, raw))
}
ParseStatus::None => {
let end = self.read.index() - 1;
let s = as_str(&self.read.as_u8_slice()[start_idx..end]);
let end = self.check_string_eof_inpadding()?;
let s = as_str(&self.read.as_u8_slice()[start_idx..end - 1]);
check_visit!(self, vis.visit_borrowed_str(s))
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/pointer/point.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use faststr::FastStr;

/// Represents a json pointer path. It can be created by [`pointer!`] macro.
/// Represents a json pointer path. It can be created by [`pointer`] macro.
pub type JsonPointer = Vec<PointerNode>;

/// Represents a node in a json pointer path.
Expand Down
14 changes: 14 additions & 0 deletions src/serde/de.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1282,12 +1282,26 @@ where
R: Reader<'de>,
T: de::Deserialize<'de>,
{
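// Check the JSON size: because of the design of `sonic_rs::Value`, inputs of 4 GB
// (`1 << 32` bytes) or more are not supported.
// NOTE(review): the error message still says "2 GB"; confirm which limit is intended.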
let len = read.as_u8_slice().len();
if len >= (1 << 32) {
return Err(crate::error::make_error(format!(
"Only support JSON less than 2 GB, the input JSON is too large here, len is {len}"
)));
}

let mut de = Deserializer::new(read);
#[cfg(feature = "arbitrary_precision")]
{
de = de.use_rawnumber();
}

#[cfg(feature = "use_raw")]
{
de = de.use_raw();
}

let value = tri!(de::Deserialize::deserialize(&mut de));

// Make sure the whole stream has been consumed.
Expand Down
30 changes: 6 additions & 24 deletions src/util/utf8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,39 +2,21 @@ use crate::error::{Error, ErrorCode, Result};

// simdutf8 will cause `out-of-bounds pointer arithmetic` when using Miri tests
#[cfg(not(miri))]
#[inline(always)]
#[inline]
pub(crate) fn from_utf8(data: &[u8]) -> Result<&str> {
match simdutf8::basic::from_utf8(data) {
Ok(ret) => Ok(ret),
Err(_) => {
// slow path, get the correct position of the first invalid utf-8 character
from_utf8_compat(data)
}
}
simdutf8::basic::from_utf8(data).or_else(|_| from_utf8_compat(data))
}

#[cfg(miri)]
pub(crate) fn from_utf8(data: &[u8]) -> Result<&str> {
match std::str::from_utf8(data) {
Ok(ret) => Ok(ret),
Err(err) => Err(Error::syntax(
ErrorCode::InvalidUTF8,
data,
err.valid_up_to(),
)),
}
std::str::from_utf8(data)
.map_err(|e| Error::syntax(ErrorCode::InvalidUTF8, data, e.valid_up_to()))
}

#[cfg(not(miri))]
#[cold]
fn from_utf8_compat(data: &[u8]) -> Result<&str> {
// compat::from_utf8 is slower than basic::from_utf8
match simdutf8::compat::from_utf8(data) {
Ok(ret) => Ok(ret),
Err(err) => Err(Error::syntax(
ErrorCode::InvalidUTF8,
data,
err.valid_up_to(),
)),
}
simdutf8::compat::from_utf8(data)
.map_err(|e| Error::syntax(ErrorCode::InvalidUTF8, data, e.valid_up_to()))
}
2 changes: 1 addition & 1 deletion src/value/from.rs
Original file line number Diff line number Diff line change
Expand Up @@ -460,7 +460,7 @@ impl<'a, T: Clone + Into<Value> + 'a> Extend<&'a T> for Array {
value.push(v.clone().into());
}
} else {
unreachable!("should not happend")
unreachable!("should not happened")
}
}
}
Expand Down
3 changes: 1 addition & 2 deletions src/value/node.rs
Original file line number Diff line number Diff line change
Expand Up @@ -556,8 +556,7 @@ impl Value {
}

fn forward_find_shared(current: *const Value, idx: usize) -> *const Shared {
let meta = unsafe { &*(current.sub(idx) as *const MetaNode) };
meta.shared
unsafe { (*(current.sub(idx) as *const MetaNode)).shared }
}

fn unpack_shared(&self) -> &Shared {
Expand Down
2 changes: 1 addition & 1 deletion src/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ pub trait WriteExt: io::Write {
/// a slice of `MaybeUninit<u8>`.
///
/// The returned slice will be used to write new data before marking the data as initialized
/// using the [`add_len`] method.
/// using the [`WriteExt::flush_len`] method.
fn reserve_with(&mut self, additional: usize) -> io::Result<&mut [MaybeUninit<u8>]>;

/// Flush the `additional` length to the output stream, ensuring that `additional` bytes
Expand Down

0 comments on commit f03773b

Please sign in to comment.