From bab0accd4a861f803b08a3b35a0dcbb20e73fff1 Mon Sep 17 00:00:00 2001 From: "Heinz N. Gies" Date: Tue, 30 Jul 2019 20:41:32 +0200 Subject: [PATCH 1/9] Put something in the readme so we can have a PR --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index da57b807..e4f633af 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,8 @@ To be able to take advantage of simdjson your system needs to be SIMD compatible. This means to compile with native cpu support and the given features. Look at [The cargo config in this repository](.cargo/config) to get an example. +simd-json.rs supports AVX2, SSE4.2 and NEON. + ### jemalloc If you are writing performance centric code, make sure to use jemalloc and not the system allocator (which has now become default in rust), it gives a very noticeable boost in performance. From 489477bd45d8b0ce4faf5d51636d5eb651ad8c6a Mon Sep 17 00:00:00 2001 From: "Heinz N. Gies" Date: Tue, 30 Jul 2019 21:52:51 +0200 Subject: [PATCH 2/9] Add drone file --- .drone.yml | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 .drone.yml diff --git a/.drone.yml b/.drone.yml new file mode 100644 index 00000000..7f69b6c1 --- /dev/null +++ b/.drone.yml @@ -0,0 +1,46 @@ +kind: pipeline +name: test-on-avx2 + +platform: + arch: amd64 + +steps: +- name: test + image: rust:1 + environment: + RUSTFLAGS: '-C target-cpu=native' + commands: + - cargo build --verbose --all + - cargo test --verbose --all + +--- + +kind: pipeline +name: test-on-sse42 + +platform: + arch: amd64 + +steps: +- name: test + image: rust:1 + environment: + RUSTFLAGS: '-C target-cpu=native -C target-feature=-avx2' + commands: + - cargo build --verbose --all + - cargo test --verbose --all + +--- + +kind: pipeline +name: test-on-arm64 + +platform: + arch: arm64 + +steps: +- name: test + image: rust:1 + commands: + - cargo build --verbose --all + - cargo test --verbose --all From e2abd7a9656c8ff38ef8e9fd3f8f9a99b5baefdb Mon Sep 17 00:00:00 2001 From: "Heinz N. Gies" Date: Tue, 30 Jul 2019 21:57:52 +0200 Subject: [PATCH 3/9] update build status --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index e4f633af..91fad1c4 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ -# SIMD Json for Rust   [![Build Status]][circleci.com] [![Windows Build Status]][appveyor.com] [![Latest Version]][crates.io] +# SIMD Json for Rust   [![Build Status]][drone.io] [![Windows Build Status]][appveyor.com] [![Latest Version]][crates.io] -[Build Status]: https://circleci.com/gh/Licenser/simdjson-rs/tree/master.svg?style=svg -[circleci.com]: https://circleci.com/gh/Licenser/simdjson-rs/tree/master +[Build Status]: https://cloud.drone.io/api/badges/Licenser/simdjson-rs/status.svg +[drone.io]: https://cloud.drone.io/Licenser/simdjson-rs [Windows Build Status]: https://ci.appveyor.com/api/projects/status/0kf0v6hj5v2gite9?svg=true [appveyor.com]: https://ci.appveyor.com/project/Licenser/simdjson-rs [Latest Version]: https://img.shields.io/crates/v/simd-json.svg From 5113c8f7b71159381826ddfd90b912756e236115 Mon Sep 17 00:00:00 2001 From: "Heinz N. Gies" Date: Tue, 30 Jul 2019 22:07:27 +0200 Subject: [PATCH 4/9] unguard for sse4.2 to allow rust to polyfill on older platforms --- .drone.yml | 18 +++++++++++++++++- src/lib.rs | 6 +++--- src/stage2.rs | 2 +- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/.drone.yml b/.drone.yml index 7f69b6c1..9b8e8bb2 100644 --- a/.drone.yml +++ b/.drone.yml @@ -12,7 +12,6 @@ steps: commands: - cargo build --verbose --all - cargo test --verbose --all - --- kind: pipeline @@ -32,6 +31,23 @@ steps: --- +kind: pipeline +name: test-on-pre-sse42 + +platform: + arch: amd64 + +steps: +- name: test + image: rust:1 + environment: + RUSTFLAGS: '-C target-cpu=native -C target-feature=-avx2,-sse4.2' + commands: + - cargo build --verbose --all + - cargo test --verbose --all + +--- + kind: pipeline name: test-on-arm64 diff --git a/src/lib.rs b/src/lib.rs index f74e419a..02038553 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -89,11 +89,11 @@ pub use crate::avx2::deser::*; #[cfg(target_feature = "avx2")] use crate::avx2::stage1::SIMDJSON_PADDING; -#[cfg(all(target_feature = "sse4.2", not(target_feature = "avx2")))] +#[cfg(not(target_feature = "avx2"))] mod sse42; -#[cfg(all(target_feature = "sse4.2", not(target_feature = "avx2")))] +#[cfg(not(target_feature = "avx2"))] pub use crate::sse42::deser::*; -#[cfg(all(target_feature = "sse4.2", not(target_feature = "avx2")))] +#[cfg(not(target_feature = "avx2"))] use crate::sse42::stage1::SIMDJSON_PADDING; mod stage2; diff --git a/src/stage2.rs b/src/stage2.rs index 7bbb1da2..aaa51ad8 100644 --- a/src/stage2.rs +++ b/src/stage2.rs @@ -2,7 +2,7 @@ #[cfg(target_feature = "avx2")] use crate::avx2::stage1::SIMDJSON_PADDING; use crate::charutils::*; -#[cfg(all(target_feature = "sse4.2", not(target_feature = "avx2")))] +#[cfg(not(target_feature = "avx2"))] use crate::sse42::stage1::SIMDJSON_PADDING; use crate::{Deserializer, Error, ErrorType, Result}; From 50ba38a18f355c450b2b2629777eb0059841b06d Mon Sep 17 00:00:00 2001 From: "Heinz N. Gies" Date: Fri, 16 Aug 2019 18:20:49 +0200 Subject: [PATCH 5/9] Add more simd tests --- data/fail/fail70.json | 1 + data/fail/fail71.json | 1 + data/fail/fail72.json | 1 + data/fail/fail73.json | 1 + data/fail/fail74.json | 1 + data/fail/fail75.json | 1 + data/fail/fail76.json | 1 + tests/jsonchecker.rs | 11 +++++++++++ 8 files changed, 18 insertions(+) create mode 100644 data/fail/fail70.json create mode 100644 data/fail/fail71.json create mode 100644 data/fail/fail72.json create mode 100644 data/fail/fail73.json create mode 100644 data/fail/fail74.json create mode 100644 data/fail/fail75.json create mode 100644 data/fail/fail76.json diff --git a/data/fail/fail70.json b/data/fail/fail70.json new file mode 100644 index 00000000..e3d3437a --- /dev/null +++ b/data/fail/fail70.json @@ -0,0 +1 @@ +["string contains bad UTF-8 €"] diff --git a/data/fail/fail71.json b/data/fail/fail71.json new file mode 100644 index 00000000..9c03e5f0 --- /dev/null +++ b/data/fail/fail71.json @@ -0,0 +1 @@ +["contains bad UTF-8 €"] diff --git a/data/fail/fail72.json b/data/fail/fail72.json new file mode 100644 index 00000000..9c03e5f0 --- /dev/null +++ b/data/fail/fail72.json @@ -0,0 +1 @@ +["contains bad UTF-8 €"] diff --git a/data/fail/fail73.json b/data/fail/fail73.json new file mode 100644 index 00000000..2f27af72 --- /dev/null +++ b/data/fail/fail73.json @@ -0,0 +1 @@ +["bad UTF-8 €"] diff --git a/data/fail/fail74.json b/data/fail/fail74.json new file mode 100644 index 00000000..e4ad406d --- /dev/null +++ b/data/fail/fail74.json @@ -0,0 +1 @@ +["UTF-8 €"] diff --git a/data/fail/fail75.json b/data/fail/fail75.json new file mode 100644 index 00000000..04452784 --- /dev/null +++ b/data/fail/fail75.json @@ -0,0 +1 @@ +[" €"] diff --git a/data/fail/fail76.json b/data/fail/fail76.json new file mode 100644 index 00000000..c76c04c3 --- /dev/null +++ b/data/fail/fail76.json @@ -0,0 +1 @@ +["€"] diff --git a/tests/jsonchecker.rs b/tests/jsonchecker.rs index 2e29c1a0..ee7ed07a 100644 --- a/tests/jsonchecker.rs +++ b/tests/jsonchecker.rs @@ -149,6 +149,17 @@ fail!(fail67); fail!(fail68); fail!(fail69); +fail!(fail70); +fail!(fail71); +fail!(fail72); +fail!(fail73); +fail!(fail74); +fail!(fail75); +fail!(fail76); +//fail!(fail77); +//fail!(fail78); +//fail!(fail79); + crash!(crash000000); crash!(crash000001); crash!(crash000002); From 9f7b8a9d9f05059de8e35801b633a48800a1752c Mon Sep 17 00:00:00 2001 From: Sunny Gleason Date: Fri, 16 Aug 2019 16:22:52 -0400 Subject: [PATCH 6/9] RFC: Neon support (pretty much working) (#35) * feat: neon support * feat: temp stub replacements for neon intrinsics (pending https://github.com/rust-lang/stdarch/pull/792) * fix: drone CI rustup nightly * feat: fix guards, use rust stdlib for bit count operations * fix: remove double semicolon * feat: fancy generic generator functions, thanks @Licenser --- .drone.yml | 6 +- src/avx2/generator.rs | 51 ++++ src/avx2/mod.rs | 3 +- src/lib.rs | 34 ++- src/neon/deser.rs | 199 ++++++++++++++ src/neon/generator.rs | 48 ++++ src/neon/intrinsics.rs | 557 ++++++++++++++++++++++++++++++++++++++ src/neon/mod.rs | 7 + src/neon/simd.rs | 470 ++++++++++++++++++++++++++++++++ src/neon/simd_llvm.rs | 54 ++++ src/neon/stage1.rs | 598 +++++++++++++++++++++++++++++++++++++++++ src/neon/utf8check.rs | 253 +++++++++++++++++ src/numberparse.rs | 17 +- src/portability.rs | 30 +++ src/sse42/generator.rs | 51 ++++ src/sse42/mod.rs | 3 +- src/stage2.rs | 6 +- src/stringparse.rs | 6 + src/value.rs | 2 +- src/value/generator.rs | 98 +------ 20 files changed, 2393 insertions(+), 100 deletions(-) create mode 100644 src/avx2/generator.rs create mode 100644 src/neon/deser.rs create mode 100644 src/neon/generator.rs create mode 100644 src/neon/intrinsics.rs create mode 100644 src/neon/mod.rs create mode 100644 src/neon/simd.rs create mode 100644 src/neon/simd_llvm.rs create mode 100644 src/neon/stage1.rs create mode 100644 src/neon/utf8check.rs create mode 100644 src/portability.rs create mode 100644 src/sse42/generator.rs diff --git a/.drone.yml b/.drone.yml index 9b8e8bb2..dbc33acb 100644 --- a/.drone.yml +++ b/.drone.yml @@ -58,5 +58,7 @@ steps: - name: test image: rust:1 commands: - - cargo build --verbose --all - - cargo test --verbose --all + - rustup default nightly + - rustup update + - cargo clean && cargo +nightly build --verbose --all + - cargo +nightly test --verbose --all diff --git a/src/avx2/generator.rs b/src/avx2/generator.rs new file mode 100644 index 00000000..13e72061 --- /dev/null +++ b/src/avx2/generator.rs @@ -0,0 +1,51 @@ +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +use crate::value::generator::ESCAPED; +use std::io; + +#[inline(always)] +pub unsafe fn write_str_simd(writer: &mut W, string: &mut &[u8], len: &mut usize, idx: &mut usize) -> io::Result<()> where W: std::io::Write { + let zero = _mm256_set1_epi8(0); + let lower_quote_range = _mm256_set1_epi8(0x1F as i8); + let quote = _mm256_set1_epi8(b'"' as i8); + let backslash = _mm256_set1_epi8(b'\\' as i8); + while *len - *idx >= 32 { + // Load 32 bytes of data; + #[allow(clippy::cast_ptr_alignment)] + let data: __m256i = _mm256_loadu_si256(string.as_ptr().add(*idx) as *const __m256i); + // Test the data against being backslash and quote. + let bs_or_quote = _mm256_or_si256( + _mm256_cmpeq_epi8(data, backslash), + _mm256_cmpeq_epi8(data, quote), + ); + // Now mask the data with the quote range (0x1F). + let in_quote_range = _mm256_and_si256(data, lower_quote_range); + // then test of the data is unchanged. aka: xor it with the + // Any field that was inside the quote range it will be zero + // now. + let is_unchanged = _mm256_xor_si256(data, in_quote_range); + let in_range = _mm256_cmpeq_epi8(is_unchanged, zero); + let quote_bits = _mm256_movemask_epi8(_mm256_or_si256(bs_or_quote, in_range)); + if quote_bits != 0 { + let quote_dist = quote_bits.trailing_zeros() as usize; + stry!(writer.write_all(&string[0..*idx + quote_dist])); + let ch = string[*idx + quote_dist]; + match ESCAPED[ch as usize] { + b'u' => stry!(write!(writer, "\\u{:04x}", ch)), + + escape => stry!(writer.write_all(&[b'\\', escape])), + }; + *string = &string[*idx + quote_dist + 1..]; + *idx = 0; + *len = string.len(); + } else { + *idx += 32; + } + } + stry!(writer.write_all(&string[0..*idx])); + *string = &string[*idx..]; + Ok(()) +} \ No newline at end of file diff --git a/src/avx2/mod.rs b/src/avx2/mod.rs index 30c55c86..ac608ae2 100644 --- a/src/avx2/mod.rs +++ b/src/avx2/mod.rs @@ -1,3 +1,4 @@ pub mod deser; pub mod stage1; -pub mod utf8check; \ No newline at end of file +pub mod utf8check; +pub mod generator; \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index 02038553..b68deadb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,18 @@ #![deny(warnings)] + +#![cfg_attr(target_feature = "neon", feature( + asm, + stdsimd, + repr_simd, + custom_inner_attributes, + aarch64_target_feature, + platform_intrinsics, + stmt_expr_attributes, + simd_ffi, + link_llvm_intrinsics + ) +)] + #![cfg_attr(feature = "hints", feature(core_intrinsics))] //! simdjson-rs is a rust port of the simejson c++ library. It follows //! most of the design closely with a few exceptions to make it better @@ -89,17 +103,25 @@ pub use crate::avx2::deser::*; #[cfg(target_feature = "avx2")] use crate::avx2::stage1::SIMDJSON_PADDING; -#[cfg(not(target_feature = "avx2"))] +#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), not(target_feature = "avx2")))] mod sse42; -#[cfg(not(target_feature = "avx2"))] +#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), not(target_feature = "avx2")))] pub use crate::sse42::deser::*; -#[cfg(not(target_feature = "avx2"))] +#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), not(target_feature = "avx2")))] use crate::sse42::stage1::SIMDJSON_PADDING; +#[cfg(target_feature = "neon")] +mod neon; +#[cfg(target_feature = "neon")] +pub use crate::neon::deser::*; +#[cfg(target_feature = "neon")] +use crate::neon::stage1::SIMDJSON_PADDING; + mod stage2; pub mod value; use crate::numberparse::Number; +#[cfg(not(target_feature = "neon"))] use std::mem; use std::str; @@ -163,7 +185,11 @@ impl<'de> Deserializer<'de> { let counts = Deserializer::validate(input, &structural_indexes)?; - let strings = Vec::with_capacity(len + SIMDJSON_PADDING); + // Set length to allow slice access in ARM code + let mut strings = Vec::with_capacity(len + SIMDJSON_PADDING); + unsafe { + strings.set_len(len + SIMDJSON_PADDING); + } Ok(Deserializer { counts, diff --git a/src/neon/deser.rs b/src/neon/deser.rs new file mode 100644 index 00000000..5d70b7af --- /dev/null +++ b/src/neon/deser.rs @@ -0,0 +1,199 @@ + +pub use crate::error::{Error, ErrorType}; +pub use crate::Deserializer; +pub use crate::Result; +pub use crate::neon::stage1::*; +pub use crate::neon::utf8check::*; +pub use crate::neon::intrinsics::*; +pub use crate::stringparse::*; + +impl<'de> Deserializer<'de> { + #[cfg_attr(not(feature = "no-inline"), inline(always))] + pub fn parse_str_(&mut self) -> Result<&'de str> { + // Add 1 to skip the initial " + let idx = self.iidx + 1; + let mut padding = [0u8; 32]; + //let mut read: usize = 0; + + // we include the terminal '"' so we know where to end + // This is safe since we check sub's lenght in the range access above and only + // create sub sliced form sub to `sub.len()`. + + let src: &[u8] = unsafe { &self.input.get_unchecked(idx..) }; + let mut src_i: usize = 0; + let mut len = src_i; + loop { + // store to dest unconditionally - we can overwrite the bits we don't like + // later + + let (v0, v1) = if src.len() >= src_i + 32 { + // This is safe since we ensure src is at least 16 wide + #[allow(clippy::cast_ptr_alignment)] + unsafe { + ( + vld1q_u8(src.get_unchecked(src_i..src_i + 16).as_ptr()), + vld1q_u8(src.get_unchecked(src_i + 16..src_i + 32).as_ptr()), + ) + } + } else { + unsafe { + padding + .get_unchecked_mut(..src.len() - src_i) + .clone_from_slice(src.get_unchecked(src_i..)); + // This is safe since we ensure src is at least 32 wide + ( + vld1q_u8(padding.get_unchecked(0..16).as_ptr()), + vld1q_u8(padding.get_unchecked(16..32).as_ptr()), + ) + } + }; + + let ParseStringHelper { bs_bits, quote_bits } = find_bs_bits_and_quote_bits(v0, v1); + + if (bs_bits.wrapping_sub(1) & quote_bits) != 0 { + // we encountered quotes first. Move dst to point to quotes and exit + // find out where the quote is... + let quote_dist: u32 = quote_bits.trailing_zeros(); + + /////////////////////// + // Above, check for overflow in case someone has a crazy string (>=4GB?) + // But only add the overflow check when the document itself exceeds 4GB + // Currently unneeded because we refuse to parse docs larger or equal to 4GB. + //////////////////////// + + // we advance the point, accounting for the fact that we have a NULl termination + + len += quote_dist as usize; + unsafe { + let v = self.input.get_unchecked(idx..idx + len) as *const [u8] as *const str; + return Ok(&*v); + } + + // we compare the pointers since we care if they are 'at the same spot' + // not if they are the same value + } + if (quote_bits.wrapping_sub(1) & bs_bits) != 0 { + // Move to the 'bad' character + let bs_dist: u32 = bs_bits.trailing_zeros(); + len += bs_dist as usize; + src_i += bs_dist as usize; + break; + } else { + // they are the same. Since they can't co-occur, it means we encountered + // neither. + src_i += 32; + len += 32; + } + } + + let mut dst_i: usize = 0; + let dst: &mut [u8] = self.strings.as_mut_slice(); + + loop { + let (v0, v1) = if src.len() >= src_i + 32 { + // This is safe since we ensure src is at least 16 wide + #[allow(clippy::cast_ptr_alignment)] + unsafe { + ( + vld1q_u8(src.get_unchecked(src_i..src_i + 16).as_ptr()), + vld1q_u8(src.get_unchecked(src_i + 16..src_i + 32).as_ptr()), + ) + } + } else { + unsafe { + padding + .get_unchecked_mut(..src.len() - src_i) + .clone_from_slice(src.get_unchecked(src_i..)); + // This is safe since we ensure src is at least 32 wide + ( + vld1q_u8(padding.get_unchecked(0..16).as_ptr()), + vld1q_u8(padding.get_unchecked(16..32).as_ptr()), + ) + } + }; + + unsafe { + dst.get_unchecked_mut(dst_i..dst_i + 32).copy_from_slice(src.get_unchecked(src_i..src_i + 32)); + } + + // store to dest unconditionally - we can overwrite the bits we don't like + // later + let ParseStringHelper { bs_bits, quote_bits } = find_bs_bits_and_quote_bits(v0, v1); + + if (bs_bits.wrapping_sub(1) & quote_bits) != 0 { + // we encountered quotes first. Move dst to point to quotes and exit + // find out where the quote is... + let quote_dist: u32 = quote_bits.trailing_zeros(); + + /////////////////////// + // Above, check for overflow in case someone has a crazy string (>=4GB?) + // But only add the overflow check when the document itself exceeds 4GB + // Currently unneeded because we refuse to parse docs larger or equal to 4GB. + //////////////////////// + + // we advance the point, accounting for the fact that we have a NULl termination + + dst_i += quote_dist as usize; + unsafe { + self.input + .get_unchecked_mut(idx + len..idx + len + dst_i) + .clone_from_slice(&self.strings.get_unchecked(..dst_i)); + let v = self.input.get_unchecked(idx..idx + len + dst_i) as *const [u8] + as *const str; + self.str_offset += dst_i as usize; + return Ok(&*v); + } + + // we compare the pointers since we care if they are 'at the same spot' + // not if they are the same value + } + if (quote_bits.wrapping_sub(1) & bs_bits) != 0 { + // find out where the backspace is + let bs_dist: u32 = bs_bits.trailing_zeros(); + let escape_char: u8 = unsafe { *src.get_unchecked(src_i + bs_dist as usize + 1) }; + // we encountered backslash first. Handle backslash + if escape_char == b'u' { + // move src/dst up to the start; they will be further adjusted + // within the unicode codepoint handling code. + src_i += bs_dist as usize; + dst_i += bs_dist as usize; + let (o, s) = if let Ok(r) = handle_unicode_codepoint( + unsafe { src.get_unchecked(src_i..) }, + unsafe { dst.get_unchecked_mut(dst_i..) } + ) + { + r + } else { + return Err(self.error(ErrorType::InvlaidUnicodeCodepoint)); + }; + if o == 0 { + return Err(self.error(ErrorType::InvlaidUnicodeCodepoint)); + }; + // We moved o steps forword at the destiation and 6 on the source + src_i += s; + dst_i += o; + } else { + // simple 1:1 conversion. Will eat bs_dist+2 characters in input and + // write bs_dist+1 characters to output + // note this may reach beyond the part of the buffer we've actually + // seen. I think this is ok + let escape_result: u8 = + unsafe { *ESCAPE_MAP.get_unchecked(escape_char as usize) }; + if escape_result == 0 { + return Err(self.error(ErrorType::InvalidEscape)); + } + unsafe { + *dst.get_unchecked_mut(dst_i + bs_dist as usize) = escape_result; + } + src_i += bs_dist as usize + 2; + dst_i += bs_dist as usize + 1; + } + } else { + // they are the same. Since they can't co-occur, it means we encountered + // neither. + src_i += 32; + dst_i += 32; + } + } + } +} \ No newline at end of file diff --git a/src/neon/generator.rs b/src/neon/generator.rs new file mode 100644 index 00000000..6c8cf358 --- /dev/null +++ b/src/neon/generator.rs @@ -0,0 +1,48 @@ +use crate::value::generator::ESCAPED; +use std::io; +use crate::neon::intrinsics::*; +use crate::neon::stage1::neon_movemask; + +#[inline(always)] +pub unsafe fn write_str_simd(writer: &mut W, string: &mut &[u8], len: &mut usize, idx: &mut usize) -> io::Result<()> where W: std::io::Write { + // The case where we have a 16+ byte block + // we repeate the same logic as above but with + // only 16 bytes + let zero = vdupq_n_u8(0); + let lower_quote_range = vdupq_n_u8(0x1F); + let quote = vdupq_n_u8(b'"'); + let backslash = vdupq_n_u8(b'\\'); + while *len - *idx > 16 { + // Load 16 bytes of data; + let data: uint8x16_t = vld1q_u8(string.as_ptr().add(*idx)); + // Test the data against being backslash and quote. + let bs_or_quote = + vorrq_u8(vceqq_u8(data, backslash), vceqq_u8(data, quote)); + // Now mask the data with the quote range (0x1F). + let in_quote_range = vandq_u8(data, lower_quote_range); + // then test of the data is unchanged. aka: xor it with the + // Any field that was inside the quote range it will be zero + // now. + let is_unchanged = vxorrq_u8(data, in_quote_range); + let in_range = vceqq_u8(is_unchanged, zero); + let quote_bits = neon_movemask(vorrq_u8(bs_or_quote, in_range)); + if quote_bits != 0 { + let quote_dist = quote_bits.trailing_zeros() as usize; + stry!(writer.write_all(&string[0..*idx + quote_dist])); + let ch = string[*idx + quote_dist]; + match ESCAPED[ch as usize] { + b'u' => stry!(write!(writer, "\\u{:04x}", ch)), + + escape => stry!(writer.write_all(&[b'\\', escape])), + }; + *string = &string[*idx + quote_dist + 1..]; + *idx = 0; + *len = string.len(); + } else { + *idx += 16; + } + } + stry!(writer.write_all(&string[0..*idx])); + *string = &string[*idx..]; + Ok(()) +} diff --git a/src/neon/intrinsics.rs b/src/neon/intrinsics.rs new file mode 100644 index 00000000..c5c98cb6 --- /dev/null +++ b/src/neon/intrinsics.rs @@ -0,0 +1,557 @@ +//use std::arch:: + +use crate::neon::simd_llvm; + +use std::mem; +use core; + +#[allow(unused)] +macro_rules! types { + ($( + $(#[$doc:meta])* + pub struct $name:ident($($fields:tt)*); + )*) => ($( + $(#[$doc])* + #[derive(Copy, Clone, Debug)] + #[allow(non_camel_case_types)] + #[repr(simd)] + #[allow(clippy::missing_inline_in_public_items)] + pub struct $name($($fields)*); + )*) +} + +#[allow(non_camel_case_types)] +pub type poly64_t = i64; + +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.aarch64.neon.addp.v16u8"] + fn vpaddq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; + #[link_name = "llvm.aarch64.neon.pmull64"] + fn vmull_p64_(a: i64, b: i64) -> int8x16_t; + #[link_name = "llvm.aarch64.neon.uqxtn.v2u32"] + fn vqmovn_u64_(a: uint64x2_t) -> uint32x2_t; + #[link_name = "llvm.aarch64.neon.uqsub.v16u8"] + fn vqsubq_u8_(a: uint8x16_t, a: uint8x16_t) -> uint8x16_t; + #[link_name = "llvm.aarch64.neon.uqsub.v16i8"] + fn vqsubq_s8_(a: int8x16_t, a: int8x16_t) -> int8x16_t; +} + +#[inline] +unsafe fn vaddq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + simd_llvm::simd_add(mem::transmute(a), mem::transmute(b)) +} + +#[inline] +unsafe fn vaddq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t { + simd_llvm::simd_add(mem::transmute(a), mem::transmute(b)) +} + +#[inline] +unsafe fn vaddq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t { + simd_llvm::simd_add(mem::transmute(a), mem::transmute(b)) +} + +#[inline] +pub unsafe fn vnegq_u8(a: uint8x16_t) -> uint8x16_t { + let x: u128 = mem::transmute(a); + let nx = !x; + mem::transmute(nx) +} + +#[inline] +pub unsafe fn vnegq_s8(a: int8x16_t) -> int8x16_t { + let x: u128 = mem::transmute(a); + let nx = !x; + mem::transmute(nx) +} + + +#[inline] +fn rotate_(a: u128, b: u128, n: u128) -> u128 { + let az = a >> (n * 8); + let bz = b << (128 - (n * 8)); + az | bz +} + +#[inline] +pub unsafe fn vextq_u8(a: uint8x16_t, b: uint8x16_t, n: u8) -> uint8x16_t { + mem::transmute(rotate_(mem::transmute(a), mem::transmute(b), n as u128)) +} + +#[inline] +pub unsafe fn vextq_s8(a: int8x16_t, b: int8x16_t, n: u8) -> int8x16_t { + mem::transmute(rotate_(mem::transmute(a), mem::transmute(b), n as u128)) +} + +#[inline] +pub unsafe fn vmull_p64(a: poly64_t, b: poly64_t) -> poly128_t { + mem::transmute(vmull_p64_(mem::transmute(a), mem::transmute(b))) +} + +#[inline] +pub fn vpaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + unsafe { vpaddq_u8_(a, b) } +} + +#[inline] +pub unsafe fn vshrq_n_u8(a: uint8x16_t, n: u8) -> uint8x16_t { + uint8x16_t( + a.0 >> n, + a.1 >> n, + a.2 >> n, + a.3 >> n, + a.4 >> n, + a.5 >> n, + a.6 >> n, + a.7 >> n, + a.8 >> n, + a.9 >> n, + a.10 >> n, + a.11 >> n, + a.12 >> n, + a.13 >> n, + a.14 >> n, + a.15 >> n, + ) +} + +types! { + /// ARM-specific 64-bit wide vector of eight packed `i8`. + pub struct int8x8_t(i8, i8, i8, i8, i8, i8, i8, i8); + /// ARM-specific 64-bit wide vector of eight packed `u8`. + pub struct uint8x8_t(u8, u8, u8, u8, u8, u8, u8, u8); + /// ARM-specific 64-bit wide polynomial vector of eight packed `u8`. + pub struct poly8x8_t(u8, u8, u8, u8, u8, u8, u8, u8); + /// ARM-specific 64-bit wide vector of four packed `i16`. + pub struct int16x4_t(i16, i16, i16, i16); + /// ARM-specific 64-bit wide vector of four packed `u16`. + pub struct uint16x4_t(u16, u16, u16, u16); + /// ARM-specific 64-bit wide vector of four packed `u16`. + pub struct poly16x4_t(u16, u16, u16, u16); + /// ARM-specific 64-bit wide vector of two packed `i32`. + pub struct int32x2_t(i32, i32); + /// ARM-specific 64-bit wide vector of two packed `u32`. + pub struct uint32x2_t(u32, u32); + /// ARM-specific 64-bit wide vector of two packed `f32`. + pub struct float32x2_t(f32, f32); + /// ARM-specific 64-bit wide vector of one packed `i64`. + pub struct int64x1_t(i64); + /// ARM-specific 64-bit wide vector of one packed `u64`. + pub struct uint64x1_t(u64); + /// ARM-specific 128-bit wide vector of sixteen packed `i8`. + pub struct int8x16_t( + i8, i8 ,i8, i8, i8, i8 ,i8, i8, + i8, i8 ,i8, i8, i8, i8 ,i8, i8, + ); + /// ARM-specific 128-bit wide vector of sixteen packed `u8`. + pub struct uint8x16_t( + u8, u8 ,u8, u8, u8, u8 ,u8, u8, + u8, u8 ,u8, u8, u8, u8 ,u8, u8, + ); + /// ARM-specific 128-bit wide vector of sixteen packed `u8`. + pub struct poly8x16_t( + u8, u8, u8, u8, u8, u8, u8, u8, + u8, u8, u8, u8, u8, u8, u8, u8 + ); + /// ARM-specific 128-bit wide vector of eight packed `i16`. + pub struct int16x8_t(i16, i16, i16, i16, i16, i16, i16, i16); + /// ARM-specific 128-bit wide vector of eight packed `u16`. + pub struct uint16x8_t(u16, u16, u16, u16, u16, u16, u16, u16); + /// ARM-specific 128-bit wide vector of eight packed `u16`. + pub struct poly16x8_t(u16, u16, u16, u16, u16, u16, u16, u16); + /// ARM-specific 128-bit wide vector of four packed `i32`. + pub struct int32x4_t(i32, i32, i32, i32); + /// ARM-specific 128-bit wide vector of four packed `u32`. + pub struct uint32x4_t(u32, u32, u32, u32); + /// ARM-specific 128-bit wide vector of four packed `f32`. + pub struct float32x4_t(f32, f32, f32, f32); + /// ARM-specific 128-bit wide vector of two packed `i64`. + pub struct int64x2_t(i64, i64); + /// ARM-specific 128-bit wide vector of two packed `u64`. + pub struct uint64x2_t(u64, u64); + /// ARM-specific 128-bit wide vector of one packed `i128`. + pub struct poly128_t(i128); // FIXME: check this! +} + +impl uint8x16_t { + #[inline] + pub fn new(a: u8, b: u8, c: u8, d: u8, e: u8, f: u8, g: u8, h: u8, i: u8, j: u8, k: u8, l: u8, m: u8, n: u8, o: u8, p: u8) -> uint8x16_t { + uint8x16_t(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) + } +} + +impl int8x16_t { + #[inline] + pub fn new(a: i8, b: i8, c: i8, d: i8, e: i8, f: i8, g: i8, h: i8, i: i8, j: i8, k: i8, l: i8, m: i8, n: i8, o: i8, p: i8) -> int8x16_t { + int8x16_t(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) + } +} + +impl int32x4_t { + #[inline] + pub fn new(a: i32, b: i32, c: i32, d: i32) -> int32x4_t { + int32x4_t(a, b, c, d) + } +} + +//#[inline] +//pub fn add_overflow(a: u64, b: u64, out: &mut u64) -> bool { +// let (carry, did_carry) = a.overflowing_add(b); +// *out = carry; +// did_carry +//} + +#[inline] +pub unsafe fn vld1q_s8(addr: *const i8) -> int8x16_t { + *(addr as *const int8x16_t) +} + +#[inline] +pub unsafe fn vld1q_u8(addr: *const u8) -> uint8x16_t { + *(addr as *const uint8x16_t) +} + +#[inline] +pub unsafe fn vst1q_u8(addr: *mut u8, val: uint8x16_t) { + std::ptr::write(addr as *mut uint8x16_t, val); +} + +macro_rules! aarch64_simd_2 { + ($name: ident, $type: ty, $simd_fn: ident, $intrarm: ident, $intraarch: ident) => { + aarch64_simd_2!($name, $type, $type, $simd_fn, $intrarm, $intraarch); + }; + ($name: ident, $type: ty, $res: ty, $simd_fn: ident, $intrarm: ident, $intraarch: ident) => { + #[inline] + pub fn $name(a: $type, b: $type) -> $res { + unsafe { simd_llvm::$simd_fn(a, b) } + } + } +} + +macro_rules! aarch64_simd_ceq { + ($name: ident, $type: ty, $res: ty) => { + /// Compare bitwise Equal (vector) + aarch64_simd_2!($name, $type, $res, simd_eq, cmeq, cmeq); + }; +} + +aarch64_simd_ceq!(vceq_s8, int8x8_t, uint8x8_t); +aarch64_simd_ceq!(vceqq_s8, int8x16_t, uint8x16_t); +aarch64_simd_ceq!(vceq_s16, int16x4_t, uint16x4_t); +aarch64_simd_ceq!(vceqq_s16, int16x8_t, uint16x8_t); +aarch64_simd_ceq!(vceq_s32, int32x2_t, uint32x2_t); +aarch64_simd_ceq!(vceqq_s32, int32x4_t, uint32x4_t); +aarch64_simd_ceq!(vceq_u8, uint8x8_t, uint8x8_t); +aarch64_simd_ceq!(vceqq_u8, uint8x16_t, uint8x16_t); +aarch64_simd_ceq!(vceq_u16, uint16x4_t, uint16x4_t); +aarch64_simd_ceq!(vceqq_u16, uint16x8_t, uint16x8_t); +aarch64_simd_ceq!(vceq_u32, uint32x2_t, uint32x2_t); +aarch64_simd_ceq!(vceqq_u32, uint32x4_t, uint32x4_t); +aarch64_simd_2!(vceq_f32, float32x2_t, uint32x2_t, simd_eq, fcmeq, fcmeq); +aarch64_simd_2!(vceqq_f32, float32x4_t, uint32x4_t, simd_eq, fcmeq, fcmeq); +aarch64_simd_ceq!(vceq_p8, poly8x8_t, poly8x8_t); +aarch64_simd_ceq!(vceqq_p8, poly8x16_t, poly8x16_t); + +macro_rules! aarch64_simd_cgt { + ($name:ident, $type:ty, $res:ty) => { + /// Compare signed Greater than (vector) + aarch64_simd_2!($name, $type, $res, simd_gt, cmgt, cmgt); + }; +} + +//macro_rules! aarch64_simd_cgtu { +// ($name: ident, $type: ty) => { +// /// Compare Greater than (vector) +// aarch64_simd_2!($name, $type, simd_gt, cmhi); +// }; +//} + +aarch64_simd_cgt!(vcgt_s8, int8x8_t, uint8x8_t); +aarch64_simd_cgt!(vcgtq_s8, int8x16_t, uint8x16_t); +aarch64_simd_cgt!(vcgt_s16, int16x4_t, uint16x4_t); +aarch64_simd_cgt!(vcgtq_s16, int16x8_t, uint16x8_t); +aarch64_simd_cgt!(vcgt_s32, int32x2_t, uint32x2_t); +aarch64_simd_cgt!(vcgtq_s32, int32x4_t, uint32x4_t); + +//aarch64_simd_cgtu!(vcgtq_u8, uint8x16_t); +//aarch64_simd_cgt!(vcgt_s64, int64x1_t); +//aarch64_simd_cgt!(vcgtq_s64, int64x2_t); +//aarch64_simd_cgtu!(vcgt_u64, uint64x1_t); +//aarch64_simd_cgtu!(vcgtq_u64, uint64x2_t); + +macro_rules! aarch64_simd_clt { + ($name:ident, $type:ty, $res:ty) => { + /// Compare signed Lesser than (vector) + aarch64_simd_2!($name, $type, $res, simd_lt, cmgt, cmgt); + }; +} + +//macro_rules! aarch64_simd_cltu { +//( $ name: ident, $ type: ty) => { +///// Compare Lesser than (vector) +//aarch64_simd_2 ! ( $ name, $ type, simd_lt, cmhi); +//}; +//} + +aarch64_simd_clt!(vclt_s8, int8x8_t, uint8x8_t); +aarch64_simd_clt!(vcltq_s8, int8x16_t, uint8x16_t); +aarch64_simd_clt!(vclt_s16, int16x4_t, uint16x4_t); +aarch64_simd_clt!(vcltq_s16, int16x8_t, uint16x8_t); +aarch64_simd_clt!(vclt_s32, int32x2_t, uint32x2_t); +aarch64_simd_clt!(vcltq_s32, int32x4_t, uint32x4_t); + +//arm_simd_cltu!(vclt_u8, uint8x8_t); +//arm_simd_cltu!(vcltq_u8, uint8x16_t); +//arm_simd_cltu!(vclt_u16, uint16x4_t); +//arm_simd_cltu!(vcltq_u16, uint16x8_t); +//arm_simd_cltu!(vclt_u32, uint32x2_t); +//arm_simd_cltu!(vcltq_u32, uint32x4_t); + +macro_rules! aarch64_simd_cge { + ($name:ident, $type:ty, $res:ty) => { + /// Compare signed Greater than equals (vector) + aarch64_simd_2!($name, $type, $res, simd_ge, cmge, cmge); + }; +} + +//macro_rules! aarch64_simd_cgeu { +//( $ name: ident, $ type: ty) => { +///// Compare Greater than (vector) +//aarch64_simd_2 ! ( $ name, $ type, simd_ge, cmhs); +//}; +//} + +aarch64_simd_cge!(vcge_s8, int8x8_t, uint8x8_t); +aarch64_simd_cge!(vcgeq_s8, int8x16_t, uint8x16_t); +aarch64_simd_cge!(vcge_s16, int16x4_t, uint16x4_t); +aarch64_simd_cge!(vcgeq_s16, int16x8_t, uint16x8_t); +aarch64_simd_cge!(vcge_s32, int32x2_t, uint32x2_t); +aarch64_simd_cge!(vcgeq_s32, int32x4_t, uint32x4_t); +//arm_simd_cgeu!(vcge_u8, uint8x8_t); +//arm_simd_cgeu!(vcgeq_u8, uint8x16_t); +//arm_simd_cgeu!(vcge_u16, uint16x4_t); +//arm_simd_cgeu!(vcgeq_u16, uint16x8_t); +//arm_simd_cgeu!(vcge_u32, uint32x2_t); +//arm_simd_cgeu!(vcgeq_u32, uint32x4_t); + +macro_rules! aarch64_simd_cle { + ($name:ident, $type:ty, $res:ty) => { + /// Compare signed Lesser than equals (vector) + aarch64_simd_2!($name, $type, $res, simd_le, cmge, cmge); + }; +} + +//macro_rules! aarch64_simd_cleu { +//( $ name: ident, $ type: ty) => { +///// Compare Lesser than (vector) +//aarch64_simd_2 ! ( $ name, $ type, simd_le, cmhs); +//}; +//} + +aarch64_simd_cle!(vcle_s8, int8x8_t, uint8x8_t); +aarch64_simd_cle!(vcleq_s8, int8x16_t, uint8x16_t); +aarch64_simd_cle!(vcle_s16, int16x4_t, uint16x4_t); +aarch64_simd_cle!(vcleq_s16, int16x8_t, uint16x8_t); +aarch64_simd_cle!(vcle_s32, int32x2_t, uint32x2_t); +aarch64_simd_cle!(vcleq_s32, int32x4_t, uint32x4_t); +//arm_simd_cleu!(vcle_u8, uint8x8_t); +aarch64_simd_cle!(vcleq_u8, uint8x16_t, uint8x16_t); +//arm_simd_cleu!(vcle_u16, uint16x4_t); +//arm_simd_cleu!(vcleq_u16, uint16x8_t); +//arm_simd_cleu!(vcle_u32, uint32x2_t); +//arm_simd_cleu!(vcleq_u32, uint32x4_t); + +#[inline] +pub fn vdupq_n_s8(a: i8) -> int8x16_t { + int8x16_t(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) +} + +#[inline] +pub fn zeroi8x16() -> int8x16_t { + int8x16_t(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00) +} + +#[inline] +pub fn vdupq_n_u8(a: u8) -> uint8x16_t { + uint8x16_t(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) +} + +#[inline] +pub fn vmovq_n_u8(a: u8) -> uint8x16_t { + uint8x16_t(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) +} + +#[inline] +pub fn vmovq_n_s8(a: i8) -> int8x16_t { + int8x16_t(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) +} + +#[inline] +pub fn zerou8x16() -> uint8x16_t { + uint8x16_t(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00) +} + +#[inline] +pub unsafe fn vaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + mem::transmute(vaddq_u8_(mem::transmute(a), mem::transmute(b))) +} + +#[inline] +pub unsafe fn vaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + mem::transmute(vaddq_s8_(mem::transmute(a), mem::transmute(b))) +} + +#[inline] +pub unsafe fn vaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + mem::transmute(vaddq_s32_(mem::transmute(a), mem::transmute(b))) +} + +#[inline] +pub fn vandq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { unsafe { simd_llvm::simd_and(a, b) } } +#[inline] +pub fn vandq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { unsafe { simd_llvm::simd_and(a, b) } } +#[inline] +pub fn vandq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { unsafe { simd_llvm::simd_and(a, b) } } +#[inline] +pub fn vorrq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { unsafe { simd_llvm::simd_or(a, b) } } +#[inline] +pub fn vorrq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { unsafe { simd_llvm::simd_or(a, b) } } +#[inline] +pub fn vxorrq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { unsafe { simd_llvm::simd_xor(a, b) } } +#[inline] +pub fn vxorrq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { unsafe { simd_llvm::simd_xor(a, b) } } + +macro_rules! arm_reinterpret { + ($name: ident, $from: ty, $to: ty) => { + // Vector reinterpret cast operation + #[inline] + pub fn $name(a: $from) -> $to { + unsafe { mem::transmute(a) } + } + }; +} + +arm_reinterpret!(vreinterpret_u64_u32, uint32x2_t, uint64x1_t); +arm_reinterpret!(vreinterpretq_u64_u32, uint32x4_t, uint64x2_t); +arm_reinterpret!(vreinterpretq_s8_u8, uint8x16_t, int8x16_t); +arm_reinterpret!(vreinterpretq_u16_u8, uint8x16_t, uint16x8_t); +arm_reinterpret!(vreinterpretq_u32_u8, uint8x16_t, uint32x4_t); +arm_reinterpret!(vreinterpretq_u64_u8, uint8x16_t, uint64x2_t); +arm_reinterpret!(vreinterpretq_u64_s8, int8x16_t, uint64x2_t); +arm_reinterpret!(vreinterpretq_u8_s8, int8x16_t, uint8x16_t); + +arm_reinterpret!(vreinterpretq_s16_s8, int8x16_t, int16x8_t); +arm_reinterpret!(vreinterpretq_s32_s8, int8x16_t, int32x4_t); +arm_reinterpret!(vreinterpretq_s64_s8, int8x16_t, int64x2_t); + +macro_rules! arm_vget_lane { + ($name: ident, $to: ty, $from: ty, $lanes: literal) => { + #[inline] + pub unsafe fn $name(v: $from, lane: u32) -> $ to { + simd_llvm::simd_extract(v, lane) + } + }; +} + +arm_vget_lane!(vgetq_lane_u16, u16, uint16x8_t, 7); +arm_vget_lane!(vgetq_lane_u32, u32, uint32x4_t, 3); +arm_vget_lane!(vgetq_lane_u64, u64, uint64x2_t, 1); +arm_vget_lane!(vget_lane_u64, u64, uint64x1_t, 0); + +arm_vget_lane!(vgetq_lane_s16, i16, int16x8_t, 7); +arm_vget_lane!(vgetq_lane_s32, i32, int32x4_t, 3); +arm_vget_lane!(vgetq_lane_s64, i64, int64x2_t, 1); +arm_vget_lane!(vget_lane_s64, i64, int64x1_t, 0); + +#[inline] +pub unsafe fn vqmovn_u64(a: uint64x2_t) -> uint32x2_t { + vqmovn_u64_(a) +} + +#[inline] +pub unsafe fn vqtbl1q_s8(t: int8x16_t, idx: uint8x16_t) -> int8x16_t { + mem::transmute(core::arch::aarch64::vqtbl1q_s8(mem::transmute(t), mem::transmute(idx))) +} + +#[inline] +pub unsafe fn vqtbl1q_u8(t: uint8x16_t, idx: uint8x16_t) -> uint8x16_t { + mem::transmute(core::arch::aarch64::vqtbl1q_s8(mem::transmute(t), mem::transmute(idx))) +} + +#[inline] +pub unsafe fn vqsubq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + vqsubq_u8_(a, b) +} + +#[inline] +pub unsafe fn vqsubq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + vqsubq_s8_(a, b) +} + +#[inline] +fn test_u8(a: u8, b: u8) -> u8 { + if a & b != 0 { + 0xFF + } else { + 0x00 + } +} + +#[inline] +pub unsafe fn vtstq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + uint8x16_t( + test_u8(a.0, b.0), + test_u8(a.1, b.1), + test_u8(a.2, b.2), + test_u8(a.3, b.3), + test_u8(a.4, b.4), + test_u8(a.5, b.5), + test_u8(a.6, b.6), + test_u8(a.7, b.7), + test_u8(a.8, b.8), + test_u8(a.9, b.9), + test_u8(a.10, b.10), + test_u8(a.11, b.11), + test_u8(a.12, b.12), + test_u8(a.13, b.13), + test_u8(a.14, b.14), + test_u8(a.15, b.15), + ) +} + +#[inline] +fn test_s8(a: i8, b: i8) -> i8 { + if a & b != 0 { + -1 + } else { + 0x00 + } +} + +#[inline] +pub unsafe fn vtstq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + int8x16_t( + test_s8(a.0, b.0), + test_s8(a.1, b.1), + test_s8(a.2, b.2), + test_s8(a.3, b.3), + test_s8(a.4, b.4), + test_s8(a.5, b.5), + test_s8(a.6, b.6), + test_s8(a.7, b.7), + test_s8(a.8, b.8), + test_s8(a.9, b.9), + test_s8(a.10, b.10), + test_s8(a.11, b.11), + test_s8(a.12, b.12), + test_s8(a.13, b.13), + test_s8(a.14, b.14), + test_s8(a.15, b.15), + ) +} + +#[inline] +pub unsafe fn vst1q_u32(addr: *mut u8, val: uint32x4_t) { + std::ptr::write(addr as *mut uint32x4_t, val) +} diff --git a/src/neon/mod.rs b/src/neon/mod.rs new file mode 100644 index 00000000..f7868249 --- /dev/null +++ b/src/neon/mod.rs @@ -0,0 +1,7 @@ +pub mod deser; +pub mod stage1; +pub mod utf8check; +pub mod generator; +mod simd; +mod simd_llvm; +mod intrinsics; \ No newline at end of file diff --git a/src/neon/simd.rs b/src/neon/simd.rs new file mode 100644 index 00000000..8a5a21fc --- /dev/null +++ b/src/neon/simd.rs @@ -0,0 +1,470 @@ +#![allow(non_camel_case_types)] +#![allow(unused)] + +use crate::neon::simd_llvm; + +macro_rules! simd_ty { + ($id:ident [$ety:ident]: $($elem_ty:ident),* | $($elem_name:ident),*) => { + #[repr(simd)] + #[derive(Copy, Clone, Debug, PartialEq)] + pub(crate) struct $id($(pub $elem_ty),*); + + #[allow(clippy::use_self)] + impl $id { + #[inline] + pub(crate) const fn new($($elem_name: $elem_ty),*) -> Self { + $id($($elem_name),*) + } + // FIXME: Workaround rust@60637 + #[inline(always)] + pub(crate) const fn splat(value: $ety) -> Self { + $id($({ + #[allow(non_camel_case_types, dead_code)] + struct $elem_name; + value + }),*) + } + + // FIXME: Workaround rust@60637 + #[inline(always)] + pub(crate) fn extract(self, index: usize) -> $ety { + unsafe { + simd_llvm::simd_extract(self, index as u32) + } + } + } + } +} + +macro_rules! simd_m_ty { + ($id:ident [$ety:ident]: $($elem_ty:ident),* | $($elem_name:ident),*) => { + #[repr(simd)] + #[derive(Copy, Clone, Debug, PartialEq)] + pub(crate) struct $id($(pub $elem_ty),*); + + #[allow(clippy::use_self)] + impl $id { + #[inline] + const fn bool_to_internal(x: bool) -> $ety { + [0 as $ety, !(0 as $ety)][x as usize] + } + + #[inline] + pub(crate) const fn new($($elem_name: bool),*) -> Self { + $id($(Self::bool_to_internal($elem_name)),*) + } + + // FIXME: Workaround rust@60637 + #[inline(always)] + pub(crate) const fn splat(value: bool) -> Self { + $id($({ + #[allow(non_camel_case_types, dead_code)] + struct $elem_name; + Self::bool_to_internal(value) + }),*) + } + + // FIXME: Workaround rust@60637 + #[inline(always)] + pub(crate) fn extract(self, index: usize) -> bool { + let r: $ety = unsafe { + simd_llvm::simd_extract(self, index as u32) + }; + r != 0 + } + } + } +} + +// 16-bit wide types: + +simd_ty!(u8x2[u8]: u8, u8 | x0, x1); +simd_ty!(i8x2[i8]: i8, i8 | x0, x1); + +// 32-bit wide types: + +simd_ty!(u8x4[u8]: u8, u8, u8, u8 | x0, x1, x2, x3); +simd_ty!(u16x2[u16]: u16, u16 | x0, x1); + +simd_ty!(i8x4[i8]: i8, i8, i8, i8 | x0, x1, x2, x3); +simd_ty!(i16x2[i16]: i16, i16 | x0, x1); + +// 64-bit wide types: + +simd_ty!(u8x8[u8]: + u8, u8, u8, u8, u8, u8, u8, u8 + | x0, x1, x2, x3, x4, x5, x6, x7); +simd_ty!(u16x4[u16]: u16, u16, u16, u16 | x0, x1, x2, x3); +simd_ty!(u32x2[u32]: u32, u32 | x0, x1); +simd_ty!(u64x1[u64]: u64 | x1); + +simd_ty!(i8x8[i8]: + i8, i8, i8, i8, i8, i8, i8, i8 + | x0, x1, x2, x3, x4, x5, x6, x7); +simd_ty!(i16x4[i16]: i16, i16, i16, i16 | x0, x1, x2, x3); +simd_ty!(i32x2[i32]: i32, i32 | x0, x1); +simd_ty!(i64x1[i64]: i64 | x1); + +simd_ty!(f32x2[f32]: f32, f32 | x0, x1); + +// 128-bit wide types: + +simd_ty!(u8x16[u8]: + u8, u8, u8, u8, u8, u8, u8, u8, + u8, u8, u8, u8, u8, u8, u8, u8 + | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 +); +simd_ty!(u16x8[u16]: + u16, u16, u16, u16, u16, u16, u16, u16 + | x0, x1, x2, x3, x4, x5, x6, x7); +simd_ty!(u32x4[u32]: u32, u32, u32, u32 | x0, x1, x2, x3); +simd_ty!(u64x2[u64]: u64, u64 | x0, x1); + +simd_ty!(i8x16[i8]: + i8, i8, i8, i8, i8, i8, i8, i8, + i8, i8, i8, i8, i8, i8, i8, i8 + | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 +); +simd_ty!(i16x8[i16]: + i16, i16, i16, i16, i16, i16, i16, i16 + | x0, x1, x2, x3, x4, x5, x6, x7); +simd_ty!(i32x4[i32]: i32, i32, i32, i32 | x0, x1, x2, x3); +simd_ty!(i64x2[i64]: i64, i64 | x0, x1); + +simd_ty!(f32x4[f32]: f32, f32, f32, f32 | x0, x1, x2, x3); +simd_ty!(f64x2[f64]: f64, f64 | x0, x1); + +simd_m_ty!(m8x16[i8]: + i8, i8, i8, i8, i8, i8, i8, i8, + i8, i8, i8, i8, i8, i8, i8, i8 + | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 +); +simd_m_ty!(m16x8[i16]: + i16, i16, i16, i16, i16, i16, i16, i16 + | x0, x1, x2, x3, x4, x5, x6, x7); +simd_m_ty!(m32x4[i32]: i32, i32, i32, i32 | x0, x1, x2, x3); +simd_m_ty!(m64x2[i64]: i64, i64 | x0, x1); + +// 256-bit wide types: + +simd_ty!(u8x32[u8]: + u8, u8, u8, u8, u8, u8, u8, u8, + u8, u8, u8, u8, u8, u8, u8, u8, + u8, u8, u8, u8, u8, u8, u8, u8, + u8, u8, u8, u8, u8, u8, u8, u8 + | x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15, + x16, x17, x18, x19, x20, x21, x22, x23, + x24, x25, x26, x27, x28, x29, x30, x31 +); +simd_ty!(u16x16[u16]: + u16, u16, u16, u16, u16, u16, u16, u16, + u16, u16, u16, u16, u16, u16, u16, u16 + | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 +); +simd_ty!(u32x8[u32]: + u32, u32, u32, u32, u32, u32, u32, u32 + | x0, x1, x2, x3, x4, x5, x6, x7); +simd_ty!(u64x4[u64]: u64, u64, u64, u64 | x0, x1, x2, x3); + +simd_ty!(i8x32[i8]: + i8, i8, i8, i8, i8, i8, i8, i8, + i8, i8, i8, i8, i8, i8, i8, i8, + i8, i8, i8, i8, i8, i8, i8, i8, + i8, i8, i8, i8, i8, i8, i8, i8 + | x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15, + x16, x17, x18, x19, x20, x21, x22, x23, + x24, x25, x26, x27, x28, x29, x30, x31 +); +simd_ty!(i16x16[i16]: + i16, i16, i16, i16, i16, i16, i16, i16, + i16, i16, i16, i16, i16, i16, i16, i16 + | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 +); +simd_ty!(i32x8[i32]: + i32, i32, i32, i32, i32, i32, i32, i32 + | x0, x1, x2, x3, x4, x5, x6, x7); +simd_ty!(i64x4[i64]: i64, i64, i64, i64 | x0, x1, x2, x3); + +simd_ty!(f32x8[f32]: + f32, f32, f32, f32, f32, f32, f32, f32 | + x0, x1, x2, x3, x4, x5, x6, x7); + +// 512-bit wide types: + +simd_ty!(i32x16[i32]: + i32, i32, i32, i32, i32, i32, i32, i32, + i32, i32, i32, i32, i32, i32, i32, i32 + | x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15); + +simd_ty!(i64x8[i64]: + i64, i64, i64, i64, i64, i64, i64, i64 + | x0, x1, x2, x3, x4, x5, x6, x7); + +#[allow(unused)] +#[macro_export] +macro_rules! constify_imm8 { + ($imm8:expr, $expand:ident) => { + #[allow(overflowing_literals)] + match ($imm8) & 0b1111_1111 { + 0 => $expand!(0), + 1 => $expand!(1), + 2 => $expand!(2), + 3 => $expand!(3), + 4 => $expand!(4), + 5 => $expand!(5), + 6 => $expand!(6), + 7 => $expand!(7), + 8 => $expand!(8), + 9 => $expand!(9), + 10 => $expand!(10), + 11 => $expand!(11), + 12 => $expand!(12), + 13 => $expand!(13), + 14 => $expand!(14), + 15 => $expand!(15), + 16 => $expand!(16), + 17 => $expand!(17), + 18 => $expand!(18), + 19 => $expand!(19), + 20 => $expand!(20), + 21 => $expand!(21), + 22 => $expand!(22), + 23 => $expand!(23), + 24 => $expand!(24), + 25 => $expand!(25), + 26 => $expand!(26), + 27 => $expand!(27), + 28 => $expand!(28), + 29 => $expand!(29), + 30 => $expand!(30), + 31 => $expand!(31), + 32 => $expand!(32), + 33 => $expand!(33), + 34 => $expand!(34), + 35 => $expand!(35), + 36 => $expand!(36), + 37 => $expand!(37), + 38 => $expand!(38), + 39 => $expand!(39), + 40 => $expand!(40), + 41 => $expand!(41), + 42 => $expand!(42), + 43 => $expand!(43), + 44 => $expand!(44), + 45 => $expand!(45), + 46 => $expand!(46), + 47 => $expand!(47), + 48 => $expand!(48), + 49 => $expand!(49), + 50 => $expand!(50), + 51 => $expand!(51), + 52 => $expand!(52), + 53 => $expand!(53), + 54 => $expand!(54), + 55 => $expand!(55), + 56 => $expand!(56), + 57 => $expand!(57), + 58 => $expand!(58), + 59 => $expand!(59), + 60 => $expand!(60), + 61 => $expand!(61), + 62 => $expand!(62), + 63 => $expand!(63), + 64 => $expand!(64), + 65 => $expand!(65), + 66 => $expand!(66), + 67 => $expand!(67), + 68 => $expand!(68), + 69 => $expand!(69), + 70 => $expand!(70), + 71 => $expand!(71), + 72 => $expand!(72), + 73 => $expand!(73), + 74 => $expand!(74), + 75 => $expand!(75), + 76 => $expand!(76), + 77 => $expand!(77), + 78 => $expand!(78), + 79 => $expand!(79), + 80 => $expand!(80), + 81 => $expand!(81), + 82 => $expand!(82), + 83 => $expand!(83), + 84 => $expand!(84), + 85 => $expand!(85), + 86 => $expand!(86), + 87 => $expand!(87), + 88 => $expand!(88), + 89 => $expand!(89), + 90 => $expand!(90), + 91 => $expand!(91), + 92 => $expand!(92), + 93 => $expand!(93), + 94 => $expand!(94), + 95 => $expand!(95), + 96 => $expand!(96), + 97 => $expand!(97), + 98 => $expand!(98), + 99 => $expand!(99), + 100 => $expand!(100), + 101 => $expand!(101), + 102 => $expand!(102), + 103 => $expand!(103), + 104 => $expand!(104), + 105 => $expand!(105), + 106 => $expand!(106), + 107 => $expand!(107), + 108 => $expand!(108), + 109 => $expand!(109), + 110 => $expand!(110), + 111 => $expand!(111), + 112 => $expand!(112), + 113 => $expand!(113), + 114 => $expand!(114), + 115 => $expand!(115), + 116 => $expand!(116), + 117 => $expand!(117), + 118 => $expand!(118), + 119 => $expand!(119), + 120 => $expand!(120), + 121 => $expand!(121), + 122 => $expand!(122), + 123 => $expand!(123), + 124 => $expand!(124), + 125 => $expand!(125), + 126 => $expand!(126), + 127 => $expand!(127), + 128 => $expand!(128), + 129 => $expand!(129), + 130 => $expand!(130), + 131 => $expand!(131), + 132 => $expand!(132), + 133 => $expand!(133), + 134 => $expand!(134), + 135 => $expand!(135), + 136 => $expand!(136), + 137 => $expand!(137), + 138 => $expand!(138), + 139 => $expand!(139), + 140 => $expand!(140), + 141 => $expand!(141), + 142 => $expand!(142), + 143 => $expand!(143), + 144 => $expand!(144), + 145 => $expand!(145), + 146 => $expand!(146), + 147 => $expand!(147), + 148 => $expand!(148), + 149 => $expand!(149), + 150 => $expand!(150), + 151 => $expand!(151), + 152 => $expand!(152), + 153 => $expand!(153), + 154 => $expand!(154), + 155 => $expand!(155), + 156 => $expand!(156), + 157 => $expand!(157), + 158 => $expand!(158), + 159 => $expand!(159), + 160 => $expand!(160), + 161 => $expand!(161), + 162 => $expand!(162), + 163 => $expand!(163), + 164 => $expand!(164), + 165 => $expand!(165), + 166 => $expand!(166), + 167 => $expand!(167), + 168 => $expand!(168), + 169 => $expand!(169), + 170 => $expand!(170), + 171 => $expand!(171), + 172 => $expand!(172), + 173 => $expand!(173), + 174 => $expand!(174), + 175 => $expand!(175), + 176 => $expand!(176), + 177 => $expand!(177), + 178 => $expand!(178), + 179 => $expand!(179), + 180 => $expand!(180), + 181 => $expand!(181), + 182 => $expand!(182), + 183 => $expand!(183), + 184 => $expand!(184), + 185 => $expand!(185), + 186 => $expand!(186), + 187 => $expand!(187), + 188 => $expand!(188), + 189 => $expand!(189), + 190 => $expand!(190), + 191 => $expand!(191), + 192 => $expand!(192), + 193 => $expand!(193), + 194 => $expand!(194), + 195 => $expand!(195), + 196 => $expand!(196), + 197 => $expand!(197), + 198 => $expand!(198), + 199 => $expand!(199), + 200 => $expand!(200), + 201 => $expand!(201), + 202 => $expand!(202), + 203 => $expand!(203), + 204 => $expand!(204), + 205 => $expand!(205), + 206 => $expand!(206), + 207 => $expand!(207), + 208 => $expand!(208), + 209 => $expand!(209), + 210 => $expand!(210), + 211 => $expand!(211), + 212 => $expand!(212), + 213 => $expand!(213), + 214 => $expand!(214), + 215 => $expand!(215), + 216 => $expand!(216), + 217 => $expand!(217), + 218 => $expand!(218), + 219 => $expand!(219), + 220 => $expand!(220), + 221 => $expand!(221), + 222 => $expand!(222), + 223 => $expand!(223), + 224 => $expand!(224), + 225 => $expand!(225), + 226 => $expand!(226), + 227 => $expand!(227), + 228 => $expand!(228), + 229 => $expand!(229), + 230 => $expand!(230), + 231 => $expand!(231), + 232 => $expand!(232), + 233 => $expand!(233), + 234 => $expand!(234), + 235 => $expand!(235), + 236 => $expand!(236), + 237 => $expand!(237), + 238 => $expand!(238), + 239 => $expand!(239), + 240 => $expand!(240), + 241 => $expand!(241), + 242 => $expand!(242), + 243 => $expand!(243), + 244 => $expand!(244), + 245 => $expand!(245), + 246 => $expand!(246), + 247 => $expand!(247), + 248 => $expand!(248), + 249 => $expand!(249), + 250 => $expand!(250), + 251 => $expand!(251), + 252 => $expand!(252), + 253 => $expand!(253), + 254 => $expand!(254), + _ => $expand!(255), + } + }; +} diff --git a/src/neon/simd_llvm.rs b/src/neon/simd_llvm.rs new file mode 100644 index 00000000..6e60b63c --- /dev/null +++ b/src/neon/simd_llvm.rs @@ -0,0 +1,54 @@ +extern "platform-intrinsic" { + pub fn simd_eq(x: T, y: T) -> U; +// pub fn simd_ne(x: T, y: T) -> U; + pub fn simd_lt(x: T, y: T) -> U; + pub fn simd_le(x: T, y: T) -> U; + pub fn simd_gt(x: T, y: T) -> U; + pub fn simd_ge(x: T, y: T) -> U; +// +// pub fn simd_shuffle2(x: T, y: T, idx: [u32; 2]) -> U; +// pub fn simd_shuffle4(x: T, y: T, idx: [u32; 4]) -> U; +// pub fn simd_shuffle8(x: T, y: T, idx: [u32; 8]) -> U; +// pub fn simd_shuffle16(x: T, y: T, idx: [u32; 16]) -> U; +// pub fn simd_shuffle32(x: T, y: T, idx: [u32; 32]) -> U; +// pub fn simd_shuffle64(x: T, y: T, idx: [u32; 64]) -> U; +// pub fn simd_shuffle128(x: T, y: T, idx: [u32; 128]) -> U; +// +// pub fn simd_insert(x: T, idx: u32, val: U) -> T; + pub fn simd_extract(x: T, idx: u32) -> U; +// +// pub fn simd_cast(x: T) -> U; +// + pub fn simd_add(x: T, y: T) -> T; +// pub fn simd_sub(x: T, y: T) -> T; +// pub fn simd_mul(x: T, y: T) -> T; +// pub fn simd_div(x: T, y: T) -> T; +// pub fn simd_shl(x: T, y: T) -> T; +// pub fn simd_shr(x: T, y: T) -> T; + pub fn simd_and(x: T, y: T) -> T; + pub fn simd_or(x: T, y: T) -> T; + pub fn simd_xor(x: T, y: T) -> T; +// +// pub fn simd_reduce_add_unordered(x: T) -> U; +// pub fn simd_reduce_mul_unordered(x: T) -> U; +// pub fn simd_reduce_add_ordered(x: T, acc: U) -> U; +// pub fn simd_reduce_mul_ordered(x: T, acc: U) -> U; +// pub fn simd_reduce_min(x: T) -> U; +// pub fn simd_reduce_max(x: T) -> U; +// pub fn simd_reduce_min_nanless(x: T) -> U; +// pub fn simd_reduce_max_nanless(x: T) -> U; +// pub fn simd_reduce_and(x: T) -> U; +// pub fn simd_reduce_or(x: T) -> U; +// pub fn simd_reduce_xor(x: T) -> U; +// pub fn simd_reduce_all(x: T) -> bool; +// pub fn simd_reduce_any(x: T) -> bool; +// +// pub fn simd_select(m: M, a: T, b: T) -> T; +// pub fn simd_select_bitmask(m: M, a: T, b: T) -> T; +// +// pub fn simd_fmin(a: T, b: T) -> T; +// pub fn simd_fmax(a: T, b: T) -> T; +// +// pub fn simd_fsqrt(a: T) -> T; +// pub fn simd_fma(a: T, b: T, c: T) -> T; +} \ No newline at end of file diff --git a/src/neon/stage1.rs b/src/neon/stage1.rs new file mode 100644 index 00000000..45322cee --- /dev/null +++ b/src/neon/stage1.rs @@ -0,0 +1,598 @@ +#![allow(dead_code)] + +use crate::neon::intrinsics::*; +use crate::neon::utf8check::*; +use crate::*; + +use std::mem; + +// NEON-SPECIFIC + +macro_rules! bit_mask { + () => { + uint8x16_t::new( + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 + ) + }; +} + +#[cfg_attr(not(feature = "no-inline"), inline(always))] +pub(crate) unsafe fn neon_movemask(input: uint8x16_t) -> u16 { + let minput: uint8x16_t = vandq_u8(input, bit_mask!()); + let tmp: uint8x16_t = vpaddq_u8(minput, minput); + let tmp = vpaddq_u8(tmp, tmp); + let tmp = vpaddq_u8(tmp, tmp); + + vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0) +} + +#[cfg_attr(not(feature = "no-inline"), inline(always))] +pub unsafe fn neon_movemask_bulk(p0: uint8x16_t, p1: uint8x16_t, p2: uint8x16_t, p3: uint8x16_t) -> u64 { + let bit_mask = bit_mask!(); + + let t0 = vandq_u8(p0, bit_mask); + let t1 = vandq_u8(p1, bit_mask); + let t2 = vandq_u8(p2, bit_mask); + let t3 = vandq_u8(p3, bit_mask); + let sum0 = vpaddq_u8(t0, t1); + let sum1 = vpaddq_u8(t2, t3); + let sum0 = vpaddq_u8(sum0, sum1); + let sum0 = vpaddq_u8(sum0, sum0); + + vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0) +} + +// /NEON-SPECIFIC + +pub const SIMDJSON_PADDING: usize = mem::size_of::() * 4; + +unsafe fn compute_quote_mask(quote_bits: u64) -> u64 { + vgetq_lane_u64( + vreinterpretq_u64_u8( + mem::transmute( + vmull_p64( + -1, + quote_bits as i64) + ) + ), + 0 + ) +} + +#[cfg_attr(not(feature = "no-inline"), inline(always))] +unsafe fn check_ascii(si: &SimdInput) -> bool { + let highbit: uint8x16_t = vdupq_n_u8(0x80); + let t0: uint8x16_t = vorrq_u8(si.v0, si.v1); + let t1: uint8x16_t = vorrq_u8(si.v2, si.v3); + let t3: uint8x16_t = vorrq_u8(t0, t1); + let t4: uint8x16_t = vandq_u8(t3, highbit); + + let v64: uint64x2_t = vreinterpretq_u64_u8(t4); + let v32: uint32x2_t = vqmovn_u64(v64); + let result: uint64x1_t = vreinterpret_u64_u32(v32); + + vget_lane_u64(result, 0) == 0 +} + +#[derive(Debug)] +struct SimdInput { + v0: uint8x16_t, + v1: uint8x16_t, + v2: uint8x16_t, + v3: uint8x16_t, +} + +fn fill_input(ptr: &[u8]) -> SimdInput { + unsafe { + #[allow(clippy::cast_ptr_alignment)] + SimdInput { + v0: vld1q_u8(ptr.as_ptr() as *const u8), + v1: vld1q_u8(ptr.as_ptr().add(16) as *const u8), + v2: vld1q_u8(ptr.as_ptr().add(32) as *const u8), + v3: vld1q_u8(ptr.as_ptr().add(48) as *const u8), + } + } +} + +struct Utf8CheckingState { + has_error: int8x16_t, + previous: ProcessedUtfBytes, +} + +impl Default for Utf8CheckingState { + #[cfg_attr(not(feature = "no-inline"), inline)] + fn default() -> Self { + Utf8CheckingState { + has_error: vdupq_n_s8(0), + previous: ProcessedUtfBytes::default(), + } + } +} + +#[inline] +fn is_utf8_status_ok(has_error: int8x16_t) -> bool { + unsafe { + let has_error_128 : i128 = mem::transmute(has_error); + + has_error_128 == 0 + } +} + +#[cfg_attr(not(feature = "no-inline"), inline(always))] +unsafe fn check_utf8( + input: &SimdInput, + state: &mut Utf8CheckingState, +) { + if check_ascii(input) { + // All bytes are ascii. Therefore the byte that was just before must be + // ascii too. We only check the byte that was just before simd_input. Nines + // are arbitrary values. + let verror: int8x16_t = int8x16_t::new( + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, + ); + state.has_error = vreinterpretq_s8_u8(vorrq_u8( + vcgtq_s8( + state.previous.carried_continuations, + verror, + ), + vreinterpretq_u8_s8(state.has_error)), + ); + } else { + // it is not ascii so we have to do heavy work + state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v0), &mut state.previous, &mut state.has_error); + state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v1), &mut state.previous, &mut state.has_error); + state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v2), &mut state.previous, &mut state.has_error); + state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v3), &mut state.previous, &mut state.has_error); + } +} + +// a straightforward comparison of a mask against input +#[cfg_attr(not(feature = "no-inline"), inline(always))] +fn cmp_mask_against_input(input: &SimdInput, m: u8) -> u64 { + unsafe { + let mask: uint8x16_t = vmovq_n_u8(m); + let cmp_res_0: uint8x16_t = vceqq_u8(input.v0, mask); + let cmp_res_1: uint8x16_t = vceqq_u8(input.v1, mask); + let cmp_res_2: uint8x16_t = vceqq_u8(input.v2, mask); + let cmp_res_3: uint8x16_t = vceqq_u8(input.v3, mask); + + neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3) + } +} + +// find all values less than or equal than the content of maxval (using unsigned arithmetic) +#[cfg_attr(not(feature = "no-inline"), inline(always))] +fn unsigned_lteq_against_input(input: &SimdInput, maxval: uint8x16_t) -> u64 { + unsafe { + let cmp_res_0: uint8x16_t = vcleq_u8(input.v0, maxval); + let cmp_res_1: uint8x16_t = vcleq_u8(input.v1, maxval); + let cmp_res_2: uint8x16_t = vcleq_u8(input.v2, maxval); + let cmp_res_3: uint8x16_t = vcleq_u8(input.v3, maxval); + neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3) + } +} + +// return a bitvector indicating where we have characters that end an odd-length +// sequence of backslashes (and thus change the behavior of the next character +// to follow). A even-length sequence of backslashes, and, for that matter, the +// largest even-length prefix of our odd-length sequence of backslashes, simply +// modify the behavior of the backslashes themselves. +// We also update the prev_iter_ends_odd_backslash reference parameter to +// indicate whether we end an iteration on an odd-length sequence of +// backslashes, which modifies our subsequent search for odd-length +// sequences of backslashes in an obvious way. +#[cfg_attr(not(feature = "no-inline"), inline(always))] +unsafe fn find_odd_backslash_sequences(input: &SimdInput, prev_iter_ends_odd_backslash: &mut u64) -> u64 { + const EVEN_BITS: u64 = 0x5555_5555_5555_5555; + const ODD_BITS: u64 = !EVEN_BITS; + + let bs_bits: u64 = cmp_mask_against_input(&input, b'\\'); + let start_edges: u64 = bs_bits & !(bs_bits << 1); + // flip lowest if we have an odd-length run at the end of the prior + // iteration + let even_start_mask: u64 = EVEN_BITS ^ *prev_iter_ends_odd_backslash; + let even_starts: u64 = start_edges & even_start_mask; + let odd_starts: u64 = start_edges & !even_start_mask; + let even_carries: u64 = bs_bits.wrapping_add(even_starts); + + // must record the carry-out of our odd-carries out of bit 63; this + // indicates whether the sense of any edge going to the next iteration + // should be flipped + let (mut odd_carries, iter_ends_odd_backslash) = bs_bits.overflowing_add(odd_starts); + + odd_carries |= *prev_iter_ends_odd_backslash; + // push in bit zero as a potential end + // if we had an odd-numbered run at the + // end of the previous iteration + *prev_iter_ends_odd_backslash = if iter_ends_odd_backslash { 0x1 } else { 0x0 }; + let even_carry_ends: u64 = even_carries & !bs_bits; + let odd_carry_ends: u64 = odd_carries & !bs_bits; + let even_start_odd_end: u64 = even_carry_ends & ODD_BITS; + let odd_start_even_end: u64 = odd_carry_ends & EVEN_BITS; + let odd_ends: u64 = even_start_odd_end | odd_start_even_end; + odd_ends +} + +// return both the quote mask (which is a half-open mask that covers the first +// quote in an unescaped quote pair and everything in the quote pair) and the +// quote bits, which are the simple unescaped quoted bits. +// +// We also update the prev_iter_inside_quote value to tell the next iteration +// whether we finished the final iteration inside a quote pair; if so, this +// inverts our behavior of whether we're inside quotes for the next iteration. +// +// Note that we don't do any error checking to see if we have backslash +// sequences outside quotes; these +// backslash sequences (of any length) will be detected elsewhere. +#[cfg_attr(not(feature = "no-inline"), inline(always))] +unsafe fn find_quote_mask_and_bits( + input: &SimdInput, + odd_ends: u64, + prev_iter_inside_quote: &mut u64, + quote_bits: &mut u64, + error_mask: &mut u64, +) -> u64 { + *quote_bits = cmp_mask_against_input(&input, b'"'); + *quote_bits &= !odd_ends; + // remove from the valid quoted region the unescapted characters. + let mut quote_mask: u64 = compute_quote_mask(*quote_bits); + + quote_mask ^= *prev_iter_inside_quote; + // All Unicode characters may be placed within the + // quotation marks, except for the characters that MUST be escaped: + // quotation mark, reverse solidus, and the control characters (U+0000 + //through U+001F). + // https://tools.ietf.org/html/rfc8259 + let unescaped: u64 = unsigned_lteq_against_input(input, vmovq_n_u8(0x1F)); + *error_mask |= quote_mask & unescaped; + // right shift of a signed value expected to be well-defined and standard + // compliant as of C++20, + // John Regher from Utah U. says this is fine code + *prev_iter_inside_quote = static_cast_u64!(static_cast_i64!(quote_mask) >> 63); + quote_mask +} + +#[cfg_attr(not(feature = "no-inline"), inline(always))] +unsafe fn find_whitespace_and_structurals( + input: &SimdInput, + whitespace: &mut u64, + structurals: &mut u64, +) { + // do a 'shufti' to detect structural JSON characters + // they are + // * `{` 0x7b + // * `}` 0x7d + // * `:` 0x3a + // * `[` 0x5b + // * `]` 0x5d + // * `,` 0x2c + // these go into the first 3 buckets of the comparison (1/2/4) + + // we are also interested in the four whitespace characters: + // * space 0x20 + // * linefeed 0x0a + // * horizontal tab 0x09 + // * carriage return 0x0d + // these go into the next 2 buckets of the comparison (8/16) + + // TODO: const? + let low_nibble_mask: uint8x16_t = uint8x16_t::new( + 16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, + ); + // TODO: const? + let high_nibble_mask: uint8x16_t = uint8x16_t::new( + 8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0, + ); + + let structural_shufti_mask: uint8x16_t = vmovq_n_u8(0x7); + let whitespace_shufti_mask: uint8x16_t = vmovq_n_u8(0x18); + let low_nib_and_mask: uint8x16_t = vmovq_n_u8(0xf); + + let nib_0_lo: uint8x16_t = vandq_u8(input.v0, low_nib_and_mask); + let nib_0_hi: uint8x16_t = vshrq_n_u8(input.v0, 4); + let shuf_0_lo: uint8x16_t = vqtbl1q_u8(low_nibble_mask, nib_0_lo); + let shuf_0_hi: uint8x16_t = vqtbl1q_u8(high_nibble_mask, nib_0_hi); + let v_0: uint8x16_t = vandq_u8(shuf_0_lo, shuf_0_hi); + + let nib_1_lo: uint8x16_t = vandq_u8(input.v1, low_nib_and_mask); + let nib_1_hi: uint8x16_t = vshrq_n_u8(input.v1, 4); + let shuf_1_lo: uint8x16_t = vqtbl1q_u8(low_nibble_mask, nib_1_lo); + let shuf_1_hi: uint8x16_t = vqtbl1q_u8(high_nibble_mask, nib_1_hi); + let v_1: uint8x16_t = vandq_u8(shuf_1_lo, shuf_1_hi); + + let nib_2_lo: uint8x16_t = vandq_u8(input.v2, low_nib_and_mask); + let nib_2_hi: uint8x16_t = vshrq_n_u8(input.v2, 4); + let shuf_2_lo: uint8x16_t = vqtbl1q_u8(low_nibble_mask, nib_2_lo); + let shuf_2_hi: uint8x16_t = vqtbl1q_u8(high_nibble_mask, nib_2_hi); + let v_2: uint8x16_t = vandq_u8(shuf_2_lo, shuf_2_hi); + + let nib_3_lo: uint8x16_t = vandq_u8(input.v3, low_nib_and_mask); + let nib_3_hi: uint8x16_t = vshrq_n_u8(input.v3, 4); + let shuf_3_lo: uint8x16_t = vqtbl1q_u8(low_nibble_mask, nib_3_lo); + let shuf_3_hi: uint8x16_t = vqtbl1q_u8(high_nibble_mask, nib_3_hi); + let v_3: uint8x16_t = vandq_u8(shuf_3_lo, shuf_3_hi); + + let tmp_0: uint8x16_t = vtstq_u8(v_0, structural_shufti_mask); + let tmp_1: uint8x16_t = vtstq_u8(v_1, structural_shufti_mask); + let tmp_2: uint8x16_t = vtstq_u8(v_2, structural_shufti_mask); + let tmp_3: uint8x16_t = vtstq_u8(v_3, structural_shufti_mask); + *structurals = neon_movemask_bulk(tmp_0, tmp_1, tmp_2, tmp_3); + + let tmp_ws_v0: uint8x16_t = vtstq_u8(v_0, whitespace_shufti_mask); + let tmp_ws_v1: uint8x16_t = vtstq_u8(v_1, whitespace_shufti_mask); + let tmp_ws_v2: uint8x16_t = vtstq_u8(v_2, whitespace_shufti_mask); + let tmp_ws_v3: uint8x16_t = vtstq_u8(v_3, whitespace_shufti_mask); + *whitespace = neon_movemask_bulk(tmp_ws_v0, tmp_ws_v1, tmp_ws_v2, tmp_ws_v3); +} + +// flatten out values in 'bits' assuming that they are are to have values of idx +// plus their position in the bitvector, and store these indexes at +// base_ptr[base] incrementing base as we go +// will potentially store extra values beyond end of valid bits, so base_ptr +// needs to be large enough to handle this +//TODO: usize was u32 here does this matter? +#[cfg_attr(not(feature = "no-inline"), inline(always))] +fn flatten_bits(base: &mut Vec, idx: u32, mut bits: u64) { + let cnt: usize = bits.count_ones() as usize; + let mut l = base.len(); + let idx_minus_64 = idx.wrapping_sub(64); + let idx_64_v = unsafe { + int32x4_t::new( + static_cast_i32!(idx_minus_64), + static_cast_i32!(idx_minus_64), + static_cast_i32!(idx_minus_64), + static_cast_i32!(idx_minus_64), + ) + }; + + // We're doing some trickery here. + // We reserve 64 extra entries, because we've at most 64 bit to set + // then we trunctate the base to the next base (that we calcuate above) + // We later indiscriminatory writre over the len we set but that's OK + // since we ensure we reserve the needed space + base.reserve(64); + unsafe { + base.set_len(l + cnt); + } + + while bits != 0 { + unsafe { + let v0 = bits.trailing_zeros() as i32; + bits &= bits.wrapping_sub(1); + let v1 = bits.trailing_zeros() as i32; + bits &= bits.wrapping_sub(1); + let v2 = bits.trailing_zeros() as i32; + bits &= bits.wrapping_sub(1); + let v3 = bits.trailing_zeros() as i32; + bits &= bits.wrapping_sub(1); + + let v: int32x4_t = int32x4_t::new(v0, v1, v2, v3); + let v: int32x4_t = vaddq_s32(idx_64_v, v); + #[allow(clippy::cast_ptr_alignment)] + std::ptr::write(base.as_mut_ptr().add(l) as *mut int32x4_t, v); + } + l += 4; + } +} + +// return a updated structural bit vector with quoted contents cleared out and +// pseudo-structural characters added to the mask +// updates prev_iter_ends_pseudo_pred which tells us whether the previous +// iteration ended on a whitespace or a structural character (which means that +// the next iteration +// will have a pseudo-structural character at its start) +#[cfg_attr(not(feature = "no-inline"), inline(always))] +fn finalize_structurals( + mut structurals: u64, + whitespace: u64, + quote_mask: u64, + quote_bits: u64, + prev_iter_ends_pseudo_pred: &mut u64, +) -> u64 { + // mask off anything inside quotes + structurals &= !quote_mask; + // add the real quote bits back into our bitmask as well, so we can + // quickly traverse the strings we've spent all this trouble gathering + structurals |= quote_bits; + // Now, establish "pseudo-structural characters". These are non-whitespace + // characters that are (a) outside quotes and (b) have a predecessor that's + // either whitespace or a structural character. This means that subsequent + // passes will get a chance to encounter the first character of every string + // of non-whitespace and, if we're parsing an atom like true/false/null or a + // number we can stop at the first whitespace or structural character + // following it. + + // a qualified predecessor is something that can happen 1 position before an + // psuedo-structural character + let pseudo_pred: u64 = structurals | whitespace; + + let shifted_pseudo_pred: u64 = (pseudo_pred << 1) | *prev_iter_ends_pseudo_pred; + *prev_iter_ends_pseudo_pred = pseudo_pred >> 63; + let pseudo_structurals: u64 = shifted_pseudo_pred & (!whitespace) & (!quote_mask); + structurals |= pseudo_structurals; + + // now, we've used our close quotes all we need to. So let's switch them off + // they will be off in the quote mask and on in quote bits. + structurals &= !(quote_bits & !quote_mask); + structurals +} + +pub fn find_bs_bits_and_quote_bits(v0: uint8x16_t, v1: uint8x16_t) -> ParseStringHelper { + let quote_mask = vmovq_n_u8(b'"'); + let bs_mask = vmovq_n_u8(b'\\'); + let bit_mask = bit_mask!(); + + let cmp_bs_0 : uint8x16_t = vceqq_u8(v0, bs_mask); + let cmp_bs_1 : uint8x16_t = vceqq_u8(v1, bs_mask); + let cmp_qt_0 : uint8x16_t = vceqq_u8(v0, quote_mask); + let cmp_qt_1 : uint8x16_t = vceqq_u8(v1, quote_mask); + + let cmp_bs_0 = vandq_u8(cmp_bs_0, bit_mask); + let cmp_bs_1 = vandq_u8(cmp_bs_1, bit_mask); + let cmp_qt_0 = vandq_u8(cmp_qt_0, bit_mask); + let cmp_qt_1 = vandq_u8(cmp_qt_1, bit_mask); + + let sum0 : uint8x16_t = vpaddq_u8(cmp_bs_0, cmp_bs_1); + let sum1 : uint8x16_t = vpaddq_u8(cmp_qt_0, cmp_qt_1); + let sum0 = vpaddq_u8(sum0, sum1); + let sum0 = vpaddq_u8(sum0, sum0); + + ParseStringHelper { + bs_bits: unsafe { vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 0) }, + quote_bits: unsafe { vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 1) }, + } +} + +impl<'de> Deserializer<'de> { + //#[inline(never)] + pub unsafe fn find_structural_bits(input: &[u8]) -> std::result::Result, ErrorType> { + let len = input.len(); + // 6 is a heuristic number to estimate it turns out a rate of 1/6 structural caracters lears + // almost never to relocations. + let mut structural_indexes = Vec::with_capacity(len / 6); + structural_indexes.push(0); // push extra root element + + let mut utf8_state: Utf8CheckingState = Utf8CheckingState::default(); + + // we have padded the input out to 64 byte multiple with the remainder being + // zeros + + // persistent state across loop + // does the last iteration end with an odd-length sequence of backslashes? + // either 0 or 1, but a 64-bit value + let mut prev_iter_ends_odd_backslash: u64 = 0; + // does the previous iteration end inside a double-quote pair? + let mut prev_iter_inside_quote: u64 = 0; + // either all zeros or all ones + // does the previous iteration end on something that is a predecessor of a + // pseudo-structural character - i.e. whitespace or a structural character + // effectively the very first char is considered to follow "whitespace" for + // the + // purposes of pseudo-structural character detection so we initialize to 1 + let mut prev_iter_ends_pseudo_pred: u64 = 1; + + // structurals are persistent state across loop as we flatten them on the + // subsequent iteration into our array pointed to be base_ptr. + // This is harmless on the first iteration as structurals==0 + // and is done for performance reasons; we can hide some of the latency of the + // expensive carryless multiply in the previous step with this work + let mut structurals: u64 = 0; + + let lenminus64: usize = if len < 64 { 0 } else { len as usize - 64 }; + let mut idx: usize = 0; + let mut error_mask: u64 = 0; // for unescaped characters within strings (ASCII code points < 0x20) + + while idx < lenminus64 { + /* + #ifndef _MSC_VER + __builtin_prefetch(buf + idx + 128); + #endif + */ + let input: SimdInput = fill_input(input.get_unchecked(idx as usize..)); + check_utf8(&input, &mut utf8_state); + // detect odd sequences of backslashes + let odd_ends: u64 = + find_odd_backslash_sequences(&input, &mut prev_iter_ends_odd_backslash); + + // detect insides of quote pairs ("quote_mask") and also our quote_bits + // themselves + let mut quote_bits: u64 = 0; + let quote_mask: u64 = find_quote_mask_and_bits( + &input, + odd_ends, + &mut prev_iter_inside_quote, + &mut quote_bits, + &mut error_mask, + ); + + // take the previous iterations structural bits, not our current iteration, + // and flatten + flatten_bits(&mut structural_indexes, idx as u32, structurals); + + let mut whitespace: u64 = 0; + find_whitespace_and_structurals(&input, &mut whitespace, &mut structurals); + + // fixup structurals to reflect quotes and add pseudo-structural characters + structurals = finalize_structurals( + structurals, + whitespace, + quote_mask, + quote_bits, + &mut prev_iter_ends_pseudo_pred, + ); + idx += 64; + } + + // we use a giant copy-paste which is ugly. + // but otherwise the string needs to be properly padded or else we + // risk invalidating the UTF-8 checks. + if idx < len { + let mut tmpbuf: [u8; 64] = [0x20; 64]; + tmpbuf + .as_mut_ptr() + .copy_from(input.as_ptr().add(idx), len as usize - idx); + let input: SimdInput = fill_input(&tmpbuf); + + check_utf8(&input, &mut utf8_state); + + // detect odd sequences of backslashes + let odd_ends: u64 = + find_odd_backslash_sequences(&input, &mut prev_iter_ends_odd_backslash); + + // detect insides of quote pairs ("quote_mask") and also our quote_bits + // themselves + let mut quote_bits: u64 = 0; + let quote_mask: u64 = find_quote_mask_and_bits( + &input, + odd_ends, + &mut prev_iter_inside_quote, + &mut quote_bits, + &mut error_mask, + ); + + // take the previous iterations structural bits, not our current iteration, + // and flatten + flatten_bits(&mut structural_indexes, idx as u32, structurals); + + let mut whitespace: u64 = 0; + find_whitespace_and_structurals(&input, &mut whitespace, &mut structurals); + + // fixup structurals to reflect quotes and add pseudo-structural characters + structurals = finalize_structurals( + structurals, + whitespace, + quote_mask, + quote_bits, + &mut prev_iter_ends_pseudo_pred, + ); + idx += 64; + } + // This test isn't in upstream, for some reason the error mask is et for then. + if prev_iter_inside_quote != 0 { + return Err(ErrorType::Syntax); + } + // finally, flatten out the remaining structurals from the last iteration + flatten_bits(&mut structural_indexes, idx as u32, structurals); + + // a valid JSON file cannot have zero structural indexes - we should have + // found something (note that we compare to 1 as we always add the root!) + if structural_indexes.len() == 1 { + return Err(ErrorType::EOF); + } + + if structural_indexes.last() > Some(&(len as u32)) { + return Err(ErrorType::InternalError); + } + + if error_mask != 0 { + return Err(ErrorType::Syntax); + } + + if is_utf8_status_ok(utf8_state.has_error) { + Ok(structural_indexes) + } else { + Err(ErrorType::InvalidUTF8) + } + } +} diff --git a/src/neon/utf8check.rs b/src/neon/utf8check.rs new file mode 100644 index 00000000..082183b1 --- /dev/null +++ b/src/neon/utf8check.rs @@ -0,0 +1,253 @@ +use crate::neon::intrinsics::*; + +/* + * legal utf-8 byte sequence + * http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94 + * + * Code Points 1st 2s 3s 4s + * U+0000..U+007F 00..7F + * U+0080..U+07FF C2..DF 80..BF + * U+0800..U+0FFF E0 A0..BF 80..BF + * U+1000..U+CFFF E1..EC 80..BF 80..BF + * U+D000..U+D7FF ED 80..9F 80..BF + * U+E000..U+FFFF EE..EF 80..BF 80..BF + * U+10000..U+3FFFF F0 90..BF 80..BF 80..BF + * U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF + * U+100000..U+10FFFF F4 80..8F 80..BF 80..BF + * + */ + +/*****************************/ +#[cfg_attr(not(feature = "no-inline"), inline)] +fn push_last_byte_of_a_to_b(a: int8x16_t, b: int8x16_t) -> int8x16_t { + unsafe { + vextq_s8(a, b, 16 - 1) + } +} + +#[cfg_attr(not(feature = "no-inline"), inline)] +fn push_last_2bytes_of_a_to_b(a: int8x16_t, b: int8x16_t) -> int8x16_t { + unsafe { + vextq_s8(a, b, 16 - 2) + } +} + +// all byte values must be no larger than 0xF4 +#[cfg_attr(not(feature = "no-inline"), inline)] +fn check_smaller_than_0xf4(current_bytes: int8x16_t, has_error: &mut int8x16_t) { + // unsigned, saturates to 0 below max + *has_error = unsafe { + vorrq_s8( + *has_error, + vqsubq_s8(current_bytes, vdupq_n_s8(-12 /* 0xF4 */)) + ) + }; +} + +macro_rules! nibbles_tbl { + () => { + int8x16_t::new( + 1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII) + 0, 0, 0, 0, // 10xx (continuation) + 2, 2, // 110x + 3, // 1110 + 4, // 1111, next should be 0 (not checked here) + ) + }; +} + +#[cfg_attr(not(feature = "no-inline"), inline)] +fn continuation_lengths(high_nibbles: int8x16_t) -> int8x16_t { + unsafe { + vqtbl1q_s8( + nibbles_tbl!(), + vreinterpretq_u8_s8(high_nibbles), + ) + } +} + +#[cfg_attr(not(feature = "no-inline"), inline)] +fn carry_continuations(initial_lengths: int8x16_t, previous_carries: int8x16_t) -> int8x16_t { + unsafe { + let right1: int8x16_t = vqsubq_s8( + push_last_byte_of_a_to_b(previous_carries, initial_lengths), + vdupq_n_s8(1), + ); + let sum: int8x16_t = vaddq_s8(initial_lengths, right1); + let right2: int8x16_t = vqsubq_s8( + push_last_2bytes_of_a_to_b(previous_carries, sum), + vdupq_n_s8(2), + ); + vaddq_s8(sum, right2) + } +} + +#[cfg_attr(not(feature = "no-inline"), inline)] +fn check_continuations(initial_lengths: int8x16_t, carries: int8x16_t, has_error: &mut int8x16_t) { + // overlap || underlap + // carry > length && length > 0 || !(carry > length) && !(length > 0) + // (carries > length) == (lengths > 0) + { + let overunder: uint8x16_t = vceqq_u8( + vcgtq_s8(carries, initial_lengths), + vcgtq_s8(initial_lengths, vdupq_n_s8(0)), + ); + + *has_error = vorrq_s8(*has_error, vreinterpretq_s8_u8(overunder)); + } +} + +// when 0xED is found, next byte must be no larger than 0x9F +// when 0xF4 is found, next byte must be no larger than 0x8F +// next byte must be continuation, ie sign bit is set, so signed < is ok +#[cfg_attr(not(feature = "no-inline"), inline)] +fn check_first_continuation_max( + current_bytes: int8x16_t, + off1_current_bytes: int8x16_t, + has_error: &mut int8x16_t, +) { + { + let mask_ed: uint8x16_t = vceqq_s8( + off1_current_bytes, + vdupq_n_s8(-19 /* 0xED */), + ); + let mask_f4: uint8x16_t = vceqq_s8( + off1_current_bytes, + vdupq_n_s8(-12 /* 0xF4 */), + ); + + let badfollow_ed: uint8x16_t = vandq_u8( + vcgtq_s8(current_bytes, vdupq_n_s8(-97 /* 0x9F */)), + mask_ed, + ); + let badfollow_f4: uint8x16_t = vandq_u8( + vcgtq_s8(current_bytes, vdupq_n_s8(-113 /* 0x8F */)), + mask_f4, + ); + + *has_error = vorrq_s8( + *has_error, + vreinterpretq_s8_u8(vorrq_u8(badfollow_ed, badfollow_f4)), + ); + } +} + +macro_rules! initial_mins_tbl { + () => { + int8x16_t::new( + -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, // 10xx => false + -62 /* 0xC2 */, -128, // 110x + -31 /* 0xE1 */, // 1110 + -15 /*0xF1 */, // 1111 + ) + }; +} + +macro_rules! second_mins_tbl { + () => { + int8x16_t::new( + -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, // 10xx => false + 127, 127, // 110x => true + -96 /* 0xA0 */, // 1110 + -112 /* 0x90 */, // 1111 + ) + }; +} + +// map off1_hibits => error condition +// hibits off1 cur +// C => < C2 && true +// E => < E1 && < A0 +// F => < F1 && < 90 +// else false && false +#[cfg_attr(not(feature = "no-inline"), inline)] +fn check_overlong( + current_bytes: int8x16_t, + off1_current_bytes: int8x16_t, + hibits: int8x16_t, + previous_hibits: int8x16_t, + has_error: &mut int8x16_t, +) { + unsafe { + let off1_hibits: int8x16_t = push_last_byte_of_a_to_b(previous_hibits, hibits); + let initial_mins: int8x16_t = vqtbl1q_s8( + initial_mins_tbl!(), + vreinterpretq_u8_s8(off1_hibits) + ); + + let initial_under: uint8x16_t = vcgtq_s8(initial_mins, off1_current_bytes); + + let second_mins: int8x16_t = vqtbl1q_s8( + second_mins_tbl!(), + vreinterpretq_u8_s8(off1_hibits) + ); + let second_under: uint8x16_t = vcgtq_s8(second_mins, current_bytes); + *has_error = vorrq_s8( + *has_error, + vreinterpretq_s8_u8(vandq_u8(initial_under, second_under)) + ); + } +} + +pub struct ProcessedUtfBytes { + rawbytes: int8x16_t, + high_nibbles: int8x16_t, + pub carried_continuations: int8x16_t, +} + +impl Default for ProcessedUtfBytes { + #[cfg_attr(not(feature = "no-inline"), inline)] + fn default() -> Self { + ProcessedUtfBytes { + rawbytes: vdupq_n_s8(0x00), + high_nibbles: vdupq_n_s8(0x00), + carried_continuations: vdupq_n_s8(0x00), + } + } +} + +#[cfg_attr(not(feature = "no-inline"), inline)] +fn count_nibbles(bytes: int8x16_t, answer: &mut ProcessedUtfBytes) { + answer.rawbytes = bytes; + answer.high_nibbles = unsafe { + vandq_s8( + vreinterpretq_s8_u8(vshrq_n_u8(vreinterpretq_u8_s8(bytes), 4)), + vmovq_n_s8(0x0F) + ) + }; +} + +// check whether the current bytes are valid UTF-8 +// at the end of the function, previous gets updated +#[cfg_attr(not(feature = "no-inline"), inline)] +pub fn check_utf8_bytes( + current_bytes: int8x16_t, + previous: &mut ProcessedUtfBytes, + has_error: &mut int8x16_t, +) -> ProcessedUtfBytes { + let mut pb = ProcessedUtfBytes::default(); + count_nibbles(current_bytes, &mut pb); + + check_smaller_than_0xf4(current_bytes, has_error); + + let initial_lengths: int8x16_t = continuation_lengths(pb.high_nibbles); + + pb.carried_continuations = + carry_continuations(initial_lengths, previous.carried_continuations); + + check_continuations(initial_lengths, pb.carried_continuations, has_error); + + let off1_current_bytes: int8x16_t = push_last_byte_of_a_to_b(previous.rawbytes, pb.rawbytes); + check_first_continuation_max(current_bytes, off1_current_bytes, has_error); + + check_overlong( + current_bytes, + off1_current_bytes, + pb.high_nibbles, + previous.high_nibbles, + has_error, + ); + pb +} diff --git a/src/numberparse.rs b/src/numberparse.rs index 03880d56..01af460e 100644 --- a/src/numberparse.rs +++ b/src/numberparse.rs @@ -1,6 +1,7 @@ use crate::charutils::*; use crate::unlikely; use crate::*; + #[cfg(target_arch = "x86")] use std::arch::x86::*; #[cfg(target_arch = "x86_64")] @@ -133,6 +134,7 @@ pub enum Number { } #[cfg_attr(not(feature = "no-inline"), inline)] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn parse_eight_digits_unrolled(chars: &[u8]) -> u32 { unsafe { // this actually computes *16* values so we are being wasteful. @@ -143,7 +145,7 @@ fn parse_eight_digits_unrolled(chars: &[u8]) -> u32 { let mul_1_10000: __m128i = _mm_setr_epi16(10000, 1, 10000, 1, 10000, 1, 10000, 1); // We know what we're doing right? :P #[allow(clippy::cast_ptr_alignment)] - let input: __m128i = _mm_sub_epi8( + let input: __m128i = _mm_sub_epi8( _mm_loadu_si128(chars.get_unchecked(0..16).as_ptr() as *const __m128i), ascii0, ); @@ -155,6 +157,17 @@ fn parse_eight_digits_unrolled(chars: &[u8]) -> u32 { } } +#[cfg_attr(not(feature = "no-inline"), inline)] +#[cfg(target_feature = "neon")] +fn parse_eight_digits_unrolled(chars: &[u8]) -> u32 { + let val: u64 = unsafe { *(chars.as_ptr() as *const u64) }; + // memcpy(&val, chars, sizeof(u64)); + let val = (val & 0x0F0F0F0F0F0F0F0F).wrapping_mul(2561) >> 8; + let val = (val & 0x00FF00FF00FF00FF).wrapping_mul(6553601) >> 16; + + return ((val & 0x0000FFFF0000FFFF).wrapping_mul(42949672960001) >> 32) as u32; +} + impl<'de> Deserializer<'de> { /// called by parse_number when we know that the output is a float, /// but where there might be some integer overflow. The trick here is to @@ -215,7 +228,7 @@ impl<'de> Deserializer<'de> { digit = unsafe { *p.get_unchecked(digitcount) } - b'0'; digitcount += 1; fraction_weight *= 10.0; - fraction += f64::from(digit) / fraction_weight;; + fraction += f64::from(digit) / fraction_weight; } i += fraction; } diff --git a/src/portability.rs b/src/portability.rs new file mode 100644 index 00000000..69481a85 --- /dev/null +++ b/src/portability.rs @@ -0,0 +1,30 @@ +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +#[cfg_attr(not(feature = "no-inline"), inline)] +#[cfg(target_arch = "x86_64")] +pub fn add_overflow(value1: u64, value2: u64, result: &mut u64) -> bool { + unsafe { _addcarry_u64(0, value1, value2, result) != 0 } +} + +//TODO: static? + +#[cfg_attr(not(feature = "no-inline"), inline)] +#[cfg(target_arch = "x86_64")] +pub fn hamming(input_num: u64) -> u32 { + unsafe { _popcnt64(input_num as i64) as u32 } +} + +#[cfg_attr(not(feature = "no-inline"), inline)] +#[cfg(target_arch = "x86_64")] +pub fn hamming(input_num: u64) -> u32 { + unsafe { __popcnt(input_num as u32) + __popcnt((input_num >> 32) as u32) as u32 } +} + +#[cfg_attr(not(feature = "no-inline"), inline)] +#[cfg(target_arch = "x86_64")] +pub fn trailingzeroes(input_num: u64) -> u32 { + unsafe { _tzcnt_u64(input_num) as u32 } +} diff --git a/src/sse42/generator.rs b/src/sse42/generator.rs new file mode 100644 index 00000000..e6636585 --- /dev/null +++ b/src/sse42/generator.rs @@ -0,0 +1,51 @@ +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +use crate::value::generator::ESCAPED; +use std::io; + +#[inline(always)] +pub unsafe fn write_str_simd(writer: &mut W, string: &mut &[u8], len: &mut usize, idx: &mut usize) -> io::Result<()> where W: std::io::Write { + let zero = _mm_set1_epi8(0); + let lower_quote_range = _mm_set1_epi8(0x1F as i8); + let quote = _mm_set1_epi8(b'"' as i8); + let backslash = _mm_set1_epi8(b'\\' as i8); + while *len - *idx > 16 { + // Load 16 bytes of data; + #[allow(clippy::cast_ptr_alignment)] + let data: __m128i = _mm_loadu_si128(string.as_ptr().add(*idx) as *const __m128i); + // Test the data against being backslash and quote. + let bs_or_quote = _mm_or_si128( + _mm_cmpeq_epi8(data, backslash), + _mm_cmpeq_epi8(data, quote) + ); + // Now mask the data with the quote range (0x1F). + let in_quote_range = _mm_and_si128(data, lower_quote_range); + // then test of the data is unchanged. aka: xor it with the + // Any field that was inside the quote range it will be zero + // now. + let is_unchanged = _mm_xor_si128(data, in_quote_range); + let in_range = _mm_cmpeq_epi8(is_unchanged, zero); + let quote_bits = _mm_movemask_epi8(_mm_or_si128(bs_or_quote, in_range)); + if quote_bits != 0 { + let quote_dist = quote_bits.trailing_zeros() as usize; + stry!(writer.write_all(&string[0..*idx + quote_dist])); + let ch = string[*idx + quote_dist]; + match ESCAPED[ch as usize] { + b'u' => stry!(write!(writer, "\\u{:04x}", ch)), + + escape => stry!(writer.write_all(&[b'\\', escape])), + }; + *string = &string[*idx + quote_dist + 1..]; + *idx = 0; + *len = string.len(); + } else { + *idx += 16; + } + } + stry!(writer.write_all(&string[0..*idx])); + *string = &string[*idx..]; + Ok(()) +} diff --git a/src/sse42/mod.rs b/src/sse42/mod.rs index 30c55c86..ac608ae2 100644 --- a/src/sse42/mod.rs +++ b/src/sse42/mod.rs @@ -1,3 +1,4 @@ pub mod deser; pub mod stage1; -pub mod utf8check; \ No newline at end of file +pub mod utf8check; +pub mod generator; \ No newline at end of file diff --git a/src/stage2.rs b/src/stage2.rs index aaa51ad8..3bbd71c1 100644 --- a/src/stage2.rs +++ b/src/stage2.rs @@ -1,9 +1,11 @@ #![allow(dead_code)] +use crate::charutils::*; #[cfg(target_feature = "avx2")] use crate::avx2::stage1::SIMDJSON_PADDING; -use crate::charutils::*; -#[cfg(not(target_feature = "avx2"))] +#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), not(target_feature = "avx2")))] use crate::sse42::stage1::SIMDJSON_PADDING; +#[cfg(target_feature = "neon")] +use crate::neon::stage1::SIMDJSON_PADDING; use crate::{Deserializer, Error, ErrorType, Result}; #[cfg_attr(not(feature = "no-inline"), inline(always))] diff --git a/src/stringparse.rs b/src/stringparse.rs index 0ff8a078..9a7e3b8f 100644 --- a/src/stringparse.rs +++ b/src/stringparse.rs @@ -73,3 +73,9 @@ pub fn handle_unicode_codepoint( let offset: usize = codepoint_to_utf8(code_point, dst_ptr); Ok((offset, src_offset)) } + +// Holds backslashes and quotes locations. +pub struct ParseStringHelper { + pub bs_bits: u32, + pub quote_bits: u32, +} diff --git a/src/value.rs b/src/value.rs index 06245878..c5d5aa87 100644 --- a/src/value.rs +++ b/src/value.rs @@ -11,7 +11,7 @@ /// we do not require prior knowledge sbout string comtent to to take advantage /// of it. pub mod borrowed; -mod generator; +pub(crate) mod generator; pub mod owned; pub use self::borrowed::{to_value as to_borrowed_value, Value as BorrowedValue}; diff --git a/src/value/generator.rs b/src/value/generator.rs index 55824f57..ebdb0a88 100644 --- a/src/value/generator.rs +++ b/src/value/generator.rs @@ -5,19 +5,21 @@ // https://github.com/maciejhirsz/json-rust/blob/master/src/codegen.rs use crate::value::ValueTrait; -#[cfg(target_arch = "x86")] -use std::arch::x86::*; -#[cfg(target_arch = "x86_64")] -use std::arch::x86_64::*; use std::io; use std::io::Write; use std::marker::PhantomData; use std::ptr; +use crate::*; + #[cfg(target_feature = "avx2")] -const AVX2_PRESENT : bool = true; -#[cfg(not(target_feature = "avx2"))] -const AVX2_PRESENT : bool = false; +use crate::avx2::generator::*; + +#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), not(target_feature = "avx2")))] +use crate::sse42::generator::*; + +#[cfg(target_feature = "neon")] +use crate::neon::generator::*; const QU: u8 = b'"'; const BS: u8 = b'\\'; @@ -30,7 +32,7 @@ const UU: u8 = b'u'; const __: u8 = 0; // Look up table for characters that need escaping in a product string -static ESCAPED: [u8; 256] = [ +pub(crate) static ESCAPED: [u8; 256] = [ // 0 1 2 3 4 5 6 7 8 9 A B C D E F UU, UU, UU, UU, UU, UU, UU, UU, BB, TT, NN, UU, FF, RR, UU, UU, // 0 UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, // 1 @@ -106,85 +108,7 @@ pub trait BaseGenerator { // quote characters that gives us a bitmask of 0x1f for that // region, only quote (`"`) and backslash (`\`) are not in // this range. - if AVX2_PRESENT { - let zero = _mm256_set1_epi8(0); - let lower_quote_range = _mm256_set1_epi8(0x1F as i8); - let quote = _mm256_set1_epi8(b'"' as i8); - let backslash = _mm256_set1_epi8(b'\\' as i8); - while len - idx >= 32 { - // Load 32 bytes of data; - #[allow(clippy::cast_ptr_alignment)] - let data: __m256i = _mm256_loadu_si256(string.as_ptr().add(idx) as *const __m256i); - // Test the data against being backslash and quote. - let bs_or_quote = _mm256_or_si256( - _mm256_cmpeq_epi8(data, backslash), - _mm256_cmpeq_epi8(data, quote), - ); - // Now mask the data with the quote range (0x1F). - let in_quote_range = _mm256_and_si256(data, lower_quote_range); - // then test of the data is unchanged. aka: xor it with the - // Any field that was inside the quote range it will be zero - // now. - let is_unchanged = _mm256_xor_si256(data, in_quote_range); - let in_range = _mm256_cmpeq_epi8(is_unchanged, zero); - let quote_bits = _mm256_movemask_epi8(_mm256_or_si256(bs_or_quote, in_range)); - if quote_bits != 0 { - let quote_dist = quote_bits.trailing_zeros() as usize; - stry!(self.get_writer().write_all(&string[0..idx + quote_dist])); - let ch = string[idx + quote_dist]; - match ESCAPED[ch as usize] { - b'u' => stry!(write!(self.get_writer(), "\\u{:04x}", ch)), - - escape => stry!(self.write(&[b'\\', escape])), - }; - string = &string[idx + quote_dist + 1..]; - idx = 0; - len = string.len(); - } else { - idx += 32; - } - } - } - // The case where we have a 16+ byte block - // we repeate the same logic as above but with - // only 16 bytes - let zero = _mm_set1_epi8(0); - let lower_quote_range = _mm_set1_epi8(0x1F as i8); - let quote = _mm_set1_epi8(b'"' as i8); - let backslash = _mm_set1_epi8(b'\\' as i8); - while len - idx > 16 { - // Load 16 bytes of data; - #[allow(clippy::cast_ptr_alignment)] - let data: __m128i = _mm_loadu_si128(string.as_ptr().add(idx) as *const __m128i); - // Test the data against being backslash and quote. - let bs_or_quote = - _mm_or_si128(_mm_cmpeq_epi8(data, backslash), _mm_cmpeq_epi8(data, quote)); - // Now mask the data with the quote range (0x1F). - let in_quote_range = _mm_and_si128(data, lower_quote_range); - // then test of the data is unchanged. aka: xor it with the - // Any field that was inside the quote range it will be zero - // now. - let is_unchanged = _mm_xor_si128(data, in_quote_range); - let in_range = _mm_cmpeq_epi8(is_unchanged, zero); - let quote_bits = _mm_movemask_epi8(_mm_or_si128(bs_or_quote, in_range)); - if quote_bits != 0 { - let quote_dist = quote_bits.trailing_zeros() as usize; - stry!(self.get_writer().write_all(&string[0..idx + quote_dist])); - let ch = string[idx + quote_dist]; - match ESCAPED[ch as usize] { - b'u' => stry!(write!(self.get_writer(), "\\u{:04x}", ch)), - - escape => stry!(self.write(&[b'\\', escape])), - }; - string = &string[idx + quote_dist + 1..]; - idx = 0; - len = string.len(); - } else { - idx += 16; - } - } - stry!(self.get_writer().write_all(&string[0..idx])); - string = &string[idx..]; + stry!(write_str_simd(self.get_writer(), &mut string, &mut len, &mut idx)); } // Legacy code to handle the remainder of the code for (index, ch) in string.iter().enumerate() { From ebf9ca9bfc6147bbd4edff17301ca136bd4725d1 Mon Sep 17 00:00:00 2001 From: Heinz Gies Date: Sat, 17 Aug 2019 00:33:47 +0200 Subject: [PATCH 7/9] Update extq intrinsics --- src/lib.rs | 3 +- src/neon/intrinsics.rs | 219 ++++++++++++++++++++++++++++++++++++++--- src/neon/simd_llvm.rs | 4 +- 3 files changed, 210 insertions(+), 16 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index b68deadb..d76270d8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,7 +9,8 @@ platform_intrinsics, stmt_expr_attributes, simd_ffi, - link_llvm_intrinsics + link_llvm_intrinsics, + rustc_attrs ) )] diff --git a/src/neon/intrinsics.rs b/src/neon/intrinsics.rs index c5c98cb6..2221a419 100644 --- a/src/neon/intrinsics.rs +++ b/src/neon/intrinsics.rs @@ -1,7 +1,7 @@ //use std::arch:: -use crate::neon::simd_llvm; - +use crate::neon::simd_llvm::{self, *}; +use std::hint::unreachable_unchecked; use std::mem; use core; @@ -67,21 +67,214 @@ pub unsafe fn vnegq_s8(a: int8x16_t) -> int8x16_t { } +/// Extract vector from pair of vectors +//uint8x16_t vextq_s8 (uint8x16_t a, uint8x16_t b, const int n) #[inline] -fn rotate_(a: u128, b: u128, n: u128) -> u128 { - let az = a >> (n * 8); - let bz = b << (128 - (n * 8)); - az | bz -} - -#[inline] -pub unsafe fn vextq_u8(a: uint8x16_t, b: uint8x16_t, n: u8) -> uint8x16_t { - mem::transmute(rotate_(mem::transmute(a), mem::transmute(b), n as u128)) +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_args_required_const(2)] +pub unsafe fn vextq_u8(a: uint8x16_t, b: uint8x16_t, n: i32) -> uint8x16_t { + if n < 0 || n > 15 { + unreachable_unchecked(); + }; + match n & 0b1111 { + 0 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), + 1 => simd_shuffle16( + a, + b, + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ), + 2 => simd_shuffle16( + a, + b, + [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], + ), + 3 => simd_shuffle16( + a, + b, + [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18], + ), + 4 => simd_shuffle16( + a, + b, + [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19], + ), + 5 => simd_shuffle16( + a, + b, + [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], + ), + 6 => simd_shuffle16( + a, + b, + [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], + ), + 7 => simd_shuffle16( + a, + b, + [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22], + ), + 8 => simd_shuffle16( + a, + b, + [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], + ), + 9 => simd_shuffle16( + a, + b, + [ + 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + ], + ), + 10 => simd_shuffle16( + a, + b, + [ + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + ], + ), + 11 => simd_shuffle16( + a, + b, + [ + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, + ], + ), + 12 => simd_shuffle16( + a, + b, + [ + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, + ], + ), + 13 => simd_shuffle16( + a, + b, + [ + 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + ], + ), + 14 => simd_shuffle16( + a, + b, + [ + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + ], + ), + 15 => simd_shuffle16( + a, + b, + [ + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, + ], + ), + _ => unreachable_unchecked(), + } } +/// Extract vector from pair of vectors +//int8x16_t vextq_s8 (int8x16_t a, int8x16_t b, const int n) #[inline] -pub unsafe fn vextq_s8(a: int8x16_t, b: int8x16_t, n: u8) -> int8x16_t { - mem::transmute(rotate_(mem::transmute(a), mem::transmute(b), n as u128)) +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_args_required_const(2)] +pub unsafe fn vextq_s8(a: int8x16_t, b: int8x16_t, n: i32) -> int8x16_t { + if n < 0 || n > 15 { + unreachable_unchecked(); + }; + match n & 0b1111 { + 0 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), + 1 => simd_shuffle16( + a, + b, + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ), + 2 => simd_shuffle16( + a, + b, + [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], + ), + 3 => simd_shuffle16( + a, + b, + [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18], + ), + 4 => simd_shuffle16( + a, + b, + [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19], + ), + 5 => simd_shuffle16( + a, + b, + [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], + ), + 6 => simd_shuffle16( + a, + b, + [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], + ), + 7 => simd_shuffle16( + a, + b, + [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22], + ), + 8 => simd_shuffle16( + a, + b, + [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], + ), + 9 => simd_shuffle16( + a, + b, + [ + 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + ], + ), + 10 => simd_shuffle16( + a, + b, + [ + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + ], + ), + 11 => simd_shuffle16( + a, + b, + [ + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, + ], + ), + 12 => simd_shuffle16( + a, + b, + [ + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, + ], + ), + 13 => simd_shuffle16( + a, + b, + [ + 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + ], + ), + 14 => simd_shuffle16( + a, + b, + [ + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + ], + ), + 15 => simd_shuffle16( + a, + b, + [ + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, + ], + ), + _ => unreachable_unchecked(), + } } #[inline] diff --git a/src/neon/simd_llvm.rs b/src/neon/simd_llvm.rs index 6e60b63c..4edf41f7 100644 --- a/src/neon/simd_llvm.rs +++ b/src/neon/simd_llvm.rs @@ -9,7 +9,7 @@ extern "platform-intrinsic" { // pub fn simd_shuffle2(x: T, y: T, idx: [u32; 2]) -> U; // pub fn simd_shuffle4(x: T, y: T, idx: [u32; 4]) -> U; // pub fn simd_shuffle8(x: T, y: T, idx: [u32; 8]) -> U; -// pub fn simd_shuffle16(x: T, y: T, idx: [u32; 16]) -> U; + pub fn simd_shuffle16(x: T, y: T, idx: [u32; 16]) -> U; // pub fn simd_shuffle32(x: T, y: T, idx: [u32; 32]) -> U; // pub fn simd_shuffle64(x: T, y: T, idx: [u32; 64]) -> U; // pub fn simd_shuffle128(x: T, y: T, idx: [u32; 128]) -> U; @@ -51,4 +51,4 @@ extern "platform-intrinsic" { // // pub fn simd_fsqrt(a: T) -> T; // pub fn simd_fma(a: T, b: T, c: T) -> T; -} \ No newline at end of file +} From 4acbe1722e1ab8fad94db1c3585b332f52ac8fad Mon Sep 17 00:00:00 2001 From: "Heinz N. Gies" Date: Tue, 3 Sep 2019 22:12:43 +0200 Subject: [PATCH 8/9] Use simd-lite (#39) * Use simd-lite * Update badge * Update badge * Get rid of transmutes * Use NeonInit trait * vqsubq_u8 fix * vqsubq_u8 fix pt. 2 * use reexprted values from simd-lite --- .drone.yml | 4 +- Cargo.toml | 3 + README.md | 7 +- src/lib.rs | 8 +- src/neon/deser.rs | 15 +- src/neon/generator.rs | 4 +- src/neon/intrinsics.rs | 750 ----------------------------------------- src/neon/mod.rs | 3 - src/neon/simd.rs | 470 -------------------------- src/neon/simd_llvm.rs | 54 --- src/neon/stage1.rs | 40 ++- src/neon/utf8check.rs | 58 ++-- src/stringparse.rs | 6 - 13 files changed, 75 insertions(+), 1347 deletions(-) delete mode 100644 src/neon/intrinsics.rs delete mode 100644 src/neon/simd.rs delete mode 100644 src/neon/simd_llvm.rs diff --git a/.drone.yml b/.drone.yml index dbc33acb..fbf71755 100644 --- a/.drone.yml +++ b/.drone.yml @@ -60,5 +60,5 @@ steps: commands: - rustup default nightly - rustup update - - cargo clean && cargo +nightly build --verbose --all - - cargo +nightly test --verbose --all + - cargo clean && cargo +nightly build --verbose --all --features neon + - cargo +nightly test --verbose --all --features neon diff --git a/Cargo.toml b/Cargo.toml index 399f7c9d..5ab206b0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,6 +24,7 @@ jemallocator = { version = "0.3", optional = true } perfcnt = { version = "0.4", optional = true } getopts = { version = "0.2", optional = true } colored = { version = "1.7", optional = true } +simd-lite = { git = "https://github.com/simd-lite/simd-lite", optional = true, branch = "reexport" } @@ -45,6 +46,8 @@ harness = false [features] default = ["swar-number-parsing", "serde_impl"] +# Support for ARM NEON SIMD +neon = ["simd-lite"] # use 8 number at once parsing strategy swar-number-parsing = [] # serde compatibility diff --git a/README.md b/README.md index 91fad1c4..fc189a87 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,9 @@ # SIMD Json for Rust   [![Build Status]][drone.io] [![Windows Build Status]][appveyor.com] [![Latest Version]][crates.io] -[Build Status]: https://cloud.drone.io/api/badges/Licenser/simdjson-rs/status.svg -[drone.io]: https://cloud.drone.io/Licenser/simdjson-rs -[Windows Build Status]: https://ci.appveyor.com/api/projects/status/0kf0v6hj5v2gite9?svg=true + +[Build Status]: https://cloud.drone.io/api/badges/simd-lite/simdjson-rs/status.svg +[drone.io]: https://cloud.drone.io/simd-lite/simdjson-rs +[Windows Build Status]: https://ci.appveyor.com/api/projects/status/ffi2ese7dxse6pb8?svg=true [appveyor.com]: https://ci.appveyor.com/project/Licenser/simdjson-rs [Latest Version]: https://img.shields.io/crates/v/simd-json.svg [crates.io]: https://crates.io/crates/simd-json diff --git a/src/lib.rs b/src/lib.rs index d76270d8..6020c585 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -10,7 +10,7 @@ stmt_expr_attributes, simd_ffi, link_llvm_intrinsics, - rustc_attrs + rustc_attrs, ) )] @@ -111,11 +111,11 @@ pub use crate::sse42::deser::*; #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), not(target_feature = "avx2")))] use crate::sse42::stage1::SIMDJSON_PADDING; -#[cfg(target_feature = "neon")] +#[cfg(all(target_feature = "neon", feature = "neon"))] mod neon; -#[cfg(target_feature = "neon")] +#[cfg(all(target_feature = "neon", feature = "neon"))] pub use crate::neon::deser::*; -#[cfg(target_feature = "neon")] +#[cfg(all(target_feature = "neon", feature = "neon"))] use crate::neon::stage1::SIMDJSON_PADDING; mod stage2; diff --git a/src/neon/deser.rs b/src/neon/deser.rs index 5d70b7af..6e9cd14d 100644 --- a/src/neon/deser.rs +++ b/src/neon/deser.rs @@ -1,11 +1,10 @@ -pub use crate::error::{Error, ErrorType}; -pub use crate::Deserializer; -pub use crate::Result; -pub use crate::neon::stage1::*; -pub use crate::neon::utf8check::*; -pub use crate::neon::intrinsics::*; -pub use crate::stringparse::*; +use crate::error::{ErrorType}; +use crate::Deserializer; +use crate::Result; +use crate::stringparse::*; +use simd_lite::aarch64::*; +use crate::neon::stage1::*; impl<'de> Deserializer<'de> { #[cfg_attr(not(feature = "no-inline"), inline(always))] @@ -196,4 +195,4 @@ impl<'de> Deserializer<'de> { } } } -} \ No newline at end of file +} diff --git a/src/neon/generator.rs b/src/neon/generator.rs index 6c8cf358..61f3dc95 100644 --- a/src/neon/generator.rs +++ b/src/neon/generator.rs @@ -1,7 +1,7 @@ use crate::value::generator::ESCAPED; use std::io; -use crate::neon::intrinsics::*; use crate::neon::stage1::neon_movemask; +use simd_lite::aarch64::*; #[inline(always)] pub unsafe fn write_str_simd(writer: &mut W, string: &mut &[u8], len: &mut usize, idx: &mut usize) -> io::Result<()> where W: std::io::Write { @@ -23,7 +23,7 @@ pub unsafe fn write_str_simd(writer: &mut W, string: &mut &[u8], len: &mut us // then test of the data is unchanged. aka: xor it with the // Any field that was inside the quote range it will be zero // now. - let is_unchanged = vxorrq_u8(data, in_quote_range); + let is_unchanged = veorq_u8(data, in_quote_range); let in_range = vceqq_u8(is_unchanged, zero); let quote_bits = neon_movemask(vorrq_u8(bs_or_quote, in_range)); if quote_bits != 0 { diff --git a/src/neon/intrinsics.rs b/src/neon/intrinsics.rs deleted file mode 100644 index 2221a419..00000000 --- a/src/neon/intrinsics.rs +++ /dev/null @@ -1,750 +0,0 @@ -//use std::arch:: - -use crate::neon::simd_llvm::{self, *}; -use std::hint::unreachable_unchecked; -use std::mem; -use core; - -#[allow(unused)] -macro_rules! types { - ($( - $(#[$doc:meta])* - pub struct $name:ident($($fields:tt)*); - )*) => ($( - $(#[$doc])* - #[derive(Copy, Clone, Debug)] - #[allow(non_camel_case_types)] - #[repr(simd)] - #[allow(clippy::missing_inline_in_public_items)] - pub struct $name($($fields)*); - )*) -} - -#[allow(non_camel_case_types)] -pub type poly64_t = i64; - -#[allow(improper_ctypes)] -extern "C" { - #[link_name = "llvm.aarch64.neon.addp.v16u8"] - fn vpaddq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; - #[link_name = "llvm.aarch64.neon.pmull64"] - fn vmull_p64_(a: i64, b: i64) -> int8x16_t; - #[link_name = "llvm.aarch64.neon.uqxtn.v2u32"] - fn vqmovn_u64_(a: uint64x2_t) -> uint32x2_t; - #[link_name = "llvm.aarch64.neon.uqsub.v16u8"] - fn vqsubq_u8_(a: uint8x16_t, a: uint8x16_t) -> uint8x16_t; - #[link_name = "llvm.aarch64.neon.uqsub.v16i8"] - fn vqsubq_s8_(a: int8x16_t, a: int8x16_t) -> int8x16_t; -} - -#[inline] -unsafe fn vaddq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { - simd_llvm::simd_add(mem::transmute(a), mem::transmute(b)) -} - -#[inline] -unsafe fn vaddq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t { - simd_llvm::simd_add(mem::transmute(a), mem::transmute(b)) -} - -#[inline] -unsafe fn vaddq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t { - simd_llvm::simd_add(mem::transmute(a), mem::transmute(b)) -} - -#[inline] -pub unsafe fn vnegq_u8(a: uint8x16_t) -> uint8x16_t { - let x: u128 = mem::transmute(a); - let nx = !x; - mem::transmute(nx) -} - -#[inline] -pub unsafe fn vnegq_s8(a: int8x16_t) -> int8x16_t { - let x: u128 = mem::transmute(a); - let nx = !x; - mem::transmute(nx) -} - - -/// Extract vector from pair of vectors -//uint8x16_t vextq_s8 (uint8x16_t a, uint8x16_t b, const int n) -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[rustc_args_required_const(2)] -pub unsafe fn vextq_u8(a: uint8x16_t, b: uint8x16_t, n: i32) -> uint8x16_t { - if n < 0 || n > 15 { - unreachable_unchecked(); - }; - match n & 0b1111 { - 0 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), - 1 => simd_shuffle16( - a, - b, - [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], - ), - 2 => simd_shuffle16( - a, - b, - [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], - ), - 3 => simd_shuffle16( - a, - b, - [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18], - ), - 4 => simd_shuffle16( - a, - b, - [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19], - ), - 5 => simd_shuffle16( - a, - b, - [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], - ), - 6 => simd_shuffle16( - a, - b, - [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], - ), - 7 => simd_shuffle16( - a, - b, - [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22], - ), - 8 => simd_shuffle16( - a, - b, - [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], - ), - 9 => simd_shuffle16( - a, - b, - [ - 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - ], - ), - 10 => simd_shuffle16( - a, - b, - [ - 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, - ], - ), - 11 => simd_shuffle16( - a, - b, - [ - 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, - ], - ), - 12 => simd_shuffle16( - a, - b, - [ - 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, - ], - ), - 13 => simd_shuffle16( - a, - b, - [ - 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, - ], - ), - 14 => simd_shuffle16( - a, - b, - [ - 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, - ], - ), - 15 => simd_shuffle16( - a, - b, - [ - 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, - ], - ), - _ => unreachable_unchecked(), - } -} - -/// Extract vector from pair of vectors -//int8x16_t vextq_s8 (int8x16_t a, int8x16_t b, const int n) -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[rustc_args_required_const(2)] -pub unsafe fn vextq_s8(a: int8x16_t, b: int8x16_t, n: i32) -> int8x16_t { - if n < 0 || n > 15 { - unreachable_unchecked(); - }; - match n & 0b1111 { - 0 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), - 1 => simd_shuffle16( - a, - b, - [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], - ), - 2 => simd_shuffle16( - a, - b, - [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], - ), - 3 => simd_shuffle16( - a, - b, - [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18], - ), - 4 => simd_shuffle16( - a, - b, - [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19], - ), - 5 => simd_shuffle16( - a, - b, - [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], - ), - 6 => simd_shuffle16( - a, - b, - [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], - ), - 7 => simd_shuffle16( - a, - b, - [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22], - ), - 8 => simd_shuffle16( - a, - b, - [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], - ), - 9 => simd_shuffle16( - a, - b, - [ - 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - ], - ), - 10 => simd_shuffle16( - a, - b, - [ - 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, - ], - ), - 11 => simd_shuffle16( - a, - b, - [ - 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, - ], - ), - 12 => simd_shuffle16( - a, - b, - [ - 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, - ], - ), - 13 => simd_shuffle16( - a, - b, - [ - 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, - ], - ), - 14 => simd_shuffle16( - a, - b, - [ - 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, - ], - ), - 15 => simd_shuffle16( - a, - b, - [ - 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, - ], - ), - _ => unreachable_unchecked(), - } -} - -#[inline] -pub unsafe fn vmull_p64(a: poly64_t, b: poly64_t) -> poly128_t { - mem::transmute(vmull_p64_(mem::transmute(a), mem::transmute(b))) -} - -#[inline] -pub fn vpaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { - unsafe { vpaddq_u8_(a, b) } -} - -#[inline] -pub unsafe fn vshrq_n_u8(a: uint8x16_t, n: u8) -> uint8x16_t { - uint8x16_t( - a.0 >> n, - a.1 >> n, - a.2 >> n, - a.3 >> n, - a.4 >> n, - a.5 >> n, - a.6 >> n, - a.7 >> n, - a.8 >> n, - a.9 >> n, - a.10 >> n, - a.11 >> n, - a.12 >> n, - a.13 >> n, - a.14 >> n, - a.15 >> n, - ) -} - -types! { - /// ARM-specific 64-bit wide vector of eight packed `i8`. - pub struct int8x8_t(i8, i8, i8, i8, i8, i8, i8, i8); - /// ARM-specific 64-bit wide vector of eight packed `u8`. - pub struct uint8x8_t(u8, u8, u8, u8, u8, u8, u8, u8); - /// ARM-specific 64-bit wide polynomial vector of eight packed `u8`. - pub struct poly8x8_t(u8, u8, u8, u8, u8, u8, u8, u8); - /// ARM-specific 64-bit wide vector of four packed `i16`. - pub struct int16x4_t(i16, i16, i16, i16); - /// ARM-specific 64-bit wide vector of four packed `u16`. - pub struct uint16x4_t(u16, u16, u16, u16); - /// ARM-specific 64-bit wide vector of four packed `u16`. - pub struct poly16x4_t(u16, u16, u16, u16); - /// ARM-specific 64-bit wide vector of two packed `i32`. - pub struct int32x2_t(i32, i32); - /// ARM-specific 64-bit wide vector of two packed `u32`. - pub struct uint32x2_t(u32, u32); - /// ARM-specific 64-bit wide vector of two packed `f32`. - pub struct float32x2_t(f32, f32); - /// ARM-specific 64-bit wide vector of one packed `i64`. - pub struct int64x1_t(i64); - /// ARM-specific 64-bit wide vector of one packed `u64`. - pub struct uint64x1_t(u64); - /// ARM-specific 128-bit wide vector of sixteen packed `i8`. - pub struct int8x16_t( - i8, i8 ,i8, i8, i8, i8 ,i8, i8, - i8, i8 ,i8, i8, i8, i8 ,i8, i8, - ); - /// ARM-specific 128-bit wide vector of sixteen packed `u8`. - pub struct uint8x16_t( - u8, u8 ,u8, u8, u8, u8 ,u8, u8, - u8, u8 ,u8, u8, u8, u8 ,u8, u8, - ); - /// ARM-specific 128-bit wide vector of sixteen packed `u8`. - pub struct poly8x16_t( - u8, u8, u8, u8, u8, u8, u8, u8, - u8, u8, u8, u8, u8, u8, u8, u8 - ); - /// ARM-specific 128-bit wide vector of eight packed `i16`. - pub struct int16x8_t(i16, i16, i16, i16, i16, i16, i16, i16); - /// ARM-specific 128-bit wide vector of eight packed `u16`. - pub struct uint16x8_t(u16, u16, u16, u16, u16, u16, u16, u16); - /// ARM-specific 128-bit wide vector of eight packed `u16`. - pub struct poly16x8_t(u16, u16, u16, u16, u16, u16, u16, u16); - /// ARM-specific 128-bit wide vector of four packed `i32`. - pub struct int32x4_t(i32, i32, i32, i32); - /// ARM-specific 128-bit wide vector of four packed `u32`. - pub struct uint32x4_t(u32, u32, u32, u32); - /// ARM-specific 128-bit wide vector of four packed `f32`. - pub struct float32x4_t(f32, f32, f32, f32); - /// ARM-specific 128-bit wide vector of two packed `i64`. - pub struct int64x2_t(i64, i64); - /// ARM-specific 128-bit wide vector of two packed `u64`. - pub struct uint64x2_t(u64, u64); - /// ARM-specific 128-bit wide vector of one packed `i128`. - pub struct poly128_t(i128); // FIXME: check this! -} - -impl uint8x16_t { - #[inline] - pub fn new(a: u8, b: u8, c: u8, d: u8, e: u8, f: u8, g: u8, h: u8, i: u8, j: u8, k: u8, l: u8, m: u8, n: u8, o: u8, p: u8) -> uint8x16_t { - uint8x16_t(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) - } -} - -impl int8x16_t { - #[inline] - pub fn new(a: i8, b: i8, c: i8, d: i8, e: i8, f: i8, g: i8, h: i8, i: i8, j: i8, k: i8, l: i8, m: i8, n: i8, o: i8, p: i8) -> int8x16_t { - int8x16_t(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) - } -} - -impl int32x4_t { - #[inline] - pub fn new(a: i32, b: i32, c: i32, d: i32) -> int32x4_t { - int32x4_t(a, b, c, d) - } -} - -//#[inline] -//pub fn add_overflow(a: u64, b: u64, out: &mut u64) -> bool { -// let (carry, did_carry) = a.overflowing_add(b); -// *out = carry; -// did_carry -//} - -#[inline] -pub unsafe fn vld1q_s8(addr: *const i8) -> int8x16_t { - *(addr as *const int8x16_t) -} - -#[inline] -pub unsafe fn vld1q_u8(addr: *const u8) -> uint8x16_t { - *(addr as *const uint8x16_t) -} - -#[inline] -pub unsafe fn vst1q_u8(addr: *mut u8, val: uint8x16_t) { - std::ptr::write(addr as *mut uint8x16_t, val); -} - -macro_rules! aarch64_simd_2 { - ($name: ident, $type: ty, $simd_fn: ident, $intrarm: ident, $intraarch: ident) => { - aarch64_simd_2!($name, $type, $type, $simd_fn, $intrarm, $intraarch); - }; - ($name: ident, $type: ty, $res: ty, $simd_fn: ident, $intrarm: ident, $intraarch: ident) => { - #[inline] - pub fn $name(a: $type, b: $type) -> $res { - unsafe { simd_llvm::$simd_fn(a, b) } - } - } -} - -macro_rules! aarch64_simd_ceq { - ($name: ident, $type: ty, $res: ty) => { - /// Compare bitwise Equal (vector) - aarch64_simd_2!($name, $type, $res, simd_eq, cmeq, cmeq); - }; -} - -aarch64_simd_ceq!(vceq_s8, int8x8_t, uint8x8_t); -aarch64_simd_ceq!(vceqq_s8, int8x16_t, uint8x16_t); -aarch64_simd_ceq!(vceq_s16, int16x4_t, uint16x4_t); -aarch64_simd_ceq!(vceqq_s16, int16x8_t, uint16x8_t); -aarch64_simd_ceq!(vceq_s32, int32x2_t, uint32x2_t); -aarch64_simd_ceq!(vceqq_s32, int32x4_t, uint32x4_t); -aarch64_simd_ceq!(vceq_u8, uint8x8_t, uint8x8_t); -aarch64_simd_ceq!(vceqq_u8, uint8x16_t, uint8x16_t); -aarch64_simd_ceq!(vceq_u16, uint16x4_t, uint16x4_t); -aarch64_simd_ceq!(vceqq_u16, uint16x8_t, uint16x8_t); -aarch64_simd_ceq!(vceq_u32, uint32x2_t, uint32x2_t); -aarch64_simd_ceq!(vceqq_u32, uint32x4_t, uint32x4_t); -aarch64_simd_2!(vceq_f32, float32x2_t, uint32x2_t, simd_eq, fcmeq, fcmeq); -aarch64_simd_2!(vceqq_f32, float32x4_t, uint32x4_t, simd_eq, fcmeq, fcmeq); -aarch64_simd_ceq!(vceq_p8, poly8x8_t, poly8x8_t); -aarch64_simd_ceq!(vceqq_p8, poly8x16_t, poly8x16_t); - -macro_rules! aarch64_simd_cgt { - ($name:ident, $type:ty, $res:ty) => { - /// Compare signed Greater than (vector) - aarch64_simd_2!($name, $type, $res, simd_gt, cmgt, cmgt); - }; -} - -//macro_rules! aarch64_simd_cgtu { -// ($name: ident, $type: ty) => { -// /// Compare Greater than (vector) -// aarch64_simd_2!($name, $type, simd_gt, cmhi); -// }; -//} - -aarch64_simd_cgt!(vcgt_s8, int8x8_t, uint8x8_t); -aarch64_simd_cgt!(vcgtq_s8, int8x16_t, uint8x16_t); -aarch64_simd_cgt!(vcgt_s16, int16x4_t, uint16x4_t); -aarch64_simd_cgt!(vcgtq_s16, int16x8_t, uint16x8_t); -aarch64_simd_cgt!(vcgt_s32, int32x2_t, uint32x2_t); -aarch64_simd_cgt!(vcgtq_s32, int32x4_t, uint32x4_t); - -//aarch64_simd_cgtu!(vcgtq_u8, uint8x16_t); -//aarch64_simd_cgt!(vcgt_s64, int64x1_t); -//aarch64_simd_cgt!(vcgtq_s64, int64x2_t); -//aarch64_simd_cgtu!(vcgt_u64, uint64x1_t); -//aarch64_simd_cgtu!(vcgtq_u64, uint64x2_t); - -macro_rules! aarch64_simd_clt { - ($name:ident, $type:ty, $res:ty) => { - /// Compare signed Lesser than (vector) - aarch64_simd_2!($name, $type, $res, simd_lt, cmgt, cmgt); - }; -} - -//macro_rules! aarch64_simd_cltu { -//( $ name: ident, $ type: ty) => { -///// Compare Lesser than (vector) -//aarch64_simd_2 ! ( $ name, $ type, simd_lt, cmhi); -//}; -//} - -aarch64_simd_clt!(vclt_s8, int8x8_t, uint8x8_t); -aarch64_simd_clt!(vcltq_s8, int8x16_t, uint8x16_t); -aarch64_simd_clt!(vclt_s16, int16x4_t, uint16x4_t); -aarch64_simd_clt!(vcltq_s16, int16x8_t, uint16x8_t); -aarch64_simd_clt!(vclt_s32, int32x2_t, uint32x2_t); -aarch64_simd_clt!(vcltq_s32, int32x4_t, uint32x4_t); - -//arm_simd_cltu!(vclt_u8, uint8x8_t); -//arm_simd_cltu!(vcltq_u8, uint8x16_t); -//arm_simd_cltu!(vclt_u16, uint16x4_t); -//arm_simd_cltu!(vcltq_u16, uint16x8_t); -//arm_simd_cltu!(vclt_u32, uint32x2_t); -//arm_simd_cltu!(vcltq_u32, uint32x4_t); - -macro_rules! aarch64_simd_cge { - ($name:ident, $type:ty, $res:ty) => { - /// Compare signed Greater than equals (vector) - aarch64_simd_2!($name, $type, $res, simd_ge, cmge, cmge); - }; -} - -//macro_rules! aarch64_simd_cgeu { -//( $ name: ident, $ type: ty) => { -///// Compare Greater than (vector) -//aarch64_simd_2 ! ( $ name, $ type, simd_ge, cmhs); -//}; -//} - -aarch64_simd_cge!(vcge_s8, int8x8_t, uint8x8_t); -aarch64_simd_cge!(vcgeq_s8, int8x16_t, uint8x16_t); -aarch64_simd_cge!(vcge_s16, int16x4_t, uint16x4_t); -aarch64_simd_cge!(vcgeq_s16, int16x8_t, uint16x8_t); -aarch64_simd_cge!(vcge_s32, int32x2_t, uint32x2_t); -aarch64_simd_cge!(vcgeq_s32, int32x4_t, uint32x4_t); -//arm_simd_cgeu!(vcge_u8, uint8x8_t); -//arm_simd_cgeu!(vcgeq_u8, uint8x16_t); -//arm_simd_cgeu!(vcge_u16, uint16x4_t); -//arm_simd_cgeu!(vcgeq_u16, uint16x8_t); -//arm_simd_cgeu!(vcge_u32, uint32x2_t); -//arm_simd_cgeu!(vcgeq_u32, uint32x4_t); - -macro_rules! aarch64_simd_cle { - ($name:ident, $type:ty, $res:ty) => { - /// Compare signed Lesser than equals (vector) - aarch64_simd_2!($name, $type, $res, simd_le, cmge, cmge); - }; -} - -//macro_rules! aarch64_simd_cleu { -//( $ name: ident, $ type: ty) => { -///// Compare Lesser than (vector) -//aarch64_simd_2 ! ( $ name, $ type, simd_le, cmhs); -//}; -//} - -aarch64_simd_cle!(vcle_s8, int8x8_t, uint8x8_t); -aarch64_simd_cle!(vcleq_s8, int8x16_t, uint8x16_t); -aarch64_simd_cle!(vcle_s16, int16x4_t, uint16x4_t); -aarch64_simd_cle!(vcleq_s16, int16x8_t, uint16x8_t); -aarch64_simd_cle!(vcle_s32, int32x2_t, uint32x2_t); -aarch64_simd_cle!(vcleq_s32, int32x4_t, uint32x4_t); -//arm_simd_cleu!(vcle_u8, uint8x8_t); -aarch64_simd_cle!(vcleq_u8, uint8x16_t, uint8x16_t); -//arm_simd_cleu!(vcle_u16, uint16x4_t); -//arm_simd_cleu!(vcleq_u16, uint16x8_t); -//arm_simd_cleu!(vcle_u32, uint32x2_t); -//arm_simd_cleu!(vcleq_u32, uint32x4_t); - -#[inline] -pub fn vdupq_n_s8(a: i8) -> int8x16_t { - int8x16_t(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) -} - -#[inline] -pub fn zeroi8x16() -> int8x16_t { - int8x16_t(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00) -} - -#[inline] -pub fn vdupq_n_u8(a: u8) -> uint8x16_t { - uint8x16_t(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) -} - -#[inline] -pub fn vmovq_n_u8(a: u8) -> uint8x16_t { - uint8x16_t(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) -} - -#[inline] -pub fn vmovq_n_s8(a: i8) -> int8x16_t { - int8x16_t(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) -} - -#[inline] -pub fn zerou8x16() -> uint8x16_t { - uint8x16_t(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00) -} - -#[inline] -pub unsafe fn vaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { - mem::transmute(vaddq_u8_(mem::transmute(a), mem::transmute(b))) -} - -#[inline] -pub unsafe fn vaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { - mem::transmute(vaddq_s8_(mem::transmute(a), mem::transmute(b))) -} - -#[inline] -pub unsafe fn vaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { - mem::transmute(vaddq_s32_(mem::transmute(a), mem::transmute(b))) -} - -#[inline] -pub fn vandq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { unsafe { simd_llvm::simd_and(a, b) } } -#[inline] -pub fn vandq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { unsafe { simd_llvm::simd_and(a, b) } } -#[inline] -pub fn vandq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { unsafe { simd_llvm::simd_and(a, b) } } -#[inline] -pub fn vorrq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { unsafe { simd_llvm::simd_or(a, b) } } -#[inline] -pub fn vorrq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { unsafe { simd_llvm::simd_or(a, b) } } -#[inline] -pub fn vxorrq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { unsafe { simd_llvm::simd_xor(a, b) } } -#[inline] -pub fn vxorrq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { unsafe { simd_llvm::simd_xor(a, b) } } - -macro_rules! arm_reinterpret { - ($name: ident, $from: ty, $to: ty) => { - // Vector reinterpret cast operation - #[inline] - pub fn $name(a: $from) -> $to { - unsafe { mem::transmute(a) } - } - }; -} - -arm_reinterpret!(vreinterpret_u64_u32, uint32x2_t, uint64x1_t); -arm_reinterpret!(vreinterpretq_u64_u32, uint32x4_t, uint64x2_t); -arm_reinterpret!(vreinterpretq_s8_u8, uint8x16_t, int8x16_t); -arm_reinterpret!(vreinterpretq_u16_u8, uint8x16_t, uint16x8_t); -arm_reinterpret!(vreinterpretq_u32_u8, uint8x16_t, uint32x4_t); -arm_reinterpret!(vreinterpretq_u64_u8, uint8x16_t, uint64x2_t); -arm_reinterpret!(vreinterpretq_u64_s8, int8x16_t, uint64x2_t); -arm_reinterpret!(vreinterpretq_u8_s8, int8x16_t, uint8x16_t); - -arm_reinterpret!(vreinterpretq_s16_s8, int8x16_t, int16x8_t); -arm_reinterpret!(vreinterpretq_s32_s8, int8x16_t, int32x4_t); -arm_reinterpret!(vreinterpretq_s64_s8, int8x16_t, int64x2_t); - -macro_rules! arm_vget_lane { - ($name: ident, $to: ty, $from: ty, $lanes: literal) => { - #[inline] - pub unsafe fn $name(v: $from, lane: u32) -> $ to { - simd_llvm::simd_extract(v, lane) - } - }; -} - -arm_vget_lane!(vgetq_lane_u16, u16, uint16x8_t, 7); -arm_vget_lane!(vgetq_lane_u32, u32, uint32x4_t, 3); -arm_vget_lane!(vgetq_lane_u64, u64, uint64x2_t, 1); -arm_vget_lane!(vget_lane_u64, u64, uint64x1_t, 0); - -arm_vget_lane!(vgetq_lane_s16, i16, int16x8_t, 7); -arm_vget_lane!(vgetq_lane_s32, i32, int32x4_t, 3); -arm_vget_lane!(vgetq_lane_s64, i64, int64x2_t, 1); -arm_vget_lane!(vget_lane_s64, i64, int64x1_t, 0); - -#[inline] -pub unsafe fn vqmovn_u64(a: uint64x2_t) -> uint32x2_t { - vqmovn_u64_(a) -} - -#[inline] -pub unsafe fn vqtbl1q_s8(t: int8x16_t, idx: uint8x16_t) -> int8x16_t { - mem::transmute(core::arch::aarch64::vqtbl1q_s8(mem::transmute(t), mem::transmute(idx))) -} - -#[inline] -pub unsafe fn vqtbl1q_u8(t: uint8x16_t, idx: uint8x16_t) -> uint8x16_t { - mem::transmute(core::arch::aarch64::vqtbl1q_s8(mem::transmute(t), mem::transmute(idx))) -} - -#[inline] -pub unsafe fn vqsubq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { - vqsubq_u8_(a, b) -} - -#[inline] -pub unsafe fn vqsubq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { - vqsubq_s8_(a, b) -} - -#[inline] -fn test_u8(a: u8, b: u8) -> u8 { - if a & b != 0 { - 0xFF - } else { - 0x00 - } -} - -#[inline] -pub unsafe fn vtstq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { - uint8x16_t( - test_u8(a.0, b.0), - test_u8(a.1, b.1), - test_u8(a.2, b.2), - test_u8(a.3, b.3), - test_u8(a.4, b.4), - test_u8(a.5, b.5), - test_u8(a.6, b.6), - test_u8(a.7, b.7), - test_u8(a.8, b.8), - test_u8(a.9, b.9), - test_u8(a.10, b.10), - test_u8(a.11, b.11), - test_u8(a.12, b.12), - test_u8(a.13, b.13), - test_u8(a.14, b.14), - test_u8(a.15, b.15), - ) -} - -#[inline] -fn test_s8(a: i8, b: i8) -> i8 { - if a & b != 0 { - -1 - } else { - 0x00 - } -} - -#[inline] -pub unsafe fn vtstq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { - int8x16_t( - test_s8(a.0, b.0), - test_s8(a.1, b.1), - test_s8(a.2, b.2), - test_s8(a.3, b.3), - test_s8(a.4, b.4), - test_s8(a.5, b.5), - test_s8(a.6, b.6), - test_s8(a.7, b.7), - test_s8(a.8, b.8), - test_s8(a.9, b.9), - test_s8(a.10, b.10), - test_s8(a.11, b.11), - test_s8(a.12, b.12), - test_s8(a.13, b.13), - test_s8(a.14, b.14), - test_s8(a.15, b.15), - ) -} - -#[inline] -pub unsafe fn vst1q_u32(addr: *mut u8, val: uint32x4_t) { - std::ptr::write(addr as *mut uint32x4_t, val) -} diff --git a/src/neon/mod.rs b/src/neon/mod.rs index f7868249..d8b9996a 100644 --- a/src/neon/mod.rs +++ b/src/neon/mod.rs @@ -2,6 +2,3 @@ pub mod deser; pub mod stage1; pub mod utf8check; pub mod generator; -mod simd; -mod simd_llvm; -mod intrinsics; \ No newline at end of file diff --git a/src/neon/simd.rs b/src/neon/simd.rs deleted file mode 100644 index 8a5a21fc..00000000 --- a/src/neon/simd.rs +++ /dev/null @@ -1,470 +0,0 @@ -#![allow(non_camel_case_types)] -#![allow(unused)] - -use crate::neon::simd_llvm; - -macro_rules! simd_ty { - ($id:ident [$ety:ident]: $($elem_ty:ident),* | $($elem_name:ident),*) => { - #[repr(simd)] - #[derive(Copy, Clone, Debug, PartialEq)] - pub(crate) struct $id($(pub $elem_ty),*); - - #[allow(clippy::use_self)] - impl $id { - #[inline] - pub(crate) const fn new($($elem_name: $elem_ty),*) -> Self { - $id($($elem_name),*) - } - // FIXME: Workaround rust@60637 - #[inline(always)] - pub(crate) const fn splat(value: $ety) -> Self { - $id($({ - #[allow(non_camel_case_types, dead_code)] - struct $elem_name; - value - }),*) - } - - // FIXME: Workaround rust@60637 - #[inline(always)] - pub(crate) fn extract(self, index: usize) -> $ety { - unsafe { - simd_llvm::simd_extract(self, index as u32) - } - } - } - } -} - -macro_rules! simd_m_ty { - ($id:ident [$ety:ident]: $($elem_ty:ident),* | $($elem_name:ident),*) => { - #[repr(simd)] - #[derive(Copy, Clone, Debug, PartialEq)] - pub(crate) struct $id($(pub $elem_ty),*); - - #[allow(clippy::use_self)] - impl $id { - #[inline] - const fn bool_to_internal(x: bool) -> $ety { - [0 as $ety, !(0 as $ety)][x as usize] - } - - #[inline] - pub(crate) const fn new($($elem_name: bool),*) -> Self { - $id($(Self::bool_to_internal($elem_name)),*) - } - - // FIXME: Workaround rust@60637 - #[inline(always)] - pub(crate) const fn splat(value: bool) -> Self { - $id($({ - #[allow(non_camel_case_types, dead_code)] - struct $elem_name; - Self::bool_to_internal(value) - }),*) - } - - // FIXME: Workaround rust@60637 - #[inline(always)] - pub(crate) fn extract(self, index: usize) -> bool { - let r: $ety = unsafe { - simd_llvm::simd_extract(self, index as u32) - }; - r != 0 - } - } - } -} - -// 16-bit wide types: - -simd_ty!(u8x2[u8]: u8, u8 | x0, x1); -simd_ty!(i8x2[i8]: i8, i8 | x0, x1); - -// 32-bit wide types: - -simd_ty!(u8x4[u8]: u8, u8, u8, u8 | x0, x1, x2, x3); -simd_ty!(u16x2[u16]: u16, u16 | x0, x1); - -simd_ty!(i8x4[i8]: i8, i8, i8, i8 | x0, x1, x2, x3); -simd_ty!(i16x2[i16]: i16, i16 | x0, x1); - -// 64-bit wide types: - -simd_ty!(u8x8[u8]: - u8, u8, u8, u8, u8, u8, u8, u8 - | x0, x1, x2, x3, x4, x5, x6, x7); -simd_ty!(u16x4[u16]: u16, u16, u16, u16 | x0, x1, x2, x3); -simd_ty!(u32x2[u32]: u32, u32 | x0, x1); -simd_ty!(u64x1[u64]: u64 | x1); - -simd_ty!(i8x8[i8]: - i8, i8, i8, i8, i8, i8, i8, i8 - | x0, x1, x2, x3, x4, x5, x6, x7); -simd_ty!(i16x4[i16]: i16, i16, i16, i16 | x0, x1, x2, x3); -simd_ty!(i32x2[i32]: i32, i32 | x0, x1); -simd_ty!(i64x1[i64]: i64 | x1); - -simd_ty!(f32x2[f32]: f32, f32 | x0, x1); - -// 128-bit wide types: - -simd_ty!(u8x16[u8]: - u8, u8, u8, u8, u8, u8, u8, u8, - u8, u8, u8, u8, u8, u8, u8, u8 - | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 -); -simd_ty!(u16x8[u16]: - u16, u16, u16, u16, u16, u16, u16, u16 - | x0, x1, x2, x3, x4, x5, x6, x7); -simd_ty!(u32x4[u32]: u32, u32, u32, u32 | x0, x1, x2, x3); -simd_ty!(u64x2[u64]: u64, u64 | x0, x1); - -simd_ty!(i8x16[i8]: - i8, i8, i8, i8, i8, i8, i8, i8, - i8, i8, i8, i8, i8, i8, i8, i8 - | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 -); -simd_ty!(i16x8[i16]: - i16, i16, i16, i16, i16, i16, i16, i16 - | x0, x1, x2, x3, x4, x5, x6, x7); -simd_ty!(i32x4[i32]: i32, i32, i32, i32 | x0, x1, x2, x3); -simd_ty!(i64x2[i64]: i64, i64 | x0, x1); - -simd_ty!(f32x4[f32]: f32, f32, f32, f32 | x0, x1, x2, x3); -simd_ty!(f64x2[f64]: f64, f64 | x0, x1); - -simd_m_ty!(m8x16[i8]: - i8, i8, i8, i8, i8, i8, i8, i8, - i8, i8, i8, i8, i8, i8, i8, i8 - | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 -); -simd_m_ty!(m16x8[i16]: - i16, i16, i16, i16, i16, i16, i16, i16 - | x0, x1, x2, x3, x4, x5, x6, x7); -simd_m_ty!(m32x4[i32]: i32, i32, i32, i32 | x0, x1, x2, x3); -simd_m_ty!(m64x2[i64]: i64, i64 | x0, x1); - -// 256-bit wide types: - -simd_ty!(u8x32[u8]: - u8, u8, u8, u8, u8, u8, u8, u8, - u8, u8, u8, u8, u8, u8, u8, u8, - u8, u8, u8, u8, u8, u8, u8, u8, - u8, u8, u8, u8, u8, u8, u8, u8 - | x0, x1, x2, x3, x4, x5, x6, x7, - x8, x9, x10, x11, x12, x13, x14, x15, - x16, x17, x18, x19, x20, x21, x22, x23, - x24, x25, x26, x27, x28, x29, x30, x31 -); -simd_ty!(u16x16[u16]: - u16, u16, u16, u16, u16, u16, u16, u16, - u16, u16, u16, u16, u16, u16, u16, u16 - | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 -); -simd_ty!(u32x8[u32]: - u32, u32, u32, u32, u32, u32, u32, u32 - | x0, x1, x2, x3, x4, x5, x6, x7); -simd_ty!(u64x4[u64]: u64, u64, u64, u64 | x0, x1, x2, x3); - -simd_ty!(i8x32[i8]: - i8, i8, i8, i8, i8, i8, i8, i8, - i8, i8, i8, i8, i8, i8, i8, i8, - i8, i8, i8, i8, i8, i8, i8, i8, - i8, i8, i8, i8, i8, i8, i8, i8 - | x0, x1, x2, x3, x4, x5, x6, x7, - x8, x9, x10, x11, x12, x13, x14, x15, - x16, x17, x18, x19, x20, x21, x22, x23, - x24, x25, x26, x27, x28, x29, x30, x31 -); -simd_ty!(i16x16[i16]: - i16, i16, i16, i16, i16, i16, i16, i16, - i16, i16, i16, i16, i16, i16, i16, i16 - | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 -); -simd_ty!(i32x8[i32]: - i32, i32, i32, i32, i32, i32, i32, i32 - | x0, x1, x2, x3, x4, x5, x6, x7); -simd_ty!(i64x4[i64]: i64, i64, i64, i64 | x0, x1, x2, x3); - -simd_ty!(f32x8[f32]: - f32, f32, f32, f32, f32, f32, f32, f32 | - x0, x1, x2, x3, x4, x5, x6, x7); - -// 512-bit wide types: - -simd_ty!(i32x16[i32]: - i32, i32, i32, i32, i32, i32, i32, i32, - i32, i32, i32, i32, i32, i32, i32, i32 - | x0, x1, x2, x3, x4, x5, x6, x7, - x8, x9, x10, x11, x12, x13, x14, x15); - -simd_ty!(i64x8[i64]: - i64, i64, i64, i64, i64, i64, i64, i64 - | x0, x1, x2, x3, x4, x5, x6, x7); - -#[allow(unused)] -#[macro_export] -macro_rules! constify_imm8 { - ($imm8:expr, $expand:ident) => { - #[allow(overflowing_literals)] - match ($imm8) & 0b1111_1111 { - 0 => $expand!(0), - 1 => $expand!(1), - 2 => $expand!(2), - 3 => $expand!(3), - 4 => $expand!(4), - 5 => $expand!(5), - 6 => $expand!(6), - 7 => $expand!(7), - 8 => $expand!(8), - 9 => $expand!(9), - 10 => $expand!(10), - 11 => $expand!(11), - 12 => $expand!(12), - 13 => $expand!(13), - 14 => $expand!(14), - 15 => $expand!(15), - 16 => $expand!(16), - 17 => $expand!(17), - 18 => $expand!(18), - 19 => $expand!(19), - 20 => $expand!(20), - 21 => $expand!(21), - 22 => $expand!(22), - 23 => $expand!(23), - 24 => $expand!(24), - 25 => $expand!(25), - 26 => $expand!(26), - 27 => $expand!(27), - 28 => $expand!(28), - 29 => $expand!(29), - 30 => $expand!(30), - 31 => $expand!(31), - 32 => $expand!(32), - 33 => $expand!(33), - 34 => $expand!(34), - 35 => $expand!(35), - 36 => $expand!(36), - 37 => $expand!(37), - 38 => $expand!(38), - 39 => $expand!(39), - 40 => $expand!(40), - 41 => $expand!(41), - 42 => $expand!(42), - 43 => $expand!(43), - 44 => $expand!(44), - 45 => $expand!(45), - 46 => $expand!(46), - 47 => $expand!(47), - 48 => $expand!(48), - 49 => $expand!(49), - 50 => $expand!(50), - 51 => $expand!(51), - 52 => $expand!(52), - 53 => $expand!(53), - 54 => $expand!(54), - 55 => $expand!(55), - 56 => $expand!(56), - 57 => $expand!(57), - 58 => $expand!(58), - 59 => $expand!(59), - 60 => $expand!(60), - 61 => $expand!(61), - 62 => $expand!(62), - 63 => $expand!(63), - 64 => $expand!(64), - 65 => $expand!(65), - 66 => $expand!(66), - 67 => $expand!(67), - 68 => $expand!(68), - 69 => $expand!(69), - 70 => $expand!(70), - 71 => $expand!(71), - 72 => $expand!(72), - 73 => $expand!(73), - 74 => $expand!(74), - 75 => $expand!(75), - 76 => $expand!(76), - 77 => $expand!(77), - 78 => $expand!(78), - 79 => $expand!(79), - 80 => $expand!(80), - 81 => $expand!(81), - 82 => $expand!(82), - 83 => $expand!(83), - 84 => $expand!(84), - 85 => $expand!(85), - 86 => $expand!(86), - 87 => $expand!(87), - 88 => $expand!(88), - 89 => $expand!(89), - 90 => $expand!(90), - 91 => $expand!(91), - 92 => $expand!(92), - 93 => $expand!(93), - 94 => $expand!(94), - 95 => $expand!(95), - 96 => $expand!(96), - 97 => $expand!(97), - 98 => $expand!(98), - 99 => $expand!(99), - 100 => $expand!(100), - 101 => $expand!(101), - 102 => $expand!(102), - 103 => $expand!(103), - 104 => $expand!(104), - 105 => $expand!(105), - 106 => $expand!(106), - 107 => $expand!(107), - 108 => $expand!(108), - 109 => $expand!(109), - 110 => $expand!(110), - 111 => $expand!(111), - 112 => $expand!(112), - 113 => $expand!(113), - 114 => $expand!(114), - 115 => $expand!(115), - 116 => $expand!(116), - 117 => $expand!(117), - 118 => $expand!(118), - 119 => $expand!(119), - 120 => $expand!(120), - 121 => $expand!(121), - 122 => $expand!(122), - 123 => $expand!(123), - 124 => $expand!(124), - 125 => $expand!(125), - 126 => $expand!(126), - 127 => $expand!(127), - 128 => $expand!(128), - 129 => $expand!(129), - 130 => $expand!(130), - 131 => $expand!(131), - 132 => $expand!(132), - 133 => $expand!(133), - 134 => $expand!(134), - 135 => $expand!(135), - 136 => $expand!(136), - 137 => $expand!(137), - 138 => $expand!(138), - 139 => $expand!(139), - 140 => $expand!(140), - 141 => $expand!(141), - 142 => $expand!(142), - 143 => $expand!(143), - 144 => $expand!(144), - 145 => $expand!(145), - 146 => $expand!(146), - 147 => $expand!(147), - 148 => $expand!(148), - 149 => $expand!(149), - 150 => $expand!(150), - 151 => $expand!(151), - 152 => $expand!(152), - 153 => $expand!(153), - 154 => $expand!(154), - 155 => $expand!(155), - 156 => $expand!(156), - 157 => $expand!(157), - 158 => $expand!(158), - 159 => $expand!(159), - 160 => $expand!(160), - 161 => $expand!(161), - 162 => $expand!(162), - 163 => $expand!(163), - 164 => $expand!(164), - 165 => $expand!(165), - 166 => $expand!(166), - 167 => $expand!(167), - 168 => $expand!(168), - 169 => $expand!(169), - 170 => $expand!(170), - 171 => $expand!(171), - 172 => $expand!(172), - 173 => $expand!(173), - 174 => $expand!(174), - 175 => $expand!(175), - 176 => $expand!(176), - 177 => $expand!(177), - 178 => $expand!(178), - 179 => $expand!(179), - 180 => $expand!(180), - 181 => $expand!(181), - 182 => $expand!(182), - 183 => $expand!(183), - 184 => $expand!(184), - 185 => $expand!(185), - 186 => $expand!(186), - 187 => $expand!(187), - 188 => $expand!(188), - 189 => $expand!(189), - 190 => $expand!(190), - 191 => $expand!(191), - 192 => $expand!(192), - 193 => $expand!(193), - 194 => $expand!(194), - 195 => $expand!(195), - 196 => $expand!(196), - 197 => $expand!(197), - 198 => $expand!(198), - 199 => $expand!(199), - 200 => $expand!(200), - 201 => $expand!(201), - 202 => $expand!(202), - 203 => $expand!(203), - 204 => $expand!(204), - 205 => $expand!(205), - 206 => $expand!(206), - 207 => $expand!(207), - 208 => $expand!(208), - 209 => $expand!(209), - 210 => $expand!(210), - 211 => $expand!(211), - 212 => $expand!(212), - 213 => $expand!(213), - 214 => $expand!(214), - 215 => $expand!(215), - 216 => $expand!(216), - 217 => $expand!(217), - 218 => $expand!(218), - 219 => $expand!(219), - 220 => $expand!(220), - 221 => $expand!(221), - 222 => $expand!(222), - 223 => $expand!(223), - 224 => $expand!(224), - 225 => $expand!(225), - 226 => $expand!(226), - 227 => $expand!(227), - 228 => $expand!(228), - 229 => $expand!(229), - 230 => $expand!(230), - 231 => $expand!(231), - 232 => $expand!(232), - 233 => $expand!(233), - 234 => $expand!(234), - 235 => $expand!(235), - 236 => $expand!(236), - 237 => $expand!(237), - 238 => $expand!(238), - 239 => $expand!(239), - 240 => $expand!(240), - 241 => $expand!(241), - 242 => $expand!(242), - 243 => $expand!(243), - 244 => $expand!(244), - 245 => $expand!(245), - 246 => $expand!(246), - 247 => $expand!(247), - 248 => $expand!(248), - 249 => $expand!(249), - 250 => $expand!(250), - 251 => $expand!(251), - 252 => $expand!(252), - 253 => $expand!(253), - 254 => $expand!(254), - _ => $expand!(255), - } - }; -} diff --git a/src/neon/simd_llvm.rs b/src/neon/simd_llvm.rs deleted file mode 100644 index 4edf41f7..00000000 --- a/src/neon/simd_llvm.rs +++ /dev/null @@ -1,54 +0,0 @@ -extern "platform-intrinsic" { - pub fn simd_eq(x: T, y: T) -> U; -// pub fn simd_ne(x: T, y: T) -> U; - pub fn simd_lt(x: T, y: T) -> U; - pub fn simd_le(x: T, y: T) -> U; - pub fn simd_gt(x: T, y: T) -> U; - pub fn simd_ge(x: T, y: T) -> U; -// -// pub fn simd_shuffle2(x: T, y: T, idx: [u32; 2]) -> U; -// pub fn simd_shuffle4(x: T, y: T, idx: [u32; 4]) -> U; -// pub fn simd_shuffle8(x: T, y: T, idx: [u32; 8]) -> U; - pub fn simd_shuffle16(x: T, y: T, idx: [u32; 16]) -> U; -// pub fn simd_shuffle32(x: T, y: T, idx: [u32; 32]) -> U; -// pub fn simd_shuffle64(x: T, y: T, idx: [u32; 64]) -> U; -// pub fn simd_shuffle128(x: T, y: T, idx: [u32; 128]) -> U; -// -// pub fn simd_insert(x: T, idx: u32, val: U) -> T; - pub fn simd_extract(x: T, idx: u32) -> U; -// -// pub fn simd_cast(x: T) -> U; -// - pub fn simd_add(x: T, y: T) -> T; -// pub fn simd_sub(x: T, y: T) -> T; -// pub fn simd_mul(x: T, y: T) -> T; -// pub fn simd_div(x: T, y: T) -> T; -// pub fn simd_shl(x: T, y: T) -> T; -// pub fn simd_shr(x: T, y: T) -> T; - pub fn simd_and(x: T, y: T) -> T; - pub fn simd_or(x: T, y: T) -> T; - pub fn simd_xor(x: T, y: T) -> T; -// -// pub fn simd_reduce_add_unordered(x: T) -> U; -// pub fn simd_reduce_mul_unordered(x: T) -> U; -// pub fn simd_reduce_add_ordered(x: T, acc: U) -> U; -// pub fn simd_reduce_mul_ordered(x: T, acc: U) -> U; -// pub fn simd_reduce_min(x: T) -> U; -// pub fn simd_reduce_max(x: T) -> U; -// pub fn simd_reduce_min_nanless(x: T) -> U; -// pub fn simd_reduce_max_nanless(x: T) -> U; -// pub fn simd_reduce_and(x: T) -> U; -// pub fn simd_reduce_or(x: T) -> U; -// pub fn simd_reduce_xor(x: T) -> U; -// pub fn simd_reduce_all(x: T) -> bool; -// pub fn simd_reduce_any(x: T) -> bool; -// -// pub fn simd_select(m: M, a: T, b: T) -> T; -// pub fn simd_select_bitmask(m: M, a: T, b: T) -> T; -// -// pub fn simd_fmin(a: T, b: T) -> T; -// pub fn simd_fmax(a: T, b: T) -> T; -// -// pub fn simd_fsqrt(a: T) -> T; -// pub fn simd_fma(a: T, b: T, c: T) -> T; -} diff --git a/src/neon/stage1.rs b/src/neon/stage1.rs index 45322cee..6c43946a 100644 --- a/src/neon/stage1.rs +++ b/src/neon/stage1.rs @@ -1,18 +1,18 @@ #![allow(dead_code)] -use crate::neon::intrinsics::*; use crate::neon::utf8check::*; use crate::*; - +use simd_lite::aarch64::*; +use simd_lite::NeonInit; use std::mem; // NEON-SPECIFIC macro_rules! bit_mask { () => { - uint8x16_t::new( - 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 + uint8x16_t::new( + [0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80] ) }; } @@ -52,8 +52,8 @@ unsafe fn compute_quote_mask(quote_bits: u64) -> u64 { vreinterpretq_u64_u8( mem::transmute( vmull_p64( - -1, - quote_bits as i64) + mem::transmute(-1 as i64), + mem::transmute(quote_bits as i64)) ) ), 0 @@ -104,7 +104,7 @@ impl Default for Utf8CheckingState { #[cfg_attr(not(feature = "no-inline"), inline)] fn default() -> Self { Utf8CheckingState { - has_error: vdupq_n_s8(0), + has_error: unsafe{vdupq_n_s8(0)}, previous: ProcessedUtfBytes::default(), } } @@ -129,7 +129,7 @@ unsafe fn check_utf8( // ascii too. We only check the byte that was just before simd_input. Nines // are arbitrary values. let verror: int8x16_t = int8x16_t::new( - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, + [9i8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1] ); state.has_error = vreinterpretq_s8_u8(vorrq_u8( vcgtq_s8( @@ -278,11 +278,11 @@ unsafe fn find_whitespace_and_structurals( // TODO: const? let low_nibble_mask: uint8x16_t = uint8x16_t::new( - 16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, + [16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0] ); // TODO: const? let high_nibble_mask: uint8x16_t = uint8x16_t::new( - 8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0, + [8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0] ); let structural_shufti_mask: uint8x16_t = vmovq_n_u8(0x7); @@ -338,12 +338,12 @@ fn flatten_bits(base: &mut Vec, idx: u32, mut bits: u64) { let mut l = base.len(); let idx_minus_64 = idx.wrapping_sub(64); let idx_64_v = unsafe { - int32x4_t::new( + mem::transmute::<_, int32x4_t>([ static_cast_i32!(idx_minus_64), static_cast_i32!(idx_minus_64), static_cast_i32!(idx_minus_64), static_cast_i32!(idx_minus_64), - ) + ]) }; // We're doing some trickery here. @@ -367,7 +367,7 @@ fn flatten_bits(base: &mut Vec, idx: u32, mut bits: u64) { let v3 = bits.trailing_zeros() as i32; bits &= bits.wrapping_sub(1); - let v: int32x4_t = int32x4_t::new(v0, v1, v2, v3); + let v: int32x4_t = mem::transmute([v0, v1, v2, v3]); let v: int32x4_t = vaddq_s32(idx_64_v, v); #[allow(clippy::cast_ptr_alignment)] std::ptr::write(base.as_mut_ptr().add(l) as *mut int32x4_t, v); @@ -419,6 +419,7 @@ fn finalize_structurals( } pub fn find_bs_bits_and_quote_bits(v0: uint8x16_t, v1: uint8x16_t) -> ParseStringHelper { + unsafe{ let quote_mask = vmovq_n_u8(b'"'); let bs_mask = vmovq_n_u8(b'\\'); let bit_mask = bit_mask!(); @@ -439,8 +440,9 @@ pub fn find_bs_bits_and_quote_bits(v0: uint8x16_t, v1: uint8x16_t) -> ParseStrin let sum0 = vpaddq_u8(sum0, sum0); ParseStringHelper { - bs_bits: unsafe { vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 0) }, - quote_bits: unsafe { vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 1) }, + bs_bits: vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 0) , + quote_bits: vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 1) , + } } } @@ -596,3 +598,9 @@ impl<'de> Deserializer<'de> { } } } + +// Holds backslashes and quotes locations. +pub struct ParseStringHelper { + pub bs_bits: u32, + pub quote_bits: u32, +} diff --git a/src/neon/utf8check.rs b/src/neon/utf8check.rs index 082183b1..74ef5319 100644 --- a/src/neon/utf8check.rs +++ b/src/neon/utf8check.rs @@ -1,4 +1,5 @@ -use crate::neon::intrinsics::*; +use simd_lite::aarch64::*; +use simd_lite::NeonInit; /* * legal utf-8 byte sequence @@ -39,62 +40,56 @@ fn check_smaller_than_0xf4(current_bytes: int8x16_t, has_error: &mut int8x16_t) *has_error = unsafe { vorrq_s8( *has_error, - vqsubq_s8(current_bytes, vdupq_n_s8(-12 /* 0xF4 */)) + vreinterpretq_s8_u8(vqsubq_u8(vreinterpretq_u8_s8(current_bytes), vdupq_n_u8(0xF4))) ) }; } macro_rules! nibbles_tbl { () => { - int8x16_t::new( - 1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII) + int8x16_t::new([ + 1i8, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII) 0, 0, 0, 0, // 10xx (continuation) 2, 2, // 110x 3, // 1110 4, // 1111, next should be 0 (not checked here) - ) + ]) }; } #[cfg_attr(not(feature = "no-inline"), inline)] -fn continuation_lengths(high_nibbles: int8x16_t) -> int8x16_t { - unsafe { +unsafe fn continuation_lengths(high_nibbles: int8x16_t) -> int8x16_t { vqtbl1q_s8( nibbles_tbl!(), vreinterpretq_u8_s8(high_nibbles), ) - } } #[cfg_attr(not(feature = "no-inline"), inline)] -fn carry_continuations(initial_lengths: int8x16_t, previous_carries: int8x16_t) -> int8x16_t { - unsafe { - let right1: int8x16_t = vqsubq_s8( - push_last_byte_of_a_to_b(previous_carries, initial_lengths), - vdupq_n_s8(1), - ); +unsafe fn carry_continuations(initial_lengths: int8x16_t, previous_carries: int8x16_t) -> int8x16_t { + let right1: int8x16_t = vreinterpretq_s8_u8(vqsubq_u8( + vreinterpretq_u8_s8(push_last_byte_of_a_to_b(previous_carries, initial_lengths)), + vdupq_n_u8(1), + )); let sum: int8x16_t = vaddq_s8(initial_lengths, right1); - let right2: int8x16_t = vqsubq_s8( - push_last_2bytes_of_a_to_b(previous_carries, sum), - vdupq_n_s8(2), - ); + let right2: int8x16_t = vreinterpretq_s8_u8(vqsubq_u8( + vreinterpretq_u8_s8(push_last_2bytes_of_a_to_b(previous_carries, sum)), + vdupq_n_u8(2), + )); vaddq_s8(sum, right2) - } } #[cfg_attr(not(feature = "no-inline"), inline)] -fn check_continuations(initial_lengths: int8x16_t, carries: int8x16_t, has_error: &mut int8x16_t) { +unsafe fn check_continuations(initial_lengths: int8x16_t, carries: int8x16_t, has_error: &mut int8x16_t) { // overlap || underlap // carry > length && length > 0 || !(carry > length) && !(length > 0) // (carries > length) == (lengths > 0) - { let overunder: uint8x16_t = vceqq_u8( vcgtq_s8(carries, initial_lengths), vcgtq_s8(initial_lengths, vdupq_n_s8(0)), ); *has_error = vorrq_s8(*has_error, vreinterpretq_s8_u8(overunder)); - } } // when 0xED is found, next byte must be no larger than 0x9F @@ -106,7 +101,7 @@ fn check_first_continuation_max( off1_current_bytes: int8x16_t, has_error: &mut int8x16_t, ) { - { + unsafe { let mask_ed: uint8x16_t = vceqq_s8( off1_current_bytes, vdupq_n_s8(-19 /* 0xED */), @@ -134,25 +129,25 @@ fn check_first_continuation_max( macro_rules! initial_mins_tbl { () => { - int8x16_t::new( - -128, -128, -128, -128, -128, -128, + int8x16_t::new([ + -128i8, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, // 10xx => false -62 /* 0xC2 */, -128, // 110x -31 /* 0xE1 */, // 1110 -15 /*0xF1 */, // 1111 - ) + ]) }; } macro_rules! second_mins_tbl { () => { - int8x16_t::new( - -128, -128, -128, -128, -128, -128, + int8x16_t::new([ + -128i8, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, // 10xx => false 127, 127, // 110x => true -96 /* 0xA0 */, // 1110 -112 /* 0x90 */, // 1111 - ) + ]) }; } @@ -200,11 +195,13 @@ pub struct ProcessedUtfBytes { impl Default for ProcessedUtfBytes { #[cfg_attr(not(feature = "no-inline"), inline)] fn default() -> Self { + unsafe{ ProcessedUtfBytes { rawbytes: vdupq_n_s8(0x00), high_nibbles: vdupq_n_s8(0x00), carried_continuations: vdupq_n_s8(0x00), } + } } } @@ -228,6 +225,7 @@ pub fn check_utf8_bytes( has_error: &mut int8x16_t, ) -> ProcessedUtfBytes { let mut pb = ProcessedUtfBytes::default(); + unsafe { count_nibbles(current_bytes, &mut pb); check_smaller_than_0xf4(current_bytes, has_error); @@ -249,5 +247,7 @@ pub fn check_utf8_bytes( previous.high_nibbles, has_error, ); + } pb + } diff --git a/src/stringparse.rs b/src/stringparse.rs index 9a7e3b8f..0ff8a078 100644 --- a/src/stringparse.rs +++ b/src/stringparse.rs @@ -73,9 +73,3 @@ pub fn handle_unicode_codepoint( let offset: usize = codepoint_to_utf8(code_point, dst_ptr); Ok((offset, src_offset)) } - -// Holds backslashes and quotes locations. -pub struct ParseStringHelper { - pub bs_bits: u32, - pub quote_bits: u32, -} From 3ee0aa9302edd71e601a39439d9786c6740a8d0c Mon Sep 17 00:00:00 2001 From: Sunny Gleason Date: Wed, 4 Sep 2019 10:11:14 -0400 Subject: [PATCH 9/9] add simd-lite real version --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 5ab206b0..f7f6fe17 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,7 +24,7 @@ jemallocator = { version = "0.3", optional = true } perfcnt = { version = "0.4", optional = true } getopts = { version = "0.2", optional = true } colored = { version = "1.7", optional = true } -simd-lite = { git = "https://github.com/simd-lite/simd-lite", optional = true, branch = "reexport" } +simd-lite = { version = "0.1.0", optional = true }