From c5a4b61eb54be294b13e6ce75dd34035614a9630 Mon Sep 17 00:00:00 2001 From: Sunny Gleason Date: Thu, 15 Aug 2019 22:34:35 -0400 Subject: [PATCH] feat: architecture refactoring for greater similarity --- src/avx2/deser.rs | 39 +++----- src/avx2/stage1.rs | 172 +++++++++++++++++++++------------- src/avx2/utf8check.rs | 206 +++++++++++++++++++---------------------- src/sse42/deser.rs | 42 +++------ src/sse42/stage1.rs | 132 ++++++++++++++++---------- src/sse42/utf8check.rs | 154 +++++++++++++++--------------- 6 files changed, 398 insertions(+), 347 deletions(-) diff --git a/src/avx2/deser.rs b/src/avx2/deser.rs index f0b2fac2..abfb118e 100644 --- a/src/avx2/deser.rs +++ b/src/avx2/deser.rs @@ -3,11 +3,10 @@ use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; -use std::mem; - pub use crate::error::{Error, ErrorType}; pub use crate::Deserializer; pub use crate::Result; +pub use crate::avx2::stage1::*; pub use crate::avx2::utf8check::*; pub use crate::stringparse::*; @@ -27,7 +26,7 @@ impl<'de> Deserializer<'de> { let mut src_i: usize = 0; let mut len = src_i; loop { - let v: __m256i = if src.len() >= src_i + 32 { + let srcx: __m256i = if src.len() >= src_i + 32 { // This is safe since we ensure src is at least 32 wide #[allow(clippy::cast_ptr_alignment)] unsafe { @@ -44,16 +43,8 @@ impl<'de> Deserializer<'de> { } }; - // store to dest unconditionally - we can overwrite the bits we don't like - // later - let bs_bits: u32 = unsafe { - static_cast_u32!(_mm256_movemask_epi8(_mm256_cmpeq_epi8( - v, - _mm256_set1_epi8(b'\\' as i8) - ))) - }; - let quote_mask = unsafe { _mm256_cmpeq_epi8(v, _mm256_set1_epi8(b'"' as i8)) }; - let quote_bits = unsafe { static_cast_u32!(_mm256_movemask_epi8(quote_mask)) }; + let ParseStringHelper { bs_bits, quote_bits } = find_bs_bits_and_quote_bits(srcx); + if (bs_bits.wrapping_sub(1) & quote_bits) != 0 { // we encountered quotes first. Move dst to point to quotes and exit // find out where the quote is... @@ -94,7 +85,7 @@ impl<'de> Deserializer<'de> { let dst: &mut [u8] = &mut self.strings; loop { - let v: __m256i = if src.len() >= src_i + 32 { + let srcx: __m256i = if src.len() >= src_i + 32 { // This is safe since we ensure src is at least 32 wide #[allow(clippy::cast_ptr_alignment)] unsafe { @@ -113,19 +104,13 @@ impl<'de> Deserializer<'de> { #[allow(clippy::cast_ptr_alignment)] unsafe { - _mm256_storeu_si256(dst.as_mut_ptr().add(dst_i) as *mut __m256i, v) + _mm256_storeu_si256(dst.as_mut_ptr().add(dst_i) as *mut __m256i, srcx) }; // store to dest unconditionally - we can overwrite the bits we don't like // later - let bs_bits: u32 = unsafe { - static_cast_u32!(_mm256_movemask_epi8(_mm256_cmpeq_epi8( - v, - _mm256_set1_epi8(b'\\' as i8) - ))) - }; - let quote_mask = unsafe { _mm256_cmpeq_epi8(v, _mm256_set1_epi8(b'"' as i8)) }; - let quote_bits = unsafe { static_cast_u32!(_mm256_movemask_epi8(quote_mask)) }; + let ParseStringHelper { bs_bits, quote_bits } = find_bs_bits_and_quote_bits(srcx); + if (bs_bits.wrapping_sub(1) & quote_bits) != 0 { // we encountered quotes first. Move dst to point to quotes and exit // find out where the quote is... @@ -164,9 +149,11 @@ impl<'de> Deserializer<'de> { src_i += bs_dist as usize; dst_i += bs_dist as usize; let (o, s) = if let Ok(r) = - handle_unicode_codepoint(unsafe { src.get_unchecked(src_i..) }, unsafe { - dst.get_unchecked_mut(dst_i..) - }) { + handle_unicode_codepoint( + unsafe { src.get_unchecked(src_i..) }, + unsafe { dst.get_unchecked_mut(dst_i..) }, + ) + { r } else { return Err(self.error(ErrorType::InvlaidUnicodeCodepoint)); diff --git a/src/avx2/stage1.rs b/src/avx2/stage1.rs index f202b8e3..72112e9f 100644 --- a/src/avx2/stage1.rs +++ b/src/avx2/stage1.rs @@ -10,45 +10,86 @@ use std::mem; pub const SIMDJSON_PADDING: usize = mem::size_of::<__m256i>(); +unsafe fn compute_quote_mask(quote_bits: u64) -> u64 { + _mm_cvtsi128_si64( + _mm_clmulepi64_si128( + _mm_set_epi64x(0, static_cast_i64!(quote_bits)), + _mm_set1_epi8(-1 /* 0xFF */), + 0, + ) + ) as u64 +} + +#[cfg_attr(not(feature = "no-inline"), inline(always))] +unsafe fn check_ascii(input: &SimdInput) -> bool { + let highbit: __m256i = _mm256_set1_epi8(static_cast_i8!(0x80u8)); + let test_v0v1 = _mm256_testz_si256(_mm256_or_si256(input.v0, input.v1), highbit); + + test_v0v1 == 1 +} + #[derive(Debug)] struct SimdInput { - lo: __m256i, - hi: __m256i, + v0: __m256i, + v1: __m256i, } fn fill_input(ptr: &[u8]) -> SimdInput { unsafe { #[allow(clippy::cast_ptr_alignment)] SimdInput { - lo: _mm256_loadu_si256(ptr.as_ptr() as *const __m256i), - hi: _mm256_loadu_si256(ptr.as_ptr().add(32) as *const __m256i), + v0: _mm256_loadu_si256(ptr.as_ptr() as *const __m256i), + v1: _mm256_loadu_si256(ptr.as_ptr().add(32) as *const __m256i), + } + } +} + +struct Utf8CheckingState { + has_error: __m256i, + previous: ProcessedUtfBytes, +} + +impl Default for Utf8CheckingState { + #[cfg_attr(not(feature = "no-inline"), inline)] + fn default() -> Self { + Utf8CheckingState { + has_error: unsafe { _mm256_setzero_si256() }, + previous: ProcessedUtfBytes::default(), } } } +#[inline] +fn is_utf8_status_ok(has_error: __m256i) -> bool { + unsafe { + _mm256_testz_si256(has_error, has_error) != 0 + } +} + #[cfg_attr(not(feature = "no-inline"), inline(always))] unsafe fn check_utf8( input: &SimdInput, - has_error: &mut __m256i, - previous: &mut AvxProcessedUtfBytes, + state: &mut Utf8CheckingState, ) { - let highbit: __m256i = _mm256_set1_epi8(static_cast_i8!(0x80u8)); - if (_mm256_testz_si256(_mm256_or_si256(input.lo, input.hi), highbit)) == 1 { - // it is ascii, we just check continuation - *has_error = _mm256_or_si256( + if check_ascii(input) { + // All bytes are ascii. Therefore the byte that was just before must be + // ascii too. We only check the byte that was just before simd_input. Nines + // are arbitrary values. + let verror: __m256i = _mm256_setr_epi8( + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, + ); + state.has_error = _mm256_or_si256( _mm256_cmpgt_epi8( - previous.carried_continuations, - _mm256_setr_epi8( - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 1, - ), + state.previous.carried_continuations, + verror, ), - *has_error, + state.has_error, ); } else { // it is not ascii so we have to do heavy work - *previous = avxcheck_utf8_bytes(input.lo, &previous, has_error); - *previous = avxcheck_utf8_bytes(input.hi, &previous, has_error); + state.previous = check_utf8_bytes(input.v0, &mut state.previous, &mut state.has_error); + state.previous = check_utf8_bytes(input.v1, &mut state.previous, &mut state.has_error); } } @@ -58,9 +99,9 @@ unsafe fn check_utf8( fn cmp_mask_against_input(input: &SimdInput, m: u8) -> u64 { unsafe { let mask: __m256i = _mm256_set1_epi8(m as i8); - let cmp_res_0: __m256i = _mm256_cmpeq_epi8(input.lo, mask); + let cmp_res_0: __m256i = _mm256_cmpeq_epi8(input.v0, mask); let res_0: u64 = u64::from(static_cast_u32!(_mm256_movemask_epi8(cmp_res_0))); - let cmp_res_1: __m256i = _mm256_cmpeq_epi8(input.hi, mask); + let cmp_res_1: __m256i = _mm256_cmpeq_epi8(input.v1, mask); let res_1: u64 = _mm256_movemask_epi8(cmp_res_1) as u64; res_0 | (res_1 << 32) } @@ -70,10 +111,9 @@ fn cmp_mask_against_input(input: &SimdInput, m: u8) -> u64 { #[cfg_attr(not(feature = "no-inline"), inline(always))] fn unsigned_lteq_against_input(input: &SimdInput, maxval: __m256i) -> u64 { unsafe { - let cmp_res_0: __m256i = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, input.lo), maxval); - // TODO: c++ uses static cast, here what are the implications? + let cmp_res_0: __m256i = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, input.v0), maxval); let res_0: u64 = u64::from(static_cast_u32!(_mm256_movemask_epi8(cmp_res_0))); - let cmp_res_1: __m256i = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, input.hi), maxval); + let cmp_res_1: __m256i = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, input.v1), maxval); let res_1: u64 = _mm256_movemask_epi8(cmp_res_1) as u64; res_0 | (res_1 << 32) } @@ -107,9 +147,10 @@ fn find_odd_backslash_sequences(input: &SimdInput, prev_iter_ends_odd_backslash: // should be flipped let (mut odd_carries, iter_ends_odd_backslash) = bs_bits.overflowing_add(odd_starts); - odd_carries |= *prev_iter_ends_odd_backslash; // push in bit zero as a potential end - // if we had an odd-numbered run at the - // end of the previous iteration + odd_carries |= *prev_iter_ends_odd_backslash; + // push in bit zero as a potential end + // if we had an odd-numbered run at the + // end of the previous iteration *prev_iter_ends_odd_backslash = if iter_ends_odd_backslash { 0x1 } else { 0x0 }; let even_carry_ends: u64 = even_carries & !bs_bits; let odd_carry_ends: u64 = odd_carries & !bs_bits; @@ -141,12 +182,8 @@ unsafe fn find_quote_mask_and_bits( *quote_bits = cmp_mask_against_input(&input, b'"'); *quote_bits &= !odd_ends; // remove from the valid quoted region the unescapted characters. - #[allow(overflowing_literals)] - let mut quote_mask: u64 = _mm_cvtsi128_si64(_mm_clmulepi64_si128( - _mm_set_epi64x(0, static_cast_i64!(*quote_bits)), - _mm_set1_epi8(0xFF), - 0, - )) as u64; + let mut quote_mask: u64 = compute_quote_mask(*quote_bits); + quote_mask ^= *prev_iter_inside_quote; // All Unicode characters may be placed within the // quotation marks, except for the characters that MUST be escaped: @@ -187,57 +224,57 @@ unsafe fn find_whitespace_and_structurals( // TODO: const? let low_nibble_mask: __m256i = _mm256_setr_epi8( - 16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, - 9, 0, 0, + 16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, + 16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, ); // TODO: const? let high_nibble_mask: __m256i = _mm256_setr_epi8( - 8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0, 8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, - 0, 0, + 8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0, + 8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0, ); let structural_shufti_mask: __m256i = _mm256_set1_epi8(0x7); let whitespace_shufti_mask: __m256i = _mm256_set1_epi8(0x18); - let v_lo: __m256i = _mm256_and_si256( - _mm256_shuffle_epi8(low_nibble_mask, input.lo), + let v_v0: __m256i = _mm256_and_si256( + _mm256_shuffle_epi8(low_nibble_mask, input.v0), _mm256_shuffle_epi8( high_nibble_mask, - _mm256_and_si256(_mm256_srli_epi32(input.lo, 4), _mm256_set1_epi8(0x7f)), + _mm256_and_si256(_mm256_srli_epi32(input.v0, 4), _mm256_set1_epi8(0x7f)), ), ); - - let v_hi: __m256i = _mm256_and_si256( - _mm256_shuffle_epi8(low_nibble_mask, input.hi), + let v_v1: __m256i = _mm256_and_si256( + _mm256_shuffle_epi8(low_nibble_mask, input.v1), _mm256_shuffle_epi8( high_nibble_mask, - _mm256_and_si256(_mm256_srli_epi32(input.hi, 4), _mm256_set1_epi8(0x7f)), + _mm256_and_si256(_mm256_srli_epi32(input.v1, 4), _mm256_set1_epi8(0x7f)), ), ); - let tmp_lo: __m256i = _mm256_cmpeq_epi8( - _mm256_and_si256(v_lo, structural_shufti_mask), + + let tmp_v0: __m256i = _mm256_cmpeq_epi8( + _mm256_and_si256(v_v0, structural_shufti_mask), _mm256_set1_epi8(0), ); - let tmp_hi: __m256i = _mm256_cmpeq_epi8( - _mm256_and_si256(v_hi, structural_shufti_mask), + let tmp_v1: __m256i = _mm256_cmpeq_epi8( + _mm256_and_si256(v_v1, structural_shufti_mask), _mm256_set1_epi8(0), ); - let structural_res_0: u64 = u64::from(static_cast_u32!(_mm256_movemask_epi8(tmp_lo))); - let structural_res_1: u64 = _mm256_movemask_epi8(tmp_hi) as u64; + let structural_res_0: u64 = u64::from(static_cast_u32!(_mm256_movemask_epi8(tmp_v0))); + let structural_res_1: u64 = _mm256_movemask_epi8(tmp_v1) as u64; *structurals = !(structural_res_0 | (structural_res_1 << 32)); - let tmp_ws_lo: __m256i = _mm256_cmpeq_epi8( - _mm256_and_si256(v_lo, whitespace_shufti_mask), + let tmp_ws_v0: __m256i = _mm256_cmpeq_epi8( + _mm256_and_si256(v_v0, whitespace_shufti_mask), _mm256_set1_epi8(0), ); - let tmp_ws_hi: __m256i = _mm256_cmpeq_epi8( - _mm256_and_si256(v_hi, whitespace_shufti_mask), + let tmp_ws_v1: __m256i = _mm256_cmpeq_epi8( + _mm256_and_si256(v_v1, whitespace_shufti_mask), _mm256_set1_epi8(0), ); - let ws_res_0: u64 = u64::from(static_cast_u32!(_mm256_movemask_epi8(tmp_ws_lo))); - let ws_res_1: u64 = _mm256_movemask_epi8(tmp_ws_hi) as u64; + let ws_res_0: u64 = u64::from(static_cast_u32!(_mm256_movemask_epi8(tmp_ws_v0))); + let ws_res_1: u64 = _mm256_movemask_epi8(tmp_ws_v1) as u64; *whitespace = !(ws_res_0 | (ws_res_1 << 32)); } @@ -345,9 +382,18 @@ fn finalize_structurals( structurals } -//WARN_UNUSED -/*never_inline*/ -//#[inline(never)] +pub fn find_bs_bits_and_quote_bits(v: __m256i) -> ParseStringHelper { + let quote_mask = unsafe { _mm256_cmpeq_epi8(v, _mm256_set1_epi8(b'"' as i8)) }; + let quote_bits = unsafe { static_cast_u32!(_mm256_movemask_epi8(quote_mask)) }; + let bs_mask = unsafe { _mm256_cmpeq_epi8(v, _mm256_set1_epi8(b'\\' as i8)) }; + let bs_bits = unsafe { static_cast_u32!(_mm256_movemask_epi8(bs_mask)) }; + + ParseStringHelper { + bs_bits, + quote_bits, + } +} + impl<'de> Deserializer<'de> { //#[inline(never)] pub unsafe fn find_structural_bits(input: &[u8]) -> std::result::Result, ErrorType> { @@ -357,8 +403,8 @@ impl<'de> Deserializer<'de> { let mut structural_indexes = Vec::with_capacity(len / 6); structural_indexes.push(0); // push extra root element - let mut has_error: __m256i = _mm256_setzero_si256(); - let mut previous = AvxProcessedUtfBytes::default(); + let mut utf8_state: Utf8CheckingState = Utf8CheckingState::default(); + // we have padded the input out to 64 byte multiple with the remainder being // zeros @@ -394,7 +440,7 @@ impl<'de> Deserializer<'de> { #endif */ let input: SimdInput = fill_input(input.get_unchecked(idx as usize..)); - check_utf8(&input, &mut has_error, &mut previous); + check_utf8(&input, &mut utf8_state); // detect odd sequences of backslashes let odd_ends: u64 = find_odd_backslash_sequences(&input, &mut prev_iter_ends_odd_backslash); @@ -438,7 +484,7 @@ impl<'de> Deserializer<'de> { .copy_from(input.as_ptr().add(idx), len as usize - idx); let input: SimdInput = fill_input(&tmpbuf); - check_utf8(&input, &mut has_error, &mut previous); + check_utf8(&input, &mut utf8_state); // detect odd sequences of backslashes let odd_ends: u64 = @@ -493,7 +539,7 @@ impl<'de> Deserializer<'de> { return Err(ErrorType::Syntax); } - if _mm256_testz_si256(has_error, has_error) != 0 { + if is_utf8_status_ok(utf8_state.has_error) { Ok(structural_indexes) } else { Err(ErrorType::InvalidUTF8) diff --git a/src/avx2/utf8check.rs b/src/avx2/utf8check.rs index cb583cc8..48bbb84f 100644 --- a/src/avx2/utf8check.rs +++ b/src/avx2/utf8check.rs @@ -21,54 +21,62 @@ use std::arch::x86_64::*; * */ -// all byte values must be no larger than 0xF4 - /*****************************/ #[cfg_attr(not(feature = "no-inline"), inline)] fn push_last_byte_of_a_to_b(a: __m256i, b: __m256i) -> __m256i { - unsafe { _mm256_alignr_epi8(b, _mm256_permute2x128_si256(a, b, 0x21), 15) } + unsafe { + _mm256_alignr_epi8(b, _mm256_permute2x128_si256(a, b, 0x21), 16 - 1) + } } #[cfg_attr(not(feature = "no-inline"), inline)] fn push_last_2bytes_of_a_to_b(a: __m256i, b: __m256i) -> __m256i { - unsafe { _mm256_alignr_epi8(b, _mm256_permute2x128_si256(a, b, 0x21), 14) } + unsafe { + _mm256_alignr_epi8(b, _mm256_permute2x128_si256(a, b, 0x21), 16 - 2) + } } // all byte values must be no larger than 0xF4 #[cfg_attr(not(feature = "no-inline"), inline)] -fn avxcheck_smaller_than_0xf4(current_bytes: __m256i, has_error: &mut __m256i) { +fn check_smaller_than_0xf4(current_bytes: __m256i, has_error: &mut __m256i) { // unsigned, saturates to 0 below max *has_error = unsafe { _mm256_or_si256( *has_error, - _mm256_subs_epu8(current_bytes, _mm256_set1_epi8(static_cast_i8!(0xF4u8))), + _mm256_subs_epu8(current_bytes, _mm256_set1_epi8(-12i8 /* 0xF4 */)), + ) + }; +} + +macro_rules! nibbles_tbl { + () => { + _mm256_setr_epi8( + 1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII) + 0, 0, 0, 0, // 10xx (continuation) + 2, 2, // 110x + 3, // 1110 + 4, // 1111, next should be 0 (not checked here) + 1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII) + 0, 0, 0, 0, // 10xx (continuation) + 2, 2, // 110x + 3, // 1110 + 4, // 1111, next should be 0 (not checked here) ) }; } #[cfg_attr(not(feature = "no-inline"), inline)] -fn avxcontinuation_lengths(high_nibbles: __m256i) -> __m256i { +fn continuation_lengths(high_nibbles: __m256i) -> __m256i { unsafe { _mm256_shuffle_epi8( - _mm256_setr_epi8( - 1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII) - 0, 0, 0, 0, // 10xx (continuation) - 2, 2, // 110x - 3, // 1110 - 4, // 1111, next should be 0 (not checked here) - 1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII) - 0, 0, 0, 0, // 10xx (continuation) - 2, 2, // 110x - 3, // 1110 - 4, // 1111, next should be 0 (not checked here) - ), + nibbles_tbl!(), high_nibbles, ) } } #[cfg_attr(not(feature = "no-inline"), inline)] -fn avxcarry_continuations(initial_lengths: __m256i, previous_carries: __m256i) -> __m256i { +fn carry_continuations(initial_lengths: __m256i, previous_carries: __m256i) -> __m256i { unsafe { let right1: __m256i = _mm256_subs_epu8( push_last_byte_of_a_to_b(previous_carries, initial_lengths), @@ -84,7 +92,7 @@ fn avxcarry_continuations(initial_lengths: __m256i, previous_carries: __m256i) - } #[cfg_attr(not(feature = "no-inline"), inline)] -fn avxcheck_continuations(initial_lengths: __m256i, carries: __m256i, has_error: &mut __m256i) { +fn check_continuations(initial_lengths: __m256i, carries: __m256i, has_error: &mut __m256i) { // overlap || underlap // carry > length && length > 0 || !(carry > length) && !(length > 0) // (carries > length) == (lengths > 0) @@ -102,7 +110,7 @@ fn avxcheck_continuations(initial_lengths: __m256i, carries: __m256i, has_error: // when 0xF4 is found, next byte must be no larger than 0x8F // next byte must be continuation, ie sign bit is set, so signed < is ok #[cfg_attr(not(feature = "no-inline"), inline)] -fn avxcheck_first_continuation_max( +fn check_first_continuation_max( current_bytes: __m256i, off1_current_bytes: __m256i, has_error: &mut __m256i, @@ -126,10 +134,47 @@ fn avxcheck_first_continuation_max( mask_f4, ); - *has_error = _mm256_or_si256(*has_error, _mm256_or_si256(badfollow_ed, badfollow_f4)); + *has_error = _mm256_or_si256( + *has_error, + _mm256_or_si256(badfollow_ed, badfollow_f4), + ); } } +macro_rules! initial_mins_tbl { + () => { + _mm256_setr_epi8( + -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, // 10xx => false + -62 /* 0xC2 */, -128, // 110x + -31 /* 0xE1 */, // 1110 + -15 /*0xF1 */, // 1111 + -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, // 10xx => false + -62 /* 0xC2 */, -128, // 110x + -31 /* 0xE1 */, // 1110 + -15 /*0xF1 */, // 1111 + ) + }; +} + +macro_rules! second_mins_tbl { + () => { + _mm256_setr_epi8( + -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, // 10xx => false + 127, 127, // 110x => true + -96 /* 0xA0 */, // 1110 + -112 /* 0x90 */, // 1111 + -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, // 10xx => false + 127, 127, // 110x => true + -96 /* 0xA0 */, // 1110 + -112 /* 0x90 */, // 1111 + ) + }; +} + // map off1_hibits => error condition // hibits off1 cur // C => < C2 && true @@ -137,7 +182,7 @@ fn avxcheck_first_continuation_max( // F => < F1 && < 90 // else false && false #[cfg_attr(not(feature = "no-inline"), inline)] -fn avxcheck_overlong( +fn check_overlong( current_bytes: __m256i, off1_current_bytes: __m256i, hibits: __m256i, @@ -147,98 +192,35 @@ fn avxcheck_overlong( unsafe { let off1_hibits: __m256i = push_last_byte_of_a_to_b(previous_hibits, hibits); let initial_mins: __m256i = _mm256_shuffle_epi8( - _mm256_setr_epi8( - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, // 10xx => false - static_cast_i8!(0xC2u8), - -128, // 110x - static_cast_i8!(0xE1u8), // 1110 - static_cast_i8!(0xF1u8), // 1111 - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, // 10xx => false - static_cast_i8!(0xC2u8), - -128, // 110x - static_cast_i8!(0xE1u8), // 1110 - static_cast_i8!(0xF1u8), - ), // 1111 + initial_mins_tbl!(), off1_hibits, ); let initial_under: __m256i = _mm256_cmpgt_epi8(initial_mins, off1_current_bytes); let second_mins: __m256i = _mm256_shuffle_epi8( - _mm256_setr_epi8( - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, // 10xx => false - 127, - 127, // 110x => true - static_cast_i8!(0xA0u8), // 1110 - static_cast_i8!(0x90u8), // 1111 - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, // 10xx => false - 127, - 127, // 110x => true - static_cast_i8!(0xA0u8), // 1110 - static_cast_i8!(0x90u8), - ), // 1111 + second_mins_tbl!(), off1_hibits, ); let second_under: __m256i = _mm256_cmpgt_epi8(second_mins, current_bytes); - *has_error = _mm256_or_si256(*has_error, _mm256_and_si256(initial_under, second_under)); + *has_error = _mm256_or_si256( + *has_error, + _mm256_and_si256(initial_under, second_under) + ); } } -pub struct AvxProcessedUtfBytes { +pub struct ProcessedUtfBytes { rawbytes: __m256i, high_nibbles: __m256i, pub carried_continuations: __m256i, } -impl Default for AvxProcessedUtfBytes { +impl Default for ProcessedUtfBytes { #[cfg_attr(not(feature = "no-inline"), inline)] fn default() -> Self { unsafe { - AvxProcessedUtfBytes { + ProcessedUtfBytes { rawbytes: _mm256_setzero_si256(), high_nibbles: _mm256_setzero_si256(), carried_continuations: _mm256_setzero_si256(), @@ -248,36 +230,40 @@ impl Default for AvxProcessedUtfBytes { } #[cfg_attr(not(feature = "no-inline"), inline)] -fn avx_count_nibbles(bytes: __m256i, answer: &mut AvxProcessedUtfBytes) { +fn count_nibbles(bytes: __m256i, answer: &mut ProcessedUtfBytes) { answer.rawbytes = bytes; - answer.high_nibbles = - unsafe { _mm256_and_si256(_mm256_srli_epi16(bytes, 4), _mm256_set1_epi8(0x0F)) }; + answer.high_nibbles = unsafe { + _mm256_and_si256( + _mm256_srli_epi16(bytes, 4), + _mm256_set1_epi8(0x0F) + ) + }; } // check whether the current bytes are valid UTF-8 // at the end of the function, previous gets updated #[cfg_attr(not(feature = "no-inline"), inline)] -pub fn avxcheck_utf8_bytes( +pub fn check_utf8_bytes( current_bytes: __m256i, - previous: &AvxProcessedUtfBytes, + previous: &ProcessedUtfBytes, has_error: &mut __m256i, -) -> AvxProcessedUtfBytes { - let mut pb = AvxProcessedUtfBytes::default(); - avx_count_nibbles(current_bytes, &mut pb); +) -> ProcessedUtfBytes { + let mut pb = ProcessedUtfBytes::default(); + count_nibbles(current_bytes, &mut pb); - avxcheck_smaller_than_0xf4(current_bytes, has_error); + check_smaller_than_0xf4(current_bytes, has_error); - let initial_lengths: __m256i = avxcontinuation_lengths(pb.high_nibbles); + let initial_lengths: __m256i = continuation_lengths(pb.high_nibbles); pb.carried_continuations = - avxcarry_continuations(initial_lengths, previous.carried_continuations); + carry_continuations(initial_lengths, previous.carried_continuations); - avxcheck_continuations(initial_lengths, pb.carried_continuations, has_error); + check_continuations(initial_lengths, pb.carried_continuations, has_error); let off1_current_bytes: __m256i = push_last_byte_of_a_to_b(previous.rawbytes, pb.rawbytes); - avxcheck_first_continuation_max(current_bytes, off1_current_bytes, has_error); + check_first_continuation_max(current_bytes, off1_current_bytes, has_error); - avxcheck_overlong( + check_overlong( current_bytes, off1_current_bytes, pb.high_nibbles, diff --git a/src/sse42/deser.rs b/src/sse42/deser.rs index 6c643cb0..20c242c0 100644 --- a/src/sse42/deser.rs +++ b/src/sse42/deser.rs @@ -3,15 +3,13 @@ use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; -use std::mem; - pub use crate::error::{Error, ErrorType}; pub use crate::Deserializer; pub use crate::Result; +pub use crate::sse42::stage1::*; pub use crate::sse42::utf8check::*; pub use crate::stringparse::*; - impl<'de> Deserializer<'de> { #[cfg_attr(not(feature = "no-inline"), inline(always))] pub fn parse_str_(&mut self) -> Result<&'de str> { @@ -28,7 +26,7 @@ impl<'de> Deserializer<'de> { let mut src_i: usize = 0; let mut len = src_i; loop { - let v: __m128i = if src.len() >= src_i + 16 { + let srcx: __m128i = if src.len() >= src_i + 16 { // This is safe since we ensure src is at least 16 wide #[allow(clippy::cast_ptr_alignment)] unsafe { @@ -39,22 +37,14 @@ impl<'de> Deserializer<'de> { padding .get_unchecked_mut(..src.len() - src_i) .clone_from_slice(src.get_unchecked(src_i..)); - // This is safe since we ensure src is at least 32 wide + // This is safe since we ensure src is at least 16 wide #[allow(clippy::cast_ptr_alignment)] _mm_loadu_si128(padding.as_ptr() as *const __m128i) } }; - // store to dest unconditionally - we can overwrite the bits we don't like - // later - let bs_bits: u32 = unsafe { - static_cast_u32!(_mm_movemask_epi8(_mm_cmpeq_epi8( - v, - _mm_set1_epi8(b'\\' as i8) - ))) - }; - let quote_mask = unsafe { _mm_cmpeq_epi8(v, _mm_set1_epi8(b'"' as i8)) }; - let quote_bits = unsafe { static_cast_u32!(_mm_movemask_epi8(quote_mask)) }; + let ParseStringHelper { bs_bits, quote_bits } = unsafe { find_bs_bits_and_quote_bits(srcx) }; + if (bs_bits.wrapping_sub(1) & quote_bits) != 0 { // we encountered quotes first. Move dst to point to quotes and exit // find out where the quote is... @@ -95,7 +85,7 @@ impl<'de> Deserializer<'de> { let dst: &mut [u8] = &mut self.strings; loop { - let v: __m128i = if src.len() >= src_i + 16 { + let srcx: __m128i = if src.len() >= src_i + 16 { // This is safe since we ensure src is at least 16 wide #[allow(clippy::cast_ptr_alignment)] unsafe { @@ -114,19 +104,13 @@ impl<'de> Deserializer<'de> { #[allow(clippy::cast_ptr_alignment)] unsafe { - _mm_storeu_si128(dst.as_mut_ptr().add(dst_i) as *mut __m128i, v) + _mm_storeu_si128(dst.as_mut_ptr().add(dst_i) as *mut __m128i, srcx) }; // store to dest unconditionally - we can overwrite the bits we don't like // later - let bs_bits: u32 = unsafe { - static_cast_u32!(_mm_movemask_epi8(_mm_cmpeq_epi8( - v, - _mm_set1_epi8(b'\\' as i8) - ))) - }; - let quote_mask = unsafe { _mm_cmpeq_epi8(v, _mm_set1_epi8(b'"' as i8)) }; - let quote_bits = unsafe { static_cast_u32!(_mm_movemask_epi8(quote_mask)) }; + let ParseStringHelper { bs_bits, quote_bits } = unsafe { find_bs_bits_and_quote_bits(srcx) }; + if (bs_bits.wrapping_sub(1) & quote_bits) != 0 { // we encountered quotes first. Move dst to point to quotes and exit // find out where the quote is... @@ -165,9 +149,11 @@ impl<'de> Deserializer<'de> { src_i += bs_dist as usize; dst_i += bs_dist as usize; let (o, s) = if let Ok(r) = - handle_unicode_codepoint(unsafe { src.get_unchecked(src_i..) }, unsafe { - dst.get_unchecked_mut(dst_i..) - }) { + handle_unicode_codepoint( + unsafe { src.get_unchecked(src_i..) }, + unsafe { dst.get_unchecked_mut(dst_i..) }, + ) + { r } else { return Err(self.error(ErrorType::InvlaidUnicodeCodepoint)); diff --git a/src/sse42/stage1.rs b/src/sse42/stage1.rs index 81d6005e..c2ae0a8d 100644 --- a/src/sse42/stage1.rs +++ b/src/sse42/stage1.rs @@ -10,6 +10,31 @@ use std::mem; pub const SIMDJSON_PADDING: usize = mem::size_of::<__m128i>() * 2; +unsafe fn compute_quote_mask(quote_bits: u64) -> u64 { + _mm_cvtsi128_si64( + _mm_clmulepi64_si128( + _mm_set_epi64x(0, static_cast_i64!(quote_bits)), + _mm_set1_epi8(-1 /* 0xFF */), + 0, + ) + ) as u64 +} + +#[cfg_attr(not(feature = "no-inline"), inline(always))] +unsafe fn check_ascii(input: &SimdInput) -> bool { + let highbit: __m128i = _mm_set1_epi8(static_cast_i8!(0x80u8)); + let test_v0v1 = _mm_testz_si128( + _mm_or_si128(input.v0, input.v1), + highbit, + ); + let test_v2v3 = _mm_testz_si128( + _mm_or_si128(input.v2, input.v3), + highbit, + ); + + (test_v0v1 == 1) && (test_v2v3 == 1) +} + #[derive(Debug)] struct SimdInput { v0: __m128i, @@ -30,45 +55,53 @@ fn fill_input(ptr: &[u8]) -> SimdInput { } } +struct Utf8CheckingState { + has_error: __m128i, + previous: ProcessedUtfBytes, +} + +impl Default for Utf8CheckingState { + #[cfg_attr(not(feature = "no-inline"), inline)] + fn default() -> Self { + Utf8CheckingState { + has_error: unsafe { _mm_setzero_si128() }, + previous: ProcessedUtfBytes::default(), + } + } +} + +#[inline] +fn is_utf8_status_ok(has_error: __m128i) -> bool { + unsafe { + _mm_testz_si128(has_error, has_error) != 0 + } +} + #[cfg_attr(not(feature = "no-inline"), inline(always))] unsafe fn check_utf8( input: &SimdInput, - has_error: &mut __m128i, - previous: &mut AvxProcessedUtfBytes, + state: &mut Utf8CheckingState, ) { - let highbit: __m128i = _mm_set1_epi8(static_cast_i8!(0x80u8)); - if (_mm_testz_si128(_mm_or_si128(input.v0, input.v1), highbit)) == 1 { - // it is ascii, we just check continuation - *has_error = _mm_or_si128( - _mm_cmpgt_epi8( - previous.carried_continuations, - _mm_setr_epi8( - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, - ), - ), - *has_error, + if check_ascii(input) { + // All bytes are ascii. Therefore the byte that was just before must be + // ascii too. We only check the byte that was just before simd_input. Nines + // are arbitrary values. + let verror: __m128i = _mm_setr_epi8( + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, ); - } else { - // it is not ascii so we have to do heavy work - *previous = avxcheck_utf8_bytes(input.v0, &previous, has_error); - *previous = avxcheck_utf8_bytes(input.v1, &previous, has_error); - } - - if (_mm_testz_si128(_mm_or_si128(input.v2, input.v3), highbit)) == 1 { - // it is ascii, we just check continuation - *has_error = _mm_or_si128( + state.has_error = _mm_or_si128( _mm_cmpgt_epi8( - previous.carried_continuations, - _mm_setr_epi8( - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, - ), + state.previous.carried_continuations, + verror, ), - *has_error, + state.has_error, ); } else { // it is not ascii so we have to do heavy work - *previous = avxcheck_utf8_bytes(input.v2, &previous, has_error); - *previous = avxcheck_utf8_bytes(input.v3, &previous, has_error); + state.previous = check_utf8_bytes(input.v0, &mut state.previous, &mut state.has_error); + state.previous = check_utf8_bytes(input.v1, &mut state.previous, &mut state.has_error); + state.previous = check_utf8_bytes(input.v2, &mut state.previous, &mut state.has_error); + state.previous = check_utf8_bytes(input.v3, &mut state.previous, &mut state.has_error); } } @@ -134,9 +167,10 @@ fn find_odd_backslash_sequences(input: &SimdInput, prev_iter_ends_odd_backslash: // should be flipped let (mut odd_carries, iter_ends_odd_backslash) = bs_bits.overflowing_add(odd_starts); - odd_carries |= *prev_iter_ends_odd_backslash; // push in bit zero as a potential end - // if we had an odd-numbered run at the - // end of the previous iteration + odd_carries |= *prev_iter_ends_odd_backslash; + // push in bit zero as a potential end + // if we had an odd-numbered run at the + // end of the previous iteration *prev_iter_ends_odd_backslash = if iter_ends_odd_backslash { 0x1 } else { 0x0 }; let even_carry_ends: u64 = even_carries & !bs_bits; let odd_carry_ends: u64 = odd_carries & !bs_bits; @@ -168,12 +202,8 @@ unsafe fn find_quote_mask_and_bits( *quote_bits = cmp_mask_against_input(&input, b'"'); *quote_bits &= !odd_ends; // remove from the valid quoted region the unescapted characters. - #[allow(overflowing_literals)] - let mut quote_mask: u64 = _mm_cvtsi128_si64(_mm_clmulepi64_si128( - _mm_set_epi64x(0, static_cast_i64!(*quote_bits)), - _mm_set1_epi8(0xFF), - 0, - )) as u64; + let mut quote_mask: u64 = compute_quote_mask(*quote_bits); + quote_mask ^= *prev_iter_inside_quote; // All Unicode characters may be placed within the // quotation marks, except for the characters that MUST be escaped: @@ -274,7 +304,6 @@ unsafe fn find_whitespace_and_structurals( let structural_res_1: u64 = _mm_movemask_epi8(tmp_v1) as u64; let structural_res_2: u64 = _mm_movemask_epi8(tmp_v2) as u64; let structural_res_3: u64 = _mm_movemask_epi8(tmp_v3) as u64; - *structurals = !(structural_res_0 | (structural_res_1 << 16) | (structural_res_2 << 32) | (structural_res_3 << 48)); let tmp_ws_v0: __m128i = _mm_cmpeq_epi8( @@ -394,9 +423,18 @@ fn finalize_structurals( structurals } -//WARN_UNUSED -/*never_inline*/ -//#[inline(never)] +pub unsafe fn find_bs_bits_and_quote_bits(src: __m128i) -> ParseStringHelper { + let quote_mask = unsafe { _mm_cmpeq_epi8(v, _mm_set1_epi8(b'"' as i8)) }; + let quote_bits = unsafe { static_cast_u32!(_mm_movemask_epi8(quote_mask)) }; + let bs_mask = unsafe { _mm_cmpeq_epi8(v, _mm_set1_epi8(b'\\' as i8)) }; + let bs_bits = unsafe { static_cast_u32!(_mm_movemask_epi8(bs_mask)) }; + + ParseStringHelper { + bs_bits, + quote_bits, + } +} + impl<'de> Deserializer<'de> { //#[inline(never)] pub unsafe fn find_structural_bits(input: &[u8]) -> std::result::Result, ErrorType> { @@ -406,8 +444,8 @@ impl<'de> Deserializer<'de> { let mut structural_indexes = Vec::with_capacity(len / 6); structural_indexes.push(0); // push extra root element - let mut has_error: __m128i = _mm_setzero_si128(); - let mut previous = AvxProcessedUtfBytes::default(); + let mut utf8_state: Utf8CheckingState = Utf8CheckingState::default(); + // we have padded the input out to 64 byte multiple with the remainder being // zeros @@ -443,7 +481,7 @@ impl<'de> Deserializer<'de> { #endif */ let input: SimdInput = fill_input(input.get_unchecked(idx as usize..)); - check_utf8(&input, &mut has_error, &mut previous); + check_utf8(&input, &mut utf8_state); // detect odd sequences of backslashes let odd_ends: u64 = find_odd_backslash_sequences(&input, &mut prev_iter_ends_odd_backslash); @@ -487,7 +525,7 @@ impl<'de> Deserializer<'de> { .copy_from(input.as_ptr().add(idx), len as usize - idx); let input: SimdInput = fill_input(&tmpbuf); - check_utf8(&input, &mut has_error, &mut previous); + check_utf8(&input, &mut utf8_state); // detect odd sequences of backslashes let odd_ends: u64 = @@ -542,7 +580,7 @@ impl<'de> Deserializer<'de> { return Err(ErrorType::Syntax); } - if _mm_testz_si128(has_error, has_error) != 0 { + if is_utf8_status_ok(utf8_state.has_error) { Ok(structural_indexes) } else { Err(ErrorType::InvalidUTF8) diff --git a/src/sse42/utf8check.rs b/src/sse42/utf8check.rs index 3b80e8df..f66e2963 100644 --- a/src/sse42/utf8check.rs +++ b/src/sse42/utf8check.rs @@ -21,49 +21,57 @@ use std::arch::x86_64::*; * */ -// all byte values must be no larger than 0xF4 - /*****************************/ #[cfg_attr(not(feature = "no-inline"), inline)] fn push_last_byte_of_a_to_b(a: __m128i, b: __m128i) -> __m128i { - unsafe { _mm_alignr_epi8(b, a, 15) } + unsafe { + _mm_alignr_epi8(b, a, 16 - 1) + } } #[cfg_attr(not(feature = "no-inline"), inline)] fn push_last_2bytes_of_a_to_b(a: __m128i, b: __m128i) -> __m128i { - unsafe { _mm_alignr_epi8(b, a, 14) } + unsafe { + _mm_alignr_epi8(b, a, 16 - 2) + } } // all byte values must be no larger than 0xF4 #[cfg_attr(not(feature = "no-inline"), inline)] -fn avxcheck_smaller_than_0xf4(current_bytes: __m128i, has_error: &mut __m128i) { +fn check_smaller_than_0xf4(current_bytes: __m128i, has_error: &mut __m128i) { // unsigned, saturates to 0 below max *has_error = unsafe { _mm_or_si128( *has_error, - _mm_subs_epu8(current_bytes, _mm_set1_epi8(static_cast_i8!(0xF4u8))), + _mm_subs_epu8(current_bytes, _mm_set1_epi8(-12i8 /* 0xF4 */)), + ) + }; +} + +macro_rules! nibbles_tbl { + () => { + _mm_setr_epi8( + 1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII) + 0, 0, 0, 0, // 10xx (continuation) + 2, 2, // 110x + 3, // 1110 + 4, // 1111, next should be 0 (not checked here) ) }; } #[cfg_attr(not(feature = "no-inline"), inline)] -fn avxcontinuation_lengths(high_nibbles: __m128i) -> __m128i { +fn continuation_lengths(high_nibbles: __m128i) -> __m128i { unsafe { _mm_shuffle_epi8( - _mm_setr_epi8( - 1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII) - 0, 0, 0, 0, // 10xx (continuation) - 2, 2, // 110x - 3, // 1110 - 4, // 1111, next should be 0 (not checked here) - ), + nibbles_tbl!(), high_nibbles, ) } } #[cfg_attr(not(feature = "no-inline"), inline)] -fn avxcarry_continuations(initial_lengths: __m128i, previous_carries: __m128i) -> __m128i { +fn carry_continuations(initial_lengths: __m128i, previous_carries: __m128i) -> __m128i { unsafe { let right1: __m128i = _mm_subs_epu8( push_last_byte_of_a_to_b(previous_carries, initial_lengths), @@ -79,7 +87,7 @@ fn avxcarry_continuations(initial_lengths: __m128i, previous_carries: __m128i) - } #[cfg_attr(not(feature = "no-inline"), inline)] -fn avxcheck_continuations(initial_lengths: __m128i, carries: __m128i, has_error: &mut __m128i) { +fn check_continuations(initial_lengths: __m128i, carries: __m128i, has_error: &mut __m128i) { // overlap || underlap // carry > length && length > 0 || !(carry > length) && !(length > 0) // (carries > length) == (lengths > 0) @@ -97,7 +105,7 @@ fn avxcheck_continuations(initial_lengths: __m128i, carries: __m128i, has_error: // when 0xF4 is found, next byte must be no larger than 0x8F // next byte must be continuation, ie sign bit is set, so signed < is ok #[cfg_attr(not(feature = "no-inline"), inline)] -fn avxcheck_first_continuation_max( +fn check_first_continuation_max( current_bytes: __m128i, off1_current_bytes: __m128i, has_error: &mut __m128i, @@ -121,10 +129,37 @@ fn avxcheck_first_continuation_max( mask_f4, ); - *has_error = _mm_or_si128(*has_error, _mm_or_si128(badfollow_ed, badfollow_f4)); + *has_error = _mm_or_si128( + *has_error, + _mm_or_si128(badfollow_ed, badfollow_f4), + ); } } +macro_rules! initial_mins_tbl { + () => { + _mm_setr_epi8( + -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, // 10xx => false + -62 /* 0xC2 */, -128, // 110x + -31 /* 0xE1 */, // 1110 + -15 /*0xF1 */, // 1111 + ) + }; +} + +macro_rules! second_mins_tbl { + () => { + _mm_setr_epi8( + -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, // 10xx => false + 127, 127, // 110x => true + -96 /* 0xA0 */, // 1110 + -112 /* 0x90 */, // 1111 + ) + }; +} + // map off1_hibits => error condition // hibits off1 cur // C => < C2 && true @@ -132,7 +167,7 @@ fn avxcheck_first_continuation_max( // F => < F1 && < 90 // else false && false #[cfg_attr(not(feature = "no-inline"), inline)] -fn avxcheck_overlong( +fn check_overlong( current_bytes: __m128i, off1_current_bytes: __m128i, hibits: __m128i, @@ -142,66 +177,35 @@ fn avxcheck_overlong( unsafe { let off1_hibits: __m128i = push_last_byte_of_a_to_b(previous_hibits, hibits); let initial_mins: __m128i = _mm_shuffle_epi8( - _mm_setr_epi8( - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, // 10xx => false - static_cast_i8!(0xC2u8), - -128, // 110x - static_cast_i8!(0xE1u8), // 1110 - static_cast_i8!(0xF1u8), // 1111 - ), + initial_mins_tbl!(), off1_hibits, ); let initial_under: __m128i = _mm_cmpgt_epi8(initial_mins, off1_current_bytes); let second_mins: __m128i = _mm_shuffle_epi8( - _mm_setr_epi8( - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, // 10xx => false - 127, - 127, // 110x => true - static_cast_i8!(0xA0u8), // 1110 - static_cast_i8!(0x90u8), // 1111 - ), + second_mins_tbl!(), off1_hibits, ); let second_under: __m128i = _mm_cmpgt_epi8(second_mins, current_bytes); - *has_error = _mm_or_si128(*has_error, _mm_and_si128(initial_under, second_under)); + *has_error = _mm_or_si128( + *has_error, + _mm_and_si128(initial_under, second_under) + ); } } -pub struct AvxProcessedUtfBytes { +pub struct ProcessedUtfBytes { rawbytes: __m128i, high_nibbles: __m128i, pub carried_continuations: __m128i, } -impl Default for AvxProcessedUtfBytes { +impl Default for ProcessedUtfBytes { #[cfg_attr(not(feature = "no-inline"), inline)] fn default() -> Self { unsafe { - AvxProcessedUtfBytes { + ProcessedUtfBytes { rawbytes: _mm_setzero_si128(), high_nibbles: _mm_setzero_si128(), carried_continuations: _mm_setzero_si128(), @@ -211,36 +215,40 @@ impl Default for AvxProcessedUtfBytes { } #[cfg_attr(not(feature = "no-inline"), inline)] -fn avx_count_nibbles(bytes: __m128i, answer: &mut AvxProcessedUtfBytes) { +fn count_nibbles(bytes: __m128i, answer: &mut ProcessedUtfBytes) { answer.rawbytes = bytes; - answer.high_nibbles = - unsafe { _mm_and_si128(_mm_srli_epi16(bytes, 4), _mm_set1_epi8(0x0F)) }; + answer.high_nibbles = unsafe { + _mm_and_si128( + _mm_srli_epi16(bytes, 4), + _mm_set1_epi8(0x0F) + ) + }; } // check whether the current bytes are valid UTF-8 // at the end of the function, previous gets updated #[cfg_attr(not(feature = "no-inline"), inline)] -pub fn avxcheck_utf8_bytes( +pub fn check_utf8_bytes( current_bytes: __m128i, - previous: &AvxProcessedUtfBytes, + previous: &ProcessedUtfBytes, has_error: &mut __m128i, -) -> AvxProcessedUtfBytes { - let mut pb = AvxProcessedUtfBytes::default(); - avx_count_nibbles(current_bytes, &mut pb); +) -> ProcessedUtfBytes { + let mut pb = ProcessedUtfBytes::default(); + count_nibbles(current_bytes, &mut pb); - avxcheck_smaller_than_0xf4(current_bytes, has_error); + check_smaller_than_0xf4(current_bytes, has_error); - let initial_lengths: __m128i = avxcontinuation_lengths(pb.high_nibbles); + let initial_lengths: __m128i = continuation_lengths(pb.high_nibbles); pb.carried_continuations = - avxcarry_continuations(initial_lengths, previous.carried_continuations); + carry_continuations(initial_lengths, previous.carried_continuations); - avxcheck_continuations(initial_lengths, pb.carried_continuations, has_error); + check_continuations(initial_lengths, pb.carried_continuations, has_error); let off1_current_bytes: __m128i = push_last_byte_of_a_to_b(previous.rawbytes, pb.rawbytes); - avxcheck_first_continuation_max(current_bytes, off1_current_bytes, has_error); + check_first_continuation_max(current_bytes, off1_current_bytes, has_error); - avxcheck_overlong( + check_overlong( current_bytes, off1_current_bytes, pb.high_nibbles,