From ae6702a11ccf507492373999d8e4404cd35196f0 Mon Sep 17 00:00:00 2001 From: Evan Nemerson Date: Sat, 24 Jul 2021 14:34:37 -0400 Subject: [PATCH] neon/cgt, simd128: improve some unsigned comparisons on x86 These are based on @aqrit's suggestions at https://github.com/simd-everywhere/simde/issues/855#issuecomment-884547085 and https://github.com/simd-everywhere/simde/issues/855#issuecomment-886057227 --- simde/arm/neon/cgt.h | 15 +++++++++------ simde/wasm/simd128.h | 16 ++++++++++++++-- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/simde/arm/neon/cgt.h b/simde/arm/neon/cgt.h index c63f91c7f..a090dca5b 100644 --- a/simde/arm/neon/cgt.h +++ b/simde/arm/neon/cgt.h @@ -315,8 +315,8 @@ simde_vcgtq_u8(simde_uint8x16_t a, simde_uint8x16_t b) { b_ = simde_uint8x16_to_private(b); #if defined(SIMDE_X86_SSE2_NATIVE) - __m128i sign_bit = _mm_set1_epi8(INT8_MIN); - r_.m128i = _mm_cmpgt_epi8(_mm_xor_si128(a_.m128i, sign_bit), _mm_xor_si128(b_.m128i, sign_bit)); + __m128i tmp = _mm_subs_epu8(a_.m128i, b_.m128i); + r_.m128i = _mm_adds_epu8(tmp, _mm_sub_epi8(_mm_setzero_si128(), tmp)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_u8x16_gt(a_.v128, b_.v128); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) @@ -350,8 +350,8 @@ simde_vcgtq_u16(simde_uint16x8_t a, simde_uint16x8_t b) { b_ = simde_uint16x8_to_private(b); #if defined(SIMDE_X86_SSE2_NATIVE) - __m128i sign_bit = _mm_set1_epi16(INT16_MIN); - r_.m128i = _mm_cmpgt_epi16(_mm_xor_si128(a_.m128i, sign_bit), _mm_xor_si128(b_.m128i, sign_bit)); + __m128i tmp = _mm_subs_epu16(a_.m128i, b_.m128i); + r_.m128i = _mm_adds_epu16(tmp, _mm_sub_epi16(_mm_setzero_si128(), tmp)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_u16x8_gt(a_.v128, b_.v128); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) @@ -385,8 +385,11 @@ simde_vcgtq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { b_ = simde_uint32x4_to_private(b); #if defined(SIMDE_X86_SSE2_NATIVE) - __m128i sign_bit = _mm_set1_epi32(INT32_MIN); - r_.m128i = _mm_cmpgt_epi32(_mm_xor_si128(a_.m128i, sign_bit), _mm_xor_si128(b_.m128i, sign_bit)); + r_.m128i = + _mm_xor_si128( + _mm_cmpgt_epi32(a_.m128i, b_.m128i), + _mm_srai_epi32(_mm_xor_si128(a_.m128i, b_.m128i), 31) + ); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_u32x4_gt(a_.v128, b_.v128); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) diff --git a/simde/wasm/simd128.h b/simde/wasm/simd128.h index 22e159889..5aea4b3ff 100644 --- a/simde/wasm/simd128.h +++ b/simde/wasm/simd128.h @@ -1911,6 +1911,9 @@ simde_wasm_u8x16_gt (simde_v128_t a, simde_v128_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_u8 = vcgtq_u8(a_.neon_u8, b_.neon_u8); + #elif defined(SIMDE_X86_SSE2_NATIVE) + __m128i tmp = _mm_subs_epu8(a_.sse_m128i, b_.sse_m128i); + r_.sse_m128i = _mm_adds_epu8(tmp, _mm_sub_epi8(_mm_setzero_si128(), tmp)); #elif defined(SIMDE_VECTOR_SUBSCRIPT) r_.u8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u8), a_.u8 > b_.u8); #else @@ -1938,7 +1941,10 @@ simde_wasm_u16x8_gt (simde_v128_t a, simde_v128_t b) { b_ = simde_v128_to_private(b), r_; - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #if defined(SIMDE_X86_SSE2_NATIVE) + __m128i tmp = _mm_subs_epu16(a_.sse_m128i, b_.sse_m128i); + r_.sse_m128i = _mm_adds_epu16(tmp, _mm_sub_epi16(_mm_setzero_si128(), tmp)); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_u16 = vcgtq_u16(a_.neon_u16, b_.neon_u16); #elif defined(SIMDE_VECTOR_SUBSCRIPT) r_.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u16), a_.u16 > b_.u16); @@ -1967,7 +1973,13 @@ simde_wasm_u32x4_gt (simde_v128_t a, simde_v128_t b) { b_ = simde_v128_to_private(b), r_; - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #if defined(SIMDE_X86_SSE2_NATIVE) + r_.sse_m128i = + _mm_xor_si128( + _mm_cmpgt_epi32(a_.sse_m128i, b_.sse_m128i), + _mm_srai_epi32(_mm_xor_si128(a_.sse_m128i, b_.sse_m128i), 31) + ); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_u32 = vcgtq_u32(a_.neon_u32, b_.neon_u32); #elif defined(SIMDE_VECTOR_SUBSCRIPT) r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), a_.u32 > b_.u32);