Skip to content

Commit

Permalink
neon/cgt, simd128: improve some unsigned comparisons on x86
Browse files: browse the repository at this point in the history.
These are based on @aqrit's suggestions at #855 (comment) and #855 (comment).
  • Loading branch information
nemequ committed Jul 24, 2021
1 parent c3ddbbe commit ae6702a
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 8 deletions.
15 changes: 9 additions & 6 deletions simde/arm/neon/cgt.h
Original file line number Diff line number Diff line change
Expand Up @@ -315,8 +315,8 @@ simde_vcgtq_u8(simde_uint8x16_t a, simde_uint8x16_t b) {
b_ = simde_uint8x16_to_private(b);

#if defined(SIMDE_X86_SSE2_NATIVE)
__m128i sign_bit = _mm_set1_epi8(INT8_MIN);
r_.m128i = _mm_cmpgt_epi8(_mm_xor_si128(a_.m128i, sign_bit), _mm_xor_si128(b_.m128i, sign_bit));
__m128i tmp = _mm_subs_epu8(a_.m128i, b_.m128i);
r_.m128i = _mm_adds_epu8(tmp, _mm_sub_epi8(_mm_setzero_si128(), tmp));
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_u8x16_gt(a_.v128, b_.v128);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
Expand Down Expand Up @@ -350,8 +350,8 @@ simde_vcgtq_u16(simde_uint16x8_t a, simde_uint16x8_t b) {
b_ = simde_uint16x8_to_private(b);

#if defined(SIMDE_X86_SSE2_NATIVE)
__m128i sign_bit = _mm_set1_epi16(INT16_MIN);
r_.m128i = _mm_cmpgt_epi16(_mm_xor_si128(a_.m128i, sign_bit), _mm_xor_si128(b_.m128i, sign_bit));
__m128i tmp = _mm_subs_epu16(a_.m128i, b_.m128i);
r_.m128i = _mm_adds_epu16(tmp, _mm_sub_epi16(_mm_setzero_si128(), tmp));
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_u16x8_gt(a_.v128, b_.v128);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
Expand Down Expand Up @@ -385,8 +385,11 @@ simde_vcgtq_u32(simde_uint32x4_t a, simde_uint32x4_t b) {
b_ = simde_uint32x4_to_private(b);

#if defined(SIMDE_X86_SSE2_NATIVE)
__m128i sign_bit = _mm_set1_epi32(INT32_MIN);
r_.m128i = _mm_cmpgt_epi32(_mm_xor_si128(a_.m128i, sign_bit), _mm_xor_si128(b_.m128i, sign_bit));
r_.m128i =
_mm_xor_si128(
_mm_cmpgt_epi32(a_.m128i, b_.m128i),
_mm_srai_epi32(_mm_xor_si128(a_.m128i, b_.m128i), 31)
);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_u32x4_gt(a_.v128, b_.v128);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
Expand Down
16 changes: 14 additions & 2 deletions simde/wasm/simd128.h
Original file line number Diff line number Diff line change
Expand Up @@ -1911,6 +1911,9 @@ simde_wasm_u8x16_gt (simde_v128_t a, simde_v128_t b) {

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
r_.neon_u8 = vcgtq_u8(a_.neon_u8, b_.neon_u8);
#elif defined(SIMDE_X86_SSE2_NATIVE)
__m128i tmp = _mm_subs_epu8(a_.sse_m128i, b_.sse_m128i);
r_.sse_m128i = _mm_adds_epu8(tmp, _mm_sub_epi8(_mm_setzero_si128(), tmp));
#elif defined(SIMDE_VECTOR_SUBSCRIPT)
r_.u8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u8), a_.u8 > b_.u8);
#else
Expand Down Expand Up @@ -1938,7 +1941,10 @@ simde_wasm_u16x8_gt (simde_v128_t a, simde_v128_t b) {
b_ = simde_v128_to_private(b),
r_;

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
#if defined(SIMDE_X86_SSE2_NATIVE)
__m128i tmp = _mm_subs_epu16(a_.sse_m128i, b_.sse_m128i);
r_.sse_m128i = _mm_adds_epu16(tmp, _mm_sub_epi16(_mm_setzero_si128(), tmp));
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
r_.neon_u16 = vcgtq_u16(a_.neon_u16, b_.neon_u16);
#elif defined(SIMDE_VECTOR_SUBSCRIPT)
r_.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u16), a_.u16 > b_.u16);
Expand Down Expand Up @@ -1967,7 +1973,13 @@ simde_wasm_u32x4_gt (simde_v128_t a, simde_v128_t b) {
b_ = simde_v128_to_private(b),
r_;

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
#if defined(SIMDE_X86_SSE2_NATIVE)
r_.sse_m128i =
_mm_xor_si128(
_mm_cmpgt_epi32(a_.sse_m128i, b_.sse_m128i),
_mm_srai_epi32(_mm_xor_si128(a_.sse_m128i, b_.sse_m128i), 31)
);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
r_.neon_u32 = vcgtq_u32(a_.neon_u32, b_.neon_u32);
#elif defined(SIMDE_VECTOR_SUBSCRIPT)
r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), a_.u32 > b_.u32);
Expand Down

0 comments on commit ae6702a

Please sign in to comment.