neon: refactor to use different types on all targets
Previously we would typedef all of the NEON types to native types.  For
WASM and x86 this meant that multiple NEON types were typedef'd to the
same type (e.g., simde_int8x16_t and simde_int16x8_t were both typedef'd
to __m128i on x86, and to v128_t on WASM).  This causes problems in some
software, for example with C++ functions overloaded on the NEON type.
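
As a concrete (hypothetical) illustration, the following minimal C++ sketch
shows why two typedefs of the same native type cannot be told apart by
overload resolution; "plain_m128i" is just a stand-in for a native vector
type such as __m128i or v128_t, and "consume" is not a real SIMDe function:

    // Hypothetical stand-in for a native 128-bit vector type.
    #include <type_traits>

    struct plain_m128i { unsigned char bytes[16]; };

    typedef plain_m128i simde_int8x16_t;   // old approach: every NEON type
    typedef plain_m128i simde_int16x8_t;   // aliases the same native type

    // Since both typedefs name one and the same type, defining both of these
    // would be a redefinition of a single function rather than two overloads:
    //   void consume(simde_int8x16_t v) { /* ... */ }
    //   void consume(simde_int16x8_t v) { /* ... */ }  // error: redefinition
    static_assert(std::is_same<simde_int8x16_t, simde_int16x8_t>::value,
                  "native typedefs collapse distinct NEON types into one type");

    int main(void) { return 0; }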

Unfortunately it's not just a matter of switching those typedefs, since
the code was previously structured to assume that the public types
would be typedefs to native types.  This patch restructures all the x86
and WASM implementations in NEON to instead use the private types just
like the portable version.  For now I've preserved the *option* of
using the native types in the public API, but I don't plan on testing
it, and unless anyone actually uses it I'll probably just remove it
soon.
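
Roughly, the new structure looks like the sketch below (hypothetical demo_*
names, heavily simplified from the real simde_*_private types, which are
unions that also carry per-target members such as m64, m128i, and v128 plus
alignment and vector-extension attributes).  The public function unpacks its
argument into a private view, operates on whichever member matches the
target, and packs the result back up:

    #include <stdint.h>
    #include <stddef.h>

    /* Public type and its "private" view.  In the real code the private type
       is a union with extra per-target members. */
    typedef struct { int8_t values[16]; } demo_int8x16_t;
    typedef struct { int8_t values[16]; } demo_int8x16_private;

    static demo_int8x16_private demo_int8x16_to_private(demo_int8x16_t v) {
      demo_int8x16_private r;
      for (size_t i = 0 ; i < 16 ; i++) r.values[i] = v.values[i];
      return r;
    }

    static demo_int8x16_t demo_int8x16_from_private(demo_int8x16_private v) {
      demo_int8x16_t r;
      for (size_t i = 0 ; i < 16 ; i++) r.values[i] = v.values[i];
      return r;
    }

    static demo_int8x16_t demo_vabsq_s8(demo_int8x16_t a) {
      demo_int8x16_private r_, a_ = demo_int8x16_to_private(a);
      /* On x86 this branch would be r_.m128i = _mm_abs_epi8(a_.m128i); and on
         WASM r_.v128 = wasm_i8x16_abs(a_.v128); here, only the portable
         fallback: */
      for (size_t i = 0 ; i < 16 ; i++)
        r_.values[i] = (a_.values[i] < 0) ? (int8_t) -a_.values[i] : a_.values[i];
      return demo_int8x16_from_private(r_);
    }

    int main(void) {
      demo_int8x16_t a = { { -3, 5, -100, 0, 1, -1, 7, -7,
                             2, -2, 9, -9, 4, -4, 6, -6 } };
      return (demo_vabsq_s8(a).values[0] == 3) ? 0 : 1;
    }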

Note that the AltiVec and native NEON implementations don't really change;
changing them would just introduce unnecessary complexity for the compiler
(which it *should* be able to elide, but I see no reason to risk it).
nemequ committed May 12, 2021
commit c17957a (1 parent: ebe5c7d)
Showing 69 changed files with 4,487 additions and 4,268 deletions.
126 changes: 65 additions & 61 deletions simde/arm/neon/abs.h
@@ -98,14 +98,14 @@ simde_int8x8_t
simde_vabs_s8(simde_int8x8_t a) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabs_s8(a);
#elif defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
return _mm_abs_pi8(a);
#else
simde_int8x8_private
r_,
a_ = simde_int8x8_to_private(a);

#if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
r_.m64 = _mm_abs_pi8(a_.m64);
#elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
__typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT8_C(0));
r_.values = (-a_.values & m) | (a_.values & ~m);
#else
@@ -128,14 +128,14 @@ simde_int16x4_t
simde_vabs_s16(simde_int16x4_t a) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabs_s16(a);
#elif defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
return _mm_abs_pi16(a);
#else
simde_int16x4_private
r_,
a_ = simde_int16x4_to_private(a);

#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
r_.m64 = _mm_abs_pi16(a_.m64);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
__typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT16_C(0));
r_.values = (-a_.values & m) | (a_.values & ~m);
#else
@@ -158,14 +158,14 @@ simde_int32x2_t
simde_vabs_s32(simde_int32x2_t a) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabs_s32(a);
#elif defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
return _mm_abs_pi32(a);
#else
simde_int32x2_private
r_,
a_ = simde_int32x2_to_private(a);

#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
r_.m64 = _mm_abs_pi32(a_.m64);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
__typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT32_C(0));
r_.values = (-a_.values & m) | (a_.values & ~m);
#else
@@ -218,22 +218,24 @@ simde_vabsq_f32(simde_float32x4_t a) {
return vabsq_f32(a);
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
return vec_abs(a);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
return wasm_f32x4_abs(a);
#elif defined(SIMDE_X86_SSE_NATIVE)
simde_float32 mask_;
uint32_t u32_ = UINT32_C(0x7FFFFFFF);
simde_memcpy(&mask_, &u32_, sizeof(u32_));
return _mm_and_ps(_mm_set1_ps(mask_), a);
#else
simde_float32x4_private
r_,
a_ = simde_float32x4_to_private(a);

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = simde_math_fabsf(a_.values[i]);
}
#if defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_f32x4_abs(a_.v128);
#elif defined(SIMDE_X86_SSE_NATIVE)
simde_float32 mask_;
uint32_t u32_ = UINT32_C(0x7FFFFFFF);
simde_memcpy(&mask_, &u32_, sizeof(u32_));
r_.m128 = _mm_and_ps(_mm_set1_ps(mask_), a_.m128);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = simde_math_fabsf(a_.values[i]);
}
#endif

return simde_float32x4_from_private(r_);
#endif
@@ -250,20 +252,22 @@ simde_vabsq_f64(simde_float64x2_t a) {
return vabsq_f64(a);
#elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
return vec_abs(a);
#elif defined(SIMDE_X86_SSE2_NATIVE)
simde_float64 mask_;
uint64_t u64_ = UINT64_C(0x7FFFFFFFFFFFFFFF);
simde_memcpy(&mask_, &u64_, sizeof(u64_));
return _mm_and_pd(_mm_set1_pd(mask_), a);
#else
simde_float64x2_private
r_,
a_ = simde_float64x2_to_private(a);

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = simde_math_fabs(a_.values[i]);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
simde_float64 mask_;
uint64_t u64_ = UINT64_C(0x7FFFFFFFFFFFFFFF);
simde_memcpy(&mask_, &u64_, sizeof(u64_));
r_.m128d = _mm_and_pd(_mm_set1_pd(mask_), a_.m128d);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = simde_math_fabs(a_.values[i]);
}
#endif

return simde_float64x2_from_private(r_);
#endif
@@ -278,22 +282,22 @@ simde_int8x16_t
simde_vabsq_s8(simde_int8x16_t a) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabsq_s8(a);
#elif defined(SIMDE_X86_SSSE3_NATIVE)
return _mm_abs_epi8(a);
#elif defined(SIMDE_X86_SSE2_NATIVE)
return _mm_min_epu8(a, _mm_sub_epi8(_mm_setzero_si128(), a));
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
return vec_abs(a);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
return wasm_i8x16_abs(a);
#else
simde_int8x16_private
r_,
a_ = simde_int8x16_to_private(a);

#if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
__typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT8_C(0));
r_.values = (-a_.values & m) | (a_.values & ~m);
#if defined(SIMDE_X86_SSSE3_NATIVE)
r_.m128i = _mm_abs_epi8(a_.m128i);
#elif defined(SIMDE_X86_SSE2_NATIVE)
r_.m128i = _mm_min_epu8(a_.m128i, _mm_sub_epi8(_mm_setzero_si128(), a_.m128i));
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_i8x16_abs(a_.v128);
#elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
__typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT8_C(0));
r_.values = (-a_.values & m) | (a_.values & ~m);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -314,20 +318,20 @@ simde_int16x8_t
simde_vabsq_s16(simde_int16x8_t a) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabsq_s16(a);
#elif defined(SIMDE_X86_SSSE3_NATIVE)
return _mm_abs_epi16(a);
#elif defined(SIMDE_X86_SSE2_NATIVE)
return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a));
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
return vec_abs(a);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
return wasm_i16x8_abs(a);
#else
simde_int16x8_private
r_,
a_ = simde_int16x8_to_private(a);

#if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
#if defined(SIMDE_X86_SSSE3_NATIVE)
r_.m128i = _mm_abs_epi16(a_.m128i);
#elif defined(SIMDE_X86_SSE2_NATIVE)
r_.m128i = _mm_max_epi16(a_.m128i, _mm_sub_epi16(_mm_setzero_si128(), a_.m128i));
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_i16x8_abs(a_.v128);
#elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
__typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT16_C(0));
r_.values = (-a_.values & m) | (a_.values & ~m);
#else
@@ -350,21 +354,21 @@ simde_int32x4_t
simde_vabsq_s32(simde_int32x4_t a) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabsq_s32(a);
#elif defined(SIMDE_X86_SSSE3_NATIVE)
return _mm_abs_epi32(a);
#elif defined(SIMDE_X86_SSE2_NATIVE)
const __m128i m = _mm_cmpgt_epi32(_mm_setzero_si128(), a);
return _mm_sub_epi32(_mm_xor_si128(a, m), m);
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
return vec_abs(a);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
return wasm_i32x4_abs(a);
#else
simde_int32x4_private
r_,
a_ = simde_int32x4_to_private(a);

#if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
#if defined(SIMDE_X86_SSSE3_NATIVE)
r_.m128i = _mm_abs_epi32(a_.m128i);
#elif defined(SIMDE_X86_SSE2_NATIVE)
const __m128i m = _mm_cmpgt_epi32(_mm_setzero_si128(), a_.m128i);
r_.m128i = _mm_sub_epi32(_mm_xor_si128(a_.m128i, m), m);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_i32x4_abs(a_.v128);
#elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
__typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT32_C(0));
r_.values = (-a_.values & m) | (a_.values & ~m);
#else
@@ -389,21 +393,21 @@ simde_vabsq_s64(simde_int64x2_t a) {
return vabsq_s64(a);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vbslq_s64(vreinterpretq_u64_s64(vshrq_n_s64(a, 63)), vsubq_s64(vdupq_n_s64(0), a), a);
#elif defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm_abs_epi64(a);
#elif defined(SIMDE_X86_SSE2_NATIVE)
const __m128i m = _mm_srai_epi32(_mm_shuffle_epi32(a, 0xF5), 31);
return _mm_sub_epi64(_mm_xor_si128(a, m), m);
#elif defined(SIMDE_POWER_ALTIVEC_P64_NATIVE) && !defined(HEDLEY_IBM_VERSION)
return vec_abs(a);
#elif defined(SIMDE_WASM_SIMD128_NATIVE) && 0
return wasm_i64x2_abs(a);
#else
simde_int64x2_private
r_,
a_ = simde_int64x2_to_private(a);

#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
#if defined(SIMDE_X86_AVX512VL_NATIVE)
r_.m128i = _mm_abs_epi64(a_.m128i);
#elif defined(SIMDE_X86_SSE2_NATIVE)
const __m128i m = _mm_srai_epi32(_mm_shuffle_epi32(a_.m128i, 0xF5), 31);
r_.m128i = _mm_sub_epi64(_mm_xor_si128(a_.m128i, m), m);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_i64x2_abs(a_.v128);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
__typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT64_C(0));
r_.values = (-a_.values & m) | (a_.values & ~m);
#else