Skip to content

Commit

Permalink
sse{,2,4.1}, avx{,2} *_stream_{,load}: use __builtin_nontemporal_{loa…
Browse files Browse the repository at this point in the history
…d,store}
  • Loading branch information
mr-c committed Oct 19, 2023
1 parent fba97e4 commit 6ce6030
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 16 deletions.
8 changes: 7 additions & 1 deletion simde/x86/avx.h
Original file line number Diff line number Diff line change
Expand Up @@ -5345,6 +5345,8 @@ void
simde_mm256_stream_ps (simde_float32 mem_addr[8], simde__m256 a) {
#if defined(SIMDE_X86_AVX_NATIVE)
_mm256_stream_ps(mem_addr, a);
#elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && defined(SIMDE_VECTOR_SUBSCRIPT)
__builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr));
#else
simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256), &a, sizeof(a));
#endif
Expand All @@ -5359,6 +5361,8 @@ void
simde_mm256_stream_pd (simde_float64 mem_addr[4], simde__m256d a) {
#if defined(SIMDE_X86_AVX_NATIVE)
_mm256_stream_pd(mem_addr, a);
#elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && defined(SIMDE_VECTOR_SUBSCRIPT)
__builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr));
#else
simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256d), &a, sizeof(a));
#endif
Expand All @@ -5373,8 +5377,10 @@ void
simde_mm256_stream_si256 (simde__m256i* mem_addr, simde__m256i a) {
#if defined(SIMDE_X86_AVX_NATIVE)
_mm256_stream_si256(mem_addr, a);
#elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && defined(SIMDE_VECTOR_SUBSCRIPT)
__builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr));
#else
simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256i), &a, sizeof(a));
simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256i), &a, sizeof(a));
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
Expand Down
2 changes: 2 additions & 0 deletions simde/x86/avx2.h
Original file line number Diff line number Diff line change
Expand Up @@ -5117,6 +5117,8 @@ simde__m256i
simde_mm256_stream_load_si256 (const simde__m256i* mem_addr) {
#if defined(SIMDE_X86_AVX2_NATIVE)
return _mm256_stream_load_si256(HEDLEY_CONST_CAST(simde__m256i*, mem_addr));
#elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && defined(SIMDE_VECTOR_SUBSCRIPT)
return __builtin_nontemporal_load(mem_addr);
#else
simde__m256i r;
simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256i), sizeof(r));
Expand Down
21 changes: 13 additions & 8 deletions simde/x86/sse.h
Original file line number Diff line number Diff line change
Expand Up @@ -4754,16 +4754,19 @@ void
simde_mm_stream_pi (simde__m64* mem_addr, simde__m64 a) {
#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
_mm_stream_pi(HEDLEY_REINTERPRET_CAST(__m64*, mem_addr), a);
#elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && ( \
defined(SIMDE_ARM_NEON_A32V7_NATIVE) || defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) || \
defined(SIMDE_VECTOR_SUBSCRIPT))
__builtin_nontemporal_store(a, mem_addr);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
simde__m64_private a_ = simde__m64_to_private(a);
vst1_s64(HEDLEY_REINTERPRET_CAST(int64_t *, mem_addr), a_.neon_i64);
#else
simde__m64_private*
dest = HEDLEY_REINTERPRET_CAST(simde__m64_private*, mem_addr),
a_ = simde__m64_to_private(a);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
dest->i64[0] = vget_lane_s64(a_.neon_i64, 0);
#else
dest->i64[0] = a_.i64[0];
#endif
dest->i64[0] = a_.i64[0];
#endif
}
#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
Expand All @@ -4775,9 +4778,11 @@ void
simde_mm_stream_ps (simde_float32 mem_addr[4], simde__m128 a) {
#if defined(SIMDE_X86_SSE_NATIVE)
_mm_stream_ps(mem_addr, a);
#elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
simde__m128_private a_ = simde__m128_to_private(a);
__builtin_nontemporal_store(a_.f32, SIMDE_ALIGN_CAST(__typeof__(a_.f32)*, mem_addr));
#elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && ( \
defined(SIMDE_ARM_NEON_A32V7_NATIVE) || defined(SIMDE_VECTOR_SUBSCRIPT) || \
defined(SIMDE_WASM_SIMD128_NATIVE) || defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || \
defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) || defined(SIMDE_LOONGARCH_LSX_NATIVE))
__builtin_nontemporal_store(a, SIMDE_ALIGN_ASSUME_CAST(__typeof__(a)*, mem_addr));
#else
simde_mm_store_ps(mem_addr, a);
#endif
Expand Down
30 changes: 26 additions & 4 deletions simde/x86/sse2.h
Original file line number Diff line number Diff line change
Expand Up @@ -3495,13 +3495,13 @@ simde__m128i
simde_mm_load_si128 (simde__m128i const* mem_addr) {
#if defined(SIMDE_X86_SSE2_NATIVE)
return _mm_load_si128(HEDLEY_REINTERPRET_CAST(__m128i const*, mem_addr));
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vld1q_s64(HEDLEY_REINTERPRET_CAST(int64_t const*, mem_addr));
#else
simde__m128i_private r_;

#if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
r_.altivec_i32 = vec_ld(0, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(int) const*, mem_addr));
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
r_.neon_i32 = vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr));
#else
simde_memcpy(&r_, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128i), sizeof(simde__m128i));
#endif
Expand Down Expand Up @@ -4949,6 +4949,10 @@ simde_mm_loadu_si32 (void const* mem_addr) {
return _mm_loadu_si32(mem_addr);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
return simde__m128i_from_wasm_v128(wasm_v128_load32_zero(mem_addr));
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
simde__m128i_private r_;
r_.neon_i32 = vsetq_lane_s32(* HEDLEY_REINTERPRET_CAST(const int32_t *, mem_addr), vdupq_n_s32(0), 0);
return simde__m128i_from_private(r_);
#else
int32_t val;
simde_memcpy(&val, mem_addr, sizeof(val));
Expand Down Expand Up @@ -6600,8 +6604,13 @@ void
simde_mm_stream_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
_mm_stream_pd(mem_addr, a);
#elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && ( \
defined(SIMDE_VECTOR_SUBSCRIPT) || defined(SIMDE_ARM_NEON_A64V8_NATIVE) || \
defined(SIMDE_WASM_SIMD128_NATIVE) || defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || \
defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE))
__builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr));
#else
simde_memcpy(mem_addr, &a, sizeof(a));
simde_mm_store_pd(mem_addr, a);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
Expand All @@ -6613,8 +6622,13 @@ void
simde_mm_stream_si128 (simde__m128i* mem_addr, simde__m128i a) {
#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
_mm_stream_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a);
#elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && ( \
defined(SIMDE_VECTOR_SUBSCRIPT) || defined(SIMDE_ARM_NEON_A32V7_NATIVE) || \
defined(SIMDE_WASM_SIMD128_NATIVE) || defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || \
defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE))
__builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr));
#else
simde_memcpy(mem_addr, &a, sizeof(a));
simde_mm_store_si128(mem_addr, a);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
Expand All @@ -6626,6 +6640,10 @@ void
simde_mm_stream_si32 (int32_t* mem_addr, int32_t a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
_mm_stream_si32(mem_addr, a);
#elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store)
__builtin_nontemporal_store(a, mem_addr);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
vst1q_lane_s32(mem_addr, vdupq_n_s32(a), 0);
#else
*mem_addr = a;
#endif
Expand All @@ -6639,6 +6657,10 @@ void
simde_mm_stream_si64 (int64_t* mem_addr, int64_t a) {
#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(HEDLEY_MSVC_VERSION)
_mm_stream_si64(SIMDE_CHECKED_REINTERPRET_CAST(long long int*, int64_t*, mem_addr), a);
#elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store)
__builtin_nontemporal_store(a, mem_addr);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
vst1_s64(mem_addr, vdup_n_s64(a));
#else
*mem_addr = a;
#endif
Expand Down
9 changes: 6 additions & 3 deletions simde/x86/sse4.1.h
Original file line number Diff line number Diff line change
Expand Up @@ -2139,10 +2139,13 @@ simde__m128i
simde_mm_stream_load_si128 (const simde__m128i* mem_addr) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
return _mm_stream_load_si128(HEDLEY_CONST_CAST(simde__m128i*, mem_addr));
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vreinterpretq_s64_s32(vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr)));
#elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_load) && ( \
defined(SIMDE_ARM_NEON_A32V7_NATIVE) || defined(SIMDE_VECTOR_SUBSCRIPT) || \
defined(SIMDE_WASM_SIMD128_NATIVE) || defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || \
defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE))
return __builtin_nontemporal_load(mem_addr);
#else
return *mem_addr;
return simde_mm_load_si128(mem_addr);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
Expand Down

0 comments on commit 6ce6030

Please sign in to comment.