From 6488a1a985a1204a9d5a66bd120302ed9c5642f4 Mon Sep 17 00:00:00 2001
From: ogiroux
Date: Thu, 10 Sep 2020 10:26:02 -0700
Subject: [PATCH 1/8] Initial version

---
 include/cuda/std/detail/__atomic         |  42 ++-
 include/cuda/std/detail/__atomic_derived |  28 +-
 include/cuda/std/detail/__atomic_msvc    | 374 +++++++++++++++++++++++
 3 files changed, 406 insertions(+), 38 deletions(-)
 create mode 100644 include/cuda/std/detail/__atomic_msvc

diff --git a/include/cuda/std/detail/__atomic b/include/cuda/std/detail/__atomic
index 4261d6a054..0933ce830a 100644
--- a/include/cuda/std/detail/__atomic
+++ b/include/cuda/std/detail/__atomic
@@ -102,6 +102,10 @@ namespace detail {

 _LIBCUDACXX_END_NAMESPACE_CUDA

+#ifdef _MSC_VER
+    #include "__atomic_msvc"
+#endif
+
 #include "__atomic_generated"
 #include "__atomic_derived"

@@ -123,14 +127,14 @@ __host__ __device__ inline void __cxx_atomic_thread_fence(int __order) {
 #ifdef __CUDA_ARCH__
     detail::__atomic_thread_fence_cuda(__order, detail::__thread_scope_system_tag());
 #else
-    ::std::atomic_thread_fence((::std::memory_order)__order);
+    __atomic_thread_fence(__order);
 #endif
 }
 __host__ __device__ inline void __cxx_atomic_signal_fence(int __order) {
 #ifdef __CUDA_ARCH__
     detail::__atomic_signal_fence_cuda(__order);
 #else
-    ::std::atomic_signal_fence((::std::memory_order)__order);
+    __atomic_signal_fence(__order);
 #endif
 }

@@ -190,30 +194,20 @@ struct __cxx_atomic_base_impl_default {
     constexpr __cxx_atomic_base_impl_default() noexcept = default;
     __host__ __device__ constexpr explicit __cxx_atomic_base_impl_default(_Tp __value) noexcept : __a_value(__value) { }
-#ifdef __CUDA_ARCH__
     __cxx_atomic_alignment_wrapper_t<_Tp> __a_value;
-#else
-    ::std::atomic<__cxx_atomic_alignment_wrapper_t<_Tp>> __a_value;
-#endif
 };
 template
 __host__ __device__ inline void __cxx_atomic_init(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp __val) {
-#if defined(__CUDA_ARCH__)
     auto __tmp = __cxx_atomic_alignment_wrap(__val);
     __cxx_atomic_assign_volatile(__a->__a_value, __tmp);
-#elif defined(_LIBCUDACXX_CUDA_HAS_NO_HOST_STD_ATOMIC_INIT)
-    __a->__a_value.store(__val, ::std::memory_order_relaxed);
-#else
-    ::std::atomic_init(&__a->__a_value, __cxx_atomic_alignment_wrap(__val));
-#endif
 }
 template
 __host__ __device__ inline void __cxx_atomic_store(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp __val, int __order) {
 #ifdef __CUDA_ARCH__
     detail::__atomic_store_n_cuda(&__a->__a_value, __cxx_atomic_alignment_wrap(__val), __order, detail::__scope_tag<_Sco>());
 #else
-    ::std::atomic_store_explicit(&__a->__a_value, __cxx_atomic_alignment_wrap(__val), (::std::memory_order)__order);
+    __atomic_store_n(&__a->__a_value, __cxx_atomic_alignment_wrap(__val), __order);
 #endif
 }
 template
@@ -221,7 +215,7 @@ __host__ __device__ inline _Tp __cxx_atomic_load(__cxx_atomic_base_impl_default<
 #ifdef __CUDA_ARCH__
     return __cxx_atomic_alignment_unwrap(detail::__atomic_load_n_cuda(&__a->__a_value, __order, detail::__scope_tag<_Sco>()));
 #else
-    return __cxx_atomic_alignment_unwrap(::std::atomic_load_explicit(&__a->__a_value, (::std::memory_order)__order));
+    return __cxx_atomic_alignment_unwrap(__atomic_load_n(&__a->__a_value, __order));
 #endif
 }
 template
@@ -229,7 +223,7 @@ __host__ __device__ inline _Tp __cxx_atomic_exchange(__cxx_atomic_base_impl_defa
 #ifdef __CUDA_ARCH__
     return __cxx_atomic_alignment_unwrap(detail::__atomic_exchange_n_cuda(&__a->__a_value, __cxx_atomic_alignment_wrap(__val), __order, detail::__scope_tag<_Sco>()));
 #else
-    return
__cxx_atomic_alignment_unwrap(::std::atomic_exchange_explicit(&__a->__a_value, __cxx_atomic_alignment_wrap(__val), (::std::memory_order)__order)); + return __cxx_atomic_alignment_unwrap(__atomic_exchange_n(&__a->__a_value, __cxx_atomic_alignment_wrap(__val), __order)); #endif } template @@ -238,7 +232,7 @@ __host__ __device__ inline bool __cxx_atomic_compare_exchange_strong(__cxx_atomi #ifdef __CUDA_ARCH__ bool __result = detail::__atomic_compare_exchange_n_cuda(&__a->__a_value, &__tmp, __cxx_atomic_alignment_wrap(__val), false, __success, __failure, detail::__scope_tag<_Sco>()); #else - bool __result = ::std::atomic_compare_exchange_strong_explicit(&__a->__a_value, &__tmp, __cxx_atomic_alignment_wrap(__val), (::std::memory_order)__success, (::std::memory_order)__failure); + bool __result = __atomic_compare_exchange_n(&__a->__a_value, &__tmp, __cxx_atomic_alignment_wrap(__val), false, __success, __failure); #endif *__expected = __cxx_atomic_alignment_unwrap(__tmp); return __result; @@ -249,7 +243,7 @@ __host__ __device__ inline bool __cxx_atomic_compare_exchange_weak(__cxx_atomic_ #ifdef __CUDA_ARCH__ bool __result = detail::__atomic_compare_exchange_n_cuda(&__a->__a_value, &__tmp, __cxx_atomic_alignment_wrap(__val), true, __success, __failure, detail::__scope_tag<_Sco>()); #else - bool __result = ::std::atomic_compare_exchange_strong_explicit(&__a->__a_value, &__tmp, __cxx_atomic_alignment_wrap(__val), (::std::memory_order)__success, (::std::memory_order)__failure); + bool __result = __atomic_compare_exchange_n(&__a->__a_value, &__tmp, __cxx_atomic_alignment_wrap(__val), true, __success, __failure); #endif *__expected = __cxx_atomic_alignment_unwrap(__tmp); return __result; @@ -259,7 +253,7 @@ __host__ __device__ inline _Tp __cxx_atomic_fetch_add(__cxx_atomic_base_impl_def #ifdef __CUDA_ARCH__ return detail::__atomic_fetch_add_cuda(&__a->__a_value, __delta, __order, detail::__scope_tag<_Sco>()); #else - return ::std::atomic_fetch_add_explicit(&__a->__a_value, __delta, (::std::memory_order)__order); + return __atomic_fetch_add(&__a->__a_value, __delta, __order); #endif } template @@ -267,7 +261,7 @@ __host__ __device__ inline _Tp* __cxx_atomic_fetch_add(__cxx_atomic_base_impl_de #ifdef __CUDA_ARCH__ return detail::__atomic_fetch_add_cuda(&__a->__a_value, __delta, __order, detail::__scope_tag<_Sco>()); #else - return ::std::atomic_fetch_add_explicit(&__a->__a_value, __delta, (::std::memory_order)__order); + return __atomic_fetch_add(&__a->__a_value, __delta, __order); #endif } template @@ -275,7 +269,7 @@ __host__ __device__ inline _Tp __cxx_atomic_fetch_sub(__cxx_atomic_base_impl_def #ifdef __CUDA_ARCH__ return detail::__atomic_fetch_sub_cuda(&__a->__a_value, __delta, __order, detail::__scope_tag<_Sco>()); #else - return ::std::atomic_fetch_sub_explicit(&__a->__a_value, __delta, (::std::memory_order)__order); + return __atomic_fetch_sub(&__a->__a_value, __delta, __order); #endif } template @@ -283,7 +277,7 @@ __host__ __device__ inline _Tp* __cxx_atomic_fetch_sub(__cxx_atomic_base_impl_de #ifdef __CUDA_ARCH__ return detail::__atomic_fetch_sub_cuda(&__a->__a_value, __delta, __order, detail::__scope_tag<_Sco>()); #else - return ::std::atomic_fetch_sub_explicit(&__a->__a_value, __delta, (::std::memory_order)__order); + return __atomic_fetch_sub_explicit(&__a->__a_value, __delta, __order); #endif } template @@ -291,7 +285,7 @@ __host__ __device__ inline _Tp __cxx_atomic_fetch_and(__cxx_atomic_base_impl_def #ifdef __CUDA_ARCH__ return detail::__atomic_fetch_and_cuda(&__a->__a_value, __pattern, 
__order, detail::__scope_tag<_Sco>()); #else - return ::std::atomic_fetch_and_explicit(&__a->__a_value, __pattern, (::std::memory_order)__order); + return __atomic_fetch_and(&__a->__a_value, __pattern, __order); #endif } template @@ -299,7 +293,7 @@ __host__ __device__ inline _Tp __cxx_atomic_fetch_or(__cxx_atomic_base_impl_defa #ifdef __CUDA_ARCH__ return detail::__atomic_fetch_or_cuda(&__a->__a_value, __pattern, __order, detail::__scope_tag<_Sco>()); #else - return ::std::atomic_fetch_or_explicit(&__a->__a_value, __pattern, (::std::memory_order)__order); + return __atomic_fetch_or(&__a->__a_value, __pattern, __order); #endif } template @@ -307,7 +301,7 @@ __host__ __device__ inline _Tp __cxx_atomic_fetch_xor(__cxx_atomic_base_impl_def #ifdef __CUDA_ARCH__ return detail::__atomic_fetch_xor_cuda(&__a->__a_value, __pattern, __order, detail::__scope_tag<_Sco>()); #else - return ::std::atomic_fetch_xor_explicit(&__a->__a_value, __pattern, (::std::memory_order)__order); + return __atomic_fetch_xor_explicit(&__a->__a_value, __pattern, __order); #endif } diff --git a/include/cuda/std/detail/__atomic_derived b/include/cuda/std/detail/__atomic_derived index 4c08abb80c..0c3d23f3e1 100644 --- a/include/cuda/std/detail/__atomic_derived +++ b/include/cuda/std/detail/__atomic_derived @@ -43,9 +43,9 @@ template __val ? __expected : __val; + _Type __desired = __expected > __val ? __expected : __val; #ifdef __CUDA_ARCH__ while(__desired == __val && !__atomic_compare_exchange_cuda(__ptr, &__expected, &__desired, true, __memorder, __memorder, __s)) { #else while(__desired == __val && - !::std::atomic_compare_exchange_strong_explicit(__ptr, &__expected, __desired, __memorder, __memorder)) { + !__atomic_compare_exchange(__ptr, &__expected, &__desired, true, __memorder, __memorder)) { #endif __desired = __expected > __val ? 
__expected : __val; } @@ -70,13 +70,13 @@ template + +#define _Compiler_barrier() _ReadWriteBarrier() + +#if defined(_M_ARM) || defined(_M_ARM64) + #define _Memory_barrier() __dmb(0xB) // inner shared data memory barrier + #define _Compiler_or_memory_barrier() _Memory_barrier() +#elif defined(_M_IX86) || defined(_M_X64) + #define _Memory_barrier() __faststorefence() + // x86/x64 hardware only emits memory barriers inside _Interlocked intrinsics + #define _Compiler_or_memory_barrier() _Compiler_barrier() +#else // ^^^ x86/x64 / unsupported hardware vvv + #error Unsupported hardware +#endif // hardware + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +static inline void __atomic_signal_fence(int __memorder) { + if (__memorder != __ATOMIC_RELAXED) + _Compiler_barrier(); +} + +static inline void __atomic_thread_fence(int __memorder) { + if (__memorder != __ATOMIC_RELAXED) + _Memory_barrier(); +} + +template::type = 0> +void __atomic_load_relaxed(const volatile _Type *__ptr, _Type *__ret) { + __int8 __tmp = __iso_volatile_load8((const volatile __int8 *)__ptr); + *__ret = reinterpret_cast<_Type&>(__tmp); +} +template::type = 0> +void __atomic_load_relaxed(const volatile _Type *__ptr, _Type *__ret) { + __int16 __tmp = __iso_volatile_load16((const volatile __int16 *)__ptr); + *__ret = reinterpret_cast<_Type&>(__tmp); +} +template::type = 0> +void __atomic_load_relaxed(const volatile _Type *__ptr, _Type *__ret) { + __int32 __tmp = __iso_volatile_load32((const volatile __int32 *)__ptr); + *__ret = reinterpret_cast<_Type&>(__tmp); +} +template::type = 0> +void __atomic_load_relaxed(const volatile _Type *__ptr, _Type *__ret) { + __int64 __tmp = __iso_volatile_load64((const volatile __int64 *)__ptr); + *__ret = reinterpret_cast<_Type&>(__tmp); +} +template +void __atomic_load(const volatile _Type *__ptr, _Type *__ret, int __memorder) { + switch (__memorder) { + case __ATOMIC_SEQ_CST: _Memory_barrier(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __atomic_load_relaxed(__ptr, __ret); _Compiler_or_memory_barrier(); break; + case __ATOMIC_RELAXED: __atomic_load_relaxed(__ptr, __ret); break; + default: assert(0); + } +} + +template::type = 0> +void __atomic_store_relaxed(const volatile _Type *__ptr, _Type *__val) { + __int8 __tmp = reinterpret_cast<__int8&>(*__val); + __iso_volatile_store8((volatile __int8 *)__ptr, __tmp); +} +template::type = 0> +void __atomic_store_relaxed(const volatile _Type *__ptr, _Type *__val) { + __int16 __tmp = reinterpret_cast<__int16&>(*__val); + __iso_volatile_store16((volatile __int16 *)__ptr, __tmp); +} +template::type = 0> +void __atomic_store_relaxed(const volatile _Type *__ptr, _Type *__val) { + __int32 __tmp = reinterpret_cast<__int32&>(*__val); + __iso_volatile_store32((volatile __int32 *)__ptr, __tmp); +} +template::type = 0> +void __atomic_store_relaxed(const volatile _Type *__ptr, _Type *__val) { + __int64 __tmp = reinterpret_cast<__int64&>(*__val); + __iso_volatile_store64((volatile __int64 *)__ptr, __tmp); +} +template +void __atomic_store(volatile _Type *__ptr, _Type *__val, int __memorder) { + switch (__memorder) { + case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_store_relaxed(__ptr, __val); break; + case __ATOMIC_SEQ_CST: _Memory_barrier(); + case __ATOMIC_RELAXED: __atomic_store_relaxed(__ptr, __val); break; + default: assert(0); + } +} + +template::type = 0> +bool __atomic_compare_exchange_relaxed(const volatile _Type *__ptr, _Type *__expected, const _Type *__desired) { + auto __tmp_desired = reinterpret_cast(*__desired); + auto __tmp_expected = 
reinterpret_cast(*__expected); + auto const __old = _InterlockedCompareExchange8((volatile char *)__ptr, __tmp_desired, __tmp_expected); + if(__old == __tmp_expected) + return true; + *__expected = reinterpret_cast<_Type&>(__old); + return false; +} +template::type = 0> +bool __atomic_compare_exchange_relaxed(const volatile _Type *__ptr, _Type *__expected, const _Type *__desired) { + auto __tmp_desired = reinterpret_cast(*__desired); + auto __tmp_expected = reinterpret_cast(*__expected); + auto const __old = _InterlockedCompareExchange16((volatile short *)__ptr, __tmp_desired, __tmp_expected); + if(__old == __tmp_expected) + return true; + *__expected = reinterpret_cast<_Type&>(__old); + return false; +} +template::type = 0> +bool __atomic_compare_exchange_relaxed(const volatile _Type *__ptr, _Type *__expected, const _Type *__desired) { + auto __tmp_desired = reinterpret_cast(*__desired); + auto __tmp_expected = reinterpret_cast(*__expected); + auto const __old = _InterlockedCompareExchange((volatile long *)__ptr, __tmp_desired, __tmp_expected); + if(__old == __tmp_expected) + return true; + *__expected = reinterpret_cast<_Type&>(__old); + return false; +} +template::type = 0> +bool __atomic_compare_exchange_relaxed(const volatile _Type *__ptr, _Type *__expected, const _Type *__desired) { + auto __tmp_desired = reinterpret_cast<__int64&>(*__desired); + auto __tmp_expected = reinterpret_cast<__int64&>(*__expected); + auto const __old = _InterlockedCompareExchange64((volatile __int64 *)__ptr, __tmp_desired, __tmp_expected); + if(__old == __tmp_expected) + return true; + *__expected = reinterpret_cast<_Type&>(__old); + return false; +} +template +bool __atomic_compare_exchange(_Type volatile *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder) { + bool success; + switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { + case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); success = __atomic_compare_exchange_relaxed(__ptr, __expected, __desired); break; + case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); + case __ATOMIC_ACQUIRE: success = __atomic_compare_exchange_relaxed(__ptr, __expected, __desired); _Compiler_or_memory_barrier(); break; + case __ATOMIC_SEQ_CST: _Memory_barrier(); success = __atomic_compare_exchange_relaxed(__ptr, __expected, __desired); _Compiler_or_memory_barrier(); break; + case __ATOMIC_RELAXED: success = __atomic_compare_exchange_relaxed(__ptr, __expected, __desired); break; + default: assert(0); + } + return success; +} + +template::type = 0> +void __atomic_exchange_relaxed(const volatile _Type *__ptr, const _Type *__val, _Type *__ret) { + auto const __old = _InterlockedExchange8((volatile char *)__ptr, reinterpret_cast(*__val)); + *__ret = reinterpret_cast<_Type const&>(__old); +} +template::type = 0> +void __atomic_exchange_relaxed(const volatile _Type *__ptr, const _Type *__val, _Type *__ret) { + auto const __old = _InterlockedExchange16((volatile short *)__ptr, reinterpret_cast(*__val)); + *__ret = reinterpret_cast<_Type const&>(__old); +} +template::type = 0> +void __atomic_exchange_relaxed(const volatile _Type *__ptr, const _Type *__val, _Type *__ret) { + auto const __old = _InterlockedExchange((volatile long *)__ptr, reinterpret_cast(*__val)); + *__ret = reinterpret_cast<_Type const&>(__old); +} +template::type = 0> +void __atomic_exchange_relaxed(const volatile _Type *__ptr, const _Type *__val, _Type *__ret) { + auto const __old = _InterlockedExchange64((volatile __int64 *)__ptr, 
reinterpret_cast<__int64 const&>(*__val)); + *__ret = reinterpret_cast<_Type const&>(__old); +} +template +void __atomic_exchange(_Type volatile *__ptr, const _Type *__val, _Type *__ret, int __memorder) { + switch (__memorder) { + case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_exchange_relaxed(__ptr, __val, __ret);break; + case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); + case __ATOMIC_ACQUIRE: __atomic_exchange_relaxed(__ptr, __val, __ret); _Compiler_or_memory_barrier(); break; + case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_exchange_relaxed(__ptr, __val, __ret); _Compiler_or_memory_barrier(); break; + case __ATOMIC_RELAXED: __atomic_exchange_relaxed(__ptr, __val, __ret); break; + default: assert(0); + } +} + +template::type = 0> +void __atomic_fetch_add_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { + auto const __old = _InterlockedExchangeAdd8((volatile char *)__ptr, reinterpret_cast(*__val)); + *__ret = reinterpret_cast<_Type const&>(__old); +} +template::type = 0> +void __atomic_fetch_add_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { + auto const __old = _InterlockedExchangeAdd16((volatile short *)__ptr, reinterpret_cast(*__val)); + *__ret = reinterpret_cast<_Type const&>(__old); +} +template::type = 0> +void __atomic_fetch_add_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { + auto const __old = _InterlockedExchangeAdd((volatile long *)__ptr, reinterpret_cast(*__val)); + *__ret = reinterpret_cast<_Type const&>(__old); +} +template::type = 0> +void __atomic_fetch_add_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { + auto const __old = _InterlockedExchangeAdd64((volatile __int64 *)__ptr, reinterpret_cast<__int64 const&>(*__val)); + *__ret = reinterpret_cast<_Type const&>(__old); +} +template +_Type __atomic_fetch_add(_Type volatile *__ptr, _Delta __val, int __memorder) { + _Type __ret; + switch (__memorder) { + case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_fetch_add_relaxed(__ptr, __val, &__ret);break; + case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); + case __ATOMIC_ACQUIRE: __atomic_fetch_add_relaxed(__ptr, __val, &__ret); _Compiler_or_memory_barrier(); break; + case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_fetch_add_relaxed(__ptr, __val, &__ret); _Compiler_or_memory_barrier(); break; + case __ATOMIC_RELAXED: __atomic_fetch_add_relaxed(__ptr, __val, &__ret); break; + default: assert(0); + } + return __ret; +} +template +_Type __atomic_fetch_sub(_Type volatile *__ptr, _Delta __val, int __memorder) { + return __atomic_fetch_add(__ptr, -__val, __memorder); +} + + +template::type = 0> +void __atomic_fetch_and_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { + auto const __old = _InterlockedExchangeAnd8((volatile char *)__ptr, reinterpret_cast(*__val)); + *__ret = reinterpret_cast<_Type const&>(__old); +} +template::type = 0> +void __atomic_fetch_and_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { + auto const __old = _InterlockedExchangeAnd16((volatile short *)__ptr, reinterpret_cast(*__val)); + *__ret = reinterpret_cast<_Type const&>(__old); +} +template::type = 0> +void __atomic_fetch_and_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { + auto const __old = _InterlockedExchangeAnd((volatile long *)__ptr, reinterpret_cast(*__val)); + *__ret = reinterpret_cast<_Type const&>(__old); +} +template::type = 0> +void __atomic_fetch_and_relaxed(const volatile 
_Type *__ptr, const _Delta *__val, _Type *__ret) { + auto const __old = _InterlockedExchangeAnd64((volatile __int64 *)__ptr, reinterpret_cast<__int64 const&>(*__val)); + *__ret = reinterpret_cast<_Type const&>(__old); +} +template +_Type __atomic_fetch_and(_Type volatile *__ptr, _Delta __val, int __memorder) { + _Type __ret; + switch (__memorder) { + case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_fetch_and_relaxed(__ptr, __val, &__ret);break; + case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); + case __ATOMIC_ACQUIRE: __atomic_fetch_and_relaxed(__ptr, __val, &__ret); _Compiler_or_memory_barrier(); break; + case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_fetch_and_relaxed(__ptr, __val, &__ret); _Compiler_or_memory_barrier(); break; + case __ATOMIC_RELAXED: __atomic_fetch_and_relaxed(__ptr, __val, &__ret); break; + default: assert(0); + } + return __ret; +} + +template::type = 0> +void __atomic_fetch_xor_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { + auto const __old = _InterlockedExchangeXor8((volatile char *)__ptr, reinterpret_cast(*__val)); + *__ret = reinterpret_cast<_Type const&>(__old); +} +template::type = 0> +void __atomic_fetch_xor_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { + auto const __old = _InterlockedExchangeXor16((volatile short *)__ptr, reinterpret_cast(*__val)); + *__ret = reinterpret_cast<_Type const&>(__old); +} +template::type = 0> +void __atomic_fetch_xor_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { + auto const __old = _InterlockedExchangeXor((volatile long *)__ptr, reinterpret_cast(*__val)); + *__ret = reinterpret_cast<_Type const&>(__old); +} +template::type = 0> +void __atomic_fetch_xor_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { + auto const __old = _InterlockedExchangeXor64((volatile __int64 *)__ptr, reinterpret_cast<__int64 const&>(*__val)); + *__ret = reinterpret_cast<_Type const&>(__old); +} +template +_Type __atomic_fetch_xor(_Type volatile *__ptr, _Delta __val, int __memorder) { + _Type __ret; + switch (__memorder) { + case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_fetch_xor_relaxed(__ptr, __val, &__ret);break; + case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); + case __ATOMIC_ACQUIRE: __atomic_fetch_xor_relaxed(__ptr, __val, &__ret); _Compiler_or_memory_barrier(); break; + case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_fetch_xor_relaxed(__ptr, __val, &__ret); _Compiler_or_memory_barrier(); break; + case __ATOMIC_RELAXED: __atomic_fetch_xor_relaxed(__ptr, __val, &__ret); break; + default: assert(0); + } + return __ret; +} + +template::type = 0> +void __atomic_fetch_or_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { + auto const __old = _InterlockedExchangeOr8((volatile char *)__ptr, reinterpret_cast(*__val)); + *__ret = reinterpret_cast<_Type const&>(__old); +} +template::type = 0> +void __atomic_fetch_or_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { + auto const __old = _InterlockedExchangeOr16((volatile short *)__ptr, reinterpret_cast(*__val)); + *__ret = reinterpret_cast<_Type const&>(__old); +} +template::type = 0> +void __atomic_fetch_or_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { + auto const __old = _InterlockedExchangeOr((volatile long *)__ptr, reinterpret_cast(*__val)); + *__ret = reinterpret_cast<_Type const&>(__old); +} +template::type = 0> +void __atomic_fetch_or_relaxed(const volatile _Type *__ptr, 
const _Delta *__val, _Type *__ret) { + auto const __old = _InterlockedExchangeOr64((volatile __int64 *)__ptr, reinterpret_cast<__int64 const&>(*__val)); + *__ret = reinterpret_cast<_Type const&>(__old); +} +template +_Type __atomic_fetch_or(_Type volatile *__ptr, _Delta __val, int __memorder) { + _Type __ret; + switch (__memorder) { + case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_fetch_or_relaxed(__ptr, __val, &__ret);break; + case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); + case __ATOMIC_ACQUIRE: __atomic_fetch_or_relaxed(__ptr, __val, &__ret); _Compiler_or_memory_barrier(); break; + case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_fetch_or_relaxed(__ptr, __val, &__ret); _Compiler_or_memory_barrier(); break; + case __ATOMIC_RELAXED: __atomic_fetch_or_relaxed(__ptr, __val, &__ret); break; + default: assert(0); + } + return __ret; +} + +template +_Type __atomic_load_n(const _Type volatile *__ptr, int __memorder) { + _Type __ret; + __atomic_load(__ptr, &__ret, __memorder); + return __ret; +} + +template +void __atomic_store_n(_Type volatile *__ptr, _Type __val, int __memorder) { + __atomic_store(__ptr, &__val, __memorder); +} + +template +bool __atomic_compare_exchange_n(_Type volatile *__ptr, _Type *__expected, _Type __desired, bool __weak, int __success_memorder, int __failure_memorder) { + return __atomic_compare_exchange(__ptr, __expected, &__desired, __weak, __success_memorder, __failure_memorder); +} + +template +_Type __atomic_exchange_n(_Type volatile *__ptr, _Type __val, int __memorder) { + _Type __ret; + __atomic_exchange(__ptr, &__val, &__ret, __memorder); + return __ret; +} + +template +_Type __host__ __atomic_fetch_max(_Type volatile *__ptr, _Delta __val, int __memorder) { + _Type __expected = __atomic_load_n(__ptr, __ATOMIC_RELAXED); + _Type __desired = __expected < __val ? __expected : __val; + while(__desired == __val && + !__atomic_compare_exchange_n(__ptr, &__expected, __desired, __memorder, __memorder)) { + __desired = __expected > __val ? __expected : __val; + } + return __expected; +} + +template +_Type __host__ __atomic_fetch_min(_Type volatile *__ptr, _Delta __val, int __memorder) { + _Type __expected = __atomic_load_n(__ptr, __ATOMIC_RELAXED); + _Type __desired = __expected < __val ? __expected : __val; + while(__desired != __val && + !__atomic_compare_exchange_n(__ptr, &__expected, __desired, __memorder, __memorder)) { + __desired = __expected < __val ? __expected : __val; + } + return __expected; +} + +_LIBCUDACXX_END_NAMESPACE_CUDA From 23d6f5fc6c3aeffceea4c2e40e745cfd337f5837 Mon Sep 17 00:00:00 2001 From: ogiroux Date: Thu, 10 Sep 2020 11:38:17 -0700 Subject: [PATCH 2/8] Now with dramatically more workingness --- include/cuda/std/detail/__atomic | 4 +- include/cuda/std/detail/__atomic_msvc | 74 +++++++++++++-------------- 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/include/cuda/std/detail/__atomic b/include/cuda/std/detail/__atomic index 0933ce830a..bd0eea6f2e 100644 --- a/include/cuda/std/detail/__atomic +++ b/include/cuda/std/detail/__atomic @@ -49,7 +49,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA namespace detail { - inline __device__ int __stronger_order_cuda(int __a, int __b) { + inline __host__ __device__ int __stronger_order_cuda(int __a, int __b) { int const __max = __a > __b ? 
__a : __b; if(__max != __ATOMIC_RELEASE) return __max; @@ -301,7 +301,7 @@ __host__ __device__ inline _Tp __cxx_atomic_fetch_xor(__cxx_atomic_base_impl_def #ifdef __CUDA_ARCH__ return detail::__atomic_fetch_xor_cuda(&__a->__a_value, __pattern, __order, detail::__scope_tag<_Sco>()); #else - return __atomic_fetch_xor_explicit(&__a->__a_value, __pattern, __order); + return __atomic_fetch_xor(&__a->__a_value, __pattern, __order); #endif } diff --git a/include/cuda/std/detail/__atomic_msvc b/include/cuda/std/detail/__atomic_msvc index 7f37e2fd09..0e86f6a76c 100644 --- a/include/cuda/std/detail/__atomic_msvc +++ b/include/cuda/std/detail/__atomic_msvc @@ -100,48 +100,48 @@ void __atomic_store(volatile _Type *__ptr, _Type *__val, int __memorder) { template::type = 0> bool __atomic_compare_exchange_relaxed(const volatile _Type *__ptr, _Type *__expected, const _Type *__desired) { - auto __tmp_desired = reinterpret_cast(*__desired); + auto __tmp_desired = reinterpret_cast(*__desired); auto __tmp_expected = reinterpret_cast(*__expected); auto const __old = _InterlockedCompareExchange8((volatile char *)__ptr, __tmp_desired, __tmp_expected); if(__old == __tmp_expected) return true; - *__expected = reinterpret_cast<_Type&>(__old); + *__expected = reinterpret_cast(__old); return false; } template::type = 0> bool __atomic_compare_exchange_relaxed(const volatile _Type *__ptr, _Type *__expected, const _Type *__desired) { - auto __tmp_desired = reinterpret_cast(*__desired); + auto __tmp_desired = reinterpret_cast(*__desired); auto __tmp_expected = reinterpret_cast(*__expected); auto const __old = _InterlockedCompareExchange16((volatile short *)__ptr, __tmp_desired, __tmp_expected); if(__old == __tmp_expected) return true; - *__expected = reinterpret_cast<_Type&>(__old); + *__expected = reinterpret_cast(__old); return false; } template::type = 0> bool __atomic_compare_exchange_relaxed(const volatile _Type *__ptr, _Type *__expected, const _Type *__desired) { - auto __tmp_desired = reinterpret_cast(*__desired); + auto __tmp_desired = reinterpret_cast(*__desired); auto __tmp_expected = reinterpret_cast(*__expected); auto const __old = _InterlockedCompareExchange((volatile long *)__ptr, __tmp_desired, __tmp_expected); if(__old == __tmp_expected) return true; - *__expected = reinterpret_cast<_Type&>(__old); + *__expected = reinterpret_cast(__old); return false; } template::type = 0> bool __atomic_compare_exchange_relaxed(const volatile _Type *__ptr, _Type *__expected, const _Type *__desired) { - auto __tmp_desired = reinterpret_cast<__int64&>(*__desired); + auto __tmp_desired = reinterpret_cast(*__desired); auto __tmp_expected = reinterpret_cast<__int64&>(*__expected); auto const __old = _InterlockedCompareExchange64((volatile __int64 *)__ptr, __tmp_desired, __tmp_expected); if(__old == __tmp_expected) return true; - *__expected = reinterpret_cast<_Type&>(__old); + *__expected = reinterpret_cast(__old); return false; } template bool __atomic_compare_exchange(_Type volatile *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder) { bool success; - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { + switch (detail::__stronger_order_cuda(__success_memorder, __failure_memorder)) { case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); success = __atomic_compare_exchange_relaxed(__ptr, __expected, __desired); break; case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); case __ATOMIC_ACQUIRE: success = __atomic_compare_exchange_relaxed(__ptr, 
__expected, __desired); _Compiler_or_memory_barrier(); break; @@ -208,11 +208,11 @@ template _Type __atomic_fetch_add(_Type volatile *__ptr, _Delta __val, int __memorder) { _Type __ret; switch (__memorder) { - case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_fetch_add_relaxed(__ptr, __val, &__ret);break; + case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_fetch_add_relaxed(__ptr, &__val, &__ret);break; case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); - case __ATOMIC_ACQUIRE: __atomic_fetch_add_relaxed(__ptr, __val, &__ret); _Compiler_or_memory_barrier(); break; - case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_fetch_add_relaxed(__ptr, __val, &__ret); _Compiler_or_memory_barrier(); break; - case __ATOMIC_RELAXED: __atomic_fetch_add_relaxed(__ptr, __val, &__ret); break; + case __ATOMIC_ACQUIRE: __atomic_fetch_add_relaxed(__ptr, &__val, &__ret); _Compiler_or_memory_barrier(); break; + case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_fetch_add_relaxed(__ptr, &__val, &__ret); _Compiler_or_memory_barrier(); break; + case __ATOMIC_RELAXED: __atomic_fetch_add_relaxed(__ptr, &__val, &__ret); break; default: assert(0); } return __ret; @@ -225,33 +225,33 @@ _Type __atomic_fetch_sub(_Type volatile *__ptr, _Delta __val, int __memorder) { template::type = 0> void __atomic_fetch_and_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { - auto const __old = _InterlockedExchangeAnd8((volatile char *)__ptr, reinterpret_cast(*__val)); + auto const __old = _InterlockedAnd8((volatile char *)__ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } template::type = 0> void __atomic_fetch_and_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { - auto const __old = _InterlockedExchangeAnd16((volatile short *)__ptr, reinterpret_cast(*__val)); + auto const __old = _InterlockedAnd16((volatile short *)__ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } template::type = 0> void __atomic_fetch_and_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { - auto const __old = _InterlockedExchangeAnd((volatile long *)__ptr, reinterpret_cast(*__val)); + auto const __old = _InterlockedAnd((volatile long *)__ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } template::type = 0> void __atomic_fetch_and_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { - auto const __old = _InterlockedExchangeAnd64((volatile __int64 *)__ptr, reinterpret_cast<__int64 const&>(*__val)); + auto const __old = _InterlockedAnd64((volatile __int64 *)__ptr, reinterpret_cast<__int64 const&>(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } template _Type __atomic_fetch_and(_Type volatile *__ptr, _Delta __val, int __memorder) { _Type __ret; switch (__memorder) { - case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_fetch_and_relaxed(__ptr, __val, &__ret);break; + case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_fetch_and_relaxed(__ptr, &__val, &__ret);break; case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); - case __ATOMIC_ACQUIRE: __atomic_fetch_and_relaxed(__ptr, __val, &__ret); _Compiler_or_memory_barrier(); break; - case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_fetch_and_relaxed(__ptr, __val, &__ret); _Compiler_or_memory_barrier(); break; - case __ATOMIC_RELAXED: __atomic_fetch_and_relaxed(__ptr, __val, &__ret); break; + case __ATOMIC_ACQUIRE: __atomic_fetch_and_relaxed(__ptr, &__val, &__ret); 
_Compiler_or_memory_barrier(); break; + case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_fetch_and_relaxed(__ptr, &__val, &__ret); _Compiler_or_memory_barrier(); break; + case __ATOMIC_RELAXED: __atomic_fetch_and_relaxed(__ptr, &__val, &__ret); break; default: assert(0); } return __ret; @@ -259,33 +259,33 @@ _Type __atomic_fetch_and(_Type volatile *__ptr, _Delta __val, int __memorder) { template::type = 0> void __atomic_fetch_xor_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { - auto const __old = _InterlockedExchangeXor8((volatile char *)__ptr, reinterpret_cast(*__val)); + auto const __old = _InterlockedXor8((volatile char *)__ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } template::type = 0> void __atomic_fetch_xor_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { - auto const __old = _InterlockedExchangeXor16((volatile short *)__ptr, reinterpret_cast(*__val)); + auto const __old = _InterlockedXor16((volatile short *)__ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } template::type = 0> void __atomic_fetch_xor_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { - auto const __old = _InterlockedExchangeXor((volatile long *)__ptr, reinterpret_cast(*__val)); + auto const __old = _InterlockedXor((volatile long *)__ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } template::type = 0> void __atomic_fetch_xor_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { - auto const __old = _InterlockedExchangeXor64((volatile __int64 *)__ptr, reinterpret_cast<__int64 const&>(*__val)); + auto const __old = _InterlockedXor64((volatile __int64 *)__ptr, reinterpret_cast<__int64 const&>(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } template _Type __atomic_fetch_xor(_Type volatile *__ptr, _Delta __val, int __memorder) { _Type __ret; switch (__memorder) { - case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_fetch_xor_relaxed(__ptr, __val, &__ret);break; + case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_fetch_xor_relaxed(__ptr, &__val, &__ret);break; case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); - case __ATOMIC_ACQUIRE: __atomic_fetch_xor_relaxed(__ptr, __val, &__ret); _Compiler_or_memory_barrier(); break; - case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_fetch_xor_relaxed(__ptr, __val, &__ret); _Compiler_or_memory_barrier(); break; - case __ATOMIC_RELAXED: __atomic_fetch_xor_relaxed(__ptr, __val, &__ret); break; + case __ATOMIC_ACQUIRE: __atomic_fetch_xor_relaxed(__ptr, &__val, &__ret); _Compiler_or_memory_barrier(); break; + case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_fetch_xor_relaxed(__ptr, &__val, &__ret); _Compiler_or_memory_barrier(); break; + case __ATOMIC_RELAXED: __atomic_fetch_xor_relaxed(__ptr, &__val, &__ret); break; default: assert(0); } return __ret; @@ -293,33 +293,33 @@ _Type __atomic_fetch_xor(_Type volatile *__ptr, _Delta __val, int __memorder) { template::type = 0> void __atomic_fetch_or_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { - auto const __old = _InterlockedExchangeOr8((volatile char *)__ptr, reinterpret_cast(*__val)); + auto const __old = _InterlockedOr8((volatile char *)__ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } template::type = 0> void __atomic_fetch_or_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { - auto const __old = 
_InterlockedExchangeOr16((volatile short *)__ptr, reinterpret_cast(*__val)); + auto const __old = _InterlockedOr16((volatile short *)__ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } template::type = 0> void __atomic_fetch_or_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { - auto const __old = _InterlockedExchangeOr((volatile long *)__ptr, reinterpret_cast(*__val)); + auto const __old = _InterlockedOr((volatile long *)__ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } template::type = 0> void __atomic_fetch_or_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { - auto const __old = _InterlockedExchangeOr64((volatile __int64 *)__ptr, reinterpret_cast<__int64 const&>(*__val)); + auto const __old = _InterlockedOr64((volatile __int64 *)__ptr, reinterpret_cast<__int64 const&>(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } template _Type __atomic_fetch_or(_Type volatile *__ptr, _Delta __val, int __memorder) { _Type __ret; switch (__memorder) { - case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_fetch_or_relaxed(__ptr, __val, &__ret);break; + case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_fetch_or_relaxed(__ptr, &__val, &__ret);break; case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); - case __ATOMIC_ACQUIRE: __atomic_fetch_or_relaxed(__ptr, __val, &__ret); _Compiler_or_memory_barrier(); break; - case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_fetch_or_relaxed(__ptr, __val, &__ret); _Compiler_or_memory_barrier(); break; - case __ATOMIC_RELAXED: __atomic_fetch_or_relaxed(__ptr, __val, &__ret); break; + case __ATOMIC_ACQUIRE: __atomic_fetch_or_relaxed(__ptr, &__val, &__ret); _Compiler_or_memory_barrier(); break; + case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_fetch_or_relaxed(__ptr, &__val, &__ret); _Compiler_or_memory_barrier(); break; + case __ATOMIC_RELAXED: __atomic_fetch_or_relaxed(__ptr, &__val, &__ret); break; default: assert(0); } return __ret; From 45c28ea7637e7c5b7ee1e33a24bb61a41415421a Mon Sep 17 00:00:00 2001 From: ogiroux Date: Thu, 10 Sep 2020 15:58:23 -0700 Subject: [PATCH 3/8] Fixed a dangling reference --- include/cuda/std/detail/__atomic | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/cuda/std/detail/__atomic b/include/cuda/std/detail/__atomic index bd0eea6f2e..0e28394afe 100644 --- a/include/cuda/std/detail/__atomic +++ b/include/cuda/std/detail/__atomic @@ -277,7 +277,7 @@ __host__ __device__ inline _Tp* __cxx_atomic_fetch_sub(__cxx_atomic_base_impl_de #ifdef __CUDA_ARCH__ return detail::__atomic_fetch_sub_cuda(&__a->__a_value, __delta, __order, detail::__scope_tag<_Sco>()); #else - return __atomic_fetch_sub_explicit(&__a->__a_value, __delta, __order); + return __atomic_fetch_sub(&__a->__a_value, __delta, __order); #endif } template From 0c7add52966ed80b0fbc5098500532f6456a4fae Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Mon, 14 Sep 2020 17:13:32 -0700 Subject: [PATCH 4/8] Properly offset ptrdiff_t deltas before calling compiler __atomic_add/sub; make msvc sfinae a tad less repetitive --- include/cuda/std/detail/__atomic | 10 +++- include/cuda/std/detail/__atomic_msvc | 82 ++++++++++++++------------- 2 files changed, 52 insertions(+), 40 deletions(-) diff --git a/include/cuda/std/detail/__atomic b/include/cuda/std/detail/__atomic index 0e28394afe..d401223580 100644 --- a/include/cuda/std/detail/__atomic +++ b/include/cuda/std/detail/__atomic @@ -111,6 +111,12 @@ 
_LIBCUDACXX_END_NAMESPACE_CUDA _LIBCUDACXX_BEGIN_NAMESPACE_STD +template +struct __skip_amt { enum {value = 1}; }; + +template +struct __skip_amt<_Tp*> { enum {value = sizeof(_Tp)}; }; + // Forward-declare the function templates that are defined libcxx later. template _LIBCUDACXX_INLINE_VISIBILITY typename enable_if::value>::type @@ -261,7 +267,7 @@ __host__ __device__ inline _Tp* __cxx_atomic_fetch_add(__cxx_atomic_base_impl_de #ifdef __CUDA_ARCH__ return detail::__atomic_fetch_add_cuda(&__a->__a_value, __delta, __order, detail::__scope_tag<_Sco>()); #else - return __atomic_fetch_add(&__a->__a_value, __delta, __order); + return __atomic_fetch_add(&__a->__a_value, __delta * __skip_amt<_Tp*>::value, __order); #endif } template @@ -277,7 +283,7 @@ __host__ __device__ inline _Tp* __cxx_atomic_fetch_sub(__cxx_atomic_base_impl_de #ifdef __CUDA_ARCH__ return detail::__atomic_fetch_sub_cuda(&__a->__a_value, __delta, __order, detail::__scope_tag<_Sco>()); #else - return __atomic_fetch_sub(&__a->__a_value, __delta, __order); + return __atomic_fetch_sub(&__a->__a_value, __delta * __skip_amt<_Tp*>::value, __order); #endif } template diff --git a/include/cuda/std/detail/__atomic_msvc b/include/cuda/std/detail/__atomic_msvc index 0e86f6a76c..36d217ba96 100644 --- a/include/cuda/std/detail/__atomic_msvc +++ b/include/cuda/std/detail/__atomic_msvc @@ -37,26 +37,32 @@ static inline void __atomic_thread_fence(int __memorder) { _Memory_barrier(); } -template::type = 0> +namespace detail { + template + using _enable_if_sized_as = typename _CUDA_VSTD::enable_if::type; +} + +template = 0> void __atomic_load_relaxed(const volatile _Type *__ptr, _Type *__ret) { __int8 __tmp = __iso_volatile_load8((const volatile __int8 *)__ptr); *__ret = reinterpret_cast<_Type&>(__tmp); } -template::type = 0> +template = 0> void __atomic_load_relaxed(const volatile _Type *__ptr, _Type *__ret) { __int16 __tmp = __iso_volatile_load16((const volatile __int16 *)__ptr); *__ret = reinterpret_cast<_Type&>(__tmp); } -template::type = 0> +template = 0> void __atomic_load_relaxed(const volatile _Type *__ptr, _Type *__ret) { __int32 __tmp = __iso_volatile_load32((const volatile __int32 *)__ptr); *__ret = reinterpret_cast<_Type&>(__tmp); } -template::type = 0> +template = 0> void __atomic_load_relaxed(const volatile _Type *__ptr, _Type *__ret) { __int64 __tmp = __iso_volatile_load64((const volatile __int64 *)__ptr); *__ret = reinterpret_cast<_Type&>(__tmp); } + template void __atomic_load(const volatile _Type *__ptr, _Type *__ret, int __memorder) { switch (__memorder) { @@ -68,22 +74,22 @@ void __atomic_load(const volatile _Type *__ptr, _Type *__ret, int __memorder) { } } -template::type = 0> +template = 0> void __atomic_store_relaxed(const volatile _Type *__ptr, _Type *__val) { __int8 __tmp = reinterpret_cast<__int8&>(*__val); __iso_volatile_store8((volatile __int8 *)__ptr, __tmp); } -template::type = 0> +template = 0> void __atomic_store_relaxed(const volatile _Type *__ptr, _Type *__val) { __int16 __tmp = reinterpret_cast<__int16&>(*__val); __iso_volatile_store16((volatile __int16 *)__ptr, __tmp); } -template::type = 0> +template = 0> void __atomic_store_relaxed(const volatile _Type *__ptr, _Type *__val) { __int32 __tmp = reinterpret_cast<__int32&>(*__val); __iso_volatile_store32((volatile __int32 *)__ptr, __tmp); } -template::type = 0> +template = 0> void __atomic_store_relaxed(const volatile _Type *__ptr, _Type *__val) { __int64 __tmp = reinterpret_cast<__int64&>(*__val); __iso_volatile_store64((volatile __int64 *)__ptr, __tmp); @@ 
-98,7 +104,7 @@ void __atomic_store(volatile _Type *__ptr, _Type *__val, int __memorder) { } } -template::type = 0> +template = 0> bool __atomic_compare_exchange_relaxed(const volatile _Type *__ptr, _Type *__expected, const _Type *__desired) { auto __tmp_desired = reinterpret_cast(*__desired); auto __tmp_expected = reinterpret_cast(*__expected); @@ -108,7 +114,7 @@ bool __atomic_compare_exchange_relaxed(const volatile _Type *__ptr, _Type *__exp *__expected = reinterpret_cast(__old); return false; } -template::type = 0> +template = 0> bool __atomic_compare_exchange_relaxed(const volatile _Type *__ptr, _Type *__expected, const _Type *__desired) { auto __tmp_desired = reinterpret_cast(*__desired); auto __tmp_expected = reinterpret_cast(*__expected); @@ -118,7 +124,7 @@ bool __atomic_compare_exchange_relaxed(const volatile _Type *__ptr, _Type *__exp *__expected = reinterpret_cast(__old); return false; } -template::type = 0> +template = 0> bool __atomic_compare_exchange_relaxed(const volatile _Type *__ptr, _Type *__expected, const _Type *__desired) { auto __tmp_desired = reinterpret_cast(*__desired); auto __tmp_expected = reinterpret_cast(*__expected); @@ -128,7 +134,7 @@ bool __atomic_compare_exchange_relaxed(const volatile _Type *__ptr, _Type *__exp *__expected = reinterpret_cast(__old); return false; } -template::type = 0> +template = 0> bool __atomic_compare_exchange_relaxed(const volatile _Type *__ptr, _Type *__expected, const _Type *__desired) { auto __tmp_desired = reinterpret_cast(*__desired); auto __tmp_expected = reinterpret_cast<__int64&>(*__expected); @@ -143,7 +149,7 @@ bool __atomic_compare_exchange(_Type volatile *__ptr, _Type *__expected, const _ bool success; switch (detail::__stronger_order_cuda(__success_memorder, __failure_memorder)) { case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); success = __atomic_compare_exchange_relaxed(__ptr, __expected, __desired); break; - case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); + case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); case __ATOMIC_ACQUIRE: success = __atomic_compare_exchange_relaxed(__ptr, __expected, __desired); _Compiler_or_memory_barrier(); break; case __ATOMIC_SEQ_CST: _Memory_barrier(); success = __atomic_compare_exchange_relaxed(__ptr, __expected, __desired); _Compiler_or_memory_barrier(); break; case __ATOMIC_RELAXED: success = __atomic_compare_exchange_relaxed(__ptr, __expected, __desired); break; @@ -152,22 +158,22 @@ bool __atomic_compare_exchange(_Type volatile *__ptr, _Type *__expected, const _ return success; } -template::type = 0> +template = 0> void __atomic_exchange_relaxed(const volatile _Type *__ptr, const _Type *__val, _Type *__ret) { auto const __old = _InterlockedExchange8((volatile char *)__ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template::type = 0> +template = 0> void __atomic_exchange_relaxed(const volatile _Type *__ptr, const _Type *__val, _Type *__ret) { auto const __old = _InterlockedExchange16((volatile short *)__ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template::type = 0> +template = 0> void __atomic_exchange_relaxed(const volatile _Type *__ptr, const _Type *__val, _Type *__ret) { auto const __old = _InterlockedExchange((volatile long *)__ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template::type = 0> +template = 0> void __atomic_exchange_relaxed(const volatile _Type *__ptr, const _Type *__val, _Type *__ret) { auto const __old = _InterlockedExchange64((volatile 
__int64 *)__ptr, reinterpret_cast<__int64 const&>(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); @@ -176,7 +182,7 @@ template void __atomic_exchange(_Type volatile *__ptr, const _Type *__val, _Type *__ret, int __memorder) { switch (__memorder) { case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_exchange_relaxed(__ptr, __val, __ret);break; - case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); + case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); case __ATOMIC_ACQUIRE: __atomic_exchange_relaxed(__ptr, __val, __ret); _Compiler_or_memory_barrier(); break; case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_exchange_relaxed(__ptr, __val, __ret); _Compiler_or_memory_barrier(); break; case __ATOMIC_RELAXED: __atomic_exchange_relaxed(__ptr, __val, __ret); break; @@ -184,22 +190,22 @@ void __atomic_exchange(_Type volatile *__ptr, const _Type *__val, _Type *__ret, } } -template::type = 0> +template = 0> void __atomic_fetch_add_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { auto const __old = _InterlockedExchangeAdd8((volatile char *)__ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template::type = 0> +template = 0> void __atomic_fetch_add_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { auto const __old = _InterlockedExchangeAdd16((volatile short *)__ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template::type = 0> +template = 0> void __atomic_fetch_add_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { auto const __old = _InterlockedExchangeAdd((volatile long *)__ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template::type = 0> +template = 0> void __atomic_fetch_add_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { auto const __old = _InterlockedExchangeAdd64((volatile __int64 *)__ptr, reinterpret_cast<__int64 const&>(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); @@ -209,7 +215,7 @@ _Type __atomic_fetch_add(_Type volatile *__ptr, _Delta __val, int __memorder) { _Type __ret; switch (__memorder) { case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_fetch_add_relaxed(__ptr, &__val, &__ret);break; - case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); + case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); case __ATOMIC_ACQUIRE: __atomic_fetch_add_relaxed(__ptr, &__val, &__ret); _Compiler_or_memory_barrier(); break; case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_fetch_add_relaxed(__ptr, &__val, &__ret); _Compiler_or_memory_barrier(); break; case __ATOMIC_RELAXED: __atomic_fetch_add_relaxed(__ptr, &__val, &__ret); break; @@ -223,22 +229,22 @@ _Type __atomic_fetch_sub(_Type volatile *__ptr, _Delta __val, int __memorder) { } -template::type = 0> +template = 0> void __atomic_fetch_and_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { auto const __old = _InterlockedAnd8((volatile char *)__ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template::type = 0> +template = 0> void __atomic_fetch_and_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { auto const __old = _InterlockedAnd16((volatile short *)__ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template::type = 0> +template = 0> void __atomic_fetch_and_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { auto const __old = _InterlockedAnd((volatile long 
*)__ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template::type = 0> +template = 0> void __atomic_fetch_and_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { auto const __old = _InterlockedAnd64((volatile __int64 *)__ptr, reinterpret_cast<__int64 const&>(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); @@ -248,7 +254,7 @@ _Type __atomic_fetch_and(_Type volatile *__ptr, _Delta __val, int __memorder) { _Type __ret; switch (__memorder) { case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_fetch_and_relaxed(__ptr, &__val, &__ret);break; - case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); + case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); case __ATOMIC_ACQUIRE: __atomic_fetch_and_relaxed(__ptr, &__val, &__ret); _Compiler_or_memory_barrier(); break; case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_fetch_and_relaxed(__ptr, &__val, &__ret); _Compiler_or_memory_barrier(); break; case __ATOMIC_RELAXED: __atomic_fetch_and_relaxed(__ptr, &__val, &__ret); break; @@ -257,22 +263,22 @@ _Type __atomic_fetch_and(_Type volatile *__ptr, _Delta __val, int __memorder) { return __ret; } -template::type = 0> +template = 0> void __atomic_fetch_xor_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { auto const __old = _InterlockedXor8((volatile char *)__ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template::type = 0> +template = 0> void __atomic_fetch_xor_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { auto const __old = _InterlockedXor16((volatile short *)__ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template::type = 0> +template = 0> void __atomic_fetch_xor_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { auto const __old = _InterlockedXor((volatile long *)__ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template::type = 0> +template = 0> void __atomic_fetch_xor_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { auto const __old = _InterlockedXor64((volatile __int64 *)__ptr, reinterpret_cast<__int64 const&>(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); @@ -282,7 +288,7 @@ _Type __atomic_fetch_xor(_Type volatile *__ptr, _Delta __val, int __memorder) { _Type __ret; switch (__memorder) { case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_fetch_xor_relaxed(__ptr, &__val, &__ret);break; - case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); + case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); case __ATOMIC_ACQUIRE: __atomic_fetch_xor_relaxed(__ptr, &__val, &__ret); _Compiler_or_memory_barrier(); break; case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_fetch_xor_relaxed(__ptr, &__val, &__ret); _Compiler_or_memory_barrier(); break; case __ATOMIC_RELAXED: __atomic_fetch_xor_relaxed(__ptr, &__val, &__ret); break; @@ -291,22 +297,22 @@ _Type __atomic_fetch_xor(_Type volatile *__ptr, _Delta __val, int __memorder) { return __ret; } -template::type = 0> +template = 0> void __atomic_fetch_or_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { auto const __old = _InterlockedOr8((volatile char *)__ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template::type = 0> +template = 0> void __atomic_fetch_or_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { auto const __old = _InterlockedOr16((volatile short *)__ptr, 
reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template::type = 0> +template = 0> void __atomic_fetch_or_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { auto const __old = _InterlockedOr((volatile long *)__ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template::type = 0> +template = 0> void __atomic_fetch_or_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { auto const __old = _InterlockedOr64((volatile __int64 *)__ptr, reinterpret_cast<__int64 const&>(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); @@ -316,7 +322,7 @@ _Type __atomic_fetch_or(_Type volatile *__ptr, _Delta __val, int __memorder) { _Type __ret; switch (__memorder) { case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_fetch_or_relaxed(__ptr, &__val, &__ret);break; - case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); + case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); case __ATOMIC_ACQUIRE: __atomic_fetch_or_relaxed(__ptr, &__val, &__ret); _Compiler_or_memory_barrier(); break; case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_fetch_or_relaxed(__ptr, &__val, &__ret); _Compiler_or_memory_barrier(); break; case __ATOMIC_RELAXED: __atomic_fetch_or_relaxed(__ptr, &__val, &__ret); break; From 4b7dca9aed98ede2ec05a491328866c1883f2727 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Tue, 15 Sep 2020 19:11:58 -0700 Subject: [PATCH 5/8] Replace __atomic.*_n with generic versions on host code --- include/cuda/std/detail/__atomic | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/include/cuda/std/detail/__atomic b/include/cuda/std/detail/__atomic index d401223580..6d11e14172 100644 --- a/include/cuda/std/detail/__atomic +++ b/include/cuda/std/detail/__atomic @@ -213,7 +213,8 @@ __host__ __device__ inline void __cxx_atomic_store(__cxx_atomic_base_impl_defaul #ifdef __CUDA_ARCH__ detail::__atomic_store_n_cuda(&__a->__a_value, __cxx_atomic_alignment_wrap(__val), __order, detail::__scope_tag<_Sco>()); #else - __atomic_store_n(&__a->__a_value, __cxx_atomic_alignment_wrap(__val), __order); + auto __t = __cxx_atomic_alignment_wrap(__val); + __atomic_store(&__a->__a_value, &__t, __order); #endif } template @@ -221,7 +222,10 @@ __host__ __device__ inline _Tp __cxx_atomic_load(__cxx_atomic_base_impl_default< #ifdef __CUDA_ARCH__ return __cxx_atomic_alignment_unwrap(detail::__atomic_load_n_cuda(&__a->__a_value, __order, detail::__scope_tag<_Sco>())); #else - return __cxx_atomic_alignment_unwrap(__atomic_load_n(&__a->__a_value, __order)); + alignas(_Tp) unsigned char __buf[sizeof(_Tp)]; + auto* __dest = reinterpret_cast<_Tp*>(__buf); + __atomic_load(&__a->__a_value, __dest, __order); + return __cxx_atomic_alignment_unwrap(*__dest); #endif } template @@ -229,7 +233,11 @@ __host__ __device__ inline _Tp __cxx_atomic_exchange(__cxx_atomic_base_impl_defa #ifdef __CUDA_ARCH__ return __cxx_atomic_alignment_unwrap(detail::__atomic_exchange_n_cuda(&__a->__a_value, __cxx_atomic_alignment_wrap(__val), __order, detail::__scope_tag<_Sco>())); #else - return __cxx_atomic_alignment_unwrap(__atomic_exchange_n(&__a->__a_value, __cxx_atomic_alignment_wrap(__val), __order)); + alignas(_Tp) unsigned char __buf[sizeof(_Tp)]; + auto* __dest = reinterpret_cast<_Tp*>(__buf); + auto __t = __cxx_atomic_alignment_wrap(__val); + __atomic_exchange(&__a->__a_value, &__t, __dest, __order); + return __cxx_atomic_alignment_unwrap(*__dest); #endif } template @@ -238,7 +246,7 @@ __host__ __device__ inline 
bool __cxx_atomic_compare_exchange_strong(__cxx_atomi #ifdef __CUDA_ARCH__ bool __result = detail::__atomic_compare_exchange_n_cuda(&__a->__a_value, &__tmp, __cxx_atomic_alignment_wrap(__val), false, __success, __failure, detail::__scope_tag<_Sco>()); #else - bool __result = __atomic_compare_exchange_n(&__a->__a_value, &__tmp, __cxx_atomic_alignment_wrap(__val), false, __success, __failure); + bool __result = __atomic_compare_exchange(&__a->__a_value, &__tmp, &__val, false, __success, __failure); #endif *__expected = __cxx_atomic_alignment_unwrap(__tmp); return __result; @@ -249,7 +257,7 @@ __host__ __device__ inline bool __cxx_atomic_compare_exchange_weak(__cxx_atomic_ #ifdef __CUDA_ARCH__ bool __result = detail::__atomic_compare_exchange_n_cuda(&__a->__a_value, &__tmp, __cxx_atomic_alignment_wrap(__val), true, __success, __failure, detail::__scope_tag<_Sco>()); #else - bool __result = __atomic_compare_exchange_n(&__a->__a_value, &__tmp, __cxx_atomic_alignment_wrap(__val), true, __success, __failure); + bool __result = __atomic_compare_exchange(&__a->__a_value, &__tmp, &__val, true, __success, __failure); #endif *__expected = __cxx_atomic_alignment_unwrap(__tmp); return __result; From 7b2a928b035106bb42955c772d08a8ea32c452ef Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Wed, 16 Sep 2020 17:07:23 -0700 Subject: [PATCH 6/8] Moved msvc atomics to libcxx in the event it is upstreamed --- include/cuda/std/detail/__atomic | 2 +- .../__atomic_msvc => libcxx/include/support/win32/atomic_msvc.h | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename include/cuda/std/detail/__atomic_msvc => libcxx/include/support/win32/atomic_msvc.h (100%) diff --git a/include/cuda/std/detail/__atomic b/include/cuda/std/detail/__atomic index 6d11e14172..e5fe05034d 100644 --- a/include/cuda/std/detail/__atomic +++ b/include/cuda/std/detail/__atomic @@ -103,7 +103,7 @@ namespace detail { _LIBCUDACXX_END_NAMESPACE_CUDA #ifdef _MSC_VER - #include "__atomic_msvc" + #include "libcxx/include/support/win32/atomic_msvc.h" #endif #include "__atomic_generated" diff --git a/include/cuda/std/detail/__atomic_msvc b/libcxx/include/support/win32/atomic_msvc.h similarity index 100% rename from include/cuda/std/detail/__atomic_msvc rename to libcxx/include/support/win32/atomic_msvc.h From 2610560296ee0d2a33e55d8dd5df0a8f4c763f58 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Mon, 21 Sep 2020 13:47:13 -0700 Subject: [PATCH 7/8] Use initialized memory rather than an initialized object for MSVC atomics --- libcxx/include/support/win32/atomic_msvc.h | 76 +++++++++++++--------- 1 file changed, 44 insertions(+), 32 deletions(-) diff --git a/libcxx/include/support/win32/atomic_msvc.h b/libcxx/include/support/win32/atomic_msvc.h index 36d217ba96..11cb4f6f01 100644 --- a/libcxx/include/support/win32/atomic_msvc.h +++ b/libcxx/include/support/win32/atomic_msvc.h @@ -146,7 +146,7 @@ bool __atomic_compare_exchange_relaxed(const volatile _Type *__ptr, _Type *__exp } template bool __atomic_compare_exchange(_Type volatile *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder) { - bool success; + bool success = false; switch (detail::__stronger_order_cuda(__success_memorder, __failure_memorder)) { case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); success = __atomic_compare_exchange_relaxed(__ptr, __expected, __desired); break; case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); @@ -212,20 +212,22 @@ void __atomic_fetch_add_relaxed(const volatile _Type *__ptr, const _Delta *__val } 
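The hunks that follow replace each local "_Type __ret;" temporary in the MSVC shim with suitably aligned raw storage, so the fetch and exchange helpers no longer require _Type to be default-constructible and the relaxed intrinsic materializes the result directly into the buffer. A minimal standalone sketch of that storage pattern, using illustrative names that do not appear in the patch, could look like this:

    #include <cstring>

    struct NoDefault {                       // payload with no default constructor
        int v;
        explicit NoDefault(int x) : v(x) {}
    };

    // Stand-in for a relaxed primitive that writes its result through a pointer,
    // the way the __atomic_*_relaxed helpers in the shim do.
    static void relaxed_load_standin(const volatile NoDefault* src, NoDefault* out) {
        std::memcpy(out, const_cast<const NoDefault*>(src), sizeof(NoDefault));
    }

    static NoDefault load_via_buffer(const volatile NoDefault* ptr) {
        alignas(NoDefault) unsigned char buf[sizeof(NoDefault)] = {};  // raw, aligned storage; no constructor runs
        auto* dest = reinterpret_cast<NoDefault*>(buf);
        relaxed_load_standin(ptr, dest);                               // result lands in the buffer
        return *dest;                                                  // copied out to the caller
    }

    int main() {
        NoDefault x(42);
        return load_via_buffer(&x).v == 42 ? 0 : 1;
    }

The patch applies this same pattern below to __atomic_fetch_add/and/xor/or as well as to __atomic_load_n and __atomic_exchange_n.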
template _Type __atomic_fetch_add(_Type volatile *__ptr, _Delta __val, int __memorder) { - _Type __ret; + alignas(_Type) unsigned char __buf[sizeof(_Type)] = {}; + auto* __dest = reinterpret_cast<_Type*>(__buf); + switch (__memorder) { - case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_fetch_add_relaxed(__ptr, &__val, &__ret);break; + case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_fetch_add_relaxed(__ptr, &__val, __dest);break; case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); - case __ATOMIC_ACQUIRE: __atomic_fetch_add_relaxed(__ptr, &__val, &__ret); _Compiler_or_memory_barrier(); break; - case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_fetch_add_relaxed(__ptr, &__val, &__ret); _Compiler_or_memory_barrier(); break; - case __ATOMIC_RELAXED: __atomic_fetch_add_relaxed(__ptr, &__val, &__ret); break; + case __ATOMIC_ACQUIRE: __atomic_fetch_add_relaxed(__ptr, &__val, __dest); _Compiler_or_memory_barrier(); break; + case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_fetch_add_relaxed(__ptr, &__val, __dest); _Compiler_or_memory_barrier(); break; + case __ATOMIC_RELAXED: __atomic_fetch_add_relaxed(__ptr, &__val, __dest); break; default: assert(0); } - return __ret; + return *__dest; } template _Type __atomic_fetch_sub(_Type volatile *__ptr, _Delta __val, int __memorder) { - return __atomic_fetch_add(__ptr, -__val, __memorder); + return __atomic_fetch_add(__ptr, 0-__val, __memorder); } @@ -251,16 +253,18 @@ void __atomic_fetch_and_relaxed(const volatile _Type *__ptr, const _Delta *__val } template _Type __atomic_fetch_and(_Type volatile *__ptr, _Delta __val, int __memorder) { - _Type __ret; + alignas(_Type) unsigned char __buf[sizeof(_Type)] = {}; + auto* __dest = reinterpret_cast<_Type*>(__buf); + switch (__memorder) { - case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_fetch_and_relaxed(__ptr, &__val, &__ret);break; + case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_fetch_and_relaxed(__ptr, &__val, __dest);break; case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); - case __ATOMIC_ACQUIRE: __atomic_fetch_and_relaxed(__ptr, &__val, &__ret); _Compiler_or_memory_barrier(); break; - case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_fetch_and_relaxed(__ptr, &__val, &__ret); _Compiler_or_memory_barrier(); break; - case __ATOMIC_RELAXED: __atomic_fetch_and_relaxed(__ptr, &__val, &__ret); break; + case __ATOMIC_ACQUIRE: __atomic_fetch_and_relaxed(__ptr, &__val, __dest); _Compiler_or_memory_barrier(); break; + case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_fetch_and_relaxed(__ptr, &__val, __dest); _Compiler_or_memory_barrier(); break; + case __ATOMIC_RELAXED: __atomic_fetch_and_relaxed(__ptr, &__val, __dest); break; default: assert(0); } - return __ret; + return *__dest; } template = 0> @@ -285,16 +289,18 @@ void __atomic_fetch_xor_relaxed(const volatile _Type *__ptr, const _Delta *__val } template _Type __atomic_fetch_xor(_Type volatile *__ptr, _Delta __val, int __memorder) { - _Type __ret; + alignas(_Type) unsigned char __buf[sizeof(_Type)] = {}; + auto* __dest = reinterpret_cast<_Type*>(__buf); + switch (__memorder) { - case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_fetch_xor_relaxed(__ptr, &__val, &__ret);break; + case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_fetch_xor_relaxed(__ptr, &__val, __dest);break; case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); - case __ATOMIC_ACQUIRE: __atomic_fetch_xor_relaxed(__ptr, &__val, &__ret); _Compiler_or_memory_barrier(); break; - case __ATOMIC_SEQ_CST: 
_Memory_barrier(); __atomic_fetch_xor_relaxed(__ptr, &__val, &__ret); _Compiler_or_memory_barrier(); break; - case __ATOMIC_RELAXED: __atomic_fetch_xor_relaxed(__ptr, &__val, &__ret); break; + case __ATOMIC_ACQUIRE: __atomic_fetch_xor_relaxed(__ptr, &__val, __dest); _Compiler_or_memory_barrier(); break; + case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_fetch_xor_relaxed(__ptr, &__val, __dest); _Compiler_or_memory_barrier(); break; + case __ATOMIC_RELAXED: __atomic_fetch_xor_relaxed(__ptr, &__val, __dest); break; default: assert(0); } - return __ret; + return *__dest; } template = 0> @@ -319,23 +325,27 @@ void __atomic_fetch_or_relaxed(const volatile _Type *__ptr, const _Delta *__val, } template _Type __atomic_fetch_or(_Type volatile *__ptr, _Delta __val, int __memorder) { - _Type __ret; + alignas(_Type) unsigned char __buf[sizeof(_Type)] = {}; + auto* __dest = reinterpret_cast<_Type*>(__buf); + switch (__memorder) { - case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_fetch_or_relaxed(__ptr, &__val, &__ret);break; + case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_fetch_or_relaxed(__ptr, &__val, __dest);break; case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); - case __ATOMIC_ACQUIRE: __atomic_fetch_or_relaxed(__ptr, &__val, &__ret); _Compiler_or_memory_barrier(); break; - case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_fetch_or_relaxed(__ptr, &__val, &__ret); _Compiler_or_memory_barrier(); break; - case __ATOMIC_RELAXED: __atomic_fetch_or_relaxed(__ptr, &__val, &__ret); break; + case __ATOMIC_ACQUIRE: __atomic_fetch_or_relaxed(__ptr, &__val, __dest); _Compiler_or_memory_barrier(); break; + case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_fetch_or_relaxed(__ptr, &__val, __dest); _Compiler_or_memory_barrier(); break; + case __ATOMIC_RELAXED: __atomic_fetch_or_relaxed(__ptr, &__val, __dest); break; default: assert(0); } - return __ret; + return *__dest; } template _Type __atomic_load_n(const _Type volatile *__ptr, int __memorder) { - _Type __ret; - __atomic_load(__ptr, &__ret, __memorder); - return __ret; + alignas(_Type) unsigned char __buf[sizeof(_Type)] = {}; + auto* __dest = reinterpret_cast<_Type*>(__buf); + + __atomic_load(__ptr, __dest, __memorder); + return *__dest; } template @@ -350,9 +360,11 @@ bool __atomic_compare_exchange_n(_Type volatile *__ptr, _Type *__expected, _Type template _Type __atomic_exchange_n(_Type volatile *__ptr, _Type __val, int __memorder) { - _Type __ret; - __atomic_exchange(__ptr, &__val, &__ret, __memorder); - return __ret; + alignas(_Type) unsigned char __buf[sizeof(_Type)] = {}; + auto* __dest = reinterpret_cast<_Type*>(__buf); + + __atomic_exchange(__ptr, &__val, __dest, __memorder); + return *__dest; } template From 7d802a6628827678d2a3140d06c2a48d68557d59 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Tue, 13 Oct 2020 13:37:22 -0700 Subject: [PATCH 8/8] Implement memory_order_consume operations as acquire, move support include to --- include/cuda/std/detail/__atomic | 4 ++-- libcxx/include/atomic | 5 +++++ libcxx/include/support/win32/atomic_msvc.h | 22 ++++++++++++++-------- 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/include/cuda/std/detail/__atomic b/include/cuda/std/detail/__atomic index e5fe05034d..14f49434c2 100644 --- a/include/cuda/std/detail/__atomic +++ b/include/cuda/std/detail/__atomic @@ -44,7 +44,6 @@ #define __ATOMIC_THREAD 10 #endif //__ATOMIC_BLOCK - _LIBCUDACXX_BEGIN_NAMESPACE_CUDA namespace detail { @@ -102,7 +101,8 @@ namespace detail { _LIBCUDACXX_END_NAMESPACE_CUDA -#ifdef 
_MSC_VER +#if defined(_LIBCUDACXX_COMPILER_MSVC) + // Inject atomic intrinsics built from MSVC compiler intrinsics #include "libcxx/include/support/win32/atomic_msvc.h" #endif diff --git a/libcxx/include/atomic b/libcxx/include/atomic index c34cec008b..be235a64dc 100644 --- a/libcxx/include/atomic +++ b/libcxx/include/atomic @@ -555,6 +555,11 @@ void atomic_signal_fence(memory_order m) noexcept; #include #include #include <__pragma_push> + +#if defined(_LIBCUDACXX_COMPILER_MSVC) +#include "support/win32/atomic_msvc.h" +#endif + #endif //__cuda_std__ #if defined(_LIBCUDACXX_USE_PRAGMA_GCC_SYSTEM_HEADER) diff --git a/libcxx/include/support/win32/atomic_msvc.h b/libcxx/include/support/win32/atomic_msvc.h index 11cb4f6f01..6b1f2da88e 100644 --- a/libcxx/include/support/win32/atomic_msvc.h +++ b/libcxx/include/support/win32/atomic_msvc.h @@ -66,7 +66,7 @@ void __atomic_load_relaxed(const volatile _Type *__ptr, _Type *__ret) { template void __atomic_load(const volatile _Type *__ptr, _Type *__ret, int __memorder) { switch (__memorder) { - case __ATOMIC_SEQ_CST: _Memory_barrier(); + case __ATOMIC_SEQ_CST: _Memory_barrier(); _LIBCUDACXX_FALLTHROUGH(); case __ATOMIC_CONSUME: case __ATOMIC_ACQUIRE: __atomic_load_relaxed(__ptr, __ret); _Compiler_or_memory_barrier(); break; case __ATOMIC_RELAXED: __atomic_load_relaxed(__ptr, __ret); break; @@ -98,7 +98,7 @@ template void __atomic_store(volatile _Type *__ptr, _Type *__val, int __memorder) { switch (__memorder) { case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_store_relaxed(__ptr, __val); break; - case __ATOMIC_SEQ_CST: _Memory_barrier(); + case __ATOMIC_SEQ_CST: _Memory_barrier(); _LIBCUDACXX_FALLTHROUGH(); case __ATOMIC_RELAXED: __atomic_store_relaxed(__ptr, __val); break; default: assert(0); } @@ -149,7 +149,8 @@ bool __atomic_compare_exchange(_Type volatile *__ptr, _Type *__expected, const _ bool success = false; switch (detail::__stronger_order_cuda(__success_memorder, __failure_memorder)) { case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); success = __atomic_compare_exchange_relaxed(__ptr, __expected, __desired); break; - case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); + case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); _LIBCUDACXX_FALLTHROUGH(); + case __ATOMIC_CONSUME: case __ATOMIC_ACQUIRE: success = __atomic_compare_exchange_relaxed(__ptr, __expected, __desired); _Compiler_or_memory_barrier(); break; case __ATOMIC_SEQ_CST: _Memory_barrier(); success = __atomic_compare_exchange_relaxed(__ptr, __expected, __desired); _Compiler_or_memory_barrier(); break; case __ATOMIC_RELAXED: success = __atomic_compare_exchange_relaxed(__ptr, __expected, __desired); break; @@ -182,7 +183,8 @@ template void __atomic_exchange(_Type volatile *__ptr, const _Type *__val, _Type *__ret, int __memorder) { switch (__memorder) { case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_exchange_relaxed(__ptr, __val, __ret);break; - case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); + case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); _LIBCUDACXX_FALLTHROUGH(); + case __ATOMIC_CONSUME: case __ATOMIC_ACQUIRE: __atomic_exchange_relaxed(__ptr, __val, __ret); _Compiler_or_memory_barrier(); break; case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_exchange_relaxed(__ptr, __val, __ret); _Compiler_or_memory_barrier(); break; case __ATOMIC_RELAXED: __atomic_exchange_relaxed(__ptr, __val, __ret); break; @@ -217,7 +219,8 @@ _Type __atomic_fetch_add(_Type volatile *__ptr, _Delta __val, int __memorder) { switch (__memorder) { case 
__ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_fetch_add_relaxed(__ptr, &__val, __dest);break; - case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); + case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); _LIBCUDACXX_FALLTHROUGH(); + case __ATOMIC_CONSUME: case __ATOMIC_ACQUIRE: __atomic_fetch_add_relaxed(__ptr, &__val, __dest); _Compiler_or_memory_barrier(); break; case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_fetch_add_relaxed(__ptr, &__val, __dest); _Compiler_or_memory_barrier(); break; case __ATOMIC_RELAXED: __atomic_fetch_add_relaxed(__ptr, &__val, __dest); break; @@ -258,7 +261,8 @@ _Type __atomic_fetch_and(_Type volatile *__ptr, _Delta __val, int __memorder) { switch (__memorder) { case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_fetch_and_relaxed(__ptr, &__val, __dest);break; - case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); + case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); _LIBCUDACXX_FALLTHROUGH(); + case __ATOMIC_CONSUME: case __ATOMIC_ACQUIRE: __atomic_fetch_and_relaxed(__ptr, &__val, __dest); _Compiler_or_memory_barrier(); break; case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_fetch_and_relaxed(__ptr, &__val, __dest); _Compiler_or_memory_barrier(); break; case __ATOMIC_RELAXED: __atomic_fetch_and_relaxed(__ptr, &__val, __dest); break; @@ -294,7 +298,8 @@ _Type __atomic_fetch_xor(_Type volatile *__ptr, _Delta __val, int __memorder) { switch (__memorder) { case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_fetch_xor_relaxed(__ptr, &__val, __dest);break; - case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); + case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); _LIBCUDACXX_FALLTHROUGH(); + case __ATOMIC_CONSUME: case __ATOMIC_ACQUIRE: __atomic_fetch_xor_relaxed(__ptr, &__val, __dest); _Compiler_or_memory_barrier(); break; case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_fetch_xor_relaxed(__ptr, &__val, __dest); _Compiler_or_memory_barrier(); break; case __ATOMIC_RELAXED: __atomic_fetch_xor_relaxed(__ptr, &__val, __dest); break; @@ -330,7 +335,8 @@ _Type __atomic_fetch_or(_Type volatile *__ptr, _Delta __val, int __memorder) { switch (__memorder) { case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_fetch_or_relaxed(__ptr, &__val, __dest);break; - case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); + case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); _LIBCUDACXX_FALLTHROUGH(); + case __ATOMIC_CONSUME: case __ATOMIC_ACQUIRE: __atomic_fetch_or_relaxed(__ptr, &__val, __dest); _Compiler_or_memory_barrier(); break; case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_fetch_or_relaxed(__ptr, &__val, __dest); _Compiler_or_memory_barrier(); break; case __ATOMIC_RELAXED: __atomic_fetch_or_relaxed(__ptr, &__val, __dest); break;
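The net effect of this last patch on the shim is that __ATOMIC_CONSUME is folded into the acquire path of every operation: the relaxed intrinsic runs first and a compiler or memory barrier follows, while __ATOMIC_SEQ_CST issues a full fence and then falls through to that same path. A self-contained sketch of this dispatch shape, with local stand-ins for the order constants and for the MSVC _Memory_barrier()/_Compiler_or_memory_barrier() macros (the stand-ins are assumptions, not code from the patch):

    #include <atomic>

    enum mem_order { RELAXED, CONSUME, ACQUIRE, SEQ_CST };

    // Stand-ins for the MSVC barrier macros used by the shim.
    static void compiler_or_memory_barrier() { std::atomic_signal_fence(std::memory_order_seq_cst); }
    static void full_memory_barrier()        { std::atomic_thread_fence(std::memory_order_seq_cst); }

    static int shim_load(const volatile int* p, mem_order order) {
        int result = 0;
        switch (order) {
        case SEQ_CST: full_memory_barrier();        // full fence first, then fall through
            [[fallthrough]];
        case CONSUME:                               // consume is promoted to acquire
        case ACQUIRE: result = *p; compiler_or_memory_barrier(); break;
        case RELAXED: result = *p; break;
        }
        return result;
    }

    int main() {
        volatile int x = 7;
        return shim_load(&x, CONSUME) == 7 ? 0 : 1;
    }

The real shim repeats this switch shape for load, store, exchange, compare-exchange and the fetch operations, with the relaxed body swapped for the corresponding Interlocked intrinsic.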