diff --git a/include/cuda/std/detail/__atomic b/include/cuda/std/detail/__atomic
index 4261d6a054..0933ce830a 100644
--- a/include/cuda/std/detail/__atomic
+++ b/include/cuda/std/detail/__atomic
@@ -102,6 +102,10 @@ namespace detail {
 
 _LIBCUDACXX_END_NAMESPACE_CUDA
 
+#ifdef _MSC_VER
+    #include "__atomic_msvc"
+#endif
+
 #include "__atomic_generated"
 #include "__atomic_derived"
 
@@ -123,14 +127,14 @@ __host__ __device__ inline void __cxx_atomic_thread_fence(int __order) {
 #ifdef __CUDA_ARCH__
     detail::__atomic_thread_fence_cuda(__order, detail::__thread_scope_system_tag());
 #else
-    ::std::atomic_thread_fence((::std::memory_order)__order);
+    __atomic_thread_fence(__order);
 #endif
 }
 __host__ __device__ inline void __cxx_atomic_signal_fence(int __order) {
 #ifdef __CUDA_ARCH__
     detail::__atomic_signal_fence_cuda(__order);
 #else
-    ::std::atomic_signal_fence((::std::memory_order)__order);
+    __atomic_signal_fence(__order);
 #endif
 }
@@ -190,30 +194,20 @@ struct __cxx_atomic_base_impl_default {
   constexpr __cxx_atomic_base_impl_default() noexcept = default;
   __host__ __device__ constexpr explicit __cxx_atomic_base_impl_default(_Tp __value) noexcept : __a_value(__value) {
   }
-#ifdef __CUDA_ARCH__
   __cxx_atomic_alignment_wrapper_t<_Tp> __a_value;
-#else
-  ::std::atomic<__cxx_atomic_alignment_wrapper_t<_Tp>> __a_value;
-#endif
 };
 template<class _Tp, int _Sco>
 __host__ __device__ inline void __cxx_atomic_init(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp __val) {
-#if defined(__CUDA_ARCH__)
   auto __tmp = __cxx_atomic_alignment_wrap(__val);
   __cxx_atomic_assign_volatile(__a->__a_value, __tmp);
-#elif defined(_LIBCUDACXX_CUDA_HAS_NO_HOST_STD_ATOMIC_INIT)
-  __a->__a_value.store(__val, ::std::memory_order_relaxed);
-#else
-  ::std::atomic_init(&__a->__a_value, __cxx_atomic_alignment_wrap(__val));
-#endif
 }
 template<class _Tp, int _Sco>
 __host__ __device__ inline void __cxx_atomic_store(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp __val, int __order) {
 #ifdef __CUDA_ARCH__
     detail::__atomic_store_n_cuda(&__a->__a_value, __cxx_atomic_alignment_wrap(__val), __order, detail::__scope_tag<_Sco>());
 #else
-    ::std::atomic_store_explicit(&__a->__a_value, __cxx_atomic_alignment_wrap(__val), (::std::memory_order)__order);
+    __atomic_store_n(&__a->__a_value, __cxx_atomic_alignment_wrap(__val), __order);
 #endif
 }
 template<class _Tp, int _Sco>
@@ -221,7 +215,7 @@ __host__ __device__ inline _Tp __cxx_atomic_load(__cxx_atomic_base_impl_default<
 #ifdef __CUDA_ARCH__
     return __cxx_atomic_alignment_unwrap(detail::__atomic_load_n_cuda(&__a->__a_value, __order, detail::__scope_tag<_Sco>()));
 #else
-    return __cxx_atomic_alignment_unwrap(::std::atomic_load_explicit(&__a->__a_value, (::std::memory_order)__order));
+    return __cxx_atomic_alignment_unwrap(__atomic_load_n(&__a->__a_value, __order));
 #endif
 }
 template<class _Tp, int _Sco>
@@ -229,7 +223,7 @@ __host__ __device__ inline _Tp __cxx_atomic_exchange(__cxx_atomic_base_impl_defa
 #ifdef __CUDA_ARCH__
     return __cxx_atomic_alignment_unwrap(detail::__atomic_exchange_n_cuda(&__a->__a_value, __cxx_atomic_alignment_wrap(__val), __order, detail::__scope_tag<_Sco>()));
 #else
-    return __cxx_atomic_alignment_unwrap(::std::atomic_exchange_explicit(&__a->__a_value, __cxx_atomic_alignment_wrap(__val), (::std::memory_order)__order));
+    return __cxx_atomic_alignment_unwrap(__atomic_exchange_n(&__a->__a_value, __cxx_atomic_alignment_wrap(__val), __order));
 #endif
 }
 template<class _Tp, int _Sco>
@@ -238,7 +232,7 @@ __host__ __device__ inline bool __cxx_atomic_compare_exchange_strong(__cxx_atomi
 #ifdef __CUDA_ARCH__
     bool __result = detail::__atomic_compare_exchange_n_cuda(&__a->__a_value, &__tmp, __cxx_atomic_alignment_wrap(__val), false, __success, __failure, detail::__scope_tag<_Sco>());
 #else
-    bool __result = ::std::atomic_compare_exchange_strong_explicit(&__a->__a_value, &__tmp, __cxx_atomic_alignment_wrap(__val), (::std::memory_order)__success, (::std::memory_order)__failure);
+    bool __result = __atomic_compare_exchange_n(&__a->__a_value, &__tmp, __cxx_atomic_alignment_wrap(__val), false, __success, __failure);
 #endif
     *__expected = __cxx_atomic_alignment_unwrap(__tmp);
     return __result;
@@ -249,7 +243,7 @@ __host__ __device__ inline bool __cxx_atomic_compare_exchange_weak(__cxx_atomic_
 #ifdef __CUDA_ARCH__
     bool __result = detail::__atomic_compare_exchange_n_cuda(&__a->__a_value, &__tmp, __cxx_atomic_alignment_wrap(__val), true, __success, __failure, detail::__scope_tag<_Sco>());
 #else
-    bool __result = ::std::atomic_compare_exchange_strong_explicit(&__a->__a_value, &__tmp, __cxx_atomic_alignment_wrap(__val), (::std::memory_order)__success, (::std::memory_order)__failure);
+    bool __result = __atomic_compare_exchange_n(&__a->__a_value, &__tmp, __cxx_atomic_alignment_wrap(__val), true, __success, __failure);
 #endif
     *__expected = __cxx_atomic_alignment_unwrap(__tmp);
     return __result;
@@ -259,7 +253,7 @@ __host__ __device__ inline _Tp __cxx_atomic_fetch_add(__cxx_atomic_base_impl_def
 #ifdef __CUDA_ARCH__
     return detail::__atomic_fetch_add_cuda(&__a->__a_value, __delta, __order, detail::__scope_tag<_Sco>());
 #else
-    return ::std::atomic_fetch_add_explicit(&__a->__a_value, __delta, (::std::memory_order)__order);
+    return __atomic_fetch_add(&__a->__a_value, __delta, __order);
 #endif
 }
 template<class _Tp, int _Sco>
@@ -267,7 +261,7 @@ __host__ __device__ inline _Tp* __cxx_atomic_fetch_add(__cxx_atomic_base_impl_de
 #ifdef __CUDA_ARCH__
     return detail::__atomic_fetch_add_cuda(&__a->__a_value, __delta, __order, detail::__scope_tag<_Sco>());
 #else
-    return ::std::atomic_fetch_add_explicit(&__a->__a_value, __delta, (::std::memory_order)__order);
+    return __atomic_fetch_add(&__a->__a_value, __delta, __order);
 #endif
 }
 template<class _Tp, int _Sco>
@@ -275,7 +269,7 @@ __host__ __device__ inline _Tp __cxx_atomic_fetch_sub(__cxx_atomic_base_impl_def
 #ifdef __CUDA_ARCH__
     return detail::__atomic_fetch_sub_cuda(&__a->__a_value, __delta, __order, detail::__scope_tag<_Sco>());
 #else
-    return ::std::atomic_fetch_sub_explicit(&__a->__a_value, __delta, (::std::memory_order)__order);
+    return __atomic_fetch_sub(&__a->__a_value, __delta, __order);
 #endif
 }
 template<class _Tp, int _Sco>
@@ -283,7 +277,7 @@ __host__ __device__ inline _Tp* __cxx_atomic_fetch_sub(__cxx_atomic_base_impl_de
 #ifdef __CUDA_ARCH__
     return detail::__atomic_fetch_sub_cuda(&__a->__a_value, __delta, __order, detail::__scope_tag<_Sco>());
 #else
-    return ::std::atomic_fetch_sub_explicit(&__a->__a_value, __delta, (::std::memory_order)__order);
+    return __atomic_fetch_sub(&__a->__a_value, __delta, __order);
 #endif
 }
 template<class _Tp, int _Sco>
@@ -291,7 +285,7 @@ __host__ __device__ inline _Tp __cxx_atomic_fetch_and(__cxx_atomic_base_impl_def
 #ifdef __CUDA_ARCH__
     return detail::__atomic_fetch_and_cuda(&__a->__a_value, __pattern, __order, detail::__scope_tag<_Sco>());
 #else
-    return ::std::atomic_fetch_and_explicit(&__a->__a_value, __pattern, (::std::memory_order)__order);
+    return __atomic_fetch_and(&__a->__a_value, __pattern, __order);
 #endif
 }
 template<class _Tp, int _Sco>
@@ -299,7 +293,7 @@ __host__ __device__ inline _Tp __cxx_atomic_fetch_or(__cxx_atomic_base_impl_defa
 #ifdef __CUDA_ARCH__
     return detail::__atomic_fetch_or_cuda(&__a->__a_value, __pattern, __order, detail::__scope_tag<_Sco>());
 #else
-    return ::std::atomic_fetch_or_explicit(&__a->__a_value, __pattern, (::std::memory_order)__order);
+    return __atomic_fetch_or(&__a->__a_value, __pattern, __order);
 #endif
 }
 template<class _Tp, int _Sco>
@@ -307,7 +301,7 @@ __host__ __device__ inline _Tp __cxx_atomic_fetch_xor(__cxx_atomic_base_impl_def
 #ifdef __CUDA_ARCH__
     return detail::__atomic_fetch_xor_cuda(&__a->__a_value, __pattern, __order, detail::__scope_tag<_Sco>());
 #else
-    return ::std::atomic_fetch_xor_explicit(&__a->__a_value, __pattern, (::std::memory_order)__order);
+    return __atomic_fetch_xor(&__a->__a_value, __pattern, __order);
 #endif
 }
diff --git a/include/cuda/std/detail/__atomic_derived b/include/cuda/std/detail/__atomic_derived
index 4c08abb80c..0c3d23f3e1 100644
--- a/include/cuda/std/detail/__atomic_derived
+++ b/include/cuda/std/detail/__atomic_derived
@@ -43,9 +43,9 @@ template
     _Type __desired = __expected > __val ? __expected : __val;
 #ifdef __CUDA_ARCH__
     while(__desired == __val &&
           !__atomic_compare_exchange_cuda(__ptr, &__expected, &__desired, true, __memorder, __memorder, __s)) {
 #else
     while(__desired == __val &&
-          !::std::atomic_compare_exchange_strong_explicit(__ptr, &__expected, __desired, __memorder, __memorder)) {
+          !__atomic_compare_exchange(__ptr, &__expected, &__desired, true, __memorder, __memorder)) {
 #endif
         __desired = __expected > __val ? __expected : __val;
     }
@@ -70,13 +70,13 @@ template
diff --git a/include/cuda/std/detail/__atomic_msvc b/include/cuda/std/detail/__atomic_msvc
new file mode 100644
--- /dev/null
+++ b/include/cuda/std/detail/__atomic_msvc
+#include <intrin.h>
+
+#define _Compiler_barrier() _ReadWriteBarrier()
+
+#if defined(_M_ARM) || defined(_M_ARM64)
+  #define _Memory_barrier() __dmb(0xB) // inner shared data memory barrier
+  #define _Compiler_or_memory_barrier() _Memory_barrier()
+#elif defined(_M_IX86) || defined(_M_X64)
+  #define _Memory_barrier() __faststorefence()
+  // x86/x64 hardware only emits memory barriers inside _Interlocked intrinsics
+  #define _Compiler_or_memory_barrier() _Compiler_barrier()
+#else // ^^^ x86/x64 / unsupported hardware vvv
+  #error Unsupported hardware
+#endif // hardware
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA
+
+static inline void __atomic_signal_fence(int __memorder) {
+    if (__memorder != __ATOMIC_RELAXED)
+        _Compiler_barrier();
+}
+
+static inline void __atomic_thread_fence(int __memorder) {
+    if (__memorder != __ATOMIC_RELAXED)
+        _Memory_barrier();
+}
+
+template<class _Type, typename _CUDA_VSTD::enable_if<sizeof(_Type)==1, int>::type = 0>
+void __atomic_load_relaxed(const volatile _Type *__ptr, _Type *__ret) {
+    __int8 __tmp = __iso_volatile_load8((const volatile __int8 *)__ptr);
+    *__ret = reinterpret_cast<_Type&>(__tmp);
+}
+template<class _Type, typename _CUDA_VSTD::enable_if<sizeof(_Type)==2, int>::type = 0>
+void __atomic_load_relaxed(const volatile _Type *__ptr, _Type *__ret) {
+    __int16 __tmp = __iso_volatile_load16((const volatile __int16 *)__ptr);
+    *__ret = reinterpret_cast<_Type&>(__tmp);
+}
+template<class _Type, typename _CUDA_VSTD::enable_if<sizeof(_Type)==4, int>::type = 0>
+void __atomic_load_relaxed(const volatile _Type *__ptr, _Type *__ret) {
+    __int32 __tmp = __iso_volatile_load32((const volatile __int32 *)__ptr);
+    *__ret = reinterpret_cast<_Type&>(__tmp);
+}
+template<class _Type, typename _CUDA_VSTD::enable_if<sizeof(_Type)==8, int>::type = 0>
+void __atomic_load_relaxed(const volatile _Type *__ptr, _Type *__ret) {
+    __int64 __tmp = __iso_volatile_load64((const volatile __int64 *)__ptr);
+    *__ret = reinterpret_cast<_Type&>(__tmp);
+}
+template<class _Type>
+void __atomic_load(const volatile _Type *__ptr, _Type *__ret, int __memorder) {
+    switch (__memorder) {
+    case __ATOMIC_SEQ_CST: _Memory_barrier();
+    case __ATOMIC_CONSUME:
+    case __ATOMIC_ACQUIRE: __atomic_load_relaxed(__ptr, __ret); _Compiler_or_memory_barrier(); break;
+    case __ATOMIC_RELAXED: __atomic_load_relaxed(__ptr, __ret); break;
+    default: assert(0);
+    }
+}
+
+template<class _Type, typename _CUDA_VSTD::enable_if<sizeof(_Type)==1, int>::type = 0>
+void __atomic_store_relaxed(const volatile _Type *__ptr, _Type *__val) {
+    __int8 __tmp = reinterpret_cast<__int8&>(*__val);
+    __iso_volatile_store8((volatile __int8 *)__ptr, __tmp);
+}
+template<class _Type, typename _CUDA_VSTD::enable_if<sizeof(_Type)==2, int>::type = 0>
+void __atomic_store_relaxed(const volatile _Type *__ptr, _Type *__val) {
+    __int16 __tmp = reinterpret_cast<__int16&>(*__val);
+    __iso_volatile_store16((volatile __int16 *)__ptr, __tmp);
+}
+template<class _Type, typename _CUDA_VSTD::enable_if<sizeof(_Type)==4, int>::type = 0>
+void __atomic_store_relaxed(const volatile _Type *__ptr, _Type *__val) {
+    __int32 __tmp = reinterpret_cast<__int32&>(*__val);
+    __iso_volatile_store32((volatile __int32 *)__ptr, __tmp);
+}
+template<class _Type, typename _CUDA_VSTD::enable_if<sizeof(_Type)==8, int>::type = 0>
+void __atomic_store_relaxed(const volatile _Type *__ptr, _Type *__val) {
+    __int64 __tmp = reinterpret_cast<__int64&>(*__val);
+    __iso_volatile_store64((volatile __int64 *)__ptr, __tmp);
+}
+template<class _Type>
+void __atomic_store(volatile _Type *__ptr, _Type *__val, int __memorder) {
+    switch (__memorder) {
+    case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_store_relaxed(__ptr, __val); break;
+    case __ATOMIC_SEQ_CST: _Memory_barrier();
+    case __ATOMIC_RELAXED: __atomic_store_relaxed(__ptr, __val); break;
+    default: assert(0);
+    }
+}
+
+template<class _Type, typename _CUDA_VSTD::enable_if<sizeof(_Type)==1, int>::type = 0>
+bool __atomic_compare_exchange_relaxed(const volatile _Type *__ptr, _Type *__expected, const _Type *__desired) {
+    auto __tmp_desired = reinterpret_cast<char const&>(*__desired);
+    auto __tmp_expected = reinterpret_cast<char&>(*__expected);
+    auto const __old = _InterlockedCompareExchange8((volatile char *)__ptr, __tmp_desired, __tmp_expected);
+    if(__old == __tmp_expected)
+        return true;
+    *__expected = reinterpret_cast<_Type const&>(__old);
+    return false;
+}
+template<class _Type, typename _CUDA_VSTD::enable_if<sizeof(_Type)==2, int>::type = 0>
+bool __atomic_compare_exchange_relaxed(const volatile _Type *__ptr, _Type *__expected, const _Type *__desired) {
+    auto __tmp_desired = reinterpret_cast<short const&>(*__desired);
+    auto __tmp_expected = reinterpret_cast<short&>(*__expected);
+    auto const __old = _InterlockedCompareExchange16((volatile short *)__ptr, __tmp_desired, __tmp_expected);
+    if(__old == __tmp_expected)
+        return true;
+    *__expected = reinterpret_cast<_Type const&>(__old);
+    return false;
+}
+template<class _Type, typename _CUDA_VSTD::enable_if<sizeof(_Type)==4, int>::type = 0>
+bool __atomic_compare_exchange_relaxed(const volatile _Type *__ptr, _Type *__expected, const _Type *__desired) {
+    auto __tmp_desired = reinterpret_cast<long const&>(*__desired);
+    auto __tmp_expected = reinterpret_cast<long&>(*__expected);
+    auto const __old = _InterlockedCompareExchange((volatile long *)__ptr, __tmp_desired, __tmp_expected);
+    if(__old == __tmp_expected)
+        return true;
+    *__expected = reinterpret_cast<_Type const&>(__old);
+    return false;
+}
+template<class _Type, typename _CUDA_VSTD::enable_if<sizeof(_Type)==8, int>::type = 0>
+bool __atomic_compare_exchange_relaxed(const volatile _Type *__ptr, _Type *__expected, const _Type *__desired) {
+    auto __tmp_desired = reinterpret_cast<__int64 const&>(*__desired);
+    auto __tmp_expected = reinterpret_cast<__int64&>(*__expected);
+    auto const __old = _InterlockedCompareExchange64((volatile __int64 *)__ptr, __tmp_desired, __tmp_expected);
+    if(__old == __tmp_expected)
+        return true;
+    *__expected = reinterpret_cast<_Type const&>(__old);
+    return false;
+}
+template<class _Type>
+bool __atomic_compare_exchange(_Type volatile *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder) {
+    bool success;
+    switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) {
+    case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); success = __atomic_compare_exchange_relaxed(__ptr, __expected, __desired); break;
+    case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier();
+    case __ATOMIC_ACQUIRE: success = __atomic_compare_exchange_relaxed(__ptr, __expected, __desired); _Compiler_or_memory_barrier(); break;
+    case __ATOMIC_SEQ_CST: _Memory_barrier(); success = __atomic_compare_exchange_relaxed(__ptr, __expected, __desired); _Compiler_or_memory_barrier(); break;
+    case __ATOMIC_RELAXED: success = __atomic_compare_exchange_relaxed(__ptr, __expected, __desired); break;
+    default: assert(0);
+    }
+    return success;
+}
+
+template<class _Type, typename _CUDA_VSTD::enable_if<sizeof(_Type)==1, int>::type = 0>
+void __atomic_exchange_relaxed(const volatile _Type *__ptr, const _Type *__val, _Type *__ret) {
+    auto const __old = _InterlockedExchange8((volatile char *)__ptr, reinterpret_cast<char const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template<class _Type, typename _CUDA_VSTD::enable_if<sizeof(_Type)==2, int>::type = 0>
+void __atomic_exchange_relaxed(const volatile _Type *__ptr, const _Type *__val, _Type *__ret) {
+    auto const __old = _InterlockedExchange16((volatile short *)__ptr, reinterpret_cast<short const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template<class _Type, typename _CUDA_VSTD::enable_if<sizeof(_Type)==4, int>::type = 0>
+void __atomic_exchange_relaxed(const volatile _Type *__ptr, const _Type *__val, _Type *__ret) {
+    auto const __old = _InterlockedExchange((volatile long *)__ptr, reinterpret_cast<long const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template<class _Type, typename _CUDA_VSTD::enable_if<sizeof(_Type)==8, int>::type = 0>
+void __atomic_exchange_relaxed(const volatile _Type *__ptr, const _Type *__val, _Type *__ret) {
+    auto const __old = _InterlockedExchange64((volatile __int64 *)__ptr, reinterpret_cast<__int64 const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template<class _Type>
+void __atomic_exchange(_Type volatile *__ptr, const _Type *__val, _Type *__ret, int __memorder) {
+    switch (__memorder) {
+    case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_exchange_relaxed(__ptr, __val, __ret); break;
+    case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier();
+    case __ATOMIC_ACQUIRE: __atomic_exchange_relaxed(__ptr, __val, __ret); _Compiler_or_memory_barrier(); break;
+    case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_exchange_relaxed(__ptr, __val, __ret); _Compiler_or_memory_barrier(); break;
+    case __ATOMIC_RELAXED: __atomic_exchange_relaxed(__ptr, __val, __ret); break;
+    default: assert(0);
+    }
+}
+
+template<class _Type, class _Delta, typename _CUDA_VSTD::enable_if<sizeof(_Type)==1, int>::type = 0>
+void __atomic_fetch_add_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) {
+    auto const __old = _InterlockedExchangeAdd8((volatile char *)__ptr, reinterpret_cast<char const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template<class _Type, class _Delta, typename _CUDA_VSTD::enable_if<sizeof(_Type)==2, int>::type = 0>
+void __atomic_fetch_add_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) {
+    auto const __old = _InterlockedExchangeAdd16((volatile short *)__ptr, reinterpret_cast<short const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template<class _Type, class _Delta, typename _CUDA_VSTD::enable_if<sizeof(_Type)==4, int>::type = 0>
+void __atomic_fetch_add_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) {
+    auto const __old = _InterlockedExchangeAdd((volatile long *)__ptr, reinterpret_cast<long const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template<class _Type, class _Delta, typename _CUDA_VSTD::enable_if<sizeof(_Type)==8, int>::type = 0>
+void __atomic_fetch_add_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) {
+    auto const __old = _InterlockedExchangeAdd64((volatile __int64 *)__ptr, reinterpret_cast<__int64 const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template<class _Type, class _Delta>
+_Type __atomic_fetch_add(_Type volatile *__ptr, _Delta __val, int __memorder) {
+    _Type __ret;
+    switch (__memorder) {
+    case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_fetch_add_relaxed(__ptr, &__val, &__ret); break;
+    case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier();
+    case __ATOMIC_ACQUIRE: __atomic_fetch_add_relaxed(__ptr, &__val, &__ret); _Compiler_or_memory_barrier(); break;
+    case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_fetch_add_relaxed(__ptr, &__val, &__ret); _Compiler_or_memory_barrier(); break;
+    case __ATOMIC_RELAXED: __atomic_fetch_add_relaxed(__ptr, &__val, &__ret); break;
+    default: assert(0);
+    }
+    return __ret;
+}
+template<class _Type, class _Delta>
+_Type __atomic_fetch_sub(_Type volatile *__ptr, _Delta __val, int __memorder) {
+    return __atomic_fetch_add(__ptr, -__val, __memorder);
+}
+
+
+template<class _Type, class _Delta, typename _CUDA_VSTD::enable_if<sizeof(_Type)==1, int>::type = 0>
+void __atomic_fetch_and_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) {
+    auto const __old = _InterlockedAnd8((volatile char *)__ptr, reinterpret_cast<char const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template<class _Type, class _Delta, typename _CUDA_VSTD::enable_if<sizeof(_Type)==2, int>::type = 0>
+void __atomic_fetch_and_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) {
+    auto const __old = _InterlockedAnd16((volatile short *)__ptr, reinterpret_cast<short const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template<class _Type, class _Delta, typename _CUDA_VSTD::enable_if<sizeof(_Type)==4, int>::type = 0>
+void __atomic_fetch_and_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) {
+    auto const __old = _InterlockedAnd((volatile long *)__ptr, reinterpret_cast<long const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template<class _Type, class _Delta, typename _CUDA_VSTD::enable_if<sizeof(_Type)==8, int>::type = 0>
+void __atomic_fetch_and_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) {
+    auto const __old = _InterlockedAnd64((volatile __int64 *)__ptr, reinterpret_cast<__int64 const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template<class _Type, class _Delta>
+_Type __atomic_fetch_and(_Type volatile *__ptr, _Delta __val, int __memorder) {
+    _Type __ret;
+    switch (__memorder) {
+    case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_fetch_and_relaxed(__ptr, &__val, &__ret); break;
+    case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier();
+    case __ATOMIC_ACQUIRE: __atomic_fetch_and_relaxed(__ptr, &__val, &__ret); _Compiler_or_memory_barrier(); break;
+    case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_fetch_and_relaxed(__ptr, &__val, &__ret); _Compiler_or_memory_barrier(); break;
+    case __ATOMIC_RELAXED: __atomic_fetch_and_relaxed(__ptr, &__val, &__ret); break;
+    default: assert(0);
+    }
+    return __ret;
+}
+
+template<class _Type, class _Delta, typename _CUDA_VSTD::enable_if<sizeof(_Type)==1, int>::type = 0>
+void __atomic_fetch_xor_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) {
+    auto const __old = _InterlockedXor8((volatile char *)__ptr, reinterpret_cast<char const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template<class _Type, class _Delta, typename _CUDA_VSTD::enable_if<sizeof(_Type)==2, int>::type = 0>
+void __atomic_fetch_xor_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) {
+    auto const __old = _InterlockedXor16((volatile short *)__ptr, reinterpret_cast<short const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template<class _Type, class _Delta, typename _CUDA_VSTD::enable_if<sizeof(_Type)==4, int>::type = 0>
+void __atomic_fetch_xor_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) {
+    auto const __old = _InterlockedXor((volatile long *)__ptr, reinterpret_cast<long const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template<class _Type, class _Delta, typename _CUDA_VSTD::enable_if<sizeof(_Type)==8, int>::type = 0>
+void __atomic_fetch_xor_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) {
+    auto const __old = _InterlockedXor64((volatile __int64 *)__ptr, reinterpret_cast<__int64 const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template<class _Type, class _Delta>
+_Type __atomic_fetch_xor(_Type volatile *__ptr, _Delta __val, int __memorder) {
+    _Type __ret;
+    switch (__memorder) {
+    case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_fetch_xor_relaxed(__ptr, &__val, &__ret); break;
+    case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier();
+    case __ATOMIC_ACQUIRE: __atomic_fetch_xor_relaxed(__ptr, &__val, &__ret); _Compiler_or_memory_barrier(); break;
+    case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_fetch_xor_relaxed(__ptr, &__val, &__ret); _Compiler_or_memory_barrier(); break;
+    case __ATOMIC_RELAXED: __atomic_fetch_xor_relaxed(__ptr, &__val, &__ret); break;
+    default: assert(0);
+    }
+    return __ret;
+}
+
+template<class _Type, class _Delta, typename _CUDA_VSTD::enable_if<sizeof(_Type)==1, int>::type = 0>
+void __atomic_fetch_or_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) {
+    auto const __old = _InterlockedOr8((volatile char *)__ptr, reinterpret_cast<char const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template<class _Type, class _Delta, typename _CUDA_VSTD::enable_if<sizeof(_Type)==2, int>::type = 0>
+void __atomic_fetch_or_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) {
+    auto const __old = _InterlockedOr16((volatile short *)__ptr, reinterpret_cast<short const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template<class _Type, class _Delta, typename _CUDA_VSTD::enable_if<sizeof(_Type)==4, int>::type = 0>
+void __atomic_fetch_or_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) {
+    auto const __old = _InterlockedOr((volatile long *)__ptr, reinterpret_cast<long const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template<class _Type, class _Delta, typename _CUDA_VSTD::enable_if<sizeof(_Type)==8, int>::type = 0>
+void __atomic_fetch_or_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) {
+    auto const __old = _InterlockedOr64((volatile __int64 *)__ptr, reinterpret_cast<__int64 const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template<class _Type, class _Delta>
+_Type __atomic_fetch_or(_Type volatile *__ptr, _Delta __val, int __memorder) {
+    _Type __ret;
+    switch (__memorder) {
+    case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_fetch_or_relaxed(__ptr, &__val, &__ret); break;
+    case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier();
+    case __ATOMIC_ACQUIRE: __atomic_fetch_or_relaxed(__ptr, &__val, &__ret); _Compiler_or_memory_barrier(); break;
+    case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_fetch_or_relaxed(__ptr, &__val, &__ret); _Compiler_or_memory_barrier(); break;
+    case __ATOMIC_RELAXED: __atomic_fetch_or_relaxed(__ptr, &__val, &__ret); break;
+    default: assert(0);
+    }
+    return __ret;
+}
+
+template<class _Type>
+_Type __atomic_load_n(const _Type volatile *__ptr, int __memorder) {
+    _Type __ret;
+    __atomic_load(__ptr, &__ret, __memorder);
+    return __ret;
+}
+
+template<class _Type>
+void __atomic_store_n(_Type volatile *__ptr, _Type __val, int __memorder) {
+    __atomic_store(__ptr, &__val, __memorder);
+}
+
+template<class _Type>
+bool __atomic_compare_exchange_n(_Type volatile *__ptr, _Type *__expected, _Type __desired, bool __weak, int __success_memorder, int __failure_memorder) {
+    return __atomic_compare_exchange(__ptr, __expected, &__desired, __weak, __success_memorder, __failure_memorder);
+}
+
+template<class _Type>
+_Type __atomic_exchange_n(_Type volatile *__ptr, _Type __val, int __memorder) {
+    _Type __ret;
+    __atomic_exchange(__ptr, &__val, &__ret, __memorder);
+    return __ret;
+}
+
+template<class _Type, class _Delta>
+_Type __host__ __atomic_fetch_max(_Type volatile *__ptr, _Delta __val, int __memorder) {
+    _Type __expected = __atomic_load_n(__ptr, __ATOMIC_RELAXED);
+    _Type __desired = __expected > __val ? __expected : __val;
+    while(__desired == __val &&
+          !__atomic_compare_exchange_n(__ptr, &__expected, __desired, true, __memorder, __memorder)) {
+        __desired = __expected > __val ? __expected : __val;
+    }
+    return __expected;
+}
+
+template<class _Type, class _Delta>
+_Type __host__ __atomic_fetch_min(_Type volatile *__ptr, _Delta __val, int __memorder) {
+    _Type __expected = __atomic_load_n(__ptr, __ATOMIC_RELAXED);
+    _Type __desired = __expected < __val ? __expected : __val;
+    while(__desired != __val &&
+          !__atomic_compare_exchange_n(__ptr, &__expected, __desired, true, __memorder, __memorder)) {
+        __desired = __expected < __val ? __expected : __val;
+    }
+    return __expected;
+}
+
+_LIBCUDACXX_END_NAMESPACE_CUDA
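
Note for reviewers: every operation in __atomic_msvc follows the same shape -- a relaxed __iso_volatile_* or _Interlocked* primitive bracketed by _Compiler_barrier(), _Compiler_or_memory_barrier(), or _Memory_barrier() chosen from the requested memory order, with __ATOMIC_ACQ_REL and __ATOMIC_SEQ_CST relying on switch fall-through. The sketch below is not part of the patch; it restates that dispatch for a plain long fetch-add on x64 using only documented MSVC intrinsics, with illustrative names (my_fetch_add, my_order) rather than anything from the library.

    // Standalone illustration of the order-dispatch pattern used throughout the patch.
    // Assumes MSVC on x64, where the _Interlocked* RMW intrinsics are already full barriers.
    #include <intrin.h>
    #include <assert.h>

    enum my_order { my_relaxed, my_acquire, my_release, my_acq_rel, my_seq_cst };

    inline long my_fetch_add(long volatile *ptr, long delta, my_order order) {
        // Fence *before* the RMW for release-or-stronger orders.
        if (order == my_release)      _ReadWriteBarrier();   // compiler-only fence on x64
        else if (order == my_seq_cst) __faststorefence();    // full hardware fence
        long old = _InterlockedExchangeAdd(ptr, delta);       // relaxed RMW primitive
        // Fence *after* the RMW for acquire-or-stronger orders.
        if (order == my_acquire || order == my_acq_rel || order == my_seq_cst)
            _ReadWriteBarrier();
        return old;
    }

    int main() {
        long counter = 0;
        long previous = my_fetch_add(&counter, 5, my_seq_cst);
        assert(previous == 0 && counter == 5);
        return 0;
    }

The patch expresses the same idea with a switch so that __ATOMIC_ACQ_REL can fall through into the __ATOMIC_ACQUIRE case after issuing its leading barrier.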