diff --git a/include/cuda/std/detail/__atomic b/include/cuda/std/detail/__atomic
index 4261d6a054..14f49434c2 100644
--- a/include/cuda/std/detail/__atomic
+++ b/include/cuda/std/detail/__atomic
@@ -44,12 +44,11 @@
 #define __ATOMIC_THREAD 10
 #endif //__ATOMIC_BLOCK
 
-
 _LIBCUDACXX_BEGIN_NAMESPACE_CUDA
 
 namespace detail {
 
-    inline __device__ int __stronger_order_cuda(int __a, int __b) {
+    inline __host__ __device__ int __stronger_order_cuda(int __a, int __b) {
         int const __max = __a > __b ? __a : __b;
         if(__max != __ATOMIC_RELEASE)
             return __max;
@@ -102,11 +101,22 @@ namespace detail {
 
 _LIBCUDACXX_END_NAMESPACE_CUDA
 
+#if defined(_LIBCUDACXX_COMPILER_MSVC)
+  // Inject atomic intrinsics built from MSVC compiler intrinsics
+  #include "libcxx/include/support/win32/atomic_msvc.h"
+#endif
+
 #include "__atomic_generated"
 #include "__atomic_derived"
 
 _LIBCUDACXX_BEGIN_NAMESPACE_STD
 
+template <class _Tp>
+struct __skip_amt { enum {value = 1}; };
+
+template <class _Tp>
+struct __skip_amt<_Tp*> { enum {value = sizeof(_Tp)}; };
+
 // Forward-declare the function templates that are defined libcxx later.
 template <typename _Tp, typename _Tv> _LIBCUDACXX_INLINE_VISIBILITY
 typename enable_if<is_assignable<_Tp&, _Tv>::value>::type
@@ -123,14 +133,14 @@ __host__ __device__ inline void __cxx_atomic_thread_fence(int __order) {
 #ifdef __CUDA_ARCH__
     detail::__atomic_thread_fence_cuda(__order, detail::__thread_scope_system_tag());
 #else
-    ::std::atomic_thread_fence((::std::memory_order)__order);
+    __atomic_thread_fence(__order);
 #endif
 }
 __host__ __device__ inline void __cxx_atomic_signal_fence(int __order) {
 #ifdef __CUDA_ARCH__
     detail::__atomic_signal_fence_cuda(__order);
 #else
-    ::std::atomic_signal_fence((::std::memory_order)__order);
+    __atomic_signal_fence(__order);
 #endif
 }
@@ -190,30 +200,21 @@ struct __cxx_atomic_base_impl_default {
 
   constexpr __cxx_atomic_base_impl_default() noexcept = default;
   __host__ __device__ constexpr explicit __cxx_atomic_base_impl_default(_Tp __value) noexcept : __a_value(__value) { }
-#ifdef __CUDA_ARCH__
   __cxx_atomic_alignment_wrapper_t<_Tp> __a_value;
-#else
-  ::std::atomic<__cxx_atomic_alignment_wrapper_t<_Tp>> __a_value;
-#endif
 };
 
 template <class _Tp, int _Sco>
 __host__ __device__ inline void __cxx_atomic_init(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp __val) {
-#if defined(__CUDA_ARCH__)
     auto __tmp = __cxx_atomic_alignment_wrap(__val);
     __cxx_atomic_assign_volatile(__a->__a_value, __tmp);
-#elif defined(_LIBCUDACXX_CUDA_HAS_NO_HOST_STD_ATOMIC_INIT)
-    __a->__a_value.store(__val, ::std::memory_order_relaxed);
-#else
-    ::std::atomic_init(&__a->__a_value, __cxx_atomic_alignment_wrap(__val));
-#endif
 }
 
 template <class _Tp, int _Sco>
 __host__ __device__ inline void __cxx_atomic_store(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp __val, int __order) {
 #ifdef __CUDA_ARCH__
     detail::__atomic_store_n_cuda(&__a->__a_value, __cxx_atomic_alignment_wrap(__val), __order, detail::__scope_tag<_Sco>());
 #else
-    ::std::atomic_store_explicit(&__a->__a_value, __cxx_atomic_alignment_wrap(__val), (::std::memory_order)__order);
+    auto __t = __cxx_atomic_alignment_wrap(__val);
+    __atomic_store(&__a->__a_value, &__t, __order);
 #endif
 }
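For context on the __skip_amt helper added above: the GCC-style __atomic_fetch_add/__atomic_fetch_sub builtins that the host path now calls treat a pointer object as a raw integer, so the delta has to be pre-scaled by the pointee size for fetch_add(1) on a _Tp* to keep advancing by whole elements. A minimal stand-alone sketch of that trait (the skip_amt spelling is illustrative, to stay away from the reserved __ names):

    #include <cstddef>

    // Mirrors the __skip_amt partial specialization in the hunk above.
    template <class T> struct skip_amt      { enum { value = 1 }; };
    template <class T> struct skip_amt<T*>  { enum { value = sizeof(T) }; };

    static_assert(skip_amt<int>::value == 1, "integer deltas are used as-is");
    static_assert(skip_amt<double*>::value == sizeof(double),
                  "pointer deltas are scaled to bytes before reaching the builtin");

    int main() { return 0; }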
 template <class _Tp, int _Sco>
@@ -221,7 +222,10 @@ __host__ __device__ inline _Tp __cxx_atomic_load(__cxx_atomic_base_impl_default<
 #ifdef __CUDA_ARCH__
     return __cxx_atomic_alignment_unwrap(detail::__atomic_load_n_cuda(&__a->__a_value, __order, detail::__scope_tag<_Sco>()));
 #else
-    return __cxx_atomic_alignment_unwrap(::std::atomic_load_explicit(&__a->__a_value, (::std::memory_order)__order));
+    alignas(_Tp) unsigned char __buf[sizeof(_Tp)];
+    auto* __dest = reinterpret_cast<_Tp*>(__buf);
+    __atomic_load(&__a->__a_value, __dest, __order);
+    return __cxx_atomic_alignment_unwrap(*__dest);
 #endif
 }
 template <class _Tp, int _Sco>
@@ -229,7 +233,11 @@ __host__ __device__ inline _Tp __cxx_atomic_exchange(__cxx_atomic_base_impl_defa
 #ifdef __CUDA_ARCH__
     return __cxx_atomic_alignment_unwrap(detail::__atomic_exchange_n_cuda(&__a->__a_value, __cxx_atomic_alignment_wrap(__val), __order, detail::__scope_tag<_Sco>()));
 #else
-    return __cxx_atomic_alignment_unwrap(::std::atomic_exchange_explicit(&__a->__a_value, __cxx_atomic_alignment_wrap(__val), (::std::memory_order)__order));
+    alignas(_Tp) unsigned char __buf[sizeof(_Tp)];
+    auto* __dest = reinterpret_cast<_Tp*>(__buf);
+    auto __t = __cxx_atomic_alignment_wrap(__val);
+    __atomic_exchange(&__a->__a_value, &__t, __dest, __order);
+    return __cxx_atomic_alignment_unwrap(*__dest);
 #endif
 }
 template <class _Tp, int _Sco>
@@ -238,7 +246,7 @@ __host__ __device__ inline bool __cxx_atomic_compare_exchange_strong(__cxx_atomi
 #ifdef __CUDA_ARCH__
     bool __result = detail::__atomic_compare_exchange_n_cuda(&__a->__a_value, &__tmp, __cxx_atomic_alignment_wrap(__val), false, __success, __failure, detail::__scope_tag<_Sco>());
 #else
-    bool __result = ::std::atomic_compare_exchange_strong_explicit(&__a->__a_value, &__tmp, __cxx_atomic_alignment_wrap(__val), (::std::memory_order)__success, (::std::memory_order)__failure);
+    bool __result = __atomic_compare_exchange(&__a->__a_value, &__tmp, &__val, false, __success, __failure);
 #endif
     *__expected = __cxx_atomic_alignment_unwrap(__tmp);
     return __result;
@@ -249,7 +257,7 @@ __host__ __device__ inline bool __cxx_atomic_compare_exchange_weak(__cxx_atomic_
 #ifdef __CUDA_ARCH__
     bool __result = detail::__atomic_compare_exchange_n_cuda(&__a->__a_value, &__tmp, __cxx_atomic_alignment_wrap(__val), true, __success, __failure, detail::__scope_tag<_Sco>());
 #else
-    bool __result = ::std::atomic_compare_exchange_strong_explicit(&__a->__a_value, &__tmp, __cxx_atomic_alignment_wrap(__val), (::std::memory_order)__success, (::std::memory_order)__failure);
+    bool __result = __atomic_compare_exchange(&__a->__a_value, &__tmp, &__val, true, __success, __failure);
 #endif
     *__expected = __cxx_atomic_alignment_unwrap(__tmp);
     return __result;
@@ -259,7 +267,7 @@ __host__ __device__ inline _Tp __cxx_atomic_fetch_add(__cxx_atomic_base_impl_def
 #ifdef __CUDA_ARCH__
     return detail::__atomic_fetch_add_cuda(&__a->__a_value, __delta, __order, detail::__scope_tag<_Sco>());
 #else
-    return ::std::atomic_fetch_add_explicit(&__a->__a_value, __delta, (::std::memory_order)__order);
+    return __atomic_fetch_add(&__a->__a_value, __delta, __order);
 #endif
 }
 template <class _Tp, int _Sco>
@@ -267,7 +275,7 @@ __host__ __device__ inline _Tp* __cxx_atomic_fetch_add(__cxx_atomic_base_impl_de
 #ifdef __CUDA_ARCH__
     return detail::__atomic_fetch_add_cuda(&__a->__a_value, __delta, __order, detail::__scope_tag<_Sco>());
 #else
-    return ::std::atomic_fetch_add_explicit(&__a->__a_value, __delta, (::std::memory_order)__order);
+    return __atomic_fetch_add(&__a->__a_value, __delta * __skip_amt<_Tp*>::value, __order);
 #endif
 }
 template <class _Tp, int _Sco>
@@ -275,7 +283,7 @@ __host__ __device__ inline _Tp __cxx_atomic_fetch_sub(__cxx_atomic_base_impl_def
 #ifdef __CUDA_ARCH__
     return detail::__atomic_fetch_sub_cuda(&__a->__a_value, __delta, __order, detail::__scope_tag<_Sco>());
 #else
-    return ::std::atomic_fetch_sub_explicit(&__a->__a_value, __delta, (::std::memory_order)__order);
+    return __atomic_fetch_sub(&__a->__a_value, __delta, __order);
 #endif
 }
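The host branches above that return a value (load, exchange) go through an aligned scratch buffer because the generic __atomic_load/__atomic_exchange builtins write their result through an out-pointer rather than returning it, and _Tp need not be default-constructible. A small host-only sketch of the same pattern, assuming a GCC/Clang toolchain and a hypothetical trivially copyable Widget type:

    struct Widget { unsigned short lo; unsigned short hi; };  // hypothetical 4-byte payload

    Widget load_widget(volatile Widget* src) {
        // Raw storage with Widget's alignment; no Widget constructor runs.
        alignas(Widget) unsigned char buf[sizeof(Widget)];
        auto* dest = reinterpret_cast<Widget*>(buf);
        __atomic_load(src, dest, __ATOMIC_ACQUIRE);  // copies *src into *dest atomically
        return *dest;
    }

    int main() {
        volatile Widget w{1, 2};
        return load_widget(&w).hi == 2 ? 0 : 1;
    }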
 template <class _Tp, int _Sco>
@@ -283,7 +291,7 @@ __host__ __device__ inline _Tp* __cxx_atomic_fetch_sub(__cxx_atomic_base_impl_de
 #ifdef __CUDA_ARCH__
     return detail::__atomic_fetch_sub_cuda(&__a->__a_value, __delta, __order, detail::__scope_tag<_Sco>());
 #else
-    return ::std::atomic_fetch_sub_explicit(&__a->__a_value, __delta, (::std::memory_order)__order);
+    return __atomic_fetch_sub(&__a->__a_value, __delta * __skip_amt<_Tp*>::value, __order);
 #endif
 }
 template <class _Tp, int _Sco>
@@ -291,7 +299,7 @@ __host__ __device__ inline _Tp __cxx_atomic_fetch_and(__cxx_atomic_base_impl_def
 #ifdef __CUDA_ARCH__
     return detail::__atomic_fetch_and_cuda(&__a->__a_value, __pattern, __order, detail::__scope_tag<_Sco>());
 #else
-    return ::std::atomic_fetch_and_explicit(&__a->__a_value, __pattern, (::std::memory_order)__order);
+    return __atomic_fetch_and(&__a->__a_value, __pattern, __order);
 #endif
 }
 template <class _Tp, int _Sco>
@@ -299,7 +307,7 @@ __host__ __device__ inline _Tp __cxx_atomic_fetch_or(__cxx_atomic_base_impl_defa
 #ifdef __CUDA_ARCH__
     return detail::__atomic_fetch_or_cuda(&__a->__a_value, __pattern, __order, detail::__scope_tag<_Sco>());
 #else
-    return ::std::atomic_fetch_or_explicit(&__a->__a_value, __pattern, (::std::memory_order)__order);
+    return __atomic_fetch_or(&__a->__a_value, __pattern, __order);
 #endif
 }
 template <class _Tp, int _Sco>
@@ -307,7 +315,7 @@ __host__ __device__ inline _Tp __cxx_atomic_fetch_xor(__cxx_atomic_base_impl_def
 #ifdef __CUDA_ARCH__
     return detail::__atomic_fetch_xor_cuda(&__a->__a_value, __pattern, __order, detail::__scope_tag<_Sco>());
 #else
-    return ::std::atomic_fetch_xor_explicit(&__a->__a_value, __pattern, (::std::memory_order)__order);
+    return __atomic_fetch_xor(&__a->__a_value, __pattern, __order);
 #endif
 }
diff --git a/include/cuda/std/detail/__atomic_derived b/include/cuda/std/detail/__atomic_derived
index 4c08abb80c..0c3d23f3e1 100644
--- a/include/cuda/std/detail/__atomic_derived
+++ b/include/cuda/std/detail/__atomic_derived
@@ -43,9 +43,9 @@ template
     _Type __desired = __expected > __val ? __expected : __val;
 #ifdef __CUDA_ARCH__
     while(__desired == __val &&
           !__atomic_compare_exchange_cuda(__ptr, &__expected, &__desired, true, __memorder, __memorder, __s)) {
 #else
     while(__desired == __val &&
-          !::std::atomic_compare_exchange_strong_explicit(__ptr, &__expected, __desired, __memorder, __memorder)) {
+          !__atomic_compare_exchange(__ptr, &__expected, &__desired, true, __memorder, __memorder)) {
 #endif
         __desired = __expected > __val ? __expected : __val;
     }
@@ -70,13 +70,13 @@ template
     _Type __desired = __expected < __val ? __expected : __val;
 #ifdef __CUDA_ARCH__
     while(__desired == __val &&
           !__atomic_compare_exchange_cuda(__ptr, &__expected, &__desired, true, __memorder, __memorder, __s)) {
 #else
     while(__desired == __val &&
-          !::std::atomic_compare_exchange_strong_explicit(__ptr, &__expected, __desired, __memorder, __memorder)) {
+          !__atomic_compare_exchange(__ptr, &__expected, &__desired, true, __memorder, __memorder)) {
 #endif
         __desired = __expected < __val ? __expected : __val;
     }
diff --git a/libcxx/include/atomic b/libcxx/include/atomic
--- a/libcxx/include/atomic
+++ b/libcxx/include/atomic
@@
 #include
 #include
 #include <__pragma_push>
+
+#if defined(_LIBCUDACXX_COMPILER_MSVC)
+#include "support/win32/atomic_msvc.h"
+#endif
+
 #endif //__cuda_std__
 
 #if defined(_LIBCUDACXX_USE_PRAGMA_GCC_SYSTEM_HEADER)
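For reference, the derived fetch_max above reduces to a plain compare-exchange loop once the host branch calls the GCC-style builtins. A stand-alone sketch of that loop (the free-standing fetch_max name and the relaxed failure order are illustrative, not part of the patch):

    template <class T>
    T fetch_max(volatile T* ptr, T val, int memorder) {
        T expected = __atomic_load_n(ptr, __ATOMIC_RELAXED);
        T desired  = expected > val ? expected : val;
        // Keep trying only while val is the value that needs to be stored;
        // once the observed value is already >= val there is nothing to do.
        while (desired == val &&
               !__atomic_compare_exchange_n(ptr, &expected, desired, /*weak*/ true,
                                            memorder, __ATOMIC_RELAXED)) {
            desired = expected > val ? expected : val;
        }
        return expected;  // previous value, like the other fetch_* operations
    }

    int main() {
        volatile int x = 3;
        fetch_max(&x, 7, __ATOMIC_SEQ_CST);
        return x == 7 ? 0 : 1;
    }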
diff --git a/libcxx/include/support/win32/atomic_msvc.h b/libcxx/include/support/win32/atomic_msvc.h
new file mode 100644
index 0000000000..6b1f2da88e
--- /dev/null
+++ b/libcxx/include/support/win32/atomic_msvc.h
@@ -0,0 +1,398 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the CUDA Toolkit, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _MSC_VER
+#error "This file is only for CL.EXE's benefit"
+#endif
+
+#include <intrin.h>
+
+#define _Compiler_barrier() _ReadWriteBarrier()
+
+#if defined(_M_ARM) || defined(_M_ARM64)
+  #define _Memory_barrier() __dmb(0xB) // inner shared data memory barrier
+  #define _Compiler_or_memory_barrier() _Memory_barrier()
+#elif defined(_M_IX86) || defined(_M_X64)
+  #define _Memory_barrier() __faststorefence()
+  // x86/x64 hardware only emits memory barriers inside _Interlocked intrinsics
+  #define _Compiler_or_memory_barrier() _Compiler_barrier()
+#else // ^^^ x86/x64 / unsupported hardware vvv
+  #error Unsupported hardware
+#endif // hardware
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA
+
+static inline void __atomic_signal_fence(int __memorder) {
+    if (__memorder != __ATOMIC_RELAXED)
+        _Compiler_barrier();
+}
+
+static inline void __atomic_thread_fence(int __memorder) {
+    if (__memorder != __ATOMIC_RELAXED)
+        _Memory_barrier();
+}
+
+namespace detail {
+    template <class _Type, size_t _Size>
+    using _enable_if_sized_as = typename _CUDA_VSTD::enable_if<sizeof(_Type) == _Size, int>::type;
+}
+
+template <class _Type, detail::_enable_if_sized_as<_Type, 1> = 0>
+void __atomic_load_relaxed(const volatile _Type *__ptr, _Type *__ret) {
+    __int8 __tmp = __iso_volatile_load8((const volatile __int8 *)__ptr);
+    *__ret = reinterpret_cast<_Type&>(__tmp);
+}
+template <class _Type, detail::_enable_if_sized_as<_Type, 2> = 0>
+void __atomic_load_relaxed(const volatile _Type *__ptr, _Type *__ret) {
+    __int16 __tmp = __iso_volatile_load16((const volatile __int16 *)__ptr);
+    *__ret = reinterpret_cast<_Type&>(__tmp);
+}
+template <class _Type, detail::_enable_if_sized_as<_Type, 4> = 0>
+void __atomic_load_relaxed(const volatile _Type *__ptr, _Type *__ret) {
+    __int32 __tmp = __iso_volatile_load32((const volatile __int32 *)__ptr);
+    *__ret = reinterpret_cast<_Type&>(__tmp);
+}
+template <class _Type, detail::_enable_if_sized_as<_Type, 8> = 0>
+void __atomic_load_relaxed(const volatile _Type *__ptr, _Type *__ret) {
+    __int64 __tmp = __iso_volatile_load64((const volatile __int64 *)__ptr);
+    *__ret = reinterpret_cast<_Type&>(__tmp);
+}
+
+template <class _Type>
+void __atomic_load(const volatile _Type *__ptr, _Type *__ret, int __memorder) {
+    switch (__memorder) {
+        case __ATOMIC_SEQ_CST: _Memory_barrier(); _LIBCUDACXX_FALLTHROUGH();
+        case __ATOMIC_CONSUME:
+        case __ATOMIC_ACQUIRE: __atomic_load_relaxed(__ptr, __ret); _Compiler_or_memory_barrier(); break;
+        case __ATOMIC_RELAXED: __atomic_load_relaxed(__ptr, __ret); break;
+        default: assert(0);
+    }
+}
+
+template <class _Type, detail::_enable_if_sized_as<_Type, 1> = 0>
+void __atomic_store_relaxed(const volatile _Type *__ptr, _Type *__val) {
+    __int8 __tmp = reinterpret_cast<__int8&>(*__val);
+    __iso_volatile_store8((volatile __int8 *)__ptr, __tmp);
+}
+template <class _Type, detail::_enable_if_sized_as<_Type, 2> = 0>
+void __atomic_store_relaxed(const volatile _Type *__ptr, _Type *__val) {
+    __int16 __tmp = reinterpret_cast<__int16&>(*__val);
+    __iso_volatile_store16((volatile __int16 *)__ptr, __tmp);
+}
+template <class _Type, detail::_enable_if_sized_as<_Type, 4> = 0>
+void __atomic_store_relaxed(const volatile _Type *__ptr, _Type *__val) {
+    __int32 __tmp = reinterpret_cast<__int32&>(*__val);
+    __iso_volatile_store32((volatile __int32 *)__ptr, __tmp);
+}
+template <class _Type, detail::_enable_if_sized_as<_Type, 8> = 0>
+void __atomic_store_relaxed(const volatile _Type *__ptr, _Type *__val) {
+    __int64 __tmp = reinterpret_cast<__int64&>(*__val);
+    __iso_volatile_store64((volatile __int64 *)__ptr, __tmp);
+}
+
+template <class _Type>
+void __atomic_store(volatile _Type *__ptr, _Type *__val, int __memorder) {
+    switch (__memorder) {
+        case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_store_relaxed(__ptr, __val); break;
+        case __ATOMIC_SEQ_CST: _Memory_barrier(); _LIBCUDACXX_FALLTHROUGH();
+        case __ATOMIC_RELAXED: __atomic_store_relaxed(__ptr, __val); break;
+        default: assert(0);
+    }
+}
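All of the dispatch functions in this header follow the same recipe: a full _Memory_barrier() before the access for seq_cst, a compiler-or-memory barrier after the access for acquire, before it for release, and nothing extra for relaxed. As a concrete illustration, a sequentially consistent store on x64 comes out as a full fence followed by the plain volatile store (this sketch assumes an MSVC version where the __iso_volatile_* intrinsics are available on x64, which the header itself already requires):

    #include <intrin.h>

    // Hypothetical free-standing helper, not part of the header.
    void store_flag_seq_cst(volatile int* flag, int value) {
        __faststorefence();                                      // _Memory_barrier() on x64
        __iso_volatile_store32((volatile __int32*)flag, value);  // the relaxed store
    }

    int main() {
        int f = 0;
        store_flag_seq_cst(&f, 1);
        return f == 1 ? 0 : 1;
    }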
+
+template <class _Type, detail::_enable_if_sized_as<_Type, 1> = 0>
+bool __atomic_compare_exchange_relaxed(const volatile _Type *__ptr, _Type *__expected, const _Type *__desired) {
+    auto __tmp_desired = reinterpret_cast<const char&>(*__desired);
+    auto __tmp_expected = reinterpret_cast<char&>(*__expected);
+    auto const __old = _InterlockedCompareExchange8((volatile char *)__ptr, __tmp_desired, __tmp_expected);
+    if(__old == __tmp_expected)
+        return true;
+    *__expected = reinterpret_cast<const _Type&>(__old);
+    return false;
+}
+template <class _Type, detail::_enable_if_sized_as<_Type, 2> = 0>
+bool __atomic_compare_exchange_relaxed(const volatile _Type *__ptr, _Type *__expected, const _Type *__desired) {
+    auto __tmp_desired = reinterpret_cast<const short&>(*__desired);
+    auto __tmp_expected = reinterpret_cast<short&>(*__expected);
+    auto const __old = _InterlockedCompareExchange16((volatile short *)__ptr, __tmp_desired, __tmp_expected);
+    if(__old == __tmp_expected)
+        return true;
+    *__expected = reinterpret_cast<const _Type&>(__old);
+    return false;
+}
+template <class _Type, detail::_enable_if_sized_as<_Type, 4> = 0>
+bool __atomic_compare_exchange_relaxed(const volatile _Type *__ptr, _Type *__expected, const _Type *__desired) {
+    auto __tmp_desired = reinterpret_cast<const long&>(*__desired);
+    auto __tmp_expected = reinterpret_cast<long&>(*__expected);
+    auto const __old = _InterlockedCompareExchange((volatile long *)__ptr, __tmp_desired, __tmp_expected);
+    if(__old == __tmp_expected)
+        return true;
+    *__expected = reinterpret_cast<const _Type&>(__old);
+    return false;
+}
+template <class _Type, detail::_enable_if_sized_as<_Type, 8> = 0>
+bool __atomic_compare_exchange_relaxed(const volatile _Type *__ptr, _Type *__expected, const _Type *__desired) {
+    auto __tmp_desired = reinterpret_cast<const __int64&>(*__desired);
+    auto __tmp_expected = reinterpret_cast<__int64&>(*__expected);
+    auto const __old = _InterlockedCompareExchange64((volatile __int64 *)__ptr, __tmp_desired, __tmp_expected);
+    if(__old == __tmp_expected)
+        return true;
+    *__expected = reinterpret_cast<const _Type&>(__old);
+    return false;
+}
+template <class _Type>
+bool __atomic_compare_exchange(_Type volatile *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder) {
+    bool success = false;
+    switch (detail::__stronger_order_cuda(__success_memorder, __failure_memorder)) {
+        case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); success = __atomic_compare_exchange_relaxed(__ptr, __expected, __desired); break;
+        case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); _LIBCUDACXX_FALLTHROUGH();
+        case __ATOMIC_CONSUME:
+        case __ATOMIC_ACQUIRE: success = __atomic_compare_exchange_relaxed(__ptr, __expected, __desired); _Compiler_or_memory_barrier(); break;
+        case __ATOMIC_SEQ_CST: _Memory_barrier(); success = __atomic_compare_exchange_relaxed(__ptr, __expected, __desired); _Compiler_or_memory_barrier(); break;
+        case __ATOMIC_RELAXED: success = __atomic_compare_exchange_relaxed(__ptr, __expected, __desired); break;
+        default: assert(0);
+    }
+    return success;
+}
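As with the GCC builtin it mirrors, __atomic_compare_exchange above reports failure by returning false and writing the observed value back through __expected, so a caller's retry loop never has to reload by hand. A hypothetical host-side caller, written against the GCC builtin of the same name and signature so the shape of the loop is visible:

    template <class T>
    T fetch_increment(volatile T* counter) {
        T expected = __atomic_load_n(counter, __ATOMIC_RELAXED);  // first guess
        T desired;
        do {
            desired = expected + 1;
            // On failure, expected is refreshed with the value the CAS observed.
        } while (!__atomic_compare_exchange(counter, &expected, &desired,
                                            /*weak*/ false,
                                            __ATOMIC_SEQ_CST, __ATOMIC_RELAXED));
        return expected;  // value before the increment
    }

    int main() {
        volatile long c = 41;
        return (fetch_increment(&c) == 41 && c == 42) ? 0 : 1;
    }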
+
+template <class _Type, detail::_enable_if_sized_as<_Type, 1> = 0>
+void __atomic_exchange_relaxed(const volatile _Type *__ptr, const _Type *__val, _Type *__ret) {
+    auto const __old = _InterlockedExchange8((volatile char *)__ptr, reinterpret_cast<char const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template <class _Type, detail::_enable_if_sized_as<_Type, 2> = 0>
+void __atomic_exchange_relaxed(const volatile _Type *__ptr, const _Type *__val, _Type *__ret) {
+    auto const __old = _InterlockedExchange16((volatile short *)__ptr, reinterpret_cast<short const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template <class _Type, detail::_enable_if_sized_as<_Type, 4> = 0>
+void __atomic_exchange_relaxed(const volatile _Type *__ptr, const _Type *__val, _Type *__ret) {
+    auto const __old = _InterlockedExchange((volatile long *)__ptr, reinterpret_cast<long const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template <class _Type, detail::_enable_if_sized_as<_Type, 8> = 0>
+void __atomic_exchange_relaxed(const volatile _Type *__ptr, const _Type *__val, _Type *__ret) {
+    auto const __old = _InterlockedExchange64((volatile __int64 *)__ptr, reinterpret_cast<__int64 const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template <class _Type>
+void __atomic_exchange(_Type volatile *__ptr, const _Type *__val, _Type *__ret, int __memorder) {
+    switch (__memorder) {
+        case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_exchange_relaxed(__ptr, __val, __ret); break;
+        case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); _LIBCUDACXX_FALLTHROUGH();
+        case __ATOMIC_CONSUME:
+        case __ATOMIC_ACQUIRE: __atomic_exchange_relaxed(__ptr, __val, __ret); _Compiler_or_memory_barrier(); break;
+        case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_exchange_relaxed(__ptr, __val, __ret); _Compiler_or_memory_barrier(); break;
+        case __ATOMIC_RELAXED: __atomic_exchange_relaxed(__ptr, __val, __ret); break;
+        default: assert(0);
+    }
+}
+
+template <class _Type, class _Delta, detail::_enable_if_sized_as<_Type, 1> = 0>
+void __atomic_fetch_add_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) {
+    auto const __old = _InterlockedExchangeAdd8((volatile char *)__ptr, reinterpret_cast<char const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template <class _Type, class _Delta, detail::_enable_if_sized_as<_Type, 2> = 0>
+void __atomic_fetch_add_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) {
+    auto const __old = _InterlockedExchangeAdd16((volatile short *)__ptr, reinterpret_cast<short const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template <class _Type, class _Delta, detail::_enable_if_sized_as<_Type, 4> = 0>
+void __atomic_fetch_add_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) {
+    auto const __old = _InterlockedExchangeAdd((volatile long *)__ptr, reinterpret_cast<long const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template <class _Type, class _Delta, detail::_enable_if_sized_as<_Type, 8> = 0>
+void __atomic_fetch_add_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) {
+    auto const __old = _InterlockedExchangeAdd64((volatile __int64 *)__ptr, reinterpret_cast<__int64 const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template <class _Type, class _Delta>
+_Type __atomic_fetch_add(_Type volatile *__ptr, _Delta __val, int __memorder) {
+    alignas(_Type) unsigned char __buf[sizeof(_Type)] = {};
+    auto* __dest = reinterpret_cast<_Type*>(__buf);
+
+    switch (__memorder) {
+        case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_fetch_add_relaxed(__ptr, &__val, __dest); break;
+        case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); _LIBCUDACXX_FALLTHROUGH();
+        case __ATOMIC_CONSUME:
+        case __ATOMIC_ACQUIRE: __atomic_fetch_add_relaxed(__ptr, &__val, __dest); _Compiler_or_memory_barrier(); break;
+        case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_fetch_add_relaxed(__ptr, &__val, __dest); _Compiler_or_memory_barrier(); break;
+        case __ATOMIC_RELAXED: __atomic_fetch_add_relaxed(__ptr, &__val, __dest); break;
+        default: assert(0);
+    }
+    return *__dest;
+}
+template <class _Type, class _Delta>
+_Type __atomic_fetch_sub(_Type volatile *__ptr, _Delta __val, int __memorder) {
+    return __atomic_fetch_add(__ptr, 0-__val, __memorder);
+}
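__atomic_fetch_sub above is simply __atomic_fetch_add of the negated delta; writing 0-__val keeps the negation well defined for unsigned deltas, where it wraps modulo 2^N and the addition lands on the same result a true subtraction would. A tiny host-side check of that identity against the GCC builtin (values are illustrative):

    int main() {
        unsigned counter = 10u;
        unsigned delta   = 3u;
        // fetch_sub(&counter, delta) is spelled as fetch_add(&counter, 0 - delta).
        unsigned previous = __atomic_fetch_add(&counter, 0 - delta, __ATOMIC_RELAXED);
        return (previous == 10u && counter == 7u) ? 0 : 1;
    }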
+
+template <class _Type, class _Delta, detail::_enable_if_sized_as<_Type, 1> = 0>
+void __atomic_fetch_and_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) {
+    auto const __old = _InterlockedAnd8((volatile char *)__ptr, reinterpret_cast<char const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template <class _Type, class _Delta, detail::_enable_if_sized_as<_Type, 2> = 0>
+void __atomic_fetch_and_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) {
+    auto const __old = _InterlockedAnd16((volatile short *)__ptr, reinterpret_cast<short const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template <class _Type, class _Delta, detail::_enable_if_sized_as<_Type, 4> = 0>
+void __atomic_fetch_and_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) {
+    auto const __old = _InterlockedAnd((volatile long *)__ptr, reinterpret_cast<long const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template <class _Type, class _Delta, detail::_enable_if_sized_as<_Type, 8> = 0>
+void __atomic_fetch_and_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) {
+    auto const __old = _InterlockedAnd64((volatile __int64 *)__ptr, reinterpret_cast<__int64 const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template <class _Type, class _Delta>
+_Type __atomic_fetch_and(_Type volatile *__ptr, _Delta __val, int __memorder) {
+    alignas(_Type) unsigned char __buf[sizeof(_Type)] = {};
+    auto* __dest = reinterpret_cast<_Type*>(__buf);
+
+    switch (__memorder) {
+        case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_fetch_and_relaxed(__ptr, &__val, __dest); break;
+        case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); _LIBCUDACXX_FALLTHROUGH();
+        case __ATOMIC_CONSUME:
+        case __ATOMIC_ACQUIRE: __atomic_fetch_and_relaxed(__ptr, &__val, __dest); _Compiler_or_memory_barrier(); break;
+        case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_fetch_and_relaxed(__ptr, &__val, __dest); _Compiler_or_memory_barrier(); break;
+        case __ATOMIC_RELAXED: __atomic_fetch_and_relaxed(__ptr, &__val, __dest); break;
+        default: assert(0);
+    }
+    return *__dest;
+}
+
+template <class _Type, class _Delta, detail::_enable_if_sized_as<_Type, 1> = 0>
+void __atomic_fetch_xor_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) {
+    auto const __old = _InterlockedXor8((volatile char *)__ptr, reinterpret_cast<char const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template <class _Type, class _Delta, detail::_enable_if_sized_as<_Type, 2> = 0>
+void __atomic_fetch_xor_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) {
+    auto const __old = _InterlockedXor16((volatile short *)__ptr, reinterpret_cast<short const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template <class _Type, class _Delta, detail::_enable_if_sized_as<_Type, 4> = 0>
+void __atomic_fetch_xor_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) {
+    auto const __old = _InterlockedXor((volatile long *)__ptr, reinterpret_cast<long const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template <class _Type, class _Delta, detail::_enable_if_sized_as<_Type, 8> = 0>
+void __atomic_fetch_xor_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) {
+    auto const __old = _InterlockedXor64((volatile __int64 *)__ptr, reinterpret_cast<__int64 const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template <class _Type, class _Delta>
+_Type __atomic_fetch_xor(_Type volatile *__ptr, _Delta __val, int __memorder) {
+    alignas(_Type) unsigned char __buf[sizeof(_Type)] = {};
+    auto* __dest = reinterpret_cast<_Type*>(__buf);
+
+    switch (__memorder) {
+        case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_fetch_xor_relaxed(__ptr, &__val, __dest); break;
+        case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); _LIBCUDACXX_FALLTHROUGH();
+        case __ATOMIC_CONSUME:
+        case __ATOMIC_ACQUIRE: __atomic_fetch_xor_relaxed(__ptr, &__val, __dest); _Compiler_or_memory_barrier(); break;
+        case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_fetch_xor_relaxed(__ptr, &__val, __dest); _Compiler_or_memory_barrier(); break;
+        case __ATOMIC_RELAXED: __atomic_fetch_xor_relaxed(__ptr, &__val, __dest); break;
+        default: assert(0);
+    }
+    return *__dest;
+}
+
+template <class _Type, class _Delta, detail::_enable_if_sized_as<_Type, 1> = 0>
+void __atomic_fetch_or_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) {
+    auto const __old = _InterlockedOr8((volatile char *)__ptr, reinterpret_cast<char const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template <class _Type, class _Delta, detail::_enable_if_sized_as<_Type, 2> = 0>
+void __atomic_fetch_or_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) {
+    auto const __old = _InterlockedOr16((volatile short *)__ptr, reinterpret_cast<short const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template <class _Type, class _Delta, detail::_enable_if_sized_as<_Type, 4> = 0>
+void __atomic_fetch_or_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) {
+    auto const __old = _InterlockedOr((volatile long *)__ptr, reinterpret_cast<long const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template <class _Type, class _Delta, detail::_enable_if_sized_as<_Type, 8> = 0>
+void __atomic_fetch_or_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) {
+    auto const __old = _InterlockedOr64((volatile __int64 *)__ptr, reinterpret_cast<__int64 const&>(*__val));
+    *__ret = reinterpret_cast<_Type const&>(__old);
+}
+template <class _Type, class _Delta>
+_Type __atomic_fetch_or(_Type volatile *__ptr, _Delta __val, int __memorder) {
+    alignas(_Type) unsigned char __buf[sizeof(_Type)] = {};
+    auto* __dest = reinterpret_cast<_Type*>(__buf);
+
+    switch (__memorder) {
+        case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); __atomic_fetch_or_relaxed(__ptr, &__val, __dest); break;
+        case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); _LIBCUDACXX_FALLTHROUGH();
+        case __ATOMIC_CONSUME:
+        case __ATOMIC_ACQUIRE: __atomic_fetch_or_relaxed(__ptr, &__val, __dest); _Compiler_or_memory_barrier(); break;
+        case __ATOMIC_SEQ_CST: _Memory_barrier(); __atomic_fetch_or_relaxed(__ptr, &__val, __dest); _Compiler_or_memory_barrier(); break;
+        case __ATOMIC_RELAXED: __atomic_fetch_or_relaxed(__ptr, &__val, __dest); break;
+        default: assert(0);
+    }
+    return *__dest;
+}
+
+template <class _Type>
+_Type __atomic_load_n(const _Type volatile *__ptr, int __memorder) {
+    alignas(_Type) unsigned char __buf[sizeof(_Type)] = {};
+    auto* __dest = reinterpret_cast<_Type*>(__buf);
+
+    __atomic_load(__ptr, __dest, __memorder);
+    return *__dest;
+}
+
+template <class _Type>
+void __atomic_store_n(_Type volatile *__ptr, _Type __val, int __memorder) {
+    __atomic_store(__ptr, &__val, __memorder);
+}
+
+template <class _Type>
+bool __atomic_compare_exchange_n(_Type volatile *__ptr, _Type *__expected, _Type __desired, bool __weak, int __success_memorder, int __failure_memorder) {
+    return __atomic_compare_exchange(__ptr, __expected, &__desired, __weak, __success_memorder, __failure_memorder);
+}
+
+template <class _Type>
+_Type __atomic_exchange_n(_Type volatile *__ptr, _Type __val, int __memorder) {
+    alignas(_Type) unsigned char __buf[sizeof(_Type)] = {};
+    auto* __dest = reinterpret_cast<_Type*>(__buf);
+
+    __atomic_exchange(__ptr, &__val, __dest, __memorder);
+    return *__dest;
+}
+
+template <class _Type, class _Delta>
+_Type __host__ __atomic_fetch_max(_Type volatile *__ptr, _Delta __val, int __memorder) {
+    _Type __expected = __atomic_load_n(__ptr, __ATOMIC_RELAXED);
+    _Type __desired = __expected > __val ? __expected : __val;
+    while(__desired == __val &&
+          !__atomic_compare_exchange_n(__ptr, &__expected, __desired, true, __memorder, __memorder)) {
+        __desired = __expected > __val ? __expected : __val;
+    }
+    return __expected;
+}
+
+template <class _Type, class _Delta>
+_Type __host__ __atomic_fetch_min(_Type volatile *__ptr, _Delta __val, int __memorder) {
+    _Type __expected = __atomic_load_n(__ptr, __ATOMIC_RELAXED);
+    _Type __desired = __expected < __val ? __expected : __val;
+    while(__desired == __val &&
+          !__atomic_compare_exchange_n(__ptr, &__expected, __desired, true, __memorder, __memorder)) {
+        __desired = __expected < __val ? __expected : __val;
+    }
+    return __expected;
+}
+
+_LIBCUDACXX_END_NAMESPACE_CUDA
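Finally, a short usage sketch of the *_n convenience wrappers defined at the end of the header; it is written against the GCC builtins of the same names so it can be compiled and exercised off Windows as well, and the variable names are illustrative:

    int main() {
        long value = 5;
        long prev  = __atomic_exchange_n(&value, 9, __ATOMIC_ACQ_REL);   // prev == 5
        long seen  = __atomic_load_n(&value, __ATOMIC_ACQUIRE);          // seen == 9
        long want  = 9;                                                  // expected value for the CAS
        bool swapped = __atomic_compare_exchange_n(&value, &want, 11, /*weak*/ false,
                                                   __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
        return (prev == 5 && seen == 9 && swapped && value == 11) ? 0 : 1;
    }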