From 9c3b6af574495c046ece89f65eb4af0e788fafb8 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Tue, 30 Mar 2021 18:42:20 -0700 Subject: [PATCH 01/34] Remove all uses of non-compliant __CUDA_ARCH__/preprocessor macros from libcudacxx --- include/cuda/std/barrier | 4 ++-- include/cuda/std/detail/__atomic | 2 +- include/cuda/std/latch | 2 +- include/cuda/std/semaphore | 2 +- libcxx/include/__threading_support | 15 ++++++++------- libcxx/include/atomic | 6 +++--- libcxx/include/cmath | 24 +++--------------------- 7 files changed, 19 insertions(+), 36 deletions(-) diff --git a/include/cuda/std/barrier b/include/cuda/std/barrier index e7af6f138c..8d75b4b763 100644 --- a/include/cuda/std/barrier +++ b/include/cuda/std/barrier @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700 +#if defined(__CUDA_MINIMUM_ARCH__) && __CUDA_MINIMUM_ARCH__ < 700 # error "CUDA synchronization primitives are only supported for sm_70 and up." #endif @@ -311,7 +311,7 @@ inline void __strided_memcpy(char * __destination, char const * __source, std::s } } -#if __CUDA_ARCH__ >= 800 +#if __CUDA_MINIMUM_ARCH__ >= 800 template 16)> struct __memcpy_async_impl { __device__ static inline bool __copy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride) { diff --git a/include/cuda/std/detail/__atomic b/include/cuda/std/detail/__atomic index cdae5b1e50..5995b9f69b 100644 --- a/include/cuda/std/detail/__atomic +++ b/include/cuda/std/detail/__atomic @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// -#if defined(__CUDA_ARCH__) && ((!defined(_MSC_VER) && __CUDA_ARCH__ < 600) || (defined(_MSC_VER) && __CUDA_ARCH__ < 700)) +#if defined(__CUDA_MINIMUM_ARCH__) && ((!defined(_MSC_VER) && __CUDA_MINIMUM_ARCH__ < 600) || (defined(_MSC_VER) && __CUDA_MINIMUM_ARCH__ < 700)) # error "CUDA atomics are only supported for sm_60 and up on *nix and sm_70 and up on Windows." #endif diff --git a/include/cuda/std/latch b/include/cuda/std/latch index 0bb4c4f27a..ba27b60b8d 100644 --- a/include/cuda/std/latch +++ b/include/cuda/std/latch @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700 +#if defined(__CUDA_MINIMUM_ARCH__) && __CUDA_MINIMUM_ARCH__ < 700 # error "CUDA synchronization primitives are only supported for sm_70 and up." #endif diff --git a/include/cuda/std/semaphore b/include/cuda/std/semaphore index 7a02b4e332..45a9b8beb7 100644 --- a/include/cuda/std/semaphore +++ b/include/cuda/std/semaphore @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700 +#if defined(__CUDA_MINIMUM_ARCH__) && __CUDA_MINIMUM_ARCH__ < 700 # error "CUDA synchronization primitives are only supported for sm_70 and up." 
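The guards above and the matching ones in <cuda/std/latch> and <cuda/std/semaphore> below now key off __CUDA_MINIMUM_ARCH__, which is meant to reflect the lowest SM architecture a translation unit targets (and so stays meaningful under single-pass compilers, unlike the per-pass __CUDA_ARCH__ macro). As a hedged illustration of what the guard does for a user -- the demo.cu file and exact nvcc invocations below are hypothetical, not part of the series:

    // demo.cu -- illustration only, not part of the patch series.
    // With the guard in place, targeting a pre-Volta architecture fails early at
    // preprocessing time with a clear message instead of deep inside the library:
    //
    //   nvcc -std=c++14 -arch=sm_60 -c demo.cu   // error: "...only supported for sm_70 and up."
    //   nvcc -std=c++14 -arch=sm_70 -c demo.cu   // compiles
    #include <cuda/std/latch>

    __global__ void wait_at_gate(cuda::std::latch* gate)
    {
        gate->arrive_and_wait();
    }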
#endif diff --git a/libcxx/include/__threading_support b/libcxx/include/__threading_support index eb26e2c15b..a63e1596cf 100644 --- a/libcxx/include/__threading_support +++ b/libcxx/include/__threading_support @@ -75,17 +75,18 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD _LIBCUDACXX_INLINE_VISIBILITY inline void __libcpp_thread_yield_processor() { -#if defined(__CUDA_ARCH__) - ; -#elif defined(__aarch64__) - asm volatile ("yield" :::); + NV_DISPATCH_TARGET( + NV_IS_HOST, +#if defined(__aarch64__) + (asm volatile ("yield" :::);) #elif defined(__x86_64__) - asm volatile ("pause" :::); + (asm volatile ("pause" :::);) #elif defined (__powerpc__) - asm volatile ("or 27,27,27":::); + (asm volatile ("or 27,27,27":::);) #else - ; + (;) #endif + ) } _LIBCUDACXX_THREAD_ABI_VISIBILITY diff --git a/libcxx/include/atomic b/libcxx/include/atomic index be235a64dc..6f261ce7f6 100644 --- a/libcxx/include/atomic +++ b/libcxx/include/atomic @@ -1591,7 +1591,7 @@ _LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_try_wait_slow(__cxx_atomic_impl< template struct __atomic_wait_and_notify_supported -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700 +#if defined(__CUDA_MINIMUM_ARCH__) && __CUDA_MINIMUM_ARCH__ < 700 : false_type #else : true_type @@ -2597,7 +2597,7 @@ typedef struct atomic_flag void clear(memory_order __m = memory_order_seq_cst) _NOEXCEPT {__cxx_atomic_store(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), __m);} -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +#if !defined(__CUDA_MINIMUM_ARCH__) || __CUDA_MINIMUM_ARCH__ >= 700 _LIBCUDACXX_INLINE_VISIBILITY void wait(bool __v, memory_order __m = memory_order_seq_cst) const volatile _NOEXCEPT {__cxx_atomic_wait(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(__v), __m);} @@ -2726,7 +2726,7 @@ atomic_flag_clear_explicit(atomic_flag* __o, memory_order __m) _NOEXCEPT __o->clear(__m); } -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +#if !defined(__CUDA_MINIMUM_ARCH__) || __CUDA_MINIMUM_ARCH__ >= 700 inline _LIBCUDACXX_INLINE_VISIBILITY void diff --git a/libcxx/include/cmath b/libcxx/include/cmath index 99a9b055a5..0e44738eb6 100644 --- a/libcxx/include/cmath +++ b/libcxx/include/cmath @@ -596,13 +596,7 @@ _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR typename enable_if::value, bool>::type __libcpp_isnan_or_builtin(_A1 __lcpp_x) _NOEXCEPT { -#if defined(__CUDA_ARCH__) - return __isnan(__lcpp_x); -#elif __has_builtin(__builtin_isnan) - return __builtin_isnan(__lcpp_x); -#else - return isnan(__lcpp_x); -#endif + return isnan(static_cast(__lcpp_x)); } template @@ -618,13 +612,7 @@ _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR typename enable_if::value, bool>::type __libcpp_isinf_or_builtin(_A1 __lcpp_x) _NOEXCEPT { -#if defined(__CUDA_ARCH__) - return __isinf(__lcpp_x); -#elif __has_builtin(__builtin_isinf) - return __builtin_isinf(__lcpp_x); -#else - return isinf(__lcpp_x); -#endif + return isinf(static_cast(__lcpp_x)); } template @@ -640,13 +628,7 @@ _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR typename enable_if::value, bool>::type __libcpp_isfinite_or_builtin(_A1 __lcpp_x) _NOEXCEPT { -#if defined(__CUDA_ARCH__) - return __finite(__lcpp_x); -#elif __has_builtin(__builtin_isfinite) - return __builtin_isfinite(__lcpp_x); -#else - return isfinite(__lcpp_x); -#endif + return isfinite(static_cast(__lcpp_x)); } template From fe05d4c0086d4ffbc4f97406266b5f6812fd8d01 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Tue, 30 Mar 2021 20:11:59 -0700 Subject: [PATCH 02/34] Fix an issue in NVRTC tests --- .upstream-tests/test/support/cuda_space_selector.h | 14 
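The cuda_space_selector.h hunk that follows (patch 02) splits the test helper's backing storage into a __shared__ buffer for device code and a function-local static for host code, compiling the host half out under __CUDACC_RTC__ because NVRTC performs device-only compilation. A reduced sketch of the same pattern; the wrapper name and buffer size here are illustrative, not the test's actual layout:

    // Sketch only: mirrors the shape of the change below with hypothetical names.
    template <class T>
    struct static_storage_provider {
        __device__ char* device_static_storage() {
            // Per-block shared storage; valid under NVRTC's device-only compilation.
            __shared__ alignas(T) char storage[sizeof(T)];
            return storage;
        }
    #if !defined(__CUDACC_RTC__)
        // NVRTC never sees host code, so host-only statics must be guarded out.
        __host__ char* host_static_storage() {
            alignas(T) static char storage[sizeof(T)];
            return storage;
        }
    #endif
    };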
++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/.upstream-tests/test/support/cuda_space_selector.h b/.upstream-tests/test/support/cuda_space_selector.h index e3b68c7ea2..026a46c268 100644 --- a/.upstream-tests/test/support/cuda_space_selector.h +++ b/.upstream-tests/test/support/cuda_space_selector.h @@ -36,6 +36,20 @@ struct malloc_memory_provider { static const constexpr cuda::std::size_t shared_offset = prefix_size + sizeof(T *); private: + + __device__ char* device_static_storage() { + __shared__ alignas(T*) char storage[shared_offset]; + return storage; + } + + +#if !defined(__CUDACC_RTC__) + __host__ char* host_static_storage() { + alignas(T*) static char storage[shared_offset]; + return storage; + } +#endif + __host__ __device__ T *& get_pointer() { alignas(T*) From 552207867febb196d6063e5fb964a52ab63ca6c2 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Tue, 13 Apr 2021 18:03:26 -0700 Subject: [PATCH 03/34] Move files and implementation for atomic refactor --- include/cuda/std/atomic | 53 ++- libcxx/include/__config | 7 + libcxx/include/atomic | 440 +----------------- libcxx/include/support/atomic/atomic_c11.h | 171 +++++++ .../include/support/atomic/atomic_cuda.h | 303 ++++++------ .../support/atomic/atomic_cuda_derived.h | 6 - .../support/atomic/atomic_cuda_generated.h | 6 - libcxx/include/support/atomic/atomic_gcc.h | 251 ++++++++++ .../support/{win32 => atomic}/atomic_msvc.h | 0 9 files changed, 646 insertions(+), 591 deletions(-) create mode 100644 libcxx/include/support/atomic/atomic_c11.h rename include/cuda/std/detail/__atomic => libcxx/include/support/atomic/atomic_cuda.h (67%) rename include/cuda/std/detail/__atomic_derived => libcxx/include/support/atomic/atomic_cuda_derived.h (98%) rename include/cuda/std/detail/__atomic_generated => libcxx/include/support/atomic/atomic_cuda_generated.h (99%) create mode 100644 libcxx/include/support/atomic/atomic_gcc.h rename libcxx/include/support/{win32 => atomic}/atomic_msvc.h (100%) diff --git a/include/cuda/std/atomic b/include/cuda/std/atomic index 3b07b21abd..f09c189c5f 100644 --- a/include/cuda/std/atomic +++ b/include/cuda/std/atomic @@ -46,16 +46,22 @@ #include "detail/__pragma_push" -#include "detail/__atomic" #include "detail/__threading_support" -#undef _LIBCUDACXX_HAS_GCC_ATOMIC_IMP -#undef _LIBCUDACXX_HAS_C_ATOMIC_IMP - #include "detail/libcxx/include/atomic" _LIBCUDACXX_BEGIN_NAMESPACE_CUDA +using std::detail::thread_scope; + +namespace detail { +using std::detail::__thread_scope_block_tag; +using std::detail::__thread_scope_device_tag; +using std::detail::__thread_scope_system_tag; +using std::detail::__atomic_signal_fence_cuda; +using std::detail::__atomic_thread_fence_cuda; +} + using memory_order = std::memory_order; constexpr memory_order memory_order_relaxed = std::memory_order_relaxed; @@ -67,7 +73,7 @@ constexpr memory_order memory_order_seq_cst = std::memory_order_seq_cst; // atomic -template +template struct atomic : public std::__atomic_base<_Tp, _Sco> { @@ -159,23 +165,26 @@ struct atomic<_Tp*, _Sco> _Tp* operator-=(ptrdiff_t __op) noexcept {return fetch_sub(__op) - __op;} }; -inline __host__ __device__ void atomic_thread_fence(memory_order __m, thread_scope _Scope = thread_scope_system) { -#ifdef __CUDA_ARCH__ - switch(_Scope) { - case thread_scope_system: - detail::__atomic_thread_fence_cuda((int)__m, detail::__thread_scope_system_tag()); - break; - case thread_scope_device: - detail::__atomic_thread_fence_cuda((int)__m, detail::__thread_scope_device_tag()); - break; - case 
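For the scope-parameterised cuda::atomic declared above, a short usage sketch (a hypothetical kernel, not taken from the series); the second template argument bounds how far an operation must be made visible, here only within the launching thread block:

    #include <cuda/atomic>

    // Hypothetical kernel: block-scoped participation counter in shared memory.
    __global__ void count_participants(int* out)
    {
        __shared__ cuda::atomic<int, cuda::thread_scope_block> counter;
        if (threadIdx.x == 0)
            counter.store(0, cuda::memory_order_relaxed);
        __syncthreads();

        counter.fetch_add(1, cuda::memory_order_relaxed);    // visible block-wide
        __syncthreads();

        if (threadIdx.x == 0)
            *out = counter.load(cuda::memory_order_relaxed); // == blockDim.x
    }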
thread_scope_block: - detail::__atomic_thread_fence_cuda((int)__m, detail::__thread_scope_block_tag()); - break; - } -#else - (void) _Scope; - ::std::atomic_thread_fence((::std::memory_order)__m); -#endif +inline __host__ __device__ void atomic_thread_fence(memory_order __m, thread_scope _Scope = thread_scope::thread_scope_system) { + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + switch(_Scope) { + case thread_scope::thread_scope_system: + detail::__atomic_thread_fence_cuda((int)__m, detail::__thread_scope_system_tag()); + break; + case thread_scope::thread_scope_device: + detail::__atomic_thread_fence_cuda((int)__m, detail::__thread_scope_device_tag()); + break; + case thread_scope::thread_scope_block: + detail::__atomic_thread_fence_cuda((int)__m, detail::__thread_scope_block_tag()); + break; + } + ), + NV_IS_HOST, ( + (void) _Scope; + ::std::atomic_thread_fence((::std::memory_order)__m); + ) + ) } inline __host__ __device__ void atomic_signal_fence(memory_order __m) { diff --git a/libcxx/include/__config b/libcxx/include/__config index 2965dea640..34e7f9d54d 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -1600,6 +1600,13 @@ _LIBCUDACXX_FUNC_VIS extern "C" void __sanitizer_annotate_contiguous_container( # define _LIBCUDACXX_HAS_C_ATOMIC_IMP #elif defined(_LIBCUDACXX_COMPILER_GCC) # define _LIBCUDACXX_HAS_GCC_ATOMIC_IMP +#elif defined(_LIBCUDACXX_COMPILER_MSVC) +# define _LIBCUDACXX_HAS_MSVC_ATOMIC_IMPL +#endif + +// CUDA Atomics supersede host atomics in order to insert the host/device dispatch layer +#if defined(_LIBCUDACXX_COMPILER_NVCC) || defined(_LIBCUDACXX_COMPILER_PGI) +# define _LIBCUDACXX_HAS_CUDA_ATOMIC_IMPL #endif #if (!defined(_LIBCUDACXX_HAS_C_ATOMIC_IMP) && \ diff --git a/libcxx/include/atomic b/libcxx/include/atomic index 6f261ce7f6..a58404ebd6 100644 --- a/libcxx/include/atomic +++ b/libcxx/include/atomic @@ -556,10 +556,6 @@ void atomic_signal_fence(memory_order m) noexcept; #include #include <__pragma_push> -#if defined(_LIBCUDACXX_COMPILER_MSVC) -#include "support/win32/atomic_msvc.h" -#endif - #endif //__cuda_std__ #if defined(_LIBCUDACXX_USE_PRAGMA_GCC_SYSTEM_HEADER) @@ -674,431 +670,23 @@ __cxx_atomic_assign_volatile(_Tp volatile& __a_value, _Tv volatile const& __val) #endif -#if defined(_LIBCUDACXX_HAS_GCC_ATOMIC_IMP) - -template -struct __cxx_atomic_base_impl { - - _LIBCUDACXX_INLINE_VISIBILITY -#ifndef _LIBCUDACXX_CXX03_LANG - __cxx_atomic_base_impl() _NOEXCEPT = default; -#else - __cxx_atomic_base_impl() _NOEXCEPT : __a_value() {} -#endif // _LIBCUDACXX_CXX03_LANG - _LIBCUDACXX_CONSTEXPR explicit __cxx_atomic_base_impl(_Tp value) _NOEXCEPT - : __a_value(value) {} - _ALIGNAS(sizeof(_Tp)) _Tp __a_value; -}; - -_LIBCUDACXX_INLINE_VISIBILITY inline _LIBCUDACXX_CONSTEXPR int __to_gcc_order(memory_order __order) { - // Avoid switch statement to make this a constexpr. - return __order == memory_order_relaxed ? __ATOMIC_RELAXED: - (__order == memory_order_acquire ? __ATOMIC_ACQUIRE: - (__order == memory_order_release ? __ATOMIC_RELEASE: - (__order == memory_order_seq_cst ? __ATOMIC_SEQ_CST: - (__order == memory_order_acq_rel ? __ATOMIC_ACQ_REL: - __ATOMIC_CONSUME)))); -} - -_LIBCUDACXX_INLINE_VISIBILITY inline _LIBCUDACXX_CONSTEXPR int __to_gcc_failure_order(memory_order __order) { - // Avoid switch statement to make this a constexpr. - return __order == memory_order_relaxed ? __ATOMIC_RELAXED: - (__order == memory_order_acquire ? __ATOMIC_ACQUIRE: - (__order == memory_order_release ? __ATOMIC_RELAXED: - (__order == memory_order_seq_cst ? 
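The atomic_thread_fence/atomic_signal_fence rewrite above is the shape the rest of the series follows: a single __host__ __device__ entry point whose device and host bodies are selected by the <nv/target> dispatch macros instead of #ifdef __CUDA_ARCH__. A minimal free-standing sketch of the idiom; the particular fence choices are illustrative, not the library's exact mapping:

    #include <atomic>
    #include <nv/target>   // NV_DISPATCH_TARGET, NV_IS_DEVICE, NV_IS_HOST

    __host__ __device__ inline void seq_cst_fence()
    {
        NV_DISPATCH_TARGET(
            NV_IS_DEVICE, (
                __threadfence_system();   // device path: system-wide memory fence
            ),
            NV_IS_HOST, (
                ::std::atomic_thread_fence(::std::memory_order_seq_cst);   // host path
            )
        )
    }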
__ATOMIC_SEQ_CST: - (__order == memory_order_acq_rel ? __ATOMIC_ACQUIRE: - __ATOMIC_CONSUME)))); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -void __cxx_atomic_init(volatile __cxx_atomic_base_impl<_Tp>* __a, _Tp __val) { - __cxx_atomic_assign_volatile(__a->__a_value, __val); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -void __cxx_atomic_init(__cxx_atomic_base_impl<_Tp>* __a, _Tp __val) { - __a->__a_value = __val; -} - -_LIBCUDACXX_INLINE_VISIBILITY inline -void __cxx_atomic_thread_fence(memory_order __order) { - __atomic_thread_fence(__to_gcc_order(__order)); -} - -_LIBCUDACXX_INLINE_VISIBILITY inline -void __cxx_atomic_signal_fence(memory_order __order) { - __atomic_signal_fence(__to_gcc_order(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -void __cxx_atomic_store(volatile __cxx_atomic_base_impl<_Tp>* __a, _Tp __val, - memory_order __order) { - __atomic_store(&__a->__a_value, &__val, - __to_gcc_order(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -void __cxx_atomic_store(__cxx_atomic_base_impl<_Tp>* __a, _Tp __val, - memory_order __order) { - __atomic_store(&__a->__a_value, &__val, - __to_gcc_order(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_load(const volatile __cxx_atomic_base_impl<_Tp>* __a, - memory_order __order) { - _Tp __ret; - __atomic_load(&__a->__a_value, &__ret, - __to_gcc_order(__order)); - return __ret; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_load(const __cxx_atomic_base_impl<_Tp>* __a, memory_order __order) { - _Tp __ret; - __atomic_load(&__a->__a_value, &__ret, - __to_gcc_order(__order)); - return __ret; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_exchange(volatile __cxx_atomic_base_impl<_Tp>* __a, - _Tp __value, memory_order __order) { - _Tp __ret; - __atomic_exchange(&__a->__a_value, &__value, &__ret, - __to_gcc_order(__order)); - return __ret; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_exchange(__cxx_atomic_base_impl<_Tp>* __a, _Tp __value, - memory_order __order) { - _Tp __ret; - __atomic_exchange(&__a->__a_value, &__value, &__ret, - __to_gcc_order(__order)); - return __ret; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -bool __cxx_atomic_compare_exchange_strong( - volatile __cxx_atomic_base_impl<_Tp>* __a, _Tp* __expected, _Tp __value, - memory_order __success, memory_order __failure) { - return __atomic_compare_exchange(&__a->__a_value, __expected, &__value, - false, - __to_gcc_order(__success), - __to_gcc_failure_order(__failure)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -bool __cxx_atomic_compare_exchange_strong( - __cxx_atomic_base_impl<_Tp>* __a, _Tp* __expected, _Tp __value, memory_order __success, - memory_order __failure) { - return __atomic_compare_exchange(&__a->__a_value, __expected, &__value, - false, - __to_gcc_order(__success), - __to_gcc_failure_order(__failure)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -bool __cxx_atomic_compare_exchange_weak( - volatile __cxx_atomic_base_impl<_Tp>* __a, _Tp* __expected, _Tp __value, - memory_order __success, memory_order __failure) { - return __atomic_compare_exchange(&__a->__a_value, __expected, &__value, - true, - __to_gcc_order(__success), - __to_gcc_failure_order(__failure)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -bool __cxx_atomic_compare_exchange_weak( - __cxx_atomic_base_impl<_Tp>* __a, _Tp* __expected, _Tp __value, memory_order __success, - memory_order __failure) { - return __atomic_compare_exchange(&__a->__a_value, __expected, &__value, - true, - 
__to_gcc_order(__success), - __to_gcc_failure_order(__failure)); -} - -template -struct __skip_amt { enum {value = 1}; }; - -template -struct __skip_amt<_Tp*> { enum {value = sizeof(_Tp)}; }; - -// FIXME: Haven't figured out what the spec says about using arrays with -// atomic_fetch_add. Force a failure rather than creating bad behavior. -template -struct __skip_amt<_Tp[]> { }; -template -struct __skip_amt<_Tp[n]> { }; - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_add(volatile __cxx_atomic_base_impl<_Tp>* __a, - _Td __delta, memory_order __order) { - return __atomic_fetch_add(&__a->__a_value, __delta * __skip_amt<_Tp>::value, - __to_gcc_order(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_add(__cxx_atomic_base_impl<_Tp>* __a, _Td __delta, - memory_order __order) { - return __atomic_fetch_add(&__a->__a_value, __delta * __skip_amt<_Tp>::value, - __to_gcc_order(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_sub(volatile __cxx_atomic_base_impl<_Tp>* __a, - _Td __delta, memory_order __order) { - return __atomic_fetch_sub(&__a->__a_value, __delta * __skip_amt<_Tp>::value, - __to_gcc_order(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_sub(__cxx_atomic_base_impl<_Tp>* __a, _Td __delta, - memory_order __order) { - return __atomic_fetch_sub(&__a->__a_value, __delta * __skip_amt<_Tp>::value, - __to_gcc_order(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_and(volatile __cxx_atomic_base_impl<_Tp>* __a, - _Tp __pattern, memory_order __order) { - return __atomic_fetch_and(&__a->__a_value, __pattern, - __to_gcc_order(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_and(__cxx_atomic_base_impl<_Tp>* __a, - _Tp __pattern, memory_order __order) { - return __atomic_fetch_and(&__a->__a_value, __pattern, - __to_gcc_order(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_or(volatile __cxx_atomic_base_impl<_Tp>* __a, - _Tp __pattern, memory_order __order) { - return __atomic_fetch_or(&__a->__a_value, __pattern, - __to_gcc_order(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_or(__cxx_atomic_base_impl<_Tp>* __a, _Tp __pattern, - memory_order __order) { - return __atomic_fetch_or(&__a->__a_value, __pattern, - __to_gcc_order(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_xor(volatile __cxx_atomic_base_impl<_Tp>* __a, - _Tp __pattern, memory_order __order) { - return __atomic_fetch_xor(&__a->__a_value, __pattern, - __to_gcc_order(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_xor(__cxx_atomic_base_impl<_Tp>* __a, _Tp __pattern, - memory_order __order) { - return __atomic_fetch_xor(&__a->__a_value, __pattern, - __to_gcc_order(__order)); -} - -#define __cxx_atomic_is_lock_free(__s) __atomic_is_lock_free(__s, 0) - +// Headers are wrapped like so: (cuda::std::|std::)detail +namespace detail { +#if defined(_LIBCUDACXX_HAS_CUDA_ATOMIC_IMPL) +# include "support/atomic/atomic_cuda.h" +#elif defined(_LIBCUDACXX_HAS_MSVC_ATOMIC_IMPL) +# include "support/atomic/atomic_msvc.h" +#elif defined(_LIBCUDACXX_HAS_GCC_ATOMIC_IMP) +# include "support/atomic/atomic_gcc.h" #elif defined(_LIBCUDACXX_HAS_C_ATOMIC_IMP) - -template -struct __cxx_atomic_base_impl { - - _LIBCUDACXX_INLINE_VISIBILITY -#ifndef _LIBCUDACXX_CXX03_LANG - __cxx_atomic_base_impl() _NOEXCEPT = default; -#else - __cxx_atomic_base_impl() _NOEXCEPT : 
__a_value() {} -#endif // _LIBCUDACXX_CXX03_LANG - _LIBCUDACXX_CONSTEXPR explicit __cxx_atomic_base_impl(_Tp value) _NOEXCEPT - : __a_value(value) {} - _LIBCUDACXX_DISABLE_EXTENSION_WARNING _Atomic(_Tp) __a_value; -}; - -#define __cxx_atomic_is_lock_free(__s) __c11_atomic_is_lock_free(__s) - -_LIBCUDACXX_INLINE_VISIBILITY inline -void __cxx_atomic_thread_fence(memory_order __order) _NOEXCEPT { - __c11_atomic_thread_fence(static_cast<__memory_order_underlying_t>(__order)); -} - -_LIBCUDACXX_INLINE_VISIBILITY inline -void __cxx_atomic_signal_fence(memory_order __order) _NOEXCEPT { - __c11_atomic_signal_fence(static_cast<__memory_order_underlying_t>(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -void __cxx_atomic_init(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __val) _NOEXCEPT { - __c11_atomic_init(&__a->__a_value, __val); -} -template -_LIBCUDACXX_INLINE_VISIBILITY -void __cxx_atomic_init(__cxx_atomic_base_impl<_Tp> * __a, _Tp __val) _NOEXCEPT { - __c11_atomic_init(&__a->__a_value, __val); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -void __cxx_atomic_store(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __val, memory_order __order) _NOEXCEPT { - __c11_atomic_store(&__a->__a_value, __val, static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY -void __cxx_atomic_store(__cxx_atomic_base_impl<_Tp> * __a, _Tp __val, memory_order __order) _NOEXCEPT { - __c11_atomic_store(&__a->__a_value, __val, static_cast<__memory_order_underlying_t>(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_load(__cxx_atomic_base_impl<_Tp> const volatile* __a, memory_order __order) _NOEXCEPT { - using __ptr_type = typename remove_const__a_value)>::type*; - return __c11_atomic_load(const_cast<__ptr_type>(&__a->__a_value), static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_load(__cxx_atomic_base_impl<_Tp> const* __a, memory_order __order) _NOEXCEPT { - using __ptr_type = typename remove_const__a_value)>::type*; - return __c11_atomic_load(const_cast<__ptr_type>(&__a->__a_value), static_cast<__memory_order_underlying_t>(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_exchange(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __value, memory_order __order) _NOEXCEPT { - return __c11_atomic_exchange(&__a->__a_value, __value, static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_exchange(__cxx_atomic_base_impl<_Tp> * __a, _Tp __value, memory_order __order) _NOEXCEPT { - return __c11_atomic_exchange(&__a->__a_value, __value, static_cast<__memory_order_underlying_t>(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -bool __cxx_atomic_compare_exchange_strong(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp* __expected, _Tp __value, memory_order __success, memory_order __failure) _NOEXCEPT { - return __c11_atomic_compare_exchange_strong(&__a->__a_value, __expected, __value, static_cast<__memory_order_underlying_t>(__success), static_cast<__memory_order_underlying_t>(__failure)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY -bool __cxx_atomic_compare_exchange_strong(__cxx_atomic_base_impl<_Tp> * __a, _Tp* __expected, _Tp __value, memory_order __success, memory_order __failure) _NOEXCEPT { - return __c11_atomic_compare_exchange_strong(&__a->__a_value, __expected, __value, static_cast<__memory_order_underlying_t>(__success), static_cast<__memory_order_underlying_t>(__failure)); -} - -template 
-_LIBCUDACXX_INLINE_VISIBILITY -bool __cxx_atomic_compare_exchange_weak(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp* __expected, _Tp __value, memory_order __success, memory_order __failure) _NOEXCEPT { - return __c11_atomic_compare_exchange_weak(&__a->__a_value, __expected, __value, static_cast<__memory_order_underlying_t>(__success), static_cast<__memory_order_underlying_t>(__failure)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY -bool __cxx_atomic_compare_exchange_weak(__cxx_atomic_base_impl<_Tp> * __a, _Tp* __expected, _Tp __value, memory_order __success, memory_order __failure) _NOEXCEPT { - return __c11_atomic_compare_exchange_weak(&__a->__a_value, __expected, __value, static_cast<__memory_order_underlying_t>(__success), static_cast<__memory_order_underlying_t>(__failure)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_add(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __delta, memory_order __order) _NOEXCEPT { - return __c11_atomic_fetch_add(&__a->__a_value, __delta, static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_add(__cxx_atomic_base_impl<_Tp> * __a, _Tp __delta, memory_order __order) _NOEXCEPT { - return __c11_atomic_fetch_add(&__a->__a_value, __delta, static_cast<__memory_order_underlying_t>(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp* __cxx_atomic_fetch_add(__cxx_atomic_base_impl<_Tp*> volatile* __a, ptrdiff_t __delta, memory_order __order) _NOEXCEPT { - return __c11_atomic_fetch_add(&__a->__a_value, __delta, static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp* __cxx_atomic_fetch_add(__cxx_atomic_base_impl<_Tp*> * __a, ptrdiff_t __delta, memory_order __order) _NOEXCEPT { - return __c11_atomic_fetch_add(&__a->__a_value, __delta, static_cast<__memory_order_underlying_t>(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_sub(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __delta, memory_order __order) _NOEXCEPT { - return __c11_atomic_fetch_sub(&__a->__a_value, __delta, static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_sub(__cxx_atomic_base_impl<_Tp> * __a, _Tp __delta, memory_order __order) _NOEXCEPT { - return __c11_atomic_fetch_sub(&__a->__a_value, __delta, static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp* __cxx_atomic_fetch_sub(__cxx_atomic_base_impl<_Tp*> volatile* __a, ptrdiff_t __delta, memory_order __order) _NOEXCEPT { - return __c11_atomic_fetch_sub(&__a->__a_value, __delta, static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp* __cxx_atomic_fetch_sub(__cxx_atomic_base_impl<_Tp*> * __a, ptrdiff_t __delta, memory_order __order) _NOEXCEPT { - return __c11_atomic_fetch_sub(&__a->__a_value, __delta, static_cast<__memory_order_underlying_t>(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_and(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __pattern, memory_order __order) _NOEXCEPT { - return __c11_atomic_fetch_and(&__a->__a_value, __pattern, static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_and(__cxx_atomic_base_impl<_Tp> * __a, _Tp __pattern, memory_order __order) _NOEXCEPT { - return __c11_atomic_fetch_and(&__a->__a_value, __pattern, static_cast<__memory_order_underlying_t>(__order)); -} - -template 
-_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_or(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __pattern, memory_order __order) _NOEXCEPT { - return __c11_atomic_fetch_or(&__a->__a_value, __pattern, static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_or(__cxx_atomic_base_impl<_Tp> * __a, _Tp __pattern, memory_order __order) _NOEXCEPT { - return __c11_atomic_fetch_or(&__a->__a_value, __pattern, static_cast<__memory_order_underlying_t>(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_xor(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __pattern, memory_order __order) _NOEXCEPT { - return __c11_atomic_fetch_xor(&__a->__a_value, __pattern, static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_xor(__cxx_atomic_base_impl<_Tp> * __a, _Tp __pattern, memory_order __order) _NOEXCEPT { - return __c11_atomic_fetch_xor(&__a->__a_value, __pattern, static_cast<__memory_order_underlying_t>(__order)); +// TODO: +// #include "support/atomic/atomic_c11.h" +#endif // _LIBCUDACXX_HAS_GCC_ATOMIC_IMP, _LIBCUDACXX_HAS_C_ATOMIC_IMP } -#endif // _LIBCUDACXX_HAS_GCC_ATOMIC_IMP, _LIBCUDACXX_HAS_C_ATOMIC_IMP +using detail::__cxx_atomic_base_impl; +using detail::__cxx_atomic_thread_fence; +using detail::__cxx_atomic_signal_fence; template _LIBCUDACXX_INLINE_VISIBILITY diff --git a/libcxx/include/support/atomic/atomic_c11.h b/libcxx/include/support/atomic/atomic_c11.h new file mode 100644 index 0000000000..7669a45a15 --- /dev/null +++ b/libcxx/include/support/atomic/atomic_c11.h @@ -0,0 +1,171 @@ +// Atomics for C11 + +template +struct __cxx_atomic_base_impl { + + _LIBCUDACXX_INLINE_VISIBILITY +#ifndef _LIBCUDACXX_CXX03_LANG + __cxx_atomic_base_impl() _NOEXCEPT = default; +#else + __cxx_atomic_base_impl() _NOEXCEPT : __a_value() {} +#endif // _LIBCUDACXX_CXX03_LANG + _LIBCUDACXX_CONSTEXPR explicit __cxx_atomic_base_impl(_Tp value) _NOEXCEPT + : __a_value(value) {} + _LIBCUDACXX_DISABLE_EXTENSION_WARNING _Atomic(_Tp) __a_value; +}; + +#define __cxx_atomic_is_lock_free(__s) __c11_atomic_is_lock_free(__s) + +_LIBCUDACXX_INLINE_VISIBILITY inline +void __cxx_atomic_thread_fence(memory_order __order) _NOEXCEPT { + __c11_atomic_thread_fence(static_cast<__memory_order_underlying_t>(__order)); +} + +_LIBCUDACXX_INLINE_VISIBILITY inline +void __cxx_atomic_signal_fence(memory_order __order) _NOEXCEPT { + __c11_atomic_signal_fence(static_cast<__memory_order_underlying_t>(__order)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY +void __cxx_atomic_init(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __val) _NOEXCEPT { + __c11_atomic_init(&__a->__a_value, __val); +} +template +_LIBCUDACXX_INLINE_VISIBILITY +void __cxx_atomic_init(__cxx_atomic_base_impl<_Tp> * __a, _Tp __val) _NOEXCEPT { + __c11_atomic_init(&__a->__a_value, __val); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY +void __cxx_atomic_store(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __val, memory_order __order) _NOEXCEPT { + __c11_atomic_store(&__a->__a_value, __val, static_cast<__memory_order_underlying_t>(__order)); +} +template +_LIBCUDACXX_INLINE_VISIBILITY +void __cxx_atomic_store(__cxx_atomic_base_impl<_Tp> * __a, _Tp __val, memory_order __order) _NOEXCEPT { + __c11_atomic_store(&__a->__a_value, __val, static_cast<__memory_order_underlying_t>(__order)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY +_Tp __cxx_atomic_load(__cxx_atomic_base_impl<_Tp> const volatile* __a, memory_order __order) 
_NOEXCEPT { + using __ptr_type = typename remove_const__a_value)>::type*; + return __c11_atomic_load(const_cast<__ptr_type>(&__a->__a_value), static_cast<__memory_order_underlying_t>(__order)); +} +template +_LIBCUDACXX_INLINE_VISIBILITY +_Tp __cxx_atomic_load(__cxx_atomic_base_impl<_Tp> const* __a, memory_order __order) _NOEXCEPT { + using __ptr_type = typename remove_const__a_value)>::type*; + return __c11_atomic_load(const_cast<__ptr_type>(&__a->__a_value), static_cast<__memory_order_underlying_t>(__order)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY +_Tp __cxx_atomic_exchange(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __value, memory_order __order) _NOEXCEPT { + return __c11_atomic_exchange(&__a->__a_value, __value, static_cast<__memory_order_underlying_t>(__order)); +} +template +_LIBCUDACXX_INLINE_VISIBILITY +_Tp __cxx_atomic_exchange(__cxx_atomic_base_impl<_Tp> * __a, _Tp __value, memory_order __order) _NOEXCEPT { + return __c11_atomic_exchange(&__a->__a_value, __value, static_cast<__memory_order_underlying_t>(__order)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY +bool __cxx_atomic_compare_exchange_strong(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp* __expected, _Tp __value, memory_order __success, memory_order __failure) _NOEXCEPT { + return __c11_atomic_compare_exchange_strong(&__a->__a_value, __expected, __value, static_cast<__memory_order_underlying_t>(__success), static_cast<__memory_order_underlying_t>(__failure)); +} +template +_LIBCUDACXX_INLINE_VISIBILITY +bool __cxx_atomic_compare_exchange_strong(__cxx_atomic_base_impl<_Tp> * __a, _Tp* __expected, _Tp __value, memory_order __success, memory_order __failure) _NOEXCEPT { + return __c11_atomic_compare_exchange_strong(&__a->__a_value, __expected, __value, static_cast<__memory_order_underlying_t>(__success), static_cast<__memory_order_underlying_t>(__failure)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY +bool __cxx_atomic_compare_exchange_weak(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp* __expected, _Tp __value, memory_order __success, memory_order __failure) _NOEXCEPT { + return __c11_atomic_compare_exchange_weak(&__a->__a_value, __expected, __value, static_cast<__memory_order_underlying_t>(__success), static_cast<__memory_order_underlying_t>(__failure)); +} +template +_LIBCUDACXX_INLINE_VISIBILITY +bool __cxx_atomic_compare_exchange_weak(__cxx_atomic_base_impl<_Tp> * __a, _Tp* __expected, _Tp __value, memory_order __success, memory_order __failure) _NOEXCEPT { + return __c11_atomic_compare_exchange_weak(&__a->__a_value, __expected, __value, static_cast<__memory_order_underlying_t>(__success), static_cast<__memory_order_underlying_t>(__failure)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY +_Tp __cxx_atomic_fetch_add(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __delta, memory_order __order) _NOEXCEPT { + return __c11_atomic_fetch_add(&__a->__a_value, __delta, static_cast<__memory_order_underlying_t>(__order)); +} +template +_LIBCUDACXX_INLINE_VISIBILITY +_Tp __cxx_atomic_fetch_add(__cxx_atomic_base_impl<_Tp> * __a, _Tp __delta, memory_order __order) _NOEXCEPT { + return __c11_atomic_fetch_add(&__a->__a_value, __delta, static_cast<__memory_order_underlying_t>(__order)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY +_Tp* __cxx_atomic_fetch_add(__cxx_atomic_base_impl<_Tp*> volatile* __a, ptrdiff_t __delta, memory_order __order) _NOEXCEPT { + return __c11_atomic_fetch_add(&__a->__a_value, __delta, static_cast<__memory_order_underlying_t>(__order)); +} +template +_LIBCUDACXX_INLINE_VISIBILITY +_Tp* 
__cxx_atomic_fetch_add(__cxx_atomic_base_impl<_Tp*> * __a, ptrdiff_t __delta, memory_order __order) _NOEXCEPT { + return __c11_atomic_fetch_add(&__a->__a_value, __delta, static_cast<__memory_order_underlying_t>(__order)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY +_Tp __cxx_atomic_fetch_sub(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __delta, memory_order __order) _NOEXCEPT { + return __c11_atomic_fetch_sub(&__a->__a_value, __delta, static_cast<__memory_order_underlying_t>(__order)); +} +template +_LIBCUDACXX_INLINE_VISIBILITY +_Tp __cxx_atomic_fetch_sub(__cxx_atomic_base_impl<_Tp> * __a, _Tp __delta, memory_order __order) _NOEXCEPT { + return __c11_atomic_fetch_sub(&__a->__a_value, __delta, static_cast<__memory_order_underlying_t>(__order)); +} +template +_LIBCUDACXX_INLINE_VISIBILITY +_Tp* __cxx_atomic_fetch_sub(__cxx_atomic_base_impl<_Tp*> volatile* __a, ptrdiff_t __delta, memory_order __order) _NOEXCEPT { + return __c11_atomic_fetch_sub(&__a->__a_value, __delta, static_cast<__memory_order_underlying_t>(__order)); +} +template +_LIBCUDACXX_INLINE_VISIBILITY +_Tp* __cxx_atomic_fetch_sub(__cxx_atomic_base_impl<_Tp*> * __a, ptrdiff_t __delta, memory_order __order) _NOEXCEPT { + return __c11_atomic_fetch_sub(&__a->__a_value, __delta, static_cast<__memory_order_underlying_t>(__order)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY +_Tp __cxx_atomic_fetch_and(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __pattern, memory_order __order) _NOEXCEPT { + return __c11_atomic_fetch_and(&__a->__a_value, __pattern, static_cast<__memory_order_underlying_t>(__order)); +} +template +_LIBCUDACXX_INLINE_VISIBILITY +_Tp __cxx_atomic_fetch_and(__cxx_atomic_base_impl<_Tp> * __a, _Tp __pattern, memory_order __order) _NOEXCEPT { + return __c11_atomic_fetch_and(&__a->__a_value, __pattern, static_cast<__memory_order_underlying_t>(__order)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY +_Tp __cxx_atomic_fetch_or(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __pattern, memory_order __order) _NOEXCEPT { + return __c11_atomic_fetch_or(&__a->__a_value, __pattern, static_cast<__memory_order_underlying_t>(__order)); +} +template +_LIBCUDACXX_INLINE_VISIBILITY +_Tp __cxx_atomic_fetch_or(__cxx_atomic_base_impl<_Tp> * __a, _Tp __pattern, memory_order __order) _NOEXCEPT { + return __c11_atomic_fetch_or(&__a->__a_value, __pattern, static_cast<__memory_order_underlying_t>(__order)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY +_Tp __cxx_atomic_fetch_xor(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __pattern, memory_order __order) _NOEXCEPT { + return __c11_atomic_fetch_xor(&__a->__a_value, __pattern, static_cast<__memory_order_underlying_t>(__order)); +} +template +_LIBCUDACXX_INLINE_VISIBILITY +_Tp __cxx_atomic_fetch_xor(__cxx_atomic_base_impl<_Tp> * __a, _Tp __pattern, memory_order __order) _NOEXCEPT { + return __c11_atomic_fetch_xor(&__a->__a_value, __pattern, static_cast<__memory_order_underlying_t>(__order)); +} diff --git a/include/cuda/std/detail/__atomic b/libcxx/include/support/atomic/atomic_cuda.h similarity index 67% rename from include/cuda/std/detail/__atomic rename to libcxx/include/support/atomic/atomic_cuda.h index 5995b9f69b..a8b8c0553c 100644 --- a/include/cuda/std/detail/__atomic +++ b/libcxx/include/support/atomic/atomic_cuda.h @@ -45,21 +45,19 @@ #define __ATOMIC_THREAD 10 #endif //__ATOMIC_BLOCK -_LIBCUDACXX_BEGIN_NAMESPACE_CUDA - -namespace detail { - - inline __host__ __device__ int __stronger_order_cuda(int __a, int __b) { - int const __max = __a > __b ? 
__a : __b; - if(__max != __ATOMIC_RELEASE) - return __max; - static int const __xform[] = { - __ATOMIC_RELEASE, - __ATOMIC_ACQ_REL, - __ATOMIC_ACQ_REL, - __ATOMIC_RELEASE }; - return __xform[__a < __b ? __a : __b]; - } +// TODO: +// How to get this into cuda::??? + +inline __host__ __device__ int __stronger_order_cuda(int __a, int __b) { + int const __max = __a > __b ? __a : __b; + if(__max != __ATOMIC_RELEASE) + return __max; + static int const __xform[] = { + __ATOMIC_RELEASE, + __ATOMIC_ACQ_REL, + __ATOMIC_ACQ_REL, + __ATOMIC_RELEASE }; + return __xform[__a < __b ? __a : __b]; } enum thread_scope { @@ -72,45 +70,46 @@ enum thread_scope { #define _LIBCUDACXX_ATOMIC_SCOPE_TYPE ::cuda::thread_scope #define _LIBCUDACXX_ATOMIC_SCOPE_DEFAULT ::cuda::thread_scope::system -namespace detail { - - struct __thread_scope_thread_tag { }; - struct __thread_scope_block_tag { }; - struct __thread_scope_device_tag { }; - struct __thread_scope_system_tag { }; - - template struct __scope_enum_to_tag { }; - /* This would be the implementation once an actual thread-scope backend exists. - template<> struct __scope_enum_to_tag<(int)thread_scope_thread> { - using type = __thread_scope_thread_tag; }; - Until then: */ - template<> struct __scope_enum_to_tag<(int)thread_scope_thread> { - using type = __thread_scope_block_tag; }; - template<> struct __scope_enum_to_tag<(int)thread_scope_block> { - using type = __thread_scope_block_tag; }; - template<> struct __scope_enum_to_tag<(int)thread_scope_device> { - using type = __thread_scope_device_tag; }; - template<> struct __scope_enum_to_tag<(int)thread_scope_system> { - using type = __thread_scope_system_tag; }; - - template - __host__ __device__ auto constexpr __scope_tag() -> - typename __scope_enum_to_tag<_Scope>::type { - return typename __scope_enum_to_tag<_Scope>::type(); - } -} - -_LIBCUDACXX_END_NAMESPACE_CUDA - +struct __thread_scope_thread_tag { }; +struct __thread_scope_block_tag { }; +struct __thread_scope_device_tag { }; +struct __thread_scope_system_tag { }; + +template struct __scope_enum_to_tag { }; +/* This would be the implementation once an actual thread-scope backend exists. 
+template<> struct __scope_enum_to_tag<(int)thread_scope_thread> { + using type = __thread_scope_thread_tag; }; +Until then: */ +template<> struct __scope_enum_to_tag<(int)thread_scope_thread> { + using type = __thread_scope_block_tag; }; +template<> struct __scope_enum_to_tag<(int)thread_scope_block> { + using type = __thread_scope_block_tag; }; +template<> struct __scope_enum_to_tag<(int)thread_scope_device> { + using type = __thread_scope_device_tag; }; +template<> struct __scope_enum_to_tag<(int)thread_scope_system> { + using type = __thread_scope_system_tag; }; + +template +__host__ __device__ auto constexpr __scope_tag() -> + typename __scope_enum_to_tag<_Scope>::type { + return typename __scope_enum_to_tag<_Scope>::type(); +} +// END TODO + +// Wrap atomic implementations into a sub-namespace +namespace host { #if defined(_LIBCUDACXX_COMPILER_MSVC) - // Inject atomic intrinsics built from MSVC compiler intrinsics - #include "libcxx/include/support/win32/atomic_msvc.h" +# include "atomic_msvc.h" +#elif defined (_LIBCUDACXX_HAS_GCC_ATOMIC_IMP) +# include "atomic_gcc.h" +#elif defined (_LIBCUDACXX_HAS_C11_ATOMIC_IMP) +//TODO +// # include "atomic_c11.h" #endif +} -#include "__atomic_generated" -#include "__atomic_derived" - -_LIBCUDACXX_BEGIN_NAMESPACE_STD +#include "atomic_cuda_generated.h" +#include "atomic_cuda_derived.h" template struct __skip_amt { enum {value = 1}; }; @@ -131,18 +130,24 @@ __host__ __device__ inline bool __cxx_atomic_is_lock_free(size_t __x) { return __x <= 8; } __host__ __device__ inline void __cxx_atomic_thread_fence(int __order) { -#ifdef __CUDA_ARCH__ - detail::__atomic_thread_fence_cuda(__order, detail::__thread_scope_system_tag()); -#else - __atomic_thread_fence(__order); -#endif + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + detail::__atomic_thread_fence_cuda(__order, detail::__thread_scope_system_tag()); + ), + NV_IS_HOST, ( + host::__atomic_thread_fence(__order); + ) + ) } __host__ __device__ inline void __cxx_atomic_signal_fence(int __order) { -#ifdef __CUDA_ARCH__ - detail::__atomic_signal_fence_cuda(__order); -#else - __atomic_signal_fence(__order); -#endif + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + detail::__atomic_signal_fence_cuda(__order); + ), + NV_IS_HOST, ( + host::__atomic_signal_fence(__order); + ) + ) } template @@ -211,113 +216,151 @@ __host__ __device__ inline void __cxx_atomic_init(__cxx_atomic_base_impl_default } template __host__ __device__ inline void __cxx_atomic_store(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp __val, int __order) { -#ifdef __CUDA_ARCH__ - detail::__atomic_store_n_cuda(&__a->__a_value, __cxx_atomic_alignment_wrap(__val), __order, detail::__scope_tag<_Sco>()); -#else - auto __t = __cxx_atomic_alignment_wrap(__val); - __atomic_store(&__a->__a_value, &__t, __order); -#endif + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + detail::__atomic_store_n_cuda(&__a->__a_value, __cxx_atomic_alignment_wrap(__val), __order, detail::__scope_tag<_Sco>()); + ), + NV_IS_HOST, ( + auto __t = __cxx_atomic_alignment_wrap(__val); + host::__atomic_store(&__a->__a_value, &__t, __order); + ) + ) } template __host__ __device__ inline _Tp __cxx_atomic_load(__cxx_atomic_base_impl_default<_Tp, _Sco> const volatile* __a, int __order) { -#ifdef __CUDA_ARCH__ - return __cxx_atomic_alignment_unwrap(detail::__atomic_load_n_cuda(&__a->__a_value, __order, detail::__scope_tag<_Sco>())); -#else - alignas(_Tp) unsigned char __buf[sizeof(_Tp)]; - auto* __dest = reinterpret_cast<_Tp*>(__buf); - __atomic_load(&__a->__a_value, __dest, __order); - return 
__cxx_atomic_alignment_unwrap(*__dest); -#endif + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + return __cxx_atomic_alignment_unwrap(detail::__atomic_load_n_cuda(&__a->__a_value, __order, detail::__scope_tag<_Sco>())); + ), + NV_IS_HOST, ( + alignas(_Tp) unsigned char __buf[sizeof(_Tp)]; + auto* __dest = reinterpret_cast<_Tp*>(__buf); + host::__atomic_load(&__a->__a_value, __dest, __order); + return __cxx_atomic_alignment_unwrap(*__dest); + ) + ) } template __host__ __device__ inline _Tp __cxx_atomic_exchange(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp __val, int __order) { -#ifdef __CUDA_ARCH__ - return __cxx_atomic_alignment_unwrap(detail::__atomic_exchange_n_cuda(&__a->__a_value, __cxx_atomic_alignment_wrap(__val), __order, detail::__scope_tag<_Sco>())); -#else - alignas(_Tp) unsigned char __buf[sizeof(_Tp)]; - auto* __dest = reinterpret_cast<_Tp*>(__buf); - auto __t = __cxx_atomic_alignment_wrap(__val); - __atomic_exchange(&__a->__a_value, &__t, __dest, __order); - return __cxx_atomic_alignment_unwrap(*__dest); -#endif + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + return __cxx_atomic_alignment_unwrap(detail::__atomic_exchange_n_cuda(&__a->__a_value, __cxx_atomic_alignment_wrap(__val), __order, detail::__scope_tag<_Sco>())); + ), + NV_IS_HOST, ( + alignas(_Tp) unsigned char __buf[sizeof(_Tp)]; + auto* __dest = reinterpret_cast<_Tp*>(__buf); + auto __t = __cxx_atomic_alignment_wrap(__val); + host::__atomic_exchange(&__a->__a_value, &__t, __dest, __order); + return __cxx_atomic_alignment_unwrap(*__dest); + ) + ) } template __host__ __device__ inline bool __cxx_atomic_compare_exchange_strong(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp* __expected, _Tp __val, int __success, int __failure) { auto __tmp = __cxx_atomic_alignment_wrap(*__expected); -#ifdef __CUDA_ARCH__ - bool __result = detail::__atomic_compare_exchange_n_cuda(&__a->__a_value, &__tmp, __cxx_atomic_alignment_wrap(__val), false, __success, __failure, detail::__scope_tag<_Sco>()); -#else - bool __result = __atomic_compare_exchange(&__a->__a_value, &__tmp, &__val, false, __success, __failure); -#endif + bool __result = false; + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + __result = detail::__atomic_compare_exchange_n_cuda(&__a->__a_value, &__tmp, __cxx_atomic_alignment_wrap(__val), false, __success, __failure, detail::__scope_tag<_Sco>()); + ), + NV_IS_HOST, ( + __result = host::__atomic_compare_exchange(&__a->__a_value, &__tmp, &__val, false, __success, __failure); + ) + ) *__expected = __cxx_atomic_alignment_unwrap(__tmp); return __result; } template __host__ __device__ inline bool __cxx_atomic_compare_exchange_weak(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp* __expected, _Tp __val, int __success, int __failure) { auto __tmp = __cxx_atomic_alignment_wrap(*__expected); -#ifdef __CUDA_ARCH__ - bool __result = detail::__atomic_compare_exchange_n_cuda(&__a->__a_value, &__tmp, __cxx_atomic_alignment_wrap(__val), true, __success, __failure, detail::__scope_tag<_Sco>()); -#else - bool __result = __atomic_compare_exchange(&__a->__a_value, &__tmp, &__val, true, __success, __failure); -#endif + bool __result = false; + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + __result = detail::__atomic_compare_exchange_n_cuda(&__a->__a_value, &__tmp, __cxx_atomic_alignment_wrap(__val), true, __success, __failure, detail::__scope_tag<_Sco>()); + ), + NV_IS_HOST, ( + __result = host::__atomic_compare_exchange(&__a->__a_value, &__tmp, &__val, true, __success, __failure); + ) + ) *__expected = 
__cxx_atomic_alignment_unwrap(__tmp); return __result; } template __host__ __device__ inline _Tp __cxx_atomic_fetch_add(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp __delta, int __order) { -#ifdef __CUDA_ARCH__ - return detail::__atomic_fetch_add_cuda(&__a->__a_value, __delta, __order, detail::__scope_tag<_Sco>()); -#else - return __atomic_fetch_add(&__a->__a_value, __delta, __order); -#endif + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + return detail::__atomic_fetch_add_cuda(&__a->__a_value, __delta, __order, detail::__scope_tag<_Sco>()); + ), + NV_IS_HOST, ( + return host::__atomic_fetch_add(&__a->__a_value, __delta, __order); + ) + ) } template __host__ __device__ inline _Tp* __cxx_atomic_fetch_add(__cxx_atomic_base_impl_default<_Tp*, _Sco> volatile* __a, ptrdiff_t __delta, int __order) { -#ifdef __CUDA_ARCH__ - return detail::__atomic_fetch_add_cuda(&__a->__a_value, __delta, __order, detail::__scope_tag<_Sco>()); -#else - return __atomic_fetch_add(&__a->__a_value, __delta * __skip_amt<_Tp*>::value, __order); -#endif + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + return detail::__atomic_fetch_add_cuda(&__a->__a_value, __delta, __order, detail::__scope_tag<_Sco>()); + ), + NV_IS_HOST, ( + return host::__atomic_fetch_add(&__a->__a_value, __delta * __skip_amt<_Tp*>::value, __order); + ) + ) } template __host__ __device__ inline _Tp __cxx_atomic_fetch_sub(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp __delta, int __order) { -#ifdef __CUDA_ARCH__ - return detail::__atomic_fetch_sub_cuda(&__a->__a_value, __delta, __order, detail::__scope_tag<_Sco>()); -#else - return __atomic_fetch_sub(&__a->__a_value, __delta, __order); -#endif + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + return detail::__atomic_fetch_sub_cuda(&__a->__a_value, __delta, __order, detail::__scope_tag<_Sco>()); + ), + NV_IS_HOST, ( + return host::__atomic_fetch_sub(&__a->__a_value, __delta, __order); + ) + ) } template __host__ __device__ inline _Tp* __cxx_atomic_fetch_sub(__cxx_atomic_base_impl_default<_Tp*, _Sco> volatile* __a, ptrdiff_t __delta, int __order) { -#ifdef __CUDA_ARCH__ - return detail::__atomic_fetch_sub_cuda(&__a->__a_value, __delta, __order, detail::__scope_tag<_Sco>()); -#else - return __atomic_fetch_sub(&__a->__a_value, __delta * __skip_amt<_Tp*>::value, __order); -#endif + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + return detail::__atomic_fetch_sub_cuda(&__a->__a_value, __delta, __order, detail::__scope_tag<_Sco>()); + ), + NV_IS_HOST, ( + return host::__atomic_fetch_sub(&__a->__a_value, __delta * __skip_amt<_Tp*>::value, __order); + ) + ) } template __host__ __device__ inline _Tp __cxx_atomic_fetch_and(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp __pattern, int __order) { -#ifdef __CUDA_ARCH__ - return detail::__atomic_fetch_and_cuda(&__a->__a_value, __pattern, __order, detail::__scope_tag<_Sco>()); -#else - return __atomic_fetch_and(&__a->__a_value, __pattern, __order); -#endif + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + return detail::__atomic_fetch_and_cuda(&__a->__a_value, __pattern, __order, detail::__scope_tag<_Sco>()); + ), + NV_IS_HOST, ( + return host::__atomic_fetch_and(&__a->__a_value, __pattern, __order); + ) + ) } template __host__ __device__ inline _Tp __cxx_atomic_fetch_or(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp __pattern, int __order) { -#ifdef __CUDA_ARCH__ - return detail::__atomic_fetch_or_cuda(&__a->__a_value, __pattern, __order, detail::__scope_tag<_Sco>()); -#else - return __atomic_fetch_or(&__a->__a_value, __pattern, __order); -#endif 
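In the operations above, the _Sco template argument reaches the device layer as a tag object via detail::__scope_tag<_Sco>(), so each *_cuda primitive can be overloaded per thread scope. A reduced, hypothetical illustration of that enum-to-tag dispatch; the .cta/.gpu/.sys variants follow the PTX scope suffixes used by the generated header (sm_70+), and all names here are simplified stand-ins for the real __scope_enum_to_tag machinery:

    // Illustration only: standalone names, not the library's.
    struct block_tag  { };
    struct device_tag { };
    struct system_tag { };

    __device__ inline void fence_acq_rel(block_tag)  { asm volatile("fence.acq_rel.cta;" ::: "memory"); }
    __device__ inline void fence_acq_rel(device_tag) { asm volatile("fence.acq_rel.gpu;" ::: "memory"); }
    __device__ inline void fence_acq_rel(system_tag) { asm volatile("fence.acq_rel.sys;" ::: "memory"); }

    enum scope { scope_block, scope_device, scope_system };

    template <scope S> struct scope_to_tag;
    template <> struct scope_to_tag<scope_block>  { using type = block_tag;  };
    template <> struct scope_to_tag<scope_device> { using type = device_tag; };
    template <> struct scope_to_tag<scope_system> { using type = system_tag; };

    template <scope S>
    __device__ inline void scoped_acq_rel_fence()
    {
        fence_acq_rel(typename scope_to_tag<S>::type{});   // overload resolved at compile time
    }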
+ NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + return detail::__atomic_fetch_or_cuda(&__a->__a_value, __pattern, __order, detail::__scope_tag<_Sco>()); + ), + NV_IS_HOST, ( + return host::__atomic_fetch_or(&__a->__a_value, __pattern, __order); + ) + ) } template __host__ __device__ inline _Tp __cxx_atomic_fetch_xor(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp __pattern, int __order) { -#ifdef __CUDA_ARCH__ - return detail::__atomic_fetch_xor_cuda(&__a->__a_value, __pattern, __order, detail::__scope_tag<_Sco>()); -#else - return __atomic_fetch_xor(&__a->__a_value, __pattern, __order); -#endif + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + return detail::__atomic_fetch_xor_cuda(&__a->__a_value, __pattern, __order, detail::__scope_tag<_Sco>()); + ), + NV_IS_HOST, ( + return host::__atomic_fetch_xor(&__a->__a_value, __pattern, __order); + ) + ) } template @@ -432,5 +475,3 @@ template using __cxx_atomic_base_impl = typename conditional, __cxx_atomic_base_impl_default<_Tp, _Sco> >::type; - -_LIBCUDACXX_END_NAMESPACE_STD diff --git a/include/cuda/std/detail/__atomic_derived b/libcxx/include/support/atomic/atomic_cuda_derived.h similarity index 98% rename from include/cuda/std/detail/__atomic_derived rename to libcxx/include/support/atomic/atomic_cuda_derived.h index 204ebb9989..f0cbcdfd75 100644 --- a/include/cuda/std/detail/__atomic_derived +++ b/libcxx/include/support/atomic/atomic_cuda_derived.h @@ -7,9 +7,6 @@ // //===----------------------------------------------------------------------===// -_LIBCUDACXX_BEGIN_NAMESPACE_CUDA -namespace detail { - template::type = 0> bool __device__ __atomic_compare_exchange_cuda(_Type volatile *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, _Scope __s) { @@ -152,6 +149,3 @@ static inline __device__ void __atomic_signal_fence_cuda(int) { asm volatile("":::"memory"); } -} -_LIBCUDACXX_END_NAMESPACE_CUDA - diff --git a/include/cuda/std/detail/__atomic_generated b/libcxx/include/support/atomic/atomic_cuda_generated.h similarity index 99% rename from include/cuda/std/detail/__atomic_generated rename to libcxx/include/support/atomic/atomic_cuda_generated.h index 596467d991..d8b421c5ac 100644 --- a/include/cuda/std/detail/__atomic_generated +++ b/libcxx/include/support/atomic/atomic_cuda_generated.h @@ -7,10 +7,6 @@ // //===----------------------------------------------------------------------===// - -_LIBCUDACXX_BEGIN_NAMESPACE_CUDA -namespace detail { - static inline __device__ void __cuda_membar_block() { asm volatile("membar.cta;":::"memory"); } static inline __device__ void __cuda_fence_acq_rel_block() { asm volatile("fence.acq_rel.cta;":::"memory"); } static inline __device__ void __cuda_fence_sc_block() { asm volatile("fence.sc.cta;":::"memory"); } @@ -2199,5 +2195,3 @@ __device__ _Type* __atomic_fetch_sub_cuda(_Type *volatile *__ptr, ptrdiff_t __va return __ret; } -} -_LIBCUDACXX_END_NAMESPACE_CUDA diff --git a/libcxx/include/support/atomic/atomic_gcc.h b/libcxx/include/support/atomic/atomic_gcc.h new file mode 100644 index 0000000000..02568a6493 --- /dev/null +++ b/libcxx/include/support/atomic/atomic_gcc.h @@ -0,0 +1,251 @@ +// + +template +struct __cxx_atomic_base_impl { + + _LIBCUDACXX_INLINE_VISIBILITY +#ifndef _LIBCUDACXX_CXX03_LANG + __cxx_atomic_base_impl() _NOEXCEPT = default; +#else + __cxx_atomic_base_impl() _NOEXCEPT : __a_value() {} +#endif // _LIBCUDACXX_CXX03_LANG + _LIBCUDACXX_CONSTEXPR explicit __cxx_atomic_base_impl(_Tp value) _NOEXCEPT + : __a_value(value) {} + 
_ALIGNAS(sizeof(_Tp)) _Tp __a_value; +}; + +_LIBCUDACXX_INLINE_VISIBILITY inline _LIBCUDACXX_CONSTEXPR int __to_gcc_order(memory_order __order) { + // Avoid switch statement to make this a constexpr. + return __order == memory_order_relaxed ? __ATOMIC_RELAXED: + (__order == memory_order_acquire ? __ATOMIC_ACQUIRE: + (__order == memory_order_release ? __ATOMIC_RELEASE: + (__order == memory_order_seq_cst ? __ATOMIC_SEQ_CST: + (__order == memory_order_acq_rel ? __ATOMIC_ACQ_REL: + __ATOMIC_CONSUME)))); +} + +_LIBCUDACXX_INLINE_VISIBILITY inline _LIBCUDACXX_CONSTEXPR int __to_gcc_failure_order(memory_order __order) { + // Avoid switch statement to make this a constexpr. + return __order == memory_order_relaxed ? __ATOMIC_RELAXED: + (__order == memory_order_acquire ? __ATOMIC_ACQUIRE: + (__order == memory_order_release ? __ATOMIC_RELAXED: + (__order == memory_order_seq_cst ? __ATOMIC_SEQ_CST: + (__order == memory_order_acq_rel ? __ATOMIC_ACQUIRE: + __ATOMIC_CONSUME)))); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY +void __cxx_atomic_init(volatile __cxx_atomic_base_impl<_Tp>* __a, _Tp __val) { + __cxx_atomic_assign_volatile(__a->__a_value, __val); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY +void __cxx_atomic_init(__cxx_atomic_base_impl<_Tp>* __a, _Tp __val) { + __a->__a_value = __val; +} + +_LIBCUDACXX_INLINE_VISIBILITY inline +void __cxx_atomic_thread_fence(memory_order __order) { + __atomic_thread_fence(__to_gcc_order(__order)); +} + +_LIBCUDACXX_INLINE_VISIBILITY inline +void __cxx_atomic_signal_fence(memory_order __order) { + __atomic_signal_fence(__to_gcc_order(__order)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY +void __cxx_atomic_store(volatile __cxx_atomic_base_impl<_Tp>* __a, _Tp __val, + memory_order __order) { + __atomic_store(&__a->__a_value, &__val, + __to_gcc_order(__order)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY +void __cxx_atomic_store(__cxx_atomic_base_impl<_Tp>* __a, _Tp __val, + memory_order __order) { + __atomic_store(&__a->__a_value, &__val, + __to_gcc_order(__order)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY +_Tp __cxx_atomic_load(const volatile __cxx_atomic_base_impl<_Tp>* __a, + memory_order __order) { + _Tp __ret; + __atomic_load(&__a->__a_value, &__ret, + __to_gcc_order(__order)); + return __ret; +} + +template +_LIBCUDACXX_INLINE_VISIBILITY +_Tp __cxx_atomic_load(const __cxx_atomic_base_impl<_Tp>* __a, memory_order __order) { + _Tp __ret; + __atomic_load(&__a->__a_value, &__ret, + __to_gcc_order(__order)); + return __ret; +} + +template +_LIBCUDACXX_INLINE_VISIBILITY +_Tp __cxx_atomic_exchange(volatile __cxx_atomic_base_impl<_Tp>* __a, + _Tp __value, memory_order __order) { + _Tp __ret; + __atomic_exchange(&__a->__a_value, &__value, &__ret, + __to_gcc_order(__order)); + return __ret; +} + +template +_LIBCUDACXX_INLINE_VISIBILITY +_Tp __cxx_atomic_exchange(__cxx_atomic_base_impl<_Tp>* __a, _Tp __value, + memory_order __order) { + _Tp __ret; + __atomic_exchange(&__a->__a_value, &__value, &__ret, + __to_gcc_order(__order)); + return __ret; +} + +template +_LIBCUDACXX_INLINE_VISIBILITY +bool __cxx_atomic_compare_exchange_strong( + volatile __cxx_atomic_base_impl<_Tp>* __a, _Tp* __expected, _Tp __value, + memory_order __success, memory_order __failure) { + return __atomic_compare_exchange(&__a->__a_value, __expected, &__value, + false, + __to_gcc_order(__success), + __to_gcc_failure_order(__failure)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY +bool __cxx_atomic_compare_exchange_strong( + __cxx_atomic_base_impl<_Tp>* __a, _Tp* __expected, _Tp 
__value, memory_order __success, + memory_order __failure) { + return __atomic_compare_exchange(&__a->__a_value, __expected, &__value, + false, + __to_gcc_order(__success), + __to_gcc_failure_order(__failure)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY +bool __cxx_atomic_compare_exchange_weak( + volatile __cxx_atomic_base_impl<_Tp>* __a, _Tp* __expected, _Tp __value, + memory_order __success, memory_order __failure) { + return __atomic_compare_exchange(&__a->__a_value, __expected, &__value, + true, + __to_gcc_order(__success), + __to_gcc_failure_order(__failure)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY +bool __cxx_atomic_compare_exchange_weak( + __cxx_atomic_base_impl<_Tp>* __a, _Tp* __expected, _Tp __value, memory_order __success, + memory_order __failure) { + return __atomic_compare_exchange(&__a->__a_value, __expected, &__value, + true, + __to_gcc_order(__success), + __to_gcc_failure_order(__failure)); +} + +template +struct __skip_amt { enum {value = 1}; }; + +template +struct __skip_amt<_Tp*> { enum {value = sizeof(_Tp)}; }; + +// FIXME: Haven't figured out what the spec says about using arrays with +// atomic_fetch_add. Force a failure rather than creating bad behavior. +template +struct __skip_amt<_Tp[]> { }; +template +struct __skip_amt<_Tp[n]> { }; + +template +_LIBCUDACXX_INLINE_VISIBILITY +_Tp __cxx_atomic_fetch_add(volatile __cxx_atomic_base_impl<_Tp>* __a, + _Td __delta, memory_order __order) { + return __atomic_fetch_add(&__a->__a_value, __delta * __skip_amt<_Tp>::value, + __to_gcc_order(__order)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY +_Tp __cxx_atomic_fetch_add(__cxx_atomic_base_impl<_Tp>* __a, _Td __delta, + memory_order __order) { + return __atomic_fetch_add(&__a->__a_value, __delta * __skip_amt<_Tp>::value, + __to_gcc_order(__order)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY +_Tp __cxx_atomic_fetch_sub(volatile __cxx_atomic_base_impl<_Tp>* __a, + _Td __delta, memory_order __order) { + return __atomic_fetch_sub(&__a->__a_value, __delta * __skip_amt<_Tp>::value, + __to_gcc_order(__order)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY +_Tp __cxx_atomic_fetch_sub(__cxx_atomic_base_impl<_Tp>* __a, _Td __delta, + memory_order __order) { + return __atomic_fetch_sub(&__a->__a_value, __delta * __skip_amt<_Tp>::value, + __to_gcc_order(__order)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY +_Tp __cxx_atomic_fetch_and(volatile __cxx_atomic_base_impl<_Tp>* __a, + _Tp __pattern, memory_order __order) { + return __atomic_fetch_and(&__a->__a_value, __pattern, + __to_gcc_order(__order)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY +_Tp __cxx_atomic_fetch_and(__cxx_atomic_base_impl<_Tp>* __a, + _Tp __pattern, memory_order __order) { + return __atomic_fetch_and(&__a->__a_value, __pattern, + __to_gcc_order(__order)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY +_Tp __cxx_atomic_fetch_or(volatile __cxx_atomic_base_impl<_Tp>* __a, + _Tp __pattern, memory_order __order) { + return __atomic_fetch_or(&__a->__a_value, __pattern, + __to_gcc_order(__order)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY +_Tp __cxx_atomic_fetch_or(__cxx_atomic_base_impl<_Tp>* __a, _Tp __pattern, + memory_order __order) { + return __atomic_fetch_or(&__a->__a_value, __pattern, + __to_gcc_order(__order)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY +_Tp __cxx_atomic_fetch_xor(volatile __cxx_atomic_base_impl<_Tp>* __a, + _Tp __pattern, memory_order __order) { + return __atomic_fetch_xor(&__a->__a_value, __pattern, + __to_gcc_order(__order)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY 
+_Tp __cxx_atomic_fetch_xor(__cxx_atomic_base_impl<_Tp>* __a, _Tp __pattern, + memory_order __order) { + return __atomic_fetch_xor(&__a->__a_value, __pattern, + __to_gcc_order(__order)); +} + +#define __cxx_atomic_is_lock_free(__s) __atomic_is_lock_free(__s, 0) diff --git a/libcxx/include/support/win32/atomic_msvc.h b/libcxx/include/support/atomic/atomic_msvc.h similarity index 100% rename from libcxx/include/support/win32/atomic_msvc.h rename to libcxx/include/support/atomic/atomic_msvc.h From ec308508abbd961b6ecd37f68900f4a36cc9cbe6 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Wed, 14 Apr 2021 12:13:19 -0700 Subject: [PATCH 04/34] WIP: single interface wrap/unwrap --- libcxx/include/support/atomic/atomic_cuda.h | 94 +++++++++--------- libcxx/include/support/atomic/atomic_gcc.h | 101 ++++++++++---------- 2 files changed, 97 insertions(+), 98 deletions(-) diff --git a/libcxx/include/support/atomic/atomic_cuda.h b/libcxx/include/support/atomic/atomic_cuda.h index a8b8c0553c..8eb0c23a38 100644 --- a/libcxx/include/support/atomic/atomic_cuda.h +++ b/libcxx/include/support/atomic/atomic_cuda.h @@ -96,7 +96,7 @@ __host__ __device__ auto constexpr __scope_tag() -> } // END TODO -// Wrap atomic implementations into a sub-namespace +// Wrap host atomic implementations into a sub-namespace namespace host { #if defined(_LIBCUDACXX_COMPILER_MSVC) # include "atomic_msvc.h" @@ -150,40 +150,61 @@ __host__ __device__ inline void __cxx_atomic_signal_fence(int __order) { ) } -template -struct __cxx_atomic_alignment_wrapper_impl; +// Atomic storage layouts: -template -struct __cxx_atomic_alignment_wrapper_impl { - struct type { - using __wrapped_type = _Tp; - __host__ __device__ constexpr type() noexcept : __a_held() { +// Implement _Sco with https://godbolt.org/z/foWdeYjEs + +template +struct type { + _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR + type() noexcept : __a_value() { } - __host__ __device__ constexpr type(_Tp __held) noexcept : __a_held(__held) { + + _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR + type(_Tp __held) noexcept : __a_value(__held) { } - _ALIGNAS(sizeof(_Tp)) _Tp __a_held; - }; + + _ALIGNAS(sizeof(_Tp)) _Tp __a_value; + + _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR + _Tp* get() _NOEXCEPT { + return &__a_value; + } +} + +template +struct __cxx_atomic_base_storage_aligned<_Tp> { + }; -template -struct __cxx_atomic_alignment_wrapper_impl<_Tp, typename enable_if<_LIBCUDACXX_ALIGNOF(_Tp) == sizeof(_Tp)>::type> { - using type = _Tp; +template +struct __cxx_atomic_base_storage_small { + using __wrapped_type = _Tp; + + __cxx_atomic_base_storage_small() noexcept = default; + _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR explicit + __cxx_atomic_base_storage_small(_Tp __value) : __a_held(__value) { + } + + __cxx_atomic_base_storage_aligned __a_held; + + _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR + __cxx_atomic_base_storage_aligned* get() _NOEXCEPT { + return &__a_held; + } }; -template -using __cxx_atomic_alignment_wrapper_t = typename __cxx_atomic_alignment_wrapper_impl<_Tp>::type; +template +using __cxx_atomic_base_storage = typename conditional, + __cxx_atomic_base_storage_aligned<_Tp, _Sco> >::type; template -__host__ __device__ __cxx_atomic_alignment_wrapper_t<_Tp> __cxx_atomic_alignment_wrap(_Tp __value, true_type) { - return __value; -} -template -__host__ __device__ __cxx_atomic_alignment_wrapper_t<_Tp> __cxx_atomic_alignment_wrap(_Tp __value, false_type) { - return __cxx_atomic_alignment_wrapper_t<_Tp>(__value); -} +using 
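// Storage selection note: types smaller than a 32-bit word are routed to the
// "small" storage above, which keeps the payload in a size-aligned 32-bit
// word (see __cxx_small_to_32 / __cxx_small_from_32 further down), since the
// device-side __atomic_*_cuda helpers operate on 32- and 64-bit values;
// everything else uses the plain aligned storage directly.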
__cxx_atomic_alignment_wrapper_t = __cxx_atomic_base_storage<_Tp>; + template __host__ __device__ __cxx_atomic_alignment_wrapper_t<_Tp> __cxx_atomic_alignment_wrap(_Tp __value) { - return __cxx_atomic_alignment_wrap(__value, integral_constant{}); + return __cxx_atomic_alignment_wrapper_t(__value); } template @@ -192,7 +213,7 @@ __host__ __device__ _Tp __cxx_atomic_alignment_unwrap(_Tp __value, true_type) { } template __host__ __device__ typename _Tp::__wrapped_type __cxx_atomic_alignment_unwrap(_Tp __value, false_type) { - return __value.__a_held; + return *__value.get(); } template __host__ __device__ auto __cxx_atomic_alignment_unwrap(_Tp __value) @@ -201,14 +222,6 @@ __host__ __device__ auto __cxx_atomic_alignment_unwrap(_Tp __value) return __cxx_atomic_alignment_unwrap(__value, integral_constant{}); } -template -struct __cxx_atomic_base_impl_default { - constexpr __cxx_atomic_base_impl_default() noexcept = default; - __host__ __device__ constexpr explicit __cxx_atomic_base_impl_default(_Tp __value) noexcept : __a_value(__value) { - } - __cxx_atomic_alignment_wrapper_t<_Tp> __a_value; -}; - template __host__ __device__ inline void __cxx_atomic_init(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp __val) { auto __tmp = __cxx_atomic_alignment_wrap(__val); @@ -363,16 +376,6 @@ __host__ __device__ inline _Tp __cxx_atomic_fetch_xor(__cxx_atomic_base_impl_def ) } -template -struct __cxx_atomic_base_impl_small { - - __cxx_atomic_base_impl_small() noexcept = default; - __host__ __device__ constexpr explicit __cxx_atomic_base_impl_small(_Tp __value) : __a_value(__value) { - } - - __cxx_atomic_base_impl_default __a_value; -}; - template using __cxx_small_proxy = typename conditional __host__ __device__ inline _Tp __cxx_atomic_fetch_xor(__cxx_atomic_base_impl_small<_Tp, _Sco> volatile* __a, _Tp __pattern, int __order) { return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_xor(&__a->__a_value, __cxx_small_to_32(__pattern), __order)); } - -template -using __cxx_atomic_base_impl = typename conditional, - __cxx_atomic_base_impl_default<_Tp, _Sco> >::type; diff --git a/libcxx/include/support/atomic/atomic_gcc.h b/libcxx/include/support/atomic/atomic_gcc.h index 02568a6493..bf8cc8991f 100644 --- a/libcxx/include/support/atomic/atomic_gcc.h +++ b/libcxx/include/support/atomic/atomic_gcc.h @@ -1,17 +1,18 @@ // template -struct __cxx_atomic_base_impl { - +struct __cxx_atomic_base_storage { _LIBCUDACXX_INLINE_VISIBILITY -#ifndef _LIBCUDACXX_CXX03_LANG - __cxx_atomic_base_impl() _NOEXCEPT = default; -#else __cxx_atomic_base_impl() _NOEXCEPT : __a_value() {} -#endif // _LIBCUDACXX_CXX03_LANG + _LIBCUDACXX_CONSTEXPR explicit __cxx_atomic_base_impl(_Tp value) _NOEXCEPT : __a_value(value) {} + _ALIGNAS(sizeof(_Tp)) _Tp __a_value; + + _LIBCUDACXX_CONSTEXPR _Tp* get() _NOEXCEPT { + return __a_value; + } }; _LIBCUDACXX_INLINE_VISIBILITY inline _LIBCUDACXX_CONSTEXPR int __to_gcc_order(memory_order __order) { @@ -36,14 +37,14 @@ _LIBCUDACXX_INLINE_VISIBILITY inline _LIBCUDACXX_CONSTEXPR int __to_gcc_failure_ template _LIBCUDACXX_INLINE_VISIBILITY -void __cxx_atomic_init(volatile __cxx_atomic_base_impl<_Tp>* __a, _Tp __val) { - __cxx_atomic_assign_volatile(__a->__a_value, __val); +void __cxx_atomic_init(volatile _Tp* __a, _Tp __val) { + __cxx_atomic_assign_volatile(*__a, __val); } template _LIBCUDACXX_INLINE_VISIBILITY -void __cxx_atomic_init(__cxx_atomic_base_impl<_Tp>* __a, _Tp __val) { - __a->__a_value = __val; +void __cxx_atomic_init(_Tp* __a, _Tp __val) { +__a = __val; } _LIBCUDACXX_INLINE_VISIBILITY 
inline @@ -58,55 +59,55 @@ void __cxx_atomic_signal_fence(memory_order __order) { template _LIBCUDACXX_INLINE_VISIBILITY -void __cxx_atomic_store(volatile __cxx_atomic_base_impl<_Tp>* __a, _Tp __val, +void __cxx_atomic_store(volatile _Tp* __a, _Tp __val, memory_order __order) { - __atomic_store(&__a->__a_value, &__val, + __atomic_store(__a, &__val, __to_gcc_order(__order)); } template _LIBCUDACXX_INLINE_VISIBILITY -void __cxx_atomic_store(__cxx_atomic_base_impl<_Tp>* __a, _Tp __val, +void __cxx_atomic_store(_Tp* __a, _Tp __val, memory_order __order) { - __atomic_store(&__a->__a_value, &__val, + __atomic_store(__a, &__val, __to_gcc_order(__order)); } template _LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_load(const volatile __cxx_atomic_base_impl<_Tp>* __a, +_Tp __cxx_atomic_load(const volatile _Tp* __a, memory_order __order) { _Tp __ret; - __atomic_load(&__a->__a_value, &__ret, + __atomic_load(__a, &__ret, __to_gcc_order(__order)); return __ret; } template _LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_load(const __cxx_atomic_base_impl<_Tp>* __a, memory_order __order) { +_Tp __cxx_atomic_load(const _Tp* __a, memory_order __order) { _Tp __ret; - __atomic_load(&__a->__a_value, &__ret, + __atomic_load(__a, &__ret, __to_gcc_order(__order)); return __ret; } template _LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_exchange(volatile __cxx_atomic_base_impl<_Tp>* __a, +_Tp __cxx_atomic_exchange(volatile _Tp* __a, _Tp __value, memory_order __order) { _Tp __ret; - __atomic_exchange(&__a->__a_value, &__value, &__ret, + __atomic_exchange(__a, &__value, &__ret, __to_gcc_order(__order)); return __ret; } template _LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_exchange(__cxx_atomic_base_impl<_Tp>* __a, _Tp __value, +_Tp __cxx_atomic_exchange(_Tp* __a, _Tp __value, memory_order __order) { _Tp __ret; - __atomic_exchange(&__a->__a_value, &__value, &__ret, + __atomic_exchange(__a, &__value, &__ret, __to_gcc_order(__order)); return __ret; } @@ -114,9 +115,9 @@ _Tp __cxx_atomic_exchange(__cxx_atomic_base_impl<_Tp>* __a, _Tp __value, template _LIBCUDACXX_INLINE_VISIBILITY bool __cxx_atomic_compare_exchange_strong( - volatile __cxx_atomic_base_impl<_Tp>* __a, _Tp* __expected, _Tp __value, + volatile _Tp* __a, _Tp* __expected, _Tp __value, memory_order __success, memory_order __failure) { - return __atomic_compare_exchange(&__a->__a_value, __expected, &__value, + return __atomic_compare_exchange(__a, __expected, &__value, false, __to_gcc_order(__success), __to_gcc_failure_order(__failure)); @@ -125,9 +126,9 @@ bool __cxx_atomic_compare_exchange_strong( template _LIBCUDACXX_INLINE_VISIBILITY bool __cxx_atomic_compare_exchange_strong( - __cxx_atomic_base_impl<_Tp>* __a, _Tp* __expected, _Tp __value, memory_order __success, + _Tp* __a, _Tp* __expected, _Tp __value, memory_order __success, memory_order __failure) { - return __atomic_compare_exchange(&__a->__a_value, __expected, &__value, + return __atomic_compare_exchange(__a, __expected, &__value, false, __to_gcc_order(__success), __to_gcc_failure_order(__failure)); @@ -136,9 +137,9 @@ bool __cxx_atomic_compare_exchange_strong( template _LIBCUDACXX_INLINE_VISIBILITY bool __cxx_atomic_compare_exchange_weak( - volatile __cxx_atomic_base_impl<_Tp>* __a, _Tp* __expected, _Tp __value, + volatile _Tp* __a, _Tp* __expected, _Tp __value, memory_order __success, memory_order __failure) { - return __atomic_compare_exchange(&__a->__a_value, __expected, &__value, + return __atomic_compare_exchange(__a, __expected, &__value, true, __to_gcc_order(__success), 
__to_gcc_failure_order(__failure)); @@ -147,9 +148,9 @@ bool __cxx_atomic_compare_exchange_weak( template _LIBCUDACXX_INLINE_VISIBILITY bool __cxx_atomic_compare_exchange_weak( - __cxx_atomic_base_impl<_Tp>* __a, _Tp* __expected, _Tp __value, memory_order __success, + _Tp* __a, _Tp* __expected, _Tp __value, memory_order __success, memory_order __failure) { - return __atomic_compare_exchange(&__a->__a_value, __expected, &__value, + return __atomic_compare_exchange(__a, __expected, &__value, true, __to_gcc_order(__success), __to_gcc_failure_order(__failure)); @@ -170,81 +171,81 @@ struct __skip_amt<_Tp[n]> { }; template _LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_add(volatile __cxx_atomic_base_impl<_Tp>* __a, +_Tp __cxx_atomic_fetch_add(volatile _Tp* __a, _Td __delta, memory_order __order) { - return __atomic_fetch_add(&__a->__a_value, __delta * __skip_amt<_Tp>::value, + return __atomic_fetch_add(__a, __delta * __skip_amt<_Tp>::value, __to_gcc_order(__order)); } template _LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_add(__cxx_atomic_base_impl<_Tp>* __a, _Td __delta, +_Tp __cxx_atomic_fetch_add(_Tp* __a, _Td __delta, memory_order __order) { - return __atomic_fetch_add(&__a->__a_value, __delta * __skip_amt<_Tp>::value, + return __atomic_fetch_add(__a, __delta * __skip_amt<_Tp>::value, __to_gcc_order(__order)); } template _LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_sub(volatile __cxx_atomic_base_impl<_Tp>* __a, +_Tp __cxx_atomic_fetch_sub(volatile _Tp* __a, _Td __delta, memory_order __order) { - return __atomic_fetch_sub(&__a->__a_value, __delta * __skip_amt<_Tp>::value, + return __atomic_fetch_sub(__a, __delta * __skip_amt<_Tp>::value, __to_gcc_order(__order)); } template _LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_sub(__cxx_atomic_base_impl<_Tp>* __a, _Td __delta, +_Tp __cxx_atomic_fetch_sub(_Tp* __a, _Td __delta, memory_order __order) { - return __atomic_fetch_sub(&__a->__a_value, __delta * __skip_amt<_Tp>::value, + return __atomic_fetch_sub(__a, __delta * __skip_amt<_Tp>::value, __to_gcc_order(__order)); } template _LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_and(volatile __cxx_atomic_base_impl<_Tp>* __a, +_Tp __cxx_atomic_fetch_and(volatile _Tp* __a, _Tp __pattern, memory_order __order) { - return __atomic_fetch_and(&__a->__a_value, __pattern, + return __atomic_fetch_and(__a, __pattern, __to_gcc_order(__order)); } template _LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_and(__cxx_atomic_base_impl<_Tp>* __a, +_Tp __cxx_atomic_fetch_and(_Tp* __a, _Tp __pattern, memory_order __order) { - return __atomic_fetch_and(&__a->__a_value, __pattern, + return __atomic_fetch_and(__a, __pattern, __to_gcc_order(__order)); } template _LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_or(volatile __cxx_atomic_base_impl<_Tp>* __a, +_Tp __cxx_atomic_fetch_or(volatile _Tp* __a, _Tp __pattern, memory_order __order) { - return __atomic_fetch_or(&__a->__a_value, __pattern, + return __atomic_fetch_or(__a, __pattern, __to_gcc_order(__order)); } template _LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_or(__cxx_atomic_base_impl<_Tp>* __a, _Tp __pattern, +_Tp __cxx_atomic_fetch_or(_Tp* __a, _Tp __pattern, memory_order __order) { - return __atomic_fetch_or(&__a->__a_value, __pattern, + return __atomic_fetch_or(__a, __pattern, __to_gcc_order(__order)); } template _LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_xor(volatile __cxx_atomic_base_impl<_Tp>* __a, +_Tp __cxx_atomic_fetch_xor(volatile _Tp* __a, _Tp __pattern, memory_order __order) { - return 
__atomic_fetch_xor(&__a->__a_value, __pattern, + return __atomic_fetch_xor(__a, __pattern, __to_gcc_order(__order)); } template _LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_xor(__cxx_atomic_base_impl<_Tp>* __a, _Tp __pattern, +_Tp __cxx_atomic_fetch_xor(_Tp* __a, _Tp __pattern, memory_order __order) { - return __atomic_fetch_xor(&__a->__a_value, __pattern, + return __atomic_fetch_xor(__a, __pattern, __to_gcc_order(__order)); } From 2a5fbc60a72e1f8308823f38486fba0e7776ba3f Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Fri, 30 Apr 2021 18:40:05 -0700 Subject: [PATCH 05/34] Finish atomic refactor, bones of atomic_ref are in place --- include/cuda/std/atomic | 13 +- libcxx/include/__config | 2 + libcxx/include/atomic | 13 +- libcxx/include/support/atomic/atomic_cuda.h | 355 +++++++++++--------- libcxx/include/support/atomic/atomic_gcc.h | 264 ++++++--------- 5 files changed, 318 insertions(+), 329 deletions(-) diff --git a/include/cuda/std/atomic b/include/cuda/std/atomic index f09c189c5f..1274e548f5 100644 --- a/include/cuda/std/atomic +++ b/include/cuda/std/atomic @@ -37,6 +37,7 @@ #undef ATOMIC_VAR_INIT #endif //__CUDACC_RTC__ +#include "cassert" #include "cstddef" #include "cstdint" #include "type_traits" @@ -53,6 +54,10 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA using std::detail::thread_scope; +using std::detail::thread_scope_system; +using std::detail::thread_scope_device; +using std::detail::thread_scope_block; +using std::detail::thread_scope_thread; namespace detail { using std::detail::__thread_scope_block_tag; @@ -93,15 +98,15 @@ struct atomic __host__ __device__ _Tp fetch_max(const _Tp & __op, memory_order __m = memory_order_seq_cst) volatile noexcept { - return detail::__atomic_fetch_max_cuda(&this->__a_.__a_value, __op, - __m, detail::__scope_tag<_Sco>()); + return std::detail::__atomic_fetch_max_cuda(&this->__a_.__a_value, __op, + __m, std::detail::__scope_tag<_Sco>()); } __host__ __device__ _Tp fetch_min(const _Tp & __op, memory_order __m = memory_order_seq_cst) volatile noexcept { - return detail::__atomic_fetch_min_cuda(&this->__a_.__a_value, __op, - __m, detail::__scope_tag<_Sco>()); + return std::detail::__atomic_fetch_min_cuda(&this->__a_.__a_value, __op, + __m, std::detail::__scope_tag<_Sco>()); } }; diff --git a/libcxx/include/__config b/libcxx/include/__config index 34e7f9d54d..8e5811d20e 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -10,6 +10,8 @@ #ifndef _LIBCUDACXX_CONFIG #define _LIBCUDACXX_CONFIG +#include + #if defined(_MSC_VER) && !defined(__clang__) #define _LIBCUDACXX_HAS_PRAGMA_MSVC_WARNING #if !defined(_LIBCUDACXX_DISABLE_PRAGMA_MSVC_WARNING) diff --git a/libcxx/include/atomic b/libcxx/include/atomic index a58404ebd6..9b76c97de9 100644 --- a/libcxx/include/atomic +++ b/libcxx/include/atomic @@ -687,6 +687,17 @@ namespace detail { using detail::__cxx_atomic_base_impl; using detail::__cxx_atomic_thread_fence; using detail::__cxx_atomic_signal_fence; +using detail::__cxx_atomic_load; +using detail::__cxx_atomic_store; +using detail::__cxx_atomic_exchange; +using detail::__cxx_atomic_compare_exchange_weak; +using detail::__cxx_atomic_compare_exchange_strong; +using detail::__cxx_atomic_fetch_add; +using detail::__cxx_atomic_fetch_sub; +using detail::__cxx_atomic_fetch_or; +using detail::__cxx_atomic_fetch_and; +using detail::__cxx_atomic_fetch_xor; +using detail::__cxx_atomic_is_lock_free; template _LIBCUDACXX_INLINE_VISIBILITY @@ -1028,7 +1039,7 @@ template >::type> #else template > + typename _Base = __cxx_atomic_base_impl<_Tp, 
_Sco> > #endif //_LIBCUDACXX_ATOMIC_ONLY_USE_BUILTINS struct __cxx_atomic_impl : public _Base { diff --git a/libcxx/include/support/atomic/atomic_cuda.h b/libcxx/include/support/atomic/atomic_cuda.h index 8eb0c23a38..84316359ba 100644 --- a/libcxx/include/support/atomic/atomic_cuda.h +++ b/libcxx/include/support/atomic/atomic_cuda.h @@ -13,7 +13,6 @@ #ifndef __CUDACC_RTC__ #include -#include #endif // __CUDACC_RTC__ #if !defined(__CLANG_ATOMIC_BOOL_LOCK_FREE) && !defined(__GCC_ATOMIC_BOOL_LOCK_FREE) @@ -89,8 +88,8 @@ template<> struct __scope_enum_to_tag<(int)thread_scope_device> { template<> struct __scope_enum_to_tag<(int)thread_scope_system> { using type = __thread_scope_system_tag; }; -template -__host__ __device__ auto constexpr __scope_tag() -> +template +_LIBCUDACXX_INLINE_VISIBILITY auto constexpr __scope_tag() -> typename __scope_enum_to_tag<_Scope>::type { return typename __scope_enum_to_tag<_Scope>::type(); } @@ -117,35 +116,31 @@ struct __skip_amt { enum {value = 1}; }; template struct __skip_amt<_Tp*> { enum {value = sizeof(_Tp)}; }; -// Forward-declare the function templates that are defined libcxx later. -template _LIBCUDACXX_INLINE_VISIBILITY -typename enable_if::value>::type -__cxx_atomic_assign_volatile(_Tp& __a_value, _Tv const& __val); - -template _LIBCUDACXX_INLINE_VISIBILITY -typename enable_if::value>::type -__cxx_atomic_assign_volatile(_Tp volatile& __a_value, _Tv volatile const& __val); - -__host__ __device__ inline bool __cxx_atomic_is_lock_free(size_t __x) { +_LIBCUDACXX_INLINE_VISIBILITY + bool __cxx_atomic_is_lock_free(size_t __x) { return __x <= 8; } -__host__ __device__ inline void __cxx_atomic_thread_fence(int __order) { + +_LIBCUDACXX_INLINE_VISIBILITY + void __cxx_atomic_thread_fence(memory_order __order) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( detail::__atomic_thread_fence_cuda(__order, detail::__thread_scope_system_tag()); ), NV_IS_HOST, ( - host::__atomic_thread_fence(__order); + host::__cxx_atomic_thread_fence(__order); ) ) } -__host__ __device__ inline void __cxx_atomic_signal_fence(int __order) { + +_LIBCUDACXX_INLINE_VISIBILITY + void __cxx_atomic_signal_fence(memory_order __order) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( detail::__atomic_signal_fence_cuda(__order); ), NV_IS_HOST, ( - host::__atomic_signal_fence(__order); + host::__cxx_atomic_signal_fence(__order); ) ) } @@ -153,239 +148,255 @@ __host__ __device__ inline void __cxx_atomic_signal_fence(int __order) { // Atomic storage layouts: // Implement _Sco with https://godbolt.org/z/foWdeYjEs +template +struct __cxx_atomic_base_heterogeneous_impl { + __cxx_atomic_base_heterogeneous_impl() noexcept = default; + _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR explicit + __cxx_atomic_base_heterogeneous_impl(_Tp __value) : __a_value(__value) { + } + + host::__cxx_atomic_base_impl<_Tp, _Sco> __a_value; -template -struct type { _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR - type() noexcept : __a_value() { + auto __get_device() const volatile _NOEXCEPT -> decltype(__a_value.__get_atom()) { + return __a_value.__get_atom(); } - _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR - type(_Tp __held) noexcept : __a_value(__held) { + auto __get_device() volatile _NOEXCEPT -> decltype(__a_value.__get_atom()) { + return __a_value.__get_atom(); + } + _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR + auto __get_device() const _NOEXCEPT -> decltype(__a_value.__get_atom()) { + return __a_value.__get_atom(); } - - _ALIGNAS(sizeof(_Tp)) _Tp __a_value; _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR - 
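  // __cxx_atomic_base_heterogeneous_impl keeps a single storage object (the
  // host library's __cxx_atomic_base_impl) and hands out two views of it:
  // __get_device() yields the raw pointer to the held value, which the
  // __atomic_*_cuda device functions consume, while __get_host() yields the
  // wrapper object expected by the host::__cxx_atomic_* backend, so host and
  // device code operate on the same underlying value.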
_Tp* get() _NOEXCEPT { + auto __get_host() const volatile _NOEXCEPT -> decltype(&__a_value) { + return &__a_value; + } + _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR + auto __get_host() volatile _NOEXCEPT -> decltype(&__a_value) { + return &__a_value; + } + _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR + auto __get_host() const _NOEXCEPT -> decltype(&__a_value) { return &__a_value; } -} - -template -struct __cxx_atomic_base_storage_aligned<_Tp> { - }; template -struct __cxx_atomic_base_storage_small { - using __wrapped_type = _Tp; - - __cxx_atomic_base_storage_small() noexcept = default; +struct __cxx_atomic_base_small_impl { + __cxx_atomic_base_small_impl() noexcept = default; _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR explicit - __cxx_atomic_base_storage_small(_Tp __value) : __a_held(__value) { + __cxx_atomic_base_small_impl(_Tp __value) : __a_value(__value) { } - __cxx_atomic_base_storage_aligned __a_held; + __cxx_atomic_base_heterogeneous_impl __a_value; _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR - __cxx_atomic_base_storage_aligned* get() _NOEXCEPT { - return &__a_held; + auto __get_atom() const volatile _NOEXCEPT -> decltype(&__a_value) { + return &__a_value; + } + _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR + auto __get_atom() volatile _NOEXCEPT -> decltype(&__a_value) { + return &__a_value; + } + _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR + auto __get_atom() const _NOEXCEPT -> decltype(&__a_value) { + return &__a_value; } }; -template -using __cxx_atomic_base_storage = typename conditional, - __cxx_atomic_base_storage_aligned<_Tp, _Sco> >::type; - template -using __cxx_atomic_alignment_wrapper_t = __cxx_atomic_base_storage<_Tp>; +using __cxx_small_proxy = typename conditional::type >::type; -template -__host__ __device__ __cxx_atomic_alignment_wrapper_t<_Tp> __cxx_atomic_alignment_wrap(_Tp __value) { - return __cxx_atomic_alignment_wrapper_t(__value); -} +template +using __cxx_atomic_base_impl = typename conditional, + __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco> >::type; -template -__host__ __device__ _Tp __cxx_atomic_alignment_unwrap(_Tp __value, true_type) { - return __value; -} -template -__host__ __device__ typename _Tp::__wrapped_type __cxx_atomic_alignment_unwrap(_Tp __value, false_type) { - return *__value.get(); -} -template -__host__ __device__ auto __cxx_atomic_alignment_unwrap(_Tp __value) - -> decltype(__cxx_atomic_alignment_unwrap(__value, integral_constant{})) -{ - return __cxx_atomic_alignment_unwrap(__value, integral_constant{}); +template +__host__ __device__ + void __cxx_atomic_init(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco> volatile* __a, _Tp __val) { + alignas(_Tp) auto __tmp = __val; + __cxx_atomic_assign_volatile(*__a->__get_device(), __tmp); } -template -__host__ __device__ inline void __cxx_atomic_init(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp __val) { - auto __tmp = __cxx_atomic_alignment_wrap(__val); - __cxx_atomic_assign_volatile(__a->__a_value, __tmp); -} -template -__host__ __device__ inline void __cxx_atomic_store(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp __val, int __order) { +template +__host__ __device__ + void __cxx_atomic_store(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco> volatile* __a, _Tp __val, memory_order __order) { + alignas(_Tp) auto __tmp = __val; NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - detail::__atomic_store_n_cuda(&__a->__a_value, __cxx_atomic_alignment_wrap(__val), __order, detail::__scope_tag<_Sco>()); + 
detail::__atomic_store_n_cuda(__a->__get_device(), __tmp, __order, detail::__scope_tag<_Sco>()); ), NV_IS_HOST, ( - auto __t = __cxx_atomic_alignment_wrap(__val); - host::__atomic_store(&__a->__a_value, &__t, __order); + host::__cxx_atomic_store(__a->__get_host(), __tmp, __order); ) ) } -template -__host__ __device__ inline _Tp __cxx_atomic_load(__cxx_atomic_base_impl_default<_Tp, _Sco> const volatile* __a, int __order) { + +template +__host__ __device__ + _Tp __cxx_atomic_load(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco> const volatile* __a, memory_order __order) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return __cxx_atomic_alignment_unwrap(detail::__atomic_load_n_cuda(&__a->__a_value, __order, detail::__scope_tag<_Sco>())); + return detail::__atomic_load_n_cuda(__a->__get_device(), __order, detail::__scope_tag<_Sco>()); ), NV_IS_HOST, ( - alignas(_Tp) unsigned char __buf[sizeof(_Tp)]; - auto* __dest = reinterpret_cast<_Tp*>(__buf); - host::__atomic_load(&__a->__a_value, __dest, __order); - return __cxx_atomic_alignment_unwrap(*__dest); + return host::__cxx_atomic_load(__a->__get_host(), __order); ) ) } -template -__host__ __device__ inline _Tp __cxx_atomic_exchange(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp __val, int __order) { + +template +__host__ __device__ + _Tp __cxx_atomic_exchange(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco> volatile* __a, _Tp __val, memory_order __order) { + alignas(_Tp) auto __tmp = __val; NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return __cxx_atomic_alignment_unwrap(detail::__atomic_exchange_n_cuda(&__a->__a_value, __cxx_atomic_alignment_wrap(__val), __order, detail::__scope_tag<_Sco>())); + return detail::__atomic_exchange_n_cuda(__a->__get_device(), __tmp, __order, detail::__scope_tag<_Sco>()); ), NV_IS_HOST, ( - alignas(_Tp) unsigned char __buf[sizeof(_Tp)]; - auto* __dest = reinterpret_cast<_Tp*>(__buf); - auto __t = __cxx_atomic_alignment_wrap(__val); - host::__atomic_exchange(&__a->__a_value, &__t, __dest, __order); - return __cxx_atomic_alignment_unwrap(*__dest); + return host::__cxx_atomic_exchange(__a->__get_host(), __tmp, __order); ) ) } -template -__host__ __device__ inline bool __cxx_atomic_compare_exchange_strong(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp* __expected, _Tp __val, int __success, int __failure) { - auto __tmp = __cxx_atomic_alignment_wrap(*__expected); + +template +__host__ __device__ + bool __cxx_atomic_compare_exchange_strong(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco> volatile* __a, _Tp* __expected, _Tp __val, memory_order __success, memory_order __failure) { + alignas(_Tp) auto __tmp = *__expected; bool __result = false; NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - __result = detail::__atomic_compare_exchange_n_cuda(&__a->__a_value, &__tmp, __cxx_atomic_alignment_wrap(__val), false, __success, __failure, detail::__scope_tag<_Sco>()); + alignas(_Tp) auto __tmp_v = __val; + __result = detail::__atomic_compare_exchange_cuda(__a->__get_device(), &__tmp, &__tmp_v, false, __success, __failure, detail::__scope_tag<_Sco>()); ), NV_IS_HOST, ( - __result = host::__atomic_compare_exchange(&__a->__a_value, &__tmp, &__val, false, __success, __failure); + __result = host::__cxx_atomic_compare_exchange_strong(__a->__get_host(), &__tmp, __val, __success, __failure); ) ) - *__expected = __cxx_atomic_alignment_unwrap(__tmp); + *__expected = __tmp; return __result; } -template -__host__ __device__ inline bool __cxx_atomic_compare_exchange_weak(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp* __expected, 
_Tp __val, int __success, int __failure) { - auto __tmp = __cxx_atomic_alignment_wrap(*__expected); + +template +__host__ __device__ + bool __cxx_atomic_compare_exchange_weak(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco> volatile* __a, _Tp* __expected, _Tp __val, memory_order __success, memory_order __failure) { + alignas(_Tp) auto __tmp = *__expected; bool __result = false; NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - __result = detail::__atomic_compare_exchange_n_cuda(&__a->__a_value, &__tmp, __cxx_atomic_alignment_wrap(__val), true, __success, __failure, detail::__scope_tag<_Sco>()); + alignas(_Tp) auto __tmp_v = __val; + __result = detail::__atomic_compare_exchange_cuda(__a->__get_device(), &__tmp, &__tmp_v, true, __success, __failure, detail::__scope_tag<_Sco>()); ), NV_IS_HOST, ( - __result = host::__atomic_compare_exchange(&__a->__a_value, &__tmp, &__val, true, __success, __failure); + __result = host::__cxx_atomic_compare_exchange_weak(__a->__get_host(), &__tmp, __val, __success, __failure); ) ) - *__expected = __cxx_atomic_alignment_unwrap(__tmp); + *__expected = __tmp; return __result; } -template -__host__ __device__ inline _Tp __cxx_atomic_fetch_add(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp __delta, int __order) { + +template +__host__ __device__ + _Tp __cxx_atomic_fetch_add(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco> volatile* __a, _Tp __delta, memory_order __order) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return detail::__atomic_fetch_add_cuda(&__a->__a_value, __delta, __order, detail::__scope_tag<_Sco>()); + return detail::__atomic_fetch_add_cuda(__a->__get_device(), __delta, __order, detail::__scope_tag<_Sco>()); ), NV_IS_HOST, ( - return host::__atomic_fetch_add(&__a->__a_value, __delta, __order); + return host::__cxx_atomic_fetch_add(__a->__get_host(), __delta, __order); ) ) } -template -__host__ __device__ inline _Tp* __cxx_atomic_fetch_add(__cxx_atomic_base_impl_default<_Tp*, _Sco> volatile* __a, ptrdiff_t __delta, int __order) { + +template +__host__ __device__ + _Tp* __cxx_atomic_fetch_add(__cxx_atomic_base_heterogeneous_impl<_Tp*, _Sco> volatile* __a, ptrdiff_t __delta, memory_order __order) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return detail::__atomic_fetch_add_cuda(&__a->__a_value, __delta, __order, detail::__scope_tag<_Sco>()); + return detail::__atomic_fetch_add_cuda(__a->__get_device(), __delta, __order, detail::__scope_tag<_Sco>()); ), NV_IS_HOST, ( - return host::__atomic_fetch_add(&__a->__a_value, __delta * __skip_amt<_Tp*>::value, __order); + return host::__cxx_atomic_fetch_add(__a->__get_host(), __delta, __order); ) ) } -template -__host__ __device__ inline _Tp __cxx_atomic_fetch_sub(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp __delta, int __order) { + +template +__host__ __device__ + _Tp __cxx_atomic_fetch_sub(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco> volatile* __a, _Tp __delta, memory_order __order) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return detail::__atomic_fetch_sub_cuda(&__a->__a_value, __delta, __order, detail::__scope_tag<_Sco>()); + return detail::__atomic_fetch_sub_cuda(__a->__get_device(), __delta, __order, detail::__scope_tag<_Sco>()); ), NV_IS_HOST, ( - return host::__atomic_fetch_sub(&__a->__a_value, __delta, __order); + return host::__cxx_atomic_fetch_sub(__a->__get_host(), __delta, __order); ) ) } -template -__host__ __device__ inline _Tp* __cxx_atomic_fetch_sub(__cxx_atomic_base_impl_default<_Tp*, _Sco> volatile* __a, ptrdiff_t __delta, int __order) { + +template +__host__ __device__ + _Tp* 
__cxx_atomic_fetch_sub(__cxx_atomic_base_heterogeneous_impl<_Tp*, _Sco> volatile* __a, ptrdiff_t __delta, memory_order __order) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return detail::__atomic_fetch_sub_cuda(&__a->__a_value, __delta, __order, detail::__scope_tag<_Sco>()); + return detail::__atomic_fetch_sub_cuda(__a->__get_device(), __delta, __order, detail::__scope_tag<_Sco>()); ), NV_IS_HOST, ( - return host::__atomic_fetch_sub(&__a->__a_value, __delta * __skip_amt<_Tp*>::value, __order); + return host::__cxx_atomic_fetch_sub(__a->__get_host(), __delta, __order); ) ) } -template -__host__ __device__ inline _Tp __cxx_atomic_fetch_and(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp __pattern, int __order) { + +template +__host__ __device__ + _Tp __cxx_atomic_fetch_and(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco> volatile* __a, _Tp __pattern, memory_order __order) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return detail::__atomic_fetch_and_cuda(&__a->__a_value, __pattern, __order, detail::__scope_tag<_Sco>()); + return detail::__atomic_fetch_and_cuda(__a->__get_device(), __pattern, __order, detail::__scope_tag<_Sco>()); ), NV_IS_HOST, ( - return host::__atomic_fetch_and(&__a->__a_value, __pattern, __order); + return host::__cxx_atomic_fetch_and(__a->__get_host(), __pattern, __order); ) ) } -template -__host__ __device__ inline _Tp __cxx_atomic_fetch_or(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp __pattern, int __order) { + +template +__host__ __device__ + _Tp __cxx_atomic_fetch_or(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco> volatile* __a, _Tp __pattern, memory_order __order) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return detail::__atomic_fetch_or_cuda(&__a->__a_value, __pattern, __order, detail::__scope_tag<_Sco>()); + return detail::__atomic_fetch_or_cuda(__a->__get_device(), __pattern, __order, detail::__scope_tag<_Sco>()); ), NV_IS_HOST, ( - return host::__atomic_fetch_or(&__a->__a_value, __pattern, __order); + return host::__cxx_atomic_fetch_or(__a->__get_host(), __pattern, __order); ) ) } -template -__host__ __device__ inline _Tp __cxx_atomic_fetch_xor(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp __pattern, int __order) { + +template +__host__ __device__ + _Tp __cxx_atomic_fetch_xor(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco> volatile* __a, _Tp __pattern, memory_order __order) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return detail::__atomic_fetch_xor_cuda(&__a->__a_value, __pattern, __order, detail::__scope_tag<_Sco>()); + return detail::__atomic_fetch_xor_cuda(__a->__get_device(), __pattern, __order, detail::__scope_tag<_Sco>()); ), NV_IS_HOST, ( - return host::__atomic_fetch_xor(&__a->__a_value, __pattern, __order); + return host::__cxx_atomic_fetch_xor(__a->__get_host(), __pattern, __order); ) ) } -template -using __cxx_small_proxy = typename conditional::type >::type; - template __host__ __device__ inline uint32_t __cxx_small_to_32(_Tp __val) { - __cxx_small_proxy<_Tp> __temp; + __cxx_small_proxy<_Tp> __temp = 0; memcpy(&__temp, &__val, sizeof(_Tp)); return __temp; } @@ -398,21 +409,24 @@ __host__ __device__ inline _Tp __cxx_small_from_32(uint32_t __val) { return __result; } -template -__host__ __device__ inline void __cxx_atomic_init(__cxx_atomic_base_impl_small<_Tp, _Sco> volatile* __a, _Tp __val) { - __cxx_atomic_init(&__a->__a_value, __cxx_small_to_32(__val)); +template +__host__ __device__ inline void __cxx_atomic_init(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __val) { + __cxx_atomic_init(__a->__get_atom(), 
__cxx_small_to_32(__val)); } -template -__host__ __device__ inline void __cxx_atomic_store(__cxx_atomic_base_impl_small<_Tp, _Sco> volatile* __a, _Tp __val, int __order) { - __cxx_atomic_store(&__a->__a_value, __cxx_small_to_32(__val), __order); + +template +__host__ __device__ inline void __cxx_atomic_store(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __val, memory_order __order) { + __cxx_atomic_store(__a->__get_atom(), __cxx_small_to_32(__val), __order); } -template -__host__ __device__ inline _Tp __cxx_atomic_load(__cxx_atomic_base_impl_small<_Tp, _Sco> const volatile* __a, int __order) { - return __cxx_small_from_32<_Tp>(__cxx_atomic_load(&__a->__a_value, __order)); + +template +__host__ __device__ inline _Tp __cxx_atomic_load(__cxx_atomic_base_small_impl<_Tp, _Sco> const volatile* __a, memory_order __order) { + return __cxx_small_from_32<_Tp>(__cxx_atomic_load(__a->__get_atom(), __order)); } -template -__host__ __device__ inline _Tp __cxx_atomic_exchange(__cxx_atomic_base_impl_small<_Tp, _Sco> volatile* __a, _Tp __value, int __order) { - return __cxx_small_from_32<_Tp>(__cxx_atomic_exchange(&__a->__a_value, __cxx_small_to_32(__value), __order)); + +template +__host__ __device__ inline _Tp __cxx_atomic_exchange(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __value, memory_order __order) { + return __cxx_small_from_32<_Tp>(__cxx_atomic_exchange(__a->__get_atom(), __cxx_small_to_32(__value), __order)); } __host__ __device__ inline int __cuda_memcmp(void const * __lhs, void const * __rhs, size_t __count) { @@ -430,21 +444,23 @@ inline int __cuda_memcmp(void const * __lhs, void const * __rhs, size_t __count) return memcmp(__lhs, __rhs, __count); #endif } -template -__host__ __device__ inline bool __cxx_atomic_compare_exchange_weak(__cxx_atomic_base_impl_small<_Tp, _Sco> volatile* __a, _Tp* __expected, _Tp __value, int __success, int __failure) { + +template +__host__ __device__ inline bool __cxx_atomic_compare_exchange_weak(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp* __expected, _Tp __value, memory_order __success, memory_order __failure) { auto __temp = __cxx_small_to_32(*__expected); - auto const __ret = __cxx_atomic_compare_exchange_weak(&__a->__a_value, &__temp, __cxx_small_to_32(__value), __success, __failure); + auto const __ret = __cxx_atomic_compare_exchange_weak(__a->__get_atom(), &__temp, __cxx_small_to_32(__value), __success, __failure); auto const __actual = __cxx_small_from_32<_Tp>(__temp); if(!__ret) { if(0 == __cuda_memcmp(&__actual, __expected, sizeof(_Tp))) - __cxx_atomic_fetch_and(&__a->__a_value, (1u << (8*sizeof(_Tp))) - 1, __ATOMIC_RELAXED); + __cxx_atomic_fetch_and(__a->__get_atom(), (1u << (8*sizeof(_Tp))) - 1, memory_order::memory_order_relaxed); else *__expected = __actual; } return __ret; } -template -__host__ __device__ inline bool __cxx_atomic_compare_exchange_strong(__cxx_atomic_base_impl_small<_Tp, _Sco> volatile* __a, _Tp* __expected, _Tp __value, int __success, int __failure) { + +template +__host__ __device__ inline bool __cxx_atomic_compare_exchange_strong(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp* __expected, _Tp __value, memory_order __success, memory_order __failure) { auto const __old = *__expected; while(1) { if(__cxx_atomic_compare_exchange_weak(__a, __expected, __value, __success, __failure)) @@ -453,23 +469,28 @@ __host__ __device__ inline bool __cxx_atomic_compare_exchange_strong(__cxx_atomi return false; } } -template -__host__ __device__ inline _Tp 
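// Note on the small-type compare-exchange above: the payload occupies only the
// low sizeof(_Tp) bytes of the 32-bit proxy, so a CAS can fail solely because
// of the don't-care padding bytes. When the low bytes do match the expected
// value, the padding is cleared with a relaxed fetch_and of
// (1u << (8 * sizeof(_Tp))) - 1 (0xFF for 1-byte, 0xFFFF for 2-byte payloads)
// so a subsequent attempt can succeed; otherwise the observed value is
// reported back through *__expected as usual.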
__cxx_atomic_fetch_add(__cxx_atomic_base_impl_small<_Tp, _Sco> volatile* __a, _Tp __delta, int __order) { - return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_add(&__a->__a_value, __cxx_small_to_32(__delta), __order)); + +template +__host__ __device__ inline _Tp __cxx_atomic_fetch_add(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __delta, memory_order __order) { + return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_add(__a->__get_atom(), __cxx_small_to_32(__delta), __order)); } -template -__host__ __device__ inline _Tp __cxx_atomic_fetch_sub(__cxx_atomic_base_impl_small<_Tp, _Sco> volatile* __a, _Tp __delta, int __order) { - return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_sub(&__a->__a_value, __cxx_small_to_32(__delta), __order)); + +template +__host__ __device__ inline _Tp __cxx_atomic_fetch_sub(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __delta, memory_order __order) { + return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_sub(__a->__get_atom(), __cxx_small_to_32(__delta), __order)); } -template -__host__ __device__ inline _Tp __cxx_atomic_fetch_and(__cxx_atomic_base_impl_small<_Tp, _Sco> volatile* __a, _Tp __pattern, int __order) { - return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_and(&__a->__a_value, __cxx_small_to_32(__pattern), __order)); + +template +__host__ __device__ inline _Tp __cxx_atomic_fetch_and(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __pattern, memory_order __order) { + return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_and(__a->__get_atom(), __cxx_small_to_32(__pattern), __order)); } -template -__host__ __device__ inline _Tp __cxx_atomic_fetch_or(__cxx_atomic_base_impl_small<_Tp, _Sco> volatile* __a, _Tp __pattern, int __order) { - return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_or(&__a->__a_value, __cxx_small_to_32(__pattern), __order)); + +template +__host__ __device__ inline _Tp __cxx_atomic_fetch_or(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __pattern, memory_order __order) { + return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_or(__a->__get_atom(), __cxx_small_to_32(__pattern), __order)); } -template -__host__ __device__ inline _Tp __cxx_atomic_fetch_xor(__cxx_atomic_base_impl_small<_Tp, _Sco> volatile* __a, _Tp __pattern, int __order) { - return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_xor(&__a->__a_value, __cxx_small_to_32(__pattern), __order)); + +template +__host__ __device__ inline _Tp __cxx_atomic_fetch_xor(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __pattern, memory_order __order) { + return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_xor(__a->__get_atom(), __cxx_small_to_32(__pattern), __order)); } diff --git a/libcxx/include/support/atomic/atomic_gcc.h b/libcxx/include/support/atomic/atomic_gcc.h index bf8cc8991f..1e1004820c 100644 --- a/libcxx/include/support/atomic/atomic_gcc.h +++ b/libcxx/include/support/atomic/atomic_gcc.h @@ -1,20 +1,57 @@ // -template -struct __cxx_atomic_base_storage { - _LIBCUDACXX_INLINE_VISIBILITY - __cxx_atomic_base_impl() _NOEXCEPT : __a_value() {} +template +struct __cxx_atomic_base_impl { + using __cxx_underlying_type = _Tp; + + _LIBCUDACXX_CONSTEXPR + __cxx_atomic_base_impl() _NOEXCEPT = default; - _LIBCUDACXX_CONSTEXPR explicit __cxx_atomic_base_impl(_Tp value) _NOEXCEPT - : __a_value(value) {} + _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR explicit + __cxx_atomic_base_impl(_Tp value) _NOEXCEPT : __a_value(value) {} _ALIGNAS(sizeof(_Tp)) _Tp __a_value; - _LIBCUDACXX_CONSTEXPR _Tp* get() _NOEXCEPT { - return __a_value; - } + 
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR + const volatile _Tp* __get_atom() const volatile _NOEXCEPT {return &__a_value;} + + _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR + const _Tp* __get_atom() const _NOEXCEPT {return &__a_value;} + + _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR + volatile _Tp* __get_atom() volatile _NOEXCEPT {return &__a_value;} }; +template +struct __cxx_atomic_ref_base_impl { + using __cxx_underlying_type = _Tp; + + _LIBCUDACXX_CONSTEXPR + __cxx_atomic_ref_base_impl() _NOEXCEPT = default; + + _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR explicit + __cxx_atomic_ref_base_impl(_Tp value) _NOEXCEPT : __a_value(value) {} + + _Tp* __a_value; + + _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR + const volatile _Tp* __get_atom() const volatile _NOEXCEPT {return __a_value;} + + _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR + const _Tp* __get_atom() const _NOEXCEPT {return __a_value;} + + _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR + volatile _Tp* __get_atom() volatile _NOEXCEPT {return __a_value;} +}; + +template +_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR auto __cxx_atomic_base_unwrap(_Tp* __a) _NOEXCEPT -> decltype(__a->__get_atom()) { + return __a->__get_atom(); +} + +template +using __cxx_atomic_underlying_t = typename _Tp::__cxx_underlying_type; + _LIBCUDACXX_INLINE_VISIBILITY inline _LIBCUDACXX_CONSTEXPR int __to_gcc_order(memory_order __order) { // Avoid switch statement to make this a constexpr. return __order == memory_order_relaxed ? __ATOMIC_RELAXED: @@ -35,122 +72,70 @@ _LIBCUDACXX_INLINE_VISIBILITY inline _LIBCUDACXX_CONSTEXPR int __to_gcc_failure_ __ATOMIC_CONSUME)))); } -template -_LIBCUDACXX_INLINE_VISIBILITY -void __cxx_atomic_init(volatile _Tp* __a, _Tp __val) { - __cxx_atomic_assign_volatile(*__a, __val); +template +inline void __cxx_atomic_init(volatile _Tp* __a, _Up __val) { + auto __a_tmp = __cxx_atomic_base_unwrap(__a); + __cxx_atomic_assign_volatile(*__a_tmp, __val); } -template -_LIBCUDACXX_INLINE_VISIBILITY -void __cxx_atomic_init(_Tp* __a, _Tp __val) { -__a = __val; +template +inline void __cxx_atomic_init(_Tp* __a, _Up __val) { + auto __a_tmp = __cxx_atomic_base_unwrap(__a); + __a = __val; } -_LIBCUDACXX_INLINE_VISIBILITY inline +inline void __cxx_atomic_thread_fence(memory_order __order) { __atomic_thread_fence(__to_gcc_order(__order)); } -_LIBCUDACXX_INLINE_VISIBILITY inline +inline void __cxx_atomic_signal_fence(memory_order __order) { __atomic_signal_fence(__to_gcc_order(__order)); } -template -_LIBCUDACXX_INLINE_VISIBILITY -void __cxx_atomic_store(volatile _Tp* __a, _Tp __val, +template +inline void __cxx_atomic_store(_Tp* __a, _Up __val, memory_order __order) { - __atomic_store(__a, &__val, - __to_gcc_order(__order)); + auto __a_tmp = __cxx_atomic_base_unwrap(__a); + __atomic_store(__a_tmp, &__val, __to_gcc_order(__order)); } template -_LIBCUDACXX_INLINE_VISIBILITY -void __cxx_atomic_store(_Tp* __a, _Tp __val, - memory_order __order) { - __atomic_store(__a, &__val, - __to_gcc_order(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_load(const volatile _Tp* __a, - memory_order __order) { - _Tp __ret; - __atomic_load(__a, &__ret, - __to_gcc_order(__order)); +inline auto __cxx_atomic_load(const _Tp* __a, + memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { + auto __a_tmp = __cxx_atomic_base_unwrap(__a); + __cxx_atomic_underlying_t<_Tp> __ret; + __atomic_load(__a_tmp, &__ret, __to_gcc_order(__order)); return __ret; } -template -_LIBCUDACXX_INLINE_VISIBILITY 
-_Tp __cxx_atomic_load(const _Tp* __a, memory_order __order) { - _Tp __ret; - __atomic_load(__a, &__ret, - __to_gcc_order(__order)); +template +inline auto __cxx_atomic_exchange(_Tp* __a, _Up __value, + memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { + auto __a_tmp = __cxx_atomic_base_unwrap(__a); + __cxx_atomic_underlying_t<_Tp> __ret; + __atomic_exchange(__a_tmp, &__value, &__ret, __to_gcc_order(__order)); return __ret; } -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_exchange(volatile _Tp* __a, - _Tp __value, memory_order __order) { - _Tp __ret; - __atomic_exchange(__a, &__value, &__ret, - __to_gcc_order(__order)); - return __ret; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_exchange(_Tp* __a, _Tp __value, - memory_order __order) { - _Tp __ret; - __atomic_exchange(__a, &__value, &__ret, - __to_gcc_order(__order)); - return __ret; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -bool __cxx_atomic_compare_exchange_strong( - volatile _Tp* __a, _Tp* __expected, _Tp __value, - memory_order __success, memory_order __failure) { - return __atomic_compare_exchange(__a, __expected, &__value, - false, - __to_gcc_order(__success), - __to_gcc_failure_order(__failure)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -bool __cxx_atomic_compare_exchange_strong( - _Tp* __a, _Tp* __expected, _Tp __value, memory_order __success, +template +inline bool __cxx_atomic_compare_exchange_strong( + _Tp* __a, _Up* __expected, _Up __value, memory_order __success, memory_order __failure) { - return __atomic_compare_exchange(__a, __expected, &__value, + auto __a_tmp = __cxx_atomic_base_unwrap(__a); + return __atomic_compare_exchange(__a_tmp, __expected, &__value, false, __to_gcc_order(__success), __to_gcc_failure_order(__failure)); } -template -_LIBCUDACXX_INLINE_VISIBILITY -bool __cxx_atomic_compare_exchange_weak( - volatile _Tp* __a, _Tp* __expected, _Tp __value, - memory_order __success, memory_order __failure) { - return __atomic_compare_exchange(__a, __expected, &__value, - true, - __to_gcc_order(__success), - __to_gcc_failure_order(__failure)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -bool __cxx_atomic_compare_exchange_weak( - _Tp* __a, _Tp* __expected, _Tp __value, memory_order __success, +template +inline bool __cxx_atomic_compare_exchange_weak( + _Tp* __a, _Up* __expected, _Up __value, memory_order __success, memory_order __failure) { - return __atomic_compare_exchange(__a, __expected, &__value, + auto __a_tmp = __cxx_atomic_base_unwrap(__a); + return __atomic_compare_exchange(__a_tmp, __expected, &__value, true, __to_gcc_order(__success), __to_gcc_failure_order(__failure)); @@ -170,83 +155,48 @@ template struct __skip_amt<_Tp[n]> { }; template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_add(volatile _Tp* __a, - _Td __delta, memory_order __order) { - return __atomic_fetch_add(__a, __delta * __skip_amt<_Tp>::value, +inline auto __cxx_atomic_fetch_add(_Tp* __a, _Td __delta, + memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { + constexpr auto __skip_v = __skip_amt<__cxx_atomic_underlying_t<_Tp>>::value; + auto __a_tmp = __cxx_atomic_base_unwrap(__a); + return __atomic_fetch_add(__a_tmp, __delta * __skip_v, __to_gcc_order(__order)); } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_add(_Tp* __a, _Td __delta, - memory_order __order) { - return __atomic_fetch_add(__a, __delta * __skip_amt<_Tp>::value, +inline auto __cxx_atomic_fetch_sub(_Tp* __a, _Td __delta, + memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { + constexpr auto 
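// __skip_amt scales the delta for pointer specializations: the __atomic
// fetch_add/fetch_sub builtins advance a pointer representation in bytes, so
// the delta is multiplied by sizeof(pointee) (and by 1 for arithmetic types)
// to get element-wise pointer arithmetic.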
__skip_v = __skip_amt<__cxx_atomic_underlying_t<_Tp>>::value; + auto __a_tmp = __cxx_atomic_base_unwrap(__a); + return __atomic_fetch_sub(__a_tmp, __delta * __skip_v, __to_gcc_order(__order)); } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_sub(volatile _Tp* __a, - _Td __delta, memory_order __order) { - return __atomic_fetch_sub(__a, __delta * __skip_amt<_Tp>::value, +inline auto __cxx_atomic_fetch_and(_Tp* __a, _Td __pattern, + memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { + auto __a_tmp = __cxx_atomic_base_unwrap(__a); + return __atomic_fetch_and(__a_tmp, __pattern, __to_gcc_order(__order)); } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_sub(_Tp* __a, _Td __delta, - memory_order __order) { - return __atomic_fetch_sub(__a, __delta * __skip_amt<_Tp>::value, - __to_gcc_order(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_and(volatile _Tp* __a, - _Tp __pattern, memory_order __order) { - return __atomic_fetch_and(__a, __pattern, - __to_gcc_order(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_and(_Tp* __a, - _Tp __pattern, memory_order __order) { - return __atomic_fetch_and(__a, __pattern, - __to_gcc_order(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_or(volatile _Tp* __a, - _Tp __pattern, memory_order __order) { - return __atomic_fetch_or(__a, __pattern, +inline auto __cxx_atomic_fetch_or(_Tp* __a, _Td __pattern, + memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { + auto __a_tmp = __cxx_atomic_base_unwrap(__a); + return __atomic_fetch_or(__a_tmp, __pattern, __to_gcc_order(__order)); } -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_or(_Tp* __a, _Tp __pattern, - memory_order __order) { - return __atomic_fetch_or(__a, __pattern, - __to_gcc_order(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_xor(volatile _Tp* __a, - _Tp __pattern, memory_order __order) { - return __atomic_fetch_xor(__a, __pattern, +template +inline auto __cxx_atomic_fetch_xor(_Tp* __a, _Td __pattern, + memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { + auto __a_tmp = __cxx_atomic_base_unwrap(__a); + return __atomic_fetch_xor(__a_tmp, __pattern, __to_gcc_order(__order)); } -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_xor(_Tp* __a, _Tp __pattern, - memory_order __order) { - return __atomic_fetch_xor(__a, __pattern, - __to_gcc_order(__order)); +inline constexpr + bool __cxx_atomic_is_lock_free(size_t __x) { + return __atomic_is_lock_free(__x, 0); } - -#define __cxx_atomic_is_lock_free(__s) __atomic_is_lock_free(__s, 0) From 5016ca12e1f278afb3bc6ccf37bc0aa36c9470c0 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Fri, 30 Apr 2021 18:40:38 -0700 Subject: [PATCH 06/34] Refactor a few #ifdef __CUDA__ things and fix statics/shared memory usage in tests --- .../test/cuda/bad_atomic_alignment.pass.cpp | 50 ++++++++ .../std/thread/thread.barrier/arrive.pass.cpp | 2 +- .../thread.barrier/arrive_and_drop.pass.cpp | 2 +- .../thread.barrier/arrive_and_wait.pass.cpp | 2 +- .../thread/thread.barrier/completion.pass.cpp | 4 +- .../thread.latch/arrive_and_wait.pass.cpp | 2 +- .../thread/thread.latch/count_down.pass.cpp | 2 +- .../std/thread/thread.latch/try_wait.pass.cpp | 2 +- .../thread/thread.semaphore/release.pass.cpp | 2 +- .../func.not_fn/not_fn.pass.cpp | 46 +++---- .../tuple.tuple/tuple.apply/apply.pass.cpp | 34 +++-- .../tuple.apply/apply_extended_types.pass.cpp | 118 +++++++++--------- 
.../tuple.tuple/tuple.assign/move.pass.cpp | 22 ++-- ...4_contains_ref_to_incomplete_type.pass.cpp | 73 +++++++---- .../tuple.tuple/tuple.cnstr/PR31384.pass.cpp | 34 +++-- .../pairs/pairs.pair/assign_pair.pass.cpp | 46 +++++-- .../test/support/concurrent_agents.h | 9 +- .../test/support/cuda_space_selector.h | 7 +- .upstream-tests/test/support/test_macros.h | 43 +++++++ 19 files changed, 309 insertions(+), 191 deletions(-) create mode 100644 .upstream-tests/test/cuda/bad_atomic_alignment.pass.cpp diff --git a/.upstream-tests/test/cuda/bad_atomic_alignment.pass.cpp b/.upstream-tests/test/cuda/bad_atomic_alignment.pass.cpp new file mode 100644 index 0000000000..828d4401fd --- /dev/null +++ b/.upstream-tests/test/cuda/bad_atomic_alignment.pass.cpp @@ -0,0 +1,50 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: libcpp-has-no-threads, pre-sm-60 +// UNSUPPORTED: windows && pre-sm-70 + +// + +// cuda::atomic + +// Original test issue: +// https://github.com/NVIDIA/libcudacxx/issues/160 + +#include + +template +__host__ __device__ +constexpr bool unused(T &&) {return true;} + +int main(int argc, char ** argv) +{ + // Test default aligned user type + { + struct key { + int32_t a; + int32_t b; + }; + static_assert(alignof(key) == 4); + cuda::atomic k; + auto r = k.load(); + unused(r); + } + // Test forcibly aligned user type + { + struct alignas(8) key { + int32_t a; + int32_t b; + }; + static_assert(alignof(key) == 8); + cuda::atomic k; + auto r = k.load(); + unused(r); + } + return 0; +} \ No newline at end of file diff --git a/.upstream-tests/test/std/thread/thread.barrier/arrive.pass.cpp b/.upstream-tests/test/std/thread/thread.barrier/arrive.pass.cpp index 98cac810b9..f8a2849854 100644 --- a/.upstream-tests/test/std/thread/thread.barrier/arrive.pass.cpp +++ b/.upstream-tests/test/std/thread/thread.barrier/arrive.pass.cpp @@ -25,7 +25,7 @@ __host__ __device__ void test() { Selector sel; - SHARED Barrier * b; + Barrier*& b = maybe_shared_mem(); b = sel.construct(2); #ifdef __CUDA_ARCH__ diff --git a/.upstream-tests/test/std/thread/thread.barrier/arrive_and_drop.pass.cpp b/.upstream-tests/test/std/thread/thread.barrier/arrive_and_drop.pass.cpp index 25acfe46f6..f46d31e004 100644 --- a/.upstream-tests/test/std/thread/thread.barrier/arrive_and_drop.pass.cpp +++ b/.upstream-tests/test/std/thread/thread.barrier/arrive_and_drop.pass.cpp @@ -24,7 +24,7 @@ __host__ __device__ void test() { Selector sel; - SHARED Barrier * b; + Barrier*& b = maybe_shared_mem(); b = sel.construct(2); auto dropper = LAMBDA (){ diff --git a/.upstream-tests/test/std/thread/thread.barrier/arrive_and_wait.pass.cpp b/.upstream-tests/test/std/thread/thread.barrier/arrive_and_wait.pass.cpp index 2989b9b134..d99f99006e 100644 --- a/.upstream-tests/test/std/thread/thread.barrier/arrive_and_wait.pass.cpp +++ b/.upstream-tests/test/std/thread/thread.barrier/arrive_and_wait.pass.cpp @@ -24,7 +24,7 @@ __host__ __device__ void test() { Selector sel; - SHARED Barrier * b; + Barrier*& b = maybe_shared_mem(); b = sel.construct(2); auto worker = LAMBDA (){ diff --git a/.upstream-tests/test/std/thread/thread.barrier/completion.pass.cpp 
b/.upstream-tests/test/std/thread/thread.barrier/completion.pass.cpp index e9afbe17f3..dc1aefc4fc 100644 --- a/.upstream-tests/test/std/thread/thread.barrier/completion.pass.cpp +++ b/.upstream-tests/test/std/thread/thread.barrier/completion.pass.cpp @@ -24,13 +24,13 @@ __host__ __device__ void test() { global_memory_selector int_sel; - SHARED int * x; + int*& x = maybe_shared_mem(); x = int_sel.construct(0); auto comp = LAMBDA () { *x += 1; }; Selector, Initializer> sel; - SHARED Barrier * b; + Barrier*& b = maybe_shared_mem*>(); b = sel.construct(2, comp); auto worker = LAMBDA () { diff --git a/.upstream-tests/test/std/thread/thread.latch/arrive_and_wait.pass.cpp b/.upstream-tests/test/std/thread/thread.latch/arrive_and_wait.pass.cpp index 4fa55834a1..815bb5a055 100644 --- a/.upstream-tests/test/std/thread/thread.latch/arrive_and_wait.pass.cpp +++ b/.upstream-tests/test/std/thread/thread.latch/arrive_and_wait.pass.cpp @@ -24,7 +24,7 @@ __host__ __device__ void test() { Selector sel; - SHARED Latch * l; + Latch*& l = maybe_shared_mem(); l = sel.construct(2); auto worker = LAMBDA (){ diff --git a/.upstream-tests/test/std/thread/thread.latch/count_down.pass.cpp b/.upstream-tests/test/std/thread/thread.latch/count_down.pass.cpp index b8720606bb..239eae9a88 100644 --- a/.upstream-tests/test/std/thread/thread.latch/count_down.pass.cpp +++ b/.upstream-tests/test/std/thread/thread.latch/count_down.pass.cpp @@ -24,7 +24,7 @@ __host__ __device__ void test() { Selector sel; - SHARED Latch * l; + Latch*& l = maybe_shared_mem(); l = sel.construct(2); #ifdef __CUDA_ARCH__ diff --git a/.upstream-tests/test/std/thread/thread.latch/try_wait.pass.cpp b/.upstream-tests/test/std/thread/thread.latch/try_wait.pass.cpp index 7d650ce692..f6b6d9e027 100644 --- a/.upstream-tests/test/std/thread/thread.latch/try_wait.pass.cpp +++ b/.upstream-tests/test/std/thread/thread.latch/try_wait.pass.cpp @@ -24,7 +24,7 @@ __host__ __device__ void test() { Selector sel; - SHARED Latch * l; + Latch*& l = maybe_shared_mem(); l = sel.construct(1); l->count_down(); diff --git a/.upstream-tests/test/std/thread/thread.semaphore/release.pass.cpp b/.upstream-tests/test/std/thread/thread.semaphore/release.pass.cpp index 28b8986ac2..8efb47490a 100644 --- a/.upstream-tests/test/std/thread/thread.semaphore/release.pass.cpp +++ b/.upstream-tests/test/std/thread/thread.semaphore/release.pass.cpp @@ -24,7 +24,7 @@ __host__ __device__ void test() { Selector sel; - SHARED Semaphore * s; + Semaphore*& s = maybe_shared_mem(); s = sel.construct(2); #ifdef __CUDA_ARCH__ diff --git a/.upstream-tests/test/std/utilities/function.objects/func.not_fn/not_fn.pass.cpp b/.upstream-tests/test/std/utilities/function.objects/func.not_fn/not_fn.pass.cpp index 63b0e6d09b..4638cc657b 100644 --- a/.upstream-tests/test/std/utilities/function.objects/func.not_fn/not_fn.pass.cpp +++ b/.upstream-tests/test/std/utilities/function.objects/func.not_fn/not_fn.pass.cpp @@ -166,16 +166,9 @@ inline constexpr CallType operator|(CallType LHS, CallType RHS) { #if 0 -#ifdef __CUDA_ARCH__ -__device__ -#endif -CallType ForwardingCallObject_last_call_type = CT_None; -#ifdef __CUDA_ARCH__ -__device__ -#endif -TypeID const* ForwardingCallObject_last_call_args = nullptr; - struct ForwardingCallObject { + STATIC_MEMBER_VAR(ForwardingCallObject_last_call_type, CallType) + STATIC_MEMBER_VAR(ForwardingCallObject_last_call_args, TypeID const*) template __host__ __device__ @@ -209,21 +202,21 @@ struct ForwardingCallObject { template __host__ __device__ static void set_call(CallType type) 
{ - assert(ForwardingCallObject_last_call_type == CT_None); - assert(ForwardingCallObject_last_call_args == nullptr); - ForwardingCallObject_last_call_type = type; - ForwardingCallObject_last_call_args = &makeArgumentID(); + assert(ForwardingCallObject_last_call_type() == CT_None); + assert(ForwardingCallObject_last_call_args() == nullptr); + ForwardingCallObject_last_call_type() = type; + ForwardingCallObject_last_call_args() = &makeArgumentID(); } template __host__ __device__ static bool check_call(CallType type) { bool result = - ForwardingCallObject_last_call_type == type - && ForwardingCallObject_last_call_args - && *ForwardingCallObject_last_call_args == makeArgumentID(); - ForwardingCallObject_last_call_type = CT_None; - ForwardingCallObject_last_call_args = nullptr; + ForwardingCallObject_last_call_type() == type + && ForwardingCallObject_last_call_args() + && *ForwardingCallObject_last_call_args() == makeArgumentID(); + ForwardingCallObject_last_call_type() = CT_None; + ForwardingCallObject_last_call_args() = nullptr; return result; } }; @@ -235,18 +228,15 @@ struct ForwardingCallObject { // BOOL TEST TYPES /////////////////////////////////////////////////////////////////////////////// -#ifdef __CUDA_ARCH__ -__device__ -#endif -int EvilBool_bang_called = 0; - struct EvilBool { + STATIC_MEMBER_VAR(EvilBool_bang_called, int) + EvilBool(EvilBool const&) = default; EvilBool(EvilBool&&) = default; __host__ __device__ friend EvilBool operator!(EvilBool const& other) { - ++EvilBool_bang_called; + ++EvilBool_bang_called(); return EvilBool{!other.value}; } @@ -409,12 +399,12 @@ void return_type_tests() using T = CopyCallable; auto ret = cuda::std::not_fn(T{false}); static_assert(is_same::value, ""); - EvilBool_bang_called = 0; + EvilBool::EvilBool_bang_called() = 0; auto value_ret = ret(); - assert(EvilBool_bang_called == 1); + assert(EvilBool::EvilBool_bang_called() == 1); assert(value_ret.value == true); ret(); - assert(EvilBool_bang_called == 2); + assert(EvilBool::EvilBool_bang_called() == 2); } } @@ -527,6 +517,8 @@ void call_operator_sfinae_test() { __host__ __device__ void call_operator_forwarding_test() { + ForwardingCallObject::ForwardingCallObject_last_call_type() = CT_None; + ForwardingCallObject::ForwardingCallObject_last_call_args() = nullptr; using Fn = ForwardingCallObject; auto obj = cuda::std::not_fn(Fn{}); const auto& c_obj = obj; diff --git a/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.apply/apply.pass.cpp b/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.apply/apply.pass.cpp index f7178048a8..64dc0608c7 100644 --- a/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.apply/apply.pass.cpp +++ b/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.apply/apply.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// UNSUPPORTED: c++98, c++03, c++11, c++14 +// UNSUPPORTED: c++98, c++03, c++11, c++14 // UNSUPPORTED: nvrtc // @@ -239,11 +239,9 @@ void test_noexcept() } namespace ReturnTypeTest { - #ifdef __CUDA_ARCH__ - __constant__ int my_int = 42; - #else - static int my_int = 42; - #endif + struct global { + STATIC_MEMBER_VAR(my_int, int) + }; template struct index {}; @@ -254,31 +252,31 @@ namespace ReturnTypeTest { int f(index<1>) { return 0; } __host__ __device__ - int & f(index<2>) { return static_cast(my_int); } + int & f(index<2>) { return static_cast(global::my_int()); } __host__ __device__ - int const & f(index<3>) { return static_cast(my_int); } + int const & f(index<3>) 
{ return static_cast(global::my_int()); } __host__ __device__ - int volatile & f(index<4>) { return static_cast(my_int); } + int volatile & f(index<4>) { return static_cast(global::my_int()); } __host__ __device__ - int const volatile & f(index<5>) { return static_cast(my_int); } + int const volatile & f(index<5>) { return static_cast(global::my_int()); } __host__ __device__ - int && f(index<6>) { return static_cast(my_int); } + int && f(index<6>) { return static_cast(global::my_int()); } __host__ __device__ - int const && f(index<7>) { return static_cast(my_int); } + int const && f(index<7>) { return static_cast(global::my_int()); } __host__ __device__ - int volatile && f(index<8>) { return static_cast(my_int); } + int volatile && f(index<8>) { return static_cast(global::my_int()); } __host__ __device__ - int const volatile && f(index<9>) { return static_cast(my_int); } + int const volatile && f(index<9>) { return static_cast(global::my_int()); } __host__ __device__ - int * f(index<10>) { return static_cast(&my_int); } + int * f(index<10>) { return static_cast(&global::my_int()); } __host__ __device__ - int const * f(index<11>) { return static_cast(&my_int); } + int const * f(index<11>) { return static_cast(&global::my_int()); } __host__ __device__ - int volatile * f(index<12>) { return static_cast(&my_int); } + int volatile * f(index<12>) { return static_cast(&global::my_int()); } __host__ __device__ - int const volatile * f(index<13>) { return static_cast(&my_int); } + int const volatile * f(index<13>) { return static_cast(&global::my_int()); } template __host__ __device__ diff --git a/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.apply/apply_extended_types.pass.cpp b/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.apply/apply_extended_types.pass.cpp index 4908965027..786d3208a3 100644 --- a/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.apply/apply_extended_types.pass.cpp +++ b/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.apply/apply_extended_types.pass.cpp @@ -8,7 +8,7 @@ -// UNSUPPORTED: c++98, c++03, c++11, c++14 +// UNSUPPORTED: c++98, c++03, c++11, c++14 // @@ -31,18 +31,16 @@ #include "test_macros.h" #include "disable_missing_braces_warning.h" -#ifdef __CUDA_ARCH__ -__device__ int count = 0; -#else -int count = 0; -#endif +struct global_state { + STATIC_MEMBER_VAR(count, int) +}; struct A_int_0 { __host__ __device__ A_int_0() : obj1(0){} __host__ __device__ A_int_0(int x) : obj1(x) {} - __host__ __device__ int mem1() { return ++count; } - __host__ __device__ int mem2() const { return ++count; } + __host__ __device__ int mem1() { return ++global_state::count(); } + __host__ __device__ int mem2() const { return ++global_state::count(); } int const obj1; }; @@ -50,16 +48,16 @@ struct A_int_1 { __host__ __device__ A_int_1() {} __host__ __device__ A_int_1(int) {} - __host__ __device__ int mem1(int x) { return count += x; } - __host__ __device__ int mem2(int x) const { return count += x; } + __host__ __device__ int mem1(int x) { return global_state::count() += x; } + __host__ __device__ int mem2(int x) const { return global_state::count() += x; } }; struct A_int_2 { __host__ __device__ A_int_2() {} __host__ __device__ A_int_2(int) {} - __host__ __device__ int mem1(int x, int y) { return count += (x + y); } - __host__ __device__ int mem2(int x, int y) const { return count += (x + y); } + __host__ __device__ int mem1(int x, int y) { return global_state::count() += (x + y); } + __host__ __device__ int mem2(int x, int y) const { return 
global_state::count() += (x + y); } }; template @@ -98,7 +96,7 @@ template < __host__ __device__ void test_ext_int_0() { - count = 0; + global_state::count() = 0; typedef A_int_0 T; typedef A_wrap_0 Wrap; typedef A_base_0 Base; @@ -117,63 +115,63 @@ void test_ext_int_0() T a; Tuple t{a}; assert(1 == cuda::std::apply(mem1, t)); - assert(count == 1); + assert(global_state::count() == 1); } - count = 0; + global_state::count() = 0; // member function w/pointer { T a; TuplePtr t{&a}; assert(1 == cuda::std::apply(mem1, t)); - assert(count == 1); + assert(global_state::count() == 1); } - count = 0; + global_state::count() = 0; // member function w/base { Base a; TupleBase t{a}; assert(1 == cuda::std::apply(mem1, t)); - assert(count == 1); + assert(global_state::count() == 1); } - count = 0; + global_state::count() = 0; // member function w/wrap { Wrap a; TupleWrap t{a}; assert(1 == cuda::std::apply(mem1, t)); - assert(count == 1); + assert(global_state::count() == 1); } - count = 0; + global_state::count() = 0; // const member function w/ref { T const a; ConstTuple t{a}; assert(1 == cuda::std::apply(mem2, t)); - assert(count == 1); + assert(global_state::count() == 1); } - count = 0; + global_state::count() = 0; // const member function w/pointer { T const a; ConstTuplePtr t{&a}; assert(1 == cuda::std::apply(mem2, t)); - assert(count == 1); + assert(global_state::count() == 1); } - count = 0; + global_state::count() = 0; // const member function w/base { Base const a; ConstTupleBase t{a}; assert(1 == cuda::std::apply(mem2, t)); - assert(count == 1); + assert(global_state::count() == 1); } - count = 0; + global_state::count() = 0; // const member function w/wrapper { Wrap const a; ConstTupleWrap t{a}; assert(1 == cuda::std::apply(mem2, t)); - assert(1 == count); + assert(1 == global_state::count()); } // member object w/ref { @@ -211,7 +209,7 @@ template < __host__ __device__ void test_ext_int_1() { - count = 0; + global_state::count() = 0; typedef A_int_1 T; typedef A_wrap_1 Wrap; typedef A_base_1 Base; @@ -227,63 +225,63 @@ void test_ext_int_1() T a; Tuple t{a, 2}; assert(2 == cuda::std::apply(mem1, t)); - assert(count == 2); + assert(global_state::count() == 2); } - count = 0; + global_state::count() = 0; // member function w/pointer { T a; TuplePtr t{&a, 3}; assert(3 == cuda::std::apply(mem1, t)); - assert(count == 3); + assert(global_state::count() == 3); } - count = 0; + global_state::count() = 0; // member function w/base { Base a; TupleBase t{a, 4}; assert(4 == cuda::std::apply(mem1, t)); - assert(count == 4); + assert(global_state::count() == 4); } - count = 0; + global_state::count() = 0; // member function w/wrap { Wrap a; TupleWrap t{a, 5}; assert(5 == cuda::std::apply(mem1, t)); - assert(count == 5); + assert(global_state::count() == 5); } - count = 0; + global_state::count() = 0; // const member function w/ref { T const a; ConstTuple t{a, 6}; assert(6 == cuda::std::apply(mem2, t)); - assert(count == 6); + assert(global_state::count() == 6); } - count = 0; + global_state::count() = 0; // const member function w/pointer { T const a; ConstTuplePtr t{&a, 7}; assert(7 == cuda::std::apply(mem2, t)); - assert(count == 7); + assert(global_state::count() == 7); } - count = 0; + global_state::count() = 0; // const member function w/base { Base const a; ConstTupleBase t{a, 8}; assert(8 == cuda::std::apply(mem2, t)); - assert(count == 8); + assert(global_state::count() == 8); } - count = 0; + global_state::count() = 0; // const member function w/wrapper { Wrap const a; ConstTupleWrap t{a, 9}; 
assert(9 == cuda::std::apply(mem2, t)); - assert(9 == count); + assert(9 == global_state::count()); } } @@ -297,7 +295,7 @@ template < __host__ __device__ void test_ext_int_2() { - count = 0; + global_state::count() = 0; typedef A_int_2 T; typedef A_wrap_2 Wrap; typedef A_base_2 Base; @@ -313,63 +311,63 @@ void test_ext_int_2() T a; Tuple t{a, 1, 1}; assert(2 == cuda::std::apply(mem1, t)); - assert(count == 2); + assert(global_state::count() == 2); } - count = 0; + global_state::count() = 0; // member function w/pointer { T a; TuplePtr t{&a, 1, 2}; assert(3 == cuda::std::apply(mem1, t)); - assert(count == 3); + assert(global_state::count() == 3); } - count = 0; + global_state::count() = 0; // member function w/base { Base a; TupleBase t{a, 2, 2}; assert(4 == cuda::std::apply(mem1, t)); - assert(count == 4); + assert(global_state::count() == 4); } - count = 0; + global_state::count() = 0; // member function w/wrap { Wrap a; TupleWrap t{a, 2, 3}; assert(5 == cuda::std::apply(mem1, t)); - assert(count == 5); + assert(global_state::count() == 5); } - count = 0; + global_state::count() = 0; // const member function w/ref { T const a; ConstTuple t{a, 3, 3}; assert(6 == cuda::std::apply(mem2, t)); - assert(count == 6); + assert(global_state::count() == 6); } - count = 0; + global_state::count() = 0; // const member function w/pointer { T const a; ConstTuplePtr t{&a, 3, 4}; assert(7 == cuda::std::apply(mem2, t)); - assert(count == 7); + assert(global_state::count() == 7); } - count = 0; + global_state::count() = 0; // const member function w/base { Base const a; ConstTupleBase t{a, 4, 4}; assert(8 == cuda::std::apply(mem2, t)); - assert(count == 8); + assert(global_state::count() == 8); } - count = 0; + global_state::count() = 0; // const member function w/wrapper { Wrap const a; ConstTupleWrap t{a, 4, 5}; assert(9 == cuda::std::apply(mem2, t)); - assert(9 == count); + assert(9 == global_state::count()); } } diff --git a/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.assign/move.pass.cpp b/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.assign/move.pass.cpp index 6254d9d497..7ba86b4d41 100644 --- a/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.assign/move.pass.cpp +++ b/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.assign/move.pass.cpp @@ -14,7 +14,7 @@ // tuple& operator=(tuple&& u); -// UNSUPPORTED: c++98, c++03 +// UNSUPPORTED: c++98, c++03 #include #include @@ -41,19 +41,15 @@ struct MoveAssignable { MoveAssignable& operator=(MoveAssignable&&) = default; }; -#ifdef __CUDA_ARCH__ -__device__ static int copied = 0; -__device__ static int moved = 0; -#else -static int copied = 0; -static int moved = 0; -#endif struct CountAssign { - __host__ __device__ static void reset() { copied = moved = 0; } + STATIC_MEMBER_VAR(copied, int) + STATIC_MEMBER_VAR(moved, int) + + __host__ __device__ static void reset() { copied() = moved() = 0; } CountAssign() = default; - __host__ __device__ CountAssign& operator=(CountAssign const&) { ++copied; return *this; } - __host__ __device__ CountAssign& operator=(CountAssign&&) { ++moved; return *this; } + __host__ __device__ CountAssign& operator=(CountAssign const&) { ++copied(); return *this; } + __host__ __device__ CountAssign& operator=(CountAssign&&) { ++moved(); return *this; } }; int main(int, char**) @@ -130,8 +126,8 @@ int main(int, char**) T t1; T t2; t1 = cuda::std::move(t2); - assert(copied == 1); - assert(moved == 0); + assert(CountAssign::copied() == 1); + assert(CountAssign::moved() == 0); } return 0; diff 
--git a/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR27684_contains_ref_to_incomplete_type.pass.cpp b/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR27684_contains_ref_to_incomplete_type.pass.cpp index 9451c66b06..c094ea54bd 100644 --- a/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR27684_contains_ref_to_incomplete_type.pass.cpp +++ b/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR27684_contains_ref_to_incomplete_type.pass.cpp @@ -29,41 +29,66 @@ #include "test_macros.h" struct IncompleteType; -#ifdef __CUDA_ARCH__ -__device__ extern IncompleteType inc1; -__device__ extern IncompleteType inc2; -__device__ IncompleteType const& cinc1 = inc1; -__device__ IncompleteType const& cinc2 = inc2; -#else -extern IncompleteType inc1; -extern IncompleteType inc2; -IncompleteType const& cinc1 = inc1; -IncompleteType const& cinc2 = inc2; -#endif + +#define STATIC_EXTERN_DECL(name, type) \ + __device__ static type& name##_device(); \ + __host__ static type& name##_host(); \ + __host__ __device__ static type& name(); + +struct global { + STATIC_EXTERN_DECL(inc1, IncompleteType) + STATIC_EXTERN_DECL(inc2, IncompleteType) + __host__ __device__ static const IncompleteType& cinc1(); + __host__ __device__ static const IncompleteType& cinc2(); +}; int main(int, char**) { using IT = IncompleteType; { // try calling tuple(Tp const&...) using Tup = cuda::std::tuple; - Tup t(cinc1, cinc2); - assert(&cuda::std::get<0>(t) == &inc1); - assert(&cuda::std::get<1>(t) == &inc2); + Tup t(global::cinc1(), global::cinc2()); + assert(&cuda::std::get<0>(t) == &global::inc1()); + assert(&cuda::std::get<1>(t) == &global::inc2()); } { // try calling tuple(Up&&...) using Tup = cuda::std::tuple; - Tup t(inc1, inc2); - assert(&cuda::std::get<0>(t) == &inc1); - assert(&cuda::std::get<1>(t) == &inc2); + Tup t(global::inc1(), global::inc2()); + assert(&cuda::std::get<0>(t) == &global::inc1()); + assert(&cuda::std::get<1>(t) == &global::inc2()); } return 0; } struct IncompleteType {}; -#ifdef __CUDA_ARCH__ -__device__ IncompleteType inc1; -__device__ IncompleteType inc2; -#else -IncompleteType inc1; -IncompleteType inc2; -#endif + +#define STATIC_EXTERN_IMPL(name, type) \ + __device__ type& name##_device() { \ + __shared__ type v; \ + return v; \ + } \ + __host__ type& name##_host() { \ + static type v; \ + return v; \ + } \ + type& name() { \ + NV_DISPATCH_TARGET( \ + NV_IS_DEVICE, ( \ + return name##_device(); \ + ), \ + NV_IS_HOST, ( \ + return name##_host(); \ + ) \ + ) \ + } + +STATIC_EXTERN_IMPL(global::inc1, IncompleteType) +STATIC_EXTERN_IMPL(global::inc2, IncompleteType) + +__host__ __device__ const IncompleteType& global::cinc1() { + return inc1(); +} + +__host__ __device__ const IncompleteType& global::cinc2() { + return inc2(); +} diff --git a/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR31384.pass.cpp b/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR31384.pass.cpp index 3817209271..b593d93122 100644 --- a/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR31384.pass.cpp +++ b/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR31384.pass.cpp @@ -23,11 +23,9 @@ #include "test_macros.h" -#ifdef __CUDA_ARCH__ -__device__ int count = 0; -#else -int count = 0; -#endif +struct global { + STATIC_MEMBER_VAR(count, int) +}; struct Explicit { Explicit() = default; @@ -43,7 +41,7 @@ template struct Derived : cuda::std::tuple { using cuda::std::tuple::tuple; template - __host__ 
__device__ operator cuda::std::tuple() && { ++count; return {}; } + __host__ __device__ operator cuda::std::tuple() && { ++global::count(); return {}; } }; @@ -51,31 +49,31 @@ template struct ExplicitDerived : cuda::std::tuple { using cuda::std::tuple::tuple; template - __host__ __device__ explicit operator cuda::std::tuple() && { ++count; return {}; } + __host__ __device__ explicit operator cuda::std::tuple() && { ++global::count(); return {}; } }; int main(int, char**) { { cuda::std::tuple foo = Derived{42}; ((void)foo); - assert(count == 1); + assert(global::count() == 1); cuda::std::tuple bar(Derived{42}); ((void)bar); - assert(count == 2); + assert(global::count() == 2); } - count = 0; + global::count() = 0; { cuda::std::tuple foo = Derived{42}; ((void)foo); - assert(count == 1); + assert(global::count() == 1); cuda::std::tuple bar(Derived{42}); ((void)bar); - assert(count == 2); + assert(global::count() == 2); } - count = 0; + global::count() = 0; { static_assert(!cuda::std::is_convertible< ExplicitDerived, cuda::std::tuple>::value, ""); cuda::std::tuple bar(ExplicitDerived{42}); ((void)bar); - assert(count == 1); + assert(global::count() == 1); } - count = 0; + global::count() = 0; { // FIXME: Libc++ incorrectly rejects this code. #ifndef _LIBCUDACXX_VERSION @@ -88,11 +86,11 @@ int main(int, char**) { ExplicitDerived, cuda::std::tuple>::value, "libc++ incorrectly rejects this"); #endif - assert(count == 0); + assert(global::count() == 0); cuda::std::tuple bar(ExplicitDerived{42}); ((void)bar); - assert(count == 1); + assert(global::count() == 1); } - count = 0; + global::count() = 0; return 0; diff --git a/.upstream-tests/test/std/utilities/utility/pairs/pairs.pair/assign_pair.pass.cpp b/.upstream-tests/test/std/utilities/utility/pairs/pairs.pair/assign_pair.pass.cpp index e77004b15f..217c8a04c7 100644 --- a/.upstream-tests/test/std/utilities/utility/pairs/pairs.pair/assign_pair.pass.cpp +++ b/.upstream-tests/test/std/utilities/utility/pairs/pairs.pair/assign_pair.pass.cpp @@ -45,6 +45,7 @@ struct MoveAssignable { struct CountAssign { STATIC_MEMBER_VAR(copied, int); STATIC_MEMBER_VAR(moved, int); + __host__ __device__ static void reset() { copied() = moved() = 0; } CountAssign() = default; __host__ __device__ CountAssign& operator=(CountAssign const&) { ++copied(); return *this; } @@ -52,11 +53,15 @@ struct CountAssign { }; struct Incomplete; -#ifdef __CUDA_ARCH__ -__device__ extern Incomplete inc_obj; -#else -extern Incomplete inc_obj; -#endif + +#define STATIC_EXTERN_DECL(name, type) \ + __device__ static type& name##_device(); \ + __host__ static type& name##_host(); \ + __host__ __device__ static type& name(); + +struct global { + STATIC_EXTERN_DECL(inc_obj, Incomplete) +}; int main(int, char**) { @@ -100,17 +105,34 @@ int main(int, char**) { using P = cuda::std::pair; static_assert(!cuda::std::is_copy_assignable
<P>
::value, ""); - P p(42, inc_obj); + P p(42, global::inc_obj()); unused(p); - assert(&p.second == &inc_obj); + assert(&p.second == &global::inc_obj()); } return 0; } struct Incomplete {}; -#ifdef __CUDA_ARCH__ -__device__ Incomplete inc_obj; -#else -Incomplete inc_obj; -#endif + +#define STATIC_EXTERN_IMPL(name, type) \ + __device__ type& name##_device() { \ + __shared__ type v; \ + return v; \ + } \ + __host__ type& name##_host() { \ + static type v; \ + return v; \ + } \ + type& name() { \ + NV_DISPATCH_TARGET( \ + NV_IS_DEVICE, ( \ + return name##_device(); \ + ), \ + NV_IS_HOST, ( \ + return name##_host(); \ + ) \ + ) \ + } + +STATIC_EXTERN_IMPL(global::inc_obj, Incomplete) diff --git a/.upstream-tests/test/support/concurrent_agents.h b/.upstream-tests/test/support/concurrent_agents.h index 6836c17884..ab2a9fef76 100644 --- a/.upstream-tests/test/support/concurrent_agents.h +++ b/.upstream-tests/test/support/concurrent_agents.h @@ -11,16 +11,15 @@ #include #endif +#if defined(__CUDA_MINIMUM_ARCH__) && __CUDA_MINIMUM_ARCH__ < 350 + #error "This test requires CUDA dynamic parallelism to work." +#endif + template __host__ __device__ void concurrent_agents_launch(Fs ...fs) { #ifdef __CUDA_ARCH__ - - #if __CUDA_ARCH__ < 350 - #error "This test requires CUDA dynamic parallelism to work." - #endif - assert(blockDim.x == sizeof...(Fs)); using fptr = void (*)(void *); diff --git a/.upstream-tests/test/support/cuda_space_selector.h b/.upstream-tests/test/support/cuda_space_selector.h index 026a46c268..c15714c327 100644 --- a/.upstream-tests/test/support/cuda_space_selector.h +++ b/.upstream-tests/test/support/cuda_space_selector.h @@ -21,11 +21,8 @@ #define LAMBDA [=] __host__ __device__ #endif -#ifdef __CUDA_ARCH__ -#define SHARED __shared__ -#else -#define SHARED -#endif +#pragma diag_suppress 941 +#pragma diag_suppress 1057 template struct malloc_memory_provider { diff --git a/.upstream-tests/test/support/test_macros.h b/.upstream-tests/test/support/test_macros.h index ed7306f2c5..1ecca66677 100644 --- a/.upstream-tests/test/support/test_macros.h +++ b/.upstream-tests/test/support/test_macros.h @@ -346,6 +346,49 @@ inline void DoNotOptimize(Tp const& value) { #define TEST_NOINLINE #endif +template +__device__ _Tp& maybe_shared_mem_device() { + __shared__ _Tp v; + return v; +} + +template +__host__ _Tp& maybe_shared_mem_host() { + static _Tp v; + return v; +} + +template +__host__ __device__ _Tp& maybe_shared_mem() { + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + return maybe_shared_mem_device<_Tp>(); + ), + NV_IS_HOST, ( + return maybe_shared_mem_host<_Tp>(); + ) + ) +} +#define STATIC_MEMBER_VAR(name, type) \ + __device__ static type& name##_device() { \ + __shared__ type v; \ + return v; \ + } \ + __host__ static type& name##_host() { \ + static type v; \ + return v; \ + } \ + __host__ __device__ static type& name() { \ + NV_DISPATCH_TARGET( \ + NV_IS_DEVICE, ( \ + return name##_device(); \ + ), \ + NV_IS_HOST, ( \ + return name##_host(); \ + ) \ + ) \ + } + // NVCC can't handle static member variables, so with a little care // a function returning a reference will result in the same thing #ifdef __CUDA_ARCH__ From a95105959073515d5e9e9a0df7cd643fe785f4cf Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Thu, 13 May 2021 16:46:37 -0700 Subject: [PATCH 07/34] Implement CUDA atomic ref, implement MSVC atomics layer, fix several test utilities --- ...peline_memcpy_async_thread_scope_generic.h | 43 ++-- .../atomics.flag/atomic_flag_wait.pass.cpp | 2 + .../atomic_wait.pass.cpp | 2 + 
.../test/support/cuda_space_selector.h | 27 ++- libcxx/include/support/atomic/atomic_cuda.h | 68 ++++--- libcxx/include/support/atomic/atomic_gcc.h | 8 + libcxx/include/support/atomic/atomic_msvc.h | 188 +++++++++++++++++- 7 files changed, 277 insertions(+), 61 deletions(-) diff --git a/.upstream-tests/test/cuda/pipeline_memcpy_async_thread_scope_generic.h b/.upstream-tests/test/cuda/pipeline_memcpy_async_thread_scope_generic.h index fbba4cef8d..a6f79f8925 100644 --- a/.upstream-tests/test/cuda/pipeline_memcpy_async_thread_scope_generic.h +++ b/.upstream-tests/test/cuda/pipeline_memcpy_async_thread_scope_generic.h @@ -35,13 +35,7 @@ void test_fully_specialized() T * dest = dest_sel.construct(static_cast(0)); cuda::pipeline_shared_state * pipe_state = pipe_state_sel.construct(); -#ifdef __CUDA_ARCH__ - auto group = cooperative_groups::this_thread_block(); -#else - auto group = cuda::__single_thread_group{}; -#endif - - auto pipe = make_pipeline(group, pipe_state); + auto pipe = cuda::make_pipeline(group, pipe_state); assert(*source == 12); assert(*dest == 0); @@ -91,11 +85,16 @@ __host__ __device__ __noinline__ void test_select_pipeline() { constexpr uint8_t stages_count = 2; - test_fully_specialized(); -#ifdef __CUDA_ARCH__ - test_fully_specialized(); - test_fully_specialized(); -#endif + + auto singleGroup = cuda::__single_thread_group{}; + test_fully_specialized(singleGroup); + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + auto group = cooperative_groups::this_thread_block(); + test_fully_specialized(group); + test_fully_specialized(group); + ) + ) } template < @@ -107,10 +106,12 @@ __host__ __device__ __noinline__ void test_select_destination() { test_select_pipeline(); -#ifdef __CUDA_ARCH__ - test_select_pipeline(); - test_select_pipeline(); -#endif + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + test_select_pipeline(); + test_select_pipeline(); + ) + ) } template @@ -118,8 +119,10 @@ __host__ __device__ __noinline__ void test_select_source() { test_select_destination(); -#ifdef __CUDA_ARCH__ - test_select_destination(); - test_select_destination(); -#endif + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + // test_select_destination(); + // test_select_destination(); + ) + ) } diff --git a/.upstream-tests/test/std/atomics/atomics.flag/atomic_flag_wait.pass.cpp b/.upstream-tests/test/std/atomics/atomics.flag/atomic_flag_wait.pass.cpp index 52b92276bb..8a13854927 100644 --- a/.upstream-tests/test/std/atomics/atomics.flag/atomic_flag_wait.pass.cpp +++ b/.upstream-tests/test/std/atomics/atomics.flag/atomic_flag_wait.pass.cpp @@ -9,6 +9,8 @@ // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: c++98, c++03 // UNSUPPORTED: pre-sm-70 +// NVC++ does not support GPU function pointers +// UNSUPPORTED: pgi // diff --git a/.upstream-tests/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait.pass.cpp b/.upstream-tests/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait.pass.cpp index 2dc72caeec..63b2caa2bb 100644 --- a/.upstream-tests/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait.pass.cpp +++ b/.upstream-tests/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait.pass.cpp @@ -9,6 +9,8 @@ // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: c++98, c++03 // UNSUPPORTED: pre-sm-70 +// NVC++ does not support GPU function pointers +// UNSUPPORTED: pgi // diff --git a/.upstream-tests/test/support/cuda_space_selector.h b/.upstream-tests/test/support/cuda_space_selector.h index 
c15714c327..9584d02bd4 100644 --- a/.upstream-tests/test/support/cuda_space_selector.h +++ b/.upstream-tests/test/support/cuda_space_selector.h @@ -176,19 +176,30 @@ class memory_selector return ptr; } + __device__ void destruct_device() { + if (threadIdx.x == 0) { + ptr->~T(); + } + __syncthreads(); + } + + __host__ void destruct_host() { + ptr->~T(); + } + #ifndef __CUDACC_RTC__ __exec_check_disable__ #endif __host__ __device__ ~memory_selector() { -#ifdef __CUDA_ARCH__ - if (threadIdx.x == 0) { -#endif - ptr->~T(); -#ifdef __CUDA_ARCH__ - } - __syncthreads(); -#endif + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + destruct_device(); + ), + NV_IS_HOST, ( + destruct_host(); + ) + ) } }; diff --git a/libcxx/include/support/atomic/atomic_cuda.h b/libcxx/include/support/atomic/atomic_cuda.h index 84316359ba..cef72cb1b2 100644 --- a/libcxx/include/support/atomic/atomic_cuda.h +++ b/libcxx/include/support/atomic/atomic_cuda.h @@ -145,17 +145,21 @@ _LIBCUDACXX_INLINE_VISIBILITY ) } -// Atomic storage layouts: +template +using __cxx_atomic_base_heterogeneous_storage + = typename conditional<_Ref, + host::__cxx_atomic_ref_base_impl<_Tp, _Sco>, + host::__cxx_atomic_base_impl<_Tp, _Sco> >::type; -// Implement _Sco with https://godbolt.org/z/foWdeYjEs -template + +template struct __cxx_atomic_base_heterogeneous_impl { __cxx_atomic_base_heterogeneous_impl() noexcept = default; _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR explicit __cxx_atomic_base_heterogeneous_impl(_Tp __value) : __a_value(__value) { } - host::__cxx_atomic_base_impl<_Tp, _Sco> __a_value; + __cxx_atomic_base_heterogeneous_storage<_Tp, _Sco, _Ref> __a_value; _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR auto __get_device() const volatile _NOEXCEPT -> decltype(__a_value.__get_atom()) { @@ -191,7 +195,7 @@ struct __cxx_atomic_base_small_impl { __cxx_atomic_base_small_impl(_Tp __value) : __a_value(__value) { } - __cxx_atomic_base_heterogeneous_impl __a_value; + __cxx_atomic_base_heterogeneous_impl __a_value; _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR auto __get_atom() const volatile _NOEXCEPT -> decltype(&__a_value) { @@ -219,16 +223,20 @@ using __cxx_atomic_base_impl = typename conditional, __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco> >::type; + template +using __cxx_atomic_base_ref_impl = __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, true>; + +template __host__ __device__ - void __cxx_atomic_init(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco> volatile* __a, _Tp __val) { + void __cxx_atomic_init(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __val) { alignas(_Tp) auto __tmp = __val; __cxx_atomic_assign_volatile(*__a->__get_device(), __tmp); } -template +template __host__ __device__ - void __cxx_atomic_store(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco> volatile* __a, _Tp __val, memory_order __order) { + void __cxx_atomic_store(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __val, memory_order __order) { alignas(_Tp) auto __tmp = __val; NV_DISPATCH_TARGET( NV_IS_DEVICE, ( @@ -240,9 +248,9 @@ __host__ __device__ ) } -template +template __host__ __device__ - _Tp __cxx_atomic_load(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco> const volatile* __a, memory_order __order) { + _Tp __cxx_atomic_load(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> const volatile* __a, memory_order __order) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( return detail::__atomic_load_n_cuda(__a->__get_device(), __order, detail::__scope_tag<_Sco>()); @@ -253,9 +261,9 @@ __host__ __device__ ) } 
-template +template __host__ __device__ - _Tp __cxx_atomic_exchange(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco> volatile* __a, _Tp __val, memory_order __order) { + _Tp __cxx_atomic_exchange(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __val, memory_order __order) { alignas(_Tp) auto __tmp = __val; NV_DISPATCH_TARGET( NV_IS_DEVICE, ( @@ -267,9 +275,9 @@ __host__ __device__ ) } -template +template __host__ __device__ - bool __cxx_atomic_compare_exchange_strong(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco> volatile* __a, _Tp* __expected, _Tp __val, memory_order __success, memory_order __failure) { + bool __cxx_atomic_compare_exchange_strong(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp* __expected, _Tp __val, memory_order __success, memory_order __failure) { alignas(_Tp) auto __tmp = *__expected; bool __result = false; NV_DISPATCH_TARGET( @@ -285,9 +293,9 @@ __host__ __device__ return __result; } -template +template __host__ __device__ - bool __cxx_atomic_compare_exchange_weak(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco> volatile* __a, _Tp* __expected, _Tp __val, memory_order __success, memory_order __failure) { + bool __cxx_atomic_compare_exchange_weak(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp* __expected, _Tp __val, memory_order __success, memory_order __failure) { alignas(_Tp) auto __tmp = *__expected; bool __result = false; NV_DISPATCH_TARGET( @@ -303,9 +311,9 @@ __host__ __device__ return __result; } -template +template __host__ __device__ - _Tp __cxx_atomic_fetch_add(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco> volatile* __a, _Tp __delta, memory_order __order) { + _Tp __cxx_atomic_fetch_add(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __delta, memory_order __order) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( return detail::__atomic_fetch_add_cuda(__a->__get_device(), __delta, __order, detail::__scope_tag<_Sco>()); @@ -316,9 +324,9 @@ __host__ __device__ ) } -template +template __host__ __device__ - _Tp* __cxx_atomic_fetch_add(__cxx_atomic_base_heterogeneous_impl<_Tp*, _Sco> volatile* __a, ptrdiff_t __delta, memory_order __order) { + _Tp* __cxx_atomic_fetch_add(__cxx_atomic_base_heterogeneous_impl<_Tp*, _Sco, _Ref> volatile* __a, ptrdiff_t __delta, memory_order __order) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( return detail::__atomic_fetch_add_cuda(__a->__get_device(), __delta, __order, detail::__scope_tag<_Sco>()); @@ -329,9 +337,9 @@ __host__ __device__ ) } -template +template __host__ __device__ - _Tp __cxx_atomic_fetch_sub(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco> volatile* __a, _Tp __delta, memory_order __order) { + _Tp __cxx_atomic_fetch_sub(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __delta, memory_order __order) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( return detail::__atomic_fetch_sub_cuda(__a->__get_device(), __delta, __order, detail::__scope_tag<_Sco>()); @@ -342,9 +350,9 @@ __host__ __device__ ) } -template +template __host__ __device__ - _Tp* __cxx_atomic_fetch_sub(__cxx_atomic_base_heterogeneous_impl<_Tp*, _Sco> volatile* __a, ptrdiff_t __delta, memory_order __order) { + _Tp* __cxx_atomic_fetch_sub(__cxx_atomic_base_heterogeneous_impl<_Tp*, _Sco, _Ref> volatile* __a, ptrdiff_t __delta, memory_order __order) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( return detail::__atomic_fetch_sub_cuda(__a->__get_device(), __delta, __order, detail::__scope_tag<_Sco>()); @@ -355,9 +363,9 @@ __host__ __device__ ) } -template +template __host__ 
__device__ - _Tp __cxx_atomic_fetch_and(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco> volatile* __a, _Tp __pattern, memory_order __order) { + _Tp __cxx_atomic_fetch_and(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __pattern, memory_order __order) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( return detail::__atomic_fetch_and_cuda(__a->__get_device(), __pattern, __order, detail::__scope_tag<_Sco>()); @@ -368,9 +376,9 @@ __host__ __device__ ) } -template +template __host__ __device__ - _Tp __cxx_atomic_fetch_or(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco> volatile* __a, _Tp __pattern, memory_order __order) { + _Tp __cxx_atomic_fetch_or(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __pattern, memory_order __order) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( return detail::__atomic_fetch_or_cuda(__a->__get_device(), __pattern, __order, detail::__scope_tag<_Sco>()); @@ -381,9 +389,9 @@ __host__ __device__ ) } -template +template __host__ __device__ - _Tp __cxx_atomic_fetch_xor(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco> volatile* __a, _Tp __pattern, memory_order __order) { + _Tp __cxx_atomic_fetch_xor(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __pattern, memory_order __order) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( return detail::__atomic_fetch_xor_cuda(__a->__get_device(), __pattern, __order, detail::__scope_tag<_Sco>()); diff --git a/libcxx/include/support/atomic/atomic_gcc.h b/libcxx/include/support/atomic/atomic_gcc.h index 1e1004820c..50a667c40e 100644 --- a/libcxx/include/support/atomic/atomic_gcc.h +++ b/libcxx/include/support/atomic/atomic_gcc.h @@ -1,4 +1,12 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// // +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// template struct __cxx_atomic_base_impl { diff --git a/libcxx/include/support/atomic/atomic_msvc.h b/libcxx/include/support/atomic/atomic_msvc.h index dc7d691446..cb0c627268 100644 --- a/libcxx/include/support/atomic/atomic_msvc.h +++ b/libcxx/include/support/atomic/atomic_msvc.h @@ -40,9 +40,8 @@ static inline void __atomic_thread_fence(int __memorder) { } namespace detail { - template - using _enable_if_sized_as = typename _CUDA_VSTD::enable_if::type; -} +template +using _enable_if_sized_as = typename _CUDA_VSTD::enable_if::type; template = 0> void __atomic_load_relaxed(const volatile _Type *__ptr, _Type *__ret) { @@ -435,4 +434,187 @@ _Type __host__ __atomic_fetch_min(_Type volatile *__ptr, _Delta __val, int __mem return __expected; } +} // namespace detail + +template +struct __cxx_atomic_base_impl { + using __cxx_underlying_type = _Tp; + + _LIBCUDACXX_CONSTEXPR + __cxx_atomic_base_impl() _NOEXCEPT = default; + + _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR explicit + __cxx_atomic_base_impl(_Tp value) _NOEXCEPT : __a_value(value) {} + + _ALIGNAS(sizeof(_Tp)) _Tp __a_value; + + _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR + const volatile _Tp* __get_atom() const volatile _NOEXCEPT {return &__a_value;} + + _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR + const _Tp* __get_atom() const _NOEXCEPT {return &__a_value;} + + _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR + volatile _Tp* __get_atom() volatile _NOEXCEPT {return &__a_value;} +}; + +template +struct __cxx_atomic_ref_base_impl { + using __cxx_underlying_type = _Tp; + + _LIBCUDACXX_CONSTEXPR + __cxx_atomic_ref_base_impl() _NOEXCEPT = default; + + _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR explicit + __cxx_atomic_ref_base_impl(_Tp value) _NOEXCEPT : __a_value(value) {} + + _Tp* __a_value; + + _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR + const volatile _Tp* __get_atom() const volatile _NOEXCEPT {return __a_value;} + + _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR + const _Tp* __get_atom() const _NOEXCEPT {return __a_value;} + + _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR + volatile _Tp* __get_atom() volatile _NOEXCEPT {return __a_value;} +}; + +template +_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR auto __cxx_atomic_base_unwrap(_Tp* __a) _NOEXCEPT -> decltype(__a->__get_atom()) { + return __a->__get_atom(); +} + +template +using __cxx_atomic_underlying_t = typename _Tp::__cxx_underlying_type; + +template +inline void __cxx_atomic_init(volatile _Tp* __a, _Up __val) { + auto __a_tmp = __cxx_atomic_base_unwrap(__a); + __cxx_atomic_assign_volatile(*__a_tmp, __val); +} + +template +inline void __cxx_atomic_init(_Tp* __a, _Up __val) { + auto __a_tmp = __cxx_atomic_base_unwrap(__a); + __a = __val; +} + +inline +void __cxx_atomic_thread_fence(memory_order __order) { + detail::__atomic_thread_fence(__to_gcc_order(__order)); +} + +inline +void __cxx_atomic_signal_fence(memory_order __order) { + detail::__atomic_signal_fence(__to_gcc_order(__order)); +} + +template +inline void __cxx_atomic_store(_Tp* __a, _Up __val, + memory_order __order) { + auto __a_tmp = __cxx_atomic_base_unwrap(__a); + detail::__atomic_store(__a_tmp, &__val, __to_gcc_order(__order)); +} + +template +inline auto __cxx_atomic_load(const _Tp* __a, + memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { + auto __a_tmp = __cxx_atomic_base_unwrap(__a); + 
__cxx_atomic_underlying_t<_Tp> __ret; + detail::__atomic_load(__a_tmp, &__ret, __to_gcc_order(__order)); + return __ret; +} + +template +inline auto __cxx_atomic_exchange(_Tp* __a, _Up __value, + memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { + auto __a_tmp = __cxx_atomic_base_unwrap(__a); + __cxx_atomic_underlying_t<_Tp> __ret; + detail::__atomic_exchange(__a_tmp, &__value, &__ret, __to_gcc_order(__order)); + return __ret; +} + +template +inline bool __cxx_atomic_compare_exchange_strong( + _Tp* __a, _Up* __expected, _Up __value, memory_order __success, + memory_order __failure) { + auto __a_tmp = __cxx_atomic_base_unwrap(__a); + return detail::__atomic_compare_exchange(__a_tmp, __expected, &__value, + false, + __to_gcc_order(__success), + __to_gcc_failure_order(__failure)); +} + +template +inline bool __cxx_atomic_compare_exchange_weak( + _Tp* __a, _Up* __expected, _Up __value, memory_order __success, + memory_order __failure) { + auto __a_tmp = __cxx_atomic_base_unwrap(__a); + return detail::__atomic_compare_exchange(__a_tmp, __expected, &__value, + true, + __to_gcc_order(__success), + __to_gcc_failure_order(__failure)); +} + +template +struct __skip_amt { enum {value = 1}; }; + +template +struct __skip_amt<_Tp*> { enum {value = sizeof(_Tp)}; }; + +// FIXME: Haven't figured out what the spec says about using arrays with +// atomic_fetch_add. Force a failure rather than creating bad behavior. +template +struct __skip_amt<_Tp[]> { }; +template +struct __skip_amt<_Tp[n]> { }; + +template +inline auto __cxx_atomic_fetch_add(_Tp* __a, _Td __delta, + memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { + constexpr auto __skip_v = __skip_amt<__cxx_atomic_underlying_t<_Tp>>::value; + auto __a_tmp = __cxx_atomic_base_unwrap(__a); + return detail::__atomic_fetch_add(__a_tmp, __delta * __skip_v, + __to_gcc_order(__order)); +} + +template +inline auto __cxx_atomic_fetch_sub(_Tp* __a, _Td __delta, + memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { + constexpr auto __skip_v = __skip_amt<__cxx_atomic_underlying_t<_Tp>>::value; + auto __a_tmp = __cxx_atomic_base_unwrap(__a); + return detail::__atomic_fetch_sub(__a_tmp, __delta * __skip_v, + __to_gcc_order(__order)); +} + +template +inline auto __cxx_atomic_fetch_and(_Tp* __a, _Td __pattern, + memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { + auto __a_tmp = __cxx_atomic_base_unwrap(__a); + return detail::__atomic_fetch_and(__a_tmp, __pattern, + __to_gcc_order(__order)); +} + +template +inline auto __cxx_atomic_fetch_or(_Tp* __a, _Td __pattern, + memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { + auto __a_tmp = __cxx_atomic_base_unwrap(__a); + return detail::__atomic_fetch_or(__a_tmp, __pattern, + __to_gcc_order(__order)); +} + +template +inline auto __cxx_atomic_fetch_xor(_Tp* __a, _Td __pattern, + memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { + auto __a_tmp = __cxx_atomic_base_unwrap(__a); + return detail::__atomic_fetch_xor(__a_tmp, __pattern, + __to_gcc_order(__order)); +} + +inline constexpr + bool __cxx_atomic_is_lock_free(size_t __x) { + return detail::__atomic_is_lock_free(__x, 0); +} + _LIBCUDACXX_END_NAMESPACE_CUDA From 27a34b52ffb6c6eea37e358af9599d89b8e58727 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Thu, 13 May 2021 18:01:32 -0700 Subject: [PATCH 08/34] Fix atomic_mscv header, fix macro processing in __threading_support on MSVC, fix a test that used preprocessor directives within NV_TARGET --- .../std/atomics/atomics.flag/default.pass.cpp | 36 +++-- 
libcxx/include/__threading_support | 13 +- libcxx/include/support/atomic/atomic_cuda.h | 5 +- libcxx/include/support/atomic/atomic_msvc.h | 135 +++++++++++------- 4 files changed, 113 insertions(+), 76 deletions(-) diff --git a/.upstream-tests/test/std/atomics/atomics.flag/default.pass.cpp b/.upstream-tests/test/std/atomics/atomics.flag/default.pass.cpp index 343dbe46c2..f2aa50ebc7 100644 --- a/.upstream-tests/test/std/atomics/atomics.flag/default.pass.cpp +++ b/.upstream-tests/test/std/atomics/atomics.flag/default.pass.cpp @@ -25,26 +25,40 @@ #endif #include "cuda_space_selector.h" + template class Selector> __host__ __device__ void test() { +// cudafe crashes on trying to interpret the line below when compiling with Clang +// TODO: file a compiler bug +#if !(defined(__clang__) && defined(__CUDACC__)) +# define _TEST_NO_DESTRUCT_ZERO +#else +# define _TEST_NO_DESTRUCT_ZERO zero.~A() +#endif + Selector sel; cuda::std::atomic_flag & f = *sel.construct(); f.clear(); assert(f.test_and_set() == 0); { -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - typedef cuda::std::atomic_flag A; - TEST_ALIGNAS_TYPE(A) char storage[sizeof(A)] = {1}; - A& zero = *new (storage) A(); - assert(!zero.test_and_set()); - // cudafe crashes on trying to interpret the line below when compiling with Clang - // TODO: file a compiler bug -#if !(defined(__clang__) && defined(__CUDACC__)) - zero.~A(); -#endif -#endif + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + typedef cuda::std::atomic_flag A; + TEST_ALIGNAS_TYPE(A) char storage[sizeof(A)] = {1}; + A& zero = *new (storage) A(); + assert(!zero.test_and_set()); + _TEST_NO_DESTRUCT_ZERO; + ), + NV_IS_HOST, ( + typedef cuda::std::atomic_flag A; + TEST_ALIGNAS_TYPE(A) char storage[sizeof(A)] = {1}; + A& zero = *new (storage) A(); + assert(!zero.test_and_set()); + _TEST_NO_DESTRUCT_ZERO; + ) + ) } } diff --git a/libcxx/include/__threading_support b/libcxx/include/__threading_support index a63e1596cf..ae03791f40 100644 --- a/libcxx/include/__threading_support +++ b/libcxx/include/__threading_support @@ -75,17 +75,18 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD _LIBCUDACXX_INLINE_VISIBILITY inline void __libcpp_thread_yield_processor() { - NV_DISPATCH_TARGET( - NV_IS_HOST, #if defined(__aarch64__) - (asm volatile ("yield" :::);) +# define __LIBCUDACXX_ASM_THREAD_YIELD (asm volatile ("yield" :::);) #elif defined(__x86_64__) - (asm volatile ("pause" :::);) +# define __LIBCUDACXX_ASM_THREAD_YIELD (asm volatile ("pause" :::);) #elif defined (__powerpc__) - (asm volatile ("or 27,27,27":::);) +# define __LIBCUDACXX_ASM_THREAD_YIELD (asm volatile ("or 27,27,27":::);) #else - (;) +# define __LIBCUDACXX_ASM_THREAD_YIELD (;) #endif + NV_DISPATCH_TARGET( + NV_IS_HOST, + __LIBCUDACXX_ASM_THREAD_YIELD ) } diff --git a/libcxx/include/support/atomic/atomic_cuda.h b/libcxx/include/support/atomic/atomic_cuda.h index cef72cb1b2..4fdca7db2c 100644 --- a/libcxx/include/support/atomic/atomic_cuda.h +++ b/libcxx/include/support/atomic/atomic_cuda.h @@ -44,9 +44,6 @@ #define __ATOMIC_THREAD 10 #endif //__ATOMIC_BLOCK -// TODO: -// How to get this into cuda::??? - inline __host__ __device__ int __stronger_order_cuda(int __a, int __b) { int const __max = __a > __b ? 
__a : __b; if(__max != __ATOMIC_RELEASE) @@ -225,7 +222,7 @@ using __cxx_atomic_base_impl = typename conditional -using __cxx_atomic_base_ref_impl = __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, true>; +using __cxx_atomic_ref_base_impl = __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, true>; template __host__ __device__ diff --git a/libcxx/include/support/atomic/atomic_msvc.h b/libcxx/include/support/atomic/atomic_msvc.h index cb0c627268..4f849726fe 100644 --- a/libcxx/include/support/atomic/atomic_msvc.h +++ b/libcxx/include/support/atomic/atomic_msvc.h @@ -27,7 +27,37 @@ #error Unsupported hardware #endif // hardware -_LIBCUDACXX_BEGIN_NAMESPACE_CUDA +_LIBCUDACXX_INLINE_VISIBILITY inline _LIBCUDACXX_CONSTEXPR int __to_gcc_order(memory_order __order) { + // Avoid switch statement to make this a constexpr. + return __order == memory_order_relaxed ? __ATOMIC_RELAXED: + (__order == memory_order_acquire ? __ATOMIC_ACQUIRE: + (__order == memory_order_release ? __ATOMIC_RELEASE: + (__order == memory_order_seq_cst ? __ATOMIC_SEQ_CST: + (__order == memory_order_acq_rel ? __ATOMIC_ACQ_REL: + __ATOMIC_CONSUME)))); +} + +_LIBCUDACXX_INLINE_VISIBILITY inline _LIBCUDACXX_CONSTEXPR int __to_gcc_failure_order(memory_order __order) { + // Avoid switch statement to make this a constexpr. + return __order == memory_order_relaxed ? __ATOMIC_RELAXED: + (__order == memory_order_acquire ? __ATOMIC_ACQUIRE: + (__order == memory_order_release ? __ATOMIC_RELAXED: + (__order == memory_order_seq_cst ? __ATOMIC_SEQ_CST: + (__order == memory_order_acq_rel ? __ATOMIC_ACQUIRE: + __ATOMIC_CONSUME)))); +} + +inline int __stronger_order_msvc(int __a, int __b) { + int const __max = __a > __b ? __a : __b; + if(__max != __ATOMIC_RELEASE) + return __max; + static int const __xform[] = { + __ATOMIC_RELEASE, + __ATOMIC_ACQ_REL, + __ATOMIC_ACQ_REL, + __ATOMIC_RELEASE }; + return __xform[__a < __b ? 
__a : __b]; +} static inline void __atomic_signal_fence(int __memorder) { if (__memorder != __ATOMIC_RELAXED) @@ -39,11 +69,10 @@ static inline void __atomic_thread_fence(int __memorder) { _Memory_barrier(); } -namespace detail { template -using _enable_if_sized_as = typename _CUDA_VSTD::enable_if::type; +using _enable_if_sized_as = typename enable_if::type; -template = 0> +template = 0> void __atomic_load_relaxed(const volatile _Type *__ptr, _Type *__ret) { #ifdef _LIBCUDACXX_MSVC_HAS_NO_ISO_INTRIN __int8 __tmp = *(const volatile __int8 *)__ptr; @@ -52,7 +81,7 @@ void __atomic_load_relaxed(const volatile _Type *__ptr, _Type *__ret) { #endif *__ret = reinterpret_cast<_Type&>(__tmp); } -template = 0> +template = 0> void __atomic_load_relaxed(const volatile _Type *__ptr, _Type *__ret) { #ifdef _LIBCUDACXX_MSVC_HAS_NO_ISO_INTRIN __int16 __tmp = *(const volatile __int16 *)__ptr; @@ -61,7 +90,7 @@ void __atomic_load_relaxed(const volatile _Type *__ptr, _Type *__ret) { #endif *__ret = reinterpret_cast<_Type&>(__tmp); } -template = 0> +template = 0> void __atomic_load_relaxed(const volatile _Type *__ptr, _Type *__ret) { #ifdef _LIBCUDACXX_MSVC_HAS_NO_ISO_INTRIN __int32 __tmp = *(const volatile __int32 *)__ptr; @@ -70,7 +99,7 @@ void __atomic_load_relaxed(const volatile _Type *__ptr, _Type *__ret) { #endif *__ret = reinterpret_cast<_Type&>(__tmp); } -template = 0> +template = 0> void __atomic_load_relaxed(const volatile _Type *__ptr, _Type *__ret) { #ifdef _LIBCUDACXX_MSVC_HAS_NO_ISO_INTRIN __int64 __tmp = *(const volatile __int64 *)__ptr; @@ -91,7 +120,7 @@ void __atomic_load(const volatile _Type *__ptr, _Type *__ret, int __memorder) { } } -template = 0> +template = 0> void __atomic_store_relaxed(volatile _Type *__ptr, _Type *__val) { auto __t = reinterpret_cast<__int8 *>(__val); auto __d = reinterpret_cast(__ptr); @@ -101,7 +130,7 @@ void __atomic_store_relaxed(volatile _Type *__ptr, _Type *__val) { __iso_volatile_store8(__d, *__t); #endif } -template = 0> +template = 0> void __atomic_store_relaxed(volatile _Type *__ptr, _Type *__val) { auto __t = reinterpret_cast<__int16 *>(__val); auto __d = reinterpret_cast(__ptr); @@ -111,7 +140,7 @@ void __atomic_store_relaxed(volatile _Type *__ptr, _Type *__val) { __iso_volatile_store16(__d, *__t); #endif } -template = 0> +template = 0> void __atomic_store_relaxed(volatile _Type *__ptr, _Type *__val) { auto __t = reinterpret_cast<__int32 *>(__val); auto __d = reinterpret_cast(__ptr); @@ -122,7 +151,7 @@ void __atomic_store_relaxed(volatile _Type *__ptr, _Type *__val) { __iso_volatile_store32(__d, *__t); #endif } -template = 0> +template = 0> void __atomic_store_relaxed(volatile _Type *__ptr, _Type *__val) { auto __t = reinterpret_cast<__int64 *>(__val); auto __d = reinterpret_cast(__ptr); @@ -143,7 +172,7 @@ void __atomic_store(volatile _Type *__ptr, _Type *__val, int __memorder) { } } -template = 0> +template = 0> bool __atomic_compare_exchange_relaxed(const volatile _Type *__ptr, _Type *__expected, const _Type *__desired) { auto __tmp_desired = reinterpret_cast(*__desired); auto __tmp_expected = reinterpret_cast(*__expected); @@ -153,7 +182,7 @@ bool __atomic_compare_exchange_relaxed(const volatile _Type *__ptr, _Type *__exp *__expected = reinterpret_cast(__old); return false; } -template = 0> +template = 0> bool __atomic_compare_exchange_relaxed(const volatile _Type *__ptr, _Type *__expected, const _Type *__desired) { auto __tmp_desired = reinterpret_cast(*__desired); auto __tmp_expected = reinterpret_cast(*__expected); @@ -163,7 +192,7 @@ bool 
__atomic_compare_exchange_relaxed(const volatile _Type *__ptr, _Type *__exp *__expected = reinterpret_cast(__old); return false; } -template = 0> +template = 0> bool __atomic_compare_exchange_relaxed(const volatile _Type *__ptr, _Type *__expected, const _Type *__desired) { auto __tmp_desired = reinterpret_cast(*__desired); auto __tmp_expected = reinterpret_cast(*__expected); @@ -173,7 +202,7 @@ bool __atomic_compare_exchange_relaxed(const volatile _Type *__ptr, _Type *__exp *__expected = reinterpret_cast(__old); return false; } -template = 0> +template = 0> bool __atomic_compare_exchange_relaxed(const volatile _Type *__ptr, _Type *__expected, const _Type *__desired) { auto __tmp_desired = reinterpret_cast(*__desired); auto __tmp_expected = reinterpret_cast<__int64&>(*__expected); @@ -186,7 +215,7 @@ bool __atomic_compare_exchange_relaxed(const volatile _Type *__ptr, _Type *__exp template bool __atomic_compare_exchange(_Type volatile *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder) { bool success = false; - switch (detail::__stronger_order_cuda(__success_memorder, __failure_memorder)) { + switch (__stronger_order_msvc(__success_memorder, __failure_memorder)) { case __ATOMIC_RELEASE: _Compiler_or_memory_barrier(); success = __atomic_compare_exchange_relaxed(__ptr, __expected, __desired); break; case __ATOMIC_ACQ_REL: _Compiler_or_memory_barrier(); _LIBCUDACXX_FALLTHROUGH(); case __ATOMIC_CONSUME: @@ -198,22 +227,22 @@ bool __atomic_compare_exchange(_Type volatile *__ptr, _Type *__expected, const _ return success; } -template = 0> +template = 0> void __atomic_exchange_relaxed(const volatile _Type *__ptr, const _Type *__val, _Type *__ret) { auto const __old = _InterlockedExchange8((volatile char *)__ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_exchange_relaxed(const volatile _Type *__ptr, const _Type *__val, _Type *__ret) { auto const __old = _InterlockedExchange16((volatile short *)__ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_exchange_relaxed(const volatile _Type *__ptr, const _Type *__val, _Type *__ret) { auto const __old = _InterlockedExchange((volatile long *)__ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_exchange_relaxed(const volatile _Type *__ptr, const _Type *__val, _Type *__ret) { auto const __old = _InterlockedExchange64((volatile __int64 *)__ptr, reinterpret_cast<__int64 const&>(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); @@ -231,22 +260,22 @@ void __atomic_exchange(_Type volatile *__ptr, const _Type *__val, _Type *__ret, } } -template = 0> +template = 0> void __atomic_fetch_add_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { auto const __old = _InterlockedExchangeAdd8((volatile char *)__ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_add_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { auto const __old = _InterlockedExchangeAdd16((volatile short *)__ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_add_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { auto const __old = _InterlockedExchangeAdd((volatile long *)__ptr, 
reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_add_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { auto const __old = _InterlockedExchangeAdd64((volatile __int64 *)__ptr, reinterpret_cast<__int64 const&>(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); @@ -273,22 +302,22 @@ _Type __atomic_fetch_sub(_Type volatile *__ptr, _Delta __val, int __memorder) { } -template = 0> +template = 0> void __atomic_fetch_and_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { auto const __old = _InterlockedAnd8((volatile char *)__ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_and_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { auto const __old = _InterlockedAnd16((volatile short *)__ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_and_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { auto const __old = _InterlockedAnd((volatile long *)__ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_and_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { auto const __old = _InterlockedAnd64((volatile __int64 *)__ptr, reinterpret_cast<__int64 const&>(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); @@ -310,22 +339,22 @@ _Type __atomic_fetch_and(_Type volatile *__ptr, _Delta __val, int __memorder) { return *__dest; } -template = 0> +template = 0> void __atomic_fetch_xor_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { auto const __old = _InterlockedXor8((volatile char *)__ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_xor_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { auto const __old = _InterlockedXor16((volatile short *)__ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_xor_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { auto const __old = _InterlockedXor((volatile long *)__ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_xor_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { auto const __old = _InterlockedXor64((volatile __int64 *)__ptr, reinterpret_cast<__int64 const&>(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); @@ -347,22 +376,22 @@ _Type __atomic_fetch_xor(_Type volatile *__ptr, _Delta __val, int __memorder) { return *__dest; } -template = 0> +template = 0> void __atomic_fetch_or_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { auto const __old = _InterlockedOr8((volatile char *)__ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_or_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { auto const __old = _InterlockedOr16((volatile short *)__ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_or_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { auto const __old = 
_InterlockedOr((volatile long *)__ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_or_relaxed(const volatile _Type *__ptr, const _Delta *__val, _Type *__ret) { auto const __old = _InterlockedOr64((volatile __int64 *)__ptr, reinterpret_cast<__int64 const&>(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); @@ -413,7 +442,7 @@ _Type __atomic_exchange_n(_Type volatile *__ptr, _Type __val, int __memorder) { } template -_Type __host__ __atomic_fetch_max(_Type volatile *__ptr, _Delta __val, int __memorder) { +_Type __atomic_fetch_max(_Type volatile *__ptr, _Delta __val, int __memorder) { _Type __expected = __atomic_load_n(__ptr, __ATOMIC_RELAXED); _Type __desired = __expected < __val ? __expected : __val; while(__desired == __val && @@ -424,7 +453,7 @@ _Type __host__ __atomic_fetch_max(_Type volatile *__ptr, _Delta __val, int __mem } template -_Type __host__ __atomic_fetch_min(_Type volatile *__ptr, _Delta __val, int __memorder) { +_Type __atomic_fetch_min(_Type volatile *__ptr, _Delta __val, int __memorder) { _Type __expected = __atomic_load_n(__ptr, __ATOMIC_RELAXED); _Type __desired = __expected < __val ? __expected : __val; while(__desired != __val && @@ -434,8 +463,6 @@ _Type __host__ __atomic_fetch_min(_Type volatile *__ptr, _Delta __val, int __mem return __expected; } -} // namespace detail - template struct __cxx_atomic_base_impl { using __cxx_underlying_type = _Tp; @@ -502,19 +529,19 @@ inline void __cxx_atomic_init(_Tp* __a, _Up __val) { inline void __cxx_atomic_thread_fence(memory_order __order) { - detail::__atomic_thread_fence(__to_gcc_order(__order)); + __atomic_thread_fence(__to_gcc_order(__order)); } inline void __cxx_atomic_signal_fence(memory_order __order) { - detail::__atomic_signal_fence(__to_gcc_order(__order)); + __atomic_signal_fence(__to_gcc_order(__order)); } template inline void __cxx_atomic_store(_Tp* __a, _Up __val, memory_order __order) { auto __a_tmp = __cxx_atomic_base_unwrap(__a); - detail::__atomic_store(__a_tmp, &__val, __to_gcc_order(__order)); + __atomic_store(__a_tmp, &__val, __to_gcc_order(__order)); } template @@ -522,7 +549,7 @@ inline auto __cxx_atomic_load(const _Tp* __a, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { auto __a_tmp = __cxx_atomic_base_unwrap(__a); __cxx_atomic_underlying_t<_Tp> __ret; - detail::__atomic_load(__a_tmp, &__ret, __to_gcc_order(__order)); + __atomic_load(__a_tmp, &__ret, __to_gcc_order(__order)); return __ret; } @@ -531,7 +558,7 @@ inline auto __cxx_atomic_exchange(_Tp* __a, _Up __value, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { auto __a_tmp = __cxx_atomic_base_unwrap(__a); __cxx_atomic_underlying_t<_Tp> __ret; - detail::__atomic_exchange(__a_tmp, &__value, &__ret, __to_gcc_order(__order)); + __atomic_exchange(__a_tmp, &__value, &__ret, __to_gcc_order(__order)); return __ret; } @@ -540,7 +567,7 @@ inline bool __cxx_atomic_compare_exchange_strong( _Tp* __a, _Up* __expected, _Up __value, memory_order __success, memory_order __failure) { auto __a_tmp = __cxx_atomic_base_unwrap(__a); - return detail::__atomic_compare_exchange(__a_tmp, __expected, &__value, + return __atomic_compare_exchange(__a_tmp, __expected, &__value, false, __to_gcc_order(__success), __to_gcc_failure_order(__failure)); @@ -551,7 +578,7 @@ inline bool __cxx_atomic_compare_exchange_weak( _Tp* __a, _Up* __expected, _Up __value, memory_order __success, memory_order __failure) { auto __a_tmp = __cxx_atomic_base_unwrap(__a); - return 
detail::__atomic_compare_exchange(__a_tmp, __expected, &__value, + return __atomic_compare_exchange(__a_tmp, __expected, &__value, true, __to_gcc_order(__success), __to_gcc_failure_order(__failure)); @@ -575,7 +602,7 @@ inline auto __cxx_atomic_fetch_add(_Tp* __a, _Td __delta, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { constexpr auto __skip_v = __skip_amt<__cxx_atomic_underlying_t<_Tp>>::value; auto __a_tmp = __cxx_atomic_base_unwrap(__a); - return detail::__atomic_fetch_add(__a_tmp, __delta * __skip_v, + return __atomic_fetch_add(__a_tmp, __delta * __skip_v, __to_gcc_order(__order)); } @@ -584,7 +611,7 @@ inline auto __cxx_atomic_fetch_sub(_Tp* __a, _Td __delta, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { constexpr auto __skip_v = __skip_amt<__cxx_atomic_underlying_t<_Tp>>::value; auto __a_tmp = __cxx_atomic_base_unwrap(__a); - return detail::__atomic_fetch_sub(__a_tmp, __delta * __skip_v, + return __atomic_fetch_sub(__a_tmp, __delta * __skip_v, __to_gcc_order(__order)); } @@ -592,7 +619,7 @@ template inline auto __cxx_atomic_fetch_and(_Tp* __a, _Td __pattern, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { auto __a_tmp = __cxx_atomic_base_unwrap(__a); - return detail::__atomic_fetch_and(__a_tmp, __pattern, + return __atomic_fetch_and(__a_tmp, __pattern, __to_gcc_order(__order)); } @@ -600,7 +627,7 @@ template inline auto __cxx_atomic_fetch_or(_Tp* __a, _Td __pattern, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { auto __a_tmp = __cxx_atomic_base_unwrap(__a); - return detail::__atomic_fetch_or(__a_tmp, __pattern, + return __atomic_fetch_or(__a_tmp, __pattern, __to_gcc_order(__order)); } @@ -608,13 +635,11 @@ template inline auto __cxx_atomic_fetch_xor(_Tp* __a, _Td __pattern, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { auto __a_tmp = __cxx_atomic_base_unwrap(__a); - return detail::__atomic_fetch_xor(__a_tmp, __pattern, + return __atomic_fetch_xor(__a_tmp, __pattern, __to_gcc_order(__order)); } inline constexpr bool __cxx_atomic_is_lock_free(size_t __x) { - return detail::__atomic_is_lock_free(__x, 0); + return __x <= sizeof(uint64_t); } - -_LIBCUDACXX_END_NAMESPACE_CUDA From 205a2dd656d2c0838089cef6ee95eb930d207c94 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Wed, 7 Jul 2021 14:56:42 -0700 Subject: [PATCH 09/34] Fix some local errors and warnings, put back the SHARED macro, fix pipeline tests --- ...peline_memcpy_async_thread_scope_generic.h | 3 +- .../std/utilities/time/time.cal/euclidian.h | 39 +++++++++++++++---- .../test/support/cuda_space_selector.h | 6 +++ .upstream-tests/test/support/test_macros.h | 14 ------- 4 files changed, 39 insertions(+), 23 deletions(-) diff --git a/.upstream-tests/test/cuda/pipeline_memcpy_async_thread_scope_generic.h b/.upstream-tests/test/cuda/pipeline_memcpy_async_thread_scope_generic.h index a6f79f8925..32cb82300c 100644 --- a/.upstream-tests/test/cuda/pipeline_memcpy_async_thread_scope_generic.h +++ b/.upstream-tests/test/cuda/pipeline_memcpy_async_thread_scope_generic.h @@ -16,6 +16,7 @@ #include "large_type.h" template < + class Group, cuda::thread_scope Scope, class T, template class SourceSelector, @@ -24,7 +25,7 @@ template < uint8_t PipelineStages > __host__ __device__ __noinline__ -void test_fully_specialized() +void test_fully_specialized(Group &group) { SourceSelector source_sel; typename DestSelector diff --git a/.upstream-tests/test/std/utilities/time/time.cal/euclidian.h b/.upstream-tests/test/std/utilities/time/time.cal/euclidian.h index eb8019fdf6..3592ff80b4 100644 --- 
a/.upstream-tests/test/std/utilities/time/time.cal/euclidian.h +++ b/.upstream-tests/test/std/utilities/time/time.cal/euclidian.h @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include +#include // Assumption: minValue < maxValue @@ -24,6 +24,33 @@ T euclidian_addition(T rhs, T lhs) return ret; } +template ::value, T zero = 0> +struct signed_euclidean_subtraction { + static constexpr T modulus = maxValue - minValue + 1; + __host__ __device__ T operator()(T lhs, T rhs) { + T ret = lhs - rhs; + if (ret < minValue) { + ret += modulus; + } + if (ret > maxValue) { + ret += modulus; + } + return ret; + } +}; + +template +struct signed_euclidean_subtraction { + static constexpr T modulus = maxValue + 1; + __host__ __device__ T operator()(T lhs, T rhs) { + T ret = lhs - rhs; + if (ret > maxValue) { + ret += modulus; + } + return ret; + } +}; + // Assumption: minValue < maxValue // Assumption: minValue <= rhs <= maxValue // Assumption: minValue <= lhs <= maxValue @@ -32,11 +59,7 @@ template __host__ __device__ T euclidian_subtraction(T lhs, T rhs) { - const T modulus = maxValue - minValue + 1; - T ret = lhs - rhs; - if (std::is_signed::value and (ret < minValue)) // avoids warning about comparison with zero if T is unsigned - ret += modulus; - if (ret > maxValue) // this can happen if T is unsigned - ret += modulus; - return ret; + signed_euclidean_subtraction op; + + return op(lhs, rhs); } diff --git a/.upstream-tests/test/support/cuda_space_selector.h b/.upstream-tests/test/support/cuda_space_selector.h index 9584d02bd4..83ddaf1b3a 100644 --- a/.upstream-tests/test/support/cuda_space_selector.h +++ b/.upstream-tests/test/support/cuda_space_selector.h @@ -21,6 +21,12 @@ #define LAMBDA [=] __host__ __device__ #endif +#ifdef __CUDA_ARCH__ +#define SHARED __shared__ +#else +#define SHARED +#endif + #pragma diag_suppress 941 #pragma diag_suppress 1057 diff --git a/.upstream-tests/test/support/test_macros.h b/.upstream-tests/test/support/test_macros.h index 1ecca66677..7783c96f90 100644 --- a/.upstream-tests/test/support/test_macros.h +++ b/.upstream-tests/test/support/test_macros.h @@ -389,20 +389,6 @@ __host__ __device__ _Tp& maybe_shared_mem() { ) \ } -// NVCC can't handle static member variables, so with a little care -// a function returning a reference will result in the same thing -#ifdef __CUDA_ARCH__ -# define _STATIC_MEMBER_IMPL(type) __shared__ type v; -#else -# define _STATIC_MEMBER_IMPL(type) static type v; -#endif - -#define STATIC_MEMBER_VAR(name, type) \ - __host__ __device__ static type& name() { \ - _STATIC_MEMBER_IMPL(type); \ - return v; \ - } - #if defined(__GNUC__) #pragma GCC diagnostic pop #endif From e0896e8517673f2ca823e136c3059c0da28a9b10 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Thu, 8 Jul 2021 15:51:56 -0700 Subject: [PATCH 10/34] Add symlink to nv/target within libcxx --- libcxx/include/nv | 1 + 1 file changed, 1 insertion(+) create mode 120000 libcxx/include/nv diff --git a/libcxx/include/nv b/libcxx/include/nv new file mode 120000 index 0000000000..f10dec61cb --- /dev/null +++ b/libcxx/include/nv @@ -0,0 +1 @@ +../../include/nv \ No newline at end of file From 815269dc326d4501c894fb4dd26228380812e052 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Thu, 8 Jul 2021 18:03:56 -0700 Subject: [PATCH 11/34] Fix CV handling of atomics on the libcxx/std layer --- libcxx/include/support/atomic/atomic_gcc.h | 62 ++++++++++++++++------ 1 file changed, 46 insertions(+), 16 deletions(-) diff --git 
a/libcxx/include/support/atomic/atomic_gcc.h b/libcxx/include/support/atomic/atomic_gcc.h index 50a667c40e..e1f4d64e0f 100644 --- a/libcxx/include/support/atomic/atomic_gcc.h +++ b/libcxx/include/support/atomic/atomic_gcc.h @@ -19,16 +19,31 @@ struct __cxx_atomic_base_impl { __cxx_atomic_base_impl(_Tp value) _NOEXCEPT : __a_value(value) {} _ALIGNAS(sizeof(_Tp)) _Tp __a_value; +}; - _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR - const volatile _Tp* __get_atom() const volatile _NOEXCEPT {return &__a_value;} +template +_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR +_Tp* __cxx_get_underlying_atomic(__cxx_atomic_base_impl<_Tp, _Sco> * __a) _NOEXCEPT { + return &__a->__a_value; +} - _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR - const _Tp* __get_atom() const _NOEXCEPT {return &__a_value;} +template +_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR +volatile _Tp* __cxx_get_underlying_atomic(__cxx_atomic_base_impl<_Tp, _Sco> volatile* __a) _NOEXCEPT { + return &__a->__a_value; +} - _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR - volatile _Tp* __get_atom() volatile _NOEXCEPT {return &__a_value;} -}; +template +_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR +const _Tp* __cxx_get_underlying_atomic(__cxx_atomic_base_impl<_Tp, _Sco> const* __a) _NOEXCEPT { + return &__a->__a_value; +} + +template +_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR +const volatile _Tp* __cxx_get_underlying_atomic(__cxx_atomic_base_impl<_Tp, _Sco> const volatile* __a) _NOEXCEPT { + return &__a->__a_value; +} template struct __cxx_atomic_ref_base_impl { @@ -41,20 +56,35 @@ struct __cxx_atomic_ref_base_impl { __cxx_atomic_ref_base_impl(_Tp value) _NOEXCEPT : __a_value(value) {} _Tp* __a_value; +}; - _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR - const volatile _Tp* __get_atom() const volatile _NOEXCEPT {return __a_value;} +template +_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR +_Tp* __cxx_get_underlying_atomic(__cxx_atomic_ref_base_impl<_Tp, _Sco> * __a) _NOEXCEPT { + return __a->__a_value; +} - _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR - const _Tp* __get_atom() const _NOEXCEPT {return __a_value;} +template +_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR +volatile _Tp* __cxx_get_underlying_atomic(__cxx_atomic_ref_base_impl<_Tp, _Sco> volatile* __a) _NOEXCEPT { + return __a->__a_value; +} - _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR - volatile _Tp* __get_atom() volatile _NOEXCEPT {return __a_value;} -}; +template +_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR +const _Tp* __cxx_get_underlying_atomic(__cxx_atomic_ref_base_impl<_Tp, _Sco> const* __a) _NOEXCEPT { + return __a->__a_value; +} + +template +_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR +const volatile _Tp* __cxx_get_underlying_atomic(__cxx_atomic_ref_base_impl<_Tp, _Sco> const volatile* __a) _NOEXCEPT { + return __a->__a_value; +} template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR auto __cxx_atomic_base_unwrap(_Tp* __a) _NOEXCEPT -> decltype(__a->__get_atom()) { - return __a->__get_atom(); +_LIBCUDACXX_INLINE_VISIBILITY auto __cxx_atomic_base_unwrap(_Tp* __a) _NOEXCEPT -> decltype(__cxx_get_underlying_atomic(__a)) { + return __cxx_get_underlying_atomic(__a); } template From eaf550ab08cc7aa0f5b047ff9f8cd3b89c9ba63e Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Fri, 9 Jul 2021 13:16:41 -0700 Subject: [PATCH 12/34] Disable C11 atomics in __config --- libcxx/include/__config | 7 ++++--- libcxx/include/atomic | 2 +- libcxx/include/support/atomic/atomic_gcc.h | 4 ++-- 3 
files changed, 7 insertions(+), 6 deletions(-) diff --git a/libcxx/include/__config b/libcxx/include/__config index 8e5811d20e..5f6a0f76f4 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -1598,9 +1598,10 @@ _LIBCUDACXX_FUNC_VIS extern "C" void __sanitizer_annotate_contiguous_container( #define _LIBCUDACXX_HAS_NO_THREAD_UNSAFE_C_FUNCTIONS #endif -#if __has_feature(cxx_atomic) || __has_extension(c_atomic) || __has_keyword(_Atomic) -# define _LIBCUDACXX_HAS_C_ATOMIC_IMP -#elif defined(_LIBCUDACXX_COMPILER_GCC) +// TODO: Support C11 Atomics? +// #if __has_feature(cxx_atomic) || __has_extension(c_atomic) || __has_keyword(_Atomic) +// # define _LIBCUDACXX_HAS_C_ATOMIC_IMP +#if defined(_LIBCUDACXX_COMPILER_GCC) || defined(_LIBCUDACXX_COMPILER_CLANG) # define _LIBCUDACXX_HAS_GCC_ATOMIC_IMP #elif defined(_LIBCUDACXX_COMPILER_MSVC) # define _LIBCUDACXX_HAS_MSVC_ATOMIC_IMPL diff --git a/libcxx/include/atomic b/libcxx/include/atomic index 9b76c97de9..406f0ec22b 100644 --- a/libcxx/include/atomic +++ b/libcxx/include/atomic @@ -679,7 +679,7 @@ namespace detail { #elif defined(_LIBCUDACXX_HAS_GCC_ATOMIC_IMP) # include "support/atomic/atomic_gcc.h" #elif defined(_LIBCUDACXX_HAS_C_ATOMIC_IMP) -// TODO: +// TODO: Maybe support C11 atomics? // #include "support/atomic/atomic_c11.h" #endif // _LIBCUDACXX_HAS_GCC_ATOMIC_IMP, _LIBCUDACXX_HAS_C_ATOMIC_IMP } diff --git a/libcxx/include/support/atomic/atomic_gcc.h b/libcxx/include/support/atomic/atomic_gcc.h index e1f4d64e0f..0503f60e96 100644 --- a/libcxx/include/support/atomic/atomic_gcc.h +++ b/libcxx/include/support/atomic/atomic_gcc.h @@ -60,7 +60,7 @@ struct __cxx_atomic_ref_base_impl { template _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR -_Tp* __cxx_get_underlying_atomic(__cxx_atomic_ref_base_impl<_Tp, _Sco> * __a) _NOEXCEPT { +_Tp* __cxx_get_underlying_atomic(__cxx_atomic_ref_base_impl<_Tp, _Sco>* __a) _NOEXCEPT { return __a->__a_value; } @@ -119,7 +119,7 @@ inline void __cxx_atomic_init(volatile _Tp* __a, _Up __val) { template inline void __cxx_atomic_init(_Tp* __a, _Up __val) { auto __a_tmp = __cxx_atomic_base_unwrap(__a); - __a = __val; + *__a_tmp = __val; } inline From 88d92404e674cd62de56cc61aba9f9602150ccca Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Fri, 9 Jul 2021 17:06:37 -0700 Subject: [PATCH 13/34] fixups for Clang specific issues in atomic, make sure is within a C++ context --- libcxx/include/__config | 12 +++++++++--- libcxx/include/support/atomic/atomic_gcc.h | 4 ++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/libcxx/include/__config b/libcxx/include/__config index 5f6a0f76f4..464523eaef 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -10,8 +10,6 @@ #ifndef _LIBCUDACXX_CONFIG #define _LIBCUDACXX_CONFIG -#include - #if defined(_MSC_VER) && !defined(__clang__) #define _LIBCUDACXX_HAS_PRAGMA_MSVC_WARNING #if !defined(_LIBCUDACXX_DISABLE_PRAGMA_MSVC_WARNING) @@ -30,6 +28,11 @@ #ifdef __cplusplus +// __config may be included in `extern "C"` contexts, switch back to include +extern "C++" { +#include +} + #ifdef __GNUC__ # define _GNUC_VER (__GNUC__ * 100 + __GNUC_MINOR__) // The _GNUC_VER_NEW macro better represents the new GCC versioning scheme @@ -1601,7 +1604,10 @@ _LIBCUDACXX_FUNC_VIS extern "C" void __sanitizer_annotate_contiguous_container( // TODO: Support C11 Atomics? 
// #if __has_feature(cxx_atomic) || __has_extension(c_atomic) || __has_keyword(_Atomic) // # define _LIBCUDACXX_HAS_C_ATOMIC_IMP -#if defined(_LIBCUDACXX_COMPILER_GCC) || defined(_LIBCUDACXX_COMPILER_CLANG) +#if defined(_LIBCUDACXX_COMPILER_CLANG) +# define _LIBCUDACXX_HAS_GCC_ATOMIC_IMP +# define _LIBCUDACXX_NO_RUNTIME_LOCK_FREE +#elif defined(_LIBCUDACXX_COMPILER_GCC) # define _LIBCUDACXX_HAS_GCC_ATOMIC_IMP #elif defined(_LIBCUDACXX_COMPILER_MSVC) # define _LIBCUDACXX_HAS_MSVC_ATOMIC_IMPL diff --git a/libcxx/include/support/atomic/atomic_gcc.h b/libcxx/include/support/atomic/atomic_gcc.h index 0503f60e96..32f62e518b 100644 --- a/libcxx/include/support/atomic/atomic_gcc.h +++ b/libcxx/include/support/atomic/atomic_gcc.h @@ -236,5 +236,9 @@ inline auto __cxx_atomic_fetch_xor(_Tp* __a, _Td __pattern, inline constexpr bool __cxx_atomic_is_lock_free(size_t __x) { + #if defined(_LIBCUDACXX_NO_RUNTIME_LOCK_FREE) + return __x <= 8; + #else return __atomic_is_lock_free(__x, 0); + #endif } From c9eeeee7c95859b2cd367fd5b2201b201f6ca298 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Tue, 13 Jul 2021 19:58:11 -0700 Subject: [PATCH 14/34] Fix CUDA and MSVC atomic layers --- libcxx/include/support/atomic/atomic_cuda.h | 47 ++++++---------- libcxx/include/support/atomic/atomic_msvc.h | 62 +++++++++++++++------ 2 files changed, 63 insertions(+), 46 deletions(-) diff --git a/libcxx/include/support/atomic/atomic_cuda.h b/libcxx/include/support/atomic/atomic_cuda.h index 4fdca7db2c..c61471a9af 100644 --- a/libcxx/include/support/atomic/atomic_cuda.h +++ b/libcxx/include/support/atomic/atomic_cuda.h @@ -159,16 +159,16 @@ struct __cxx_atomic_base_heterogeneous_impl { __cxx_atomic_base_heterogeneous_storage<_Tp, _Sco, _Ref> __a_value; _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR - auto __get_device() const volatile _NOEXCEPT -> decltype(__a_value.__get_atom()) { - return __a_value.__get_atom(); + auto __get_device() const volatile _NOEXCEPT -> decltype(__cxx_atomic_base_unwrap(&__a_value)) { + return __cxx_atomic_base_unwrap(&__a_value); } _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR - auto __get_device() volatile _NOEXCEPT -> decltype(__a_value.__get_atom()) { - return __a_value.__get_atom(); + auto __get_device() volatile _NOEXCEPT -> decltype(__cxx_atomic_base_unwrap(&__a_value)) { + return __cxx_atomic_base_unwrap(&__a_value); } _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR - auto __get_device() const _NOEXCEPT -> decltype(__a_value.__get_atom()) { - return __a_value.__get_atom(); + auto __get_device() const _NOEXCEPT -> decltype(__cxx_atomic_base_unwrap(&__a_value)) { + return __cxx_atomic_base_unwrap(&__a_value); } _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR @@ -193,19 +193,6 @@ struct __cxx_atomic_base_small_impl { } __cxx_atomic_base_heterogeneous_impl __a_value; - - _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR - auto __get_atom() const volatile _NOEXCEPT -> decltype(&__a_value) { - return &__a_value; - } - _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR - auto __get_atom() volatile _NOEXCEPT -> decltype(&__a_value) { - return &__a_value; - } - _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR - auto __get_atom() const _NOEXCEPT -> decltype(&__a_value) { - return &__a_value; - } }; template @@ -416,22 +403,22 @@ __host__ __device__ inline _Tp __cxx_small_from_32(uint32_t __val) { template __host__ __device__ inline void __cxx_atomic_init(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __val) { - __cxx_atomic_init(__a->__get_atom(), 
__cxx_small_to_32(__val)); + __cxx_atomic_init(&__a->__a_value, __cxx_small_to_32(__val)); } template __host__ __device__ inline void __cxx_atomic_store(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __val, memory_order __order) { - __cxx_atomic_store(__a->__get_atom(), __cxx_small_to_32(__val), __order); + __cxx_atomic_store(&__a->__a_value, __cxx_small_to_32(__val), __order); } template __host__ __device__ inline _Tp __cxx_atomic_load(__cxx_atomic_base_small_impl<_Tp, _Sco> const volatile* __a, memory_order __order) { - return __cxx_small_from_32<_Tp>(__cxx_atomic_load(__a->__get_atom(), __order)); + return __cxx_small_from_32<_Tp>(__cxx_atomic_load(&__a->__a_value, __order)); } template __host__ __device__ inline _Tp __cxx_atomic_exchange(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __value, memory_order __order) { - return __cxx_small_from_32<_Tp>(__cxx_atomic_exchange(__a->__get_atom(), __cxx_small_to_32(__value), __order)); + return __cxx_small_from_32<_Tp>(__cxx_atomic_exchange(&__a->__a_value, __cxx_small_to_32(__value), __order)); } __host__ __device__ inline int __cuda_memcmp(void const * __lhs, void const * __rhs, size_t __count) { @@ -453,11 +440,11 @@ inline int __cuda_memcmp(void const * __lhs, void const * __rhs, size_t __count) template __host__ __device__ inline bool __cxx_atomic_compare_exchange_weak(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp* __expected, _Tp __value, memory_order __success, memory_order __failure) { auto __temp = __cxx_small_to_32(*__expected); - auto const __ret = __cxx_atomic_compare_exchange_weak(__a->__get_atom(), &__temp, __cxx_small_to_32(__value), __success, __failure); + auto const __ret = __cxx_atomic_compare_exchange_weak(&__a->__a_value, &__temp, __cxx_small_to_32(__value), __success, __failure); auto const __actual = __cxx_small_from_32<_Tp>(__temp); if(!__ret) { if(0 == __cuda_memcmp(&__actual, __expected, sizeof(_Tp))) - __cxx_atomic_fetch_and(__a->__get_atom(), (1u << (8*sizeof(_Tp))) - 1, memory_order::memory_order_relaxed); + __cxx_atomic_fetch_and(&__a->__a_value, (1u << (8*sizeof(_Tp))) - 1, memory_order::memory_order_relaxed); else *__expected = __actual; } @@ -477,25 +464,25 @@ __host__ __device__ inline bool __cxx_atomic_compare_exchange_strong(__cxx_atomi template __host__ __device__ inline _Tp __cxx_atomic_fetch_add(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __delta, memory_order __order) { - return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_add(__a->__get_atom(), __cxx_small_to_32(__delta), __order)); + return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_add(&__a->__a_value, __cxx_small_to_32(__delta), __order)); } template __host__ __device__ inline _Tp __cxx_atomic_fetch_sub(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __delta, memory_order __order) { - return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_sub(__a->__get_atom(), __cxx_small_to_32(__delta), __order)); + return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_sub(&__a->__a_value, __cxx_small_to_32(__delta), __order)); } template __host__ __device__ inline _Tp __cxx_atomic_fetch_and(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __pattern, memory_order __order) { - return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_and(__a->__get_atom(), __cxx_small_to_32(__pattern), __order)); + return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_and(&__a->__a_value, __cxx_small_to_32(__pattern), __order)); } template __host__ __device__ inline _Tp __cxx_atomic_fetch_or(__cxx_atomic_base_small_impl<_Tp, 
_Sco> volatile* __a, _Tp __pattern, memory_order __order) { - return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_or(__a->__get_atom(), __cxx_small_to_32(__pattern), __order)); + return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_or(&__a->__a_value, __cxx_small_to_32(__pattern), __order)); } template __host__ __device__ inline _Tp __cxx_atomic_fetch_xor(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __pattern, memory_order __order) { - return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_xor(__a->__get_atom(), __cxx_small_to_32(__pattern), __order)); + return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_xor(&__a->__a_value, __cxx_small_to_32(__pattern), __order)); } diff --git a/libcxx/include/support/atomic/atomic_msvc.h b/libcxx/include/support/atomic/atomic_msvc.h index 4f849726fe..73e2ce6def 100644 --- a/libcxx/include/support/atomic/atomic_msvc.h +++ b/libcxx/include/support/atomic/atomic_msvc.h @@ -474,16 +474,31 @@ struct __cxx_atomic_base_impl { __cxx_atomic_base_impl(_Tp value) _NOEXCEPT : __a_value(value) {} _ALIGNAS(sizeof(_Tp)) _Tp __a_value; +}; - _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR - const volatile _Tp* __get_atom() const volatile _NOEXCEPT {return &__a_value;} +template +_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR +_Tp* __cxx_get_underlying_atomic(__cxx_atomic_base_impl<_Tp, _Sco> * __a) _NOEXCEPT { + return &__a->__a_value; +} - _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR - const _Tp* __get_atom() const _NOEXCEPT {return &__a_value;} +template +_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR +volatile _Tp* __cxx_get_underlying_atomic(__cxx_atomic_base_impl<_Tp, _Sco> volatile* __a) _NOEXCEPT { + return &__a->__a_value; +} - _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR - volatile _Tp* __get_atom() volatile _NOEXCEPT {return &__a_value;} -}; +template +_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR +const _Tp* __cxx_get_underlying_atomic(__cxx_atomic_base_impl<_Tp, _Sco> const* __a) _NOEXCEPT { + return &__a->__a_value; +} + +template +_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR +const volatile _Tp* __cxx_get_underlying_atomic(__cxx_atomic_base_impl<_Tp, _Sco> const volatile* __a) _NOEXCEPT { + return &__a->__a_value; +} template struct __cxx_atomic_ref_base_impl { @@ -496,20 +511,35 @@ struct __cxx_atomic_ref_base_impl { __cxx_atomic_ref_base_impl(_Tp value) _NOEXCEPT : __a_value(value) {} _Tp* __a_value; +}; - _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR - const volatile _Tp* __get_atom() const volatile _NOEXCEPT {return __a_value;} +template +_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR +_Tp* __cxx_get_underlying_atomic(__cxx_atomic_ref_base_impl<_Tp, _Sco>* __a) _NOEXCEPT { + return __a->__a_value; +} - _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR - const _Tp* __get_atom() const _NOEXCEPT {return __a_value;} +template +_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR +volatile _Tp* __cxx_get_underlying_atomic(__cxx_atomic_ref_base_impl<_Tp, _Sco> volatile* __a) _NOEXCEPT { + return __a->__a_value; +} - _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR - volatile _Tp* __get_atom() volatile _NOEXCEPT {return __a_value;} -}; +template +_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR +const _Tp* __cxx_get_underlying_atomic(__cxx_atomic_ref_base_impl<_Tp, _Sco> const* __a) _NOEXCEPT { + return __a->__a_value; +} + +template +_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR +const volatile _Tp* __cxx_get_underlying_atomic(__cxx_atomic_ref_base_impl<_Tp, _Sco> const volatile* __a) 
_NOEXCEPT { + return __a->__a_value; +} template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR auto __cxx_atomic_base_unwrap(_Tp* __a) _NOEXCEPT -> decltype(__a->__get_atom()) { - return __a->__get_atom(); +_LIBCUDACXX_INLINE_VISIBILITY auto __cxx_atomic_base_unwrap(_Tp* __a) _NOEXCEPT -> decltype(__cxx_get_underlying_atomic(__a)) { + return __cxx_get_underlying_atomic(__a); } template From e9dcf2b750f52c3f16ae099a04ddea07b0dfc49c Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Tue, 13 Jul 2021 19:58:31 -0700 Subject: [PATCH 15/34] uncomment a couple tests from pipeline --- .../test/cuda/pipeline_memcpy_async_thread_scope_generic.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.upstream-tests/test/cuda/pipeline_memcpy_async_thread_scope_generic.h b/.upstream-tests/test/cuda/pipeline_memcpy_async_thread_scope_generic.h index 32cb82300c..a9ee75c189 100644 --- a/.upstream-tests/test/cuda/pipeline_memcpy_async_thread_scope_generic.h +++ b/.upstream-tests/test/cuda/pipeline_memcpy_async_thread_scope_generic.h @@ -122,8 +122,8 @@ void test_select_source() test_select_destination(); NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - // test_select_destination(); - // test_select_destination(); + test_select_destination(); + test_select_destination(); ) ) } From 28e83defa36e4ec3da6062fdb10badae04921b90 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Wed, 14 Jul 2021 14:59:41 -0700 Subject: [PATCH 16/34] Revert tests, will -ify later --- ...peline_memcpy_async_thread_scope_generic.h | 46 ++++--- .../atomics.flag/atomic_flag_wait.pass.cpp | 2 - .../std/atomics/atomics.flag/default.pass.cpp | 36 ++---- .../atomic_wait.pass.cpp | 2 - .../std/thread/thread.barrier/arrive.pass.cpp | 2 +- .../thread.barrier/arrive_and_drop.pass.cpp | 2 +- .../thread.barrier/arrive_and_wait.pass.cpp | 2 +- .../thread/thread.barrier/completion.pass.cpp | 4 +- .../thread.latch/arrive_and_wait.pass.cpp | 2 +- .../thread/thread.latch/count_down.pass.cpp | 2 +- .../std/thread/thread.latch/try_wait.pass.cpp | 2 +- .../thread/thread.semaphore/release.pass.cpp | 2 +- .../func.not_fn/not_fn.pass.cpp | 46 ++++--- .../std/utilities/time/time.cal/euclidian.h | 39 ++---- .../tuple.tuple/tuple.apply/apply.pass.cpp | 34 ++--- .../tuple.apply/apply_extended_types.pass.cpp | 118 +++++++++--------- .../tuple.tuple/tuple.assign/move.pass.cpp | 22 ++-- ...4_contains_ref_to_incomplete_type.pass.cpp | 73 ++++------- .../tuple.tuple/tuple.cnstr/PR31384.pass.cpp | 34 ++--- .../pairs/pairs.pair/assign_pair.pass.cpp | 46 ++----- .../test/support/concurrent_agents.h | 9 +- .../test/support/cuda_space_selector.h | 44 ++----- .upstream-tests/test/support/test_macros.h | 47 ++----- 23 files changed, 243 insertions(+), 373 deletions(-) diff --git a/.upstream-tests/test/cuda/pipeline_memcpy_async_thread_scope_generic.h b/.upstream-tests/test/cuda/pipeline_memcpy_async_thread_scope_generic.h index a9ee75c189..fbba4cef8d 100644 --- a/.upstream-tests/test/cuda/pipeline_memcpy_async_thread_scope_generic.h +++ b/.upstream-tests/test/cuda/pipeline_memcpy_async_thread_scope_generic.h @@ -16,7 +16,6 @@ #include "large_type.h" template < - class Group, cuda::thread_scope Scope, class T, template class SourceSelector, @@ -25,7 +24,7 @@ template < uint8_t PipelineStages > __host__ __device__ __noinline__ -void test_fully_specialized(Group &group) +void test_fully_specialized() { SourceSelector source_sel; typename DestSelector @@ -36,7 +35,13 @@ void test_fully_specialized(Group &group) T * dest = dest_sel.construct(static_cast(0)); 
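// Editorial sketch, not part of the patch: the revert in this hunk trades the
// <nv/target> dispatch macros back for raw __CUDA_ARCH__ guards (the commit
// message indicates the tests will be re-converted to <nv/target> later).
// For a hypothetical device-only statement do_device_only_part(), the two
// spellings of the same host/device split compare roughly as follows; the
// NV_DISPATCH_TARGET form mirrors how it is used elsewhere in this series.
//
//   #ifdef __CUDA_ARCH__            // style reinstated by this patch
//     do_device_only_part();
//   #endif
//
//   NV_DISPATCH_TARGET(             // style removed here
//     NV_IS_DEVICE, (
//       do_device_only_part();
//     )
//   )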
cuda::pipeline_shared_state * pipe_state = pipe_state_sel.construct(); - auto pipe = cuda::make_pipeline(group, pipe_state); +#ifdef __CUDA_ARCH__ + auto group = cooperative_groups::this_thread_block(); +#else + auto group = cuda::__single_thread_group{}; +#endif + + auto pipe = make_pipeline(group, pipe_state); assert(*source == 12); assert(*dest == 0); @@ -86,16 +91,11 @@ __host__ __device__ __noinline__ void test_select_pipeline() { constexpr uint8_t stages_count = 2; - - auto singleGroup = cuda::__single_thread_group{}; - test_fully_specialized(singleGroup); - NV_DISPATCH_TARGET( - NV_IS_DEVICE, ( - auto group = cooperative_groups::this_thread_block(); - test_fully_specialized(group); - test_fully_specialized(group); - ) - ) + test_fully_specialized(); +#ifdef __CUDA_ARCH__ + test_fully_specialized(); + test_fully_specialized(); +#endif } template < @@ -107,12 +107,10 @@ __host__ __device__ __noinline__ void test_select_destination() { test_select_pipeline(); - NV_DISPATCH_TARGET( - NV_IS_DEVICE, ( - test_select_pipeline(); - test_select_pipeline(); - ) - ) +#ifdef __CUDA_ARCH__ + test_select_pipeline(); + test_select_pipeline(); +#endif } template @@ -120,10 +118,8 @@ __host__ __device__ __noinline__ void test_select_source() { test_select_destination(); - NV_DISPATCH_TARGET( - NV_IS_DEVICE, ( - test_select_destination(); - test_select_destination(); - ) - ) +#ifdef __CUDA_ARCH__ + test_select_destination(); + test_select_destination(); +#endif } diff --git a/.upstream-tests/test/std/atomics/atomics.flag/atomic_flag_wait.pass.cpp b/.upstream-tests/test/std/atomics/atomics.flag/atomic_flag_wait.pass.cpp index 8a13854927..52b92276bb 100644 --- a/.upstream-tests/test/std/atomics/atomics.flag/atomic_flag_wait.pass.cpp +++ b/.upstream-tests/test/std/atomics/atomics.flag/atomic_flag_wait.pass.cpp @@ -9,8 +9,6 @@ // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: c++98, c++03 // UNSUPPORTED: pre-sm-70 -// NVC++ does not support GPU function pointers -// UNSUPPORTED: pgi // diff --git a/.upstream-tests/test/std/atomics/atomics.flag/default.pass.cpp b/.upstream-tests/test/std/atomics/atomics.flag/default.pass.cpp index f2aa50ebc7..343dbe46c2 100644 --- a/.upstream-tests/test/std/atomics/atomics.flag/default.pass.cpp +++ b/.upstream-tests/test/std/atomics/atomics.flag/default.pass.cpp @@ -25,40 +25,26 @@ #endif #include "cuda_space_selector.h" - template class Selector> __host__ __device__ void test() { -// cudafe crashes on trying to interpret the line below when compiling with Clang -// TODO: file a compiler bug -#if !(defined(__clang__) && defined(__CUDACC__)) -# define _TEST_NO_DESTRUCT_ZERO -#else -# define _TEST_NO_DESTRUCT_ZERO zero.~A() -#endif - Selector sel; cuda::std::atomic_flag & f = *sel.construct(); f.clear(); assert(f.test_and_set() == 0); { - NV_DISPATCH_TARGET( - NV_PROVIDES_SM_70, ( - typedef cuda::std::atomic_flag A; - TEST_ALIGNAS_TYPE(A) char storage[sizeof(A)] = {1}; - A& zero = *new (storage) A(); - assert(!zero.test_and_set()); - _TEST_NO_DESTRUCT_ZERO; - ), - NV_IS_HOST, ( - typedef cuda::std::atomic_flag A; - TEST_ALIGNAS_TYPE(A) char storage[sizeof(A)] = {1}; - A& zero = *new (storage) A(); - assert(!zero.test_and_set()); - _TEST_NO_DESTRUCT_ZERO; - ) - ) +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + typedef cuda::std::atomic_flag A; + TEST_ALIGNAS_TYPE(A) char storage[sizeof(A)] = {1}; + A& zero = *new (storage) A(); + assert(!zero.test_and_set()); + // cudafe crashes on trying to interpret the line below when compiling with Clang + // TODO: file a 
compiler bug +#if !(defined(__clang__) && defined(__CUDACC__)) + zero.~A(); +#endif +#endif } } diff --git a/.upstream-tests/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait.pass.cpp b/.upstream-tests/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait.pass.cpp index 63b2caa2bb..2dc72caeec 100644 --- a/.upstream-tests/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait.pass.cpp +++ b/.upstream-tests/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait.pass.cpp @@ -9,8 +9,6 @@ // UNSUPPORTED: libcpp-has-no-threads // UNSUPPORTED: c++98, c++03 // UNSUPPORTED: pre-sm-70 -// NVC++ does not support GPU function pointers -// UNSUPPORTED: pgi // diff --git a/.upstream-tests/test/std/thread/thread.barrier/arrive.pass.cpp b/.upstream-tests/test/std/thread/thread.barrier/arrive.pass.cpp index f8a2849854..98cac810b9 100644 --- a/.upstream-tests/test/std/thread/thread.barrier/arrive.pass.cpp +++ b/.upstream-tests/test/std/thread/thread.barrier/arrive.pass.cpp @@ -25,7 +25,7 @@ __host__ __device__ void test() { Selector sel; - Barrier*& b = maybe_shared_mem(); + SHARED Barrier * b; b = sel.construct(2); #ifdef __CUDA_ARCH__ diff --git a/.upstream-tests/test/std/thread/thread.barrier/arrive_and_drop.pass.cpp b/.upstream-tests/test/std/thread/thread.barrier/arrive_and_drop.pass.cpp index f46d31e004..25acfe46f6 100644 --- a/.upstream-tests/test/std/thread/thread.barrier/arrive_and_drop.pass.cpp +++ b/.upstream-tests/test/std/thread/thread.barrier/arrive_and_drop.pass.cpp @@ -24,7 +24,7 @@ __host__ __device__ void test() { Selector sel; - Barrier*& b = maybe_shared_mem(); + SHARED Barrier * b; b = sel.construct(2); auto dropper = LAMBDA (){ diff --git a/.upstream-tests/test/std/thread/thread.barrier/arrive_and_wait.pass.cpp b/.upstream-tests/test/std/thread/thread.barrier/arrive_and_wait.pass.cpp index d99f99006e..2989b9b134 100644 --- a/.upstream-tests/test/std/thread/thread.barrier/arrive_and_wait.pass.cpp +++ b/.upstream-tests/test/std/thread/thread.barrier/arrive_and_wait.pass.cpp @@ -24,7 +24,7 @@ __host__ __device__ void test() { Selector sel; - Barrier*& b = maybe_shared_mem(); + SHARED Barrier * b; b = sel.construct(2); auto worker = LAMBDA (){ diff --git a/.upstream-tests/test/std/thread/thread.barrier/completion.pass.cpp b/.upstream-tests/test/std/thread/thread.barrier/completion.pass.cpp index dc1aefc4fc..e9afbe17f3 100644 --- a/.upstream-tests/test/std/thread/thread.barrier/completion.pass.cpp +++ b/.upstream-tests/test/std/thread/thread.barrier/completion.pass.cpp @@ -24,13 +24,13 @@ __host__ __device__ void test() { global_memory_selector int_sel; - int*& x = maybe_shared_mem(); + SHARED int * x; x = int_sel.construct(0); auto comp = LAMBDA () { *x += 1; }; Selector, Initializer> sel; - Barrier*& b = maybe_shared_mem*>(); + SHARED Barrier * b; b = sel.construct(2, comp); auto worker = LAMBDA () { diff --git a/.upstream-tests/test/std/thread/thread.latch/arrive_and_wait.pass.cpp b/.upstream-tests/test/std/thread/thread.latch/arrive_and_wait.pass.cpp index 815bb5a055..4fa55834a1 100644 --- a/.upstream-tests/test/std/thread/thread.latch/arrive_and_wait.pass.cpp +++ b/.upstream-tests/test/std/thread/thread.latch/arrive_and_wait.pass.cpp @@ -24,7 +24,7 @@ __host__ __device__ void test() { Selector sel; - Latch*& l = maybe_shared_mem(); + SHARED Latch * l; l = sel.construct(2); auto worker = LAMBDA (){ diff --git 
a/.upstream-tests/test/std/thread/thread.latch/count_down.pass.cpp b/.upstream-tests/test/std/thread/thread.latch/count_down.pass.cpp index 239eae9a88..b8720606bb 100644 --- a/.upstream-tests/test/std/thread/thread.latch/count_down.pass.cpp +++ b/.upstream-tests/test/std/thread/thread.latch/count_down.pass.cpp @@ -24,7 +24,7 @@ __host__ __device__ void test() { Selector sel; - Latch*& l = maybe_shared_mem(); + SHARED Latch * l; l = sel.construct(2); #ifdef __CUDA_ARCH__ diff --git a/.upstream-tests/test/std/thread/thread.latch/try_wait.pass.cpp b/.upstream-tests/test/std/thread/thread.latch/try_wait.pass.cpp index f6b6d9e027..7d650ce692 100644 --- a/.upstream-tests/test/std/thread/thread.latch/try_wait.pass.cpp +++ b/.upstream-tests/test/std/thread/thread.latch/try_wait.pass.cpp @@ -24,7 +24,7 @@ __host__ __device__ void test() { Selector sel; - Latch*& l = maybe_shared_mem(); + SHARED Latch * l; l = sel.construct(1); l->count_down(); diff --git a/.upstream-tests/test/std/thread/thread.semaphore/release.pass.cpp b/.upstream-tests/test/std/thread/thread.semaphore/release.pass.cpp index 8efb47490a..28b8986ac2 100644 --- a/.upstream-tests/test/std/thread/thread.semaphore/release.pass.cpp +++ b/.upstream-tests/test/std/thread/thread.semaphore/release.pass.cpp @@ -24,7 +24,7 @@ __host__ __device__ void test() { Selector sel; - Semaphore*& s = maybe_shared_mem(); + SHARED Semaphore * s; s = sel.construct(2); #ifdef __CUDA_ARCH__ diff --git a/.upstream-tests/test/std/utilities/function.objects/func.not_fn/not_fn.pass.cpp b/.upstream-tests/test/std/utilities/function.objects/func.not_fn/not_fn.pass.cpp index 4638cc657b..63b0e6d09b 100644 --- a/.upstream-tests/test/std/utilities/function.objects/func.not_fn/not_fn.pass.cpp +++ b/.upstream-tests/test/std/utilities/function.objects/func.not_fn/not_fn.pass.cpp @@ -166,9 +166,16 @@ inline constexpr CallType operator|(CallType LHS, CallType RHS) { #if 0 +#ifdef __CUDA_ARCH__ +__device__ +#endif +CallType ForwardingCallObject_last_call_type = CT_None; +#ifdef __CUDA_ARCH__ +__device__ +#endif +TypeID const* ForwardingCallObject_last_call_args = nullptr; + struct ForwardingCallObject { - STATIC_MEMBER_VAR(ForwardingCallObject_last_call_type, CallType) - STATIC_MEMBER_VAR(ForwardingCallObject_last_call_args, TypeID const*) template __host__ __device__ @@ -202,21 +209,21 @@ struct ForwardingCallObject { template __host__ __device__ static void set_call(CallType type) { - assert(ForwardingCallObject_last_call_type() == CT_None); - assert(ForwardingCallObject_last_call_args() == nullptr); - ForwardingCallObject_last_call_type() = type; - ForwardingCallObject_last_call_args() = &makeArgumentID(); + assert(ForwardingCallObject_last_call_type == CT_None); + assert(ForwardingCallObject_last_call_args == nullptr); + ForwardingCallObject_last_call_type = type; + ForwardingCallObject_last_call_args = &makeArgumentID(); } template __host__ __device__ static bool check_call(CallType type) { bool result = - ForwardingCallObject_last_call_type() == type - && ForwardingCallObject_last_call_args() - && *ForwardingCallObject_last_call_args() == makeArgumentID(); - ForwardingCallObject_last_call_type() = CT_None; - ForwardingCallObject_last_call_args() = nullptr; + ForwardingCallObject_last_call_type == type + && ForwardingCallObject_last_call_args + && *ForwardingCallObject_last_call_args == makeArgumentID(); + ForwardingCallObject_last_call_type = CT_None; + ForwardingCallObject_last_call_args = nullptr; return result; } }; @@ -228,15 +235,18 @@ struct 
ForwardingCallObject { // BOOL TEST TYPES /////////////////////////////////////////////////////////////////////////////// -struct EvilBool { - STATIC_MEMBER_VAR(EvilBool_bang_called, int) +#ifdef __CUDA_ARCH__ +__device__ +#endif +int EvilBool_bang_called = 0; +struct EvilBool { EvilBool(EvilBool const&) = default; EvilBool(EvilBool&&) = default; __host__ __device__ friend EvilBool operator!(EvilBool const& other) { - ++EvilBool_bang_called(); + ++EvilBool_bang_called; return EvilBool{!other.value}; } @@ -399,12 +409,12 @@ void return_type_tests() using T = CopyCallable; auto ret = cuda::std::not_fn(T{false}); static_assert(is_same::value, ""); - EvilBool::EvilBool_bang_called() = 0; + EvilBool_bang_called = 0; auto value_ret = ret(); - assert(EvilBool::EvilBool_bang_called() == 1); + assert(EvilBool_bang_called == 1); assert(value_ret.value == true); ret(); - assert(EvilBool::EvilBool_bang_called() == 2); + assert(EvilBool_bang_called == 2); } } @@ -517,8 +527,6 @@ void call_operator_sfinae_test() { __host__ __device__ void call_operator_forwarding_test() { - ForwardingCallObject::ForwardingCallObject_last_call_type() = CT_None; - ForwardingCallObject::ForwardingCallObject_last_call_args() = nullptr; using Fn = ForwardingCallObject; auto obj = cuda::std::not_fn(Fn{}); const auto& c_obj = obj; diff --git a/.upstream-tests/test/std/utilities/time/time.cal/euclidian.h b/.upstream-tests/test/std/utilities/time/time.cal/euclidian.h index 3592ff80b4..eb8019fdf6 100644 --- a/.upstream-tests/test/std/utilities/time/time.cal/euclidian.h +++ b/.upstream-tests/test/std/utilities/time/time.cal/euclidian.h @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include +#include // Assumption: minValue < maxValue @@ -24,33 +24,6 @@ T euclidian_addition(T rhs, T lhs) return ret; } -template ::value, T zero = 0> -struct signed_euclidean_subtraction { - static constexpr T modulus = maxValue - minValue + 1; - __host__ __device__ T operator()(T lhs, T rhs) { - T ret = lhs - rhs; - if (ret < minValue) { - ret += modulus; - } - if (ret > maxValue) { - ret += modulus; - } - return ret; - } -}; - -template -struct signed_euclidean_subtraction { - static constexpr T modulus = maxValue + 1; - __host__ __device__ T operator()(T lhs, T rhs) { - T ret = lhs - rhs; - if (ret > maxValue) { - ret += modulus; - } - return ret; - } -}; - // Assumption: minValue < maxValue // Assumption: minValue <= rhs <= maxValue // Assumption: minValue <= lhs <= maxValue @@ -59,7 +32,11 @@ template __host__ __device__ T euclidian_subtraction(T lhs, T rhs) { - signed_euclidean_subtraction op; - - return op(lhs, rhs); + const T modulus = maxValue - minValue + 1; + T ret = lhs - rhs; + if (std::is_signed::value and (ret < minValue)) // avoids warning about comparison with zero if T is unsigned + ret += modulus; + if (ret > maxValue) // this can happen if T is unsigned + ret += modulus; + return ret; } diff --git a/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.apply/apply.pass.cpp b/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.apply/apply.pass.cpp index 64dc0608c7..f7178048a8 100644 --- a/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.apply/apply.pass.cpp +++ b/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.apply/apply.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// UNSUPPORTED: c++98, c++03, c++11, c++14 +// UNSUPPORTED: c++98, c++03, c++11, c++14 // UNSUPPORTED: nvrtc // @@ 
-239,9 +239,11 @@ void test_noexcept() } namespace ReturnTypeTest { - struct global { - STATIC_MEMBER_VAR(my_int, int) - }; + #ifdef __CUDA_ARCH__ + __constant__ int my_int = 42; + #else + static int my_int = 42; + #endif template struct index {}; @@ -252,31 +254,31 @@ namespace ReturnTypeTest { int f(index<1>) { return 0; } __host__ __device__ - int & f(index<2>) { return static_cast(global::my_int()); } + int & f(index<2>) { return static_cast(my_int); } __host__ __device__ - int const & f(index<3>) { return static_cast(global::my_int()); } + int const & f(index<3>) { return static_cast(my_int); } __host__ __device__ - int volatile & f(index<4>) { return static_cast(global::my_int()); } + int volatile & f(index<4>) { return static_cast(my_int); } __host__ __device__ - int const volatile & f(index<5>) { return static_cast(global::my_int()); } + int const volatile & f(index<5>) { return static_cast(my_int); } __host__ __device__ - int && f(index<6>) { return static_cast(global::my_int()); } + int && f(index<6>) { return static_cast(my_int); } __host__ __device__ - int const && f(index<7>) { return static_cast(global::my_int()); } + int const && f(index<7>) { return static_cast(my_int); } __host__ __device__ - int volatile && f(index<8>) { return static_cast(global::my_int()); } + int volatile && f(index<8>) { return static_cast(my_int); } __host__ __device__ - int const volatile && f(index<9>) { return static_cast(global::my_int()); } + int const volatile && f(index<9>) { return static_cast(my_int); } __host__ __device__ - int * f(index<10>) { return static_cast(&global::my_int()); } + int * f(index<10>) { return static_cast(&my_int); } __host__ __device__ - int const * f(index<11>) { return static_cast(&global::my_int()); } + int const * f(index<11>) { return static_cast(&my_int); } __host__ __device__ - int volatile * f(index<12>) { return static_cast(&global::my_int()); } + int volatile * f(index<12>) { return static_cast(&my_int); } __host__ __device__ - int const volatile * f(index<13>) { return static_cast(&global::my_int()); } + int const volatile * f(index<13>) { return static_cast(&my_int); } template __host__ __device__ diff --git a/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.apply/apply_extended_types.pass.cpp b/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.apply/apply_extended_types.pass.cpp index 786d3208a3..4908965027 100644 --- a/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.apply/apply_extended_types.pass.cpp +++ b/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.apply/apply_extended_types.pass.cpp @@ -8,7 +8,7 @@ -// UNSUPPORTED: c++98, c++03, c++11, c++14 +// UNSUPPORTED: c++98, c++03, c++11, c++14 // @@ -31,16 +31,18 @@ #include "test_macros.h" #include "disable_missing_braces_warning.h" -struct global_state { - STATIC_MEMBER_VAR(count, int) -}; +#ifdef __CUDA_ARCH__ +__device__ int count = 0; +#else +int count = 0; +#endif struct A_int_0 { __host__ __device__ A_int_0() : obj1(0){} __host__ __device__ A_int_0(int x) : obj1(x) {} - __host__ __device__ int mem1() { return ++global_state::count(); } - __host__ __device__ int mem2() const { return ++global_state::count(); } + __host__ __device__ int mem1() { return ++count; } + __host__ __device__ int mem2() const { return ++count; } int const obj1; }; @@ -48,16 +50,16 @@ struct A_int_1 { __host__ __device__ A_int_1() {} __host__ __device__ A_int_1(int) {} - __host__ __device__ int mem1(int x) { return global_state::count() += x; } - __host__ __device__ int mem2(int x) const 
{ return global_state::count() += x; } + __host__ __device__ int mem1(int x) { return count += x; } + __host__ __device__ int mem2(int x) const { return count += x; } }; struct A_int_2 { __host__ __device__ A_int_2() {} __host__ __device__ A_int_2(int) {} - __host__ __device__ int mem1(int x, int y) { return global_state::count() += (x + y); } - __host__ __device__ int mem2(int x, int y) const { return global_state::count() += (x + y); } + __host__ __device__ int mem1(int x, int y) { return count += (x + y); } + __host__ __device__ int mem2(int x, int y) const { return count += (x + y); } }; template @@ -96,7 +98,7 @@ template < __host__ __device__ void test_ext_int_0() { - global_state::count() = 0; + count = 0; typedef A_int_0 T; typedef A_wrap_0 Wrap; typedef A_base_0 Base; @@ -115,63 +117,63 @@ void test_ext_int_0() T a; Tuple t{a}; assert(1 == cuda::std::apply(mem1, t)); - assert(global_state::count() == 1); + assert(count == 1); } - global_state::count() = 0; + count = 0; // member function w/pointer { T a; TuplePtr t{&a}; assert(1 == cuda::std::apply(mem1, t)); - assert(global_state::count() == 1); + assert(count == 1); } - global_state::count() = 0; + count = 0; // member function w/base { Base a; TupleBase t{a}; assert(1 == cuda::std::apply(mem1, t)); - assert(global_state::count() == 1); + assert(count == 1); } - global_state::count() = 0; + count = 0; // member function w/wrap { Wrap a; TupleWrap t{a}; assert(1 == cuda::std::apply(mem1, t)); - assert(global_state::count() == 1); + assert(count == 1); } - global_state::count() = 0; + count = 0; // const member function w/ref { T const a; ConstTuple t{a}; assert(1 == cuda::std::apply(mem2, t)); - assert(global_state::count() == 1); + assert(count == 1); } - global_state::count() = 0; + count = 0; // const member function w/pointer { T const a; ConstTuplePtr t{&a}; assert(1 == cuda::std::apply(mem2, t)); - assert(global_state::count() == 1); + assert(count == 1); } - global_state::count() = 0; + count = 0; // const member function w/base { Base const a; ConstTupleBase t{a}; assert(1 == cuda::std::apply(mem2, t)); - assert(global_state::count() == 1); + assert(count == 1); } - global_state::count() = 0; + count = 0; // const member function w/wrapper { Wrap const a; ConstTupleWrap t{a}; assert(1 == cuda::std::apply(mem2, t)); - assert(1 == global_state::count()); + assert(1 == count); } // member object w/ref { @@ -209,7 +211,7 @@ template < __host__ __device__ void test_ext_int_1() { - global_state::count() = 0; + count = 0; typedef A_int_1 T; typedef A_wrap_1 Wrap; typedef A_base_1 Base; @@ -225,63 +227,63 @@ void test_ext_int_1() T a; Tuple t{a, 2}; assert(2 == cuda::std::apply(mem1, t)); - assert(global_state::count() == 2); + assert(count == 2); } - global_state::count() = 0; + count = 0; // member function w/pointer { T a; TuplePtr t{&a, 3}; assert(3 == cuda::std::apply(mem1, t)); - assert(global_state::count() == 3); + assert(count == 3); } - global_state::count() = 0; + count = 0; // member function w/base { Base a; TupleBase t{a, 4}; assert(4 == cuda::std::apply(mem1, t)); - assert(global_state::count() == 4); + assert(count == 4); } - global_state::count() = 0; + count = 0; // member function w/wrap { Wrap a; TupleWrap t{a, 5}; assert(5 == cuda::std::apply(mem1, t)); - assert(global_state::count() == 5); + assert(count == 5); } - global_state::count() = 0; + count = 0; // const member function w/ref { T const a; ConstTuple t{a, 6}; assert(6 == cuda::std::apply(mem2, t)); - assert(global_state::count() == 6); + assert(count 
== 6); } - global_state::count() = 0; + count = 0; // const member function w/pointer { T const a; ConstTuplePtr t{&a, 7}; assert(7 == cuda::std::apply(mem2, t)); - assert(global_state::count() == 7); + assert(count == 7); } - global_state::count() = 0; + count = 0; // const member function w/base { Base const a; ConstTupleBase t{a, 8}; assert(8 == cuda::std::apply(mem2, t)); - assert(global_state::count() == 8); + assert(count == 8); } - global_state::count() = 0; + count = 0; // const member function w/wrapper { Wrap const a; ConstTupleWrap t{a, 9}; assert(9 == cuda::std::apply(mem2, t)); - assert(9 == global_state::count()); + assert(9 == count); } } @@ -295,7 +297,7 @@ template < __host__ __device__ void test_ext_int_2() { - global_state::count() = 0; + count = 0; typedef A_int_2 T; typedef A_wrap_2 Wrap; typedef A_base_2 Base; @@ -311,63 +313,63 @@ void test_ext_int_2() T a; Tuple t{a, 1, 1}; assert(2 == cuda::std::apply(mem1, t)); - assert(global_state::count() == 2); + assert(count == 2); } - global_state::count() = 0; + count = 0; // member function w/pointer { T a; TuplePtr t{&a, 1, 2}; assert(3 == cuda::std::apply(mem1, t)); - assert(global_state::count() == 3); + assert(count == 3); } - global_state::count() = 0; + count = 0; // member function w/base { Base a; TupleBase t{a, 2, 2}; assert(4 == cuda::std::apply(mem1, t)); - assert(global_state::count() == 4); + assert(count == 4); } - global_state::count() = 0; + count = 0; // member function w/wrap { Wrap a; TupleWrap t{a, 2, 3}; assert(5 == cuda::std::apply(mem1, t)); - assert(global_state::count() == 5); + assert(count == 5); } - global_state::count() = 0; + count = 0; // const member function w/ref { T const a; ConstTuple t{a, 3, 3}; assert(6 == cuda::std::apply(mem2, t)); - assert(global_state::count() == 6); + assert(count == 6); } - global_state::count() = 0; + count = 0; // const member function w/pointer { T const a; ConstTuplePtr t{&a, 3, 4}; assert(7 == cuda::std::apply(mem2, t)); - assert(global_state::count() == 7); + assert(count == 7); } - global_state::count() = 0; + count = 0; // const member function w/base { Base const a; ConstTupleBase t{a, 4, 4}; assert(8 == cuda::std::apply(mem2, t)); - assert(global_state::count() == 8); + assert(count == 8); } - global_state::count() = 0; + count = 0; // const member function w/wrapper { Wrap const a; ConstTupleWrap t{a, 4, 5}; assert(9 == cuda::std::apply(mem2, t)); - assert(9 == global_state::count()); + assert(9 == count); } } diff --git a/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.assign/move.pass.cpp b/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.assign/move.pass.cpp index 7ba86b4d41..6254d9d497 100644 --- a/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.assign/move.pass.cpp +++ b/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.assign/move.pass.cpp @@ -14,7 +14,7 @@ // tuple& operator=(tuple&& u); -// UNSUPPORTED: c++98, c++03 +// UNSUPPORTED: c++98, c++03 #include #include @@ -41,15 +41,19 @@ struct MoveAssignable { MoveAssignable& operator=(MoveAssignable&&) = default; }; +#ifdef __CUDA_ARCH__ +__device__ static int copied = 0; +__device__ static int moved = 0; +#else +static int copied = 0; +static int moved = 0; +#endif struct CountAssign { - STATIC_MEMBER_VAR(copied, int) - STATIC_MEMBER_VAR(moved, int) - - __host__ __device__ static void reset() { copied() = moved() = 0; } + __host__ __device__ static void reset() { copied = moved = 0; } CountAssign() = default; - __host__ __device__ CountAssign& 
operator=(CountAssign const&) { ++copied(); return *this; } - __host__ __device__ CountAssign& operator=(CountAssign&&) { ++moved(); return *this; } + __host__ __device__ CountAssign& operator=(CountAssign const&) { ++copied; return *this; } + __host__ __device__ CountAssign& operator=(CountAssign&&) { ++moved; return *this; } }; int main(int, char**) @@ -126,8 +130,8 @@ int main(int, char**) T t1; T t2; t1 = cuda::std::move(t2); - assert(CountAssign::copied() == 1); - assert(CountAssign::moved() == 0); + assert(copied == 1); + assert(moved == 0); } return 0; diff --git a/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR27684_contains_ref_to_incomplete_type.pass.cpp b/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR27684_contains_ref_to_incomplete_type.pass.cpp index c094ea54bd..9451c66b06 100644 --- a/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR27684_contains_ref_to_incomplete_type.pass.cpp +++ b/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR27684_contains_ref_to_incomplete_type.pass.cpp @@ -29,66 +29,41 @@ #include "test_macros.h" struct IncompleteType; - -#define STATIC_EXTERN_DECL(name, type) \ - __device__ static type& name##_device(); \ - __host__ static type& name##_host(); \ - __host__ __device__ static type& name(); - -struct global { - STATIC_EXTERN_DECL(inc1, IncompleteType) - STATIC_EXTERN_DECL(inc2, IncompleteType) - __host__ __device__ static const IncompleteType& cinc1(); - __host__ __device__ static const IncompleteType& cinc2(); -}; +#ifdef __CUDA_ARCH__ +__device__ extern IncompleteType inc1; +__device__ extern IncompleteType inc2; +__device__ IncompleteType const& cinc1 = inc1; +__device__ IncompleteType const& cinc2 = inc2; +#else +extern IncompleteType inc1; +extern IncompleteType inc2; +IncompleteType const& cinc1 = inc1; +IncompleteType const& cinc2 = inc2; +#endif int main(int, char**) { using IT = IncompleteType; { // try calling tuple(Tp const&...) using Tup = cuda::std::tuple; - Tup t(global::cinc1(), global::cinc2()); - assert(&cuda::std::get<0>(t) == &global::inc1()); - assert(&cuda::std::get<1>(t) == &global::inc2()); + Tup t(cinc1, cinc2); + assert(&cuda::std::get<0>(t) == &inc1); + assert(&cuda::std::get<1>(t) == &inc2); } { // try calling tuple(Up&&...) 
using Tup = cuda::std::tuple; - Tup t(global::inc1(), global::inc2()); - assert(&cuda::std::get<0>(t) == &global::inc1()); - assert(&cuda::std::get<1>(t) == &global::inc2()); + Tup t(inc1, inc2); + assert(&cuda::std::get<0>(t) == &inc1); + assert(&cuda::std::get<1>(t) == &inc2); } return 0; } struct IncompleteType {}; - -#define STATIC_EXTERN_IMPL(name, type) \ - __device__ type& name##_device() { \ - __shared__ type v; \ - return v; \ - } \ - __host__ type& name##_host() { \ - static type v; \ - return v; \ - } \ - type& name() { \ - NV_DISPATCH_TARGET( \ - NV_IS_DEVICE, ( \ - return name##_device(); \ - ), \ - NV_IS_HOST, ( \ - return name##_host(); \ - ) \ - ) \ - } - -STATIC_EXTERN_IMPL(global::inc1, IncompleteType) -STATIC_EXTERN_IMPL(global::inc2, IncompleteType) - -__host__ __device__ const IncompleteType& global::cinc1() { - return inc1(); -} - -__host__ __device__ const IncompleteType& global::cinc2() { - return inc2(); -} +#ifdef __CUDA_ARCH__ +__device__ IncompleteType inc1; +__device__ IncompleteType inc2; +#else +IncompleteType inc1; +IncompleteType inc2; +#endif diff --git a/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR31384.pass.cpp b/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR31384.pass.cpp index b593d93122..3817209271 100644 --- a/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR31384.pass.cpp +++ b/.upstream-tests/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR31384.pass.cpp @@ -23,9 +23,11 @@ #include "test_macros.h" -struct global { - STATIC_MEMBER_VAR(count, int) -}; +#ifdef __CUDA_ARCH__ +__device__ int count = 0; +#else +int count = 0; +#endif struct Explicit { Explicit() = default; @@ -41,7 +43,7 @@ template struct Derived : cuda::std::tuple { using cuda::std::tuple::tuple; template - __host__ __device__ operator cuda::std::tuple() && { ++global::count(); return {}; } + __host__ __device__ operator cuda::std::tuple() && { ++count; return {}; } }; @@ -49,31 +51,31 @@ template struct ExplicitDerived : cuda::std::tuple { using cuda::std::tuple::tuple; template - __host__ __device__ explicit operator cuda::std::tuple() && { ++global::count(); return {}; } + __host__ __device__ explicit operator cuda::std::tuple() && { ++count; return {}; } }; int main(int, char**) { { cuda::std::tuple foo = Derived{42}; ((void)foo); - assert(global::count() == 1); + assert(count == 1); cuda::std::tuple bar(Derived{42}); ((void)bar); - assert(global::count() == 2); + assert(count == 2); } - global::count() = 0; + count = 0; { cuda::std::tuple foo = Derived{42}; ((void)foo); - assert(global::count() == 1); + assert(count == 1); cuda::std::tuple bar(Derived{42}); ((void)bar); - assert(global::count() == 2); + assert(count == 2); } - global::count() = 0; + count = 0; { static_assert(!cuda::std::is_convertible< ExplicitDerived, cuda::std::tuple>::value, ""); cuda::std::tuple bar(ExplicitDerived{42}); ((void)bar); - assert(global::count() == 1); + assert(count == 1); } - global::count() = 0; + count = 0; { // FIXME: Libc++ incorrectly rejects this code. 
#ifndef _LIBCUDACXX_VERSION @@ -86,11 +88,11 @@ int main(int, char**) { ExplicitDerived, cuda::std::tuple>::value, "libc++ incorrectly rejects this"); #endif - assert(global::count() == 0); + assert(count == 0); cuda::std::tuple bar(ExplicitDerived{42}); ((void)bar); - assert(global::count() == 1); + assert(count == 1); } - global::count() = 0; + count = 0; return 0; diff --git a/.upstream-tests/test/std/utilities/utility/pairs/pairs.pair/assign_pair.pass.cpp b/.upstream-tests/test/std/utilities/utility/pairs/pairs.pair/assign_pair.pass.cpp index 217c8a04c7..e77004b15f 100644 --- a/.upstream-tests/test/std/utilities/utility/pairs/pairs.pair/assign_pair.pass.cpp +++ b/.upstream-tests/test/std/utilities/utility/pairs/pairs.pair/assign_pair.pass.cpp @@ -45,7 +45,6 @@ struct MoveAssignable { struct CountAssign { STATIC_MEMBER_VAR(copied, int); STATIC_MEMBER_VAR(moved, int); - __host__ __device__ static void reset() { copied() = moved() = 0; } CountAssign() = default; __host__ __device__ CountAssign& operator=(CountAssign const&) { ++copied(); return *this; } @@ -53,15 +52,11 @@ struct CountAssign { }; struct Incomplete; - -#define STATIC_EXTERN_DECL(name, type) \ - __device__ static type& name##_device(); \ - __host__ static type& name##_host(); \ - __host__ __device__ static type& name(); - -struct global { - STATIC_EXTERN_DECL(inc_obj, Incomplete) -}; +#ifdef __CUDA_ARCH__ +__device__ extern Incomplete inc_obj; +#else +extern Incomplete inc_obj; +#endif int main(int, char**) { @@ -105,34 +100,17 @@ int main(int, char**) { using P = cuda::std::pair; static_assert(!cuda::std::is_copy_assignable
::value, ""); - P p(42, global::inc_obj()); + P p(42, inc_obj); unused(p); - assert(&p.second == &global::inc_obj()); + assert(&p.second == &inc_obj); } return 0; } struct Incomplete {}; - -#define STATIC_EXTERN_IMPL(name, type) \ - __device__ type& name##_device() { \ - __shared__ type v; \ - return v; \ - } \ - __host__ type& name##_host() { \ - static type v; \ - return v; \ - } \ - type& name() { \ - NV_DISPATCH_TARGET( \ - NV_IS_DEVICE, ( \ - return name##_device(); \ - ), \ - NV_IS_HOST, ( \ - return name##_host(); \ - ) \ - ) \ - } - -STATIC_EXTERN_IMPL(global::inc_obj, Incomplete) +#ifdef __CUDA_ARCH__ +__device__ Incomplete inc_obj; +#else +Incomplete inc_obj; +#endif diff --git a/.upstream-tests/test/support/concurrent_agents.h b/.upstream-tests/test/support/concurrent_agents.h index ab2a9fef76..6836c17884 100644 --- a/.upstream-tests/test/support/concurrent_agents.h +++ b/.upstream-tests/test/support/concurrent_agents.h @@ -11,15 +11,16 @@ #include #endif -#if defined(__CUDA_MINIMUM_ARCH__) && __CUDA_MINIMUM_ARCH__ < 350 - #error "This test requires CUDA dynamic parallelism to work." -#endif - template __host__ __device__ void concurrent_agents_launch(Fs ...fs) { #ifdef __CUDA_ARCH__ + + #if __CUDA_ARCH__ < 350 + #error "This test requires CUDA dynamic parallelism to work." + #endif + assert(blockDim.x == sizeof...(Fs)); using fptr = void (*)(void *); diff --git a/.upstream-tests/test/support/cuda_space_selector.h b/.upstream-tests/test/support/cuda_space_selector.h index 83ddaf1b3a..e3b68c7ea2 100644 --- a/.upstream-tests/test/support/cuda_space_selector.h +++ b/.upstream-tests/test/support/cuda_space_selector.h @@ -27,9 +27,6 @@ #define SHARED #endif -#pragma diag_suppress 941 -#pragma diag_suppress 1057 - template struct malloc_memory_provider { static const constexpr cuda::std::size_t prefix_size @@ -39,20 +36,6 @@ struct malloc_memory_provider { static const constexpr cuda::std::size_t shared_offset = prefix_size + sizeof(T *); private: - - __device__ char* device_static_storage() { - __shared__ alignas(T*) char storage[shared_offset]; - return storage; - } - - -#if !defined(__CUDACC_RTC__) - __host__ char* host_static_storage() { - alignas(T*) static char storage[shared_offset]; - return storage; - } -#endif - __host__ __device__ T *& get_pointer() { alignas(T*) @@ -182,30 +165,19 @@ class memory_selector return ptr; } - __device__ void destruct_device() { - if (threadIdx.x == 0) { - ptr->~T(); - } - __syncthreads(); - } - - __host__ void destruct_host() { - ptr->~T(); - } - #ifndef __CUDACC_RTC__ __exec_check_disable__ #endif __host__ __device__ ~memory_selector() { - NV_DISPATCH_TARGET( - NV_IS_DEVICE, ( - destruct_device(); - ), - NV_IS_HOST, ( - destruct_host(); - ) - ) +#ifdef __CUDA_ARCH__ + if (threadIdx.x == 0) { +#endif + ptr->~T(); +#ifdef __CUDA_ARCH__ + } + __syncthreads(); +#endif } }; diff --git a/.upstream-tests/test/support/test_macros.h b/.upstream-tests/test/support/test_macros.h index 7783c96f90..ed7306f2c5 100644 --- a/.upstream-tests/test/support/test_macros.h +++ b/.upstream-tests/test/support/test_macros.h @@ -346,47 +346,18 @@ inline void DoNotOptimize(Tp const& value) { #define TEST_NOINLINE #endif -template -__device__ _Tp& maybe_shared_mem_device() { - __shared__ _Tp v; - return v; -} - -template -__host__ _Tp& maybe_shared_mem_host() { - static _Tp v; - return v; -} +// NVCC can't handle static member variables, so with a little care +// a function returning a reference will result in the same thing +#ifdef __CUDA_ARCH__ +# define 
_STATIC_MEMBER_IMPL(type) __shared__ type v; +#else +# define _STATIC_MEMBER_IMPL(type) static type v; +#endif -template -__host__ __device__ _Tp& maybe_shared_mem() { - NV_DISPATCH_TARGET( - NV_IS_DEVICE, ( - return maybe_shared_mem_device<_Tp>(); - ), - NV_IS_HOST, ( - return maybe_shared_mem_host<_Tp>(); - ) - ) -} #define STATIC_MEMBER_VAR(name, type) \ - __device__ static type& name##_device() { \ - __shared__ type v; \ - return v; \ - } \ - __host__ static type& name##_host() { \ - static type v; \ - return v; \ - } \ __host__ __device__ static type& name() { \ - NV_DISPATCH_TARGET( \ - NV_IS_DEVICE, ( \ - return name##_device(); \ - ), \ - NV_IS_HOST, ( \ - return name##_host(); \ - ) \ - ) \ + _STATIC_MEMBER_IMPL(type); \ + return v; \ } #if defined(__GNUC__) From dfbd5decad3eae39c43adbf357b22c27f23d56bb Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Wed, 14 Jul 2021 15:00:05 -0700 Subject: [PATCH 17/34] Rebuild atomic_cuda_generated --- codegen/codegen.cpp | 6 -- .../support/atomic/atomic_cuda_generated.h | 61 ++++++++++--------- 2 files changed, 31 insertions(+), 36 deletions(-) diff --git a/codegen/codegen.cpp b/codegen/codegen.cpp index d724a2dd12..d01ba4a3eb 100644 --- a/codegen/codegen.cpp +++ b/codegen/codegen.cpp @@ -88,10 +88,6 @@ int main() { return "__cuda_fence_" + sem + "_" + scope; }; - out << "_LIBCUDACXX_BEGIN_NAMESPACE_CUDA\n"; - out << "namespace detail {\n"; - out << "\n"; - for(auto& s : scopes) { out << "static inline __device__ void __cuda_membar_" << s.first << "() { asm volatile(\"membar" << membar_scopes[s.first] << ";\":::\"memory\"); }\n"; for(auto& sem : fence_semantics) @@ -316,8 +312,6 @@ int main() { } out << "\n"; - out << "}\n"; - out << "_LIBCUDACXX_END_NAMESPACE_CUDA\n"; return 0; } diff --git a/libcxx/include/support/atomic/atomic_cuda_generated.h b/libcxx/include/support/atomic/atomic_cuda_generated.h index d8b421c5ac..d21994d2bc 100644 --- a/libcxx/include/support/atomic/atomic_cuda_generated.h +++ b/libcxx/include/support/atomic/atomic_cuda_generated.h @@ -7,6 +7,7 @@ // //===----------------------------------------------------------------------===// + static inline __device__ void __cuda_membar_block() { asm volatile("membar.cta;":::"memory"); } static inline __device__ void __cuda_fence_acq_rel_block() { asm volatile("fence.acq_rel.cta;":::"memory"); } static inline __device__ void __cuda_fence_sc_block() { asm volatile("fence.sc.cta;":::"memory"); } @@ -113,11 +114,11 @@ __device__ void __atomic_store_cuda(volatile _Type *__ptr, _Type *__val, int __m default: assert(0); } } -template static inline __device__ void __cuda_compare_exchange_acq_rel_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C _cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.cta.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(_cmp),"r"(__op) : "memory"); } -template static inline __device__ void __cuda_compare_exchange_acquire_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C _cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.cta.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(_cmp),"r"(__op) : "memory"); } -template static inline __device__ void __cuda_compare_exchange_relaxed_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C _cmp, _CUDA_D __op) { asm volatile("atom.cas.relaxed.cta.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(_cmp),"r"(__op) : "memory"); } -template static inline __device__ void __cuda_compare_exchange_release_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C _cmp, _CUDA_D __op) { asm volatile("atom.cas.release.cta.b32 %0,[%1],%2,%3;" : 
"=r"(__dst) : "l"(__ptr),"r"(_cmp),"r"(__op) : "memory"); } -template static inline __device__ void __cuda_compare_exchange_volatile_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C _cmp, _CUDA_D __op) { asm volatile("atom.cas.cta.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(_cmp),"r"(__op) : "memory"); } +template static inline __device__ void __cuda_compare_exchange_acq_rel_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.cta.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } +template static inline __device__ void __cuda_compare_exchange_acquire_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.cta.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } +template static inline __device__ void __cuda_compare_exchange_relaxed_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.relaxed.cta.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } +template static inline __device__ void __cuda_compare_exchange_release_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.cta.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } +template static inline __device__ void __cuda_compare_exchange_volatile_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.cta.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } template::type = 0> __device__ bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { uint32_t __tmp = 0, __old = 0, __old_tmp; @@ -397,11 +398,11 @@ __device__ _Type __atomic_fetch_xor_cuda(volatile _Type *__ptr, _Type __val, int memcpy(&__ret, &__tmp, 4); return __ret; } -template static inline __device__ void __cuda_compare_exchange_acq_rel_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C _cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.cta.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(_cmp),"l"(__op) : "memory"); } -template static inline __device__ void __cuda_compare_exchange_acquire_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C _cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.cta.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(_cmp),"l"(__op) : "memory"); } -template static inline __device__ void __cuda_compare_exchange_relaxed_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C _cmp, _CUDA_D __op) { asm volatile("atom.cas.relaxed.cta.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(_cmp),"l"(__op) : "memory"); } -template static inline __device__ void __cuda_compare_exchange_release_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C _cmp, _CUDA_D __op) { asm volatile("atom.cas.release.cta.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(_cmp),"l"(__op) : "memory"); } -template static inline __device__ void __cuda_compare_exchange_volatile_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C _cmp, _CUDA_D __op) { asm volatile("atom.cas.cta.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(_cmp),"l"(__op) : "memory"); } +template static inline __device__ void __cuda_compare_exchange_acq_rel_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.cta.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : 
"memory"); } +template static inline __device__ void __cuda_compare_exchange_acquire_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.cta.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } +template static inline __device__ void __cuda_compare_exchange_relaxed_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.relaxed.cta.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } +template static inline __device__ void __cuda_compare_exchange_release_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.cta.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } +template static inline __device__ void __cuda_compare_exchange_volatile_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.cta.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } template::type = 0> __device__ bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { uint64_t __tmp = 0, __old = 0, __old_tmp; @@ -842,11 +843,11 @@ __device__ void __atomic_store_cuda(volatile _Type *__ptr, _Type *__val, int __m default: assert(0); } } -template static inline __device__ void __cuda_compare_exchange_acq_rel_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C _cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(_cmp),"r"(__op) : "memory"); } -template static inline __device__ void __cuda_compare_exchange_acquire_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C _cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(_cmp),"r"(__op) : "memory"); } -template static inline __device__ void __cuda_compare_exchange_relaxed_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C _cmp, _CUDA_D __op) { asm volatile("atom.cas.relaxed.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(_cmp),"r"(__op) : "memory"); } -template static inline __device__ void __cuda_compare_exchange_release_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C _cmp, _CUDA_D __op) { asm volatile("atom.cas.release.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(_cmp),"r"(__op) : "memory"); } -template static inline __device__ void __cuda_compare_exchange_volatile_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C _cmp, _CUDA_D __op) { asm volatile("atom.cas.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(_cmp),"r"(__op) : "memory"); } +template static inline __device__ void __cuda_compare_exchange_acq_rel_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } +template static inline __device__ void __cuda_compare_exchange_acquire_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } +template static inline __device__ void __cuda_compare_exchange_relaxed_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.relaxed.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } +template static inline __device__ void 
__cuda_compare_exchange_release_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } +template static inline __device__ void __cuda_compare_exchange_volatile_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } template::type = 0> __device__ bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { uint32_t __tmp = 0, __old = 0, __old_tmp; @@ -1126,11 +1127,11 @@ __device__ _Type __atomic_fetch_xor_cuda(volatile _Type *__ptr, _Type __val, int memcpy(&__ret, &__tmp, 4); return __ret; } -template static inline __device__ void __cuda_compare_exchange_acq_rel_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C _cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.gpu.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(_cmp),"l"(__op) : "memory"); } -template static inline __device__ void __cuda_compare_exchange_acquire_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C _cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.gpu.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(_cmp),"l"(__op) : "memory"); } -template static inline __device__ void __cuda_compare_exchange_relaxed_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C _cmp, _CUDA_D __op) { asm volatile("atom.cas.relaxed.gpu.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(_cmp),"l"(__op) : "memory"); } -template static inline __device__ void __cuda_compare_exchange_release_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C _cmp, _CUDA_D __op) { asm volatile("atom.cas.release.gpu.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(_cmp),"l"(__op) : "memory"); } -template static inline __device__ void __cuda_compare_exchange_volatile_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C _cmp, _CUDA_D __op) { asm volatile("atom.cas.gpu.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(_cmp),"l"(__op) : "memory"); } +template static inline __device__ void __cuda_compare_exchange_acq_rel_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.gpu.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } +template static inline __device__ void __cuda_compare_exchange_acquire_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.gpu.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } +template static inline __device__ void __cuda_compare_exchange_relaxed_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.relaxed.gpu.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } +template static inline __device__ void __cuda_compare_exchange_release_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.gpu.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } +template static inline __device__ void __cuda_compare_exchange_volatile_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.gpu.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } template::type = 0> __device__ bool __atomic_compare_exchange_cuda(volatile 
_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { uint64_t __tmp = 0, __old = 0, __old_tmp; @@ -1571,11 +1572,11 @@ __device__ void __atomic_store_cuda(volatile _Type *__ptr, _Type *__val, int __m default: assert(0); } } -template static inline __device__ void __cuda_compare_exchange_acq_rel_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C _cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(_cmp),"r"(__op) : "memory"); } -template static inline __device__ void __cuda_compare_exchange_acquire_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C _cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(_cmp),"r"(__op) : "memory"); } -template static inline __device__ void __cuda_compare_exchange_relaxed_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C _cmp, _CUDA_D __op) { asm volatile("atom.cas.relaxed.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(_cmp),"r"(__op) : "memory"); } -template static inline __device__ void __cuda_compare_exchange_release_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C _cmp, _CUDA_D __op) { asm volatile("atom.cas.release.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(_cmp),"r"(__op) : "memory"); } -template static inline __device__ void __cuda_compare_exchange_volatile_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C _cmp, _CUDA_D __op) { asm volatile("atom.cas.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(_cmp),"r"(__op) : "memory"); } +template static inline __device__ void __cuda_compare_exchange_acq_rel_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } +template static inline __device__ void __cuda_compare_exchange_acquire_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } +template static inline __device__ void __cuda_compare_exchange_relaxed_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.relaxed.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } +template static inline __device__ void __cuda_compare_exchange_release_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } +template static inline __device__ void __cuda_compare_exchange_volatile_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } template::type = 0> __device__ bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { uint32_t __tmp = 0, __old = 0, __old_tmp; @@ -1855,11 +1856,11 @@ __device__ _Type __atomic_fetch_xor_cuda(volatile _Type *__ptr, _Type __val, int memcpy(&__ret, &__tmp, 4); return __ret; } -template static inline __device__ void __cuda_compare_exchange_acq_rel_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C _cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.sys.b64 %0,[%1],%2,%3;" : "=l"(__dst) : 
"l"(__ptr),"l"(_cmp),"l"(__op) : "memory"); } -template static inline __device__ void __cuda_compare_exchange_acquire_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C _cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.sys.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(_cmp),"l"(__op) : "memory"); } -template static inline __device__ void __cuda_compare_exchange_relaxed_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C _cmp, _CUDA_D __op) { asm volatile("atom.cas.relaxed.sys.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(_cmp),"l"(__op) : "memory"); } -template static inline __device__ void __cuda_compare_exchange_release_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C _cmp, _CUDA_D __op) { asm volatile("atom.cas.release.sys.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(_cmp),"l"(__op) : "memory"); } -template static inline __device__ void __cuda_compare_exchange_volatile_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C _cmp, _CUDA_D __op) { asm volatile("atom.cas.sys.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(_cmp),"l"(__op) : "memory"); } +template static inline __device__ void __cuda_compare_exchange_acq_rel_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.sys.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } +template static inline __device__ void __cuda_compare_exchange_acquire_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.sys.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } +template static inline __device__ void __cuda_compare_exchange_relaxed_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.relaxed.sys.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } +template static inline __device__ void __cuda_compare_exchange_release_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.sys.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } +template static inline __device__ void __cuda_compare_exchange_volatile_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.sys.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } template::type = 0> __device__ bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { uint64_t __tmp = 0, __old = 0, __old_tmp; From 4f0b2433029cb96a38f3fe2ddb1769cb1f4a0041 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Thu, 22 Jul 2021 15:57:25 -0700 Subject: [PATCH 18/34] Dedup MSVC by splitting the atomic base class into a seperate header and creating back-end intrinsics in the MSVC header --- .../atomic_is_lock_free.pass.cpp | 4 +- libcxx/include/__config | 1 + libcxx/include/support/atomic/atomic_base.h | 249 ++++++++++++++++++ libcxx/include/support/atomic/atomic_gcc.h | 236 +---------------- libcxx/include/support/atomic/atomic_msvc.h | 231 +--------------- 5 files changed, 257 insertions(+), 464 deletions(-) create mode 100644 libcxx/include/support/atomic/atomic_base.h diff --git a/.upstream-tests/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_is_lock_free.pass.cpp b/.upstream-tests/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_is_lock_free.pass.cpp 
index 6eb6c342e0..c81421a758 100644 --- a/.upstream-tests/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_is_lock_free.pass.cpp +++ b/.upstream-tests/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_is_lock_free.pass.cpp @@ -31,9 +31,9 @@ struct TestFn { __host__ __device__ void operator()() const { typedef cuda::std::atomic A; - A t; + A t{}; bool b1 = cuda::std::atomic_is_lock_free(static_cast(&t)); - volatile A vt; + volatile A vt{}; bool b2 = cuda::std::atomic_is_lock_free(static_cast(&vt)); assert(b1 == b2); } diff --git a/libcxx/include/__config b/libcxx/include/__config index 464523eaef..64c93f4bb2 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -1611,6 +1611,7 @@ _LIBCUDACXX_FUNC_VIS extern "C" void __sanitizer_annotate_contiguous_container( # define _LIBCUDACXX_HAS_GCC_ATOMIC_IMP #elif defined(_LIBCUDACXX_COMPILER_MSVC) # define _LIBCUDACXX_HAS_MSVC_ATOMIC_IMPL +# define _LIBCUDACXX_NO_RUNTIME_LOCK_FREE #endif // CUDA Atomics supersede host atomics in order to insert the host/device dispatch layer diff --git a/libcxx/include/support/atomic/atomic_base.h b/libcxx/include/support/atomic/atomic_base.h new file mode 100644 index 0000000000..29535cfcbc --- /dev/null +++ b/libcxx/include/support/atomic/atomic_base.h @@ -0,0 +1,249 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX_ATOMIC_BASE_H +#define _LIBCUDACXX_ATOMIC_BASE_H + +template +struct __cxx_atomic_base_impl { + using __cxx_underlying_type = _Tp; + + _LIBCUDACXX_CONSTEXPR + __cxx_atomic_base_impl() _NOEXCEPT = default; + + _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR explicit + __cxx_atomic_base_impl(_Tp value) _NOEXCEPT : __a_value(value) {} + + _ALIGNAS(sizeof(_Tp)) _Tp __a_value; +}; + +template +_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR +_Tp* __cxx_get_underlying_atomic(__cxx_atomic_base_impl<_Tp, _Sco> * __a) _NOEXCEPT { + return &__a->__a_value; +} + +template +_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR +volatile _Tp* __cxx_get_underlying_atomic(__cxx_atomic_base_impl<_Tp, _Sco> volatile* __a) _NOEXCEPT { + return &__a->__a_value; +} + +template +_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR +const _Tp* __cxx_get_underlying_atomic(__cxx_atomic_base_impl<_Tp, _Sco> const* __a) _NOEXCEPT { + return &__a->__a_value; +} + +template +_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR +const volatile _Tp* __cxx_get_underlying_atomic(__cxx_atomic_base_impl<_Tp, _Sco> const volatile* __a) _NOEXCEPT { + return &__a->__a_value; +} + +template +struct __cxx_atomic_ref_base_impl { + using __cxx_underlying_type = _Tp; + + _LIBCUDACXX_CONSTEXPR + __cxx_atomic_ref_base_impl() _NOEXCEPT = default; + + _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR explicit + __cxx_atomic_ref_base_impl(_Tp value) _NOEXCEPT : __a_value(value) {} + + _Tp* __a_value; +}; + +template +_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR +_Tp* __cxx_get_underlying_atomic(__cxx_atomic_ref_base_impl<_Tp, _Sco>* __a) _NOEXCEPT { + return __a->__a_value; +} + +template +_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR +volatile _Tp* 
__cxx_get_underlying_atomic(__cxx_atomic_ref_base_impl<_Tp, _Sco> volatile* __a) _NOEXCEPT { + return __a->__a_value; +} + +template +_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR +const _Tp* __cxx_get_underlying_atomic(__cxx_atomic_ref_base_impl<_Tp, _Sco> const* __a) _NOEXCEPT { + return __a->__a_value; +} + +template +_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR +const volatile _Tp* __cxx_get_underlying_atomic(__cxx_atomic_ref_base_impl<_Tp, _Sco> const volatile* __a) _NOEXCEPT { + return __a->__a_value; +} + +template +_LIBCUDACXX_INLINE_VISIBILITY auto __cxx_atomic_base_unwrap(_Tp* __a) _NOEXCEPT -> decltype(__cxx_get_underlying_atomic(__a)) { + return __cxx_get_underlying_atomic(__a); +} + +template +using __cxx_atomic_underlying_t = typename _Tp::__cxx_underlying_type; + +_LIBCUDACXX_INLINE_VISIBILITY inline _LIBCUDACXX_CONSTEXPR int __to_gcc_order(memory_order __order) { + // Avoid switch statement to make this a constexpr. + return __order == memory_order_relaxed ? __ATOMIC_RELAXED: + (__order == memory_order_acquire ? __ATOMIC_ACQUIRE: + (__order == memory_order_release ? __ATOMIC_RELEASE: + (__order == memory_order_seq_cst ? __ATOMIC_SEQ_CST: + (__order == memory_order_acq_rel ? __ATOMIC_ACQ_REL: + __ATOMIC_CONSUME)))); +} + +_LIBCUDACXX_INLINE_VISIBILITY inline _LIBCUDACXX_CONSTEXPR int __to_gcc_failure_order(memory_order __order) { + // Avoid switch statement to make this a constexpr. + return __order == memory_order_relaxed ? __ATOMIC_RELAXED: + (__order == memory_order_acquire ? __ATOMIC_ACQUIRE: + (__order == memory_order_release ? __ATOMIC_RELAXED: + (__order == memory_order_seq_cst ? __ATOMIC_SEQ_CST: + (__order == memory_order_acq_rel ? __ATOMIC_ACQUIRE: + __ATOMIC_CONSUME)))); +} + +template +inline void __cxx_atomic_init(volatile _Tp* __a, _Up __val) { + auto __a_tmp = __cxx_atomic_base_unwrap(__a); + __cxx_atomic_assign_volatile(*__a_tmp, __val); +} + +template +inline void __cxx_atomic_init(_Tp* __a, _Up __val) { + auto __a_tmp = __cxx_atomic_base_unwrap(__a); + *__a_tmp = __val; +} + +inline +void __cxx_atomic_thread_fence(memory_order __order) { + __atomic_thread_fence(__to_gcc_order(__order)); +} + +inline +void __cxx_atomic_signal_fence(memory_order __order) { + __atomic_signal_fence(__to_gcc_order(__order)); +} + +template +inline void __cxx_atomic_store(_Tp* __a, _Up __val, + memory_order __order) { + auto __a_tmp = __cxx_atomic_base_unwrap(__a); + __atomic_store(__a_tmp, &__val, __to_gcc_order(__order)); +} + +template +inline auto __cxx_atomic_load(const _Tp* __a, + memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { + auto __a_tmp = __cxx_atomic_base_unwrap(__a); + __cxx_atomic_underlying_t<_Tp> __ret; + __atomic_load(__a_tmp, &__ret, __to_gcc_order(__order)); + return __ret; +} + +template +inline auto __cxx_atomic_exchange(_Tp* __a, _Up __value, + memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { + auto __a_tmp = __cxx_atomic_base_unwrap(__a); + __cxx_atomic_underlying_t<_Tp> __ret; + __atomic_exchange(__a_tmp, &__value, &__ret, __to_gcc_order(__order)); + return __ret; +} + +template +inline bool __cxx_atomic_compare_exchange_strong( + _Tp* __a, _Up* __expected, _Up __value, memory_order __success, + memory_order __failure) { + auto __a_tmp = __cxx_atomic_base_unwrap(__a); + return __atomic_compare_exchange(__a_tmp, __expected, &__value, + false, + __to_gcc_order(__success), + __to_gcc_failure_order(__failure)); +} + +template +inline bool __cxx_atomic_compare_exchange_weak( + _Tp* __a, _Up* __expected, _Up __value, memory_order 
__success, + memory_order __failure) { + auto __a_tmp = __cxx_atomic_base_unwrap(__a); + return __atomic_compare_exchange(__a_tmp, __expected, &__value, + true, + __to_gcc_order(__success), + __to_gcc_failure_order(__failure)); +} + +template +struct __skip_amt { enum {value = 1}; }; + +template +struct __skip_amt<_Tp*> { enum {value = sizeof(_Tp)}; }; + +// FIXME: Haven't figured out what the spec says about using arrays with +// atomic_fetch_add. Force a failure rather than creating bad behavior. +template +struct __skip_amt<_Tp[]> { }; +template +struct __skip_amt<_Tp[n]> { }; + +template +inline auto __cxx_atomic_fetch_add(_Tp* __a, _Td __delta, + memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { + constexpr auto __skip_v = __skip_amt<__cxx_atomic_underlying_t<_Tp>>::value; + auto __a_tmp = __cxx_atomic_base_unwrap(__a); + return __atomic_fetch_add(__a_tmp, __delta * __skip_v, + __to_gcc_order(__order)); +} + +template +inline auto __cxx_atomic_fetch_sub(_Tp* __a, _Td __delta, + memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { + constexpr auto __skip_v = __skip_amt<__cxx_atomic_underlying_t<_Tp>>::value; + auto __a_tmp = __cxx_atomic_base_unwrap(__a); + return __atomic_fetch_sub(__a_tmp, __delta * __skip_v, + __to_gcc_order(__order)); +} + +template +inline auto __cxx_atomic_fetch_and(_Tp* __a, _Td __pattern, + memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { + auto __a_tmp = __cxx_atomic_base_unwrap(__a); + return __atomic_fetch_and(__a_tmp, __pattern, + __to_gcc_order(__order)); +} + +template +inline auto __cxx_atomic_fetch_or(_Tp* __a, _Td __pattern, + memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { + auto __a_tmp = __cxx_atomic_base_unwrap(__a); + return __atomic_fetch_or(__a_tmp, __pattern, + __to_gcc_order(__order)); +} + +template +inline auto __cxx_atomic_fetch_xor(_Tp* __a, _Td __pattern, + memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { + auto __a_tmp = __cxx_atomic_base_unwrap(__a); + return __atomic_fetch_xor(__a_tmp, __pattern, + __to_gcc_order(__order)); +} + +inline constexpr + bool __cxx_atomic_is_lock_free(size_t __x) { + #if defined(_LIBCUDACXX_NO_RUNTIME_LOCK_FREE) + return __x <= 8; + #else + return __atomic_is_lock_free(__x, 0); + #endif +} + +#endif // _LIBCUDACXX_ATOMIC_BASE_H diff --git a/libcxx/include/support/atomic/atomic_gcc.h b/libcxx/include/support/atomic/atomic_gcc.h index 32f62e518b..7f76e7e826 100644 --- a/libcxx/include/support/atomic/atomic_gcc.h +++ b/libcxx/include/support/atomic/atomic_gcc.h @@ -8,237 +8,9 @@ // //===----------------------------------------------------------------------===// -template -struct __cxx_atomic_base_impl { - using __cxx_underlying_type = _Tp; +#ifndef _LIBCUDACXX_ATOMIC_GCC_H +#define _LIBCUDACXX_ATOMIC_GCC_H - _LIBCUDACXX_CONSTEXPR - __cxx_atomic_base_impl() _NOEXCEPT = default; +#include "atomic_base.h" - _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR explicit - __cxx_atomic_base_impl(_Tp value) _NOEXCEPT : __a_value(value) {} - - _ALIGNAS(sizeof(_Tp)) _Tp __a_value; -}; - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR -_Tp* __cxx_get_underlying_atomic(__cxx_atomic_base_impl<_Tp, _Sco> * __a) _NOEXCEPT { - return &__a->__a_value; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR -volatile _Tp* __cxx_get_underlying_atomic(__cxx_atomic_base_impl<_Tp, _Sco> volatile* __a) _NOEXCEPT { - return &__a->__a_value; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR -const _Tp* 
__cxx_get_underlying_atomic(__cxx_atomic_base_impl<_Tp, _Sco> const* __a) _NOEXCEPT { - return &__a->__a_value; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR -const volatile _Tp* __cxx_get_underlying_atomic(__cxx_atomic_base_impl<_Tp, _Sco> const volatile* __a) _NOEXCEPT { - return &__a->__a_value; -} - -template -struct __cxx_atomic_ref_base_impl { - using __cxx_underlying_type = _Tp; - - _LIBCUDACXX_CONSTEXPR - __cxx_atomic_ref_base_impl() _NOEXCEPT = default; - - _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR explicit - __cxx_atomic_ref_base_impl(_Tp value) _NOEXCEPT : __a_value(value) {} - - _Tp* __a_value; -}; - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR -_Tp* __cxx_get_underlying_atomic(__cxx_atomic_ref_base_impl<_Tp, _Sco>* __a) _NOEXCEPT { - return __a->__a_value; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR -volatile _Tp* __cxx_get_underlying_atomic(__cxx_atomic_ref_base_impl<_Tp, _Sco> volatile* __a) _NOEXCEPT { - return __a->__a_value; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR -const _Tp* __cxx_get_underlying_atomic(__cxx_atomic_ref_base_impl<_Tp, _Sco> const* __a) _NOEXCEPT { - return __a->__a_value; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR -const volatile _Tp* __cxx_get_underlying_atomic(__cxx_atomic_ref_base_impl<_Tp, _Sco> const volatile* __a) _NOEXCEPT { - return __a->__a_value; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY auto __cxx_atomic_base_unwrap(_Tp* __a) _NOEXCEPT -> decltype(__cxx_get_underlying_atomic(__a)) { - return __cxx_get_underlying_atomic(__a); -} - -template -using __cxx_atomic_underlying_t = typename _Tp::__cxx_underlying_type; - -_LIBCUDACXX_INLINE_VISIBILITY inline _LIBCUDACXX_CONSTEXPR int __to_gcc_order(memory_order __order) { - // Avoid switch statement to make this a constexpr. - return __order == memory_order_relaxed ? __ATOMIC_RELAXED: - (__order == memory_order_acquire ? __ATOMIC_ACQUIRE: - (__order == memory_order_release ? __ATOMIC_RELEASE: - (__order == memory_order_seq_cst ? __ATOMIC_SEQ_CST: - (__order == memory_order_acq_rel ? __ATOMIC_ACQ_REL: - __ATOMIC_CONSUME)))); -} - -_LIBCUDACXX_INLINE_VISIBILITY inline _LIBCUDACXX_CONSTEXPR int __to_gcc_failure_order(memory_order __order) { - // Avoid switch statement to make this a constexpr. - return __order == memory_order_relaxed ? __ATOMIC_RELAXED: - (__order == memory_order_acquire ? __ATOMIC_ACQUIRE: - (__order == memory_order_release ? __ATOMIC_RELAXED: - (__order == memory_order_seq_cst ? __ATOMIC_SEQ_CST: - (__order == memory_order_acq_rel ? 
__ATOMIC_ACQUIRE: - __ATOMIC_CONSUME)))); -} - -template -inline void __cxx_atomic_init(volatile _Tp* __a, _Up __val) { - auto __a_tmp = __cxx_atomic_base_unwrap(__a); - __cxx_atomic_assign_volatile(*__a_tmp, __val); -} - -template -inline void __cxx_atomic_init(_Tp* __a, _Up __val) { - auto __a_tmp = __cxx_atomic_base_unwrap(__a); - *__a_tmp = __val; -} - -inline -void __cxx_atomic_thread_fence(memory_order __order) { - __atomic_thread_fence(__to_gcc_order(__order)); -} - -inline -void __cxx_atomic_signal_fence(memory_order __order) { - __atomic_signal_fence(__to_gcc_order(__order)); -} - -template -inline void __cxx_atomic_store(_Tp* __a, _Up __val, - memory_order __order) { - auto __a_tmp = __cxx_atomic_base_unwrap(__a); - __atomic_store(__a_tmp, &__val, __to_gcc_order(__order)); -} - -template -inline auto __cxx_atomic_load(const _Tp* __a, - memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { - auto __a_tmp = __cxx_atomic_base_unwrap(__a); - __cxx_atomic_underlying_t<_Tp> __ret; - __atomic_load(__a_tmp, &__ret, __to_gcc_order(__order)); - return __ret; -} - -template -inline auto __cxx_atomic_exchange(_Tp* __a, _Up __value, - memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { - auto __a_tmp = __cxx_atomic_base_unwrap(__a); - __cxx_atomic_underlying_t<_Tp> __ret; - __atomic_exchange(__a_tmp, &__value, &__ret, __to_gcc_order(__order)); - return __ret; -} - -template -inline bool __cxx_atomic_compare_exchange_strong( - _Tp* __a, _Up* __expected, _Up __value, memory_order __success, - memory_order __failure) { - auto __a_tmp = __cxx_atomic_base_unwrap(__a); - return __atomic_compare_exchange(__a_tmp, __expected, &__value, - false, - __to_gcc_order(__success), - __to_gcc_failure_order(__failure)); -} - -template -inline bool __cxx_atomic_compare_exchange_weak( - _Tp* __a, _Up* __expected, _Up __value, memory_order __success, - memory_order __failure) { - auto __a_tmp = __cxx_atomic_base_unwrap(__a); - return __atomic_compare_exchange(__a_tmp, __expected, &__value, - true, - __to_gcc_order(__success), - __to_gcc_failure_order(__failure)); -} - -template -struct __skip_amt { enum {value = 1}; }; - -template -struct __skip_amt<_Tp*> { enum {value = sizeof(_Tp)}; }; - -// FIXME: Haven't figured out what the spec says about using arrays with -// atomic_fetch_add. Force a failure rather than creating bad behavior. 
-template -struct __skip_amt<_Tp[]> { }; -template -struct __skip_amt<_Tp[n]> { }; - -template -inline auto __cxx_atomic_fetch_add(_Tp* __a, _Td __delta, - memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { - constexpr auto __skip_v = __skip_amt<__cxx_atomic_underlying_t<_Tp>>::value; - auto __a_tmp = __cxx_atomic_base_unwrap(__a); - return __atomic_fetch_add(__a_tmp, __delta * __skip_v, - __to_gcc_order(__order)); -} - -template -inline auto __cxx_atomic_fetch_sub(_Tp* __a, _Td __delta, - memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { - constexpr auto __skip_v = __skip_amt<__cxx_atomic_underlying_t<_Tp>>::value; - auto __a_tmp = __cxx_atomic_base_unwrap(__a); - return __atomic_fetch_sub(__a_tmp, __delta * __skip_v, - __to_gcc_order(__order)); -} - -template -inline auto __cxx_atomic_fetch_and(_Tp* __a, _Td __pattern, - memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { - auto __a_tmp = __cxx_atomic_base_unwrap(__a); - return __atomic_fetch_and(__a_tmp, __pattern, - __to_gcc_order(__order)); -} - -template -inline auto __cxx_atomic_fetch_or(_Tp* __a, _Td __pattern, - memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { - auto __a_tmp = __cxx_atomic_base_unwrap(__a); - return __atomic_fetch_or(__a_tmp, __pattern, - __to_gcc_order(__order)); -} - -template -inline auto __cxx_atomic_fetch_xor(_Tp* __a, _Td __pattern, - memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { - auto __a_tmp = __cxx_atomic_base_unwrap(__a); - return __atomic_fetch_xor(__a_tmp, __pattern, - __to_gcc_order(__order)); -} - -inline constexpr - bool __cxx_atomic_is_lock_free(size_t __x) { - #if defined(_LIBCUDACXX_NO_RUNTIME_LOCK_FREE) - return __x <= 8; - #else - return __atomic_is_lock_free(__x, 0); - #endif -} +#endif // _LIBCUDACXX_ATOMIC_GCC_H \ No newline at end of file diff --git a/libcxx/include/support/atomic/atomic_msvc.h b/libcxx/include/support/atomic/atomic_msvc.h index 73e2ce6def..8a8084449a 100644 --- a/libcxx/include/support/atomic/atomic_msvc.h +++ b/libcxx/include/support/atomic/atomic_msvc.h @@ -27,26 +27,6 @@ #error Unsupported hardware #endif // hardware -_LIBCUDACXX_INLINE_VISIBILITY inline _LIBCUDACXX_CONSTEXPR int __to_gcc_order(memory_order __order) { - // Avoid switch statement to make this a constexpr. - return __order == memory_order_relaxed ? __ATOMIC_RELAXED: - (__order == memory_order_acquire ? __ATOMIC_ACQUIRE: - (__order == memory_order_release ? __ATOMIC_RELEASE: - (__order == memory_order_seq_cst ? __ATOMIC_SEQ_CST: - (__order == memory_order_acq_rel ? __ATOMIC_ACQ_REL: - __ATOMIC_CONSUME)))); -} - -_LIBCUDACXX_INLINE_VISIBILITY inline _LIBCUDACXX_CONSTEXPR int __to_gcc_failure_order(memory_order __order) { - // Avoid switch statement to make this a constexpr. - return __order == memory_order_relaxed ? __ATOMIC_RELAXED: - (__order == memory_order_acquire ? __ATOMIC_ACQUIRE: - (__order == memory_order_release ? __ATOMIC_RELAXED: - (__order == memory_order_seq_cst ? __ATOMIC_SEQ_CST: - (__order == memory_order_acq_rel ? __ATOMIC_ACQUIRE: - __ATOMIC_CONSUME)))); -} - inline int __stronger_order_msvc(int __a, int __b) { int const __max = __a > __b ? 
__a : __b; if(__max != __ATOMIC_RELEASE) @@ -463,213 +443,4 @@ _Type __atomic_fetch_min(_Type volatile *__ptr, _Delta __val, int __memorder) { return __expected; } -template -struct __cxx_atomic_base_impl { - using __cxx_underlying_type = _Tp; - - _LIBCUDACXX_CONSTEXPR - __cxx_atomic_base_impl() _NOEXCEPT = default; - - _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR explicit - __cxx_atomic_base_impl(_Tp value) _NOEXCEPT : __a_value(value) {} - - _ALIGNAS(sizeof(_Tp)) _Tp __a_value; -}; - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR -_Tp* __cxx_get_underlying_atomic(__cxx_atomic_base_impl<_Tp, _Sco> * __a) _NOEXCEPT { - return &__a->__a_value; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR -volatile _Tp* __cxx_get_underlying_atomic(__cxx_atomic_base_impl<_Tp, _Sco> volatile* __a) _NOEXCEPT { - return &__a->__a_value; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR -const _Tp* __cxx_get_underlying_atomic(__cxx_atomic_base_impl<_Tp, _Sco> const* __a) _NOEXCEPT { - return &__a->__a_value; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR -const volatile _Tp* __cxx_get_underlying_atomic(__cxx_atomic_base_impl<_Tp, _Sco> const volatile* __a) _NOEXCEPT { - return &__a->__a_value; -} - -template -struct __cxx_atomic_ref_base_impl { - using __cxx_underlying_type = _Tp; - - _LIBCUDACXX_CONSTEXPR - __cxx_atomic_ref_base_impl() _NOEXCEPT = default; - - _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR explicit - __cxx_atomic_ref_base_impl(_Tp value) _NOEXCEPT : __a_value(value) {} - - _Tp* __a_value; -}; - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR -_Tp* __cxx_get_underlying_atomic(__cxx_atomic_ref_base_impl<_Tp, _Sco>* __a) _NOEXCEPT { - return __a->__a_value; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR -volatile _Tp* __cxx_get_underlying_atomic(__cxx_atomic_ref_base_impl<_Tp, _Sco> volatile* __a) _NOEXCEPT { - return __a->__a_value; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR -const _Tp* __cxx_get_underlying_atomic(__cxx_atomic_ref_base_impl<_Tp, _Sco> const* __a) _NOEXCEPT { - return __a->__a_value; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR -const volatile _Tp* __cxx_get_underlying_atomic(__cxx_atomic_ref_base_impl<_Tp, _Sco> const volatile* __a) _NOEXCEPT { - return __a->__a_value; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY auto __cxx_atomic_base_unwrap(_Tp* __a) _NOEXCEPT -> decltype(__cxx_get_underlying_atomic(__a)) { - return __cxx_get_underlying_atomic(__a); -} - -template -using __cxx_atomic_underlying_t = typename _Tp::__cxx_underlying_type; - -template -inline void __cxx_atomic_init(volatile _Tp* __a, _Up __val) { - auto __a_tmp = __cxx_atomic_base_unwrap(__a); - __cxx_atomic_assign_volatile(*__a_tmp, __val); -} - -template -inline void __cxx_atomic_init(_Tp* __a, _Up __val) { - auto __a_tmp = __cxx_atomic_base_unwrap(__a); - __a = __val; -} - -inline -void __cxx_atomic_thread_fence(memory_order __order) { - __atomic_thread_fence(__to_gcc_order(__order)); -} - -inline -void __cxx_atomic_signal_fence(memory_order __order) { - __atomic_signal_fence(__to_gcc_order(__order)); -} - -template -inline void __cxx_atomic_store(_Tp* __a, _Up __val, - memory_order __order) { - auto __a_tmp = __cxx_atomic_base_unwrap(__a); - __atomic_store(__a_tmp, &__val, __to_gcc_order(__order)); -} - -template -inline auto __cxx_atomic_load(const _Tp* __a, - memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { - auto __a_tmp = 
__cxx_atomic_base_unwrap(__a); - __cxx_atomic_underlying_t<_Tp> __ret; - __atomic_load(__a_tmp, &__ret, __to_gcc_order(__order)); - return __ret; -} - -template -inline auto __cxx_atomic_exchange(_Tp* __a, _Up __value, - memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { - auto __a_tmp = __cxx_atomic_base_unwrap(__a); - __cxx_atomic_underlying_t<_Tp> __ret; - __atomic_exchange(__a_tmp, &__value, &__ret, __to_gcc_order(__order)); - return __ret; -} - -template -inline bool __cxx_atomic_compare_exchange_strong( - _Tp* __a, _Up* __expected, _Up __value, memory_order __success, - memory_order __failure) { - auto __a_tmp = __cxx_atomic_base_unwrap(__a); - return __atomic_compare_exchange(__a_tmp, __expected, &__value, - false, - __to_gcc_order(__success), - __to_gcc_failure_order(__failure)); -} - -template -inline bool __cxx_atomic_compare_exchange_weak( - _Tp* __a, _Up* __expected, _Up __value, memory_order __success, - memory_order __failure) { - auto __a_tmp = __cxx_atomic_base_unwrap(__a); - return __atomic_compare_exchange(__a_tmp, __expected, &__value, - true, - __to_gcc_order(__success), - __to_gcc_failure_order(__failure)); -} - -template -struct __skip_amt { enum {value = 1}; }; - -template -struct __skip_amt<_Tp*> { enum {value = sizeof(_Tp)}; }; - -// FIXME: Haven't figured out what the spec says about using arrays with -// atomic_fetch_add. Force a failure rather than creating bad behavior. -template -struct __skip_amt<_Tp[]> { }; -template -struct __skip_amt<_Tp[n]> { }; - -template -inline auto __cxx_atomic_fetch_add(_Tp* __a, _Td __delta, - memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { - constexpr auto __skip_v = __skip_amt<__cxx_atomic_underlying_t<_Tp>>::value; - auto __a_tmp = __cxx_atomic_base_unwrap(__a); - return __atomic_fetch_add(__a_tmp, __delta * __skip_v, - __to_gcc_order(__order)); -} - -template -inline auto __cxx_atomic_fetch_sub(_Tp* __a, _Td __delta, - memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { - constexpr auto __skip_v = __skip_amt<__cxx_atomic_underlying_t<_Tp>>::value; - auto __a_tmp = __cxx_atomic_base_unwrap(__a); - return __atomic_fetch_sub(__a_tmp, __delta * __skip_v, - __to_gcc_order(__order)); -} - -template -inline auto __cxx_atomic_fetch_and(_Tp* __a, _Td __pattern, - memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { - auto __a_tmp = __cxx_atomic_base_unwrap(__a); - return __atomic_fetch_and(__a_tmp, __pattern, - __to_gcc_order(__order)); -} - -template -inline auto __cxx_atomic_fetch_or(_Tp* __a, _Td __pattern, - memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { - auto __a_tmp = __cxx_atomic_base_unwrap(__a); - return __atomic_fetch_or(__a_tmp, __pattern, - __to_gcc_order(__order)); -} - -template -inline auto __cxx_atomic_fetch_xor(_Tp* __a, _Td __pattern, - memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { - auto __a_tmp = __cxx_atomic_base_unwrap(__a); - return __atomic_fetch_xor(__a_tmp, __pattern, - __to_gcc_order(__order)); -} - -inline constexpr - bool __cxx_atomic_is_lock_free(size_t __x) { - return __x <= sizeof(uint64_t); -} +#include "atomic_base.h" From f38ee6334ad91ea71847d8d091d1990f652d84a5 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Fri, 23 Jul 2021 18:56:16 -0700 Subject: [PATCH 19/34] Missed grabbing important parts of the nvcxx-compatibility branch when cherry-picking --- codegen/codegen.cpp | 232 +- .../support/atomic/atomic_cuda_generated.h | 3022 ++++++++++------- 2 files changed, 1867 insertions(+), 1387 deletions(-) diff --git a/codegen/codegen.cpp 
b/codegen/codegen.cpp index d01ba4a3eb..bcddbb334b 100644 --- a/codegen/codegen.cpp +++ b/codegen/codegen.cpp @@ -93,23 +93,30 @@ int main() { for(auto& sem : fence_semantics) out << "static inline __device__ void " << fencename(sem.first, s.first) << "() { asm volatile(\"fence" << sem.second << s.second << ";\":::\"memory\"); }\n"; out << "static inline __device__ void __atomic_thread_fence_cuda(int __memorder, " << scopenametag(s.first) << ") {\n"; - out << " switch (__memorder) {\n"; - out << "#if __CUDA_ARCH__ >= 700\n"; - out << " case __ATOMIC_SEQ_CST: " << fencename("sc"s, s.first) << "(); break;\n"; - out << " case __ATOMIC_CONSUME:\n"; - out << " case __ATOMIC_ACQUIRE:\n"; - out << " case __ATOMIC_ACQ_REL:\n"; - out << " case __ATOMIC_RELEASE: " << fencename("acq_rel"s, s.first) << "(); break;\n"; - out << "#else\n"; - out << " case __ATOMIC_SEQ_CST:\n"; - out << " case __ATOMIC_CONSUME:\n"; - out << " case __ATOMIC_ACQUIRE:\n"; - out << " case __ATOMIC_ACQ_REL:\n"; - out << " case __ATOMIC_RELEASE: __cuda_membar_" << s.first << "(); break;\n"; - out << "#endif // __CUDA_ARCH__ >= 700\n"; - out << " case __ATOMIC_RELAXED: break;\n"; - out << " default: assert(0);\n"; - out << " }\n"; + out << " NV_DISPATCH_TARGET(\n"; + out << " NV_PROVIDES_SM_70, (\n"; + out << " switch (__memorder) {\n"; + out << " case __ATOMIC_SEQ_CST: " << fencename("sc"s, s.first) << "(); break;\n"; + out << " case __ATOMIC_CONSUME:\n"; + out << " case __ATOMIC_ACQUIRE:\n"; + out << " case __ATOMIC_ACQ_REL:\n"; + out << " case __ATOMIC_RELEASE: " << fencename("acq_rel"s, s.first) << "(); break;\n"; + out << " case __ATOMIC_RELAXED: break;\n"; + out << " default: assert(0);\n"; + out << " }\n"; + out << " ),\n"; + out << " NV_IS_DEVICE, (\n"; + out << " switch (__memorder) {\n"; + out << " case __ATOMIC_SEQ_CST:\n"; + out << " case __ATOMIC_CONSUME:\n"; + out << " case __ATOMIC_ACQUIRE:\n"; + out << " case __ATOMIC_ACQ_REL:\n"; + out << " case __ATOMIC_RELEASE: __cuda_membar_" << s.first << "(); break;\n"; + out << " case __ATOMIC_RELAXED: break;\n"; + out << " default: assert(0);\n"; + out << " }\n"; + out << " )\n"; + out << " )\n"; out << "}\n"; for(auto& sz : ld_sizes) { for(auto& sem : ld_semantics) { @@ -126,20 +133,26 @@ int main() { out << "template::type = 0>\n"; out << "__device__ void __atomic_load_cuda(const " << cv << "_Type *__ptr, _Type *__ret, int __memorder, " << scopenametag(s.first) << ") {\n"; out << " uint" << (registers[sz] == "r" ? 
32 : sz) << "_t __tmp = 0;\n"; - out << " switch (__memorder) {\n"; - out << "#if __CUDA_ARCH__ >= 700\n"; - out << " case __ATOMIC_SEQ_CST: " << fencename("sc"s, s.first) << "();\n"; - out << " case __ATOMIC_CONSUME:\n"; - out << " case __ATOMIC_ACQUIRE: __cuda_load_acquire_" << sz << "_" << s.first << "(__ptr, __tmp); break;\n"; - out << " case __ATOMIC_RELAXED: __cuda_load_relaxed_" << sz << "_" << s.first << "(__ptr, __tmp); break;\n"; - out << "#else\n"; - out << " case __ATOMIC_SEQ_CST: __cuda_membar_" << s.first << "();\n"; - out << " case __ATOMIC_CONSUME:\n"; - out << " case __ATOMIC_ACQUIRE: __cuda_load_volatile_" << sz << "_" << s.first << "(__ptr, __tmp); __cuda_membar_" << s.first << "(); break;\n"; - out << " case __ATOMIC_RELAXED: __cuda_load_volatile_" << sz << "_" << s.first << "(__ptr, __tmp); break;\n"; - out << "#endif // __CUDA_ARCH__ >= 700\n"; - out << " default: assert(0);\n"; - out << " }\n"; + out << " NV_DISPATCH_TARGET(\n"; + out << " NV_PROVIDES_SM_70, (\n"; + out << " switch (__memorder) {\n"; + out << " case __ATOMIC_SEQ_CST: " << fencename("sc"s, s.first) << "();\n"; + out << " case __ATOMIC_CONSUME:\n"; + out << " case __ATOMIC_ACQUIRE: __cuda_load_acquire_" << sz << "_" << s.first << "(__ptr, __tmp); break;\n"; + out << " case __ATOMIC_RELAXED: __cuda_load_relaxed_" << sz << "_" << s.first << "(__ptr, __tmp); break;\n"; + out << " default: assert(0);\n"; + out << " }\n"; + out << " ),\n"; + out << " NV_IS_DEVICE, (\n"; + out << " switch (__memorder) {\n"; + out << " case __ATOMIC_SEQ_CST: __cuda_membar_" << s.first << "();\n"; + out << " case __ATOMIC_CONSUME:\n"; + out << " case __ATOMIC_ACQUIRE: __cuda_load_volatile_" << sz << "_" << s.first << "(__ptr, __tmp); __cuda_membar_" << s.first << "(); break;\n"; + out << " case __ATOMIC_RELAXED: __cuda_load_volatile_" << sz << "_" << s.first << "(__ptr, __tmp); break;\n"; + out << " default: assert(0);\n"; + out << " }\n"; + out << " )\n"; + out << " )\n"; out << " memcpy(__ret, &__tmp, " << sz/8 << ");\n"; out << "}\n"; } @@ -157,18 +170,24 @@ int main() { out << "__device__ void __atomic_store_cuda(" << cv << "_Type *__ptr, _Type *__val, int __memorder, " << scopenametag(s.first) << ") {\n"; out << " uint" << (registers[sz] == "r" ? 
32 : sz) << "_t __tmp = 0;\n"; out << " memcpy(&__tmp, __val, " << sz/8 << ");\n"; - out << " switch (__memorder) {\n"; - out << "#if __CUDA_ARCH__ >= 700\n"; - out << " case __ATOMIC_RELEASE: __cuda_store_release_" << sz << "_" << s.first << "(__ptr, __tmp); break;\n"; - out << " case __ATOMIC_SEQ_CST: " << fencename("sc"s, s.first) << "();\n"; - out << " case __ATOMIC_RELAXED: __cuda_store_relaxed_" << sz << "_" << s.first << "(__ptr, __tmp); break;\n"; - out << "#else\n"; - out << " case __ATOMIC_RELEASE:\n"; - out << " case __ATOMIC_SEQ_CST: __cuda_membar_" << s.first << "();\n"; - out << " case __ATOMIC_RELAXED: __cuda_store_volatile_" << sz << "_" << s.first << "(__ptr, __tmp); break;\n"; - out << "#endif // __CUDA_ARCH__ >= 700\n"; - out << " default: assert(0);\n"; - out << " }\n"; + out << " NV_DISPATCH_TARGET(\n"; + out << " NV_PROVIDES_SM_70, (\n"; + out << " switch (__memorder) {\n"; + out << " case __ATOMIC_RELEASE: __cuda_store_release_" << sz << "_" << s.first << "(__ptr, __tmp); break;\n"; + out << " case __ATOMIC_SEQ_CST: " << fencename("sc"s, s.first) << "();\n"; + out << " case __ATOMIC_RELAXED: __cuda_store_relaxed_" << sz << "_" << s.first << "(__ptr, __tmp); break;\n"; + out << " default: assert(0);\n"; + out << " }\n"; + out << " ),\n"; + out << " NV_IS_DEVICE, (\n"; + out << " switch (__memorder) {\n"; + out << " case __ATOMIC_RELEASE:\n"; + out << " case __ATOMIC_SEQ_CST: __cuda_membar_" << s.first << "();\n"; + out << " case __ATOMIC_RELAXED: __cuda_store_volatile_" << sz << "_" << s.first << "(__ptr, __tmp); break;\n"; + out << " default: assert(0);\n"; + out << " }\n"; + out << " )\n"; + out << " )\n"; out << "}\n"; } } @@ -210,24 +229,30 @@ int main() { out << " memcpy(&__tmp, __desired, " << sz/8 << ");\n"; out << " memcpy(&__old, __expected, " << sz/8 << ");\n"; out << " __old_tmp = __old;\n"; - out << " switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) {\n"; - out << "#if __CUDA_ARCH__ >= 700\n"; - out << " case __ATOMIC_SEQ_CST: " << fencename("sc"s, s.first) << "();\n"; - out << " case __ATOMIC_CONSUME:\n"; - out << " case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_" << sz << "_" << s.first << "(__ptr, __old, __old_tmp, __tmp); break;\n"; - out << " case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_" << sz << "_" << s.first << "(__ptr, __old, __old_tmp, __tmp); break;\n"; - out << " case __ATOMIC_RELEASE: __cuda_compare_exchange_release_" << sz << "_" << s.first << "(__ptr, __old, __old_tmp, __tmp); break;\n"; - out << " case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_" << sz << "_" << s.first << "(__ptr, __old, __old_tmp, __tmp); break;\n"; - out << "#else\n"; - out << " case __ATOMIC_SEQ_CST:\n"; - out << " case __ATOMIC_ACQ_REL: __cuda_membar_" << s.first << "();\n"; - out << " case __ATOMIC_CONSUME:\n"; - out << " case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_" << sz << "_" << s.first << "(__ptr, __old, __old_tmp, __tmp); __cuda_membar_" << s.first << "(); break;\n"; - out << " case __ATOMIC_RELEASE: __cuda_membar_" << s.first << "(); __cuda_compare_exchange_volatile_" << sz << "_" << s.first << "(__ptr, __old, __old_tmp, __tmp); break;\n"; - out << " case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_" << sz << "_" << s.first << "(__ptr, __old, __old_tmp, __tmp); break;\n"; - out << "#endif // __CUDA_ARCH__ >= 700\n"; - out << " default: assert(0);\n"; - out << " }\n"; + out << " NV_DISPATCH_TARGET(\n"; + out << " NV_PROVIDES_SM_70, (\n"; + out << " switch 
(__stronger_order_cuda(__success_memorder, __failure_memorder)) {\n"; + out << " case __ATOMIC_SEQ_CST: " << fencename("sc"s, s.first) << "();\n"; + out << " case __ATOMIC_CONSUME:\n"; + out << " case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_" << sz << "_" << s.first << "(__ptr, __old, __old_tmp, __tmp); break;\n"; + out << " case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_" << sz << "_" << s.first << "(__ptr, __old, __old_tmp, __tmp); break;\n"; + out << " case __ATOMIC_RELEASE: __cuda_compare_exchange_release_" << sz << "_" << s.first << "(__ptr, __old, __old_tmp, __tmp); break;\n"; + out << " case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_" << sz << "_" << s.first << "(__ptr, __old, __old_tmp, __tmp); break;\n"; + out << " default: assert(0);\n"; + out << " }\n"; + out << " ),\n"; + out << " NV_IS_DEVICE, (\n"; + out << " switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) {\n"; + out << " case __ATOMIC_SEQ_CST:\n"; + out << " case __ATOMIC_ACQ_REL: __cuda_membar_" << s.first << "();\n"; + out << " case __ATOMIC_CONSUME:\n"; + out << " case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_" << sz << "_" << s.first << "(__ptr, __old, __old_tmp, __tmp); __cuda_membar_" << s.first << "(); break;\n"; + out << " case __ATOMIC_RELEASE: __cuda_membar_" << s.first << "(); __cuda_compare_exchange_volatile_" << sz << "_" << s.first << "(__ptr, __old, __old_tmp, __tmp); break;\n"; + out << " case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_" << sz << "_" << s.first << "(__ptr, __old, __old_tmp, __tmp); break;\n"; + out << " default: assert(0);\n"; + out << " }\n"; + out << " )\n"; + out << " )\n"; out << " bool const __ret = __old == __old_tmp;\n"; out << " memcpy(__expected, &__old, " << sz/8 << ");\n"; out << " return __ret;\n"; @@ -246,24 +271,30 @@ int main() { out << " uint" << sz << "_t __tmp = 0;\n"; out << " memcpy(&__tmp, &__val, " << sz/8 << ");\n"; } - out << " switch (__memorder) {\n"; - out << "#if __CUDA_ARCH__ >= 700\n"; - out << " case __ATOMIC_SEQ_CST: " << fencename("sc"s, s.first) << "();\n"; - out << " case __ATOMIC_CONSUME:\n"; - out << " case __ATOMIC_ACQUIRE: __cuda_" << rmw.first << "_acquire_" << sz << "_" << s.first << "(__ptr, __tmp, __tmp); break;\n"; - out << " case __ATOMIC_ACQ_REL: __cuda_" << rmw.first << "_acq_rel_" << sz << "_" << s.first << "(__ptr, __tmp, __tmp); break;\n"; - out << " case __ATOMIC_RELEASE: __cuda_" << rmw.first << "_release_" << sz << "_" << s.first << "(__ptr, __tmp, __tmp); break;\n"; - out << " case __ATOMIC_RELAXED: __cuda_" << rmw.first << "_relaxed_" << sz << "_" << s.first << "(__ptr, __tmp, __tmp); break;\n"; - out << "#else\n"; - out << " case __ATOMIC_SEQ_CST:\n"; - out << " case __ATOMIC_ACQ_REL: __cuda_membar_" << s.first << "();\n"; - out << " case __ATOMIC_CONSUME:\n"; - out << " case __ATOMIC_ACQUIRE: __cuda_" << rmw.first << "_volatile_" << sz << "_" << s.first << "(__ptr, __tmp, __tmp); __cuda_membar_" << s.first << "(); break;\n"; - out << " case __ATOMIC_RELEASE: __cuda_membar_" << s.first << "(); __cuda_" << rmw.first << "_volatile_" << sz << "_" << s.first << "(__ptr, __tmp, __tmp); break;\n"; - out << " case __ATOMIC_RELAXED: __cuda_" << rmw.first << "_volatile_" << sz << "_" << s.first << "(__ptr, __tmp, __tmp); break;\n"; - out << "#endif // __CUDA_ARCH__ >= 700\n"; - out << " default: assert(0);\n"; - out << " }\n"; + out << " NV_DISPATCH_TARGET(\n"; + out << " NV_PROVIDES_SM_70, (\n"; + out << " switch (__memorder) {\n"; + out << " case __ATOMIC_SEQ_CST: " << 
fencename("sc"s, s.first) << "();\n"; + out << " case __ATOMIC_CONSUME:\n"; + out << " case __ATOMIC_ACQUIRE: __cuda_" << rmw.first << "_acquire_" << sz << "_" << s.first << "(__ptr, __tmp, __tmp); break;\n"; + out << " case __ATOMIC_ACQ_REL: __cuda_" << rmw.first << "_acq_rel_" << sz << "_" << s.first << "(__ptr, __tmp, __tmp); break;\n"; + out << " case __ATOMIC_RELEASE: __cuda_" << rmw.first << "_release_" << sz << "_" << s.first << "(__ptr, __tmp, __tmp); break;\n"; + out << " case __ATOMIC_RELAXED: __cuda_" << rmw.first << "_relaxed_" << sz << "_" << s.first << "(__ptr, __tmp, __tmp); break;\n"; + out << " default: assert(0);\n"; + out << " }\n"; + out << " ),\n"; + out << " NV_IS_DEVICE, (\n"; + out << " switch (__memorder) {\n"; + out << " case __ATOMIC_SEQ_CST:\n"; + out << " case __ATOMIC_ACQ_REL: __cuda_membar_" << s.first << "();\n"; + out << " case __ATOMIC_CONSUME:\n"; + out << " case __ATOMIC_ACQUIRE: __cuda_" << rmw.first << "_volatile_" << sz << "_" << s.first << "(__ptr, __tmp, __tmp); __cuda_membar_" << s.first << "(); break;\n"; + out << " case __ATOMIC_RELEASE: __cuda_membar_" << s.first << "(); __cuda_" << rmw.first << "_volatile_" << sz << "_" << s.first << "(__ptr, __tmp, __tmp); break;\n"; + out << " case __ATOMIC_RELAXED: __cuda_" << rmw.first << "_volatile_" << sz << "_" << s.first << "(__ptr, __tmp, __tmp); break;\n"; + out << " default: assert(0);\n"; + out << " }\n"; + out << " )\n"; + out << " )\n"; if(rmw.first == "exchange") out << " memcpy(__ret, &__tmp, " << sz/8 << ");\n"; else { @@ -286,24 +317,29 @@ int main() { if(op == "sub") out << " __tmp = -__tmp;\n"; out << " __tmp *= sizeof(_Type);\n"; - out << " switch (__memorder) {\n"; - out << "#if __CUDA_ARCH__ >= 700\n"; - out << " case __ATOMIC_SEQ_CST: " << fencename("sc"s, s.first) << "();\n"; - out << " case __ATOMIC_CONSUME:\n"; - out << " case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_64_" << s.first << "(__ptr, __tmp, __tmp); break;\n"; - out << " case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_64_" << s.first << "(__ptr, __tmp, __tmp); break;\n"; - out << " case __ATOMIC_RELEASE: __cuda_fetch_add_release_64_" << s.first << "(__ptr, __tmp, __tmp); break;\n"; - out << " case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_64_" << s.first << "(__ptr, __tmp, __tmp); break;\n"; - out << "#else\n"; - out << " case __ATOMIC_SEQ_CST:\n"; - out << " case __ATOMIC_ACQ_REL: __cuda_membar_" << s.first << "();\n"; - out << " case __ATOMIC_CONSUME:\n"; - out << " case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_64_" << s.first << "(__ptr, __tmp, __tmp); __cuda_membar_" << s.first << "(); break;\n"; - out << " case __ATOMIC_RELEASE: __cuda_membar_" << s.first << "(); __cuda_fetch_add_volatile_64_" << s.first << "(__ptr, __tmp, __tmp); break;\n"; - out << " case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_64_" << s.first << "(__ptr, __tmp, __tmp); break;\n"; - out << "#endif // __CUDA_ARCH__ >= 700\n"; - out << " default: assert(0);\n"; - out << " }\n"; + out << " NV_DISPATCH_TARGET(\n"; + out << " NV_PROVIDES_SM_70, (\n"; + out << " switch (__memorder) {\n"; + out << " case __ATOMIC_SEQ_CST: " << fencename("sc"s, s.first) << "();\n"; + out << " case __ATOMIC_CONSUME:\n"; + out << " case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_64_" << s.first << "(__ptr, __tmp, __tmp); break;\n"; + out << " case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_64_" << s.first << "(__ptr, __tmp, __tmp); break;\n"; + out << " case __ATOMIC_RELEASE: __cuda_fetch_add_release_64_" << s.first << "(__ptr, __tmp, __tmp); break;\n"; + out << " 
case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_64_" << s.first << "(__ptr, __tmp, __tmp); break;\n"; + out << " }\n"; + out << " ),\n"; + out << " NV_IS_DEVICE, (\n"; + out << " switch (__memorder) {\n"; + out << " case __ATOMIC_SEQ_CST:\n"; + out << " case __ATOMIC_ACQ_REL: __cuda_membar_" << s.first << "();\n"; + out << " case __ATOMIC_CONSUME:\n"; + out << " case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_64_" << s.first << "(__ptr, __tmp, __tmp); __cuda_membar_" << s.first << "(); break;\n"; + out << " case __ATOMIC_RELEASE: __cuda_membar_" << s.first << "(); __cuda_fetch_add_volatile_64_" << s.first << "(__ptr, __tmp, __tmp); break;\n"; + out << " case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_64_" << s.first << "(__ptr, __tmp, __tmp); break;\n"; + out << " default: assert(0);\n"; + out << " }\n"; + out << " )\n"; + out << " )\n"; out << " memcpy(&__ret, &__tmp, 8);\n"; out << " return __ret;\n"; out << "}\n"; @@ -311,7 +347,5 @@ int main() { } } - out << "\n"; - return 0; } diff --git a/libcxx/include/support/atomic/atomic_cuda_generated.h b/libcxx/include/support/atomic/atomic_cuda_generated.h index d21994d2bc..f4d8cd52c6 100644 --- a/libcxx/include/support/atomic/atomic_cuda_generated.h +++ b/libcxx/include/support/atomic/atomic_cuda_generated.h @@ -12,23 +12,30 @@ static inline __device__ void __cuda_membar_block() { asm volatile("membar.cta;" static inline __device__ void __cuda_fence_acq_rel_block() { asm volatile("fence.acq_rel.cta;":::"memory"); } static inline __device__ void __cuda_fence_sc_block() { asm volatile("fence.sc.cta;":::"memory"); } static inline __device__ void __atomic_thread_fence_cuda(int __memorder, __thread_scope_block_tag) { - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); break; - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: - case __ATOMIC_ACQ_REL: - case __ATOMIC_RELEASE: __cuda_fence_acq_rel_block(); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: - case __ATOMIC_ACQ_REL: - case __ATOMIC_RELEASE: __cuda_membar_block(); break; -#endif // __CUDA_ARCH__ >= 700 - case __ATOMIC_RELAXED: break; - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); break; + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: + case __ATOMIC_ACQ_REL: + case __ATOMIC_RELEASE: __cuda_fence_acq_rel_block(); break; + case __ATOMIC_RELAXED: break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: + case __ATOMIC_ACQ_REL: + case __ATOMIC_RELEASE: __cuda_membar_block(); break; + case __ATOMIC_RELAXED: break; + default: assert(0); + } + ) + ) } template static inline __device__ void __cuda_load_acquire_32_block(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.acquire.cta.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } template static inline __device__ void __cuda_load_relaxed_32_block(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.relaxed.cta.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } @@ -36,20 +43,26 @@ template static inline __device__ void __cuda_load template::type = 0> __device__ void __atomic_load_cuda(const volatile _Type *__ptr, _Type *__ret, int __memorder, __thread_scope_block_tag) { uint32_t __tmp = 0; - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: 
__cuda_load_acquire_32_block(__ptr, __tmp); break; - case __ATOMIC_RELAXED: __cuda_load_relaxed_32_block(__ptr, __tmp); break; -#else - case __ATOMIC_SEQ_CST: __cuda_membar_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_load_volatile_32_block(__ptr, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELAXED: __cuda_load_volatile_32_block(__ptr, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_load_acquire_32_block(__ptr, __tmp); break; + case __ATOMIC_RELAXED: __cuda_load_relaxed_32_block(__ptr, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_membar_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_load_volatile_32_block(__ptr, __tmp); __cuda_membar_block(); break; + case __ATOMIC_RELAXED: __cuda_load_volatile_32_block(__ptr, __tmp); break; + default: assert(0); + } + ) + ) memcpy(__ret, &__tmp, 4); } template static inline __device__ void __cuda_load_acquire_64_block(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.acquire.cta.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } @@ -58,20 +71,26 @@ template static inline __device__ void __cuda_load template::type = 0> __device__ void __atomic_load_cuda(const volatile _Type *__ptr, _Type *__ret, int __memorder, __thread_scope_block_tag) { uint64_t __tmp = 0; - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_load_acquire_64_block(__ptr, __tmp); break; - case __ATOMIC_RELAXED: __cuda_load_relaxed_64_block(__ptr, __tmp); break; -#else - case __ATOMIC_SEQ_CST: __cuda_membar_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_load_volatile_64_block(__ptr, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELAXED: __cuda_load_volatile_64_block(__ptr, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_load_acquire_64_block(__ptr, __tmp); break; + case __ATOMIC_RELAXED: __cuda_load_relaxed_64_block(__ptr, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_membar_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_load_volatile_64_block(__ptr, __tmp); __cuda_membar_block(); break; + case __ATOMIC_RELAXED: __cuda_load_volatile_64_block(__ptr, __tmp); break; + default: assert(0); + } + ) + ) memcpy(__ret, &__tmp, 8); } template static inline __device__ void __cuda_store_relaxed_32_block(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.relaxed.cta.b32 [%0], %1;" :: "l"(__ptr),"r"(__src) : "memory"); } @@ -81,18 +100,24 @@ template::type __device__ void __atomic_store_cuda(volatile _Type *__ptr, _Type *__val, int __memorder, __thread_scope_block_tag) { uint32_t __tmp = 0; memcpy(&__tmp, __val, 4); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_RELEASE: __cuda_store_release_32_block(__ptr, __tmp); break; - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); - case __ATOMIC_RELAXED: __cuda_store_relaxed_32_block(__ptr, __tmp); break; -#else - case __ATOMIC_RELEASE: - case __ATOMIC_SEQ_CST: __cuda_membar_block(); - case 
__ATOMIC_RELAXED: __cuda_store_volatile_32_block(__ptr, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_RELEASE: __cuda_store_release_32_block(__ptr, __tmp); break; + case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); + case __ATOMIC_RELAXED: __cuda_store_relaxed_32_block(__ptr, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_RELEASE: + case __ATOMIC_SEQ_CST: __cuda_membar_block(); + case __ATOMIC_RELAXED: __cuda_store_volatile_32_block(__ptr, __tmp); break; + default: assert(0); + } + ) + ) } template static inline __device__ void __cuda_store_relaxed_64_block(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.relaxed.cta.b64 [%0], %1;" :: "l"(__ptr),"l"(__src) : "memory"); } template static inline __device__ void __cuda_store_release_64_block(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.release.cta.b64 [%0], %1;" :: "l"(__ptr),"l"(__src) : "memory"); } @@ -101,18 +126,24 @@ template::type __device__ void __atomic_store_cuda(volatile _Type *__ptr, _Type *__val, int __memorder, __thread_scope_block_tag) { uint64_t __tmp = 0; memcpy(&__tmp, __val, 8); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_RELEASE: __cuda_store_release_64_block(__ptr, __tmp); break; - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); - case __ATOMIC_RELAXED: __cuda_store_relaxed_64_block(__ptr, __tmp); break; -#else - case __ATOMIC_RELEASE: - case __ATOMIC_SEQ_CST: __cuda_membar_block(); - case __ATOMIC_RELAXED: __cuda_store_volatile_64_block(__ptr, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_RELEASE: __cuda_store_release_64_block(__ptr, __tmp); break; + case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); + case __ATOMIC_RELAXED: __cuda_store_relaxed_64_block(__ptr, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_RELEASE: + case __ATOMIC_SEQ_CST: __cuda_membar_block(); + case __ATOMIC_RELAXED: __cuda_store_volatile_64_block(__ptr, __tmp); break; + default: assert(0); + } + ) + ) } template static inline __device__ void __cuda_compare_exchange_acq_rel_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.cta.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } template static inline __device__ void __cuda_compare_exchange_acquire_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.cta.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } @@ -125,24 +156,30 @@ __device__ bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__e memcpy(&__tmp, __desired, 4); memcpy(&__old, __expected, 4); __old_tmp = __old; - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_32_block(__ptr, __old, __old_tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_32_block(__ptr, __old, __old_tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_32_block(__ptr, __old, __old_tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_32_block(__ptr, __old, __old_tmp, __tmp); break; -#else - case 
__ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_32_block(__ptr, __old, __old_tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_compare_exchange_volatile_32_block(__ptr, __old, __old_tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_32_block(__ptr, __old, __old_tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_32_block(__ptr, __old, __old_tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_32_block(__ptr, __old, __old_tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_compare_exchange_release_32_block(__ptr, __old, __old_tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_32_block(__ptr, __old, __old_tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_32_block(__ptr, __old, __old_tmp, __tmp); __cuda_membar_block(); break; + case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_compare_exchange_volatile_32_block(__ptr, __old, __old_tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_32_block(__ptr, __old, __old_tmp, __tmp); break; + default: assert(0); + } + ) + ) bool const __ret = __old == __old_tmp; memcpy(__expected, &__old, 4); return __ret; @@ -156,24 +193,30 @@ template::type __device__ void __atomic_exchange_cuda(volatile _Type *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_block_tag) { uint32_t __tmp = 0; memcpy(&__tmp, __val, 4); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_32_block(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_exchange_volatile_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_32_block(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_32_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_32_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_exchange_release_32_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_exchange_relaxed_32_block(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch 
(__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; + case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_exchange_volatile_32_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_exchange_volatile_32_block(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(__ret, &__tmp, 4); } template static inline __device__ void __cuda_fetch_add_acq_rel_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acq_rel.cta.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } @@ -186,24 +229,30 @@ __device__ _Type __atomic_fetch_add_cuda(volatile _Type *__ptr, _Type __val, int _Type __ret; uint32_t __tmp = 0; memcpy(&__tmp, &__val, 4); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_32_block(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_add_volatile_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_32_block(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_32_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_32_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_add_release_32_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_32_block(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; + case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_add_volatile_32_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_32_block(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 4); return __ret; } @@ -217,24 +266,30 @@ __device__ _Type __atomic_fetch_and_cuda(volatile _Type *__ptr, _Type __val, int _Type __ret; uint32_t __tmp = 0; memcpy(&__tmp, &__val, 4); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_and_acquire_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_and_acq_rel_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_and_release_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_relaxed_32_block(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: 
- case __ATOMIC_ACQ_REL: __cuda_membar_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_and_volatile_32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_and_volatile_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_volatile_32_block(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_and_acquire_32_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_and_acq_rel_32_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_and_release_32_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_and_relaxed_32_block(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_and_volatile_32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; + case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_and_volatile_32_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_and_volatile_32_block(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 4); return __ret; } @@ -248,24 +303,30 @@ __device__ _Type __atomic_fetch_max_cuda(volatile _Type *__ptr, _Type __val, int _Type __ret; uint32_t __tmp = 0; memcpy(&__tmp, &__val, 4); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_32_block(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_max_volatile_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_32_block(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_32_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_32_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_max_release_32_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_32_block(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; + case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_max_volatile_32_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: 
__cuda_fetch_max_volatile_32_block(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 4); return __ret; } @@ -279,24 +340,30 @@ __device__ _Type __atomic_fetch_min_cuda(volatile _Type *__ptr, _Type __val, int _Type __ret; uint32_t __tmp = 0; memcpy(&__tmp, &__val, 4); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_32_block(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_min_volatile_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_32_block(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_32_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_32_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_min_release_32_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_32_block(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; + case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_min_volatile_32_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_32_block(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 4); return __ret; } @@ -310,24 +377,30 @@ __device__ _Type __atomic_fetch_or_cuda(volatile _Type *__ptr, _Type __val, int _Type __ret; uint32_t __tmp = 0; memcpy(&__tmp, &__val, 4); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_or_acquire_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_or_acq_rel_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_or_release_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_relaxed_32_block(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_or_volatile_32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_or_volatile_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_volatile_32_block(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); + case 
__ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_or_acquire_32_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_or_acq_rel_32_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_or_release_32_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_or_relaxed_32_block(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_or_volatile_32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; + case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_or_volatile_32_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_or_volatile_32_block(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 4); return __ret; } @@ -346,24 +419,30 @@ __device__ _Type __atomic_fetch_sub_cuda(volatile _Type *__ptr, _Type __val, int _Type __ret; uint32_t __tmp = 0; memcpy(&__tmp, &__val, 4); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_32_block(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_sub_volatile_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_32_block(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_32_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_32_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_sub_release_32_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_32_block(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; + case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_sub_volatile_32_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_32_block(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 4); return __ret; } @@ -377,24 +456,30 @@ __device__ _Type __atomic_fetch_xor_cuda(volatile _Type *__ptr, _Type __val, int _Type __ret; uint32_t __tmp = 0; memcpy(&__tmp, &__val, 4); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_acquire_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_xor_acq_rel_32_block(__ptr, __tmp, __tmp); 
break; - case __ATOMIC_RELEASE: __cuda_fetch_xor_release_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_relaxed_32_block(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_volatile_32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_xor_volatile_32_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_volatile_32_block(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_xor_acquire_32_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_xor_acq_rel_32_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_xor_release_32_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_xor_relaxed_32_block(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_xor_volatile_32_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; + case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_xor_volatile_32_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_xor_volatile_32_block(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 4); return __ret; } @@ -409,24 +494,30 @@ __device__ bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__e memcpy(&__tmp, __desired, 8); memcpy(&__old, __expected, 8); __old_tmp = __old; - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_64_block(__ptr, __old, __old_tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_64_block(__ptr, __old, __old_tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_64_block(__ptr, __old, __old_tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_64_block(__ptr, __old, __old_tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_64_block(__ptr, __old, __old_tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_compare_exchange_volatile_64_block(__ptr, __old, __old_tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_64_block(__ptr, __old, __old_tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_64_block(__ptr, __old, __old_tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_64_block(__ptr, __old, __old_tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_compare_exchange_release_64_block(__ptr, __old, __old_tmp, __tmp); break; + case 
__ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_64_block(__ptr, __old, __old_tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_64_block(__ptr, __old, __old_tmp, __tmp); __cuda_membar_block(); break; + case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_compare_exchange_volatile_64_block(__ptr, __old, __old_tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_64_block(__ptr, __old, __old_tmp, __tmp); break; + default: assert(0); + } + ) + ) bool const __ret = __old == __old_tmp; memcpy(__expected, &__old, 8); return __ret; @@ -440,24 +531,30 @@ template::type __device__ void __atomic_exchange_cuda(volatile _Type *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_block_tag) { uint64_t __tmp = 0; memcpy(&__tmp, __val, 8); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_64_block(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_exchange_volatile_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_64_block(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_exchange_release_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_exchange_relaxed_64_block(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; + case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_exchange_volatile_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_exchange_volatile_64_block(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(__ret, &__tmp, 8); } template static inline __device__ void __cuda_fetch_add_acq_rel_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acq_rel.cta.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } @@ -470,24 +567,30 @@ __device__ _Type __atomic_fetch_add_cuda(volatile _Type *__ptr, _Type __val, int _Type __ret; uint64_t __tmp = 0; memcpy(&__tmp, &__val, 8); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: 
__cuda_fetch_add_acquire_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_64_block(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_add_volatile_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_64_block(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_add_release_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_64_block(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; + case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_add_volatile_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_64_block(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 8); return __ret; } @@ -501,24 +604,30 @@ __device__ _Type __atomic_fetch_and_cuda(volatile _Type *__ptr, _Type __val, int _Type __ret; uint64_t __tmp = 0; memcpy(&__tmp, &__val, 8); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_and_acquire_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_and_acq_rel_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_and_release_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_relaxed_64_block(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_and_volatile_64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_and_volatile_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_volatile_64_block(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_and_acquire_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_and_acq_rel_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_and_release_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_and_relaxed_64_block(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case 
__ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_and_volatile_64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; + case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_and_volatile_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_and_volatile_64_block(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 8); return __ret; } @@ -532,24 +641,30 @@ __device__ _Type __atomic_fetch_max_cuda(volatile _Type *__ptr, _Type __val, int _Type __ret; uint64_t __tmp = 0; memcpy(&__tmp, &__val, 8); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_64_block(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_max_volatile_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_64_block(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_max_release_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_64_block(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; + case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_max_volatile_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_64_block(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 8); return __ret; } @@ -563,24 +678,30 @@ __device__ _Type __atomic_fetch_min_cuda(volatile _Type *__ptr, _Type __val, int _Type __ret; uint64_t __tmp = 0; memcpy(&__tmp, &__val, 8); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_64_block(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: 
__cuda_membar_block(); __cuda_fetch_min_volatile_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_64_block(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_min_release_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_64_block(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; + case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_min_volatile_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_64_block(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 8); return __ret; } @@ -594,24 +715,30 @@ __device__ _Type __atomic_fetch_or_cuda(volatile _Type *__ptr, _Type __val, int _Type __ret; uint64_t __tmp = 0; memcpy(&__tmp, &__val, 8); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_or_acquire_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_or_acq_rel_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_or_release_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_relaxed_64_block(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_or_volatile_64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_or_volatile_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_volatile_64_block(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_or_acquire_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_or_acq_rel_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_or_release_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_or_relaxed_64_block(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_or_volatile_64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; + case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_or_volatile_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_or_volatile_64_block(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 8); return __ret; } @@ -630,24 +757,30 @@ __device__ _Type __atomic_fetch_sub_cuda(volatile _Type *__ptr, _Type __val, int 
_Type __ret; uint64_t __tmp = 0; memcpy(&__tmp, &__val, 8); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_64_block(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_sub_volatile_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_64_block(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_sub_release_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_64_block(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; + case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_sub_volatile_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_64_block(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 8); return __ret; } @@ -661,24 +794,30 @@ __device__ _Type __atomic_fetch_xor_cuda(volatile _Type *__ptr, _Type __val, int _Type __ret; uint64_t __tmp = 0; memcpy(&__tmp, &__val, 8); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_acquire_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_xor_acq_rel_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_xor_release_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_relaxed_64_block(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_volatile_64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_xor_volatile_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_volatile_64_block(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_xor_acquire_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_xor_acq_rel_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: 
__cuda_fetch_xor_release_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_xor_relaxed_64_block(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_xor_volatile_64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; + case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_xor_volatile_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_xor_volatile_64_block(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 8); return __ret; } @@ -688,24 +827,29 @@ __device__ _Type* __atomic_fetch_add_cuda(_Type *volatile *__ptr, ptrdiff_t __va uint64_t __tmp = 0; memcpy(&__tmp, &__val, 8); __tmp *= sizeof(_Type); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_64_block(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_add_volatile_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_64_block(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_add_release_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_64_block(__ptr, __tmp, __tmp); break; + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; + case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_add_volatile_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_64_block(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 8); return __ret; } @@ -716,24 +860,29 @@ __device__ _Type* __atomic_fetch_sub_cuda(_Type *volatile *__ptr, ptrdiff_t __va memcpy(&__tmp, &__val, 8); __tmp = -__tmp; __tmp *= sizeof(_Type); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_64_block(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - 
case __ATOMIC_ACQ_REL: __cuda_membar_block(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; - case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_add_volatile_64_block(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_64_block(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_add_release_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_64_block(__ptr, __tmp, __tmp); break; + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_block(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_64_block(__ptr, __tmp, __tmp); __cuda_membar_block(); break; + case __ATOMIC_RELEASE: __cuda_membar_block(); __cuda_fetch_add_volatile_64_block(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_64_block(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 8); return __ret; } @@ -741,23 +890,30 @@ static inline __device__ void __cuda_membar_device() { asm volatile("membar.gl;" static inline __device__ void __cuda_fence_acq_rel_device() { asm volatile("fence.acq_rel.gpu;":::"memory"); } static inline __device__ void __cuda_fence_sc_device() { asm volatile("fence.sc.gpu;":::"memory"); } static inline __device__ void __atomic_thread_fence_cuda(int __memorder, __thread_scope_device_tag) { - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); break; - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: - case __ATOMIC_ACQ_REL: - case __ATOMIC_RELEASE: __cuda_fence_acq_rel_device(); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: - case __ATOMIC_ACQ_REL: - case __ATOMIC_RELEASE: __cuda_membar_device(); break; -#endif // __CUDA_ARCH__ >= 700 - case __ATOMIC_RELAXED: break; - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); break; + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: + case __ATOMIC_ACQ_REL: + case __ATOMIC_RELEASE: __cuda_fence_acq_rel_device(); break; + case __ATOMIC_RELAXED: break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: + case __ATOMIC_ACQ_REL: + case __ATOMIC_RELEASE: __cuda_membar_device(); break; + case __ATOMIC_RELAXED: break; + default: assert(0); + } + ) + ) } template static inline __device__ void __cuda_load_acquire_32_device(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.acquire.gpu.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } template static inline __device__ void __cuda_load_relaxed_32_device(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.relaxed.gpu.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } @@ -765,20 +921,26 @@ template static inline __device__ void __cuda_load template::type = 0> __device__ void __atomic_load_cuda(const volatile _Type *__ptr, _Type *__ret, int __memorder, 
__thread_scope_device_tag) { uint32_t __tmp = 0; - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_load_acquire_32_device(__ptr, __tmp); break; - case __ATOMIC_RELAXED: __cuda_load_relaxed_32_device(__ptr, __tmp); break; -#else - case __ATOMIC_SEQ_CST: __cuda_membar_device(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_load_volatile_32_device(__ptr, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELAXED: __cuda_load_volatile_32_device(__ptr, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_load_acquire_32_device(__ptr, __tmp); break; + case __ATOMIC_RELAXED: __cuda_load_relaxed_32_device(__ptr, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_membar_device(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_load_volatile_32_device(__ptr, __tmp); __cuda_membar_device(); break; + case __ATOMIC_RELAXED: __cuda_load_volatile_32_device(__ptr, __tmp); break; + default: assert(0); + } + ) + ) memcpy(__ret, &__tmp, 4); } template static inline __device__ void __cuda_load_acquire_64_device(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.acquire.gpu.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } @@ -787,20 +949,26 @@ template static inline __device__ void __cuda_load template::type = 0> __device__ void __atomic_load_cuda(const volatile _Type *__ptr, _Type *__ret, int __memorder, __thread_scope_device_tag) { uint64_t __tmp = 0; - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_load_acquire_64_device(__ptr, __tmp); break; - case __ATOMIC_RELAXED: __cuda_load_relaxed_64_device(__ptr, __tmp); break; -#else - case __ATOMIC_SEQ_CST: __cuda_membar_device(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_load_volatile_64_device(__ptr, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELAXED: __cuda_load_volatile_64_device(__ptr, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_load_acquire_64_device(__ptr, __tmp); break; + case __ATOMIC_RELAXED: __cuda_load_relaxed_64_device(__ptr, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_membar_device(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_load_volatile_64_device(__ptr, __tmp); __cuda_membar_device(); break; + case __ATOMIC_RELAXED: __cuda_load_volatile_64_device(__ptr, __tmp); break; + default: assert(0); + } + ) + ) memcpy(__ret, &__tmp, 8); } template static inline __device__ void __cuda_store_relaxed_32_device(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.relaxed.gpu.b32 [%0], %1;" :: "l"(__ptr),"r"(__src) : "memory"); } @@ -810,18 +978,24 @@ template::type __device__ void __atomic_store_cuda(volatile _Type *__ptr, _Type *__val, int __memorder, __thread_scope_device_tag) { uint32_t __tmp = 0; memcpy(&__tmp, __val, 4); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_RELEASE: __cuda_store_release_32_device(__ptr, __tmp); 
break; - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); - case __ATOMIC_RELAXED: __cuda_store_relaxed_32_device(__ptr, __tmp); break; -#else - case __ATOMIC_RELEASE: - case __ATOMIC_SEQ_CST: __cuda_membar_device(); - case __ATOMIC_RELAXED: __cuda_store_volatile_32_device(__ptr, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_RELEASE: __cuda_store_release_32_device(__ptr, __tmp); break; + case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); + case __ATOMIC_RELAXED: __cuda_store_relaxed_32_device(__ptr, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_RELEASE: + case __ATOMIC_SEQ_CST: __cuda_membar_device(); + case __ATOMIC_RELAXED: __cuda_store_volatile_32_device(__ptr, __tmp); break; + default: assert(0); + } + ) + ) } template static inline __device__ void __cuda_store_relaxed_64_device(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.relaxed.gpu.b64 [%0], %1;" :: "l"(__ptr),"l"(__src) : "memory"); } template static inline __device__ void __cuda_store_release_64_device(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.release.gpu.b64 [%0], %1;" :: "l"(__ptr),"l"(__src) : "memory"); } @@ -830,18 +1004,24 @@ template::type __device__ void __atomic_store_cuda(volatile _Type *__ptr, _Type *__val, int __memorder, __thread_scope_device_tag) { uint64_t __tmp = 0; memcpy(&__tmp, __val, 8); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_RELEASE: __cuda_store_release_64_device(__ptr, __tmp); break; - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); - case __ATOMIC_RELAXED: __cuda_store_relaxed_64_device(__ptr, __tmp); break; -#else - case __ATOMIC_RELEASE: - case __ATOMIC_SEQ_CST: __cuda_membar_device(); - case __ATOMIC_RELAXED: __cuda_store_volatile_64_device(__ptr, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_RELEASE: __cuda_store_release_64_device(__ptr, __tmp); break; + case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); + case __ATOMIC_RELAXED: __cuda_store_relaxed_64_device(__ptr, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_RELEASE: + case __ATOMIC_SEQ_CST: __cuda_membar_device(); + case __ATOMIC_RELAXED: __cuda_store_volatile_64_device(__ptr, __tmp); break; + default: assert(0); + } + ) + ) } template static inline __device__ void __cuda_compare_exchange_acq_rel_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } template static inline __device__ void __cuda_compare_exchange_acquire_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } @@ -854,24 +1034,30 @@ __device__ bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__e memcpy(&__tmp, __desired, 4); memcpy(&__old, __expected, 4); __old_tmp = __old; - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_32_device(__ptr, __old, __old_tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_32_device(__ptr, __old, 
__old_tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_32_device(__ptr, __old, __old_tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_32_device(__ptr, __old, __old_tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_device(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_32_device(__ptr, __old, __old_tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_compare_exchange_volatile_32_device(__ptr, __old, __old_tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_32_device(__ptr, __old, __old_tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_32_device(__ptr, __old, __old_tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_32_device(__ptr, __old, __old_tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_compare_exchange_release_32_device(__ptr, __old, __old_tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_32_device(__ptr, __old, __old_tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_device(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_32_device(__ptr, __old, __old_tmp, __tmp); __cuda_membar_device(); break; + case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_compare_exchange_volatile_32_device(__ptr, __old, __old_tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_32_device(__ptr, __old, __old_tmp, __tmp); break; + default: assert(0); + } + ) + ) bool const __ret = __old == __old_tmp; memcpy(__expected, &__old, 4); return __ret; @@ -885,24 +1071,30 @@ template::type __device__ void __atomic_exchange_cuda(volatile _Type *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_device_tag) { uint32_t __tmp = 0; memcpy(&__tmp, __val, 4); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_32_device(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_device(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_exchange_volatile_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_32_device(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_32_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: 
__cuda_exchange_acq_rel_32_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_exchange_release_32_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_exchange_relaxed_32_device(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_device(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; + case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_exchange_volatile_32_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_exchange_volatile_32_device(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(__ret, &__tmp, 4); } template static inline __device__ void __cuda_fetch_add_acq_rel_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acq_rel.gpu.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } @@ -915,24 +1107,30 @@ __device__ _Type __atomic_fetch_add_cuda(volatile _Type *__ptr, _Type __val, int _Type __ret; uint32_t __tmp = 0; memcpy(&__tmp, &__val, 4); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_32_device(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_device(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_add_volatile_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_32_device(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_32_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_32_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_add_release_32_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_32_device(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_device(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; + case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_add_volatile_32_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_32_device(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 4); return __ret; } @@ -946,24 +1144,30 @@ __device__ _Type __atomic_fetch_and_cuda(volatile _Type *__ptr, _Type __val, int _Type __ret; uint32_t __tmp = 0; memcpy(&__tmp, &__val, 4); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: 
__cuda_fetch_and_acquire_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_and_acq_rel_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_and_release_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_relaxed_32_device(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_device(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_and_volatile_32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_and_volatile_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_volatile_32_device(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_and_acquire_32_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_and_acq_rel_32_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_and_release_32_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_and_relaxed_32_device(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_device(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_and_volatile_32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; + case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_and_volatile_32_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_and_volatile_32_device(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 4); return __ret; } @@ -977,24 +1181,30 @@ __device__ _Type __atomic_fetch_max_cuda(volatile _Type *__ptr, _Type __val, int _Type __ret; uint32_t __tmp = 0; memcpy(&__tmp, &__val, 4); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_32_device(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_device(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_max_volatile_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_32_device(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_32_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_32_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_max_release_32_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_32_device(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + 
NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_device(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; + case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_max_volatile_32_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_32_device(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 4); return __ret; } @@ -1008,24 +1218,30 @@ __device__ _Type __atomic_fetch_min_cuda(volatile _Type *__ptr, _Type __val, int _Type __ret; uint32_t __tmp = 0; memcpy(&__tmp, &__val, 4); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_32_device(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_device(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_min_volatile_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_32_device(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_32_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_32_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_min_release_32_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_32_device(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_device(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; + case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_min_volatile_32_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_32_device(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 4); return __ret; } @@ -1039,24 +1255,30 @@ __device__ _Type __atomic_fetch_or_cuda(volatile _Type *__ptr, _Type __val, int _Type __ret; uint32_t __tmp = 0; memcpy(&__tmp, &__val, 4); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_or_acquire_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_or_acq_rel_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_or_release_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_relaxed_32_device(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_device(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_or_volatile_32_device(__ptr, __tmp, 
__tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_or_volatile_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_volatile_32_device(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_or_acquire_32_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_or_acq_rel_32_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_or_release_32_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_or_relaxed_32_device(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_device(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_or_volatile_32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; + case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_or_volatile_32_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_or_volatile_32_device(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 4); return __ret; } @@ -1075,24 +1297,30 @@ __device__ _Type __atomic_fetch_sub_cuda(volatile _Type *__ptr, _Type __val, int _Type __ret; uint32_t __tmp = 0; memcpy(&__tmp, &__val, 4); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_32_device(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_device(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_sub_volatile_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_32_device(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_32_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_32_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_sub_release_32_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_32_device(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_device(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; + case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_sub_volatile_32_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_32_device(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 4); return __ret; 
} @@ -1106,24 +1334,30 @@ __device__ _Type __atomic_fetch_xor_cuda(volatile _Type *__ptr, _Type __val, int _Type __ret; uint32_t __tmp = 0; memcpy(&__tmp, &__val, 4); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_acquire_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_xor_acq_rel_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_xor_release_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_relaxed_32_device(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_device(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_volatile_32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_xor_volatile_32_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_volatile_32_device(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_xor_acquire_32_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_xor_acq_rel_32_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_xor_release_32_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_xor_relaxed_32_device(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_device(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_xor_volatile_32_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; + case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_xor_volatile_32_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_xor_volatile_32_device(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 4); return __ret; } @@ -1138,24 +1372,30 @@ __device__ bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__e memcpy(&__tmp, __desired, 8); memcpy(&__old, __expected, 8); __old_tmp = __old; - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_64_device(__ptr, __old, __old_tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_64_device(__ptr, __old, __old_tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_64_device(__ptr, __old, __old_tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_64_device(__ptr, __old, __old_tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_device(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_64_device(__ptr, __old, __old_tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_compare_exchange_volatile_64_device(__ptr, __old, __old_tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_64_device(__ptr, __old, __old_tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + 
NV_PROVIDES_SM_70, ( + switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_64_device(__ptr, __old, __old_tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_64_device(__ptr, __old, __old_tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_compare_exchange_release_64_device(__ptr, __old, __old_tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_64_device(__ptr, __old, __old_tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_device(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_64_device(__ptr, __old, __old_tmp, __tmp); __cuda_membar_device(); break; + case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_compare_exchange_volatile_64_device(__ptr, __old, __old_tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_64_device(__ptr, __old, __old_tmp, __tmp); break; + default: assert(0); + } + ) + ) bool const __ret = __old == __old_tmp; memcpy(__expected, &__old, 8); return __ret; @@ -1169,24 +1409,30 @@ template::type __device__ void __atomic_exchange_cuda(volatile _Type *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_device_tag) { uint64_t __tmp = 0; memcpy(&__tmp, __val, 8); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_64_device(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_device(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_exchange_volatile_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_64_device(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_64_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_64_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_exchange_release_64_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_exchange_relaxed_64_device(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_device(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; + case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_exchange_volatile_64_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_exchange_volatile_64_device(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(__ret, &__tmp, 8); } template static 
inline __device__ void __cuda_fetch_add_acq_rel_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acq_rel.gpu.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } @@ -1199,24 +1445,30 @@ __device__ _Type __atomic_fetch_add_cuda(volatile _Type *__ptr, _Type __val, int _Type __ret; uint64_t __tmp = 0; memcpy(&__tmp, &__val, 8); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_64_device(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_device(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_add_volatile_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_64_device(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_64_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_64_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_add_release_64_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_64_device(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_device(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; + case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_add_volatile_64_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_64_device(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 8); return __ret; } @@ -1230,24 +1482,30 @@ __device__ _Type __atomic_fetch_and_cuda(volatile _Type *__ptr, _Type __val, int _Type __ret; uint64_t __tmp = 0; memcpy(&__tmp, &__val, 8); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_and_acquire_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_and_acq_rel_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_and_release_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_relaxed_64_device(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_device(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_and_volatile_64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_and_volatile_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_volatile_64_device(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + 
NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_and_acquire_64_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_and_acq_rel_64_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_and_release_64_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_and_relaxed_64_device(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_device(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_and_volatile_64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; + case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_and_volatile_64_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_and_volatile_64_device(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 8); return __ret; } @@ -1261,24 +1519,30 @@ __device__ _Type __atomic_fetch_max_cuda(volatile _Type *__ptr, _Type __val, int _Type __ret; uint64_t __tmp = 0; memcpy(&__tmp, &__val, 8); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_64_device(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_device(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_max_volatile_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_64_device(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_64_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_64_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_max_release_64_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_64_device(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_device(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; + case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_max_volatile_64_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_64_device(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 8); return __ret; } @@ -1292,24 +1556,30 @@ __device__ _Type __atomic_fetch_min_cuda(volatile _Type *__ptr, _Type __val, int _Type __ret; uint64_t __tmp = 0; memcpy(&__tmp, &__val, 8); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); - case __ATOMIC_CONSUME: 
- case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_64_device(__ptr, __tmp, __tmp); break;
- case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_64_device(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELEASE: __cuda_fetch_min_release_64_device(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_64_device(__ptr, __tmp, __tmp); break;
-#else
- case __ATOMIC_SEQ_CST:
- case __ATOMIC_ACQ_REL: __cuda_membar_device();
- case __ATOMIC_CONSUME:
- case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break;
- case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_min_volatile_64_device(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_64_device(__ptr, __tmp, __tmp); break;
-#endif // __CUDA_ARCH__ >= 700
- default: assert(0);
- }
+ NV_DISPATCH_TARGET(
+ NV_PROVIDES_SM_70, (
+ switch (__memorder) {
+ case __ATOMIC_SEQ_CST: __cuda_fence_sc_device();
+ case __ATOMIC_CONSUME:
+ case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_64_device(__ptr, __tmp, __tmp); break;
+ case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_64_device(__ptr, __tmp, __tmp); break;
+ case __ATOMIC_RELEASE: __cuda_fetch_min_release_64_device(__ptr, __tmp, __tmp); break;
+ case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_64_device(__ptr, __tmp, __tmp); break;
+ default: assert(0);
+ }
+ ),
+ NV_IS_DEVICE, (
+ switch (__memorder) {
+ case __ATOMIC_SEQ_CST:
+ case __ATOMIC_ACQ_REL: __cuda_membar_device();
+ case __ATOMIC_CONSUME:
+ case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break;
+ case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_min_volatile_64_device(__ptr, __tmp, __tmp); break;
+ case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_64_device(__ptr, __tmp, __tmp); break;
+ default: assert(0);
+ }
+ )
+ )
 memcpy(&__ret, &__tmp, 8);
 return __ret;
 }
@@ -1323,24 +1593,30 @@ __device__ _Type __atomic_fetch_or_cuda(volatile _Type *__ptr, _Type __val, int
 _Type __ret;
 uint64_t __tmp = 0;
 memcpy(&__tmp, &__val, 8);
- switch (__memorder) {
-#if __CUDA_ARCH__ >= 700
- case __ATOMIC_SEQ_CST: __cuda_fence_sc_device();
- case __ATOMIC_CONSUME:
- case __ATOMIC_ACQUIRE: __cuda_fetch_or_acquire_64_device(__ptr, __tmp, __tmp); break;
- case __ATOMIC_ACQ_REL: __cuda_fetch_or_acq_rel_64_device(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELEASE: __cuda_fetch_or_release_64_device(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELAXED: __cuda_fetch_or_relaxed_64_device(__ptr, __tmp, __tmp); break;
-#else
- case __ATOMIC_SEQ_CST:
- case __ATOMIC_ACQ_REL: __cuda_membar_device();
- case __ATOMIC_CONSUME:
- case __ATOMIC_ACQUIRE: __cuda_fetch_or_volatile_64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break;
- case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_or_volatile_64_device(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELAXED: __cuda_fetch_or_volatile_64_device(__ptr, __tmp, __tmp); break;
-#endif // __CUDA_ARCH__ >= 700
- default: assert(0);
- }
+ NV_DISPATCH_TARGET(
+ NV_PROVIDES_SM_70, (
+ switch (__memorder) {
+ case __ATOMIC_SEQ_CST: __cuda_fence_sc_device();
+ case __ATOMIC_CONSUME:
+ case __ATOMIC_ACQUIRE: __cuda_fetch_or_acquire_64_device(__ptr, __tmp, __tmp); break;
+ case __ATOMIC_ACQ_REL: __cuda_fetch_or_acq_rel_64_device(__ptr, __tmp, __tmp); break;
+ case __ATOMIC_RELEASE: __cuda_fetch_or_release_64_device(__ptr, __tmp, __tmp); break;
+ case __ATOMIC_RELAXED: __cuda_fetch_or_relaxed_64_device(__ptr, __tmp, __tmp); break;
+ default: assert(0);
+ }
+ ),
+ NV_IS_DEVICE, (
+ switch (__memorder) {
+ case __ATOMIC_SEQ_CST:
+ case __ATOMIC_ACQ_REL: __cuda_membar_device();
+ case __ATOMIC_CONSUME:
+ case __ATOMIC_ACQUIRE: __cuda_fetch_or_volatile_64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break;
+ case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_or_volatile_64_device(__ptr, __tmp, __tmp); break;
+ case __ATOMIC_RELAXED: __cuda_fetch_or_volatile_64_device(__ptr, __tmp, __tmp); break;
+ default: assert(0);
+ }
+ )
+ )
 memcpy(&__ret, &__tmp, 8);
 return __ret;
 }
@@ -1359,24 +1635,30 @@ __device__ _Type __atomic_fetch_sub_cuda(volatile _Type *__ptr, _Type __val, int
 _Type __ret;
 uint64_t __tmp = 0;
 memcpy(&__tmp, &__val, 8);
- switch (__memorder) {
-#if __CUDA_ARCH__ >= 700
- case __ATOMIC_SEQ_CST: __cuda_fence_sc_device();
- case __ATOMIC_CONSUME:
- case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_64_device(__ptr, __tmp, __tmp); break;
- case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_64_device(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELEASE: __cuda_fetch_sub_release_64_device(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_64_device(__ptr, __tmp, __tmp); break;
-#else
- case __ATOMIC_SEQ_CST:
- case __ATOMIC_ACQ_REL: __cuda_membar_device();
- case __ATOMIC_CONSUME:
- case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break;
- case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_sub_volatile_64_device(__ptr, __tmp, __tmp); break;
- case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_64_device(__ptr, __tmp, __tmp); break;
-#endif // __CUDA_ARCH__ >= 700
- default: assert(0);
- }
+ NV_DISPATCH_TARGET(
+ NV_PROVIDES_SM_70, (
+ switch (__memorder) {
+ case __ATOMIC_SEQ_CST: __cuda_fence_sc_device();
+ case __ATOMIC_CONSUME:
+ case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_64_device(__ptr, __tmp, __tmp); break;
+ case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_64_device(__ptr, __tmp, __tmp); break;
+ case __ATOMIC_RELEASE: __cuda_fetch_sub_release_64_device(__ptr, __tmp, __tmp); break;
+ case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_64_device(__ptr, __tmp, __tmp); break;
+ default: assert(0);
+ }
+ ),
+ NV_IS_DEVICE, (
+ switch (__memorder) {
+ case __ATOMIC_SEQ_CST:
+ case __ATOMIC_ACQ_REL: __cuda_membar_device();
+ case __ATOMIC_CONSUME:
+ case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break;
+ case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_sub_volatile_64_device(__ptr, __tmp, __tmp); break;
+ case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_64_device(__ptr, __tmp, __tmp); break;
+ default: assert(0);
+ }
+ )
+ )
 memcpy(&__ret, &__tmp, 8);
 return
__tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_xor_volatile_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_volatile_64_device(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_xor_acquire_64_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_xor_acq_rel_64_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_xor_release_64_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_xor_relaxed_64_device(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_device(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_xor_volatile_64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; + case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_xor_volatile_64_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_xor_volatile_64_device(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 8); return __ret; } @@ -1417,24 +1705,29 @@ __device__ _Type* __atomic_fetch_add_cuda(_Type *volatile *__ptr, ptrdiff_t __va uint64_t __tmp = 0; memcpy(&__tmp, &__val, 8); __tmp *= sizeof(_Type); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_64_device(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_device(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_add_volatile_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_64_device(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_64_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_64_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_add_release_64_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_64_device(__ptr, __tmp, __tmp); break; + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_device(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; + case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_add_volatile_64_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_64_device(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 8); return 
__ret; } @@ -1445,24 +1738,29 @@ __device__ _Type* __atomic_fetch_sub_cuda(_Type *volatile *__ptr, ptrdiff_t __va memcpy(&__tmp, &__val, 8); __tmp = -__tmp; __tmp *= sizeof(_Type); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_64_device(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_device(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; - case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_add_volatile_64_device(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_64_device(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_device(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_64_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_64_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_add_release_64_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_64_device(__ptr, __tmp, __tmp); break; + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_device(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_64_device(__ptr, __tmp, __tmp); __cuda_membar_device(); break; + case __ATOMIC_RELEASE: __cuda_membar_device(); __cuda_fetch_add_volatile_64_device(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_64_device(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 8); return __ret; } @@ -1470,23 +1768,30 @@ static inline __device__ void __cuda_membar_system() { asm volatile("membar.sys; static inline __device__ void __cuda_fence_acq_rel_system() { asm volatile("fence.acq_rel.sys;":::"memory"); } static inline __device__ void __cuda_fence_sc_system() { asm volatile("fence.sc.sys;":::"memory"); } static inline __device__ void __atomic_thread_fence_cuda(int __memorder, __thread_scope_system_tag) { - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); break; - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: - case __ATOMIC_ACQ_REL: - case __ATOMIC_RELEASE: __cuda_fence_acq_rel_system(); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: - case __ATOMIC_ACQ_REL: - case __ATOMIC_RELEASE: __cuda_membar_system(); break; -#endif // __CUDA_ARCH__ >= 700 - case __ATOMIC_RELAXED: break; - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); break; + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: + case __ATOMIC_ACQ_REL: + case __ATOMIC_RELEASE: __cuda_fence_acq_rel_system(); break; + case __ATOMIC_RELAXED: break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: 
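The _Type* overloads converted just above never hand the pointee type to the PTX path: the element count is widened to 64 bits, negated for fetch_sub, and scaled by sizeof(_Type), so the plain 64-bit fetch-add performs ordinary pointer arithmetic. A small host-side sketch of that computation, assuming a 64-bit ptrdiff_t (as the 8-byte memcpy in the hunks does); the function name is illustrative:

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Turn an element count into the byte offset handed to the 64-bit
    // fetch-add helper.
    template <class T>
    uint64_t element_delta_to_bytes(std::ptrdiff_t n, bool subtract)
    {
        uint64_t tmp = 0;
        std::memcpy(&tmp, &n, 8);   // reinterpret the signed count as raw bits
        if (subtract)
            tmp = 0 - tmp;          // fetch_sub(p, n) is fetch_add(p, -n)
        tmp *= sizeof(T);           // scale to bytes, matching T* arithmetic
        return tmp;
    }
    // e.g. element_delta_to_bytes<int>(3, false) == 12 when int is 4 bytes.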
+ case __ATOMIC_ACQ_REL: + case __ATOMIC_RELEASE: __cuda_membar_system(); break; + case __ATOMIC_RELAXED: break; + default: assert(0); + } + ) + ) } template static inline __device__ void __cuda_load_acquire_32_system(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.acquire.sys.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } template static inline __device__ void __cuda_load_relaxed_32_system(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.relaxed.sys.b32 %0,[%1];" : "=r"(__dst) : "l"(__ptr) : "memory"); } @@ -1494,20 +1799,26 @@ template static inline __device__ void __cuda_load template::type = 0> __device__ void __atomic_load_cuda(const volatile _Type *__ptr, _Type *__ret, int __memorder, __thread_scope_system_tag) { uint32_t __tmp = 0; - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_load_acquire_32_system(__ptr, __tmp); break; - case __ATOMIC_RELAXED: __cuda_load_relaxed_32_system(__ptr, __tmp); break; -#else - case __ATOMIC_SEQ_CST: __cuda_membar_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_load_volatile_32_system(__ptr, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELAXED: __cuda_load_volatile_32_system(__ptr, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_load_acquire_32_system(__ptr, __tmp); break; + case __ATOMIC_RELAXED: __cuda_load_relaxed_32_system(__ptr, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_membar_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_load_volatile_32_system(__ptr, __tmp); __cuda_membar_system(); break; + case __ATOMIC_RELAXED: __cuda_load_volatile_32_system(__ptr, __tmp); break; + default: assert(0); + } + ) + ) memcpy(__ret, &__tmp, 4); } template static inline __device__ void __cuda_load_acquire_64_system(_CUDA_A __ptr, _CUDA_B& __dst) {asm volatile("ld.acquire.sys.b64 %0,[%1];" : "=l"(__dst) : "l"(__ptr) : "memory"); } @@ -1516,20 +1827,26 @@ template static inline __device__ void __cuda_load template::type = 0> __device__ void __atomic_load_cuda(const volatile _Type *__ptr, _Type *__ret, int __memorder, __thread_scope_system_tag) { uint64_t __tmp = 0; - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_load_acquire_64_system(__ptr, __tmp); break; - case __ATOMIC_RELAXED: __cuda_load_relaxed_64_system(__ptr, __tmp); break; -#else - case __ATOMIC_SEQ_CST: __cuda_membar_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_load_volatile_64_system(__ptr, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELAXED: __cuda_load_volatile_64_system(__ptr, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_load_acquire_64_system(__ptr, __tmp); break; + case __ATOMIC_RELAXED: __cuda_load_relaxed_64_system(__ptr, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_membar_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: 
__cuda_load_volatile_64_system(__ptr, __tmp); __cuda_membar_system(); break; + case __ATOMIC_RELAXED: __cuda_load_volatile_64_system(__ptr, __tmp); break; + default: assert(0); + } + ) + ) memcpy(__ret, &__tmp, 8); } template static inline __device__ void __cuda_store_relaxed_32_system(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.relaxed.sys.b32 [%0], %1;" :: "l"(__ptr),"r"(__src) : "memory"); } @@ -1539,18 +1856,24 @@ template::type __device__ void __atomic_store_cuda(volatile _Type *__ptr, _Type *__val, int __memorder, __thread_scope_system_tag) { uint32_t __tmp = 0; memcpy(&__tmp, __val, 4); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_RELEASE: __cuda_store_release_32_system(__ptr, __tmp); break; - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); - case __ATOMIC_RELAXED: __cuda_store_relaxed_32_system(__ptr, __tmp); break; -#else - case __ATOMIC_RELEASE: - case __ATOMIC_SEQ_CST: __cuda_membar_system(); - case __ATOMIC_RELAXED: __cuda_store_volatile_32_system(__ptr, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_RELEASE: __cuda_store_release_32_system(__ptr, __tmp); break; + case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); + case __ATOMIC_RELAXED: __cuda_store_relaxed_32_system(__ptr, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_RELEASE: + case __ATOMIC_SEQ_CST: __cuda_membar_system(); + case __ATOMIC_RELAXED: __cuda_store_volatile_32_system(__ptr, __tmp); break; + default: assert(0); + } + ) + ) } template static inline __device__ void __cuda_store_relaxed_64_system(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.relaxed.sys.b64 [%0], %1;" :: "l"(__ptr),"l"(__src) : "memory"); } template static inline __device__ void __cuda_store_release_64_system(_CUDA_A __ptr, _CUDA_B __src) { asm volatile("st.release.sys.b64 [%0], %1;" :: "l"(__ptr),"l"(__src) : "memory"); } @@ -1559,18 +1882,24 @@ template::type __device__ void __atomic_store_cuda(volatile _Type *__ptr, _Type *__val, int __memorder, __thread_scope_system_tag) { uint64_t __tmp = 0; memcpy(&__tmp, __val, 8); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_RELEASE: __cuda_store_release_64_system(__ptr, __tmp); break; - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); - case __ATOMIC_RELAXED: __cuda_store_relaxed_64_system(__ptr, __tmp); break; -#else - case __ATOMIC_RELEASE: - case __ATOMIC_SEQ_CST: __cuda_membar_system(); - case __ATOMIC_RELAXED: __cuda_store_volatile_64_system(__ptr, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_RELEASE: __cuda_store_release_64_system(__ptr, __tmp); break; + case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); + case __ATOMIC_RELAXED: __cuda_store_relaxed_64_system(__ptr, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_RELEASE: + case __ATOMIC_SEQ_CST: __cuda_membar_system(); + case __ATOMIC_RELAXED: __cuda_store_volatile_64_system(__ptr, __tmp); break; + default: assert(0); + } + ) + ) } template static inline __device__ void __cuda_compare_exchange_acq_rel_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acq_rel.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } template static inline __device__ void 
__cuda_compare_exchange_acquire_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.acquire.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } @@ -1583,24 +1912,30 @@ __device__ bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__e memcpy(&__tmp, __desired, 4); memcpy(&__old, __expected, 4); __old_tmp = __old; - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_32_system(__ptr, __old, __old_tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_32_system(__ptr, __old, __old_tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_32_system(__ptr, __old, __old_tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_32_system(__ptr, __old, __old_tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_32_system(__ptr, __old, __old_tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_compare_exchange_volatile_32_system(__ptr, __old, __old_tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_32_system(__ptr, __old, __old_tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_32_system(__ptr, __old, __old_tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_32_system(__ptr, __old, __old_tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_compare_exchange_release_32_system(__ptr, __old, __old_tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_32_system(__ptr, __old, __old_tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_32_system(__ptr, __old, __old_tmp, __tmp); __cuda_membar_system(); break; + case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_compare_exchange_volatile_32_system(__ptr, __old, __old_tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_32_system(__ptr, __old, __old_tmp, __tmp); break; + default: assert(0); + } + ) + ) bool const __ret = __old == __old_tmp; memcpy(__expected, &__old, 4); return __ret; @@ -1614,24 +1949,30 @@ template::type __device__ void __atomic_exchange_cuda(volatile _Type *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_system_tag) { uint32_t __tmp = 0; memcpy(&__tmp, __val, 4); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: 
__cuda_exchange_relaxed_32_system(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_exchange_volatile_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_32_system(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_32_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_32_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_exchange_release_32_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_exchange_relaxed_32_system(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; + case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_exchange_volatile_32_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_exchange_volatile_32_system(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(__ret, &__tmp, 4); } template static inline __device__ void __cuda_fetch_add_acq_rel_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acq_rel.sys.u32 %0,[%1],%2;" : "=r"(__dst) : "l"(__ptr),"r"(__op) : "memory"); } @@ -1644,24 +1985,30 @@ __device__ _Type __atomic_fetch_add_cuda(volatile _Type *__ptr, _Type __val, int _Type __ret; uint32_t __tmp = 0; memcpy(&__tmp, &__val, 4); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_32_system(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_add_volatile_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_32_system(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_32_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_32_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_add_release_32_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_32_system(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case 
__ATOMIC_ACQ_REL: __cuda_membar_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; + case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_add_volatile_32_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_32_system(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 4); return __ret; } @@ -1675,24 +2022,30 @@ __device__ _Type __atomic_fetch_and_cuda(volatile _Type *__ptr, _Type __val, int _Type __ret; uint32_t __tmp = 0; memcpy(&__tmp, &__val, 4); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_and_acquire_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_and_acq_rel_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_and_release_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_relaxed_32_system(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_and_volatile_32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_and_volatile_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_volatile_32_system(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_and_acquire_32_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_and_acq_rel_32_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_and_release_32_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_and_relaxed_32_system(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_and_volatile_32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; + case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_and_volatile_32_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_and_volatile_32_system(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 4); return __ret; } @@ -1706,24 +2059,30 @@ __device__ _Type __atomic_fetch_max_cuda(volatile _Type *__ptr, _Type __val, int _Type __ret; uint32_t __tmp = 0; memcpy(&__tmp, &__val, 4); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_32_system(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: 
__cuda_membar_system(); __cuda_fetch_max_volatile_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_32_system(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_32_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_32_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_max_release_32_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_32_system(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; + case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_max_volatile_32_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_32_system(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 4); return __ret; } @@ -1737,24 +2096,30 @@ __device__ _Type __atomic_fetch_min_cuda(volatile _Type *__ptr, _Type __val, int _Type __ret; uint32_t __tmp = 0; memcpy(&__tmp, &__val, 4); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_32_system(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_min_volatile_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_32_system(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_32_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_32_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_min_release_32_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_32_system(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; + case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_min_volatile_32_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_32_system(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 4); return __ret; } @@ -1768,24 +2133,30 @@ __device__ _Type 
__atomic_fetch_or_cuda(volatile _Type *__ptr, _Type __val, int _Type __ret; uint32_t __tmp = 0; memcpy(&__tmp, &__val, 4); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_or_acquire_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_or_acq_rel_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_or_release_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_relaxed_32_system(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_or_volatile_32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_or_volatile_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_volatile_32_system(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_or_acquire_32_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_or_acq_rel_32_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_or_release_32_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_or_relaxed_32_system(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_or_volatile_32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; + case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_or_volatile_32_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_or_volatile_32_system(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 4); return __ret; } @@ -1804,24 +2175,30 @@ __device__ _Type __atomic_fetch_sub_cuda(volatile _Type *__ptr, _Type __val, int _Type __ret; uint32_t __tmp = 0; memcpy(&__tmp, &__val, 4); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_32_system(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_sub_volatile_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_32_system(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_32_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: 
__cuda_fetch_sub_acq_rel_32_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_sub_release_32_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_32_system(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; + case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_sub_volatile_32_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_32_system(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 4); return __ret; } @@ -1835,24 +2212,30 @@ __device__ _Type __atomic_fetch_xor_cuda(volatile _Type *__ptr, _Type __val, int _Type __ret; uint32_t __tmp = 0; memcpy(&__tmp, &__val, 4); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_acquire_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_xor_acq_rel_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_xor_release_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_relaxed_32_system(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_volatile_32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_xor_volatile_32_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_volatile_32_system(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_xor_acquire_32_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_xor_acq_rel_32_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_xor_release_32_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_xor_relaxed_32_system(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_xor_volatile_32_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; + case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_xor_volatile_32_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_xor_volatile_32_system(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 4); return __ret; } @@ -1867,24 +2250,30 @@ __device__ bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__e memcpy(&__tmp, __desired, 8); memcpy(&__old, __expected, 8); __old_tmp = __old; - switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_64_system(__ptr, __old, __old_tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_64_system(__ptr, 
__old, __old_tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_compare_exchange_release_64_system(__ptr, __old, __old_tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_64_system(__ptr, __old, __old_tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_64_system(__ptr, __old, __old_tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_compare_exchange_volatile_64_system(__ptr, __old, __old_tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_64_system(__ptr, __old, __old_tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_64_system(__ptr, __old, __old_tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_64_system(__ptr, __old, __old_tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_compare_exchange_release_64_system(__ptr, __old, __old_tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_64_system(__ptr, __old, __old_tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_64_system(__ptr, __old, __old_tmp, __tmp); __cuda_membar_system(); break; + case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_compare_exchange_volatile_64_system(__ptr, __old, __old_tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_64_system(__ptr, __old, __old_tmp, __tmp); break; + default: assert(0); + } + ) + ) bool const __ret = __old == __old_tmp; memcpy(__expected, &__old, 8); return __ret; @@ -1898,24 +2287,30 @@ template::type __device__ void __atomic_exchange_cuda(volatile _Type *__ptr, _Type *__val, _Type *__ret, int __memorder, __thread_scope_system_tag) { uint64_t __tmp = 0; memcpy(&__tmp, __val, 8); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_exchange_acq_rel_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_exchange_release_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_relaxed_64_system(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_exchange_volatile_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_exchange_volatile_64_system(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_exchange_acquire_64_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: 
__cuda_exchange_acq_rel_64_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_exchange_release_64_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_exchange_relaxed_64_system(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_exchange_volatile_64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; + case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_exchange_volatile_64_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_exchange_volatile_64_system(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(__ret, &__tmp, 8); } template static inline __device__ void __cuda_fetch_add_acq_rel_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __op) { asm volatile("atom.add.acq_rel.sys.u64 %0,[%1],%2;" : "=l"(__dst) : "l"(__ptr),"l"(__op) : "memory"); } @@ -1928,24 +2323,30 @@ __device__ _Type __atomic_fetch_add_cuda(volatile _Type *__ptr, _Type __val, int _Type __ret; uint64_t __tmp = 0; memcpy(&__tmp, &__val, 8); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_64_system(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_add_volatile_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_64_system(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_64_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_64_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_add_release_64_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_64_system(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; + case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_add_volatile_64_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_64_system(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 8); return __ret; } @@ -1959,24 +2360,30 @@ __device__ _Type __atomic_fetch_and_cuda(volatile _Type *__ptr, _Type __val, int _Type __ret; uint64_t __tmp = 0; memcpy(&__tmp, &__val, 8); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: 
__cuda_fetch_and_acquire_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_and_acq_rel_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_and_release_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_relaxed_64_system(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_and_volatile_64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_and_volatile_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_and_volatile_64_system(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_and_acquire_64_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_and_acq_rel_64_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_and_release_64_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_and_relaxed_64_system(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_and_volatile_64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; + case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_and_volatile_64_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_and_volatile_64_system(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 8); return __ret; } @@ -1990,24 +2397,30 @@ __device__ _Type __atomic_fetch_max_cuda(volatile _Type *__ptr, _Type __val, int _Type __ret; uint64_t __tmp = 0; memcpy(&__tmp, &__val, 8); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_max_release_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_64_system(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_max_volatile_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_64_system(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_max_acquire_64_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_max_acq_rel_64_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_max_release_64_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_max_relaxed_64_system(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + 
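Note the deliberate fall-through in every NV_PROVIDES_SM_70 switch: __ATOMIC_SEQ_CST has no break, so it issues the sequentially consistent fence and then continues into the acquire-flavored access shared with __ATOMIC_CONSUME and __ATOMIC_ACQUIRE. A reduced, host-compilable sketch of that shape, assuming a compiler that predefines the __ATOMIC_* constants (as the surrounding code already does); the fence and load functions are placeholders for the PTX wrappers:

    #include <cassert>

    // Stand-ins for the PTX wrapper functions used in the patch.
    inline void fence_sc()     {}
    inline void load_acquire() {}
    inline void load_relaxed() {}

    inline void load_with_order(int order)
    {
        switch (order) {
        case __ATOMIC_SEQ_CST: fence_sc();      // no break: fall through
        case __ATOMIC_CONSUME:
        case __ATOMIC_ACQUIRE: load_acquire(); break;
        case __ATOMIC_RELAXED: load_relaxed(); break;
        default: assert(0);
        }
    }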
NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_max_volatile_64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; + case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_max_volatile_64_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_max_volatile_64_system(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 8); return __ret; } @@ -2021,24 +2434,30 @@ __device__ _Type __atomic_fetch_min_cuda(volatile _Type *__ptr, _Type __val, int _Type __ret; uint64_t __tmp = 0; memcpy(&__tmp, &__val, 8); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_min_release_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_64_system(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_min_volatile_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_64_system(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_min_acquire_64_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_min_acq_rel_64_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_min_release_64_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_min_relaxed_64_system(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_min_volatile_64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; + case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_min_volatile_64_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_min_volatile_64_system(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 8); return __ret; } @@ -2052,24 +2471,30 @@ __device__ _Type __atomic_fetch_or_cuda(volatile _Type *__ptr, _Type __val, int _Type __ret; uint64_t __tmp = 0; memcpy(&__tmp, &__val, 8); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_or_acquire_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_or_acq_rel_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_or_release_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_relaxed_64_system(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_or_volatile_64_system(__ptr, __tmp, 
__tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_or_volatile_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_or_volatile_64_system(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_or_acquire_64_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_or_acq_rel_64_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_or_release_64_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_or_relaxed_64_system(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_or_volatile_64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; + case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_or_volatile_64_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_or_volatile_64_system(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 8); return __ret; } @@ -2088,24 +2513,30 @@ __device__ _Type __atomic_fetch_sub_cuda(volatile _Type *__ptr, _Type __val, int _Type __ret; uint64_t __tmp = 0; memcpy(&__tmp, &__val, 8); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_sub_release_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_64_system(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_sub_volatile_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_64_system(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_sub_acquire_64_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_sub_acq_rel_64_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_sub_release_64_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_sub_relaxed_64_system(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_sub_volatile_64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; + case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_sub_volatile_64_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_sub_volatile_64_system(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 8); return __ret; 
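Each helper also round-trips the operand through a raw uint64_t (uint32_t in the 4-byte overloads) with memcpy, so the inline PTX only ever sees an integer register and no type-punning rules are broken; the fetched old value comes back through the same temporary. A self-contained sketch of that round trip using a plain, non-atomic stand-in for the PTX wrapper (fetch_add_relaxed_64 and fetch_add_bits are illustrative names):

    #include <cstdint>
    #include <cstring>

    // Non-atomic stand-in for a wrapper such as
    // __cuda_fetch_add_relaxed_64_device: returns the old value through dst.
    inline void fetch_add_relaxed_64(void* addr, uint64_t& dst, uint64_t op)
    {
        uint64_t old;
        std::memcpy(&old, addr, 8);
        uint64_t updated = old + op;
        std::memcpy(addr, &updated, 8);
        dst = old;
    }

    template <class T>
    T fetch_add_bits(T* ptr, T val)
    {
        static_assert(sizeof(T) == 8, "64-bit path only");
        uint64_t tmp = 0;
        std::memcpy(&tmp, &val, 8);        // T -> raw bits
        fetch_add_relaxed_64(ptr, tmp, tmp);
        T ret;
        std::memcpy(&ret, &tmp, 8);        // raw bits (old value) -> T
        return ret;
    }

The 32-bit overloads use the same shape with uint32_t and 4-byte copies.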
} @@ -2119,24 +2550,30 @@ __device__ _Type __atomic_fetch_xor_cuda(volatile _Type *__ptr, _Type __val, int _Type __ret; uint64_t __tmp = 0; memcpy(&__tmp, &__val, 8); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_acquire_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_xor_acq_rel_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_xor_release_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_relaxed_64_system(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_xor_volatile_64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_xor_volatile_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_xor_volatile_64_system(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_xor_acquire_64_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_xor_acq_rel_64_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_xor_release_64_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_xor_relaxed_64_system(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_xor_volatile_64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; + case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_xor_volatile_64_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_xor_volatile_64_system(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 8); return __ret; } @@ -2146,24 +2583,29 @@ __device__ _Type* __atomic_fetch_add_cuda(_Type *volatile *__ptr, ptrdiff_t __va uint64_t __tmp = 0; memcpy(&__tmp, &__val, 8); __tmp *= sizeof(_Type); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_64_system(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_add_volatile_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_64_system(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_64_system(__ptr, 
__tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_64_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_add_release_64_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_64_system(__ptr, __tmp, __tmp); break; + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; + case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_add_volatile_64_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_64_system(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 8); return __ret; } @@ -2174,25 +2616,29 @@ __device__ _Type* __atomic_fetch_sub_cuda(_Type *volatile *__ptr, ptrdiff_t __va memcpy(&__tmp, &__val, 8); __tmp = -__tmp; __tmp *= sizeof(_Type); - switch (__memorder) { -#if __CUDA_ARCH__ >= 700 - case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELEASE: __cuda_fetch_add_release_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_64_system(__ptr, __tmp, __tmp); break; -#else - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: __cuda_membar_system(); - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; - case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_add_volatile_64_system(__ptr, __tmp, __tmp); break; - case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_64_system(__ptr, __tmp, __tmp); break; -#endif // __CUDA_ARCH__ >= 700 - default: assert(0); - } + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: __cuda_fence_sc_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_add_acquire_64_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_ACQ_REL: __cuda_fetch_add_acq_rel_64_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELEASE: __cuda_fetch_add_release_64_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_add_relaxed_64_system(__ptr, __tmp, __tmp); break; + } + ), + NV_IS_DEVICE, ( + switch (__memorder) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: __cuda_membar_system(); + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: __cuda_fetch_add_volatile_64_system(__ptr, __tmp, __tmp); __cuda_membar_system(); break; + case __ATOMIC_RELEASE: __cuda_membar_system(); __cuda_fetch_add_volatile_64_system(__ptr, __tmp, __tmp); break; + case __ATOMIC_RELAXED: __cuda_fetch_add_volatile_64_system(__ptr, __tmp, __tmp); break; + default: assert(0); + } + ) + ) memcpy(&__ret, &__tmp, 8); return __ret; } - From 09d6ac64b54f90c665a273192382aa68af21499e Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Fri, 23 Jul 2021 19:24:52 -0700 Subject: [PATCH 20/34] Pickup more nv/target specializations from nvcxx_compatibility branch --- include/cuda/std/atomic | 13 +++-- libcxx/include/__threading_support | 16 +++--- libcxx/include/support/atomic/atomic_cuda.h | 29 ++++++----- .../support/atomic/atomic_cuda_derived.h | 52 ++++++++++++------- 4 files changed, 64 insertions(+), 46 deletions(-) diff --git a/include/cuda/std/atomic 
b/include/cuda/std/atomic index 1274e548f5..cb918b7c24 100644 --- a/include/cuda/std/atomic +++ b/include/cuda/std/atomic @@ -193,11 +193,14 @@ inline __host__ __device__ void atomic_thread_fence(memory_order __m, thread_sco } inline __host__ __device__ void atomic_signal_fence(memory_order __m) { -#ifdef __CUDA_ARCH__ - detail::__atomic_signal_fence_cuda((int)__m); -#else - ::std::atomic_signal_fence((::std::memory_order)__m); -#endif + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + detail::__atomic_signal_fence_cuda((int)__m); + ), + NV_IS_HOST, ( + ::std::atomic_signal_fence((::std::memory_order)__m); + ) + ) } _LIBCUDACXX_END_NAMESPACE_CUDA diff --git a/libcxx/include/__threading_support b/libcxx/include/__threading_support index ae03791f40..9489f6a6bd 100644 --- a/libcxx/include/__threading_support +++ b/libcxx/include/__threading_support @@ -84,7 +84,7 @@ inline void __libcpp_thread_yield_processor() #else # define __LIBCUDACXX_ASM_THREAD_YIELD (;) #endif - NV_DISPATCH_TARGET( + NV_IF_TARGET( NV_IS_HOST, __LIBCUDACXX_ASM_THREAD_YIELD ) @@ -295,13 +295,13 @@ void __libcpp_thread_yield() {} _LIBCUDACXX_THREAD_ABI_VISIBILITY void __libcpp_thread_sleep_for(chrono::nanoseconds __ns) { -#if __CUDA_ARCH__ >= 700 - auto const __step = __ns.count(); - assert(__step < numeric_limits<unsigned>::max()); - asm volatile("nanosleep.u32 %0;"::"r"((unsigned)__step):); -#else - ; -#endif + NV_IF_TARGET( + NV_IS_DEVICE, ( + auto const __step = __ns.count(); + assert(__step < numeric_limits<unsigned>::max()); + asm volatile("nanosleep.u32 %0;"::"r"((unsigned)__step):); + ) + ) } #elif defined(_LIBCUDACXX_HAS_THREAD_API_PTHREAD) diff --git a/libcxx/include/support/atomic/atomic_cuda.h b/libcxx/include/support/atomic/atomic_cuda.h index c61471a9af..59ed27ebef 100644 --- a/libcxx/include/support/atomic/atomic_cuda.h +++ b/libcxx/include/support/atomic/atomic_cuda.h @@ -422,19 +422,22 @@ __host__ __device__ inline _Tp __cxx_atomic_exchange(__cxx_atomic_base_small_imp } __host__ __device__ inline int __cuda_memcmp(void const * __lhs, void const * __rhs, size_t __count) { -#ifdef __CUDA_ARCH__ - auto __lhs_c = reinterpret_cast<unsigned char const *>(__lhs); - auto __rhs_c = reinterpret_cast<unsigned char const *>(__rhs); - while (__count--) { - auto const __lhs_v = *__lhs_c++; - auto const __rhs_v = *__rhs_c++; - if (__lhs_v < __rhs_v) { return -1; } - if (__lhs_v > __rhs_v) { return 1; } - } - return 0; -#else - return memcmp(__lhs, __rhs, __count); -#endif + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + auto __lhs_c = reinterpret_cast<unsigned char const *>(__lhs); + auto __rhs_c = reinterpret_cast<unsigned char const *>(__rhs); + while (__count--) { + auto const __lhs_v = *__lhs_c++; + auto const __rhs_v = *__rhs_c++; + if (__lhs_v < __rhs_v) { return -1; } + if (__lhs_v > __rhs_v) { return 1; } + } + return 0; + ), + NV_IS_HOST, ( + return memcmp(__lhs, __rhs, __count); + ) + ) } template diff --git a/libcxx/include/support/atomic/atomic_cuda_derived.h b/libcxx/include/support/atomic/atomic_cuda_derived.h index f0cbcdfd75..7c005e423e 100644 --- a/libcxx/include/support/atomic/atomic_cuda_derived.h +++ b/libcxx/include/support/atomic/atomic_cuda_derived.h @@ -52,16 +52,22 @@ _Type __host__ __device__ __atomic_fetch_max_cuda(_Type volatile *__ptr, _Delta _Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s); _Type __desired = __expected > __val ? 
__expected : __val; -#ifdef __CUDA_ARCH__ - while(__desired == __val && - !__atomic_compare_exchange_cuda(__ptr, &__expected, &__desired, true, __memorder, __memorder, __s)) { -#else - while(__desired == __val && - !__atomic_compare_exchange(__ptr, &__expected, &__desired, true, __memorder, __memorder)) { -#endif - __desired = __expected > __val ? __expected : __val; - } - return __expected; + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + while(__desired == __val && + !__atomic_compare_exchange_cuda(__ptr, &__expected, &__desired, true, __memorder, __memorder, __s)) { + __desired = __expected > __val ? __expected : __val; + } + return __expected; + ), + NV_IS_HOST, ( + while(__desired == __val && + !__atomic_compare_exchange(__ptr, &__expected, &__desired, true, __memorder, __memorder)) { + __desired = __expected > __val ? __expected : __val; + } + return __expected; + ) + ) } template::type = 0> @@ -69,16 +75,22 @@ _Type __host__ __device__ __atomic_fetch_min_cuda(_Type volatile *__ptr, _Delta _Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s); _Type __desired = __expected < __val ? __expected : __val; -#ifdef __CUDA_ARCH__ - while(__desired != __val && - !__atomic_compare_exchange_cuda(__ptr, &__expected, &__desired, true, __memorder, __memorder, __s)) { -#else - while(__desired != __val && - !__atomic_compare_exchange(__ptr, &__expected, &__desired, true, __memorder, __memorder)) { -#endif - __desired = __expected < __val ? __expected : __val; - } - return __expected; + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + while(__desired != __val && + !__atomic_compare_exchange_cuda(__ptr, &__expected, &__desired, true, __memorder, __memorder, __s)) { + __desired = __expected < __val ? __expected : __val; + } + return __expected; + ), + NV_IS_HOST, ( + while(__desired != __val && + !__atomic_compare_exchange(__ptr, &__expected, &__desired, true, __memorder, __memorder)) { + __desired = __expected < __val ? 
__expected : __val; + } + return __expected; + ) + ) } template::type = 0> From 2ddf98f263be636ec6864e919329f81987f6882d Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Mon, 26 Jul 2021 17:39:38 -0700 Subject: [PATCH 21/34] Uglify the atomic detail:: and host:: namespaces --- include/cuda/std/atomic | 40 ++++++------ libcxx/include/atomic | 30 ++++----- libcxx/include/support/atomic/atomic_cuda.h | 68 ++++++++++----------- 3 files changed, 66 insertions(+), 72 deletions(-) diff --git a/include/cuda/std/atomic b/include/cuda/std/atomic index cb918b7c24..c738d48fef 100644 --- a/include/cuda/std/atomic +++ b/include/cuda/std/atomic @@ -53,18 +53,18 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA -using std::detail::thread_scope; -using std::detail::thread_scope_system; -using std::detail::thread_scope_device; -using std::detail::thread_scope_block; -using std::detail::thread_scope_thread; - -namespace detail { -using std::detail::__thread_scope_block_tag; -using std::detail::__thread_scope_device_tag; -using std::detail::__thread_scope_system_tag; -using std::detail::__atomic_signal_fence_cuda; -using std::detail::__atomic_thread_fence_cuda; +using std::__detail::thread_scope; +using std::__detail::thread_scope_system; +using std::__detail::thread_scope_device; +using std::__detail::thread_scope_block; +using std::__detail::thread_scope_thread; + +namespace __detail { +using std::__detail::__thread_scope_block_tag; +using std::__detail::__thread_scope_device_tag; +using std::__detail::__thread_scope_system_tag; +using std::__detail::__atomic_signal_fence_cuda; +using std::__detail::__atomic_thread_fence_cuda; } using memory_order = std::memory_order; @@ -98,15 +98,15 @@ struct atomic __host__ __device__ _Tp fetch_max(const _Tp & __op, memory_order __m = memory_order_seq_cst) volatile noexcept { - return std::detail::__atomic_fetch_max_cuda(&this->__a_.__a_value, __op, - __m, std::detail::__scope_tag<_Sco>()); + return std::__detail::__atomic_fetch_max_cuda(&this->__a_.__a_value, __op, + __m, std::__detail::__scope_tag<_Sco>()); } __host__ __device__ _Tp fetch_min(const _Tp & __op, memory_order __m = memory_order_seq_cst) volatile noexcept { - return std::detail::__atomic_fetch_min_cuda(&this->__a_.__a_value, __op, - __m, std::detail::__scope_tag<_Sco>()); + return std::__detail::__atomic_fetch_min_cuda(&this->__a_.__a_value, __op, + __m, std::__detail::__scope_tag<_Sco>()); } }; @@ -175,13 +175,13 @@ inline __host__ __device__ void atomic_thread_fence(memory_order __m, thread_sco NV_IS_DEVICE, ( switch(_Scope) { case thread_scope::thread_scope_system: - detail::__atomic_thread_fence_cuda((int)__m, detail::__thread_scope_system_tag()); + __detail::__atomic_thread_fence_cuda((int)__m, __detail::__thread_scope_system_tag()); break; case thread_scope::thread_scope_device: - detail::__atomic_thread_fence_cuda((int)__m, detail::__thread_scope_device_tag()); + __detail::__atomic_thread_fence_cuda((int)__m, __detail::__thread_scope_device_tag()); break; case thread_scope::thread_scope_block: - detail::__atomic_thread_fence_cuda((int)__m, detail::__thread_scope_block_tag()); + __detail::__atomic_thread_fence_cuda((int)__m, __detail::__thread_scope_block_tag()); break; } ), @@ -195,7 +195,7 @@ inline __host__ __device__ void atomic_thread_fence(memory_order __m, thread_sco inline __host__ __device__ void atomic_signal_fence(memory_order __m) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - detail::__atomic_signal_fence_cuda((int)__m); + __detail::__atomic_signal_fence_cuda((int)__m); ), NV_IS_HOST, ( 
::std::atomic_signal_fence((::std::memory_order)__m); diff --git a/libcxx/include/atomic b/libcxx/include/atomic index 406f0ec22b..d20ebf4945 100644 --- a/libcxx/include/atomic +++ b/libcxx/include/atomic @@ -671,7 +671,7 @@ __cxx_atomic_assign_volatile(_Tp volatile& __a_value, _Tv volatile const& __val) #endif // Headers are wrapped like so: (cuda::std::|std::)detail -namespace detail { +namespace __detail { #if defined(_LIBCUDACXX_HAS_CUDA_ATOMIC_IMPL) # include "support/atomic/atomic_cuda.h" #elif defined(_LIBCUDACXX_HAS_MSVC_ATOMIC_IMPL) @@ -684,20 +684,20 @@ namespace detail { #endif // _LIBCUDACXX_HAS_GCC_ATOMIC_IMP, _LIBCUDACXX_HAS_C_ATOMIC_IMP } -using detail::__cxx_atomic_base_impl; -using detail::__cxx_atomic_thread_fence; -using detail::__cxx_atomic_signal_fence; -using detail::__cxx_atomic_load; -using detail::__cxx_atomic_store; -using detail::__cxx_atomic_exchange; -using detail::__cxx_atomic_compare_exchange_weak; -using detail::__cxx_atomic_compare_exchange_strong; -using detail::__cxx_atomic_fetch_add; -using detail::__cxx_atomic_fetch_sub; -using detail::__cxx_atomic_fetch_or; -using detail::__cxx_atomic_fetch_and; -using detail::__cxx_atomic_fetch_xor; -using detail::__cxx_atomic_is_lock_free; +using __detail::__cxx_atomic_base_impl; +using __detail::__cxx_atomic_thread_fence; +using __detail::__cxx_atomic_signal_fence; +using __detail::__cxx_atomic_load; +using __detail::__cxx_atomic_store; +using __detail::__cxx_atomic_exchange; +using __detail::__cxx_atomic_compare_exchange_weak; +using __detail::__cxx_atomic_compare_exchange_strong; +using __detail::__cxx_atomic_fetch_add; +using __detail::__cxx_atomic_fetch_sub; +using __detail::__cxx_atomic_fetch_or; +using __detail::__cxx_atomic_fetch_and; +using __detail::__cxx_atomic_fetch_xor; +using __detail::__cxx_atomic_is_lock_free; template _LIBCUDACXX_INLINE_VISIBILITY diff --git a/libcxx/include/support/atomic/atomic_cuda.h b/libcxx/include/support/atomic/atomic_cuda.h index 59ed27ebef..ea1c667272 100644 --- a/libcxx/include/support/atomic/atomic_cuda.h +++ b/libcxx/include/support/atomic/atomic_cuda.h @@ -93,7 +93,7 @@ _LIBCUDACXX_INLINE_VISIBILITY auto constexpr __scope_tag() -> // END TODO // Wrap host atomic implementations into a sub-namespace -namespace host { +namespace __host { #if defined(_LIBCUDACXX_COMPILER_MSVC) # include "atomic_msvc.h" #elif defined (_LIBCUDACXX_HAS_GCC_ATOMIC_IMP) @@ -107,12 +107,6 @@ namespace host { #include "atomic_cuda_generated.h" #include "atomic_cuda_derived.h" -template -struct __skip_amt { enum {value = 1}; }; - -template -struct __skip_amt<_Tp*> { enum {value = sizeof(_Tp)}; }; - _LIBCUDACXX_INLINE_VISIBILITY bool __cxx_atomic_is_lock_free(size_t __x) { return __x <= 8; @@ -122,10 +116,10 @@ _LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_thread_fence(memory_order __order) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - detail::__atomic_thread_fence_cuda(__order, detail::__thread_scope_system_tag()); + __atomic_thread_fence_cuda(__order, __thread_scope_system_tag()); ), NV_IS_HOST, ( - host::__cxx_atomic_thread_fence(__order); + __host::__cxx_atomic_thread_fence(__order); ) ) } @@ -134,10 +128,10 @@ _LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_signal_fence(memory_order __order) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - detail::__atomic_signal_fence_cuda(__order); + __atomic_signal_fence_cuda(__order); ), NV_IS_HOST, ( - host::__cxx_atomic_signal_fence(__order); + __host::__cxx_atomic_signal_fence(__order); ) ) } @@ -145,8 +139,8 @@ _LIBCUDACXX_INLINE_VISIBILITY template using 
__cxx_atomic_base_heterogeneous_storage = typename conditional<_Ref, - host::__cxx_atomic_ref_base_impl<_Tp, _Sco>, - host::__cxx_atomic_base_impl<_Tp, _Sco> >::type; + __host::__cxx_atomic_ref_base_impl<_Tp, _Sco>, + __host::__cxx_atomic_base_impl<_Tp, _Sco> >::type; template @@ -224,10 +218,10 @@ __host__ __device__ alignas(_Tp) auto __tmp = __val; NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - detail::__atomic_store_n_cuda(__a->__get_device(), __tmp, __order, detail::__scope_tag<_Sco>()); + __atomic_store_n_cuda(__a->__get_device(), __tmp, __order, __scope_tag<_Sco>()); ), NV_IS_HOST, ( - host::__cxx_atomic_store(__a->__get_host(), __tmp, __order); + __host::__cxx_atomic_store(__a->__get_host(), __tmp, __order); ) ) } @@ -237,10 +231,10 @@ __host__ __device__ _Tp __cxx_atomic_load(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> const volatile* __a, memory_order __order) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return detail::__atomic_load_n_cuda(__a->__get_device(), __order, detail::__scope_tag<_Sco>()); + return __atomic_load_n_cuda(__a->__get_device(), __order, __scope_tag<_Sco>()); ), NV_IS_HOST, ( - return host::__cxx_atomic_load(__a->__get_host(), __order); + return __host::__cxx_atomic_load(__a->__get_host(), __order); ) ) } @@ -251,10 +245,10 @@ __host__ __device__ alignas(_Tp) auto __tmp = __val; NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return detail::__atomic_exchange_n_cuda(__a->__get_device(), __tmp, __order, detail::__scope_tag<_Sco>()); + return __atomic_exchange_n_cuda(__a->__get_device(), __tmp, __order, __scope_tag<_Sco>()); ), NV_IS_HOST, ( - return host::__cxx_atomic_exchange(__a->__get_host(), __tmp, __order); + return __host::__cxx_atomic_exchange(__a->__get_host(), __tmp, __order); ) ) } @@ -267,10 +261,10 @@ __host__ __device__ NV_DISPATCH_TARGET( NV_IS_DEVICE, ( alignas(_Tp) auto __tmp_v = __val; - __result = detail::__atomic_compare_exchange_cuda(__a->__get_device(), &__tmp, &__tmp_v, false, __success, __failure, detail::__scope_tag<_Sco>()); + __result = __atomic_compare_exchange_cuda(__a->__get_device(), &__tmp, &__tmp_v, false, __success, __failure, __scope_tag<_Sco>()); ), NV_IS_HOST, ( - __result = host::__cxx_atomic_compare_exchange_strong(__a->__get_host(), &__tmp, __val, __success, __failure); + __result = __host::__cxx_atomic_compare_exchange_strong(__a->__get_host(), &__tmp, __val, __success, __failure); ) ) *__expected = __tmp; @@ -285,10 +279,10 @@ __host__ __device__ NV_DISPATCH_TARGET( NV_IS_DEVICE, ( alignas(_Tp) auto __tmp_v = __val; - __result = detail::__atomic_compare_exchange_cuda(__a->__get_device(), &__tmp, &__tmp_v, true, __success, __failure, detail::__scope_tag<_Sco>()); + __result = __atomic_compare_exchange_cuda(__a->__get_device(), &__tmp, &__tmp_v, true, __success, __failure, __scope_tag<_Sco>()); ), NV_IS_HOST, ( - __result = host::__cxx_atomic_compare_exchange_weak(__a->__get_host(), &__tmp, __val, __success, __failure); + __result = __host::__cxx_atomic_compare_exchange_weak(__a->__get_host(), &__tmp, __val, __success, __failure); ) ) *__expected = __tmp; @@ -300,10 +294,10 @@ __host__ __device__ _Tp __cxx_atomic_fetch_add(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __delta, memory_order __order) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return detail::__atomic_fetch_add_cuda(__a->__get_device(), __delta, __order, detail::__scope_tag<_Sco>()); + return __atomic_fetch_add_cuda(__a->__get_device(), __delta, __order, __scope_tag<_Sco>()); ), NV_IS_HOST, ( - return host::__cxx_atomic_fetch_add(__a->__get_host(), 
__delta, __order); + return __host::__cxx_atomic_fetch_add(__a->__get_host(), __delta, __order); ) ) } @@ -313,10 +307,10 @@ __host__ __device__ _Tp* __cxx_atomic_fetch_add(__cxx_atomic_base_heterogeneous_impl<_Tp*, _Sco, _Ref> volatile* __a, ptrdiff_t __delta, memory_order __order) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return detail::__atomic_fetch_add_cuda(__a->__get_device(), __delta, __order, detail::__scope_tag<_Sco>()); + return __atomic_fetch_add_cuda(__a->__get_device(), __delta, __order, __scope_tag<_Sco>()); ), NV_IS_HOST, ( - return host::__cxx_atomic_fetch_add(__a->__get_host(), __delta, __order); + return __host::__cxx_atomic_fetch_add(__a->__get_host(), __delta, __order); ) ) } @@ -326,10 +320,10 @@ __host__ __device__ _Tp __cxx_atomic_fetch_sub(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __delta, memory_order __order) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return detail::__atomic_fetch_sub_cuda(__a->__get_device(), __delta, __order, detail::__scope_tag<_Sco>()); + return __atomic_fetch_sub_cuda(__a->__get_device(), __delta, __order, __scope_tag<_Sco>()); ), NV_IS_HOST, ( - return host::__cxx_atomic_fetch_sub(__a->__get_host(), __delta, __order); + return __host::__cxx_atomic_fetch_sub(__a->__get_host(), __delta, __order); ) ) } @@ -339,10 +333,10 @@ __host__ __device__ _Tp* __cxx_atomic_fetch_sub(__cxx_atomic_base_heterogeneous_impl<_Tp*, _Sco, _Ref> volatile* __a, ptrdiff_t __delta, memory_order __order) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return detail::__atomic_fetch_sub_cuda(__a->__get_device(), __delta, __order, detail::__scope_tag<_Sco>()); + return __atomic_fetch_sub_cuda(__a->__get_device(), __delta, __order, __scope_tag<_Sco>()); ), NV_IS_HOST, ( - return host::__cxx_atomic_fetch_sub(__a->__get_host(), __delta, __order); + return __host::__cxx_atomic_fetch_sub(__a->__get_host(), __delta, __order); ) ) } @@ -352,10 +346,10 @@ __host__ __device__ _Tp __cxx_atomic_fetch_and(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __pattern, memory_order __order) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return detail::__atomic_fetch_and_cuda(__a->__get_device(), __pattern, __order, detail::__scope_tag<_Sco>()); + return __atomic_fetch_and_cuda(__a->__get_device(), __pattern, __order, __scope_tag<_Sco>()); ), NV_IS_HOST, ( - return host::__cxx_atomic_fetch_and(__a->__get_host(), __pattern, __order); + return __host::__cxx_atomic_fetch_and(__a->__get_host(), __pattern, __order); ) ) } @@ -365,10 +359,10 @@ __host__ __device__ _Tp __cxx_atomic_fetch_or(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __pattern, memory_order __order) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return detail::__atomic_fetch_or_cuda(__a->__get_device(), __pattern, __order, detail::__scope_tag<_Sco>()); + return __atomic_fetch_or_cuda(__a->__get_device(), __pattern, __order, __scope_tag<_Sco>()); ), NV_IS_HOST, ( - return host::__cxx_atomic_fetch_or(__a->__get_host(), __pattern, __order); + return __host::__cxx_atomic_fetch_or(__a->__get_host(), __pattern, __order); ) ) } @@ -378,10 +372,10 @@ __host__ __device__ _Tp __cxx_atomic_fetch_xor(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __pattern, memory_order __order) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return detail::__atomic_fetch_xor_cuda(__a->__get_device(), __pattern, __order, detail::__scope_tag<_Sco>()); + return __atomic_fetch_xor_cuda(__a->__get_device(), __pattern, __order, __scope_tag<_Sco>()); ), NV_IS_HOST, ( - return 
host::__cxx_atomic_fetch_xor(__a->__get_host(), __pattern, __order); + return __host::__cxx_atomic_fetch_xor(__a->__get_host(), __pattern, __order); ) ) } From 6f43f61528f66a425c09d8cbed8ee557b13037a3 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Mon, 26 Jul 2021 17:42:15 -0700 Subject: [PATCH 22/34] Rename __skip_amt to __atomic_ptr_inc --- libcxx/include/support/atomic/atomic_base.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/libcxx/include/support/atomic/atomic_base.h b/libcxx/include/support/atomic/atomic_base.h index 29535cfcbc..763f32c850 100644 --- a/libcxx/include/support/atomic/atomic_base.h +++ b/libcxx/include/support/atomic/atomic_base.h @@ -183,22 +183,22 @@ inline bool __cxx_atomic_compare_exchange_weak( } template -struct __skip_amt { enum {value = 1}; }; +struct __atomic_ptr_inc { enum {value = 1}; }; template -struct __skip_amt<_Tp*> { enum {value = sizeof(_Tp)}; }; +struct __atomic_ptr_inc<_Tp*> { enum {value = sizeof(_Tp)}; }; // FIXME: Haven't figured out what the spec says about using arrays with // atomic_fetch_add. Force a failure rather than creating bad behavior. template -struct __skip_amt<_Tp[]> { }; +struct __atomic_ptr_inc<_Tp[]> { }; template -struct __skip_amt<_Tp[n]> { }; +struct __atomic_ptr_inc<_Tp[n]> { }; template inline auto __cxx_atomic_fetch_add(_Tp* __a, _Td __delta, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { - constexpr auto __skip_v = __skip_amt<__cxx_atomic_underlying_t<_Tp>>::value; + constexpr auto __skip_v = __atomic_ptr_inc<__cxx_atomic_underlying_t<_Tp>>::value; auto __a_tmp = __cxx_atomic_base_unwrap(__a); return __atomic_fetch_add(__a_tmp, __delta * __skip_v, __to_gcc_order(__order)); @@ -207,7 +207,7 @@ inline auto __cxx_atomic_fetch_add(_Tp* __a, _Td __delta, template inline auto __cxx_atomic_fetch_sub(_Tp* __a, _Td __delta, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { - constexpr auto __skip_v = __skip_amt<__cxx_atomic_underlying_t<_Tp>>::value; + constexpr auto __skip_v = __atomic_ptr_inc<__cxx_atomic_underlying_t<_Tp>>::value; auto __a_tmp = __cxx_atomic_base_unwrap(__a); return __atomic_fetch_sub(__a_tmp, __delta * __skip_v, __to_gcc_order(__order)); From 3996fb939805c09027844797720f1b5cfe11091f Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Tue, 27 Jul 2021 16:37:02 -0700 Subject: [PATCH 23/34] Refactor and dedup some code in the __cxx_atomic cuda layer, fix runtime lock free check --- libcxx/include/__config | 4 +- libcxx/include/support/atomic/atomic_cuda.h | 98 ++++++++++----------- 2 files changed, 50 insertions(+), 52 deletions(-) diff --git a/libcxx/include/__config b/libcxx/include/__config index 64c93f4bb2..70e1774c1a 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -1606,14 +1606,14 @@ _LIBCUDACXX_FUNC_VIS extern "C" void __sanitizer_annotate_contiguous_container( // # define _LIBCUDACXX_HAS_C_ATOMIC_IMP #if defined(_LIBCUDACXX_COMPILER_CLANG) # define _LIBCUDACXX_HAS_GCC_ATOMIC_IMP -# define _LIBCUDACXX_NO_RUNTIME_LOCK_FREE #elif defined(_LIBCUDACXX_COMPILER_GCC) # define _LIBCUDACXX_HAS_GCC_ATOMIC_IMP #elif defined(_LIBCUDACXX_COMPILER_MSVC) # define _LIBCUDACXX_HAS_MSVC_ATOMIC_IMPL -# define _LIBCUDACXX_NO_RUNTIME_LOCK_FREE #endif +#define _LIBCUDACXX_NO_RUNTIME_LOCK_FREE + // CUDA Atomics supersede host atomics in order to insert the host/device dispatch layer #if defined(_LIBCUDACXX_COMPILER_NVCC) || defined(_LIBCUDACXX_COMPILER_PGI) # define _LIBCUDACXX_HAS_CUDA_ATOMIC_IMPL diff --git a/libcxx/include/support/atomic/atomic_cuda.h 
b/libcxx/include/support/atomic/atomic_cuda.h index ea1c667272..b022147bde 100644 --- a/libcxx/include/support/atomic/atomic_cuda.h +++ b/libcxx/include/support/atomic/atomic_cuda.h @@ -151,33 +151,31 @@ struct __cxx_atomic_base_heterogeneous_impl { } __cxx_atomic_base_heterogeneous_storage<_Tp, _Sco, _Ref> __a_value; +}; - _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR - auto __get_device() const volatile _NOEXCEPT -> decltype(__cxx_atomic_base_unwrap(&__a_value)) { - return __cxx_atomic_base_unwrap(&__a_value); - } - _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR - auto __get_device() volatile _NOEXCEPT -> decltype(__cxx_atomic_base_unwrap(&__a_value)) { - return __cxx_atomic_base_unwrap(&__a_value); - } - _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR - auto __get_device() const _NOEXCEPT -> decltype(__cxx_atomic_base_unwrap(&__a_value)) { - return __cxx_atomic_base_unwrap(&__a_value); - } +template +_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR +_Tp* __cxx_get_underlying_device_atomic(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> * __a) _NOEXCEPT { + return __cxx_atomic_base_unwrap(&__a->__a_value); +} - _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR - auto __get_host() const volatile _NOEXCEPT -> decltype(&__a_value) { - return &__a_value; - } - _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR - auto __get_host() volatile _NOEXCEPT -> decltype(&__a_value) { - return &__a_value; - } - _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR - auto __get_host() const _NOEXCEPT -> decltype(&__a_value) { - return &__a_value; - } -}; +template +_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR +volatile _Tp* __cxx_get_underlying_device_atomic(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a) _NOEXCEPT { + return __cxx_atomic_base_unwrap(&__a->__a_value); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR +const _Tp* __cxx_get_underlying_device_atomic(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> const* __a) _NOEXCEPT { + return __cxx_atomic_base_unwrap(&__a->__a_value); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR +const volatile _Tp* __cxx_get_underlying_device_atomic(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> const volatile* __a) _NOEXCEPT { + return __cxx_atomic_base_unwrap(&__a->__a_value); +} template struct __cxx_atomic_base_small_impl { @@ -209,7 +207,7 @@ template __host__ __device__ void __cxx_atomic_init(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __val) { alignas(_Tp) auto __tmp = __val; - __cxx_atomic_assign_volatile(*__a->__get_device(), __tmp); + __cxx_atomic_assign_volatile(*__cxx_get_underlying_device_atomic(__a), __tmp); } template @@ -218,10 +216,10 @@ __host__ __device__ alignas(_Tp) auto __tmp = __val; NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - __atomic_store_n_cuda(__a->__get_device(), __tmp, __order, __scope_tag<_Sco>()); + __atomic_store_n_cuda(__cxx_get_underlying_device_atomic(__a), __tmp, __order, __scope_tag<_Sco>()); ), NV_IS_HOST, ( - __host::__cxx_atomic_store(__a->__get_host(), __tmp, __order); + __host::__cxx_atomic_store(&__a->__a_value, __tmp, __order); ) ) } @@ -231,10 +229,10 @@ __host__ __device__ _Tp __cxx_atomic_load(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> const volatile* __a, memory_order __order) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return __atomic_load_n_cuda(__a->__get_device(), __order, __scope_tag<_Sco>()); + return __atomic_load_n_cuda(__cxx_get_underlying_device_atomic(__a), __order, 
__scope_tag<_Sco>()); ), NV_IS_HOST, ( - return __host::__cxx_atomic_load(__a->__get_host(), __order); + return __host::__cxx_atomic_load(&__a->__a_value, __order); ) ) } @@ -245,10 +243,10 @@ __host__ __device__ alignas(_Tp) auto __tmp = __val; NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return __atomic_exchange_n_cuda(__a->__get_device(), __tmp, __order, __scope_tag<_Sco>()); + return __atomic_exchange_n_cuda(__cxx_get_underlying_device_atomic(__a), __tmp, __order, __scope_tag<_Sco>()); ), NV_IS_HOST, ( - return __host::__cxx_atomic_exchange(__a->__get_host(), __tmp, __order); + return __host::__cxx_atomic_exchange(&__a->__a_value, __tmp, __order); ) ) } @@ -261,10 +259,10 @@ __host__ __device__ NV_DISPATCH_TARGET( NV_IS_DEVICE, ( alignas(_Tp) auto __tmp_v = __val; - __result = __atomic_compare_exchange_cuda(__a->__get_device(), &__tmp, &__tmp_v, false, __success, __failure, __scope_tag<_Sco>()); + __result = __atomic_compare_exchange_cuda(__cxx_get_underlying_device_atomic(__a), &__tmp, &__tmp_v, false, __success, __failure, __scope_tag<_Sco>()); ), NV_IS_HOST, ( - __result = __host::__cxx_atomic_compare_exchange_strong(__a->__get_host(), &__tmp, __val, __success, __failure); + __result = __host::__cxx_atomic_compare_exchange_strong(&__a->__a_value, &__tmp, __val, __success, __failure); ) ) *__expected = __tmp; @@ -279,10 +277,10 @@ __host__ __device__ NV_DISPATCH_TARGET( NV_IS_DEVICE, ( alignas(_Tp) auto __tmp_v = __val; - __result = __atomic_compare_exchange_cuda(__a->__get_device(), &__tmp, &__tmp_v, true, __success, __failure, __scope_tag<_Sco>()); + __result = __atomic_compare_exchange_cuda(__cxx_get_underlying_device_atomic(__a), &__tmp, &__tmp_v, true, __success, __failure, __scope_tag<_Sco>()); ), NV_IS_HOST, ( - __result = __host::__cxx_atomic_compare_exchange_weak(__a->__get_host(), &__tmp, __val, __success, __failure); + __result = __host::__cxx_atomic_compare_exchange_weak(&__a->__a_value, &__tmp, __val, __success, __failure); ) ) *__expected = __tmp; @@ -294,10 +292,10 @@ __host__ __device__ _Tp __cxx_atomic_fetch_add(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __delta, memory_order __order) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return __atomic_fetch_add_cuda(__a->__get_device(), __delta, __order, __scope_tag<_Sco>()); + return __atomic_fetch_add_cuda(__cxx_get_underlying_device_atomic(__a), __delta, __order, __scope_tag<_Sco>()); ), NV_IS_HOST, ( - return __host::__cxx_atomic_fetch_add(__a->__get_host(), __delta, __order); + return __host::__cxx_atomic_fetch_add(&__a->__a_value, __delta, __order); ) ) } @@ -307,10 +305,10 @@ __host__ __device__ _Tp* __cxx_atomic_fetch_add(__cxx_atomic_base_heterogeneous_impl<_Tp*, _Sco, _Ref> volatile* __a, ptrdiff_t __delta, memory_order __order) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return __atomic_fetch_add_cuda(__a->__get_device(), __delta, __order, __scope_tag<_Sco>()); + return __atomic_fetch_add_cuda(__cxx_get_underlying_device_atomic(__a), __delta, __order, __scope_tag<_Sco>()); ), NV_IS_HOST, ( - return __host::__cxx_atomic_fetch_add(__a->__get_host(), __delta, __order); + return __host::__cxx_atomic_fetch_add(&__a->__a_value, __delta, __order); ) ) } @@ -320,10 +318,10 @@ __host__ __device__ _Tp __cxx_atomic_fetch_sub(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __delta, memory_order __order) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return __atomic_fetch_sub_cuda(__a->__get_device(), __delta, __order, __scope_tag<_Sco>()); + return 
__atomic_fetch_sub_cuda(__cxx_get_underlying_device_atomic(__a), __delta, __order, __scope_tag<_Sco>()); ), NV_IS_HOST, ( - return __host::__cxx_atomic_fetch_sub(__a->__get_host(), __delta, __order); + return __host::__cxx_atomic_fetch_sub(&__a->__a_value, __delta, __order); ) ) } @@ -333,10 +331,10 @@ __host__ __device__ _Tp* __cxx_atomic_fetch_sub(__cxx_atomic_base_heterogeneous_impl<_Tp*, _Sco, _Ref> volatile* __a, ptrdiff_t __delta, memory_order __order) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return __atomic_fetch_sub_cuda(__a->__get_device(), __delta, __order, __scope_tag<_Sco>()); + return __atomic_fetch_sub_cuda(__cxx_get_underlying_device_atomic(__a), __delta, __order, __scope_tag<_Sco>()); ), NV_IS_HOST, ( - return __host::__cxx_atomic_fetch_sub(__a->__get_host(), __delta, __order); + return __host::__cxx_atomic_fetch_sub(&__a->__a_value, __delta, __order); ) ) } @@ -346,10 +344,10 @@ __host__ __device__ _Tp __cxx_atomic_fetch_and(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __pattern, memory_order __order) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return __atomic_fetch_and_cuda(__a->__get_device(), __pattern, __order, __scope_tag<_Sco>()); + return __atomic_fetch_and_cuda(__cxx_get_underlying_device_atomic(__a), __pattern, __order, __scope_tag<_Sco>()); ), NV_IS_HOST, ( - return __host::__cxx_atomic_fetch_and(__a->__get_host(), __pattern, __order); + return __host::__cxx_atomic_fetch_and(&__a->__a_value, __pattern, __order); ) ) } @@ -359,10 +357,10 @@ __host__ __device__ _Tp __cxx_atomic_fetch_or(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __pattern, memory_order __order) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return __atomic_fetch_or_cuda(__a->__get_device(), __pattern, __order, __scope_tag<_Sco>()); + return __atomic_fetch_or_cuda(__cxx_get_underlying_device_atomic(__a), __pattern, __order, __scope_tag<_Sco>()); ), NV_IS_HOST, ( - return __host::__cxx_atomic_fetch_or(__a->__get_host(), __pattern, __order); + return __host::__cxx_atomic_fetch_or(&__a->__a_value, __pattern, __order); ) ) } @@ -372,10 +370,10 @@ __host__ __device__ _Tp __cxx_atomic_fetch_xor(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __pattern, memory_order __order) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return __atomic_fetch_xor_cuda(__a->__get_device(), __pattern, __order, __scope_tag<_Sco>()); + return __atomic_fetch_xor_cuda(__cxx_get_underlying_device_atomic(__a), __pattern, __order, __scope_tag<_Sco>()); ), NV_IS_HOST, ( - return __host::__cxx_atomic_fetch_xor(__a->__get_host(), __pattern, __order); + return __host::__cxx_atomic_fetch_xor(&__a->__a_value, __pattern, __order); ) ) } From 8796c150eb2c724fc42816ffc52adbc7f8da567e Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Tue, 27 Jul 2021 17:47:14 -0700 Subject: [PATCH 24/34] Fix set-but-not-used warnings for atomic intrinsics in atomic_base.h --- libcxx/include/support/atomic/atomic_base.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/libcxx/include/support/atomic/atomic_base.h b/libcxx/include/support/atomic/atomic_base.h index 763f32c850..f8d31781d9 100644 --- a/libcxx/include/support/atomic/atomic_base.h +++ b/libcxx/include/support/atomic/atomic_base.h @@ -139,6 +139,7 @@ template inline void __cxx_atomic_store(_Tp* __a, _Up __val, memory_order __order) { auto __a_tmp = __cxx_atomic_base_unwrap(__a); + (void)__a_tmp; __atomic_store(__a_tmp, &__val, __to_gcc_order(__order)); } @@ -146,6 +147,7 @@ template inline auto __cxx_atomic_load(const _Tp* __a, 
memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { auto __a_tmp = __cxx_atomic_base_unwrap(__a); + (void)__a_tmp; __cxx_atomic_underlying_t<_Tp> __ret; __atomic_load(__a_tmp, &__ret, __to_gcc_order(__order)); return __ret; @@ -155,6 +157,7 @@ template inline auto __cxx_atomic_exchange(_Tp* __a, _Up __value, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { auto __a_tmp = __cxx_atomic_base_unwrap(__a); + (void)__a_tmp; __cxx_atomic_underlying_t<_Tp> __ret; __atomic_exchange(__a_tmp, &__value, &__ret, __to_gcc_order(__order)); return __ret; @@ -165,6 +168,8 @@ inline bool __cxx_atomic_compare_exchange_strong( _Tp* __a, _Up* __expected, _Up __value, memory_order __success, memory_order __failure) { auto __a_tmp = __cxx_atomic_base_unwrap(__a); + (void)__a_tmp; + (void)__expected; return __atomic_compare_exchange(__a_tmp, __expected, &__value, false, __to_gcc_order(__success), @@ -176,6 +181,8 @@ inline bool __cxx_atomic_compare_exchange_weak( _Tp* __a, _Up* __expected, _Up __value, memory_order __success, memory_order __failure) { auto __a_tmp = __cxx_atomic_base_unwrap(__a); + (void)__a_tmp; + (void)__expected; return __atomic_compare_exchange(__a_tmp, __expected, &__value, true, __to_gcc_order(__success), From 50013632d07a70e84b70ac74a52366a3ba265f3e Mon Sep 17 00:00:00 2001 From: Wesley Maxey <71408887+wmaxey@users.noreply.github.com> Date: Tue, 27 Jul 2021 19:25:52 -0700 Subject: [PATCH 25/34] Fix static_assert in bad_atomic_alignment test. --- .upstream-tests/test/cuda/bad_atomic_alignment.pass.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.upstream-tests/test/cuda/bad_atomic_alignment.pass.cpp b/.upstream-tests/test/cuda/bad_atomic_alignment.pass.cpp index 828d4401fd..01568dc637 100644 --- a/.upstream-tests/test/cuda/bad_atomic_alignment.pass.cpp +++ b/.upstream-tests/test/cuda/bad_atomic_alignment.pass.cpp @@ -30,7 +30,7 @@ int main(int argc, char ** argv) int32_t a; int32_t b; }; - static_assert(alignof(key) == 4); + static_assert(alignof(key) == 4, ""); cuda::atomic<key> k; auto r = k.load(); unused(r); @@ -41,10 +41,10 @@ int main(int argc, char ** argv) int32_t a; int32_t b; }; - static_assert(alignof(key) == 8); + static_assert(alignof(key) == 8, ""); cuda::atomic<key> k; auto r = k.load(); unused(r); } return 0; -} \ No newline at end of file +} From ec7df6253aec5fa743574adc5b67fb9264dd408a Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Wed, 28 Jul 2021 14:27:34 -0700 Subject: [PATCH 26/34] Suppress pointless comparison warnings where tests are impacted by the introduction of <nv/target> --- .upstream-tests/test/cuda/pipeline_group_concept.h | 3 +++ .upstream-tests/test/std/utilities/time/time.cal/euclidian.h | 2 ++ 2 files changed, 5 insertions(+) diff --git a/.upstream-tests/test/cuda/pipeline_group_concept.h b/.upstream-tests/test/cuda/pipeline_group_concept.h index 9069bca1ac..db55fac7b2 100644 --- a/.upstream-tests/test/cuda/pipeline_group_concept.h +++ b/.upstream-tests/test/cuda/pipeline_group_concept.h @@ -9,6 +9,9 @@ // UNSUPPORTED: pre-sm-70 +// TODO: Remove pointless comparison suppression when compiler fixes short-circuiting +#pragma nv_diag_suppress 186 + #include template diff --git a/.upstream-tests/test/std/utilities/time/time.cal/euclidian.h b/.upstream-tests/test/std/utilities/time/time.cal/euclidian.h index eb8019fdf6..9233a04892 100644 --- a/.upstream-tests/test/std/utilities/time/time.cal/euclidian.h +++ b/.upstream-tests/test/std/utilities/time/time.cal/euclidian.h @@ -6,6 +6,8 @@ // 
//===----------------------------------------------------------------------===// +#pragma nv_diag_suppress 186 + #include From 09a2971f8c451bec5922302990cd8d0d103c92f9 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Wed, 28 Jul 2021 15:08:18 -0700 Subject: [PATCH 27/34] Add a missing license header to the atomic_c11.h file --- libcxx/include/support/atomic/atomic_c11.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/libcxx/include/support/atomic/atomic_c11.h b/libcxx/include/support/atomic/atomic_c11.h index 7669a45a15..dd6abbc6d4 100644 --- a/libcxx/include/support/atomic/atomic_c11.h +++ b/libcxx/include/support/atomic/atomic_c11.h @@ -1,3 +1,13 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + // Atomics for C11 template From f1f436754e30e247d96a9e372325b650961220a1 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Wed, 28 Jul 2021 16:03:42 -0700 Subject: [PATCH 28/34] Fix pointless comparison warnings on two other pipeline tests --- .upstream-tests/test/cuda/pipeline_arrive_on.pass.cpp | 2 ++ .upstream-tests/test/cuda/pipeline_arrive_on_abi_v2.pass.cpp | 3 +++ 2 files changed, 5 insertions(+) diff --git a/.upstream-tests/test/cuda/pipeline_arrive_on.pass.cpp b/.upstream-tests/test/cuda/pipeline_arrive_on.pass.cpp index 4fab0fdf2b..74bfe96040 100644 --- a/.upstream-tests/test/cuda/pipeline_arrive_on.pass.cpp +++ b/.upstream-tests/test/cuda/pipeline_arrive_on.pass.cpp @@ -11,6 +11,8 @@ // Remove after bump to version 4 #define _LIBCUDACXX_CUDA_ABI_VERSION 3 +// TODO: Remove pointless comparison suppression when compiler fixes short-circuiting +#pragma nv_diag_suppress 186 #pragma nv_diag_suppress static_var_with_dynamic_init #pragma nv_diag_suppress declared_but_not_referenced diff --git a/.upstream-tests/test/cuda/pipeline_arrive_on_abi_v2.pass.cpp b/.upstream-tests/test/cuda/pipeline_arrive_on_abi_v2.pass.cpp index 686ff43de1..61ad5d8e56 100644 --- a/.upstream-tests/test/cuda/pipeline_arrive_on_abi_v2.pass.cpp +++ b/.upstream-tests/test/cuda/pipeline_arrive_on_abi_v2.pass.cpp @@ -11,6 +11,9 @@ #define _LIBCUDACXX_CUDA_ABI_VERSION 2 +// TODO: Remove pointless comparison suppression when compiler fixes short-circuiting +#pragma nv_diag_suppress 186 + #pragma nv_diag_suppress static_var_with_dynamic_init #pragma nv_diag_suppress declared_but_not_referenced From 047e3c94658cb66a0889c6a3ef5e141e506c9814 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Wed, 28 Jul 2021 17:15:10 -0700 Subject: [PATCH 29/34] Wrap/Unwrap store, exchange, and load to make sure the 'written to' parts of atomics are aligned --- .../test/cuda/bad_atomic_alignment.pass.cpp | 4 +++ libcxx/include/support/atomic/atomic_base.h | 26 ++++++++++++------- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/.upstream-tests/test/cuda/bad_atomic_alignment.pass.cpp b/.upstream-tests/test/cuda/bad_atomic_alignment.pass.cpp index 01568dc637..71ccb2d0b8 100644 --- a/.upstream-tests/test/cuda/bad_atomic_alignment.pass.cpp +++ b/.upstream-tests/test/cuda/bad_atomic_alignment.pass.cpp @@ -33,6 +33,8 @@ int main(int argc, char ** argv) static_assert(alignof(key) == 4, ""); cuda::atomic k; auto r = k.load(); + k.store(r); 
+ (void)k.exchange(r); unused(r); } // Test forcibly aligned user type @@ -44,6 +46,8 @@ int main(int argc, char ** argv) static_assert(alignof(key) == 8, ""); cuda::atomic k; auto r = k.load(); + k.store(r); + (void)k.exchange(r); unused(r); } return 0; diff --git a/libcxx/include/support/atomic/atomic_base.h b/libcxx/include/support/atomic/atomic_base.h index f8d31781d9..16e9965398 100644 --- a/libcxx/include/support/atomic/atomic_base.h +++ b/libcxx/include/support/atomic/atomic_base.h @@ -11,9 +11,12 @@ #ifndef _LIBCUDACXX_ATOMIC_BASE_H #define _LIBCUDACXX_ATOMIC_BASE_H +#include + template struct __cxx_atomic_base_impl { - using __cxx_underlying_type = _Tp; + using __underlying_t = _Tp; + static constexpr int __sco = _Sco; _LIBCUDACXX_CONSTEXPR __cxx_atomic_base_impl() _NOEXCEPT = default; @@ -50,7 +53,8 @@ const volatile _Tp* __cxx_get_underlying_atomic(__cxx_atomic_base_impl<_Tp, _Sco template struct __cxx_atomic_ref_base_impl { - using __cxx_underlying_type = _Tp; + using __underlying_t = _Tp; + static constexpr int __sco = _Sco; _LIBCUDACXX_CONSTEXPR __cxx_atomic_ref_base_impl() _NOEXCEPT = default; @@ -91,7 +95,7 @@ _LIBCUDACXX_INLINE_VISIBILITY auto __cxx_atomic_base_unwrap(_Tp* __a) _NOEXCEPT } template -using __cxx_atomic_underlying_t = typename _Tp::__cxx_underlying_type; +using __cxx_atomic_underlying_t = typename _Tp::__underlying_t; _LIBCUDACXX_INLINE_VISIBILITY inline _LIBCUDACXX_CONSTEXPR int __to_gcc_order(memory_order __order) { // Avoid switch statement to make this a constexpr. @@ -140,7 +144,8 @@ inline void __cxx_atomic_store(_Tp* __a, _Up __val, memory_order __order) { auto __a_tmp = __cxx_atomic_base_unwrap(__a); (void)__a_tmp; - __atomic_store(__a_tmp, &__val, __to_gcc_order(__order)); + __cxx_atomic_base_impl<__cxx_atomic_underlying_t<_Tp>, _Tp::__sco> __v_temp(__val); + __atomic_store(__a, &__v_temp, __to_gcc_order(__order)); } template @@ -148,9 +153,9 @@ inline auto __cxx_atomic_load(const _Tp* __a, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { auto __a_tmp = __cxx_atomic_base_unwrap(__a); (void)__a_tmp; - __cxx_atomic_underlying_t<_Tp> __ret; - __atomic_load(__a_tmp, &__ret, __to_gcc_order(__order)); - return __ret; + __cxx_atomic_base_impl<__cxx_atomic_underlying_t<_Tp>, _Tp::__sco> __ret; + __atomic_load(__a, &__ret, __to_gcc_order(__order)); + return __ret.__a_value; } template @@ -158,9 +163,10 @@ inline auto __cxx_atomic_exchange(_Tp* __a, _Up __value, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { auto __a_tmp = __cxx_atomic_base_unwrap(__a); (void)__a_tmp; - __cxx_atomic_underlying_t<_Tp> __ret; - __atomic_exchange(__a_tmp, &__value, &__ret, __to_gcc_order(__order)); - return __ret; + __cxx_atomic_base_impl<__cxx_atomic_underlying_t<_Tp>, _Tp::__sco> __v_temp(__value); + __cxx_atomic_base_impl<__cxx_atomic_underlying_t<_Tp>, _Tp::__sco> __ret; + __atomic_exchange(__a, &__v_temp, &__ret, __to_gcc_order(__order)); + return __ret.__a_value; } template From 0d5fb0ee35c7f9abb709179ee9116f72054a5719 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Thu, 29 Jul 2021 12:46:14 -0700 Subject: [PATCH 30/34] Change method of ensuring that atomic types match --- libcxx/include/support/atomic/atomic_base.h | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/libcxx/include/support/atomic/atomic_base.h b/libcxx/include/support/atomic/atomic_base.h index 16e9965398..4a99353fab 100644 --- a/libcxx/include/support/atomic/atomic_base.h +++ b/libcxx/include/support/atomic/atomic_base.h @@ -142,18 +142,14 @@ void 
__cxx_atomic_signal_fence(memory_order __order) { template inline void __cxx_atomic_store(_Tp* __a, _Up __val, memory_order __order) { - auto __a_tmp = __cxx_atomic_base_unwrap(__a); - (void)__a_tmp; - __cxx_atomic_base_impl<__cxx_atomic_underlying_t<_Tp>, _Tp::__sco> __v_temp(__val); + typename _CUDA_VSTD::remove_cv<_Tp>::type __v_temp(__val); __atomic_store(__a, &__v_temp, __to_gcc_order(__order)); } template inline auto __cxx_atomic_load(const _Tp* __a, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { - auto __a_tmp = __cxx_atomic_base_unwrap(__a); - (void)__a_tmp; - __cxx_atomic_base_impl<__cxx_atomic_underlying_t<_Tp>, _Tp::__sco> __ret; + typename _CUDA_VSTD::remove_cv<_Tp>::type __ret; __atomic_load(__a, &__ret, __to_gcc_order(__order)); return __ret.__a_value; } @@ -161,10 +157,8 @@ inline auto __cxx_atomic_load(const _Tp* __a, template inline auto __cxx_atomic_exchange(_Tp* __a, _Up __value, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { - auto __a_tmp = __cxx_atomic_base_unwrap(__a); - (void)__a_tmp; - __cxx_atomic_base_impl<__cxx_atomic_underlying_t<_Tp>, _Tp::__sco> __v_temp(__value); - __cxx_atomic_base_impl<__cxx_atomic_underlying_t<_Tp>, _Tp::__sco> __ret; + typename _CUDA_VSTD::remove_cv<_Tp>::type __v_temp(__value); + typename _CUDA_VSTD::remove_cv<_Tp>::type __ret; __atomic_exchange(__a, &__v_temp, &__ret, __to_gcc_order(__order)); return __ret.__a_value; } From ffcb0f9fa78e7b59be1927ead3dee50f1956f70e Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Mon, 2 Aug 2021 12:30:12 -0700 Subject: [PATCH 31/34] Fix spurious warnings in atomic_base.h --- libcxx/include/support/atomic/atomic_base.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libcxx/include/support/atomic/atomic_base.h b/libcxx/include/support/atomic/atomic_base.h index 4a99353fab..def4180bc9 100644 --- a/libcxx/include/support/atomic/atomic_base.h +++ b/libcxx/include/support/atomic/atomic_base.h @@ -143,6 +143,7 @@ template inline void __cxx_atomic_store(_Tp* __a, _Up __val, memory_order __order) { typename _CUDA_VSTD::remove_cv<_Tp>::type __v_temp(__val); + (void)__a; __atomic_store(__a, &__v_temp, __to_gcc_order(__order)); } @@ -150,6 +151,7 @@ template inline auto __cxx_atomic_load(const _Tp* __a, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { typename _CUDA_VSTD::remove_cv<_Tp>::type __ret; + (void)__a; __atomic_load(__a, &__ret, __to_gcc_order(__order)); return __ret.__a_value; } @@ -159,6 +161,7 @@ inline auto __cxx_atomic_exchange(_Tp* __a, _Up __value, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { typename _CUDA_VSTD::remove_cv<_Tp>::type __v_temp(__value); typename _CUDA_VSTD::remove_cv<_Tp>::type __ret; + (void)__a; __atomic_exchange(__a, &__v_temp, &__ret, __to_gcc_order(__order)); return __ret.__a_value; } From 6fb0a9a384248bd486e200c9fc3e1f59ab0f0fe9 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Mon, 2 Aug 2021 18:23:26 -0700 Subject: [PATCH 32/34] Rename __to_gcc_order to __cxx_atomic_order_to_int --- libcxx/include/support/atomic/atomic_base.h | 32 ++++++++++----------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/libcxx/include/support/atomic/atomic_base.h b/libcxx/include/support/atomic/atomic_base.h index def4180bc9..54450a7f41 100644 --- a/libcxx/include/support/atomic/atomic_base.h +++ b/libcxx/include/support/atomic/atomic_base.h @@ -97,7 +97,7 @@ _LIBCUDACXX_INLINE_VISIBILITY auto __cxx_atomic_base_unwrap(_Tp* __a) _NOEXCEPT template using __cxx_atomic_underlying_t = typename _Tp::__underlying_t; 
-_LIBCUDACXX_INLINE_VISIBILITY inline _LIBCUDACXX_CONSTEXPR int __to_gcc_order(memory_order __order) { +_LIBCUDACXX_INLINE_VISIBILITY inline _LIBCUDACXX_CONSTEXPR int __cxx_atomic_order_to_int(memory_order __order) { // Avoid switch statement to make this a constexpr. return __order == memory_order_relaxed ? __ATOMIC_RELAXED: (__order == memory_order_acquire ? __ATOMIC_ACQUIRE: @@ -107,7 +107,7 @@ _LIBCUDACXX_INLINE_VISIBILITY inline _LIBCUDACXX_CONSTEXPR int __to_gcc_order(me __ATOMIC_CONSUME)))); } -_LIBCUDACXX_INLINE_VISIBILITY inline _LIBCUDACXX_CONSTEXPR int __to_gcc_failure_order(memory_order __order) { +_LIBCUDACXX_INLINE_VISIBILITY inline _LIBCUDACXX_CONSTEXPR int __cxx_atomic_failure_order_to_int(memory_order __order) { // Avoid switch statement to make this a constexpr. return __order == memory_order_relaxed ? __ATOMIC_RELAXED: (__order == memory_order_acquire ? __ATOMIC_ACQUIRE: @@ -131,12 +131,12 @@ inline void __cxx_atomic_init(_Tp* __a, _Up __val) { inline void __cxx_atomic_thread_fence(memory_order __order) { - __atomic_thread_fence(__to_gcc_order(__order)); + __atomic_thread_fence(__cxx_atomic_order_to_int(__order)); } inline void __cxx_atomic_signal_fence(memory_order __order) { - __atomic_signal_fence(__to_gcc_order(__order)); + __atomic_signal_fence(__cxx_atomic_order_to_int(__order)); } template @@ -144,7 +144,7 @@ inline void __cxx_atomic_store(_Tp* __a, _Up __val, memory_order __order) { typename _CUDA_VSTD::remove_cv<_Tp>::type __v_temp(__val); (void)__a; - __atomic_store(__a, &__v_temp, __to_gcc_order(__order)); + __atomic_store(__a, &__v_temp, __cxx_atomic_order_to_int(__order)); } template @@ -152,7 +152,7 @@ inline auto __cxx_atomic_load(const _Tp* __a, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { typename _CUDA_VSTD::remove_cv<_Tp>::type __ret; (void)__a; - __atomic_load(__a, &__ret, __to_gcc_order(__order)); + __atomic_load(__a, &__ret, __cxx_atomic_order_to_int(__order)); return __ret.__a_value; } @@ -162,7 +162,7 @@ inline auto __cxx_atomic_exchange(_Tp* __a, _Up __value, typename _CUDA_VSTD::remove_cv<_Tp>::type __v_temp(__value); typename _CUDA_VSTD::remove_cv<_Tp>::type __ret; (void)__a; - __atomic_exchange(__a, &__v_temp, &__ret, __to_gcc_order(__order)); + __atomic_exchange(__a, &__v_temp, &__ret, __cxx_atomic_order_to_int(__order)); return __ret.__a_value; } @@ -175,8 +175,8 @@ inline bool __cxx_atomic_compare_exchange_strong( (void)__expected; return __atomic_compare_exchange(__a_tmp, __expected, &__value, false, - __to_gcc_order(__success), - __to_gcc_failure_order(__failure)); + __cxx_atomic_order_to_int(__success), + __cxx_atomic_failure_order_to_int(__failure)); } template @@ -188,8 +188,8 @@ inline bool __cxx_atomic_compare_exchange_weak( (void)__expected; return __atomic_compare_exchange(__a_tmp, __expected, &__value, true, - __to_gcc_order(__success), - __to_gcc_failure_order(__failure)); + __cxx_atomic_order_to_int(__success), + __cxx_atomic_failure_order_to_int(__failure)); } template @@ -211,7 +211,7 @@ inline auto __cxx_atomic_fetch_add(_Tp* __a, _Td __delta, constexpr auto __skip_v = __atomic_ptr_inc<__cxx_atomic_underlying_t<_Tp>>::value; auto __a_tmp = __cxx_atomic_base_unwrap(__a); return __atomic_fetch_add(__a_tmp, __delta * __skip_v, - __to_gcc_order(__order)); + __cxx_atomic_order_to_int(__order)); } template @@ -220,7 +220,7 @@ inline auto __cxx_atomic_fetch_sub(_Tp* __a, _Td __delta, constexpr auto __skip_v = __atomic_ptr_inc<__cxx_atomic_underlying_t<_Tp>>::value; auto __a_tmp = __cxx_atomic_base_unwrap(__a); return 
__atomic_fetch_sub(__a_tmp, __delta * __skip_v, - __to_gcc_order(__order)); + __cxx_atomic_order_to_int(__order)); } template @@ -228,7 +228,7 @@ inline auto __cxx_atomic_fetch_and(_Tp* __a, _Td __pattern, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { auto __a_tmp = __cxx_atomic_base_unwrap(__a); return __atomic_fetch_and(__a_tmp, __pattern, - __to_gcc_order(__order)); + __cxx_atomic_order_to_int(__order)); } template @@ -236,7 +236,7 @@ inline auto __cxx_atomic_fetch_or(_Tp* __a, _Td __pattern, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { auto __a_tmp = __cxx_atomic_base_unwrap(__a); return __atomic_fetch_or(__a_tmp, __pattern, - __to_gcc_order(__order)); + __cxx_atomic_order_to_int(__order)); } template @@ -244,7 +244,7 @@ inline auto __cxx_atomic_fetch_xor(_Tp* __a, _Td __pattern, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> { auto __a_tmp = __cxx_atomic_base_unwrap(__a); return __atomic_fetch_xor(__a_tmp, __pattern, - __to_gcc_order(__order)); + __cxx_atomic_order_to_int(__order)); } inline constexpr From 8711f32a7846594f3649da5189464813fea6196e Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Tue, 3 Aug 2021 17:27:20 -0700 Subject: [PATCH 33/34] Reset barrier/latch/semaphore to head, as those will be nv/target'd later --- include/cuda/std/barrier | 4 ++-- include/cuda/std/latch | 2 +- include/cuda/std/semaphore | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/cuda/std/barrier b/include/cuda/std/barrier index 8d75b4b763..e7af6f138c 100644 --- a/include/cuda/std/barrier +++ b/include/cuda/std/barrier @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// -#if defined(__CUDA_MINIMUM_ARCH__) && __CUDA_MINIMUM_ARCH__ < 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700 # error "CUDA synchronization primitives are only supported for sm_70 and up." #endif @@ -311,7 +311,7 @@ inline void __strided_memcpy(char * __destination, char const * __source, std::s } } -#if __CUDA_MINIMUM_ARCH__ >= 800 +#if __CUDA_ARCH__ >= 800 template 16)> struct __memcpy_async_impl { __device__ static inline bool __copy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride) { diff --git a/include/cuda/std/latch b/include/cuda/std/latch index ba27b60b8d..0bb4c4f27a 100644 --- a/include/cuda/std/latch +++ b/include/cuda/std/latch @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// -#if defined(__CUDA_MINIMUM_ARCH__) && __CUDA_MINIMUM_ARCH__ < 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700 # error "CUDA synchronization primitives are only supported for sm_70 and up." #endif diff --git a/include/cuda/std/semaphore b/include/cuda/std/semaphore index 45a9b8beb7..7a02b4e332 100644 --- a/include/cuda/std/semaphore +++ b/include/cuda/std/semaphore @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// -#if defined(__CUDA_MINIMUM_ARCH__) && __CUDA_MINIMUM_ARCH__ < 700 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700 # error "CUDA synchronization primitives are only supported for sm_70 and up." 
#endif From 6efb2063927242768772bc554775f54ba4d499bd Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Tue, 3 Aug 2021 17:37:57 -0700 Subject: [PATCH 34/34] Fix a few includes occuring within internal namespaces --- libcxx/include/atomic | 8 ++++++++ libcxx/include/support/atomic/atomic_base.h | 2 -- libcxx/include/support/atomic/atomic_cuda.h | 4 ---- libcxx/include/support/atomic/atomic_msvc.h | 2 -- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/libcxx/include/atomic b/libcxx/include/atomic index d20ebf4945..ceb679ae6f 100644 --- a/libcxx/include/atomic +++ b/libcxx/include/atomic @@ -588,6 +588,14 @@ void atomic_signal_fence(memory_order m) noexcept; __f == memory_order_acq_rel, \ "memory order argument to atomic operation is invalid") +#if defined(_LIBCUDACXX_HAS_MSVC_ATOMIC_IMPL) +# include +#endif + +#if !defined(_LIBCUDACXX_COMPILER_NVRTC) +# include +#endif + _LIBCUDACXX_BEGIN_NAMESPACE_STD // Figure out what the underlying type for `memory_order` would be if it were diff --git a/libcxx/include/support/atomic/atomic_base.h b/libcxx/include/support/atomic/atomic_base.h index 54450a7f41..548f636a40 100644 --- a/libcxx/include/support/atomic/atomic_base.h +++ b/libcxx/include/support/atomic/atomic_base.h @@ -11,8 +11,6 @@ #ifndef _LIBCUDACXX_ATOMIC_BASE_H #define _LIBCUDACXX_ATOMIC_BASE_H -#include - template struct __cxx_atomic_base_impl { using __underlying_t = _Tp; diff --git a/libcxx/include/support/atomic/atomic_cuda.h b/libcxx/include/support/atomic/atomic_cuda.h index b022147bde..b0e17c5bd8 100644 --- a/libcxx/include/support/atomic/atomic_cuda.h +++ b/libcxx/include/support/atomic/atomic_cuda.h @@ -11,10 +11,6 @@ # error "CUDA atomics are only supported for sm_60 and up on *nix and sm_70 and up on Windows." #endif -#ifndef __CUDACC_RTC__ -#include -#endif // __CUDACC_RTC__ - #if !defined(__CLANG_ATOMIC_BOOL_LOCK_FREE) && !defined(__GCC_ATOMIC_BOOL_LOCK_FREE) #define ATOMIC_BOOL_LOCK_FREE 2 #define ATOMIC_CHAR_LOCK_FREE 2 diff --git a/libcxx/include/support/atomic/atomic_msvc.h b/libcxx/include/support/atomic/atomic_msvc.h index 8a8084449a..9294a7fa3b 100644 --- a/libcxx/include/support/atomic/atomic_msvc.h +++ b/libcxx/include/support/atomic/atomic_msvc.h @@ -12,8 +12,6 @@ #error "This file is only for CL.EXE's benefit" #endif -#include - #define _Compiler_barrier() _ReadWriteBarrier() #if defined(_M_ARM) || defined(_M_ARM64)