[libcu++] Fix undefined behavior in atomics to automatic storage #478

Open - wants to merge 19 commits into main

Changes from 18 commits
126 changes: 126 additions & 0 deletions libcudacxx/.upstream-tests/test/cuda/atomics/atomic.local.pass.cpp
@@ -0,0 +1,126 @@
//===----------------------------------------------------------------------===//
//
// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// UNSUPPORTED: windows && pre-sm-70

#include <cuda/atomic>
#include <cuda/std/cassert>
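
// This test exercises every cuda::atomic operation on an object with automatic
// (thread-local stack) storage: each helper below constructs the atomic locally,
// applies one operation, and returns the observed value for the caller to check.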

template <typename T>
__device__ T store(T in) {
cuda::atomic<T> x = in;
x.store(in + 1, cuda::memory_order_relaxed);
return x.load(cuda::memory_order_relaxed);
}

template <typename T>
__device__ T compare_exchange_weak(T in) {
cuda::atomic<T> x = in;
T old = T(7);
x.compare_exchange_weak(old, T(42), cuda::memory_order_relaxed);
return x.load(cuda::memory_order_relaxed);
}

template <typename T>
__device__ T compare_exchange_strong(T in) {
cuda::atomic<T> x = in;
T old = T(7);
x.compare_exchange_strong(old, T(42), cuda::memory_order_relaxed);
return x.load(cuda::memory_order_relaxed);
}

template <typename T>
__device__ T exchange(T in) {
cuda::atomic<T> x = in;
T out = x.exchange(T(1), cuda::memory_order_relaxed);
return out + x.load(cuda::memory_order_relaxed);
}

template <typename T>
__device__ T fetch_add(T in) {
cuda::atomic<T> x = in;
x.fetch_add(T(1), cuda::memory_order_relaxed);
return x.load(cuda::memory_order_relaxed);
}

template <typename T>
__device__ T fetch_sub(T in) {
cuda::atomic<T> x = in;
x.fetch_sub(T(1), cuda::memory_order_relaxed);
return x.load(cuda::memory_order_relaxed);
}

template <typename T>
__device__ T fetch_and(T in) {
cuda::atomic<T> x = in;
x.fetch_and(T(1), cuda::memory_order_relaxed);
return x.load(cuda::memory_order_relaxed);
}

template <typename T>
__device__ T fetch_or(T in) {
cuda::atomic<T> x = in;
x.fetch_or(T(1), cuda::memory_order_relaxed);
return x.load(cuda::memory_order_relaxed);
}

template <typename T>
__device__ T fetch_xor(T in) {
cuda::atomic<T> x = in;
x.fetch_xor(T(1), cuda::memory_order_relaxed);
return x.load(cuda::memory_order_relaxed);
}

template <typename T>
__device__ T fetch_min(T in) {
cuda::atomic<T> x = in;
x.fetch_min(T(7), cuda::memory_order_relaxed);
return x.load(cuda::memory_order_relaxed);
}

template <typename T>
__device__ T fetch_max(T in) {
cuda::atomic<T> x = in;
x.fetch_max(T(7), cuda::memory_order_relaxed);
return x.load(cuda::memory_order_relaxed);
}

template <typename T>
__device__ inline void tests() {
const T tid = threadIdx.x;
assert(tid + T(1) == store(tid));
assert(T(1) + tid == exchange(tid));
assert((tid == T(7) ? T(42) : tid) == compare_exchange_weak(tid));
assert((tid == T(7) ? T(42) : tid) == compare_exchange_strong(tid));
assert((tid + T(1)) == fetch_add(tid));
assert((tid & T(1)) == fetch_and(tid));
assert((tid | T(1)) == fetch_or(tid));
assert((tid ^ T(1)) == fetch_xor(tid));
assert(min(tid, T(7)) == fetch_min(tid));
assert(max(tid, T(7)) == fetch_max(tid));
assert(T(tid - T(1)) == fetch_sub(tid));
}

int main(int argc, char ** argv)
{
NV_IF_ELSE_TARGET(
NV_IS_HOST, (
cuda_thread_count = 64;
),(
tests<uint8_t>();
tests<uint16_t>();
tests<uint32_t>();
tests<uint64_t>();
tests<int8_t>();
tests<int16_t>();
tests<int32_t>();
tests<int64_t>();
)
)
return 0;
}
11 changes: 10 additions & 1 deletion libcudacxx/codegen/codegen.cpp
@@ -83,6 +83,8 @@ int main() {
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#include "atomic_cuda_local.h"
)XXX" << "\n\n";

auto scopenametag = [&](auto scope) {
@@ -142,6 +144,7 @@ int main() {
for(auto& cv: cv_qualifier) {
out << "template<class _Type, _CUDA_VSTD::__enable_if_t<sizeof(_Type)==" << sz/8 << ", int> = 0>\n";
out << "_LIBCUDACXX_DEVICE void __atomic_load_cuda(const " << cv << "_Type *__ptr, _Type *__ret, int __memorder, " << scopenametag(s.first) << ") {\n";
out << " if (__cuda_load_weak_if_local(__ptr, __ret)) return;\n";

gonzalobg (Collaborator, Author) commented on Jul 18, 2024:

This should be weak_if_local_or_const_or_grid_param, since:

__constant__ cuda::atomic<int> x;
x.load(); // UB, should use weak load

and

__global__ void kernel(__grid_constant__ const cuda::atomic<int> x) { 
   x.load();
}

have the same issue.
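
For illustration, a minimal sketch of what such a broadened predicate could look like, built on CUDA's documented address-space predicates (__isLocal, __isConstant, and, on newer toolkits, __isGridConstant). The function name __cuda_is_weak_access, its exact signature, and the version gate are assumptions for this sketch; the actual helpers live in atomic_cuda_local.h, which is not shown in this diff:

template <class _Type>
_LIBCUDACXX_DEVICE bool __cuda_is_weak_access(const volatile _Type* __ptr) {
  // Strip volatile so the pointer can be handed to the address-space predicates.
  const void* __p = const_cast<const _Type*>(__ptr);
  // Automatic (local) storage and __constant__ objects both require plain (weak) accesses.
  bool __weak = __isLocal(__p) || __isConstant(__p);
#if __CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 7)
  // __grid_constant__ kernel parameters are only detectable on newer toolkits.
  __weak = __weak || __isGridConstant(__p);
#endif
  return __weak;
}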

out << " uint" << sz << "_t __tmp = 0;\n";
out << " NV_DISPATCH_TARGET(\n";
out << " NV_PROVIDES_SM_70, (\n";
@@ -178,6 +181,7 @@ int main() {
for(auto& cv: cv_qualifier) {
out << "template<class _Type, _CUDA_VSTD::__enable_if_t<sizeof(_Type)==" << sz/8 << ", int> = 0>\n";
out << "_LIBCUDACXX_DEVICE void __atomic_store_cuda(" << cv << "_Type *__ptr, _Type *__val, int __memorder, " << scopenametag(s.first) << ") {\n";
out << " if (__cuda_store_weak_if_local(__ptr, *__val)) return;\n";
out << " uint" << sz << "_t __tmp = 0;\n";
out << " memcpy(&__tmp, __val, " << sz/8 << ");\n";
out << " NV_DISPATCH_TARGET(\n";
@@ -239,6 +243,8 @@ int main() {
if(rmw.first == "compare_exchange") {
out << "template<class _Type, _CUDA_VSTD::__enable_if_t<sizeof(_Type)==" << sz/8 << ", int> = 0>\n";
out << "_LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(" << cv << "_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, " << scopenametag(s.first) << ") {\n";
out << " bool __tmp_out;\n";
out << " if (__cuda_compare_exchange_weak_if_local(__ptr, __expected, __desired, &__tmp_out)) return __tmp_out;\n";
out << " uint" << sz << "_t __tmp = 0, __old = 0, __old_tmp;\n";
out << " memcpy(&__tmp, __desired, " << sz/8 << ");\n";
out << " memcpy(&__old, __expected, " << sz/8 << ");\n";
@@ -277,7 +283,8 @@ int main() {
if(rmw.first == "exchange") {
out << ", int> = 0>\n";
out << "_LIBCUDACXX_DEVICE void __atomic_exchange_cuda(" << cv << "_Type *__ptr, _Type *__val, _Type *__ret, int __memorder, " << scopenametag(s.first) << ") {\n";
out << " uint" << sz << "_t __tmp = 0;\n";
out << " if (__cuda_exchange_weak_if_local(__ptr, __val, __ret)) return;\n";
out << " uint" << sz << "_t __tmp = 0;\n";
out << " memcpy(&__tmp, __val, " << sz/8 << ");\n";
}
else {
@@ -295,6 +302,7 @@ int main() {
out << ", int> = 0>\n";
out << "_LIBCUDACXX_DEVICE _Type __atomic_" << rmw.first << "_cuda(" << cv << "_Type *__ptr, _Type __val, int __memorder, " << scopenametag(s.first) << ") {\n";
out << " _Type __ret;\n";
out << " if (__cuda_" << rmw.first << "_weak_if_local(__ptr, __val, &__ret)) return __ret;\n";
if(type.first == "f" && sz == 32)
out << " float";
else if(type.first == "f" && sz == 64)
@@ -352,6 +360,7 @@ int main() {
if(op == "sub")
out << " __tmp = -__tmp;\n";
out << " __tmp *= sizeof(_Type);\n";
out << " if (__cuda_fetch_add_weak_if_local((uint64_t*)__ptr, __tmp, (uint64_t*)&__ret)) return __ret;\n";
gonzalobg marked this conversation as resolved.
out << " NV_DISPATCH_TARGET(\n";
out << " NV_PROVIDES_SM_70, (\n";
out << " switch (__memorder) {\n";
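
For concreteness, after this change the 4-byte system-scope load that this generator emits takes roughly the following shape (reconstructed from the out << statements above; the scope tag name and the elided dispatch body are illustrative, not copied from the generated header):

template<class _Type, _CUDA_VSTD::__enable_if_t<sizeof(_Type)==4, int> = 0>
_LIBCUDACXX_DEVICE void __atomic_load_cuda(const volatile _Type *__ptr, _Type *__ret, int __memorder, __thread_scope_system_tag) {
  // New in this PR: objects in automatic (local) storage are private to the
  // calling thread, so a plain (weak) load suffices and the PTX path is skipped.
  if (__cuda_load_weak_if_local(__ptr, __ret)) return;
  uint32_t __tmp = 0;
  NV_DISPATCH_TARGET(
    NV_PROVIDES_SM_70, (
      /* memory-order switch dispatching to inline PTX loads (unchanged by this PR) */
    )
  )
  memcpy(__ret, &__tmp, 4);
}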
@@ -7,6 +7,8 @@
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
#ifndef __LIBCUDACXX_ATOMIC_CUDA_H
#define __LIBCUDACXX_ATOMIC_CUDA_H

#if defined(__CUDA_MINIMUM_ARCH__) && ((!defined(_LIBCUDACXX_COMPILER_MSVC) && __CUDA_MINIMUM_ARCH__ < 600) || (defined(_LIBCUDACXX_COMPILER_MSVC) && __CUDA_MINIMUM_ARCH__ < 700))
# error "CUDA atomics are only supported for sm_60 and up on *nix and sm_70 and up on Windows."
@@ -398,25 +400,6 @@ template <typename _Tp, int _Sco>
_LIBCUDACXX_HOST_DEVICE inline _Tp __cxx_atomic_exchange(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __value, memory_order __order) {
return __cxx_small_from_32<_Tp>(__cxx_atomic_exchange(&__a->__a_value, __cxx_small_to_32(__value), __order));
}
_LIBCUDACXX_HOST_DEVICE
inline int __cuda_memcmp(void const * __lhs, void const * __rhs, size_t __count) {
NV_DISPATCH_TARGET(
NV_IS_DEVICE, (
auto __lhs_c = reinterpret_cast<unsigned char const *>(__lhs);
auto __rhs_c = reinterpret_cast<unsigned char const *>(__rhs);
while (__count--) {
auto const __lhs_v = *__lhs_c++;
auto const __rhs_v = *__rhs_c++;
if (__lhs_v < __rhs_v) { return -1; }
if (__lhs_v > __rhs_v) { return 1; }
}
return 0;
),
NV_IS_HOST, (
return memcmp(__lhs, __rhs, __count);
)
)
}

template <typename _Tp, int _Sco>
_LIBCUDACXX_HOST_DEVICE inline bool __cxx_atomic_compare_exchange_weak(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp* __expected, _Tp __value, memory_order __success, memory_order __failure) {
@@ -478,3 +461,5 @@ template <typename _Tp, typename _Delta, int _Sco>
_LIBCUDACXX_HOST_DEVICE inline _Tp __cxx_atomic_fetch_min(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Delta __val, memory_order __order) {
return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_min(&__a->__a_value, __cxx_small_to_32(__val), __order));
}

#endif // __LIBCUDACXX_ATOMIC_CUDA_H
@@ -8,8 +8,12 @@
//
//===----------------------------------------------------------------------===//

#include "atomic_cuda_local.h"

template<class _Type, class _Scope, typename _CUDA_VSTD::enable_if<sizeof(_Type) <= 2, int>::type = 0>
bool _LIBCUDACXX_DEVICE __atomic_compare_exchange_cuda(_Type volatile *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, _Scope __s) {
bool __ret;
if (__cuda_compare_exchange_weak_if_local(__ptr, __expected, __desired, &__ret)) return __ret;

auto const __aligned = (uint32_t*)((intptr_t)__ptr & ~(sizeof(uint32_t) - 1));
auto const __offset = uint32_t((intptr_t)__ptr & (sizeof(uint32_t) - 1)) * 8;
@@ -31,7 +35,7 @@ bool _LIBCUDACXX_DEVICE __atomic_compare_exchange_cuda(_Type volatile *__ptr, _T

template<class _Type, class _Scope, typename _CUDA_VSTD::enable_if<sizeof(_Type)<=2, int>::type = 0>
void _LIBCUDACXX_DEVICE __atomic_exchange_cuda(_Type volatile *__ptr, _Type *__val, _Type *__ret, int __memorder, _Scope __s) {

if (__cuda_exchange_weak_if_local(__ptr, __val, __ret)) return;
_Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s);
while(!__atomic_compare_exchange_cuda(__ptr, &__expected, __val, true, __memorder, __memorder, __s))
;
@@ -40,6 +44,8 @@ void _LIBCUDACXX_DEVICE __atomic_exchange_cuda(_Type volatile *__ptr, _Type *__v

template<class _Type, class _Delta, class _Scope, typename _CUDA_VSTD::enable_if<sizeof(_Type)<=2, int>::type = 0>
_Type _LIBCUDACXX_DEVICE __atomic_fetch_add_cuda(_Type volatile *__ptr, _Delta __val, int __memorder, _Scope __s) {
_Type __ret;
if (__cuda_fetch_add_weak_if_local(__ptr, __val, &__ret)) return __ret;

A collaborator commented:

important: the compiler is unable to see through the memory and identify that it's not local. This affects codegen and overall performance. Here's a simple kernel:

using device_atomic_t = cuda::atomic<int, cuda::thread_scope_device>;

__global__ void use(device_atomic_t *d_atomics) {
  d_atomics->fetch_add(threadIdx.x, cuda::memory_order_relaxed);
}

On RTX 6000 Ada the change leads to the following slowdown (up to ~3x slower):

[benchmark chart: device_scope_atomics]

In the case of the block-scope atomics the performance difference is even more pronounced:

// Assumed alias (not shown in the original comment): block-scope atomic counter.
using block_atomic_t = cuda::atomic<int, cuda::thread_scope_block>;

template <int BlockSize>
__launch_bounds__(BlockSize) __global__ void use(device_atomic_t *d_atomics, int mv) {
  __shared__ block_atomic_t b_atomics;

  if (threadIdx.x == 0) {
    new (&b_atomics) block_atomic_t{};
  }
  __syncthreads();

  b_atomics.fetch_add(threadIdx.x, cuda::memory_order_relaxed);
  __syncthreads();

  if (threadIdx.x == 0) {
    if (b_atomics.load(cuda::memory_order_relaxed) > mv) {
      d_atomics->fetch_add(1, cuda::memory_order_relaxed);
    }
  }
}

Results for RTX 6000 Ada illustrate up to ~4x slowdown:

[benchmark chart: block_scope_atomics]

I think I agree with:

Since this only impacts objects with automatic storage, the impact is not very widespread

Given this, I think we should explore options not to penalize widespread use cases. If the compiler is able to see through the local-space check, this would be a solution. Otherwise, we can consider refining the:

it affects an object in GPU memory and only GPU threads access it.

requirement to talk about global, cluster, or block memory, plus adding a check for automatic storage in debug builds.

gonzalobg (Collaborator, Author) replied on Oct 16, 2023:

This is known but the analysis is incomplete since:

  • this lands on CUDA CTK 12.4,
  • the impact is zero on CUDA CTK 12.3 and newer, and
  • the impact is zero on CUDA CTK 12.2 and older iff cuda atomics are used through the cuda::atomic bundled in the CTK, since those are not impacted by this.

The performance regression is scoped to:

  • users of CUDA 12.2 and older,
  • that are not using the CUDA C++ standard library bundled with their CTK, but are instead picking a different version from GitHub.

For those users, we could - in a subsequent PR - provide a way to opt back into the old (broken) behavior via a feature macro, e.g., LIBCUDACXX_UNSAFE_ATOMIC_AUTOMATIC_STORAGE, which users would define consistently before including the headers to avoid ODR issues:

#define LIBCUDACXX_UNSAFE_ATOMIC_AUTOMATIC_STORAGE
#include <cuda/atomic>

gonzalobg (Collaborator, Author) added:

From the Slack discussion, an alternative is to enable the check in CTK 12.2 and older only in debug mode, to avoid the perf hit.
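
A minimal sketch of how such a debug-only gate could be expressed, assuming the macro name and version threshold below (neither is part of this PR); it keeps the check unconditionally on CTK 12.3+, where it is reported above to be free, and restricts it to debug builds on older toolkits:

// Hypothetical gate: always check on CTK >= 12.3, otherwise only when NDEBUG is not defined.
#if !defined(NDEBUG) || __CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 3)
#  define _LIBCUDACXX_ATOMIC_CHECK_AUTOMATIC_STORAGE 1
#else
#  define _LIBCUDACXX_ATOMIC_CHECK_AUTOMATIC_STORAGE 0
#endif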

miscco (Collaborator) commented on Oct 17, 2023:

Is this something where we could work with attributes, e.g. [[likely]] / [[unlikely]]?
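
As a sketch of that idea, a C++20 attribute could be attached directly to the guard's branch (whether it measurably improves codegen here is exactly the open question; on older language standards the hint would have to go through compiler builtins such as __builtin_expect):

if (__cuda_fetch_add_weak_if_local(__ptr, __val, &__ret)) [[unlikely]] { return __ret; }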


_Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s);
_Type __desired = __expected + __val;
@@ -50,6 +56,9 @@ _Type _LIBCUDACXX_DEVICE __atomic_fetch_add_cuda(_Type volatile *__ptr, _Delta _

template<class _Type, class _Delta, class _Scope, typename _CUDA_VSTD::enable_if<sizeof(_Type)<=2 || _CUDA_VSTD::is_floating_point<_Type>::value, int>::type = 0>
_Type _LIBCUDACXX_HOST_DEVICE __atomic_fetch_max_cuda(_Type volatile *__ptr, _Delta __val, int __memorder, _Scope __s) {
_Type __ret;
if (__cuda_fetch_max_weak_if_local(__ptr, __val, &__ret)) return __ret;

_Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s);
_Type __desired = __expected > __val ? __expected : __val;

@@ -63,6 +72,9 @@ _Type _LIBCUDACXX_HOST_DEVICE __atomic_fetch_max_cuda(_Type volatile *__ptr, _De

template<class _Type, class _Delta, class _Scope, typename _CUDA_VSTD::enable_if<sizeof(_Type)<=2 || _CUDA_VSTD::is_floating_point<_Type>::value, int>::type = 0>
_Type _LIBCUDACXX_HOST_DEVICE __atomic_fetch_min_cuda(_Type volatile *__ptr, _Delta __val, int __memorder, _Scope __s) {
_Type __ret;
if (__cuda_fetch_min_weak_if_local(__ptr, __val, &__ret)) return __ret;

_Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s);
_Type __desired = __expected < __val ? __expected : __val;

@@ -76,6 +88,8 @@ _Type _LIBCUDACXX_HOST_DEVICE __atomic_fetch_min_cuda(_Type volatile *__ptr, _De

template<class _Type, class _Delta, class _Scope, typename _CUDA_VSTD::enable_if<sizeof(_Type)<=2, int>::type = 0>
_Type _LIBCUDACXX_DEVICE __atomic_fetch_sub_cuda(_Type volatile *__ptr, _Delta __val, int __memorder, _Scope __s) {
_Type __ret;
if (__cuda_fetch_sub_weak_if_local(__ptr, __val, &__ret)) return __ret;

_Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s);
_Type __desired = __expected - __val;
@@ -86,6 +100,8 @@ _Type _LIBCUDACXX_DEVICE __atomic_fetch_sub_cuda(_Type volatile *__ptr, _Delta _

template<class _Type, class _Delta, class _Scope, typename _CUDA_VSTD::enable_if<sizeof(_Type)<=2, int>::type = 0>
_Type _LIBCUDACXX_DEVICE __atomic_fetch_and_cuda(_Type volatile *__ptr, _Delta __val, int __memorder, _Scope __s) {
_Type __ret;
if (__cuda_fetch_and_weak_if_local(__ptr, __val, &__ret)) return __ret;

_Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s);
_Type __desired = __expected & __val;
@@ -96,6 +112,8 @@ _Type _LIBCUDACXX_DEVICE __atomic_fetch_and_cuda(_Type volatile *__ptr, _Delta _

template<class _Type, class _Delta, class _Scope, typename _CUDA_VSTD::enable_if<sizeof(_Type)<=2, int>::type = 0>
_Type _LIBCUDACXX_DEVICE __atomic_fetch_xor_cuda(_Type volatile *__ptr, _Delta __val, int __memorder, _Scope __s) {
_Type __ret;
if (__cuda_fetch_xor_weak_if_local(__ptr, __val, &__ret)) return __ret;

_Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s);
_Type __desired = __expected ^ __val;
@@ -106,6 +124,8 @@ _Type _LIBCUDACXX_DEVICE __atomic_fetch_xor_cuda(_Type volatile *__ptr, _Delta _

template<class _Type, class _Delta, class _Scope, typename _CUDA_VSTD::enable_if<sizeof(_Type)<=2, int>::type = 0>
_Type _LIBCUDACXX_DEVICE __atomic_fetch_or_cuda(_Type volatile *__ptr, _Delta __val, int __memorder, _Scope __s) {
_Type __ret;
if (__cuda_fetch_or_weak_if_local(__ptr, __val, &__ret)) return __ret;

_Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s);
_Type __desired = __expected | __val;