Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Draft: Revert/barrier parity #192

Closed
wants to merge 44 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
d39dda7
Added parity waiting
ogiroux Jan 30, 2021
a479b85
Added try_wait options
ogiroux Mar 9, 2021
b089ac2
Add a small parity wait test
wmaxey Jul 23, 2021
a424765
Extend the test to measure both phases, make barrier_(try_)wait_parit…
wmaxey Jul 24, 2021
0560b71
Re-add the __try_wait API as pipeline uses internal symbols of barrie…
wmaxey Jul 27, 2021
318c275
Remove all uses of non-compliant __CUDA_ARCH__/preprocessor macros fr…
wmaxey Mar 31, 2021
0179859
Fix an issue in NVRTC tests
wmaxey Mar 31, 2021
b457a75
Move files and implementation for atomic refactor
wmaxey Apr 14, 2021
ae8a1e2
WIP: single interface wrap/unwrap
wmaxey Apr 14, 2021
d6d0812
Finish atomic refactor, bones of atomic_ref are in place
wmaxey May 1, 2021
ed349b7
Refactor a few #ifdef __CUDA__ things and fix statics/shared memory u…
wmaxey May 1, 2021
e04d828
Implement CUDA atomic ref, implement MSVC atomics layer, fix several …
wmaxey May 13, 2021
8cedc23
Fix atomic_mscv header, fix macro processing in __threading_support o…
wmaxey May 14, 2021
f6145ca
Fix some local errors and warnings, put back the SHARED macro, fix pi…
wmaxey Jul 7, 2021
bed0287
Add symlink to nv/target within libcxx
wmaxey Jul 8, 2021
0072881
Fix CV handling of atomics on the libcxx/std layer
wmaxey Jul 9, 2021
a1328d7
Disable C11 atomics in __config
wmaxey Jul 9, 2021
1360f48
fixups for Clang specific issues in atomic, make sure <nv/target> is …
wmaxey Jul 10, 2021
5363202
Fix CUDA and MSVC atomic layers
wmaxey Jul 14, 2021
c7d913f
uncomment a couple tests from pipeline
wmaxey Jul 14, 2021
2da7c63
Revert tests, will <nv/target>-ify later
wmaxey Jul 14, 2021
79ceb8b
Rebuild atomic_cuda_generated
wmaxey Jul 14, 2021
6d0d094
Dedup MSVC by splitting the atomic base class into a seperate header …
wmaxey Jul 22, 2021
be9bacc
Missed grabbing important parts of the nvcxx-compatibility branch whe…
wmaxey Jul 24, 2021
03641c9
Pickup more nv/target specializations from nvcxx_compatibility branch
wmaxey Jul 24, 2021
1b64243
Uglify the atomic detail:: and host:: namespaces
wmaxey Jul 27, 2021
0d31a92
Rename __skip_amt to __atomic_ptr_inc
wmaxey Jul 27, 2021
28a7a2f
Refactor and dedup some code in the __cxx_atomic cuda layer, fix runt…
wmaxey Jul 27, 2021
4060036
Fix set-but-not-used warnings for atomic intrinsics in atomic_base.h
wmaxey Jul 28, 2021
239151f
Fix static_assert in bad_atomic_alignment test.
wmaxey Jul 28, 2021
7db3ff2
Suppress pointless comparison warnings where tests are impacted by th…
wmaxey Jul 28, 2021
b219567
Add a missing license header to the atomic_c11.h file
wmaxey Jul 28, 2021
1e57f8a
Fix pointless comparison warnings on two other pipeline tests
wmaxey Jul 28, 2021
65ab07b
Wrap/Unwrap store, exchange, and load to make sure the 'written to' p…
wmaxey Jul 29, 2021
560a8c6
Change method of ensuring that atomic types match
wmaxey Jul 29, 2021
572c286
Fix spurious warnings in atomic_base.h
wmaxey Aug 2, 2021
c0c861f
Rename __to_gcc_order to __cxx_atomic_order_to_int
wmaxey Aug 3, 2021
c9e07dc
Reset barrier/latch/semaphore to head, as those will be nv/target'd l…
wmaxey Aug 4, 2021
917101a
Fix a few includes occuring within internal namespaces
wmaxey Aug 4, 2021
1d1b6bb
Create changelog for release 1.6.0
wmaxey Jul 30, 2021
5385867
Bump libcudacxx API version to 1.6.0
wmaxey Jul 30, 2021
8ff18aa
Add clearer notification about ABI break. Fix capitalization.
wmaxey Aug 4, 2021
25e386c
Revert "Re-add the __try_wait API as pipeline uses internal symbols o…
wmaxey Aug 4, 2021
4ca86fd
Update Release 1.6.0?
wmaxey Aug 4, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions .upstream-tests/test/cuda/bad_atomic_alignment.pass.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: libcpp-has-no-threads, pre-sm-60
// UNSUPPORTED: windows && pre-sm-70

// <cuda/atomic>

// cuda::atomic<key>

// Original test issue:
// https://github.com/NVIDIA/libcudacxx/issues/160

#include <cuda/atomic>

// Swallows its argument so otherwise-unused results do not trigger
// "set but not used" compiler diagnostics; always yields true and is
// usable in constant expressions on both host and device.
template <typename _Up>
__host__ __device__
constexpr bool unused(_Up&&)
{
    return true;
}

// Regression test for https://github.com/NVIDIA/libcudacxx/issues/160:
// cuda::atomic must work for user-defined aggregates regardless of whether
// their alignment is the natural one or forced wider with alignas.
int main(int argc, char ** argv)
{
    // Case 1: naturally (4-byte) aligned two-field aggregate.
    {
        struct key {
            int32_t a;
            int32_t b;
        };
        static_assert(alignof(key) == 4, "");
        cuda::atomic<key> atom;
        key observed = atom.load();
        atom.store(observed);
        (void)atom.exchange(observed);
        unused(observed);
    }
    // Case 2: the same aggregate forced to 8-byte alignment.
    {
        struct alignas(8) key {
            int32_t a;
            int32_t b;
        };
        static_assert(alignof(key) == 8, "");
        cuda::atomic<key> atom;
        key observed = atom.load();
        atom.store(observed);
        (void)atom.exchange(observed);
        unused(observed);
    }
    return 0;
}
2 changes: 2 additions & 0 deletions .upstream-tests/test/cuda/pipeline_arrive_on.pass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@

// Remove after bump to version 4
#define _LIBCUDACXX_CUDA_ABI_VERSION 3
// TODO: Remove pointless comparison suppression when compiler fixes short-circuiting
#pragma nv_diag_suppress 186

#pragma nv_diag_suppress static_var_with_dynamic_init
#pragma nv_diag_suppress declared_but_not_referenced
Expand Down
3 changes: 3 additions & 0 deletions .upstream-tests/test/cuda/pipeline_arrive_on_abi_v2.pass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@

#define _LIBCUDACXX_CUDA_ABI_VERSION 2

// TODO: Remove pointless comparison suppression when compiler fixes short-circuiting
#pragma nv_diag_suppress 186

#pragma nv_diag_suppress static_var_with_dynamic_init
#pragma nv_diag_suppress declared_but_not_referenced

Expand Down
3 changes: 3 additions & 0 deletions .upstream-tests/test/cuda/pipeline_group_concept.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@

// UNSUPPORTED: pre-sm-70

// TODO: Remove pointless comparison suppression when compiler fixes short-circuiting
#pragma nv_diag_suppress 186

#include <cuda/pipeline>

template <typename T_size, typename T_thread_rank>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@ struct TestFn {
__host__ __device__
void operator()() const {
typedef cuda::std::atomic<T> A;
A t;
A t{};
bool b1 = cuda::std::atomic_is_lock_free(static_cast<const A*>(&t));
volatile A vt;
volatile A vt{};
bool b2 = cuda::std::atomic_is_lock_free(static_cast<const volatile A*>(&vt));
assert(b1 == b2);
}
Expand Down
2 changes: 2 additions & 0 deletions .upstream-tests/test/std/utilities/time/time.cal/euclidian.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//

#pragma nv_diag_suppress 186

#include <type_traits>


Expand Down
238 changes: 133 additions & 105 deletions codegen/codegen.cpp

Large diffs are not rendered by default.

27 changes: 26 additions & 1 deletion docs/releases/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,34 @@ It pulls in the latest version of upstream libc++ and marks the beginning of

!-->

## libcu++ 1.6.0 (CUDA Toolkit 11.5)

libcu++ 1.6.0 is a major release. It adds two new functions to the `cuda::std::barrier` API and
uses `<nv/target>` as the primary dispatch mechanism for `cuda::std::atomic`.

This release introduces ABI version 4, which is now the default.

Supported ABI Versions: 4 (default), 3, and 2.

Included in: CUDA Toolkit 11.5.

### Issues Fixed

- #179: Refactors the atomic layer to allow for layering the device/host abstractions.
- #189: Changed pragmas for silencing chrono long double warnings.
- #186: Allows `<nv/target>` to be used under NVRTC.
- #177: Allows `<nv/target>` to build when compiled under C and C++98.
- Thanks to David Olsen for this contribution.
- #172: Introduces ABI version 4.
- Forces `cuda::std::complex` alignment for enhanced performance.
- Sets the internal representation of `cuda::std::chrono` literals to `double`.
- #165: For tests on some older distributions keep using Python 3, but downgrade lit.
- #164: Fixes testing issues related to Python 2/3 switch for lit.
- Thanks to Royil Damer for this contribution.

## libcu++ 1.5.0 (CUDA Toolkit 11.4)

libcu++ 1.5.0 is a major release. It adds `<nv/target>`,
libcu++ 1.5.0 is a major release. It adds `<nv/target>`,
the library support header for the new `if target`
target specialization mechanism.

Expand Down
79 changes: 48 additions & 31 deletions include/cuda/std/atomic
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
#undef ATOMIC_VAR_INIT
#endif //__CUDACC_RTC__

#include "cassert"
#include "cstddef"
#include "cstdint"
#include "type_traits"
Expand All @@ -46,16 +47,26 @@

#include "detail/__pragma_push"

#include "detail/__atomic"
#include "detail/__threading_support"

#undef _LIBCUDACXX_HAS_GCC_ATOMIC_IMP
#undef _LIBCUDACXX_HAS_C_ATOMIC_IMP

#include "detail/libcxx/include/atomic"

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA

using std::__detail::thread_scope;
using std::__detail::thread_scope_system;
using std::__detail::thread_scope_device;
using std::__detail::thread_scope_block;
using std::__detail::thread_scope_thread;

namespace __detail {
using std::__detail::__thread_scope_block_tag;
using std::__detail::__thread_scope_device_tag;
using std::__detail::__thread_scope_system_tag;
using std::__detail::__atomic_signal_fence_cuda;
using std::__detail::__atomic_thread_fence_cuda;
}

using memory_order = std::memory_order;

constexpr memory_order memory_order_relaxed = std::memory_order_relaxed;
Expand All @@ -67,7 +78,7 @@ constexpr memory_order memory_order_seq_cst = std::memory_order_seq_cst;

// atomic<T>

template <class _Tp, thread_scope _Sco = thread_scope_system>
template <class _Tp, thread_scope _Sco = thread_scope::thread_scope_system>
struct atomic
: public std::__atomic_base<_Tp, _Sco>
{
Expand All @@ -87,15 +98,15 @@ struct atomic
__host__ __device__
_Tp fetch_max(const _Tp & __op, memory_order __m = memory_order_seq_cst) volatile noexcept
{
return detail::__atomic_fetch_max_cuda(&this->__a_.__a_value, __op,
__m, detail::__scope_tag<_Sco>());
return std::__detail::__atomic_fetch_max_cuda(&this->__a_.__a_value, __op,
__m, std::__detail::__scope_tag<_Sco>());
}

__host__ __device__
_Tp fetch_min(const _Tp & __op, memory_order __m = memory_order_seq_cst) volatile noexcept
{
return detail::__atomic_fetch_min_cuda(&this->__a_.__a_value, __op,
__m, detail::__scope_tag<_Sco>());
return std::__detail::__atomic_fetch_min_cuda(&this->__a_.__a_value, __op,
__m, std::__detail::__scope_tag<_Sco>());
}
};

Expand Down Expand Up @@ -159,31 +170,37 @@ struct atomic<_Tp*, _Sco>
_Tp* operator-=(ptrdiff_t __op) noexcept {return fetch_sub(__op) - __op;}
};

inline __host__ __device__ void atomic_thread_fence(memory_order __m, thread_scope _Scope = thread_scope_system) {
#ifdef __CUDA_ARCH__
switch(_Scope) {
case thread_scope_system:
detail::__atomic_thread_fence_cuda((int)__m, detail::__thread_scope_system_tag());
break;
case thread_scope_device:
detail::__atomic_thread_fence_cuda((int)__m, detail::__thread_scope_device_tag());
break;
case thread_scope_block:
detail::__atomic_thread_fence_cuda((int)__m, detail::__thread_scope_block_tag());
break;
}
#else
(void) _Scope;
::std::atomic_thread_fence((::std::memory_order)__m);
#endif
// Establishes a memory-ordering fence of strength __m at the requested
// thread scope (defaults to system-wide). Dispatches via <nv/target>:
// the device path maps the runtime scope enum onto the compile-time tag
// types expected by __atomic_thread_fence_cuda; the host path ignores the
// scope and forwards to the standard library's atomic_thread_fence.
inline __host__ __device__ void atomic_thread_fence(memory_order __m, thread_scope _Scope = thread_scope::thread_scope_system) {
NV_DISPATCH_TARGET(
NV_IS_DEVICE, (
// Translate the runtime scope value into a tag-dispatched call.
switch(_Scope) {
case thread_scope::thread_scope_system:
__detail::__atomic_thread_fence_cuda((int)__m, __detail::__thread_scope_system_tag());
break;
case thread_scope::thread_scope_device:
__detail::__atomic_thread_fence_cuda((int)__m, __detail::__thread_scope_device_tag());
break;
case thread_scope::thread_scope_block:
__detail::__atomic_thread_fence_cuda((int)__m, __detail::__thread_scope_block_tag());
break;
// NOTE(review): thread_scope_thread has no case, so a thread-scoped
// fence emits nothing on device — presumably intentional (a single
// thread needs no inter-thread fence), but worth confirming.
}
),
NV_IS_HOST, (
// Host execution has a single scope; only the ordering matters.
(void) _Scope;
::std::atomic_thread_fence((::std::memory_order)__m);
)
)
}

// Compiler-only fence with ordering __m (no hardware fence between
// threads). Dispatches via <nv/target>: the device path calls the CUDA
// detail implementation, the host path forwards to the standard library's
// atomic_signal_fence.
inline __host__ __device__ void atomic_signal_fence(memory_order __m) {
NV_DISPATCH_TARGET(
NV_IS_DEVICE, (
__detail::__atomic_signal_fence_cuda((int)__m);
),
NV_IS_HOST, (
::std::atomic_signal_fence((::std::memory_order)__m);
)
)
}

_LIBCUDACXX_END_NAMESPACE_CUDA
Expand Down
Loading