From 01cdc1e0e6c74fa220dbe35df1348ba965e1a5a5 Mon Sep 17 00:00:00 2001
From: Olivier Giroux <ogiroux@gmail.com>
Date: Thu, 30 Mar 2023 16:38:53 +0200
Subject: [PATCH 1/4] Initial version of mutex support and some drive-by fixes

---
 .../test/support/nasty_containers.h           |  14 +-
 include/cuda/mutex                            |  30 +++
 include/cuda/std/detail/__config              |   6 +-
 .../std/detail/libcxx/include/CMakeLists.txt  |   1 +
 .../libcxx/include/__memory/atomic_load.h     |  69 ++++++
 .../std/detail/libcxx/include/__mutex_base    | 162 +++++++++----
 include/cuda/std/detail/libcxx/include/memory |  28 +--
 include/cuda/std/detail/libcxx/include/mutex  | 218 +++++++-----------
 include/cuda/std/mutex                        |  26 +++
 libcxx/src/mutex.cpp                          |   6 +-
 10 files changed, 351 insertions(+), 209 deletions(-)
 create mode 100644 include/cuda/mutex
 create mode 100644 include/cuda/std/detail/libcxx/include/__memory/atomic_load.h
 create mode 100644 include/cuda/std/mutex
diff --git a/.upstream-tests/test/support/nasty_containers.h b/.upstream-tests/test/support/nasty_containers.h
index c6584ed76a..bfc89170f1 100644
--- a/.upstream-tests/test/support/nasty_containers.h
+++ b/.upstream-tests/test/support/nasty_containers.h
@@ -9,12 +9,17 @@
 #ifndef NASTY_CONTAINERS_H
 #define NASTY_CONTAINERS_H
 
-#include <cassert>
-#include <vector>
-#include <list>
+#include <cuda/std/cassert>
+#if defined(_LIBCUDACXX_HAS_VECTOR)
+#include <cuda/std/vector>
+#endif
+#if defined(_LIBCUDACXX_HAS_LIST)
+#include <cuda/std/list>
+#endif
 
 #include "test_macros.h"
 
+#if defined(_LIBCUDACXX_HAS_VECTOR)
 template <class T>
 class nasty_vector
 {
@@ -135,7 +140,9 @@ class nasty_vector
 
 template <class T>
 bool operator==(const nasty_vector<T>& x, const nasty_vector<T>& y) { return x.v_ == y.v_; }
+#endif
 
+#if defined(_LIBCUDACXX_HAS_LIST)
 template <class T>
 class nasty_list
 {
@@ -282,6 +289,7 @@ class nasty_list
 
 template <class T>
 bool operator==(const nasty_list<T>& x, const nasty_list<T>& y) { return x.l_ == y.l_; }
+#endif
 
 // Not really a mutex, but can play one in tests
 class nasty_mutex
diff --git a/include/cuda/mutex b/include/cuda/mutex
new file mode 100644
index 0000000000..532cc533c8
--- /dev/null
+++ b/include/cuda/mutex
@@ -0,0 +1,30 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_MUTEX
+#define _CUDA_MUTEX
+
+#include "std/mutex"
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA
+
+template<thread_scope _Sco>
+using mutex = _CUDA_VSTD::__mutex_base<_Sco>;
+
+template<thread_scope _Sco>
+using timed_mutex = _CUDA_VSTD::__mutex_base<_Sco>;
+
+template<thread_scope _Sco>
+using once_flag = _CUDA_VSTD::__once_flag_base<_Sco>;
+
+using _CUDA_VSTD::call_once;
+
+_LIBCUDACXX_END_NAMESPACE_CUDA
+
+#endif //_CUDA_MUTEX
diff --git a/include/cuda/std/detail/__config b/include/cuda/std/detail/__config
index 238c0a3e35..e7a68974db 100644
--- a/include/cuda/std/detail/__config
+++ b/include/cuda/std/detail/__config
@@ -79,10 +79,12 @@
 #define _LIBCUDACXX_HAS_NO_PLATFORM_WAIT
 #define _LIBCUDACXX_HAS_NO_MONOTONIC_CLOCK
 #define _LIBCUDACXX_HAS_NO_TREE_BARRIER
-#if defined(__CUDACC_RTC__)
+#define _LIBCUDACXX_HAS_THREAD_API_EXTERNAL
+#define _LIBCUDACXX_INLINE_THREADING
+
+#ifdef __CUDACC_RTC__
     #define __ELF__
     #define _LIBCUDACXX_DISABLE_PRAGMA_GCC_SYSTEM_HEADER
-    #define _LIBCUDACXX_HAS_THREAD_API_EXTERNAL
     #define __alignof(x) alignof(x)
     #define _LIBCUDACXX_LITTLE_ENDIAN
     #define _LIBCUDACXX_DISABLE_VISIBILITY_ANNOTATIONS
diff --git a/include/cuda/std/detail/libcxx/include/CMakeLists.txt b/include/cuda/std/detail/libcxx/include/CMakeLists.txt
index f7c19c021f..eba40e0057 100644
--- a/include/cuda/std/detail/libcxx/include/CMakeLists.txt
+++ b/include/cuda/std/detail/libcxx/include/CMakeLists.txt
@@ -117,6 +117,7 @@ set(files
   __mdspan/submdspan.hpp
   __mdspan/type_list.hpp
   __memory/addressof.h
+  __memory/atomic_load.h
   __memory/construct_at.h
   __memory/pointer_traits.h
   __memory/voidify.h
diff --git a/include/cuda/std/detail/libcxx/include/__memory/atomic_load.h b/include/cuda/std/detail/libcxx/include/__memory/atomic_load.h
new file mode 100644
index 0000000000..ae060162ad
--- /dev/null
+++ b/include/cuda/std/detail/libcxx/include/__memory/atomic_load.h
@@ -0,0 +1,69 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCUDACXX___MEMORY_ATOMIMC_LOAD_H
+#define _LIBCUDACXX___MEMORY_ATOMIMC_LOAD_H
+
+#ifndef __cuda_std__
+#include <__config>
+#endif //__cuda_std__
+
+#include "../atomic"
+
+#if defined(_LIBCUDACXX_USE_PRAGMA_GCC_SYSTEM_HEADER)
+#pragma GCC system_header
+#endif
+
+_LIBCUDACXX_BEGIN_NAMESPACE_STD
+
+#ifndef __cuda_std__
+
+template <class _ValueType>
+inline _LIBCUDACXX_INLINE_VISIBILITY
+_ValueType __libcpp_relaxed_load(_ValueType const* __value) {
+#if !defined(_LIBCUDACXX_HAS_NO_THREADS) && \
+    defined(__ATOMIC_RELAXED) &&        \
+    (__has_builtin(__atomic_load_n) || defined(_LIBCUDACXX_COMPILER_GCC))
+    return __atomic_load_n(__value, __ATOMIC_RELAXED);
+#else
+    return *__value;
+#endif
+}
+
+template <class _ValueType>
+inline _LIBCUDACXX_INLINE_VISIBILITY
+_ValueType __libcpp_acquire_load(_ValueType const* __value) {
+#if !defined(_LIBCUDACXX_HAS_NO_THREADS) && \
+    defined(__ATOMIC_ACQUIRE) &&        \
+    (__has_builtin(__atomic_load_n) || defined(_LIBCUDACXX_COMPILER_GCC))
+    return __atomic_load_n(__value, __ATOMIC_ACQUIRE);
+#else
+    return *__value;
+#endif
+}
+
+#else
+
+template <class _ValueType>
+inline _LIBCUDACXX_INLINE_VISIBILITY
+_ValueType __libcpp_relaxed_load(atomic<_ValueType> const* __value) {
+    return __value->load(memory_order_relaxed);
+}
+
+template <class _ValueType>
+inline _LIBCUDACXX_INLINE_VISIBILITY
+_ValueType __libcpp_acquire_load(atomic<_ValueType> const* __value) {
+    return __value->load(memory_order_acquire);
+}
+#endif // __cuda_std__
+
+_LIBCUDACXX_END_NAMESPACE_STD
+
+#endif // _LIBCUDACXX___MEMORY_ATOMIMC_LOAD_H
diff --git a/include/cuda/std/detail/libcxx/include/__mutex_base b/include/cuda/std/detail/libcxx/include/__mutex_base
index bae7c7c210..b8229c6672 100644
--- a/include/cuda/std/detail/libcxx/include/__mutex_base
+++ b/include/cuda/std/detail/libcxx/include/__mutex_base
@@ -10,23 +10,40 @@
 #ifndef _LIBCUDACXX___MUTEX_BASE
 #define _LIBCUDACXX___MUTEX_BASE
 
+#ifndef __cuda_std__
 #include <__config>
-#include <chrono>
 #include <system_error>
-#include <__threading_support>
-
-#include <time.h>
+#endif // __cuda_std__
+
+#include "__memory/addressof.h"
+#include "__memory/atomic_load.h"
+#include "__threading_support"
+#include "__type_traits/enable_if.h"
+#include "__type_traits/is_floating_point.h"
+#include "__type_traits/is_nothrow_default_constructible.h"
+#include "__utility/unreachable.h"
+#include "chrono"
+#include "ctime"
+#include "semaphore"
+
+#ifndef __cuda_std__
+#include <__pragma_push>
+#endif // __cuda_std__
 
 #if defined(_LIBCUDACXX_USE_PRAGMA_GCC_SYSTEM_HEADER)
 #pragma GCC system_header
 #endif
 
-_LIBCUDACXX_PUSH_MACROS
-#include <__undef_macros>
-
-
 _LIBCUDACXX_BEGIN_NAMESPACE_STD
 
+#ifdef __cuda_std__
+_LIBCUDACXX_INLINE_VISIBILITY
+inline void __throw_system_error(int, const char*)
+{
+    __libcpp_unreachable();
+}
+#endif // __cuda_std__
+
 #ifndef _LIBCUDACXX_HAS_NO_THREADS
 
 #ifndef _LIBCUDACXX_THREAD_SAFETY_ANNOTATION
@@ -37,34 +54,73 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD
 #  endif
 #endif  // _LIBCUDACXX_THREAD_SAFETY_ANNOTATION
 
+#ifndef __cuda_std__
+template<int>
+using __libcpp_mutex_base_t = __libcpp_mutex_t;
+#else
+template<int _Sco>
+using __libcpp_mutex_base_t = __atomic_semaphore_base<_Sco,1>;
+
+#undef _LIBCUDACXX_MUTEX_INITIALIZER
+#define _LIBCUDACXX_MUTEX_INITIALIZER {1ll}
+#endif // __cuda_std__
 
-class _LIBCUDACXX_TYPE_VIS _LIBCUDACXX_THREAD_SAFETY_ANNOTATION(capability("mutex")) mutex
+template<int _Sco>
+class _LIBCUDACXX_TYPE_VIS _LIBCUDACXX_THREAD_SAFETY_ANNOTATION(capability("mutex")) __mutex_base
 {
-    __libcpp_mutex_t __m_ = _LIBCUDACXX_MUTEX_INITIALIZER;
+    __libcpp_mutex_base_t<_Sco> __m_ = _LIBCUDACXX_MUTEX_INITIALIZER;
 
 public:
     _LIBCUDACXX_INLINE_VISIBILITY
-    _LIBCUDACXX_CONSTEXPR mutex() = default;
+    constexpr __mutex_base() noexcept {}
 
-    mutex(const mutex&) = delete;
-    mutex& operator=(const mutex&) = delete;
+    __mutex_base(const __mutex_base&) = delete;
+    __mutex_base& operator=(const __mutex_base&) = delete;
 
-#if defined(_LIBCUDACXX_HAS_TRIVIAL_MUTEX_DESTRUCTION)
-    ~mutex() = default;
+#if defined(_LIBCUDACXX_HAS_TRIVIAL_MUTEX_DESTRUCTION) || defined(__cuda_std__)
+    ~__mutex_base() = default;
 #else
-    ~mutex() _NOEXCEPT;
+    ~__mutex_base() _NOEXCEPT;
 #endif
 
+#ifndef _LIBCUDACXX_INLINE_THREADING
     void lock() _LIBCUDACXX_THREAD_SAFETY_ANNOTATION(acquire_capability());
     bool try_lock() _NOEXCEPT _LIBCUDACXX_THREAD_SAFETY_ANNOTATION(try_acquire_capability(true));
     void unlock() _NOEXCEPT _LIBCUDACXX_THREAD_SAFETY_ANNOTATION(release_capability());
+#else
+    _LIBCUDACXX_INLINE_VISIBILITY
+    void lock() _LIBCUDACXX_THREAD_SAFETY_ANNOTATION(acquire_capability()) {
+        __m_.acquire(); //while(!__m_.exchange(0));
+    }
+    _LIBCUDACXX_INLINE_VISIBILITY
+    bool try_lock() _NOEXCEPT _LIBCUDACXX_THREAD_SAFETY_ANNOTATION(try_acquire_capability(true)) {
+        return __m_.try_acquire();
+    }
+    _LIBCUDACXX_INLINE_VISIBILITY
+    void unlock() _NOEXCEPT _LIBCUDACXX_THREAD_SAFETY_ANNOTATION(release_capability()) {
+        __m_.release(); //__m_.store(1);
+    }
+
+    template <class _Rep, class _Period>
+    _LIBCUDACXX_INLINE_VISIBILITY
+    bool try_lock_for(const chrono::duration<_Rep, _Period>& __d) {
+        return __m_.try_acquire_for(__d);
+    }
 
-    typedef __libcpp_mutex_t* native_handle_type;
+    template <class _Clock, class _Duration>
+    _LIBCUDACXX_INLINE_VISIBILITY
+    bool try_lock_until(const chrono::time_point<_Clock, _Duration>& __t) {
+        return __m_.try_acquire_until(__t);
+    }
+#endif
+
+    typedef __libcpp_mutex_base_t<_Sco>* native_handle_type;
     _LIBCUDACXX_INLINE_VISIBILITY native_handle_type native_handle() {return &__m_;}
 };
 
-static_assert(is_nothrow_default_constructible<mutex>::value,
-              "the default constructor for std::mutex must be nothrow");
+using mutex = __mutex_base<0>;
+
+static_assert(is_nothrow_default_constructible<mutex>::value, "the default constructor for std::mutex must be nothrow");
 
 struct _LIBCUDACXX_TYPE_VIS defer_lock_t { explicit defer_lock_t() = default; };
 struct _LIBCUDACXX_TYPE_VIS try_to_lock_t { explicit try_to_lock_t() = default; };
@@ -78,9 +134,9 @@ extern _LIBCUDACXX_EXPORTED_FROM_ABI const adopt_lock_t  adopt_lock;
 
 #else
 
-/* _LIBCUDACXX_INLINE_VAR */ constexpr defer_lock_t  defer_lock  = defer_lock_t();
-/* _LIBCUDACXX_INLINE_VAR */ constexpr try_to_lock_t try_to_lock = try_to_lock_t();
-/* _LIBCUDACXX_INLINE_VAR */ constexpr adopt_lock_t  adopt_lock  = adopt_lock_t();
+_LIBCUDACXX_CPO_ACCESSIBILITY defer_lock_t  defer_lock  = defer_lock_t();
+_LIBCUDACXX_CPO_ACCESSIBILITY try_to_lock_t try_to_lock = try_to_lock_t();
+_LIBCUDACXX_CPO_ACCESSIBILITY adopt_lock_t  adopt_lock  = adopt_lock_t();
 
 #endif
 
@@ -151,8 +207,8 @@ public:
     }
 
 private:
-    unique_lock(unique_lock const&); // = delete;
-    unique_lock& operator=(unique_lock const&); // = delete;
+    unique_lock(unique_lock const&) = delete;
+    unique_lock& operator=(unique_lock const&) = delete;
 
 public:
 #ifndef _LIBCUDACXX_CXX03_LANG
@@ -174,14 +230,18 @@ public:
 
 #endif  // _LIBCUDACXX_CXX03_LANG
 
-    void lock();
-    bool try_lock();
+    _LIBCUDACXX_INLINE_VISIBILITY void lock();
+    _LIBCUDACXX_INLINE_VISIBILITY bool try_lock();
 
     template <class _Rep, class _Period>
-        bool try_lock_for(const chrono::duration<_Rep, _Period>& __d);
+    _LIBCUDACXX_INLINE_VISIBILITY
+    bool try_lock_for(const chrono::duration<_Rep, _Period>& __d);
+
     template <class _Clock, class _Duration>
-        bool try_lock_until(const chrono::time_point<_Clock, _Duration>& __t);
+    _LIBCUDACXX_INLINE_VISIBILITY
+    bool try_lock_until(const chrono::time_point<_Clock, _Duration>& __t);
 
+    _LIBCUDACXX_INLINE_VISIBILITY
     void unlock();
 
     _LIBCUDACXX_INLINE_VISIBILITY
@@ -201,6 +261,7 @@ public:
 
     _LIBCUDACXX_INLINE_VISIBILITY
     bool owns_lock() const _NOEXCEPT {return __owns_;}
+
     _LIBCUDACXX_INLINE_VISIBILITY
     _LIBCUDACXX_EXPLICIT
         operator bool () const _NOEXCEPT {return __owns_;}
@@ -209,61 +270,71 @@ public:
 };
 
 template <class _Mutex>
-void
-unique_lock<_Mutex>::lock()
+_LIBCUDACXX_INLINE_VISIBILITY
+void unique_lock<_Mutex>::lock()
 {
+#ifndef _LIBCUDACXX_NO_EXCEPTIONS
     if (__m_ == nullptr)
         __throw_system_error(EPERM, "unique_lock::lock: references null mutex");
     if (__owns_)
         __throw_system_error(EDEADLK, "unique_lock::lock: already locked");
+#endif // _LIBCUDACXX_NO_EXCEPTIONS
     __m_->lock();
     __owns_ = true;
 }
 
 template <class _Mutex>
-bool
-unique_lock<_Mutex>::try_lock()
+_LIBCUDACXX_INLINE_VISIBILITY
+bool unique_lock<_Mutex>::try_lock()
 {
+#ifndef _LIBCUDACXX_NO_EXCEPTIONS
     if (__m_ == nullptr)
         __throw_system_error(EPERM, "unique_lock::try_lock: references null mutex");
     if (__owns_)
         __throw_system_error(EDEADLK, "unique_lock::try_lock: already locked");
+#endif // _LIBCUDACXX_NO_EXCEPTIONS
     __owns_ = __m_->try_lock();
     return __owns_;
 }
 
 template <class _Mutex>
 template <class _Rep, class _Period>
-bool
-unique_lock<_Mutex>::try_lock_for(const chrono::duration<_Rep, _Period>& __d)
+_LIBCUDACXX_INLINE_VISIBILITY
+bool unique_lock<_Mutex>::try_lock_for(const chrono::duration<_Rep, _Period>& __d)
 {
+#ifndef _LIBCUDACXX_NO_EXCEPTIONS
     if (__m_ == nullptr)
         __throw_system_error(EPERM, "unique_lock::try_lock_for: references null mutex");
     if (__owns_)
         __throw_system_error(EDEADLK, "unique_lock::try_lock_for: already locked");
+#endif // _LIBCUDACXX_NO_EXCEPTIONS
     __owns_ = __m_->try_lock_for(__d);
     return __owns_;
 }
 
 template <class _Mutex>
 template <class _Clock, class _Duration>
-bool
+_LIBCUDACXX_INLINE_VISIBILITY bool
 unique_lock<_Mutex>::try_lock_until(const chrono::time_point<_Clock, _Duration>& __t)
 {
+#ifndef _LIBCUDACXX_NO_EXCEPTIONS
     if (__m_ == nullptr)
         __throw_system_error(EPERM, "unique_lock::try_lock_until: references null mutex");
     if (__owns_)
         __throw_system_error(EDEADLK, "unique_lock::try_lock_until: already locked");
+#endif // _LIBCUDACXX_NO_EXCEPTIONS
     __owns_ = __m_->try_lock_until(__t);
     return __owns_;
 }
 
-template <class _Mutex>
+template <class _Mutex> _LIBCUDACXX_INLINE_VISIBILITY
 void
 unique_lock<_Mutex>::unlock()
 {
+#ifndef _LIBCUDACXX_NO_EXCEPTIONS
     if (!__owns_)
         __throw_system_error(EPERM, "unique_lock::unlock: not locked");
+#endif // _LIBCUDACXX_NO_EXCEPTIONS
     __m_->unlock();
     __owns_ = false;
 }
@@ -274,6 +345,8 @@ void
 swap(unique_lock<_Mutex>& __x, unique_lock<_Mutex>& __y) _NOEXCEPT
     {__x.swap(__y);}
 
+#ifndef _LIBCUDACXX_HAS_THREAD_API_CUDA
+
 //enum class cv_status
 _LIBCUDACXX_DECLARE_STRONG_ENUM(cv_status)
 {
@@ -346,15 +419,17 @@ private:
     void __do_timed_wait(unique_lock<mutex>& __lk,
        chrono::time_point<_Clock, chrono::nanoseconds>) _NOEXCEPT;
 };
+
+#endif // _LIBCUDACXX_HAS_THREAD_API_CUDA
 #endif // !_LIBCUDACXX_HAS_NO_THREADS
 
 template <class _Rep, class _Period>
 inline _LIBCUDACXX_INLINE_VISIBILITY
-typename enable_if
+__enable_if_t
 <
     is_floating_point<_Rep>::value,
     chrono::nanoseconds
->::type
+>
 __safe_nanosecond_cast(chrono::duration<_Rep, _Period> __d)
 {
     using namespace chrono;
@@ -377,11 +452,11 @@ __safe_nanosecond_cast(chrono::duration<_Rep, _Period> __d)
 
 template <class _Rep, class _Period>
 inline _LIBCUDACXX_INLINE_VISIBILITY
-typename enable_if
+__enable_if_t
 <
     !is_floating_point<_Rep>::value,
     chrono::nanoseconds
->::type
+>
 __safe_nanosecond_cast(chrono::duration<_Rep, _Period> __d)
 {
     using namespace chrono;
@@ -410,6 +485,8 @@ __safe_nanosecond_cast(chrono::duration<_Rep, _Period> __d)
 }
 
 #ifndef _LIBCUDACXX_HAS_NO_THREADS
+#ifndef _LIBCUDACXX_HAS_THREAD_API_CUDA
+
 template <class _Predicate>
 void
 condition_variable::wait(unique_lock<mutex>& __lk, _Predicate __pred)
@@ -532,10 +609,13 @@ condition_variable::__do_timed_wait(unique_lock<mutex>& __lk,
     wait_for(__lk, __tp - _Clock::now());
 }
 
+#endif //_LIBCUDACXX_HAS_THREAD_API_CUDA
 #endif // !_LIBCUDACXX_HAS_NO_THREADS
 
 _LIBCUDACXX_END_NAMESPACE_STD
 
-_LIBCUDACXX_POP_MACROS
+#ifndef __cuda_std__
+#include <__pragma_pop>
+#endif // __cuda_std__
 
 #endif  // _LIBCUDACXX___MUTEX_BASE
diff --git a/include/cuda/std/detail/libcxx/include/memory b/include/cuda/std/detail/libcxx/include/memory
index e898a8eeac..2832880640 100644
--- a/include/cuda/std/detail/libcxx/include/memory
+++ b/include/cuda/std/detail/libcxx/include/memory
@@ -663,6 +663,7 @@ void* align(size_t alignment, size_t size, void*& ptr, size_t& space);
 #include "__iterator/iterator_traits.h"
 #include "__iterator/iterator.h"
 #include "__memory/addressof.h"
+#include "__memory/atomic_load.h"
 #include "__memory/construct_at.h"
 #include "__memory/pointer_traits.h"
 #include "__memory/voidify.h"
@@ -709,33 +710,6 @@ void* align(size_t alignment, size_t size, void*& ptr, size_t& space);
 #endif
 
 _LIBCUDACXX_BEGIN_NAMESPACE_STD
-
-template <class _ValueType>
-inline _LIBCUDACXX_INLINE_VISIBILITY
-_ValueType __libcpp_relaxed_load(_ValueType const* __value) {
-#if !defined(_LIBCUDACXX_HAS_NO_THREADS) && \
-    defined(__ATOMIC_RELAXED) &&        \
-    (__has_builtin(__atomic_load_n) || defined(_LIBCUDACXX_COMPILER_GCC))
-    return __atomic_load_n(__value, __ATOMIC_RELAXED);
-#else
-    return *__value;
-#endif
-}
-
-template <class _ValueType>
-inline _LIBCUDACXX_INLINE_VISIBILITY
-_ValueType __libcpp_acquire_load(_ValueType const* __value) {
-#if !defined(_LIBCUDACXX_HAS_NO_THREADS) && \
-    defined(__ATOMIC_ACQUIRE) &&        \
-    (__has_builtin(__atomic_load_n) || defined(_LIBCUDACXX_COMPILER_GCC))
-    return __atomic_load_n(__value, __ATOMIC_ACQUIRE);
-#else
-    return *__value;
-#endif
-}
-
-// addressof moved to <type_traits>
-
 template <class _Tp> class allocator;
 
 template <>
diff --git a/include/cuda/std/detail/libcxx/include/mutex b/include/cuda/std/detail/libcxx/include/mutex
index 7a454e5609..eab4957455 100644
--- a/include/cuda/std/detail/libcxx/include/mutex
+++ b/include/cuda/std/detail/libcxx/include/mutex
@@ -186,28 +186,31 @@ template<class Callable, class ...Args>
 
 */
 
+#ifndef __cuda_std__
 #include <__config>
-#include <__mutex_base>
-#include <cstdint>
-#include <functional>
-#include <memory>
-#ifndef _LIBCUDACXX_CXX03_LANG
-#include <tuple>
-#endif
-#include <version>
-#include <__threading_support>
+#include <memory> // for __libcpp_acquire_load
+#endif // __cuda_std__
+
+#include "__mutex_base"
+#include "__threading_support"
+#include "__utility/forward.h"
+#include "cstdint"
+#include "functional"
+#include "tuple"
+#include "version"
+
+#ifndef __cuda_std__
+#include <__pragma_push>
+#endif // __cuda_std__
 
 #if defined(_LIBCUDACXX_USE_PRAGMA_GCC_SYSTEM_HEADER)
 #pragma GCC system_header
 #endif
 
-_LIBCUDACXX_PUSH_MACROS
-#include <__undef_macros>
-
-
 _LIBCUDACXX_BEGIN_NAMESPACE_STD
 
 #ifndef _LIBCUDACXX_HAS_NO_THREADS
+#ifndef _LIBCUDACXX_HAS_THREAD_API_CUDA
 
 class _LIBCUDACXX_TYPE_VIS recursive_mutex
 {
@@ -303,8 +306,7 @@ public:
 };
 
 template <class _Clock, class _Duration>
-bool
-recursive_timed_mutex::try_lock_until(const chrono::time_point<_Clock, _Duration>& __t)
+bool recursive_timed_mutex::try_lock_until(const chrono::time_point<_Clock, _Duration>& __t)
 {
     using namespace chrono;
     __thread_id __id = this_thread::get_id();
@@ -327,10 +329,15 @@ recursive_timed_mutex::try_lock_until(const chrono::time_point<_Clock, _Duration
     }
     return false;
 }
+#else
+
+using timed_mutex = __mutex_base<0>;
+
+#endif // _LIBCUDACXX_HAS_THREAD_API_CUDA
 
 template <class _L0, class _L1>
-int
-try_lock(_L0& __l0, _L1& __l1)
+_LIBCUDACXX_INLINE_VISIBILITY
+int try_lock(_L0& __l0, _L1& __l1)
 {
     unique_lock<_L0> __u0(__l0, try_to_lock);
     if (__u0.owns_lock())
@@ -346,11 +353,9 @@ try_lock(_L0& __l0, _L1& __l1)
     return 0;
 }
 
-#ifndef _LIBCUDACXX_CXX03_LANG
-
 template <class _L0, class _L1, class _L2, class... _L3>
-int
-try_lock(_L0& __l0, _L1& __l1, _L2& __l2, _L3&... __l3)
+_LIBCUDACXX_INLINE_VISIBILITY
+int try_lock(_L0& __l0, _L1& __l1, _L2& __l2, _L3&... __l3)
 {
     int __r = 0;
     unique_lock<_L0> __u0(__l0, try_to_lock);
@@ -365,11 +370,9 @@ try_lock(_L0& __l0, _L1& __l1, _L2& __l2, _L3&... __l3)
     return __r;
 }
 
-#endif  // _LIBCUDACXX_CXX03_LANG
-
 template <class _L0, class _L1>
-void
-lock(_L0& __l0, _L1& __l1)
+_LIBCUDACXX_INLINE_VISIBILITY
+void lock(_L0& __l0, _L1& __l1)
 {
     while (true)
     {
@@ -394,11 +397,9 @@ lock(_L0& __l0, _L1& __l1)
     }
 }
 
-#ifndef _LIBCUDACXX_CXX03_LANG
-
 template <class _L0, class _L1, class _L2, class ..._L3>
-void
-__lock_first(int __i, _L0& __l0, _L1& __l1, _L2& __l2, _L3& ...__l3)
+_LIBCUDACXX_INLINE_VISIBILITY
+void __lock_first(int __i, _L0& __l0, _L1& __l1, _L2& __l2, _L3& ...__l3)
 {
     while (true)
     {
@@ -469,8 +470,6 @@ void __unlock(_L0& __l0, _L1& __l1, _L2& __l2, _L3&... __l3) {
     _CUDA_VSTD::__unlock(__l2, __l3...);
 }
 
-#endif  // _LIBCUDACXX_CXX03_LANG
-
 #if _LIBCUDACXX_STD_VER > 14
 template <class ..._Mutexes>
 class _LIBCUDACXX_TEMPLATE_VIS scoped_lock;
@@ -478,6 +477,7 @@ class _LIBCUDACXX_TEMPLATE_VIS scoped_lock;
 template <>
 class _LIBCUDACXX_TEMPLATE_VIS scoped_lock<> {
 public:
+    _LIBCUDACXX_INLINE_VISIBILITY
     explicit scoped_lock() {}
     ~scoped_lock() = default;
 
@@ -495,13 +495,13 @@ public:
 private:
     mutex_type& __m_;
 public:
-    explicit scoped_lock(mutex_type & __m) _LIBCUDACXX_THREAD_SAFETY_ANNOTATION(acquire_capability(__m))
+    _LIBCUDACXX_INLINE_VISIBILITY explicit scoped_lock(mutex_type & __m) _LIBCUDACXX_THREAD_SAFETY_ANNOTATION(acquire_capability(__m))
         : __m_(__m) {__m_.lock();}
 
-    ~scoped_lock() _LIBCUDACXX_THREAD_SAFETY_ANNOTATION(release_capability()) {__m_.unlock();}
+    _LIBCUDACXX_INLINE_VISIBILITY ~scoped_lock() _LIBCUDACXX_THREAD_SAFETY_ANNOTATION(release_capability()) {__m_.unlock();}
 
     _LIBCUDACXX_INLINE_VISIBILITY
-    explicit scoped_lock(adopt_lock_t, mutex_type& __m) _LIBCUDACXX_THREAD_SAFETY_ANNOTATION(requires_capability(__m))
+    _LIBCUDACXX_INLINE_VISIBILITY explicit scoped_lock(adopt_lock_t, mutex_type& __m) _LIBCUDACXX_THREAD_SAFETY_ANNOTATION(requires_capability(__m))
         : __m_(__m) {}
 
     scoped_lock(scoped_lock const&) = delete;
@@ -550,61 +550,39 @@ private:
 #endif // _LIBCUDACXX_STD_VER > 14
 #endif // !_LIBCUDACXX_HAS_NO_THREADS
 
-struct _LIBCUDACXX_TEMPLATE_VIS once_flag;
-
-#ifndef _LIBCUDACXX_CXX03_LANG
-
-template<class _Callable, class... _Args>
-_LIBCUDACXX_INLINE_VISIBILITY
-void call_once(once_flag&, _Callable&&, _Args&&...);
-
-#else  // _LIBCUDACXX_CXX03_LANG
+template<int _Sco>
+struct _LIBCUDACXX_TEMPLATE_VIS __once_flag_base;
 
-template<class _Callable>
+template<int _Sco, class _Callable, class... _Args>
 _LIBCUDACXX_INLINE_VISIBILITY
-void call_once(once_flag&, _Callable&);
+void call_once(__once_flag_base<_Sco>&, _Callable&&, _Args&&...);
 
-template<class _Callable>
-_LIBCUDACXX_INLINE_VISIBILITY
-void call_once(once_flag&, const _Callable&);
-
-#endif  // _LIBCUDACXX_CXX03_LANG
-
-struct _LIBCUDACXX_TEMPLATE_VIS once_flag
+template<int _Sco>
+struct _LIBCUDACXX_TEMPLATE_VIS __once_flag_base
 {
-    _LIBCUDACXX_INLINE_VISIBILITY
-    _LIBCUDACXX_CONSTEXPR
-        once_flag() _NOEXCEPT : __state_(0) {}
+    constexpr __once_flag_base() noexcept = default;
 
 #if defined(_LIBCUDACXX_ABI_MICROSOFT)
-   typedef uintptr_t _State_type;
+    typedef uintptr_t _State_data_type;
 #else
-   typedef unsigned long _State_type;
+    typedef unsigned long _State_data_type;
 #endif
 
+#ifndef _LIBCUDACXX_INLINE_THREADING
+    using _State_type = _State_data_type;
+#else
+    using _State_type = atomic<_State_data_type>;
+#endif // _LIBCUDACXX_INLINE_THREADING
+
+    _State_type __state_{0};
 
 private:
-    once_flag(const once_flag&); // = delete;
-    once_flag& operator=(const once_flag&); // = delete;
-
-    _State_type __state_;
-
-#ifndef _LIBCUDACXX_CXX03_LANG
-    template<class _Callable, class... _Args>
-    friend
-    void call_once(once_flag&, _Callable&&, _Args&&...);
-#else  // _LIBCUDACXX_CXX03_LANG
-    template<class _Callable>
-    friend
-    void call_once(once_flag&, _Callable&);
-
-    template<class _Callable>
-    friend
-    void call_once(once_flag&, const _Callable&);
-#endif  // _LIBCUDACXX_CXX03_LANG
+
+    __once_flag_base(const __once_flag_base&) = delete;
+    __once_flag_base& operator=(const __once_flag_base&) = delete;
 };
 
-#ifndef _LIBCUDACXX_CXX03_LANG
+using once_flag = __once_flag_base<0>;
 
 template <class _Fp>
 class __call_once_param
@@ -630,82 +608,52 @@ private:
     }
 };
 
-#else
-
 template <class _Fp>
-class __call_once_param
-{
-    _Fp& __f_;
-public:
-    _LIBCUDACXX_INLINE_VISIBILITY
-    explicit __call_once_param(_Fp& __f) : __f_(__f) {}
-
-    _LIBCUDACXX_INLINE_VISIBILITY
-    void operator()()
-    {
-        __f_();
-    }
-};
-
-#endif
-
-template <class _Fp>
-void
-__call_once_proxy(void* __vp)
+_LIBCUDACXX_INLINE_VISIBILITY
+void __call_once_proxy(void* __vp)
 {
     __call_once_param<_Fp>* __p = static_cast<__call_once_param<_Fp>*>(__vp);
     (*__p)();
 }
 
-_LIBCUDACXX_FUNC_VIS void __call_once(volatile once_flag::_State_type&, void*,
-                                  void (*)(void*));
-
-#ifndef _LIBCUDACXX_CXX03_LANG
-
-template<class _Callable, class... _Args>
-inline _LIBCUDACXX_INLINE_VISIBILITY
-void
-call_once(once_flag& __flag, _Callable&& __func, _Args&&... __args)
-{
-    if (__libcpp_acquire_load(&__flag.__state_) != ~once_flag::_State_type(0))
-    {
-        typedef tuple<_Callable&&, _Args&&...> _Gp;
-        _Gp __f(_CUDA_VSTD::forward<_Callable>(__func), _CUDA_VSTD::forward<_Args>(__args)...);
-        __call_once_param<_Gp> __p(__f);
-        __call_once(__flag.__state_, &__p, &__call_once_proxy<_Gp>);
-    }
-}
-
-#else  // _LIBCUDACXX_CXX03_LANG
-
-template<class _Callable>
+#ifndef _LIBCUDACXX_INLINE_THREADING
+template<int _Sco>
+_LIBCUDACXX_FUNC_VIS
+void __call_once(volatile typename __once_flag_base<_Sco>::_State_type&, void*, void (*)(void*));
+#else
+template<int _Sco>
 inline _LIBCUDACXX_INLINE_VISIBILITY
-void
-call_once(once_flag& __flag, _Callable& __func)
-{
-    if (__libcpp_acquire_load(&__flag.__state_) != ~once_flag::_State_type(0))
+void __call_once(volatile typename __once_flag_base<_Sco>::_State_type& __s, void* __p, void (* __f)(void*))
     {
-        __call_once_param<_Callable> __p(__func);
-        __call_once(__flag.__state_, &__p, &__call_once_proxy<_Callable>);
+        typename __once_flag_base<_Sco>::_State_data_type __once_expect = 0;
+        if(__s.compare_exchange_strong(__once_expect, typename __once_flag_base<_Sco>::_State_data_type(1), memory_order_acquire))
+        {
+            __f(__p);
+            __s.store(~typename __once_flag_base<_Sco>::_State_data_type(0), memory_order_release);
+            __s.notify_all();
+        }
+        else if(__once_expect == 1)
+            __s.wait(__once_expect);
     }
-}
+#endif // _LIBCUDACXX_INLINE_THREADING
 
-template<class _Callable>
+template<int _Sco, class _Callable, class... _Args>
 inline _LIBCUDACXX_INLINE_VISIBILITY
-void
-call_once(once_flag& __flag, const _Callable& __func)
+void call_once(__once_flag_base<_Sco>& __flag, _Callable&& __func, _Args&&... __args)
 {
-    if (__libcpp_acquire_load(&__flag.__state_) != ~once_flag::_State_type(0))
+    if (__libcpp_acquire_load(&__flag.__state_) != ~typename __once_flag_base<_Sco>::_State_data_type(0))
     {
-        __call_once_param<const _Callable> __p(__func);
-        __call_once(__flag.__state_, &__p, &__call_once_proxy<const _Callable>);
+        typedef tuple<_Callable&&, _Args&&...> _Gp;
+        _Gp __f(_CUDA_VSTD::forward<_Callable>(__func), _CUDA_VSTD::forward<_Args>(__args)...);
+        __call_once_param<_Gp> __p(__f);
+        __call_once<_Sco>(__flag.__state_, &__p, &__call_once_proxy<_Gp>);
     }
 }
 
-#endif  // _LIBCUDACXX_CXX03_LANG
-
 _LIBCUDACXX_END_NAMESPACE_STD
 
-_LIBCUDACXX_POP_MACROS
+#ifndef __cuda_std__
+#include <__pragma_pop>
+#endif // __cuda_std__
 
 #endif  // _LIBCUDACXX_MUTEX
diff --git a/include/cuda/std/mutex b/include/cuda/std/mutex
new file mode 100644
index 0000000000..c60e649adc
--- /dev/null
+++ b/include/cuda/std/mutex
@@ -0,0 +1,26 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_STD_MUTEX
+#define _CUDA_STD_MUTEX
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700
+#  error "CUDA synchronization primitives are only supported for sm_70 and up."
+#endif
+
+#include "detail/__config"
+
+#include "detail/__pragma_push"
+
+#include "detail/libcxx/include/mutex"
+
+#include "detail/__pragma_pop"
+
+#endif //_CUDA_STD_MUTEX
diff --git a/libcxx/src/mutex.cpp b/libcxx/src/mutex.cpp
index 49352a005a..1744172f4d 100644
--- a/libcxx/src/mutex.cpp
+++ b/libcxx/src/mutex.cpp
@@ -27,6 +27,7 @@ const adopt_lock_t  adopt_lock{};
 
 // ~mutex is defined elsewhere
 
+template<>
 void
 mutex::lock()
 {
@@ -35,12 +36,14 @@ mutex::lock()
         __throw_system_error(ec, "mutex lock failed");
 }
 
+template<>
 bool
 mutex::try_lock() _NOEXCEPT
 {
     return __libcpp_mutex_trylock(&__m_);
 }
 
+template<>
 void
 mutex::unlock() _NOEXCEPT
 {
@@ -200,7 +203,8 @@ _LIBCUDACXX_SAFE_STATIC static __libcpp_mutex_t mut = _LIBCUDACXX_MUTEX_INITIALI
 _LIBCUDACXX_SAFE_STATIC static __libcpp_condvar_t cv = _LIBCUDACXX_CONDVAR_INITIALIZER;
 #endif
 
-void __call_once(volatile once_flag::_State_type& flag, void* arg,
+template<>
+void __call_once<0>(volatile typename __once_flag_base<0>::_State_type& flag, void* arg,
                  void (*func)(void*))
 {
 #if defined(_LIBCUDACXX_HAS_NO_THREADS)

From 495bc23b7f2ad30e0d2fde1cb5bb39515a07ed01 Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Thu, 30 Mar 2023 16:38:53 +0200
Subject: [PATCH 2/4] Add some tests

---
 .../assign.compile.fail.cpp                   |  36 ++
 .../thread.mutex.class/copy.compile.fail.cpp  |  35 ++
 .../thread.mutex.class/default.pass.cpp       |  32 ++
 .../thread.mutex.class/lock.pass.cpp          |  86 +++
 .../thread.mutex.class/try_lock.pass.cpp      |  90 +++
 .../assign.compile.fail.cpp                   |  36 ++
 .../copy.compile.fail.cpp                     |  35 ++
 .../thread.timedmutex.class/default.pass.cpp  |  31 +
 .../thread.timedmutex.class/lock.pass.cpp     |  86 +++
 .../thread.timedmutex.class/try_lock.pass.cpp |  90 +++
 .../try_lock_for.pass.cpp                     | 130 +++++
 .../try_lock_until.pass.cpp                   | 130 +++++
 .../thread.once.callonce/call_once.pass.cpp   | 313 ++++++++++
 .../thread.once.callonce/race.pass.cpp        |  91 +++
 .../assign.compile.fail.cpp                   |  37 ++
 .../copy.compile.fail.cpp                     |  36 ++
 .../thread.once.onceflag/default.pass.cpp     |  60 ++
 .../thread.lock.algorithm/lock.pass.cpp       | 543 ++++++++++++++++++
 .../thread.lock.algorithm/try_lock.pass.cpp   | 529 +++++++++++++++++
 .../thread.lock.guard/adopt_lock.pass.cpp     | 105 ++++
 .../thread.lock.guard/assign.compile.fail.cpp |  30 +
 .../thread.lock.guard/copy.compile.fail.cpp   |  28 +
 .../thread.lock.guard/implicit_ctad.pass.cpp  |  33 ++
 .../thread.lock.guard/mutex.fail.cpp          |  28 +
 .../thread.lock.guard/mutex.pass.cpp          | 117 ++++
 .../thread.lock.guard/types.pass.cpp          |  34 ++
 .../thread.lock.scoped/adopt_lock.pass.cpp    |  75 +++
 .../thread.lock.scoped/assign.fail.cpp        |  54 ++
 .../thread.lock.scoped/copy.fail.cpp          |  50 ++
 .../thread.lock.scoped/implicit_ctad.pass.cpp |  46 ++
 .../thread.lock.scoped/mutex.fail.cpp         |  57 ++
 .../thread.lock.scoped/mutex.pass.cpp         | 163 ++++++
 .../thread.lock.scoped/types.pass.cpp         |  85 +++
 .../thread.lock.unique/implicit_ctad.pass.cpp |  33 ++
 .../copy_assign.compile.fail.cpp              |  38 ++
 .../copy_ctor.compile.fail.cpp                |  36 ++
 .../thread.lock.unique.cons/default.pass.cpp  |  31 +
 .../move_assign.pass.cpp                      |  53 ++
 .../move_ctor.pass.cpp                        |  49 ++
 .../thread.lock.unique.cons/mutex.pass.cpp    |  94 +++
 .../mutex_adopt_lock.pass.cpp                 |  48 ++
 .../mutex_defer_lock.pass.cpp                 |  43 ++
 .../mutex_duration.pass.cpp                   | 102 ++++
 .../mutex_time_point.pass.cpp                 | 100 ++++
 .../mutex_try_to_lock.pass.cpp                |  92 +++
 .../thread.lock.unique.locking/lock.pass.cpp  | 101 ++++
 .../try_lock.pass.cpp                         |  75 +++
 .../try_lock_for.pass.cpp                     |  78 +++
 .../try_lock_until.pass.cpp                   |  83 +++
 .../unlock.pass.cpp                           |  65 +++
 .../member_swap.pass.cpp                      |  43 ++
 .../nonmember_swap.pass.cpp                   |  44 ++
 .../thread.lock.unique.mod/release.pass.cpp   |  48 ++
 .../thread.lock.unique.obs/mutex.pass.cpp     |  36 ++
 .../thread.lock.unique.obs/op_bool.pass.cpp   |  41 ++
 .../thread.lock.unique.obs/owns_lock.pass.cpp |  36 ++
 .../thread.lock.unique/types.pass.cpp         |  34 ++
 .../thread.mutex/thread.lock/types.fail.cpp   |  32 ++
 .../thread.mutex/thread.lock/types.pass.cpp   |  38 ++
 .../nothing_to_do.pass.cpp                    |  14 +
 .../assign.compile.fail.cpp                   |  28 +
 .../thread.mutex.class/copy.compile.fail.cpp  |  27 +
 .../thread.mutex.class/default.pass.cpp       |  30 +
 .../thread.mutex.class/lock.pass.cpp          |  75 +++
 .../thread.mutex.class/try_lock.pass.cpp      |  79 +++
 .../assign.compile.fail.cpp                   |  28 +
 .../copy.compile.fail.cpp                     |  27 +
 .../thread.timedmutex.class/default.pass.cpp  |  29 +
 .../thread.timedmutex.class/lock.pass.cpp     |  75 +++
 .../thread.timedmutex.class/try_lock.pass.cpp |  80 +++
 .../try_lock_for.pass.cpp                     |  99 ++++
 .../try_lock_until.pass.cpp                   |  99 ++++
 .../thread.once.callonce/call_once.pass.cpp   | 303 ++++++++++
 .../thread.once.callonce/race.pass.cpp        |  80 +++
 .../assign.compile.fail.cpp                   |  29 +
 .../copy.compile.fail.cpp                     |  28 +
 .../thread.once.onceflag/default.pass.cpp     |  36 ++
 .../support/heterogeneous_thread_handler.h    |  91 +++
 .../test/support/nasty_containers.h           |  24 +-
 79 files changed, 6073 insertions(+), 12 deletions(-)
 create mode 100644 .upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/assign.compile.fail.cpp
 create mode 100644 .upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/copy.compile.fail.cpp
 create mode 100644 .upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/default.pass.cpp
 create mode 100644 .upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/lock.pass.cpp
 create mode 100644 .upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/try_lock.pass.cpp
 create mode 100644 .upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/assign.compile.fail.cpp
 create mode 100644 .upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/copy.compile.fail.cpp
 create mode 100644 .upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/default.pass.cpp
 create mode 100644 .upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/lock.pass.cpp
 create mode 100644 .upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock.pass.cpp
 create mode 100644 .upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock_for.pass.cpp
 create mode 100644 .upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock_until.pass.cpp
 create mode 100644 .upstream-tests/test/cuda/thread/thread.mutex/thread.once/thread.once.callonce/call_once.pass.cpp
 create mode 100644 .upstream-tests/test/cuda/thread/thread.mutex/thread.once/thread.once.callonce/race.pass.cpp
 create mode 100644 .upstream-tests/test/cuda/thread/thread.mutex/thread.once/thread.once.onceflag/assign.compile.fail.cpp
 create mode 100644 .upstream-tests/test/cuda/thread/thread.mutex/thread.once/thread.once.onceflag/copy.compile.fail.cpp
 create mode 100644 .upstream-tests/test/cuda/thread/thread.mutex/thread.once/thread.once.onceflag/default.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock.algorithm/lock.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock.algorithm/try_lock.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/adopt_lock.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/assign.compile.fail.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/copy.compile.fail.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/implicit_ctad.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/mutex.fail.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/mutex.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/types.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.scoped/adopt_lock.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.scoped/assign.fail.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.scoped/copy.fail.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.scoped/implicit_ctad.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.scoped/mutex.fail.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.scoped/mutex.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.scoped/types.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/implicit_ctad.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_assign.compile.fail.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_ctor.compile.fail.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/default.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/move_assign.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/move_ctor.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_adopt_lock.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_defer_lock.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_duration.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_time_point.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_try_to_lock.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/lock.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_for.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_until.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/unlock.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/member_swap.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/nonmember_swap.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/release.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/mutex.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/op_bool.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/owns_lock.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/types.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/types.fail.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.lock/types.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.general/nothing_to_do.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/assign.compile.fail.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/copy.compile.fail.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/default.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/lock.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/try_lock.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/assign.compile.fail.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/copy.compile.fail.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/default.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/lock.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock_for.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock_until.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.once/thread.once.callonce/call_once.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.once/thread.once.callonce/race.pass.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.once/thread.once.onceflag/assign.compile.fail.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.once/thread.once.onceflag/copy.compile.fail.cpp
 create mode 100644 .upstream-tests/test/std/thread/thread.mutex/thread.once/thread.once.onceflag/default.pass.cpp
 create mode 100644 .upstream-tests/test/support/heterogeneous_thread_handler.h

diff --git a/.upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/assign.compile.fail.cpp b/.upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/assign.compile.fail.cpp
new file mode 100644
index 0000000000..06290a4997
--- /dev/null
+++ b/.upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/assign.compile.fail.cpp
@@ -0,0 +1,36 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: nvrtc
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// class mutex;
+
+// mutex& operator=(const mutex&) = delete;
+
+#include<cuda/mutex>
+
+template<int scope>
+void test() {
+    cuda::mutex<scope> m0;
+    cuda::mutex<scope> m1;
+    m1 = m0;
+}
+
+int main(int, char**)
+{
+  test<cuda::thread_scope_system>();
+  test<cuda::thread_scope_device>();
+  test<cuda::thread_scope_block>();
+  test<cuda::thread_scope_thread>();
+
+  return 0;
+}
diff --git a/.upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/copy.compile.fail.cpp b/.upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/copy.compile.fail.cpp
new file mode 100644
index 0000000000..92a96ef984
--- /dev/null
+++ b/.upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/copy.compile.fail.cpp
@@ -0,0 +1,35 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: nvrtc
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// class mutex;
+
+// mutex(const mutex&) = delete;
+
+#include<cuda/mutex>
+
+template<int scope>
+void test() {
+    cuda::mutex<scope> m0;
+    cuda::mutex<scope> m1{m0};
+}
+
+int main(int, char**)
+{
+  test<cuda::thread_scope_system>();
+  test<cuda::thread_scope_device>();
+  test<cuda::thread_scope_block>();
+  test<cuda::thread_scope_thread>();
+
+  return 0;
+}
diff --git a/.upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/default.pass.cpp b/.upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/default.pass.cpp
new file mode 100644
index 0000000000..df6af0a289
--- /dev/null
+++ b/.upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/default.pass.cpp
@@ -0,0 +1,32 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// class mutex;
+
+// mutex();
+
+#include<cuda/mutex>
+#include<cuda/std/type_traits>
+
+#include "test_macros.h"
+
+int main(int, char**)
+{
+  static_assert(cuda::std::is_nothrow_default_constructible<cuda::mutex<cuda::thread_scope_system>>::value, "");
+  static_assert(cuda::std::is_nothrow_default_constructible<cuda::mutex<cuda::thread_scope_device>>::value, "");
+  static_assert(cuda::std::is_nothrow_default_constructible<cuda::mutex<cuda::thread_scope_block>>::value, "");
+  static_assert(cuda::std::is_nothrow_default_constructible<cuda::mutex<cuda::thread_scope_thread>>::value, "");
+
+  return 0;
+}
diff --git a/.upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/lock.pass.cpp b/.upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/lock.pass.cpp
new file mode 100644
index 0000000000..33437a5e04
--- /dev/null
+++ b/.upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/lock.pass.cpp
@@ -0,0 +1,86 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// FLAKY_TEST.
+
+// <mutex>
+
+// class mutex;
+
+// void lock();
+
+#include<cuda/mutex>
+#include<cuda/std/cassert>
+#include<cuda/std/chrono>
+
+#include "test_macros.h"
+
+#include "cuda_space_selector.h"
+#include "heterogeneous_thread_handler.h"
+
+typedef cuda::std::chrono::system_clock Clock;
+typedef Clock::time_point time_point;
+typedef Clock::duration duration;
+typedef cuda::std::chrono::milliseconds ms;
+typedef cuda::std::chrono::nanoseconds ns;
+
+template<class Mutex>
+__host__ __device__ void f(Mutex* m)
+{
+    time_point t0 = Clock::now();
+    m->lock();
+    time_point t1 = Clock::now();
+    m->unlock();
+    assert(t1 - t0 >= ms(250));
+}
+
+template<class Mutex,
+    template<typename, typename> typename Selector,
+    typename Initializer = constructor_initializer>
+__host__ __device__ void test() {
+    Selector<Mutex, Initializer> sel;
+    SHARED Mutex* m;
+    m = sel.construct();
+
+    heterogeneous_thread_handler handler;
+    handler.run_on_first_thread(&Mutex::lock, m);
+    handler.syncthreads();
+
+    handler.run_on_second_thread(f<Mutex>, m);
+    handler.sleep_first_thread(ms(250));
+    handler.run_on_first_thread(&Mutex::unlock, m);
+    handler.join_test_thread();
+}
+
+int main(int, char**)
+{
+#ifndef __CUDA_ARCH__
+    cuda_thread_count = 2;
+
+    test<cuda::mutex<cuda::thread_scope_system>, local_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_device>, local_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_block>,  local_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_thread>, local_memory_selector>();
+#else
+    test<cuda::mutex<cuda::thread_scope_system>, shared_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_device>, shared_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_block>,  shared_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_thread>, shared_memory_selector>();
+
+    test<cuda::mutex<cuda::thread_scope_system>, global_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_device>, global_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_block>,  global_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_thread>, global_memory_selector>();
+#endif
+
+    return 0;
+}
diff --git a/.upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/try_lock.pass.cpp b/.upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/try_lock.pass.cpp
new file mode 100644
index 0000000000..4f6ab43853
--- /dev/null
+++ b/.upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/try_lock.pass.cpp
@@ -0,0 +1,90 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// FLAKY_TEST.
+
+// <mutex>
+
+// class mutex;
+
+// bool try_lock();
+
+#include<cuda/mutex>
+#include<cuda/std/cassert>
+#include<cuda/std/chrono>
+
+#include "test_macros.h"
+
+#include "cuda_space_selector.h"
+#include "heterogeneous_thread_handler.h"
+
+typedef cuda::std::chrono::system_clock Clock;
+typedef Clock::time_point time_point;
+typedef Clock::duration duration;
+typedef cuda::std::chrono::milliseconds ms;
+typedef cuda::std::chrono::nanoseconds ns;
+
+template<class Mutex>
+__host__ __device__ void f(Mutex* m)
+{
+    time_point t0 = Clock::now();
+    assert(!m->try_lock());
+    assert(!m->try_lock());
+    assert(!m->try_lock());
+    while(!m->try_lock())
+        ;
+    time_point t1 = Clock::now();
+    m->unlock();
+    assert(t1 - t0 >= ms(250));
+}
+
+template<class Mutex,
+    template<typename, typename> typename Selector,
+    typename Initializer = constructor_initializer>
+__host__ __device__ void test() {
+    Selector<Mutex, Initializer> sel;
+    SHARED Mutex* m;
+    m = sel.construct();
+
+    heterogeneous_thread_handler handler;
+    handler.run_on_first_thread(&Mutex::lock, m);
+    handler.syncthreads();
+
+    handler.run_on_second_thread(f<Mutex>, m);
+    handler.sleep_first_thread(ms(250));
+    handler.run_on_first_thread(&Mutex::unlock, m);
+    handler.join_test_thread();
+}
+
+int main(int, char**)
+{
+#ifndef __CUDA_ARCH__
+    cuda_thread_count = 2;
+
+    test<cuda::mutex<cuda::thread_scope_system>, local_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_device>, local_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_block>,  local_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_thread>, local_memory_selector>();
+#else
+    test<cuda::mutex<cuda::thread_scope_system>, shared_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_device>, shared_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_block>,  shared_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_thread>, shared_memory_selector>();
+
+    test<cuda::mutex<cuda::thread_scope_system>, global_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_device>, global_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_block>,  global_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_thread>, global_memory_selector>();
+#endif
+
+    return 0;
+}
diff --git a/.upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/assign.compile.fail.cpp b/.upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/assign.compile.fail.cpp
new file mode 100644
index 0000000000..71146d8f21
--- /dev/null
+++ b/.upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/assign.compile.fail.cpp
@@ -0,0 +1,36 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: nvrtc
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// class timed_mutex;
+
+// timed_mutex& operator=(const timed_mutex&) = delete;
+
+#include<cuda/mutex>
+
+template<int scope>
+void test() {
+    cuda::timed_mutex<scope> m0;
+    cuda::timed_mutex<scope> m1;
+    m1 = m0;
+}
+
+int main(int, char**)
+{
+  test<cuda::thread_scope_system>();
+  test<cuda::thread_scope_device>();
+  test<cuda::thread_scope_block>();
+  test<cuda::thread_scope_thread>();
+
+  return 0;
+}
diff --git a/.upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/copy.compile.fail.cpp b/.upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/copy.compile.fail.cpp
new file mode 100644
index 0000000000..154bc939df
--- /dev/null
+++ b/.upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/copy.compile.fail.cpp
@@ -0,0 +1,35 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: nvrtc
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// class timed_mutex;
+
+// timed_mutex(const timed_mutex&) = delete;
+
+#include<cuda/mutex>
+
+template<int scope>
+void test() {
+    cuda::timed_mutex<scope> m0;
+    cuda::timed_mutex<scope> m1{m0};
+}
+
+int main(int, char**)
+{
+  test<cuda::thread_scope_system>();
+  test<cuda::thread_scope_device>();
+  test<cuda::thread_scope_block>();
+  test<cuda::thread_scope_thread>();
+
+  return 0;
+}
diff --git a/.upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/default.pass.cpp b/.upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/default.pass.cpp
new file mode 100644
index 0000000000..f45024f339
--- /dev/null
+++ b/.upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/default.pass.cpp
@@ -0,0 +1,31 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// class timed_mutex;
+
+// timed_mutex();
+
+#include<cuda/mutex>
+
+#include "test_macros.h"
+
+int main(int, char**)
+{
+  static_assert(cuda::std::is_nothrow_default_constructible<cuda::timed_mutex<cuda::thread_scope_system>>::value, "");
+  static_assert(cuda::std::is_nothrow_default_constructible<cuda::timed_mutex<cuda::thread_scope_device>>::value, "");
+  static_assert(cuda::std::is_nothrow_default_constructible<cuda::timed_mutex<cuda::thread_scope_block>>::value, "");
+  static_assert(cuda::std::is_nothrow_default_constructible<cuda::timed_mutex<cuda::thread_scope_thread>>::value, "");
+
+  return 0;
+}
diff --git a/.upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/lock.pass.cpp b/.upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/lock.pass.cpp
new file mode 100644
index 0000000000..8bc95c8ff4
--- /dev/null
+++ b/.upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/lock.pass.cpp
@@ -0,0 +1,86 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// FLAKY_TEST.
+
+// <mutex>
+
+// class timed_mutex;
+
+// void lock();
+
+#include<cuda/mutex>
+#include<cuda/std/cassert>
+#include<cuda/std/chrono>
+
+#include "test_macros.h"
+
+#include "cuda_space_selector.h"
+#include "heterogeneous_thread_handler.h"
+
+typedef cuda::std::chrono::system_clock Clock;
+typedef Clock::time_point time_point;
+typedef Clock::duration duration;
+typedef cuda::std::chrono::milliseconds ms;
+typedef cuda::std::chrono::nanoseconds ns;
+
+template<class Mutex>
+__host__ __device__ void f(Mutex* m)
+{
+    time_point t0 = Clock::now();
+    m->lock();
+    time_point t1 = Clock::now();
+    m->unlock();
+    assert(t1 - t0 >= ms(250));
+}
+
+template<class Mutex,
+    template<typename, typename> typename Selector,
+    typename Initializer = constructor_initializer>
+__host__ __device__ void test() {
+    Selector<Mutex, Initializer> sel;
+    SHARED Mutex* m;
+    m = sel.construct();
+
+    heterogeneous_thread_handler handler;
+    handler.run_on_first_thread(&Mutex::lock, m);
+    handler.syncthreads();
+
+    handler.run_on_second_thread(f<Mutex>, m);
+    handler.sleep_first_thread(ms(250));
+    handler.run_on_first_thread(&Mutex::unlock, m);
+    handler.join_test_thread();
+}
+
+int main(int, char**)
+{
+#ifndef __CUDA_ARCH__
+    cuda_thread_count = 2;
+
+    test<cuda::mutex<cuda::thread_scope_system>, local_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_device>, local_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_block>,  local_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_thread>, local_memory_selector>();
+#else
+    test<cuda::mutex<cuda::thread_scope_system>, shared_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_device>, shared_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_block>,  shared_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_thread>, shared_memory_selector>();
+
+    test<cuda::mutex<cuda::thread_scope_system>, global_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_device>, global_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_block>,  global_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_thread>, global_memory_selector>();
+#endif
+
+  return 0;
+}
diff --git a/.upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock.pass.cpp b/.upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock.pass.cpp
new file mode 100644
index 0000000000..d88eb5fece
--- /dev/null
+++ b/.upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock.pass.cpp
@@ -0,0 +1,90 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// FLAKY_TEST.
+
+// <mutex>
+
+// class timed_mutex;
+
+// bool try_lock();
+
+#include<cuda/mutex>
+#include<cuda/std/cassert>
+#include<cuda/std/chrono>
+
+#include "test_macros.h"
+
+#include "cuda_space_selector.h"
+#include "heterogeneous_thread_handler.h"
+
+typedef cuda::std::chrono::system_clock Clock;
+typedef Clock::time_point time_point;
+typedef Clock::duration duration;
+typedef cuda::std::chrono::milliseconds ms;
+typedef cuda::std::chrono::nanoseconds ns;
+
+template<class Mutex>
+__host__ __device__ void f(Mutex* m)
+{
+    time_point t0 = Clock::now();
+    assert(!m->try_lock());
+    assert(!m->try_lock());
+    assert(!m->try_lock());
+    while(!m->try_lock())
+        ;
+    time_point t1 = Clock::now();
+    m->unlock();
+    assert(t1 - t0 >= ms(250));
+}
+
+template<class Mutex,
+    template<typename, typename> typename Selector,
+    typename Initializer = constructor_initializer>
+__host__ __device__ void test() {
+    Selector<Mutex, Initializer> sel;
+    SHARED Mutex* m;
+    m = sel.construct();
+
+    heterogeneous_thread_handler handler;
+    handler.run_on_first_thread(&Mutex::lock, m);
+    handler.syncthreads();
+
+    handler.run_on_second_thread(f<Mutex>, m);
+    handler.sleep_first_thread(ms(250));
+    handler.run_on_first_thread(&Mutex::unlock, m);
+    handler.join_test_thread();
+}
+
+int main(int, char**)
+{
+#ifndef __CUDA_ARCH__
+    cuda_thread_count = 2;
+
+    test<cuda::mutex<cuda::thread_scope_system>, local_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_device>, local_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_block>,  local_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_thread>, local_memory_selector>();
+#else
+    test<cuda::mutex<cuda::thread_scope_system>, shared_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_device>, shared_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_block>,  shared_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_thread>, shared_memory_selector>();
+
+    test<cuda::mutex<cuda::thread_scope_system>, global_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_device>, global_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_block>,  global_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_thread>, global_memory_selector>();
+#endif
+
+    return 0;
+}
diff --git a/.upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock_for.pass.cpp b/.upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock_for.pass.cpp
new file mode 100644
index 0000000000..2c8b365dd5
--- /dev/null
+++ b/.upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock_for.pass.cpp
@@ -0,0 +1,130 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+// UNSUPPORTED: true
+
+// FLAKY_TEST.
+
+// <mutex>
+
+// class timed_mutex;
+
+// template <class Rep, class Period>
+//     bool try_lock_for(const chrono::duration<Rep, Period>& rel_time);
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// FLAKY_TEST.
+
+// <mutex>
+
+// class timed_mutex;
+
+// bool try_lock();
+
+#include<cuda/mutex>
+#include<cuda/std/cassert>
+#include<cuda/std/chrono>
+
+#include "test_macros.h"
+
+#include "cuda_space_selector.h"
+#include "heterogeneous_thread_handler.h"
+
+typedef cuda::std::chrono::system_clock Clock;
+typedef Clock::time_point time_point;
+typedef Clock::duration duration;
+typedef cuda::std::chrono::milliseconds ms;
+typedef cuda::std::chrono::nanoseconds ns;
+
+template<class Mutex>
+__host__ __device__ void f1(Mutex* m)
+{
+    time_point t0 = Clock::now();
+    assert(m->try_lock_for(ms(300)) == true);
+    time_point t1 = Clock::now();
+    m->unlock();
+    assert(t1 - t0 >= ms(250));
+}
+
+template<class Mutex>
+__host__ __device__ void f2(Mutex* m)
+{
+    time_point t0 = Clock::now();
+    assert(m->try_lock_for(ms(250)) == false);
+    time_point t1 = Clock::now();
+    assert(t1 - t0 >= ms(250));
+}
+
+template<class Mutex,
+    template<typename, typename> typename Selector,
+    typename Initializer = constructor_initializer>
+__host__ __device__ void test() {
+    Selector<Mutex, Initializer> sel;
+    SHARED Mutex* m;
+    m = sel.construct();
+
+    heterogeneous_thread_handler handler;
+    {
+        handler.run_on_first_thread(&Mutex::lock, m);
+        handler.syncthreads();
+
+        handler.run_on_second_thread(f1<Mutex>, m);
+        handler.sleep_first_thread(ms(250));
+        handler.run_on_first_thread(&Mutex::unlock, m);
+        handler.join_test_thread();
+    }
+
+    handler.syncthreads();
+
+    {
+        handler.run_on_first_thread(&Mutex::lock, m);
+        handler.syncthreads();
+
+        handler.run_on_second_thread(f2<Mutex>, m);
+        handler.sleep_first_thread(ms(250));
+        handler.run_on_first_thread(&Mutex::unlock, m);
+        handler.join_test_thread();
+    }
+}
+
+int main(int, char**)
+{
+#ifndef __CUDA_ARCH__
+    cuda_thread_count = 2;
+
+    test<cuda::mutex<cuda::thread_scope_system>, local_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_device>, local_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_block>,  local_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_thread>, local_memory_selector>();
+#else
+    test<cuda::mutex<cuda::thread_scope_system>, shared_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_device>, shared_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_block>,  shared_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_thread>, shared_memory_selector>();
+
+    test<cuda::mutex<cuda::thread_scope_system>, global_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_device>, global_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_block>,  global_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_thread>, global_memory_selector>();
+#endif
+
+    return 0;
+}
diff --git a/.upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock_until.pass.cpp b/.upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock_until.pass.cpp
new file mode 100644
index 0000000000..5eadc2d835
--- /dev/null
+++ b/.upstream-tests/test/cuda/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock_until.pass.cpp
@@ -0,0 +1,130 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+// UNSUPPORTED: true
+
+// FLAKY_TEST.
+
+// <mutex>
+
+// class timed_mutex;
+
+// template <class Clock, class Duration>
+//     bool try_lock_until(const chrono::time_point<Clock, Duration>& abs_time);
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// FLAKY_TEST.
+
+// <mutex>
+
+// class timed_mutex;
+
+// bool try_lock();
+
+#include<cuda/mutex>
+#include<cuda/std/cassert>
+#include<cuda/std/chrono>
+
+#include "test_macros.h"
+
+#include "cuda_space_selector.h"
+#include "heterogeneous_thread_handler.h"
+
+typedef cuda::std::chrono::system_clock Clock;
+typedef Clock::time_point time_point;
+typedef Clock::duration duration;
+typedef cuda::std::chrono::milliseconds ms;
+typedef cuda::std::chrono::nanoseconds ns;
+
+template<class Mutex>
+__host__ __device__ void f1(Mutex* m)
+{
+    time_point t0 = Clock::now();
+    assert(m->try_lock_until(Clock::now() + ms(300)) == true);
+    time_point t1 = Clock::now();
+    m->unlock();
+    assert(t1 - t0 >= ms(250));
+}
+
+template<class Mutex>
+__host__ __device__ void f2(Mutex* m)
+{
+    time_point t0 = Clock::now();
+    assert(m->try_lock_until(Clock::now() + ms(200)) == false);
+    time_point t1 = Clock::now();
+    assert(t1 - t0 >= ms(250));
+}
+
+template<class Mutex,
+    template<typename, typename> typename Selector,
+    typename Initializer = constructor_initializer>
+__host__ __device__ void test() {
+    Selector<Mutex, Initializer> sel;
+    SHARED Mutex* m;
+    m = sel.construct();
+
+    heterogeneous_thread_handler handler;
+    {
+        handler.run_on_first_thread(&Mutex::lock, m);
+        handler.syncthreads();
+
+        handler.run_on_second_thread(f1<Mutex>, m);
+        handler.sleep_first_thread(ms(250));
+        handler.run_on_first_thread(&Mutex::unlock, m);
+        handler.join_test_thread();
+    }
+
+    handler.syncthreads();
+
+    {
+        handler.run_on_first_thread(&Mutex::lock, m);
+        handler.syncthreads();
+
+        handler.run_on_second_thread(f2<Mutex>, m);
+        handler.sleep_first_thread(ms(250));
+        handler.run_on_first_thread(&Mutex::unlock, m);
+        handler.join_test_thread();
+    }
+}
+
+int main(int, char**)
+{
+#ifndef __CUDA_ARCH__
+    cuda_thread_count = 2;
+
+    test<cuda::mutex<cuda::thread_scope_system>, local_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_device>, local_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_block>,  local_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_thread>, local_memory_selector>();
+#else
+    test<cuda::mutex<cuda::thread_scope_system>, shared_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_device>, shared_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_block>,  shared_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_thread>, shared_memory_selector>();
+
+    test<cuda::mutex<cuda::thread_scope_system>, global_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_device>, global_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_block>,  global_memory_selector>();
+    test<cuda::mutex<cuda::thread_scope_thread>, global_memory_selector>();
+#endif
+
+    return 0;
+}
diff --git a/.upstream-tests/test/cuda/thread/thread.mutex/thread.once/thread.once.callonce/call_once.pass.cpp b/.upstream-tests/test/cuda/thread/thread.mutex/thread.once/thread.once.callonce/call_once.pass.cpp
new file mode 100644
index 0000000000..be7ded4fde
--- /dev/null
+++ b/.upstream-tests/test/cuda/thread/thread.mutex/thread.once/thread.once.callonce/call_once.pass.cpp
@@ -0,0 +1,313 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// struct once_flag;
+
+// template<class Callable, class ...Args>
+//   void call_once(once_flag& flag, Callable&& func, Args&&... args);
+
+#include<cuda/mutex>
+#include<cuda/std/cassert>
+#include<cuda/std/chrono>
+
+#include "test_macros.h"
+
+#include "cuda_space_selector.h"
+#include "heterogeneous_thread_handler.h"
+
+typedef cuda::std::chrono::milliseconds ms;
+
+__host__ __device__
+void init0(int& init0_called)
+{
+    test_sleep_thread(ms(250));
+    ++init0_called;
+}
+
+template<class OnceFlag>
+__host__ __device__
+void f0(OnceFlag* flg0, int* init0_called)
+{
+    cuda::std::call_once(*flg0, init0, *init0_called);
+}
+
+#ifndef TEST_HAS_NO_EXCEPTIONS
+__host__ __device__
+void init3(int& init3_called, int& init3_completed)
+{
+    ++init3_called;
+    test_sleep_thread(ms(250));
+    if (init3_called == 1)
+#ifdef __CUDA_ARCH__
+        _LIBCUDACXX_UNREACHABLE();
+#else
+        TEST_THROW(1);
+#endif
+    ++init3_completed;
+}
+
+template<class OnceFlag>
+__host__ __device__
+void f3(OnceFlag* flg3, int* init3_called, int* init3_completed)
+{
+    try
+    {
+        cuda::std::call_once(*flg3, init3, *init3_called, *init3_completed);
+    }
+    catch (...)
+    {
+    }
+}
+#endif // TEST_HAS_NO_EXCEPTIONS
+
+#if TEST_STD_VER >= 11
+struct init1
+{
+    int& init1_called;
+    __host__ __device__ void operator()(int i) { init1_called += i; }
+};
+
+template<class OnceFlag>
+__host__ __device__
+void f1(OnceFlag* flg1, int* init1_called)
+{
+    cuda::std::call_once(*flg1, init1{*init1_called}, 1);
+}
+
+struct init2
+{
+    int& init2_called;
+    __host__ __device__ void operator()(int i, int j) const {init2_called += i + j;}
+};
+
+template<class OnceFlag>
+__host__ __device__
+void f2(OnceFlag* flg2, int* init2_called)
+{
+    cuda::std::call_once(*flg2, init2{*init2_called}, 2, 3);
+    cuda::std::call_once(*flg2, init2{*init2_called}, 4, 5);
+}
+
+#endif // TEST_STD_VER >= 11
+__host__ __device__
+void init41(int& init41_called)
+{
+    test_sleep_thread(ms(250));
+    ++init41_called;
+}
+
+__host__ __device__
+void init42(int& init42_called)
+{
+    test_sleep_thread(ms(250));
+    ++init42_called;
+}
+
+template<class OnceFlag>
+__host__ __device__
+void f41(OnceFlag* flg41, OnceFlag* flg42, int* init41_called, int* init42_called)
+{
+    cuda::std::call_once(*flg41, init41, *init41_called);
+    cuda::std::call_once(*flg42, init42, *init42_called);
+}
+
+template<class OnceFlag>
+__host__ __device__
+void f42(OnceFlag* flg41, OnceFlag* flg42, int* init41_called, int* init42_called)
+{
+    cuda::std::call_once(*flg41, init42, *init42_called);
+    cuda::std::call_once(*flg42, init41, *init41_called);
+}
+
+#if TEST_STD_VER >= 11
+
+class MoveOnly
+{
+    __host__ __device__ MoveOnly(const MoveOnly&);
+public:
+    __host__ __device__ MoveOnly() {}
+    __host__ __device__ MoveOnly(MoveOnly&&) {}
+
+    __host__ __device__ void operator()(MoveOnly&&) {}
+};
+
+class NonCopyable
+{
+    __host__ __device__ NonCopyable(const NonCopyable&);
+public:
+    __host__ __device__ NonCopyable() {}
+
+    __host__ __device__ void operator()(int&) {}
+};
+
+// reference qualifiers on functions are a C++11 extension
+struct RefQual
+{
+    __host__ __device__ void operator()( int& lv_called) & { ++lv_called; }
+    __host__ __device__ void operator()( int& rv_called) && { ++rv_called; }
+};
+
+#endif // TEST_STD_VER >= 11
+
+template<class OnceFlag,
+    template<typename, typename> typename Selector,
+    typename Initializer = constructor_initializer>
+__host__ __device__ void test() {
+    Selector<OnceFlag, Initializer> sel;
+    Selector<int, Initializer> sel_count;
+    heterogeneous_thread_handler handler;
+
+    // check basic functionality
+    {
+        SHARED OnceFlag* flg0;
+        flg0 = sel.construct();
+
+        SHARED int* init0_called;
+        init0_called = sel_count.construct();
+
+        handler.run_on_first_thread(f0<OnceFlag>, flg0, init0_called);
+        handler.run_on_second_thread(f0<OnceFlag>, flg0, init0_called);
+        handler.join_test_thread();
+        handler.syncthreads();
+        assert(*init0_called == 1);
+    }
+
+#ifndef TEST_HAS_NO_EXCEPTIONS
+    // check basic exception safety
+    {
+        SHARED OnceFlag* flg0;
+        flg0 = sel.construct();
+
+        SHARED int* init3_called;
+        init3_called = sel_count.construct();
+        SHARED int* init3_completed;
+        init3_completed = sel_count.construct();
+
+        handler.run_on_first_thread( f3<OnceFlag>, flg3, init3_called, init3_completed);
+        handler.run_on_second_thread(f3<OnceFlag>, flg3, init3_called, init3_completed);
+        handler.join_test_thread();
+        handler.syncthreads();
+
+        assert(*init3_called == 2);
+        assert(*init3_completed == 1);
+    }
+#endif
+    // check deadlock avoidance
+    {
+        SHARED OnceFlag* flg41;
+        flg41 = sel.construct();
+        SHARED OnceFlag* flg42;
+        flg42 = sel.construct();
+
+        SHARED int* init41_called;
+        init41_called = sel_count.construct();
+        SHARED int* init42_called;
+        init42_called = sel_count.construct();
+
+        handler.run_on_first_thread( f41<OnceFlag>, flg41, flg42, init41_called, init42_called);
+        handler.run_on_second_thread(f42<OnceFlag>, flg41, flg42, init41_called, init42_called);
+        handler.join_test_thread();
+        handler.syncthreads();
+
+        assert(*init41_called == 1);
+        assert(*init42_called == 1);
+    }
+#if TEST_STD_VER >= 11
+    // check functors with 1 arg
+    {
+        SHARED OnceFlag* flg1;
+        flg1 = sel.construct();
+
+        SHARED int* init1_called;
+        init1_called = sel_count.construct();
+
+        handler.run_on_first_thread( f1<OnceFlag>, flg1, init1_called);
+        handler.run_on_second_thread(f1<OnceFlag>, flg1, init1_called);
+        handler.join_test_thread();
+        handler.syncthreads();
+        assert(*init1_called == 1);
+    }
+    // check functors with 2 args
+    {
+        SHARED OnceFlag* flg2;
+        flg2 = sel.construct();
+
+        SHARED int* init2_called;
+        init2_called = sel_count.construct();
+
+        handler.run_on_first_thread( f2<OnceFlag>, flg2, init2_called);
+        handler.run_on_second_thread(f2<OnceFlag>, flg2, init2_called);
+        handler.join_test_thread();
+        handler.syncthreads();
+        assert(*init2_called == 5);
+    }
+
+    {
+        SHARED OnceFlag* f;
+        f = sel.construct();
+
+        cuda::call_once(*f, MoveOnly(), MoveOnly());
+    }
+    // check LWG2442: call_once() shouldn't DECAY_COPY()
+    {
+        SHARED OnceFlag* f;
+        f = sel.construct();
+        int i = 0;
+
+        cuda::call_once(*f, NonCopyable(), i);
+    }
+    // reference qualifiers on functions are a C++11 extension
+    {
+        SHARED OnceFlag* f1, *f2;
+        f1 = sel.construct();
+        f2 = sel.construct();
+
+        SHARED int* lv_called;
+        lv_called = sel_count.construct();
+
+        SHARED int* rv_called;
+        rv_called = sel_count.construct();
+
+        RefQual rq;
+        cuda::call_once(*f1, rq, *lv_called);
+        assert(*lv_called == 1);
+        cuda::call_once(*f2, cuda::std::move(rq), *rv_called);
+        assert(*rv_called == 1);
+    }
+#endif // TEST_STD_VER >= 11
+}
+
+int main(int, char**)
+{
+#ifndef __CUDA_ARCH__
+    cuda_thread_count = 2;
+
+    test<cuda::once_flag<cuda::thread_scope_system>, local_memory_selector>();
+    test<cuda::once_flag<cuda::thread_scope_device>, local_memory_selector>();
+    test<cuda::once_flag<cuda::thread_scope_block>,  local_memory_selector>();
+    test<cuda::once_flag<cuda::thread_scope_thread>, local_memory_selector>();
+#else
+    test<cuda::once_flag<cuda::thread_scope_system>, shared_memory_selector>();
+    test<cuda::once_flag<cuda::thread_scope_device>, shared_memory_selector>();
+    test<cuda::once_flag<cuda::thread_scope_block>,  shared_memory_selector>();
+    test<cuda::once_flag<cuda::thread_scope_thread>, shared_memory_selector>();
+
+    test<cuda::once_flag<cuda::thread_scope_system>, global_memory_selector>();
+    test<cuda::once_flag<cuda::thread_scope_device>, global_memory_selector>();
+    test<cuda::once_flag<cuda::thread_scope_block>,  global_memory_selector>();
+    test<cuda::once_flag<cuda::thread_scope_thread>, global_memory_selector>();
+#endif
+
+    return 0;
+}
diff --git a/.upstream-tests/test/cuda/thread/thread.mutex/thread.once/thread.once.callonce/race.pass.cpp b/.upstream-tests/test/cuda/thread/thread.mutex/thread.once/thread.once.callonce/race.pass.cpp
new file mode 100644
index 0000000000..2a0843560a
--- /dev/null
+++ b/.upstream-tests/test/cuda/thread/thread.mutex/thread.once/thread.once.callonce/race.pass.cpp
@@ -0,0 +1,91 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// struct once_flag;
+
+// template<class Callable, class ...Args>
+//   void call_once(once_flag& flag, Callable&& func, Args&&... args);
+
+// This test is supposed to be run with ThreadSanitizer and verifies that
+// call_once properly synchronizes user state, a data race that was fixed
+// in r280621.
+
+#include<cuda/mutex>
+#include<cuda/std/cassert>
+#include<cuda/std/chrono>
+
+#include "test_macros.h"
+
+#include "cuda_space_selector.h"
+#include "heterogeneous_thread_handler.h"
+
+typedef cuda::std::chrono::milliseconds ms;
+
+__host__ __device__
+void init0(int& global)
+{
+    ++global;
+}
+
+template<class OnceFlag>
+__host__ __device__
+void f0(OnceFlag* flg, int* global)
+{
+    cuda::call_once(*flg, init0, *global);
+    assert(*global == 1);
+}
+
+template<class OnceFlag,
+    template<typename, typename> typename Selector,
+    typename Initializer = constructor_initializer>
+__host__ __device__ void test() {
+    Selector<OnceFlag, Initializer> sel;
+    SHARED OnceFlag* flg;
+    flg = sel.construct();
+
+    Selector<int, Initializer> sel_count;
+    SHARED int* global;
+    global = sel_count.construct();
+
+    heterogeneous_thread_handler handler;
+    handler.run_on_first_thread(f0<OnceFlag>, flg, global);
+    handler.run_on_second_thread(f0<OnceFlag>, flg, global);
+    handler.syncthreads();
+    handler.join_test_thread();
+    assert(*global == 1);
+}
+
+int main(int, char**)
+{
+#ifndef __CUDA_ARCH__
+    cuda_thread_count = 2;
+
+    test<cuda::once_flag<cuda::thread_scope_system>, local_memory_selector>();
+    test<cuda::once_flag<cuda::thread_scope_device>, local_memory_selector>();
+    test<cuda::once_flag<cuda::thread_scope_block>,  local_memory_selector>();
+    test<cuda::once_flag<cuda::thread_scope_thread>, local_memory_selector>();
+#else
+    test<cuda::once_flag<cuda::thread_scope_system>, shared_memory_selector>();
+    test<cuda::once_flag<cuda::thread_scope_device>, shared_memory_selector>();
+    test<cuda::once_flag<cuda::thread_scope_block>,  shared_memory_selector>();
+    test<cuda::once_flag<cuda::thread_scope_thread>, shared_memory_selector>();
+
+    test<cuda::once_flag<cuda::thread_scope_system>, global_memory_selector>();
+    test<cuda::once_flag<cuda::thread_scope_device>, global_memory_selector>();
+    test<cuda::once_flag<cuda::thread_scope_block>,  global_memory_selector>();
+    test<cuda::once_flag<cuda::thread_scope_thread>, global_memory_selector>();
+#endif
+
+    return 0;
+}
diff --git a/.upstream-tests/test/cuda/thread/thread.mutex/thread.once/thread.once.onceflag/assign.compile.fail.cpp b/.upstream-tests/test/cuda/thread/thread.mutex/thread.once/thread.once.onceflag/assign.compile.fail.cpp
new file mode 100644
index 0000000000..ccc5c15224
--- /dev/null
+++ b/.upstream-tests/test/cuda/thread/thread.mutex/thread.once/thread.once.onceflag/assign.compile.fail.cpp
@@ -0,0 +1,37 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: nvrtc
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// struct once_flag;
+
+// once_flag& operator=(const once_flag&) = delete;
+
+#include<cuda/mutex>
+
+template<int scope>
+void test() {
+    cuda::once_flag<scope> f0;
+    cuda::once_flag<scope> f1;
+    f1 = f0;
+}
+
+int main(int, char**)
+{
+  test<cuda::thread_scope_system>();
+  test<cuda::thread_scope_device>();
+  test<cuda::thread_scope_block>();
+  test<cuda::thread_scope_thread>();
+
+  return 0;
+}
diff --git a/.upstream-tests/test/cuda/thread/thread.mutex/thread.once/thread.once.onceflag/copy.compile.fail.cpp b/.upstream-tests/test/cuda/thread/thread.mutex/thread.once/thread.once.onceflag/copy.compile.fail.cpp
new file mode 100644
index 0000000000..55694b5b5d
--- /dev/null
+++ b/.upstream-tests/test/cuda/thread/thread.mutex/thread.once/thread.once.onceflag/copy.compile.fail.cpp
@@ -0,0 +1,36 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: nvrtc
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// struct once_flag;
+
+// once_flag(const once_flag&) = delete;
+
+#include<cuda/mutex>
+
+template<int scope>
+void test() {
+    cuda::once_flag<scope> f0;
+    cuda::once_flag<scope> f1{f0};
+}
+
+int main(int, char**)
+{
+  test<cuda::thread_scope_system>();
+  test<cuda::thread_scope_device>();
+  test<cuda::thread_scope_block>();
+  test<cuda::thread_scope_thread>();
+
+  return 0;
+}
diff --git a/.upstream-tests/test/cuda/thread/thread.mutex/thread.once/thread.once.onceflag/default.pass.cpp b/.upstream-tests/test/cuda/thread/thread.mutex/thread.once/thread.once.onceflag/default.pass.cpp
new file mode 100644
index 0000000000..1febee3403
--- /dev/null
+++ b/.upstream-tests/test/cuda/thread/thread.mutex/thread.once/thread.once.onceflag/default.pass.cpp
@@ -0,0 +1,60 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// struct once_flag;
+
+// constexpr once_flag() noexcept;
+
+#include<cuda/mutex>
+#include "test_macros.h"
+
+int main(int, char**)
+{
+    {
+    cuda::once_flag<cuda::thread_scope_system> f;
+    unused(f);
+    }
+    {
+    cuda::once_flag<cuda::thread_scope_device> f;
+    unused(f);
+    }
+    {
+    cuda::once_flag<cuda::thread_scope_block> f;
+    unused(f);
+    }
+    {
+    cuda::once_flag<cuda::thread_scope_thread> f;
+    unused(f);
+    }
+#if TEST_STD_VER >= 11
+    {
+    constexpr cuda::once_flag<cuda::thread_scope_system> f;
+    unused(f);
+    }
+    {
+    constexpr cuda::once_flag<cuda::thread_scope_device> f;
+    unused(f);
+    }
+    {
+    constexpr cuda::once_flag<cuda::thread_scope_block> f;
+    unused(f);
+    }
+    {
+    constexpr cuda::once_flag<cuda::thread_scope_thread> f;
+    unused(f);
+    }
+#endif
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock.algorithm/lock.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock.algorithm/lock.pass.cpp
new file mode 100644
index 0000000000..214d5d036e
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock.algorithm/lock.pass.cpp
@@ -0,0 +1,543 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// This test hangs forever when built against libstdc++ (Oct 2016).
+// UNSUPPORTED: stdlib=libstdc++
+
+// This test isn't quite standards-conforming: it's testing our specific
+// algorithm, where when lx.try_lock() fails we start the next attempt
+// with an unconditional lx.lock(). Thus our algorithm can handle a list
+// of mutexes where at-most-one of them is of the evil type `class L1`,
+// but will loop forever if two or more of them are `class L1`.
+
+// <mutex>
+
+// template <class L1, class L2, class... L3>
+//   void lock(L1&, L2&, L3&...);
+
+#include<cuda/std/mutex>
+#include<cuda/std/cassert>
+
+#include "test_macros.h"
+
+class L0
+{
+    bool locked_;
+
+public:
+    __host__ __device__ L0() : locked_(false) {}
+
+    __host__ __device__ void lock()
+    {
+        locked_ = true;
+    }
+
+    __host__ __device__ bool try_lock()
+    {
+        locked_ = true;
+        return locked_;
+    }
+
+    __host__ __device__ void unlock() {locked_ = false;}
+
+    __host__ __device__ bool locked() const {return locked_;}
+};
+
+class L1
+{
+    bool locked_;
+
+public:
+    __host__ __device__ L1() : locked_(false) {}
+
+    __host__ __device__ void lock()
+    {
+        locked_ = true;
+    }
+
+    __host__ __device__ bool try_lock()
+    {
+        locked_ = false;
+        return locked_;
+    }
+
+    __host__ __device__ void unlock() {locked_ = false;}
+
+    __host__ __device__ bool locked() const {return locked_;}
+};
+
+class L2
+{
+    bool locked_;
+
+public:
+    __host__ __device__ L2() : locked_(false) {}
+
+    __host__ __device__ void lock()
+    {
+        TEST_THROW(1);
+    }
+
+    __host__ __device__ bool try_lock()
+    {
+        TEST_THROW(1);
+        return locked_;
+    }
+
+    __host__ __device__ void unlock() {locked_ = false;}
+
+    __host__ __device__ bool locked() const {return locked_;}
+};
+
+__host__ __device__
+void with_one_or_two_locks() {
+    {
+        L0 l0;
+        L0 l1;
+        cuda::std::lock(l0, l1);
+        assert(l0.locked());
+        assert(l1.locked());
+    }
+    {
+        L0 l0;
+        L1 l1;
+        cuda::std::lock(l0, l1);
+        assert(l0.locked());
+        assert(l1.locked());
+    }
+    {
+        L1 l0;
+        L0 l1;
+        cuda::std::lock(l0, l1);
+        assert(l0.locked());
+        assert(l1.locked());
+    }
+#ifndef TEST_HAS_NO_EXCEPTIONS
+    {
+        L0 l0;
+        L2 l1;
+        try
+        {
+            cuda::std::lock(l0, l1);
+            assert(false);
+        }
+        catch (int)
+        {
+            assert(!l0.locked());
+            assert(!l1.locked());
+        }
+    }
+    {
+        L2 l0;
+        L0 l1;
+        try
+        {
+            cuda::std::lock(l0, l1);
+            assert(false);
+        }
+        catch (int)
+        {
+            assert(!l0.locked());
+            assert(!l1.locked());
+        }
+    }
+    {
+        L1 l0;
+        L2 l1;
+        try
+        {
+            cuda::std::lock(l0, l1);
+            assert(false);
+        }
+        catch (int)
+        {
+            assert(!l0.locked());
+            assert(!l1.locked());
+        }
+    }
+    {
+        L2 l0;
+        L1 l1;
+        try
+        {
+            cuda::std::lock(l0, l1);
+            assert(false);
+        }
+        catch (int)
+        {
+            assert(!l0.locked());
+            assert(!l1.locked());
+        }
+    }
+    {
+        L2 l0;
+        L2 l1;
+        try
+        {
+            cuda::std::lock(l0, l1);
+            assert(false);
+        }
+        catch (int)
+        {
+            assert(!l0.locked());
+            assert(!l1.locked());
+        }
+    }
+#endif
+}
+
+__host__ __device__
+void with_three_locks() {
+    {
+        L0 l0;
+        L0 l1;
+        L0 l2;
+        cuda::std::lock(l0, l1, l2);
+        assert(l0.locked());
+        assert(l1.locked());
+        assert(l2.locked());
+    }
+#ifndef TEST_HAS_NO_EXCEPTIONS
+    {
+        L2 l0;
+        L2 l1;
+        L2 l2;
+        try
+        {
+            cuda::std::lock(l0, l1, l2);
+            assert(false);
+        }
+        catch (int)
+        {
+            assert(!l0.locked());
+            assert(!l1.locked());
+            assert(!l2.locked());
+        }
+    }
+#endif
+    {
+        L0 l0;
+        L0 l1;
+        L1 l2;
+        cuda::std::lock(l0, l1, l2);
+        assert(l0.locked());
+        assert(l1.locked());
+        assert(l2.locked());
+    }
+    {
+        L0 l0;
+        L1 l1;
+        L0 l2;
+        cuda::std::lock(l0, l1, l2);
+        assert(l0.locked());
+        assert(l1.locked());
+        assert(l2.locked());
+    }
+    {
+        L1 l0;
+        L0 l1;
+        L0 l2;
+        cuda::std::lock(l0, l1, l2);
+        assert(l0.locked());
+        assert(l1.locked());
+        assert(l2.locked());
+    }
+#ifndef TEST_HAS_NO_EXCEPTIONS
+    {
+        L0 l0;
+        L0 l1;
+        L2 l2;
+        try
+        {
+            cuda::std::lock(l0, l1, l2);
+            assert(false);
+        }
+        catch (int)
+        {
+            assert(!l0.locked());
+            assert(!l1.locked());
+            assert(!l2.locked());
+        }
+    }
+    {
+        L0 l0;
+        L2 l1;
+        L0 l2;
+        try
+        {
+            cuda::std::lock(l0, l1, l2);
+            assert(false);
+        }
+        catch (int)
+        {
+            assert(!l0.locked());
+            assert(!l1.locked());
+            assert(!l2.locked());
+        }
+    }
+    {
+        L2 l0;
+        L0 l1;
+        L0 l2;
+        try
+        {
+            cuda::std::lock(l0, l1, l2);
+            assert(false);
+        }
+        catch (int)
+        {
+            assert(!l0.locked());
+            assert(!l1.locked());
+            assert(!l2.locked());
+        }
+    }
+    {
+        L2 l0;
+        L2 l1;
+        L0 l2;
+        try
+        {
+            cuda::std::lock(l0, l1, l2);
+            assert(false);
+        }
+        catch (int)
+        {
+            assert(!l0.locked());
+            assert(!l1.locked());
+            assert(!l2.locked());
+        }
+    }
+    {
+        L2 l0;
+        L0 l1;
+        L2 l2;
+        try
+        {
+            cuda::std::lock(l0, l1, l2);
+            assert(false);
+        }
+        catch (int)
+        {
+            assert(!l0.locked());
+            assert(!l1.locked());
+            assert(!l2.locked());
+        }
+    }
+    {
+        L0 l0;
+        L2 l1;
+        L2 l2;
+        try
+        {
+            cuda::std::lock(l0, l1, l2);
+            assert(false);
+        }
+        catch (int)
+        {
+            assert(!l0.locked());
+            assert(!l1.locked());
+            assert(!l2.locked());
+        }
+    }
+    {
+        L2 l0;
+        L2 l1;
+        L1 l2;
+        try
+        {
+            cuda::std::lock(l0, l1, l2);
+            assert(false);
+        }
+        catch (int)
+        {
+            assert(!l0.locked());
+            assert(!l1.locked());
+            assert(!l2.locked());
+        }
+    }
+    {
+        L2 l0;
+        L1 l1;
+        L2 l2;
+        try
+        {
+            cuda::std::lock(l0, l1, l2);
+            assert(false);
+        }
+        catch (int)
+        {
+            assert(!l0.locked());
+            assert(!l1.locked());
+            assert(!l2.locked());
+        }
+    }
+    {
+        L1 l0;
+        L2 l1;
+        L2 l2;
+        try
+        {
+            cuda::std::lock(l0, l1, l2);
+            assert(false);
+        }
+        catch (int)
+        {
+            assert(!l0.locked());
+            assert(!l1.locked());
+            assert(!l2.locked());
+        }
+    }
+#endif // TEST_HAS_NO_EXCEPTIONS
+}
+
+__host__ __device__
+void with_four_locks() {
+{
+        L0 l0;
+        L0 l1;
+        L0 l2;
+        L0 l3;
+        cuda::std::lock(l0, l1, l2, l3);
+        assert(l0.locked());
+        assert(l1.locked());
+        assert(l2.locked());
+        assert(l3.locked());
+    }
+    {
+        L0 l0;
+        L0 l1;
+        L0 l2;
+        L1 l3;
+        cuda::std::lock(l0, l1, l2, l3);
+        assert(l0.locked());
+        assert(l1.locked());
+        assert(l2.locked());
+        assert(l3.locked());
+    }
+    {
+        L0 l0;
+        L0 l1;
+        L1 l2;
+        L0 l3;
+        cuda::std::lock(l0, l1, l2, l3);
+        assert(l0.locked());
+        assert(l1.locked());
+        assert(l2.locked());
+        assert(l3.locked());
+    }
+    {
+        L0 l0;
+        L1 l1;
+        L0 l2;
+        L0 l3;
+        cuda::std::lock(l0, l1, l2, l3);
+        assert(l0.locked());
+        assert(l1.locked());
+        assert(l2.locked());
+        assert(l3.locked());
+    }
+    {
+        L1 l0;
+        L0 l1;
+        L0 l2;
+        L0 l3;
+        cuda::std::lock(l0, l1, l2, l3);
+        assert(l0.locked());
+        assert(l1.locked());
+        assert(l2.locked());
+        assert(l3.locked());
+    }
+#ifndef TEST_HAS_NO_EXCEPTIONS
+    {
+        L0 l0;
+        L0 l1;
+        L0 l2;
+        L2 l3;
+        try
+        {
+            cuda::std::lock(l0, l1, l2, l3);
+            assert(false);
+        }
+        catch (int)
+        {
+            assert(!l0.locked());
+            assert(!l1.locked());
+            assert(!l2.locked());
+            assert(!l3.locked());
+        }
+    }
+    {
+        L0 l0;
+        L0 l1;
+        L2 l2;
+        L0 l3;
+        try
+        {
+            cuda::std::lock(l0, l1, l2, l3);
+            assert(false);
+        }
+        catch (int)
+        {
+            assert(!l0.locked());
+            assert(!l1.locked());
+            assert(!l2.locked());
+            assert(!l3.locked());
+        }
+    }
+    {
+        L0 l0;
+        L2 l1;
+        L0 l2;
+        L0 l3;
+        try
+        {
+            cuda::std::lock(l0, l1, l2, l3);
+            assert(false);
+        }
+        catch (int)
+        {
+            assert(!l0.locked());
+            assert(!l1.locked());
+            assert(!l2.locked());
+            assert(!l3.locked());
+        }
+    }
+    {
+        L2 l0;
+        L0 l1;
+        L0 l2;
+        L0 l3;
+        try
+        {
+            cuda::std::lock(l0, l1, l2, l3);
+            assert(false);
+        }
+        catch (int)
+        {
+            assert(!l0.locked());
+            assert(!l1.locked());
+            assert(!l2.locked());
+            assert(!l3.locked());
+        }
+    }
+#endif // TEST_HAS_NO_EXCEPTIONS
+}
+
+int main(int, char**)
+{
+    with_one_or_two_locks();
+    with_three_locks();
+#ifndef __CUDA_ARCH__ // explodes stack space
+    with_four_locks();
+#endif
+    return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock.algorithm/try_lock.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock.algorithm/try_lock.pass.cpp
new file mode 100644
index 0000000000..1ba73406c0
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock.algorithm/try_lock.pass.cpp
@@ -0,0 +1,529 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// template <class L1, class L2, class... L3>
+//   int try_lock(L1&, L2&, L3&...);
+
+#include<cuda/std/mutex>
+#include<cuda/std/cassert>
+
+#include "test_macros.h"
+
+class L0
+{
+    bool locked_;
+
+public:
+    __host__ __device__ L0() : locked_(false) {}
+
+    __host__ __device__ bool try_lock()
+    {
+        locked_ = true;
+        return locked_;
+    }
+
+    __host__ __device__ void unlock() {locked_ = false;}
+
+    __host__ __device__ bool locked() const {return locked_;}
+};
+
+class L1
+{
+    bool locked_;
+
+public:
+    __host__ __device__ L1() : locked_(false) {}
+
+    __host__ __device__ bool try_lock()
+    {
+        locked_ = false;
+        return locked_;
+    }
+
+    __host__ __device__ void unlock() {locked_ = false;}
+
+    __host__ __device__ bool locked() const {return locked_;}
+};
+
+class L2
+{
+    bool locked_;
+
+public:
+    __host__ __device__ L2() : locked_(false) {}
+
+    __host__ __device__ bool try_lock()
+    {
+        TEST_THROW(1);
+        return locked_;
+    }
+
+    __host__ __device__ void unlock() {locked_ = false;}
+
+    __host__ __device__ bool locked() const {return locked_;}
+};
+
+int main(int, char**)
+{
+    {
+        L0 l0;
+        L0 l1;
+        assert(cuda::std::try_lock(l0, l1) == -1);
+        assert(l0.locked());
+        assert(l1.locked());
+    }
+    {
+        L0 l0;
+        L1 l1;
+        assert(cuda::std::try_lock(l0, l1) == 1);
+        assert(!l0.locked());
+        assert(!l1.locked());
+    }
+    {
+        L1 l0;
+        L0 l1;
+        assert(cuda::std::try_lock(l0, l1) == 0);
+        assert(!l0.locked());
+        assert(!l1.locked());
+    }
+#ifndef TEST_HAS_NO_EXCEPTIONS
+    {
+        L0 l0;
+        L2 l1;
+        try
+        {
+            (void)cuda::std::try_lock(l0, l1);
+            assert(false);
+        }
+        catch (int)
+        {
+            assert(!l0.locked());
+            assert(!l1.locked());
+        }
+    }
+    {
+        L2 l0;
+        L0 l1;
+        try
+        {
+            (void)cuda::std::try_lock(l0, l1);
+            assert(false);
+        }
+        catch (int)
+        {
+            assert(!l0.locked());
+            assert(!l1.locked());
+        }
+    }
+#endif
+#if TEST_STD_VER >= 11
+    {
+        L0 l0;
+        L0 l1;
+        L0 l2;
+        assert(cuda::std::try_lock(l0, l1, l2) == -1);
+        assert(l0.locked());
+        assert(l1.locked());
+        assert(l2.locked());
+    }
+    {
+        L1 l0;
+        L1 l1;
+        L1 l2;
+        assert(cuda::std::try_lock(l0, l1, l2) == 0);
+        assert(!l0.locked());
+        assert(!l1.locked());
+        assert(!l2.locked());
+    }
+#ifndef TEST_HAS_NO_EXCEPTIONS
+    {
+        L2 l0;
+        L2 l1;
+        L2 l2;
+        try
+        {
+            (void)cuda::std::try_lock(l0, l1, l2);
+            assert(false);
+        }
+        catch (int)
+        {
+            assert(!l0.locked());
+            assert(!l1.locked());
+            assert(!l2.locked());
+        }
+    }
+    {
+        L0 l0;
+        L1 l1;
+        L2 l2;
+        assert(cuda::std::try_lock(l0, l1, l2) == 1);
+        assert(!l0.locked());
+        assert(!l1.locked());
+        assert(!l2.locked());
+    }
+#endif
+    {
+        L0 l0;
+        L0 l1;
+        L1 l2;
+        assert(cuda::std::try_lock(l0, l1, l2) == 2);
+        assert(!l0.locked());
+        assert(!l1.locked());
+        assert(!l2.locked());
+    }
+    {
+        L0 l0;
+        L1 l1;
+        L0 l2;
+        assert(cuda::std::try_lock(l0, l1, l2) == 1);
+        assert(!l0.locked());
+        assert(!l1.locked());
+        assert(!l2.locked());
+    }
+    {
+        L1 l0;
+        L0 l1;
+        L0 l2;
+        assert(cuda::std::try_lock(l0, l1, l2) == 0);
+        assert(!l0.locked());
+        assert(!l1.locked());
+        assert(!l2.locked());
+    }
+#ifndef TEST_HAS_NO_EXCEPTIONS
+    {
+        L0 l0;
+        L0 l1;
+        L2 l2;
+        try
+        {
+            (void)cuda::std::try_lock(l0, l1, l2);
+            assert(false);
+        }
+        catch (int)
+        {
+            assert(!l0.locked());
+            assert(!l1.locked());
+            assert(!l2.locked());
+        }
+    }
+    {
+        L0 l0;
+        L2 l1;
+        L0 l2;
+        try
+        {
+            (void)cuda::std::try_lock(l0, l1, l2);
+            assert(false);
+        }
+        catch (int)
+        {
+            assert(!l0.locked());
+            assert(!l1.locked());
+            assert(!l2.locked());
+        }
+    }
+    {
+        L2 l0;
+        L0 l1;
+        L0 l2;
+        try
+        {
+            (void)cuda::std::try_lock(l0, l1, l2);
+            assert(false);
+        }
+        catch (int)
+        {
+            assert(!l0.locked());
+            assert(!l1.locked());
+            assert(!l2.locked());
+        }
+    }
+#endif
+    {
+        L1 l0;
+        L1 l1;
+        L0 l2;
+        assert(cuda::std::try_lock(l0, l1, l2) == 0);
+        assert(!l0.locked());
+        assert(!l1.locked());
+        assert(!l2.locked());
+    }
+    {
+        L1 l0;
+        L0 l1;
+        L1 l2;
+        assert(cuda::std::try_lock(l0, l1, l2) == 0);
+        assert(!l0.locked());
+        assert(!l1.locked());
+        assert(!l2.locked());
+    }
+    {
+        L0 l0;
+        L1 l1;
+        L1 l2;
+        assert(cuda::std::try_lock(l0, l1, l2) == 1);
+        assert(!l0.locked());
+        assert(!l1.locked());
+        assert(!l2.locked());
+    }
+#ifndef TEST_HAS_NO_EXCEPTIONS
+    {
+        L1 l0;
+        L1 l1;
+        L2 l2;
+        assert(cuda::std::try_lock(l0, l1, l2) == 0);
+        assert(!l0.locked());
+        assert(!l1.locked());
+        assert(!l2.locked());
+    }
+    {
+        L1 l0;
+        L2 l1;
+        L1 l2;
+        assert(cuda::std::try_lock(l0, l1, l2) == 0);
+        assert(!l0.locked());
+        assert(!l1.locked());
+        assert(!l2.locked());
+    }
+    {
+        L2 l0;
+        L1 l1;
+        L1 l2;
+        try
+        {
+            (void)cuda::std::try_lock(l0, l1, l2);
+            assert(false);
+        }
+        catch (int)
+        {
+            assert(!l0.locked());
+            assert(!l1.locked());
+            assert(!l2.locked());
+        }
+    }
+    {
+        L2 l0;
+        L2 l1;
+        L0 l2;
+        try
+        {
+            (void)cuda::std::try_lock(l0, l1, l2);
+            assert(false);
+        }
+        catch (int)
+        {
+            assert(!l0.locked());
+            assert(!l1.locked());
+            assert(!l2.locked());
+        }
+    }
+    {
+        L2 l0;
+        L0 l1;
+        L2 l2;
+        try
+        {
+            (void)cuda::std::try_lock(l0, l1, l2);
+            assert(false);
+        }
+        catch (int)
+        {
+            assert(!l0.locked());
+            assert(!l1.locked());
+            assert(!l2.locked());
+        }
+    }
+    {
+        L0 l0;
+        L2 l1;
+        L2 l2;
+        try
+        {
+            (void)cuda::std::try_lock(l0, l1, l2);
+            assert(false);
+        }
+        catch (int)
+        {
+            assert(!l0.locked());
+            assert(!l1.locked());
+            assert(!l2.locked());
+        }
+    }
+    {
+        L2 l0;
+        L2 l1;
+        L1 l2;
+        try
+        {
+            (void)cuda::std::try_lock(l0, l1, l2);
+            assert(false);
+        }
+        catch (int)
+        {
+            assert(!l0.locked());
+            assert(!l1.locked());
+            assert(!l2.locked());
+        }
+    }
+    {
+        L2 l0;
+        L1 l1;
+        L2 l2;
+        try
+        {
+            (void)cuda::std::try_lock(l0, l1, l2);
+            assert(false);
+        }
+        catch (int)
+        {
+            assert(!l0.locked());
+            assert(!l1.locked());
+            assert(!l2.locked());
+        }
+    }
+    {
+        L1 l0;
+        L2 l1;
+        L2 l2;
+        assert(cuda::std::try_lock(l0, l1, l2) == 0);
+        assert(!l0.locked());
+        assert(!l1.locked());
+        assert(!l2.locked());
+    }
+    {
+        L0 l0;
+        L2 l1;
+        L1 l2;
+        try
+        {
+            (void)cuda::std::try_lock(l0, l1, l2);
+            assert(false);
+        }
+        catch (int)
+        {
+            assert(!l0.locked());
+            assert(!l1.locked());
+            assert(!l2.locked());
+        }
+    }
+    {
+        L1 l0;
+        L0 l1;
+        L2 l2;
+        assert(cuda::std::try_lock(l0, l1, l2) == 0);
+        assert(!l0.locked());
+        assert(!l1.locked());
+        assert(!l2.locked());
+    }
+    {
+        L1 l0;
+        L2 l1;
+        L0 l2;
+        assert(cuda::std::try_lock(l0, l1, l2) == 0);
+        assert(!l0.locked());
+        assert(!l1.locked());
+        assert(!l2.locked());
+    }
+    {
+        L2 l0;
+        L0 l1;
+        L1 l2;
+        try
+        {
+            (void)cuda::std::try_lock(l0, l1, l2);
+            assert(false);
+        }
+        catch (int)
+        {
+            assert(!l0.locked());
+            assert(!l1.locked());
+            assert(!l2.locked());
+        }
+    }
+    {
+        L2 l0;
+        L1 l1;
+        L0 l2;
+        try
+        {
+            (void)cuda::std::try_lock(l0, l1, l2);
+            assert(false);
+        }
+        catch (int)
+        {
+            assert(!l0.locked());
+            assert(!l1.locked());
+            assert(!l2.locked());
+        }
+    }
+#endif // TEST_HAS_NO_EXCEPTIONS
+    {
+        L0 l0;
+        L0 l1;
+        L0 l2;
+        L0 l3;
+        assert(cuda::std::try_lock(l0, l1, l2, l3) == -1);
+        assert(l0.locked());
+        assert(l1.locked());
+        assert(l2.locked());
+        assert(l3.locked());
+    }
+    {
+        L1 l0;
+        L0 l1;
+        L0 l2;
+        L0 l3;
+        assert(cuda::std::try_lock(l0, l1, l2, l3) == 0);
+        assert(!l0.locked());
+        assert(!l1.locked());
+        assert(!l2.locked());
+        assert(!l3.locked());
+    }
+    {
+        L0 l0;
+        L1 l1;
+        L0 l2;
+        L0 l3;
+        assert(cuda::std::try_lock(l0, l1, l2, l3) == 1);
+        assert(!l0.locked());
+        assert(!l1.locked());
+        assert(!l2.locked());
+        assert(!l3.locked());
+    }
+    {
+        L0 l0;
+        L0 l1;
+        L1 l2;
+        L0 l3;
+        assert(cuda::std::try_lock(l0, l1, l2, l3) == 2);
+        assert(!l0.locked());
+        assert(!l1.locked());
+        assert(!l2.locked());
+        assert(!l3.locked());
+    }
+    {
+        L0 l0;
+        L0 l1;
+        L0 l2;
+        L1 l3;
+        assert(cuda::std::try_lock(l0, l1, l2, l3) == 3);
+        assert(!l0.locked());
+        assert(!l1.locked());
+        assert(!l2.locked());
+        assert(!l3.locked());
+    }
+#endif // TEST_STD_VER >= 11
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/adopt_lock.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/adopt_lock.pass.cpp
new file mode 100644
index 0000000000..0210dff844
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/adopt_lock.pass.cpp
@@ -0,0 +1,105 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// template <class Mutex> class lock_guard;
+
+// lock_guard(mutex_type& m, adopt_lock_t);
+#include<cuda/std/mutex>
+#include<cuda/std/cassert>
+#include<cuda/std/chrono>
+
+#include "test_macros.h"
+
+#include "cuda_space_selector.h"
+#include "heterogeneous_thread_handler.h"
+
+typedef cuda::std::chrono::system_clock Clock;
+typedef Clock::time_point time_point;
+typedef Clock::duration duration;
+typedef cuda::std::chrono::milliseconds ms;
+typedef cuda::std::chrono::nanoseconds ns;
+
+#ifndef __CUDA_ARCH__
+template<class Mutex>
+__host__ void do_try_lock(Mutex* m) {
+  assert(m->try_lock() == false);
+}
+
+template<class Mutex,
+    template<typename, typename> typename Selector,
+    typename Initializer = constructor_initializer>
+__host__ void test_host() {
+    Selector<Mutex, Initializer> sel;
+    SHARED Mutex* m;
+    m = sel.construct();
+
+    {
+      m->lock();
+      cuda::std::lock_guard<Mutex> lg(*m, cuda::std::adopt_lock);
+      std::thread t(do_try_lock<Mutex>, m);
+      t.join();
+    }
+
+    m->lock();
+    m->unlock();
+}
+#endif
+
+template<class Mutex,
+    template<typename, typename> typename Selector,
+    typename Initializer = constructor_initializer>
+__device__ void test_device() {
+    Selector<Mutex, Initializer> sel;
+    SHARED Mutex* m;
+    m = sel.construct();
+
+    Selector<cuda::std::atomic_flag, Initializer> sel_flag;
+    SHARED cuda::std::atomic_flag* thread_ready;
+    thread_ready = sel_flag.construct(false);
+
+    if (threadIdx.x == 0) {
+      m->lock();
+    }
+    __syncthreads();
+
+    if (threadIdx.x == 0) {
+      cuda::std::lock_guard<Mutex> lg(*m, cuda::std::adopt_lock);
+      thread_ready->test_and_set(cuda::std::memory_order_relaxed);
+      thread_ready->notify_one();
+      thread_ready->wait(true, cuda::std::memory_order_relaxed);  // Waits until clear
+    } else {
+      thread_ready->wait(false, cuda::std::memory_order_relaxed); // Waits until test_and_set
+      assert(m->try_lock() == false);
+      thread_ready->clear(cuda::std::memory_order_relaxed);
+      thread_ready->notify_one();
+    }
+    __syncthreads();
+
+    m->lock();
+    m->unlock();
+}
+
+int main(int, char**)
+{
+#ifndef __CUDA_ARCH__
+    cuda_thread_count = 2;
+
+    test_host<cuda::std::mutex, local_memory_selector>();
+#else
+    test_device<cuda::std::mutex, shared_memory_selector>();
+    test_device<cuda::std::mutex, global_memory_selector>();
+#endif
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/assign.compile.fail.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/assign.compile.fail.cpp
new file mode 100644
index 0000000000..65d3451b0c
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/assign.compile.fail.cpp
@@ -0,0 +1,30 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: nvrtc
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// template <class Mutex> class lock_guard;
+
+// lock_guard& operator=(lock_guard const&) = delete;
+
+#include<cuda/std/mutex>
+
+int main(int, char**)
+{
+    cuda::std::mutex m0;
+    cuda::std::mutex m1;
+    cuda::std::lock_guard<cuda::std::mutex> lg0(m0);
+    cuda::std::lock_guard<cuda::std::mutex> lg(m1);
+    lg = lg0;
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/copy.compile.fail.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/copy.compile.fail.cpp
new file mode 100644
index 0000000000..2d0ce6c8ec
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/copy.compile.fail.cpp
@@ -0,0 +1,28 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: nvrtc
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// template <class Mutex> class lock_guard;
+
+// lock_guard(lock_guard const&) = delete;
+
+#include<cuda/std/mutex>
+
+int main(int, char**)
+{
+    cuda::std::mutex m;
+    cuda::std::lock_guard<cuda::std::mutex> lg0(m);
+    cuda::std::lock_guard<cuda::std::mutex> lg(lg0);
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/implicit_ctad.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/implicit_ctad.pass.cpp
new file mode 100644
index 0000000000..6dec0bcfef
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/implicit_ctad.pass.cpp
@@ -0,0 +1,33 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: c++98, c++03, c++11, c++14
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// lock_guard
+
+// Make sure that the implicitly-generated CTAD works.
+
+#include<cuda/std/mutex>
+
+#include "test_macros.h"
+
+int main(int, char**) {
+  cuda::std::mutex mutex;
+  {
+    cuda::std::lock_guard lock(mutex);
+    ASSERT_SAME_TYPE(decltype(lock), cuda::std::lock_guard<cuda::std::mutex>);
+  }
+
+  return 0;
+}
+
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/mutex.fail.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/mutex.fail.cpp
new file mode 100644
index 0000000000..9c9c99fc75
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/mutex.fail.cpp
@@ -0,0 +1,28 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: nvrtc
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// template <class Mutex> class lock_guard;
+
+// explicit lock_guard(mutex_type& m);
+
+#include<cuda/std/mutex>
+
+int main(int, char**)
+{
+    cuda::std::mutex m;
+    cuda::std::lock_guard<cuda::std::mutex> lg = m; // expected-error{{no viable conversion}}
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/mutex.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/mutex.pass.cpp
new file mode 100644
index 0000000000..4097478437
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/mutex.pass.cpp
@@ -0,0 +1,117 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// template <class Mutex> class lock_guard;
+
+// explicit lock_guard(mutex_type& m);
+
+// template<class _Mutex> lock_guard(lock_guard<_Mutex>)
+//     -> lock_guard<_Mutex>;  // C++17
+#include<cuda/std/mutex>
+#include<cuda/std/cassert>
+#include<cuda/std/chrono>
+
+#include "test_macros.h"
+
+#include "cuda_space_selector.h"
+#include "heterogeneous_thread_handler.h"
+
+typedef cuda::std::chrono::system_clock Clock;
+typedef Clock::time_point time_point;
+typedef Clock::duration duration;
+typedef cuda::std::chrono::milliseconds ms;
+typedef cuda::std::chrono::nanoseconds ns;
+
+template<class Mutex>
+__host__ __device__
+void check_ctad(Mutex* m)
+{
+#if TEST_STD_VER >= 17
+    cuda::std::lock_guard lg(*m);
+    static_assert((cuda::std::is_same<decltype(lg), cuda::std::lock_guard<Mutex>>::value), "" );
+    unused(lg);
+#endif
+}
+
+#ifndef __CUDA_ARCH__
+template<class Mutex>
+__host__ void do_try_lock(Mutex* m) {
+  assert(m->try_lock() == false);
+}
+
+template<class Mutex,
+    template<typename, typename> typename Selector,
+    typename Initializer = constructor_initializer>
+__host__ void test_host() {
+    Selector<Mutex, Initializer> sel;
+    SHARED Mutex* m;
+    m = sel.construct();
+
+    {
+      cuda::std::lock_guard<Mutex> lg(*m);
+      std::thread t(do_try_lock<Mutex>, m);
+      t.join();
+    }
+
+    m->lock();
+    m->unlock();
+
+    check_ctad(m);
+}
+#endif
+
+template<class Mutex,
+    template<typename, typename> typename Selector,
+    typename Initializer = constructor_initializer>
+__device__ void test_device() {
+    Selector<Mutex, Initializer> sel;
+    SHARED Mutex* m;
+    m = sel.construct();
+
+    Selector<cuda::std::atomic_flag, Initializer> sel_flag;
+    SHARED cuda::std::atomic_flag* thread_ready;
+    thread_ready = sel_flag.construct(false);
+
+    if (threadIdx.x == 0) {
+      cuda::std::lock_guard<Mutex> lg(*m);
+      thread_ready->test_and_set(cuda::std::memory_order_relaxed);
+      thread_ready->notify_one();
+      thread_ready->wait(true, cuda::std::memory_order_relaxed);
+    } else {
+      thread_ready->wait(false, cuda::std::memory_order_relaxed);
+      assert(m->try_lock() == false);
+      thread_ready->clear(cuda::std::memory_order_relaxed);
+      thread_ready->notify_one();
+    }
+    __syncthreads();
+
+    m->lock();
+    m->unlock();
+
+    check_ctad(m);
+}
+
+int main(int, char**)
+{
+#ifndef __CUDA_ARCH__
+    cuda_thread_count = 2;
+
+    test_host<cuda::std::mutex, local_memory_selector>();
+#else
+    test_device<cuda::std::mutex, shared_memory_selector>();
+    test_device<cuda::std::mutex, global_memory_selector>();
+#endif
+
+    return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/types.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/types.pass.cpp
new file mode 100644
index 0000000000..cf8b7b92d5
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/types.pass.cpp
@@ -0,0 +1,34 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// template <class Mutex>
+// class lock_guard
+// {
+// public:
+//     typedef Mutex mutex_type;
+//     ...
+// };
+
+#include<cuda/std/mutex>
+#include<cuda/std/type_traits>
+
+#include "test_macros.h"
+
+int main(int, char**)
+{
+    static_assert((cuda::std::is_same<cuda::std::lock_guard<cuda::std::mutex>::mutex_type,
+                   cuda::std::mutex>::value), "");
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.scoped/adopt_lock.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.scoped/adopt_lock.pass.cpp
new file mode 100644
index 0000000000..997bccfcc2
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.scoped/adopt_lock.pass.cpp
@@ -0,0 +1,75 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: c++03, c++11, c++14
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// template <class ...Mutex> class scoped_lock;
+
+// scoped_lock(adopt_lock_t, Mutex&...);
+
+#include<cuda/std/mutex>
+#include<cuda/std/cassert>
+#include "test_macros.h"
+
+struct TestMutex {
+    bool locked = false;
+    TestMutex() = default;
+
+    __host__ __device__ void lock() { assert(!locked); locked = true; }
+    __host__ __device__ bool try_lock() { if (locked) return false; locked = true; return true; }
+    __host__ __device__ void unlock() { assert(locked); locked = false; }
+
+    TestMutex(TestMutex const&) = delete;
+    TestMutex& operator=(TestMutex const&) = delete;
+};
+
+int main(int, char**)
+{
+    {
+        using LG = cuda::std::scoped_lock<>;
+        LG lg(cuda::std::adopt_lock);
+    }
+    {
+        TestMutex m1;
+        using LG = cuda::std::scoped_lock<TestMutex>;
+        m1.lock();
+        {
+            LG lg(cuda::std::adopt_lock, m1);
+            assert(m1.locked);
+        }
+        assert(!m1.locked);
+    }
+    {
+        TestMutex m1, m2;
+        using LG = cuda::std::scoped_lock<TestMutex, TestMutex>;
+        m1.lock(); m2.lock();
+        {
+            LG lg(cuda::std::adopt_lock, m1, m2);
+            assert(m1.locked && m2.locked);
+        }
+        assert(!m1.locked && !m2.locked);
+    }
+    {
+        TestMutex m1, m2, m3;
+        using LG = cuda::std::scoped_lock<TestMutex, TestMutex, TestMutex>;
+        m1.lock(); m2.lock(); m3.lock();
+        {
+            LG lg(cuda::std::adopt_lock, m1, m2, m3);
+            assert(m1.locked && m2.locked && m3.locked);
+        }
+        assert(!m1.locked && !m2.locked && !m3.locked);
+    }
+
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.scoped/assign.fail.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.scoped/assign.fail.cpp
new file mode 100644
index 0000000000..5cd0d79ad8
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.scoped/assign.fail.cpp
@@ -0,0 +1,54 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: c++03, c++11, c++14
+// UNSUPPORTED: nvrtc
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// template <class ...Mutex> class scoped_lock;
+
+// scoped_lock& operator=(scoped_lock const&) = delete;
+
+#include<cuda/std/mutex>
+#include "test_macros.h"
+
+int main(int, char**)
+{
+    using M = cuda::std::mutex;
+    M m0, m1, m2;
+    M om0, om1, om2;
+    {
+        using LG = cuda::std::scoped_lock<>;
+        LG lg1, lg2;
+        lg1 = lg2; // expected-error{{overload resolution selected deleted operator '='}}
+    }
+    {
+        using LG = cuda::std::scoped_lock<M>;
+        LG lg1(m0);
+        LG lg2(om0);
+        lg1 = lg2; // expected-error{{overload resolution selected deleted operator '='}}
+    }
+    {
+        using LG = cuda::std::scoped_lock<M, M>;
+        LG lg1(m0, m1);
+        LG lg2(om0, om1);
+        lg1 = lg2; // expected-error{{overload resolution selected deleted operator '='}}
+    }
+    {
+        using LG = cuda::std::scoped_lock<M, M, M>;
+        LG lg1(m0, m1, m2);
+        LG lg2(om0, om1, om2);
+        lg1 = lg2; // expected-error{{overload resolution selected deleted operator '='}}
+    }
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.scoped/copy.fail.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.scoped/copy.fail.cpp
new file mode 100644
index 0000000000..7d75ccb2e6
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.scoped/copy.fail.cpp
@@ -0,0 +1,50 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: c++03, c++11, c++14
+// UNSUPPORTED: nvrtc
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// template <class ...Mutex> class scoped_lock;
+
+// scoped_lock(scoped_lock const&) = delete;
+
+#include<cuda/std/mutex>
+#include "test_macros.h"
+
+int main(int, char**)
+{
+    using M = cuda::std::mutex;
+    M m0, m1, m2;
+    {
+        using LG = cuda::std::scoped_lock<>;
+        const LG Orig;
+        LG Copy(Orig); // expected-error{{call to deleted constructor of 'LG'}}
+    }
+    {
+        using LG = cuda::std::scoped_lock<M>;
+        const LG Orig(m0);
+        LG Copy(Orig); // expected-error{{call to deleted constructor of 'LG'}}
+    }
+    {
+        using LG = cuda::std::scoped_lock<M, M>;
+        const LG Orig(m0, m1);
+        LG Copy(Orig); // expected-error{{call to deleted constructor of 'LG'}}
+    }
+    {
+        using LG = cuda::std::scoped_lock<M, M, M>;
+        const LG Orig(m0, m1, m2);
+        LG Copy(Orig); // expected-error{{call to deleted constructor of 'LG'}}
+    }
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.scoped/implicit_ctad.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.scoped/implicit_ctad.pass.cpp
new file mode 100644
index 0000000000..c7a6d80ab9
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.scoped/implicit_ctad.pass.cpp
@@ -0,0 +1,46 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: c++98, c++03, c++11, c++14
+// UNSUPPORTED: gcc-7
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// scoped_lock
+
+// Make sure that the implicitly-generated CTAD works.
+
+#include<cuda/std/mutex>
+
+#include "test_macros.h"
+
+int main(int, char**) {
+  cuda::std::mutex m1;
+  {
+    cuda::std::scoped_lock lock(m1);
+    ASSERT_SAME_TYPE(decltype(lock), cuda::std::scoped_lock<cuda::std::mutex>);
+  }
+#if 0 // No recursive mutex
+  cuda::std::recursive_mutex m2;
+  cuda::std::recursive_timed_mutex m3;
+  {
+    cuda::std::scoped_lock lock(m1, m2);
+    ASSERT_SAME_TYPE(decltype(lock), cuda::std::scoped_lock<cuda::std::mutex, cuda::std::recursive_mutex>);
+  }
+  {
+    cuda::std::scoped_lock lock(m1, m2, m3);
+    ASSERT_SAME_TYPE(decltype(lock), cuda::std::scoped_lock<cuda::std::mutex, cuda::std::recursive_mutex, cuda::std::recursive_timed_mutex>);
+  }
+#endif
+
+  return 0;
+}
+
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.scoped/mutex.fail.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.scoped/mutex.fail.cpp
new file mode 100644
index 0000000000..0a9c3465d4
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.scoped/mutex.fail.cpp
@@ -0,0 +1,57 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: c++03, c++11, c++14
+// UNSUPPORTED: nvrtc
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// template <class ...Mutex> class scoped_lock;
+
+// explicit scoped_lock(Mutex&...);
+
+#include<cuda/std/mutex>
+#include "test_macros.h"
+
+template <class LG>
+__host__ __device__ void test_conversion(LG) {}
+
+int main(int, char**)
+{
+    using M = cuda::std::mutex;
+    M m0, m1, m2;
+    M n0, n1, n2;
+    {
+        using LG = cuda::std::scoped_lock<>;
+        LG lg = {}; // expected-error{{chosen constructor is explicit in copy-initialization}}
+        test_conversion<LG>({}); // expected-error{{no matching function for call}}
+        ((void)lg);
+    }
+    {
+        using LG = cuda::std::scoped_lock<M>;
+        LG lg = {m0}; // expected-error{{chosen constructor is explicit in copy-initialization}}
+        test_conversion<LG>({n0}); // expected-error{{no matching function for call}}
+        ((void)lg);
+    }
+    {
+        using LG = cuda::std::scoped_lock<M, M>;
+        LG lg = {m0, m1}; // expected-error{{chosen constructor is explicit in copy-initialization}}
+        test_conversion<LG>({n0, n1}); // expected-error{{no matching function for call}}
+        ((void)lg);
+    }
+    {
+        using LG = cuda::std::scoped_lock<M, M, M>;
+        LG lg = {m0, m1, m2}; // expected-error{{chosen constructor is explicit in copy-initialization}}
+        test_conversion<LG>({n0, n1, n2}); // expected-error{{no matching function for call}}
+    }
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.scoped/mutex.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.scoped/mutex.pass.cpp
new file mode 100644
index 0000000000..9f42f220e0
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.scoped/mutex.pass.cpp
@@ -0,0 +1,163 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: c++03, c++11, c++14
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// template <class ...Mutex> class scoped_lock;
+
+// explicit scoped_lock(mutex_type& m);
+
+#include<cuda/std/mutex>
+#include<cuda/std/cassert>
+#include "test_macros.h"
+
+struct TestMutex {
+    bool locked = false;
+    TestMutex() = default;
+    __host__ __device__ ~TestMutex() { assert(!locked); }
+
+    __host__ __device__ void lock() { assert(!locked); locked = true; }
+    __host__ __device__ bool try_lock() { if (locked) return false; locked = true; return true; }
+    __host__ __device__ void unlock() { assert(locked); locked = false; }
+
+    TestMutex(TestMutex const&) = delete;
+    TestMutex& operator=(TestMutex const&) = delete;
+};
+
+#if !defined(TEST_HAS_NO_EXCEPTIONS)
+struct TestMutexThrows {
+    bool locked = false;
+    bool throws_on_lock = false;
+
+    __host__ __device__ TestMutexThrows() = default;
+    __host__ __device__ ~TestMutexThrows() { assert(!locked); }
+
+    __host__ __device__ void lock() {
+        assert(!locked);
+        if (throws_on_lock) {
+            throw 42;
+        }
+        locked = true;
+    }
+
+    __host__ __device__ bool try_lock() {
+        if (locked) return false;
+        lock();
+        return true;
+    }
+
+    __host__ __device__ void unlock() { assert(locked); locked = false; }
+
+    TestMutexThrows(TestMutexThrows const&) = delete;
+    TestMutexThrows& operator=(TestMutexThrows const&) = delete;
+};
+#endif // !defined(TEST_HAS_NO_EXCEPTIONS)
+
+int main(int, char**)
+{
+    {
+        using LG = cuda::std::scoped_lock<>;
+        LG lg;
+        unused(lg);
+    }
+    {
+        using LG = cuda::std::scoped_lock<TestMutex>;
+        TestMutex m1;
+        {
+            LG lg(m1);
+            assert(m1.locked);
+        }
+        assert(!m1.locked);
+    }
+    {
+        using LG = cuda::std::scoped_lock<TestMutex, TestMutex>;
+        TestMutex m1, m2;
+        {
+            LG lg(m1, m2);
+            assert(m1.locked && m2.locked);
+        }
+        assert(!m1.locked && !m2.locked);
+    }
+    {
+        using LG = cuda::std::scoped_lock<TestMutex, TestMutex, TestMutex>;
+        TestMutex m1, m2, m3;
+        {
+            LG lg(m1, m2, m3);
+            assert(m1.locked && m2.locked && m3.locked);
+        }
+        assert(!m1.locked && !m2.locked && !m3.locked);
+    }
+#if !defined(TEST_HAS_NO_EXCEPTIONS)
+    {
+        using MT = TestMutexThrows;
+        using LG = cuda::std::scoped_lock<MT>;
+        MT m1;
+        m1.throws_on_lock = true;
+        try {
+            LG lg(m1);
+            assert(false);
+        } catch (int) {}
+        assert(!m1.locked);
+    }
+    {
+        using MT = TestMutexThrows;
+        using LG = cuda::std::scoped_lock<MT, MT>;
+        MT m1, m2;
+        m1.throws_on_lock = true;
+        try {
+            LG lg(m1, m2);
+            assert(false);
+        } catch (int) {}
+        assert(!m1.locked && !m2.locked);
+    }
+    {
+        using MT = TestMutexThrows;
+        using LG = cuda::std::scoped_lock<MT, MT, MT>;
+        MT m1, m2, m3;
+        m2.throws_on_lock = true;
+        try {
+            LG lg(m1, m2, m3);
+            assert(false);
+        } catch (int) {}
+        assert(!m1.locked && !m2.locked && !m3.locked);
+    }
+#endif
+
+#if TEST_STD_VER >= 17
+    {
+    TestMutex m1, m2, m3;
+        {
+        cuda::std::scoped_lock sl{};
+        static_assert((cuda::std::is_same<decltype(sl), cuda::std::scoped_lock<>>::value), "" );
+        unused(sl);
+        }
+        {
+        cuda::std::scoped_lock sl{m1};
+        static_assert((cuda::std::is_same<decltype(sl), cuda::std::scoped_lock<decltype(m1)>>::value), "" );
+        unused(sl);
+        }
+        {
+        cuda::std::scoped_lock sl{m1, m2};
+        static_assert((cuda::std::is_same<decltype(sl), cuda::std::scoped_lock<decltype(m1), decltype(m2)>>::value), "" );
+        unused(sl);
+        }
+        {
+        cuda::std::scoped_lock sl{m1, m2, m3};
+        static_assert((cuda::std::is_same<decltype(sl), cuda::std::scoped_lock<decltype(m1), decltype(m2), decltype(m3)>>::value), "" );
+        unused(sl);
+        }
+    }
+#endif
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.scoped/types.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.scoped/types.pass.cpp
new file mode 100644
index 0000000000..21c78444fa
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.scoped/types.pass.cpp
@@ -0,0 +1,85 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: c++03, c++11, c++14
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// template <class... MutexTypes>
+// class scoped_lock
+// {
+// public:
+//     typedef Mutex mutex_type;  // Only if sizeof...(MutexTypes) == 1
+//     ...
+// };
+
+#include<cuda/std/mutex>
+#include<cuda/std/type_traits>
+#include "test_macros.h"
+
+struct NAT {};
+
+template <class LG>
+__host__ __device__ auto test_typedef(int) -> typename LG::mutex_type;
+
+template <class LG>
+__host__ __device__ auto test_typedef(...) -> NAT;
+
+template <class LG>
+__host__ __device__ constexpr bool has_mutex_type() {
+    return !cuda::std::is_same<decltype(test_typedef<LG>(0)), NAT>::value;
+}
+
+int main(int, char**)
+{
+    {
+        using T = cuda::std::scoped_lock<>;
+        static_assert(!has_mutex_type<T>(), "");
+    }
+    {
+        using M1 = cuda::std::mutex;
+        using T = cuda::std::scoped_lock<M1>;
+        static_assert(cuda::std::is_same<T::mutex_type, M1>::value, "");
+    }
+#if 0 // No recursive mutex
+    {
+        using M1 = cuda::std::recursive_mutex;
+        using T = cuda::std::scoped_lock<M1>;
+        static_assert(cuda::std::is_same<T::mutex_type, M1>::value, "");
+    }
+    {
+        using M1 = cuda::std::mutex;
+        using M2 = cuda::std::recursive_mutex;
+        using T = cuda::std::scoped_lock<M1, M2>;
+        static_assert(!has_mutex_type<T>(), "");
+    }
+    {
+        using M1 = cuda::std::mutex;
+        using M2 = cuda::std::recursive_mutex;
+        using T = cuda::std::scoped_lock<M1, M1, M2>;
+        static_assert(!has_mutex_type<T>(), "");
+    }
+#endif
+    {
+        using M1 = cuda::std::mutex;
+        using T = cuda::std::scoped_lock<M1, M1>;
+        static_assert(!has_mutex_type<T>(), "");
+    }
+#if 0 // No recursive mutex
+    {
+        using M1 = cuda::std::recursive_mutex;
+        using T = cuda::std::scoped_lock<M1, M1, M1>;
+        static_assert(!has_mutex_type<T>(), "");
+    }
+#endif
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/implicit_ctad.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/implicit_ctad.pass.cpp
new file mode 100644
index 0000000000..f67a32fb76
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/implicit_ctad.pass.cpp
@@ -0,0 +1,33 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: c++98, c++03, c++11, c++14
+// UNSUPPORTED: gcc-7
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// unique_lock
+
+// Make sure that the implicitly-generated CTAD works.
+
+#include<cuda/std/mutex>
+
+#include "test_macros.h"
+
+int main(int, char**) {
+  cuda::std::mutex mutex;
+  {
+    cuda::std::unique_lock lock(mutex);
+    ASSERT_SAME_TYPE(decltype(lock), cuda::std::unique_lock<cuda::std::mutex>);
+  }
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_assign.compile.fail.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_assign.compile.fail.cpp
new file mode 100644
index 0000000000..46366cc73c
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_assign.compile.fail.cpp
@@ -0,0 +1,38 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: nvrtc
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// template <class Mutex> class unique_lock;
+
+// unique_lock& operator=(unique_lock const&) = delete;
+
+#include<cuda/std/mutex>
+#include<cuda/std/cassert>
+
+int main(int, char**)
+{
+    {
+    typedef cuda::std::mutex M;
+    M m0;
+    M m1;
+    cuda::std::unique_lock<M> lk0(m0);
+    cuda::std::unique_lock<M> lk1(m1);
+    lk1 = lk0;
+    assert(lk1.mutex() == &m0);
+    assert(lk1.owns_lock() == true);
+    assert(lk0.mutex() == nullptr);
+    assert(lk0.owns_lock() == false);
+    }
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_ctor.compile.fail.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_ctor.compile.fail.cpp
new file mode 100644
index 0000000000..8e11c825d7
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_ctor.compile.fail.cpp
@@ -0,0 +1,36 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: nvrtc
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// template <class Mutex> class unique_lock;
+
+// unique_lock(unique_lock const&) = delete;
+
+#include<cuda/std/mutex>
+#include<cuda/std/cassert>
+
+int main(int, char**)
+{
+    {
+    typedef cuda::std::mutex M;
+    M m;
+    cuda::std::unique_lock<M> lk0(m);
+    cuda::std::unique_lock<M> lk = lk0;
+    assert(lk.mutex() == &m);
+    assert(lk.owns_lock() == true);
+    assert(lk0.mutex() == nullptr);
+    assert(lk0.owns_lock() == false);
+    }
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/default.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/default.pass.cpp
new file mode 100644
index 0000000000..1e0110050b
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/default.pass.cpp
@@ -0,0 +1,31 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// template <class Mutex> class unique_lock;
+
+// unique_lock();
+
+#include<cuda/std/mutex>
+#include<cuda/std/cassert>
+
+#include "test_macros.h"
+
+int main(int, char**)
+{
+    cuda::std::unique_lock<cuda::std::mutex> ul;
+    assert(!ul.owns_lock());
+    assert(ul.mutex() == nullptr);
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/move_assign.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/move_assign.pass.cpp
new file mode 100644
index 0000000000..0201bc2da6
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/move_assign.pass.cpp
@@ -0,0 +1,53 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: no-threads, c++03
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// template <class Mutex> class unique_lock;
+
+// unique_lock& operator=(unique_lock&& u);
+
+#include<cuda/std/mutex>
+#include<cuda/std/cassert>
+#include "nasty_containers.h"
+
+#include "test_macros.h"
+
+int main(int, char**)
+{
+    {
+    typedef cuda::std::mutex M;
+    M m0;
+    M m1;
+    cuda::std::unique_lock<M> lk0(m0);
+    cuda::std::unique_lock<M> lk1(m1);
+    lk1 = cuda::std::move(lk0);
+    assert(lk1.mutex() == cuda::std::addressof(m0));
+    assert(lk1.owns_lock() == true);
+    assert(lk0.mutex() == nullptr);
+    assert(lk0.owns_lock() == false);
+    }
+    {
+    typedef nasty_mutex M;
+    M m0;
+    M m1;
+    cuda::std::unique_lock<M> lk0(m0);
+    cuda::std::unique_lock<M> lk1(m1);
+    lk1 = cuda::std::move(lk0);
+    assert(lk1.mutex() == cuda::std::addressof(m0));
+    assert(lk1.owns_lock() == true);
+    assert(lk0.mutex() == nullptr);
+    assert(lk0.owns_lock() == false);
+    }
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/move_ctor.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/move_ctor.pass.cpp
new file mode 100644
index 0000000000..4d14155845
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/move_ctor.pass.cpp
@@ -0,0 +1,49 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: no-threads, c++03
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// template <class Mutex> class unique_lock;
+
+// unique_lock(unique_lock&& u);
+
+#include<cuda/std/mutex>
+#include<cuda/std/cassert>
+#include "nasty_containers.h"
+
+#include "test_macros.h"
+
+int main(int, char**)
+{
+    {
+    typedef cuda::std::mutex M;
+    M m;
+    cuda::std::unique_lock<M> lk0(m);
+    cuda::std::unique_lock<M> lk = cuda::std::move(lk0);
+    assert(lk.mutex() == cuda::std::addressof(m));
+    assert(lk.owns_lock() == true);
+    assert(lk0.mutex() == nullptr);
+    assert(lk0.owns_lock() == false);
+    }
+    {
+    typedef nasty_mutex M;
+    M m;
+    cuda::std::unique_lock<M> lk0(m);
+    cuda::std::unique_lock<M> lk = cuda::std::move(lk0);
+    assert(lk.mutex() == cuda::std::addressof(m));
+    assert(lk.owns_lock() == true);
+    assert(lk0.mutex() == nullptr);
+    assert(lk0.owns_lock() == false);
+    }
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex.pass.cpp
new file mode 100644
index 0000000000..46b5d4497d
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex.pass.cpp
@@ -0,0 +1,94 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// FLAKY_TEST.
+
+// <mutex>
+
+// template <class Mutex> class unique_lock;
+
+// explicit unique_lock(mutex_type& m);
+
+// template<class _Mutex> unique_lock(unique_lock<_Mutex>)
+//     -> unique_lock<_Mutex>;  // C++17
+
+#include<cuda/std/mutex>
+#include<cuda/std/cassert>
+#include<cuda/std/chrono>
+
+#include "test_macros.h"
+
+#include "cuda_space_selector.h"
+#include "heterogeneous_thread_handler.h"
+
+typedef cuda::std::chrono::system_clock Clock;
+typedef Clock::time_point time_point;
+typedef Clock::duration duration;
+typedef cuda::std::chrono::milliseconds ms;
+typedef cuda::std::chrono::nanoseconds ns;
+
+template<class Mutex>
+__host__ __device__ void f(Mutex* m)
+{
+    time_point t0 = Clock::now();
+    time_point t1;
+    {
+        cuda::std::unique_lock<Mutex> ul(*m);
+        t1 = Clock::now();
+    }
+    assert(t1 - t0 >= ms(250));
+}
+
+template<class Mutex>
+__host__ __device__
+void check_ctad(Mutex* m)
+{
+#if TEST_STD_VER >= 17
+    cuda::std::unique_lock ul(*m);
+    static_assert((cuda::std::is_same<decltype(ul), cuda::std::unique_lock<Mutex>>::value), "" );
+    unused(ul);
+#endif
+}
+
+template<class Mutex,
+    template<typename, typename> typename Selector,
+    typename Initializer = constructor_initializer>
+__host__ __device__ void test() {
+    Selector<Mutex, Initializer> sel;
+    SHARED Mutex* m;
+    m = sel.construct();
+
+    heterogeneous_thread_handler handler;
+    handler.run_on_first_thread(&Mutex::lock, m);
+    handler.syncthreads();
+
+    handler.run_on_second_thread(f<Mutex>, m);
+    handler.sleep_first_thread(ms(250));
+    handler.run_on_first_thread(&Mutex::unlock, m);
+    handler.join_test_thread();
+
+    handler.syncthreads();
+    handler.run_on_first_thread(check_ctad<Mutex>, m);
+}
+
+int main(int, char**)
+{
+#ifndef __CUDA_ARCH__
+    cuda_thread_count = 2;
+
+    test<cuda::std::mutex, local_memory_selector>();
+#else
+    test<cuda::std::mutex, shared_memory_selector>();
+    test<cuda::std::mutex, global_memory_selector>();
+#endif
+    return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_adopt_lock.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_adopt_lock.pass.cpp
new file mode 100644
index 0000000000..b83d4a2159
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_adopt_lock.pass.cpp
@@ -0,0 +1,48 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// template <class Mutex> class unique_lock;
+
+// unique_lock(mutex_type& m, adopt_lock_t);
+
+#include<cuda/std/mutex>
+#include<cuda/std/cassert>
+#include "nasty_containers.h"
+
+#include "test_macros.h"
+
+__host__ __device__ void test() {
+    {
+    typedef cuda::std::mutex M;
+    M m;
+    m.lock();
+    cuda::std::unique_lock<M> lk(m, cuda::std::adopt_lock);
+    assert(lk.mutex() == cuda::std::addressof(m));
+    assert(lk.owns_lock() == true);
+    }
+    {
+    typedef nasty_mutex M;
+    M m;
+    m.lock();
+    cuda::std::unique_lock<M> lk(m, cuda::std::adopt_lock);
+    assert(lk.mutex() == cuda::std::addressof(m));
+    assert(lk.owns_lock() == true);
+    }
+}
+
+int main(int, char**)
+{
+  test();
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_defer_lock.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_defer_lock.pass.cpp
new file mode 100644
index 0000000000..37ac29e6e6
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_defer_lock.pass.cpp
@@ -0,0 +1,43 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// template <class Mutex> class unique_lock;
+
+// unique_lock(mutex_type& m, defer_lock_t);
+
+#include<cuda/std/mutex>
+#include<cuda/std/cassert>
+#include "nasty_containers.h"
+
+#include "test_macros.h"
+
+int main(int, char**)
+{
+    {
+    typedef cuda::std::mutex M;
+    M m;
+    cuda::std::unique_lock<M> lk(m, cuda::std::defer_lock);
+    assert(lk.mutex() == cuda::std::addressof(m));
+    assert(lk.owns_lock() == false);
+    }
+    {
+    typedef nasty_mutex M;
+    M m;
+    cuda::std::unique_lock<M> lk(m, cuda::std::defer_lock);
+    assert(lk.mutex() == cuda::std::addressof(m));
+    assert(lk.owns_lock() == false);
+    }
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_duration.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_duration.pass.cpp
new file mode 100644
index 0000000000..d59ca64440
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_duration.pass.cpp
@@ -0,0 +1,102 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// FLAKY_TEST.
+
+// <mutex>
+
+// class timed_mutex;
+
+// template <class Rep, class Period>
+//   unique_lock(mutex_type& m, const chrono::duration<Rep, Period>& rel_time);
+
+#include<cuda/std/mutex>
+#include<cuda/std/cassert>
+#include<cuda/std/chrono>
+
+#include "test_macros.h"
+
+#include "cuda_space_selector.h"
+#include "heterogeneous_thread_handler.h"
+
+typedef cuda::std::chrono::system_clock Clock;
+typedef Clock::time_point time_point;
+typedef Clock::duration duration;
+typedef cuda::std::chrono::milliseconds ms;
+typedef cuda::std::chrono::nanoseconds ns;
+
+template<class Mutex>
+__host__ __device__ void f1(Mutex* m)
+{
+    time_point t0 = Clock::now();
+    cuda::std::unique_lock<Mutex> lk(*m, ms(500));
+    assert(lk.owns_lock() == true);
+    time_point t1 = Clock::now();
+    assert(t1 - t0 >= ms(250));
+}
+
+template<class Mutex>
+__host__ __device__ void f2(Mutex* m)
+{
+    time_point t0 = Clock::now();
+    cuda::std::unique_lock<Mutex> lk(*m, ms(200));
+    assert(lk.owns_lock() == false);
+    time_point t1 = Clock::now();
+    assert(t1 - t0 >= ms(200));
+}
+
+template<class Mutex,
+    template<typename, typename> typename Selector,
+    typename Initializer = constructor_initializer>
+__host__ __device__ void test() {
+    Selector<Mutex, Initializer> sel;
+    SHARED Mutex* m;
+    m = sel.construct();
+
+    heterogeneous_thread_handler handler;
+    {
+      handler.run_on_first_thread(&Mutex::lock, m);
+      handler.syncthreads();
+
+      handler.run_on_second_thread(f1<Mutex>, m);
+      handler.sleep_first_thread(ms(250));
+      handler.run_on_first_thread(&Mutex::unlock, m);
+      handler.join_test_thread();
+    }
+
+    handler.syncthreads();
+
+    {
+      handler.run_on_first_thread(&Mutex::lock, m);
+      handler.syncthreads();
+
+      handler.run_on_second_thread(f2<Mutex>, m);
+      handler.sleep_first_thread(ms(250));
+      handler.run_on_first_thread(&Mutex::unlock, m);
+      handler.join_test_thread();
+    }
+
+}
+
+int main(int, char**)
+{
+#ifndef __CUDA_ARCH__
+    cuda_thread_count = 2;
+
+    test<cuda::std::timed_mutex, local_memory_selector>();
+#else
+    test<cuda::std::timed_mutex, shared_memory_selector>();
+    test<cuda::std::timed_mutex, global_memory_selector>();
+#endif
+
+    return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_time_point.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_time_point.pass.cpp
new file mode 100644
index 0000000000..15ed054242
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_time_point.pass.cpp
@@ -0,0 +1,100 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// FLAKY_TEST.
+
+// <mutex>
+
+// class timed_mutex;
+
+// template <class Clock, class Duration>
+//   unique_lock(mutex_type& m, const chrono::time_point<Clock, Duration>& abs_time);
+
+#include<cuda/std/mutex>
+#include<cuda/std/cassert>
+#include<cuda/std/chrono>
+
+#include "test_macros.h"
+
+#include "cuda_space_selector.h"
+#include "heterogeneous_thread_handler.h"
+
+typedef cuda::std::chrono::high_resolution_clock Clock;
+typedef Clock::time_point time_point;
+typedef Clock::duration duration;
+typedef cuda::std::chrono::milliseconds ms;
+typedef cuda::std::chrono::nanoseconds ns;
+
+template<class Mutex>
+__host__ __device__ void f1(Mutex* m)
+{
+    time_point t0 = Clock::now();
+    cuda::std::unique_lock<Mutex> lk(*m, Clock::now() + ms(500));
+    assert(lk.owns_lock() == true);
+    time_point t1 = Clock::now();
+    assert(t1 - t0 >= ms(250));
+}
+
+template<class Mutex>
+__host__ __device__ void f2(Mutex* m)
+{
+    time_point t0 = Clock::now();
+    cuda::std::unique_lock<Mutex> lk(*m, Clock::now() + ms(200));
+    assert(lk.owns_lock() == false);
+    time_point t1 = Clock::now();
+    assert(t1 - t0 >= ms(200));
+}
+
+template<class Mutex,
+    template<typename, typename> typename Selector,
+    typename Initializer = constructor_initializer>
+__host__ __device__ void test() {
+    Selector<Mutex, Initializer> sel;
+    SHARED Mutex* m;
+    m = sel.construct();
+
+    heterogeneous_thread_handler handler;
+    {
+      handler.run_on_first_thread(&Mutex::lock, m);
+      handler.syncthreads();
+
+      handler.run_on_second_thread(f1<Mutex>, m);
+      handler.sleep_first_thread(ms(250));
+      handler.run_on_first_thread(&Mutex::unlock, m);
+      handler.join_test_thread();
+    }
+
+    handler.syncthreads();
+
+    {
+      handler.run_on_first_thread(&Mutex::lock, m);
+      handler.syncthreads();
+
+      handler.run_on_second_thread(f2<Mutex>, m);
+      handler.sleep_first_thread(ms(250));
+      handler.run_on_first_thread(&Mutex::unlock, m);
+      handler.join_test_thread();
+    }
+}
+int main(int, char**)
+{
+#ifndef __CUDA_ARCH__
+    cuda_thread_count = 2;
+
+    test<cuda::std::timed_mutex, local_memory_selector>();
+#else
+    test<cuda::std::timed_mutex, shared_memory_selector>();
+    test<cuda::std::timed_mutex, global_memory_selector>();
+#endif
+
+    return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_try_to_lock.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_try_to_lock.pass.cpp
new file mode 100644
index 0000000000..fb7de67a32
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_try_to_lock.pass.cpp
@@ -0,0 +1,92 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// FLAKY_TEST.
+
+// <mutex>
+
+// template <class Mutex> class unique_lock;
+
+// unique_lock(mutex_type& m, try_to_lock_t);
+
+#include<cuda/std/mutex>
+#include<cuda/std/cassert>
+#include<cuda/std/chrono>
+
+#include "test_macros.h"
+
+#include "cuda_space_selector.h"
+#include "heterogeneous_thread_handler.h"
+
+typedef cuda::std::chrono::system_clock Clock;
+typedef Clock::time_point time_point;
+typedef Clock::duration duration;
+typedef cuda::std::chrono::milliseconds ms;
+typedef cuda::std::chrono::nanoseconds ns;
+
+template<class Mutex>
+__host__ __device__ void f(Mutex* m)
+{
+    time_point t0 = Clock::now();
+    {
+        cuda::std::unique_lock<Mutex> lk(*m, cuda::std::try_to_lock);
+        assert(lk.owns_lock() == false);
+    }
+    {
+        cuda::std::unique_lock<Mutex> lk(*m, cuda::std::try_to_lock);
+        assert(lk.owns_lock() == false);
+    }
+    {
+        cuda::std::unique_lock<Mutex> lk(*m, cuda::std::try_to_lock);
+        assert(lk.owns_lock() == false);
+    }
+    while (true)
+    {
+        cuda::std::unique_lock<Mutex> lk(*m, cuda::std::try_to_lock);
+        if (lk.owns_lock())
+            break;
+    }
+    time_point t1 = Clock::now();
+    assert(t1 - t0 >= ms(250));
+}
+
+template<class Mutex,
+    template<typename, typename> typename Selector,
+    typename Initializer = constructor_initializer>
+__host__ __device__ void test() {
+    Selector<Mutex, Initializer> sel;
+    SHARED Mutex* m;
+    m = sel.construct();
+
+    heterogeneous_thread_handler handler;
+    handler.run_on_first_thread(&Mutex::lock, m);
+    handler.syncthreads();
+
+    handler.run_on_second_thread(f<Mutex>, m);
+    handler.sleep_first_thread(ms(250));
+    handler.run_on_first_thread(&Mutex::unlock, m);
+    handler.join_test_thread();
+
+}
+int main(int, char**)
+{
+#ifndef __CUDA_ARCH__
+    cuda_thread_count = 2;
+
+    test<cuda::std::mutex, local_memory_selector>();
+#else
+    test<cuda::std::mutex, shared_memory_selector>();
+    test<cuda::std::mutex, global_memory_selector>();
+#endif
+
+    return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/lock.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/lock.pass.cpp
new file mode 100644
index 0000000000..3e8060624e
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/lock.pass.cpp
@@ -0,0 +1,101 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// FLAKY_TEST.
+
+// <mutex>
+
+// template <class Mutex> class unique_lock;
+
+// void lock();
+
+#include<cuda/std/mutex>
+#include<cuda/std/cassert>
+#include<cuda/std/chrono>
+
+#include "test_macros.h"
+
+#include "cuda_space_selector.h"
+#include "heterogeneous_thread_handler.h"
+
+typedef cuda::std::chrono::system_clock Clock;
+typedef Clock::time_point time_point;
+typedef Clock::duration duration;
+typedef cuda::std::chrono::milliseconds ms;
+typedef cuda::std::chrono::nanoseconds ns;
+
+template<class Mutex>
+__host__ __device__ void f(Mutex* m)
+{
+    cuda::std::unique_lock<Mutex> lk(*m, cuda::std::defer_lock);
+    time_point t0 = Clock::now();
+    lk.lock();
+    time_point t1 = Clock::now();
+    assert(lk.owns_lock() == true);
+    assert(t1 - t0 >= ms(250));
+#ifndef TEST_HAS_NO_EXCEPTIONS
+    try
+    {
+        lk.lock();
+        assert(false);
+    }
+    catch (cuda::std::system_error& e)
+    {
+        assert(e.code().value() == EDEADLK);
+    }
+#endif
+    lk.unlock();
+    lk.release();
+#ifndef TEST_HAS_NO_EXCEPTIONS
+    try
+    {
+        lk.lock();
+        assert(false);
+    }
+    catch (cuda::std::system_error& e)
+    {
+        assert(e.code().value() == EPERM);
+    }
+#endif
+}
+
+template<class Mutex,
+    template<typename, typename> typename Selector,
+    typename Initializer = constructor_initializer>
+__host__ __device__ void test() {
+    Selector<Mutex, Initializer> sel;
+    SHARED Mutex* m;
+    m = sel.construct();
+
+    heterogeneous_thread_handler handler;
+    handler.run_on_first_thread(&Mutex::lock, m);
+    handler.syncthreads();
+
+    handler.run_on_second_thread(f<Mutex>, m);
+    handler.sleep_first_thread(ms(250));
+    handler.run_on_first_thread(&Mutex::unlock, m);
+    handler.join_test_thread();
+}
+
+int main(int, char**)
+{
+#ifndef __CUDA_ARCH__
+    cuda_thread_count = 2;
+
+    test<cuda::std::mutex, local_memory_selector>();
+#else
+    test<cuda::std::mutex, shared_memory_selector>();
+    test<cuda::std::mutex, global_memory_selector>();
+#endif
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock.pass.cpp
new file mode 100644
index 0000000000..00f1aa5a86
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock.pass.cpp
@@ -0,0 +1,75 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+//
+// FLAKY_TEST.
+
+// <mutex>
+
+// template <class Mutex> class unique_lock;
+
+// bool try_lock();
+
+#include<cuda/std/mutex>
+#include<cuda/std/cassert>
+
+#include "test_macros.h"
+
+STATIC_TEST_GLOBAL_VAR bool try_lock_called = false;
+
+struct mutex
+{
+    __host__ __device__ bool try_lock()
+    {
+        try_lock_called = !try_lock_called;
+        return try_lock_called;
+    }
+    __host__ __device__ void unlock() {}
+};
+
+STATIC_TEST_GLOBAL_VAR mutex m;
+
+int main(int, char**)
+{
+    cuda::std::unique_lock<mutex> lk(m, cuda::std::defer_lock);
+    assert(lk.try_lock() == true);
+    assert(try_lock_called == true);
+    assert(lk.owns_lock() == true);
+#ifndef TEST_HAS_NO_EXCEPTIONS
+    try
+    {
+        TEST_IGNORE_NODISCARD lk.try_lock();
+        assert(false);
+    }
+    catch (cuda::std::system_error& e)
+    {
+        assert(e.code().value() == EDEADLK);
+    }
+#endif
+    lk.unlock();
+    assert(lk.try_lock() == false);
+    assert(try_lock_called == false);
+    assert(lk.owns_lock() == false);
+    lk.release();
+#ifndef TEST_HAS_NO_EXCEPTIONS
+    try
+    {
+        TEST_IGNORE_NODISCARD lk.try_lock();
+        assert(false);
+    }
+    catch (cuda::std::system_error& e)
+    {
+        assert(e.code().value() == EPERM);
+    }
+#endif
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_for.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_for.pass.cpp
new file mode 100644
index 0000000000..df9b490b08
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_for.pass.cpp
@@ -0,0 +1,78 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// template <class Mutex> class unique_lock;
+
+// template <class Rep, class Period>
+//   bool try_lock_for(const chrono::duration<Rep, Period>& rel_time);
+
+#include<cuda/std/mutex>
+#include<cuda/std/cassert>
+
+#include "test_macros.h"
+
+STATIC_TEST_GLOBAL_VAR bool try_lock_for_called = false;
+
+typedef cuda::std::chrono::milliseconds ms;
+
+struct mutex
+{
+    template <class Rep, class Period>
+    __host__ __device__ bool try_lock_for(const cuda::std::chrono::duration<Rep, Period>& rel_time)
+    {
+        assert(rel_time == ms(5));
+        try_lock_for_called = !try_lock_for_called;
+        return try_lock_for_called;
+    }
+    __host__ __device__ void unlock() {}
+};
+
+STATIC_TEST_GLOBAL_VAR mutex m;
+
+int main(int, char**)
+{
+    cuda::std::unique_lock<mutex> lk(m, cuda::std::defer_lock);
+    assert(lk.try_lock_for(ms(5)) == true);
+    assert(try_lock_for_called == true);
+    assert(lk.owns_lock() == true);
+#ifndef TEST_HAS_NO_EXCEPTIONS
+    try
+    {
+        TEST_IGNORE_NODISCARD lk.try_lock_for(ms(5));
+        assert(false);
+    }
+    catch (cuda::std::system_error& e)
+    {
+        assert(e.code().value() == EDEADLK);
+    }
+#endif
+    lk.unlock();
+    assert(lk.try_lock_for(ms(5)) == false);
+    assert(try_lock_for_called == false);
+    assert(lk.owns_lock() == false);
+    lk.release();
+#ifndef TEST_HAS_NO_EXCEPTIONS
+    try
+    {
+        TEST_IGNORE_NODISCARD lk.try_lock_for(ms(5));
+        assert(false);
+    }
+    catch (cuda::std::system_error& e)
+    {
+        assert(e.code().value() == EPERM);
+    }
+#endif
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_until.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_until.pass.cpp
new file mode 100644
index 0000000000..0dcf049b4d
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_until.pass.cpp
@@ -0,0 +1,83 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// template <class Mutex> class unique_lock;
+
+// template <class Clock, class Duration>
+//   bool try_lock_until(const chrono::time_point<Clock, Duration>& abs_time);
+
+#include<cuda/std/mutex>
+#include<cuda/std/cassert>
+
+#include "test_macros.h"
+
+STATIC_TEST_GLOBAL_VAR bool try_lock_until_called = false;
+
+struct mutex
+{
+    template <class Clock, class Duration>
+    __host__ __device__ bool try_lock_until(const cuda::std::chrono::time_point<Clock, Duration>& abs_time)
+    {
+        typedef cuda::std::chrono::milliseconds ms;
+        assert(Clock::now() - abs_time < ms(5));
+        try_lock_until_called = !try_lock_until_called;
+        return try_lock_until_called;
+    }
+    __host__ __device__ void unlock() {}
+};
+
+STATIC_TEST_GLOBAL_VAR mutex m;
+
+#if defined(_LIBCUDACXX_HAS_NO_MONOTONIC_CLOCK)
+typedef cuda::std::chrono::system_clock Clock;
+#else
+typedef cuda::std::chrono::steady_clock Clock;
+#endif
+
+int main(int, char**)
+{
+    cuda::std::unique_lock<mutex> lk(m, cuda::std::defer_lock);
+    assert(lk.try_lock_until(Clock::now()) == true);
+    assert(try_lock_until_called == true);
+    assert(lk.owns_lock() == true);
+#ifndef TEST_HAS_NO_EXCEPTIONS
+    try
+    {
+        TEST_IGNORE_NODISCARD lk.try_lock_until(Clock::now());
+        assert(false);
+    }
+    catch (cuda::std::system_error& e)
+    {
+        assert(e.code().value() == EDEADLK);
+    }
+#endif
+    lk.unlock();
+    assert(lk.try_lock_until(Clock::now()) == false);
+    assert(try_lock_until_called == false);
+    assert(lk.owns_lock() == false);
+    lk.release();
+#ifndef TEST_HAS_NO_EXCEPTIONS
+    try
+    {
+        TEST_IGNORE_NODISCARD lk.try_lock_until(Clock::now());
+        assert(false);
+    }
+    catch (cuda::std::system_error& e)
+    {
+        assert(e.code().value() == EPERM);
+    }
+#endif
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/unlock.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/unlock.pass.cpp
new file mode 100644
index 0000000000..d77cb7c2da
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/unlock.pass.cpp
@@ -0,0 +1,65 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// template <class Mutex> class unique_lock;
+
+// void unlock();
+
+#include<cuda/std/mutex>
+#include<cuda/std/cassert>
+
+#include "test_macros.h"
+
+STATIC_TEST_GLOBAL_VAR bool unlock_called = false;
+
+struct mutex
+{
+    __host__ __device__ void lock() {}
+    __host__ __device__ void unlock() {unlock_called = true;}
+};
+
+STATIC_TEST_GLOBAL_VAR mutex m;
+
+int main(int, char**)
+{
+    cuda::std::unique_lock<mutex> lk(m);
+    lk.unlock();
+    assert(unlock_called == true);
+    assert(lk.owns_lock() == false);
+#ifndef TEST_HAS_NO_EXCEPTIONS
+    try
+    {
+        lk.unlock();
+        assert(false);
+    }
+    catch (cuda::std::system_error& e)
+    {
+        assert(e.code().value() == EPERM);
+    }
+#endif
+    lk.release();
+#ifndef TEST_HAS_NO_EXCEPTIONS
+    try
+    {
+        lk.unlock();
+        assert(false);
+    }
+    catch (cuda::std::system_error& e)
+    {
+        assert(e.code().value() == EPERM);
+    }
+#endif
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/member_swap.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/member_swap.pass.cpp
new file mode 100644
index 0000000000..133bd70d86
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/member_swap.pass.cpp
@@ -0,0 +1,43 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// template <class Mutex> class unique_lock;
+
+// void swap(unique_lock& u);
+
+#include<cuda/std/mutex>
+#include<cuda/std/cassert>
+
+#include "test_macros.h"
+
+struct mutex
+{
+    __host__ __device__ void lock() {}
+    __host__ __device__ void unlock() {}
+};
+
+STATIC_TEST_GLOBAL_VAR mutex m;
+
+int main(int, char**)
+{
+    cuda::std::unique_lock<mutex> lk1(m);
+    cuda::std::unique_lock<mutex> lk2;
+    lk1.swap(lk2);
+    assert(lk1.mutex() == nullptr);
+    assert(lk1.owns_lock() == false);
+    assert(lk2.mutex() == &m);
+    assert(lk2.owns_lock() == true);
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/nonmember_swap.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/nonmember_swap.pass.cpp
new file mode 100644
index 0000000000..97fe40d3bb
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/nonmember_swap.pass.cpp
@@ -0,0 +1,44 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// template <class Mutex> class unique_lock;
+
+// template <class Mutex>
+//   void swap(unique_lock<Mutex>& x, unique_lock<Mutex>& y);
+
+#include<cuda/std/mutex>
+#include<cuda/std/cassert>
+
+#include "test_macros.h"
+
+struct mutex
+{
+    __host__ __device__ void lock() {}
+    __host__ __device__ void unlock() {}
+};
+
+STATIC_TEST_GLOBAL_VAR mutex m;
+
+int main(int, char**)
+{
+    cuda::std::unique_lock<mutex> lk1(m);
+    cuda::std::unique_lock<mutex> lk2;
+    swap(lk1, lk2);
+    assert(lk1.mutex() == nullptr);
+    assert(lk1.owns_lock() == false);
+    assert(lk2.mutex() == &m);
+    assert(lk2.owns_lock() == true);
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/release.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/release.pass.cpp
new file mode 100644
index 0000000000..9224e3608a
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/release.pass.cpp
@@ -0,0 +1,48 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// template <class Mutex> class unique_lock;
+
+// mutex_type* release() noexcept;
+
+#include<cuda/std/mutex>
+#include<cuda/std/cassert>
+
+#include "test_macros.h"
+
+STATIC_TEST_GLOBAL_VAR int lock_count = 0;
+STATIC_TEST_GLOBAL_VAR int unlock_count = 0;
+struct mutex
+{
+    __host__ __device__ void lock() {++lock_count;}
+    __host__ __device__ void unlock() {++unlock_count;}
+};
+
+STATIC_TEST_GLOBAL_VAR mutex m;
+
+int main(int, char**)
+{
+    cuda::std::unique_lock<mutex> lk(m);
+    assert(lk.mutex() == &m);
+    assert(lk.owns_lock() == true);
+    assert(lock_count == 1);
+    assert(unlock_count == 0);
+    assert(lk.release() == &m);
+    assert(lk.mutex() == nullptr);
+    assert(lk.owns_lock() == false);
+    assert(lock_count == 1);
+    assert(unlock_count == 0);
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/mutex.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/mutex.pass.cpp
new file mode 100644
index 0000000000..312af9b41b
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/mutex.pass.cpp
@@ -0,0 +1,36 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// template <class Mutex> class unique_lock;
+
+// mutex_type *mutex() const;
+
+#include<cuda/std/mutex>
+#include<cuda/std/cassert>
+
+#include "test_macros.h"
+
+STATIC_TEST_GLOBAL_VAR cuda::std::mutex m;
+
+int main(int, char**)
+{
+    cuda::std::unique_lock<cuda::std::mutex> lk0;
+    assert(lk0.mutex() == nullptr);
+    cuda::std::unique_lock<cuda::std::mutex> lk1(m);
+    assert(lk1.mutex() == &m);
+    lk1.unlock();
+    assert(lk1.mutex() == &m);
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/op_bool.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/op_bool.pass.cpp
new file mode 100644
index 0000000000..d5be187499
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/op_bool.pass.cpp
@@ -0,0 +1,41 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// template <class Mutex> class unique_lock;
+
+// explicit operator bool() const noexcept;
+
+#include<cuda/std/mutex>
+#include<cuda/std/cassert>
+#include<cuda/std/type_traits>
+
+#include "test_macros.h"
+
+STATIC_TEST_GLOBAL_VAR cuda::std::mutex m;
+
+int main(int, char**)
+{
+    static_assert(cuda::std::is_constructible<bool, cuda::std::unique_lock<cuda::std::mutex> >::value, "");
+    static_assert(!cuda::std::is_convertible<cuda::std::unique_lock<cuda::std::mutex>, bool>::value, "");
+
+    cuda::std::unique_lock<cuda::std::mutex> lk0;
+    assert(static_cast<bool>(lk0) == false);
+    cuda::std::unique_lock<cuda::std::mutex> lk1(m);
+    assert(static_cast<bool>(lk1) == true);
+    lk1.unlock();
+    assert(static_cast<bool>(lk1) == false);
+    ASSERT_NOEXCEPT(static_cast<bool>(lk0));
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/owns_lock.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/owns_lock.pass.cpp
new file mode 100644
index 0000000000..872193dc7c
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/owns_lock.pass.cpp
@@ -0,0 +1,36 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// template <class Mutex> class unique_lock;
+
+// bool owns_lock() const;
+
+#include<cuda/std/mutex>
+#include<cuda/std/cassert>
+
+#include "test_macros.h"
+
+STATIC_TEST_GLOBAL_VAR cuda::std::mutex m;
+
+int main(int, char**)
+{
+    cuda::std::unique_lock<cuda::std::mutex> lk0;
+    assert(lk0.owns_lock() == false);
+    cuda::std::unique_lock<cuda::std::mutex> lk1(m);
+    assert(lk1.owns_lock() == true);
+    lk1.unlock();
+    assert(lk1.owns_lock() == false);
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/types.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/types.pass.cpp
new file mode 100644
index 0000000000..51b560aa03
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/types.pass.cpp
@@ -0,0 +1,34 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// template <class Mutex>
+// class unique_lock
+// {
+// public:
+//     typedef Mutex mutex_type;
+//     ...
+// };
+
+#include<cuda/std/mutex>
+#include<cuda/std/type_traits>
+
+#include "test_macros.h"
+
+int main(int, char**)
+{
+    static_assert((cuda::std::is_same<cuda::std::unique_lock<cuda::std::mutex>::mutex_type,
+                   cuda::std::mutex>::value), "");
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/types.fail.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/types.fail.cpp
new file mode 100644
index 0000000000..31b9170cdb
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/types.fail.cpp
@@ -0,0 +1,32 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: nvrtc
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// struct defer_lock_t { explicit defer_lock_t() = default; };
+// struct try_to_lock_t { explicit try_to_lock_t() = default; };
+// struct adopt_lock_t { explicit adopt_lock_t() = default; };
+
+// This test checks for LWG 2510.
+
+#include<cuda/std/mutex>
+
+
+cuda::std::defer_lock_t __host__ __device__ f1() { return {}; } // expected-error 1 {{chosen constructor is explicit in copy-initialization}}
+cuda::std::try_to_lock_t __host__ __device__ f2() { return {}; } // expected-error 1 {{chosen constructor is explicit in copy-initialization}}
+cuda::std::adopt_lock_t __host__ __device__ f3() { return {}; } // expected-error 1 {{chosen constructor is explicit in copy-initialization}}
+
+int main(int, char**) {
+    return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.lock/types.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/types.pass.cpp
new file mode 100644
index 0000000000..d8cd6106c4
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.lock/types.pass.cpp
@@ -0,0 +1,38 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// struct defer_lock_t { explicit defer_lock_t() = default; };
+// struct try_to_lock_t { explicit try_to_lock_t() = default; };
+// struct adopt_lock_t { explicit adopt_lock_t() = default; };
+//
+// constexpr defer_lock_t  defer_lock{};
+// constexpr try_to_lock_t try_to_lock{};
+// constexpr adopt_lock_t  adopt_lock{};
+
+#include<cuda/std/mutex>
+
+#include "test_macros.h"
+
+int main(int, char**)
+{
+    typedef cuda::std::defer_lock_t T1;
+    typedef cuda::std::try_to_lock_t T2;
+    typedef cuda::std::adopt_lock_t T3;
+
+    T1 t1 = cuda::std::defer_lock; unused(t1);
+    T2 t2 = cuda::std::try_to_lock; unused(t2);
+    T3 t3 = cuda::std::adopt_lock; unused(t3);
+
+    return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.general/nothing_to_do.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.general/nothing_to_do.pass.cpp
new file mode 100644
index 0000000000..f4d009f670
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.general/nothing_to_do.pass.cpp
@@ -0,0 +1,14 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+int main(int, char**)
+{
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/assign.compile.fail.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/assign.compile.fail.cpp
new file mode 100644
index 0000000000..c00fd45540
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/assign.compile.fail.cpp
@@ -0,0 +1,28 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: nvrtc
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// class mutex;
+
+// mutex& operator=(const mutex&) = delete;
+
+#include<cuda/std/mutex>
+
+int main(int, char**)
+{
+    cuda::std::mutex m0;
+    cuda::std::mutex m1;
+    m1 = m0;
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/copy.compile.fail.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/copy.compile.fail.cpp
new file mode 100644
index 0000000000..68e9fb3e09
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/copy.compile.fail.cpp
@@ -0,0 +1,27 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: nvrtc
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// class mutex;
+
+// mutex(const mutex&) = delete;
+
+#include<cuda/std/mutex>
+
+int main(int, char**)
+{
+    cuda::std::mutex m0;
+    cuda::std::mutex m1(m0);
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/default.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/default.pass.cpp
new file mode 100644
index 0000000000..6f4926216d
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/default.pass.cpp
@@ -0,0 +1,30 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// class mutex;
+
+// mutex();
+
+#include<cuda/std/mutex>
+#include<cuda/std/type_traits>
+
+#include "test_macros.h"
+
+int main(int, char**)
+{
+  static_assert(cuda::std::is_nothrow_default_constructible<cuda::std::mutex>::value, "");
+  cuda::std::mutex m;
+  unused(m);
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/lock.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/lock.pass.cpp
new file mode 100644
index 0000000000..27a3bc3de7
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/lock.pass.cpp
@@ -0,0 +1,75 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// FLAKY_TEST.
+
+// <mutex>
+
+// class mutex;
+
+// void lock();
+
+#include<cuda/std/mutex>
+#include<cuda/std/cassert>
+
+#include "test_macros.h"
+
+#include "cuda_space_selector.h"
+#include "heterogeneous_thread_handler.h"
+
+typedef cuda::std::chrono::system_clock Clock;
+typedef Clock::time_point time_point;
+typedef Clock::duration duration;
+typedef cuda::std::chrono::milliseconds ms;
+typedef cuda::std::chrono::nanoseconds ns;
+
+template<class Mutex>
+__host__ __device__ void f(Mutex* m)
+{
+    time_point t0 = Clock::now();
+    m->lock();
+    time_point t1 = Clock::now();
+    m->unlock();
+    assert(t1 - t0 >= ms(250));
+}
+
+template<class Mutex,
+    template<typename, typename> typename Selector,
+    typename Initializer = constructor_initializer>
+__host__ __device__ void test() {
+    Selector<Mutex, Initializer> sel;
+    SHARED Mutex* m;
+    m = sel.construct();
+
+    heterogeneous_thread_handler handler;
+    handler.run_on_first_thread(&Mutex::lock, m);
+    handler.syncthreads();
+
+    handler.run_on_second_thread(f<Mutex>, m);
+    handler.sleep_first_thread(ms(250));
+    handler.run_on_first_thread(&Mutex::unlock, m);
+    handler.join_test_thread();
+}
+
+int main(int, char**)
+{
+#ifndef __CUDA_ARCH__
+    cuda_thread_count = 2;
+
+    test<cuda::std::mutex, local_memory_selector>();
+#else
+    test<cuda::std::mutex, shared_memory_selector>();
+    test<cuda::std::mutex, global_memory_selector>();
+#endif
+
+    return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/try_lock.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/try_lock.pass.cpp
new file mode 100644
index 0000000000..4d8e2c2c6d
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/try_lock.pass.cpp
@@ -0,0 +1,79 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// FLAKY_TEST.
+
+// <mutex>
+
+// class mutex;
+
+// bool try_lock();
+
+#include<cuda/std/mutex>
+#include<cuda/std/cassert>
+
+#include "test_macros.h"
+
+#include "cuda_space_selector.h"
+#include "heterogeneous_thread_handler.h"
+
+typedef cuda::std::chrono::system_clock Clock;
+typedef Clock::time_point time_point;
+typedef Clock::duration duration;
+typedef cuda::std::chrono::milliseconds ms;
+typedef cuda::std::chrono::nanoseconds ns;
+
+template<class Mutex>
+__host__ __device__ void f(Mutex* m)
+{
+    time_point t0 = Clock::now();
+    assert(!m->try_lock());
+    assert(!m->try_lock());
+    assert(!m->try_lock());
+    while(!m->try_lock())
+        ;
+    m->unlock();
+    time_point t1 = Clock::now();
+    assert(t1 - t0 >= ms(250));
+}
+
+template<class Mutex,
+    template<typename, typename> typename Selector,
+    typename Initializer = constructor_initializer>
+__host__ __device__ void test() {
+    Selector<Mutex, Initializer> sel;
+    SHARED Mutex* m;
+    m = sel.construct();
+
+    heterogeneous_thread_handler handler;
+    handler.run_on_first_thread(&Mutex::lock, m);
+    handler.syncthreads();
+
+    handler.run_on_second_thread(f<Mutex>, m);
+    handler.sleep_first_thread(ms(250));
+    handler.run_on_first_thread(&Mutex::unlock, m);
+    handler.join_test_thread();
+}
+
+int main(int, char**)
+{
+#ifndef __CUDA_ARCH__
+    cuda_thread_count = 2;
+
+    test<cuda::std::mutex, local_memory_selector>();
+#else
+    test<cuda::std::mutex, shared_memory_selector>();
+    test<cuda::std::mutex, global_memory_selector>();
+#endif
+
+    return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/assign.compile.fail.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/assign.compile.fail.cpp
new file mode 100644
index 0000000000..2312e97c49
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/assign.compile.fail.cpp
@@ -0,0 +1,28 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: nvrtc
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// class timed_mutex;
+
+// timed_mutex& operator=(const timed_mutex&) = delete;
+
+#include<cuda/std/mutex>
+
+int main(int, char**)
+{
+    cuda::std::timed_mutex m0;
+    cuda::std::timed_mutex m1;
+    m1 = m0;
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/copy.compile.fail.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/copy.compile.fail.cpp
new file mode 100644
index 0000000000..37e27dcba4
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/copy.compile.fail.cpp
@@ -0,0 +1,27 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: nvrtc
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// class timed_mutex;
+
+// timed_mutex(const timed_mutex&) = delete;
+
+#include<cuda/std/mutex>
+
+int main(int, char**)
+{
+    cuda::std::timed_mutex m0;
+    cuda::std::timed_mutex m1(m0);
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/default.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/default.pass.cpp
new file mode 100644
index 0000000000..0266d23661
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/default.pass.cpp
@@ -0,0 +1,29 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// class timed_mutex;
+
+// timed_mutex();
+
+#include<cuda/std/mutex>
+
+#include "test_macros.h"
+
+int main(int, char**)
+{
+    cuda::std::timed_mutex m;
+    unused(m);
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/lock.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/lock.pass.cpp
new file mode 100644
index 0000000000..c9b27dd67d
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/lock.pass.cpp
@@ -0,0 +1,75 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// FLAKY_TEST.
+
+// <mutex>
+
+// class timed_mutex;
+
+// void lock();
+
+#include<cuda/std/mutex>
+#include<cuda/std/cassert>
+
+#include "test_macros.h"
+
+#include "cuda_space_selector.h"
+#include "heterogeneous_thread_handler.h"
+
+typedef cuda::std::chrono::system_clock Clock;
+typedef Clock::time_point time_point;
+typedef Clock::duration duration;
+typedef cuda::std::chrono::milliseconds ms;
+typedef cuda::std::chrono::nanoseconds ns;
+
+template<class Mutex>
+__host__ __device__ void f(Mutex* m)
+{
+    time_point t0 = Clock::now();
+    m->lock();
+    time_point t1 = Clock::now();
+    m->unlock();
+    assert(t1 - t0 >= ms(250));
+}
+
+template<class Mutex,
+    template<typename, typename> typename Selector,
+    typename Initializer = constructor_initializer>
+__host__ __device__ void test() {
+    Selector<Mutex, Initializer> sel;
+    SHARED Mutex* m;
+    m = sel.construct();
+
+    heterogeneous_thread_handler handler;
+    handler.run_on_first_thread(&Mutex::lock, m);
+    handler.syncthreads();
+
+    handler.run_on_second_thread(f<Mutex>, m);
+    handler.sleep_first_thread(ms(250));
+    handler.run_on_first_thread(&Mutex::unlock, m);
+    handler.join_test_thread();
+}
+
+int main(int, char**)
+{
+#ifndef __CUDA_ARCH__
+    cuda_thread_count = 2;
+
+    test<cuda::std::timed_mutex, local_memory_selector>();
+#else
+    test<cuda::std::timed_mutex, shared_memory_selector>();
+    test<cuda::std::timed_mutex, global_memory_selector>();
+#endif
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock.pass.cpp
new file mode 100644
index 0000000000..0454764ffe
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock.pass.cpp
@@ -0,0 +1,80 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// FLAKY_TEST.
+
+// <mutex>
+
+// class timed_mutex;
+
+// bool try_lock();
+
+#include<cuda/std/mutex>
+#include<cuda/std/cassert>
+#include<cuda/std/chrono>
+
+#include "test_macros.h"
+
+#include "cuda_space_selector.h"
+#include "heterogeneous_thread_handler.h"
+
+typedef cuda::std::chrono::system_clock Clock;
+typedef Clock::time_point time_point;
+typedef Clock::duration duration;
+typedef cuda::std::chrono::milliseconds ms;
+typedef cuda::std::chrono::nanoseconds ns;
+
+template<class Mutex>
+__host__ __device__ void f(Mutex* m)
+{
+    time_point t0 = Clock::now();
+    assert(!m->try_lock());
+    assert(!m->try_lock());
+    assert(!m->try_lock());
+    while(!m->try_lock())
+        ;
+    time_point t1 = Clock::now();
+    m->unlock();
+    assert(t1 - t0 >= ms(250));
+}
+
+template<class Mutex,
+    template<typename, typename> typename Selector,
+    typename Initializer = constructor_initializer>
+__host__ __device__ void test() {
+    Selector<Mutex, Initializer> sel;
+    SHARED Mutex* m;
+    m = sel.construct();
+
+    heterogeneous_thread_handler handler;
+    handler.run_on_first_thread(&Mutex::lock, m);
+    handler.syncthreads();
+
+    handler.run_on_second_thread(f<Mutex>, m);
+    handler.sleep_first_thread(ms(250));
+    handler.run_on_first_thread(&Mutex::unlock, m);
+    handler.join_test_thread();
+}
+
+int main(int, char**)
+{
+#ifndef __CUDA_ARCH__
+    cuda_thread_count = 2;
+
+    test<cuda::std::timed_mutex, local_memory_selector>();
+#else
+    test<cuda::std::timed_mutex, shared_memory_selector>();
+    test<cuda::std::timed_mutex, global_memory_selector>();
+#endif
+
+    return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock_for.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock_for.pass.cpp
new file mode 100644
index 0000000000..a51127efd3
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock_for.pass.cpp
@@ -0,0 +1,99 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// FLAKY_TEST.
+
+// <mutex>
+
+// class timed_mutex;
+
+// template <class Rep, class Period>
+//     bool try_lock_for(const chrono::duration<Rep, Period>& rel_time);
+
+#include<cuda/std/mutex>
+#include<cuda/std/cassert>
+
+#include "test_macros.h"
+
+#include "cuda_space_selector.h"
+#include "heterogeneous_thread_handler.h"
+
+typedef cuda::std::chrono::system_clock Clock;
+typedef Clock::time_point time_point;
+typedef Clock::duration duration;
+typedef cuda::std::chrono::milliseconds ms;
+typedef cuda::std::chrono::nanoseconds ns;
+
+template<class Mutex>
+__host__ __device__ void f1(Mutex* m)
+{
+    time_point t0 = Clock::now();
+    assert(m->try_lock_for(ms(500)) == true);
+    time_point t1 = Clock::now();
+    m->unlock();
+    assert(t1 - t0 >= ms(250));
+}
+
+template<class Mutex>
+__host__ __device__ void f2(Mutex* m)
+{
+    time_point t0 = Clock::now();
+    assert(m->try_lock_for(ms(200)) == false);
+    time_point t1 = Clock::now();
+    assert(t1 - t0 >= ms(200));
+}
+
+template<class Mutex,
+    template<typename, typename> typename Selector,
+    typename Initializer = constructor_initializer>
+__host__ __device__ void test() {
+    Selector<Mutex, Initializer> sel;
+    SHARED Mutex* m;
+    m = sel.construct();
+
+    heterogeneous_thread_handler handler;
+    {
+        handler.run_on_first_thread(&Mutex::lock, m);
+        handler.syncthreads();
+
+        handler.run_on_second_thread(f1<Mutex>, m);
+        handler.sleep_first_thread(ms(250));
+        handler.run_on_first_thread(&Mutex::unlock, m);
+        handler.join_test_thread();
+    }
+
+    handler.syncthreads();
+
+    {
+        handler.run_on_first_thread(&cuda::std::mutex::lock, m);
+        handler.syncthreads();
+
+        handler.run_on_second_thread(f2<Mutex>, m);
+        handler.sleep_first_thread(ms(250));
+        handler.run_on_first_thread(&cuda::std::mutex::unlock, m);
+        handler.join_test_thread();
+    }
+}
+
+int main(int, char**)
+{
+#ifndef __CUDA_ARCH__
+    cuda_thread_count = 2;
+
+    test<cuda::std::timed_mutex, local_memory_selector>();
+#else
+    test<cuda::std::timed_mutex, shared_memory_selector>();
+    test<cuda::std::timed_mutex, global_memory_selector>();
+#endif
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock_until.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock_until.pass.cpp
new file mode 100644
index 0000000000..656b6e7526
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock_until.pass.cpp
@@ -0,0 +1,99 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// FLAKY_TEST.
+
+// <mutex>
+
+// class timed_mutex;
+
+// template <class Clock, class Duration>
+//     bool try_lock_until(const chrono::time_point<Clock, Duration>& abs_time);
+
+#include<cuda/std/mutex>
+#include<cuda/std/cassert>
+
+#include "test_macros.h"
+
+#include "cuda_space_selector.h"
+#include "heterogeneous_thread_handler.h"
+
+typedef cuda::std::chrono::system_clock Clock;
+typedef Clock::time_point time_point;
+typedef Clock::duration duration;
+typedef cuda::std::chrono::milliseconds ms;
+typedef cuda::std::chrono::nanoseconds ns;
+
+template<class Mutex>
+__host__ __device__ void f1(Mutex* m)
+{
+    time_point t0 = Clock::now();
+    assert(m->try_lock_until(Clock::now() + ms(500)) == true);
+    time_point t1 = Clock::now();
+    m->unlock();
+    assert(t1 - t0 >= ms(250));
+}
+
+template<class Mutex>
+__host__ __device__ void f2(Mutex* m)
+{
+    time_point t0 = Clock::now();
+    assert(m->try_lock_until(Clock::now() + ms(200)) == false);
+    time_point t1 = Clock::now();
+    assert(t1 - t0 >= ms(200));
+}
+
+template<class Mutex,
+    template<typename, typename> typename Selector,
+    typename Initializer = constructor_initializer>
+__host__ __device__ void test() {
+    Selector<Mutex, Initializer> sel;
+    SHARED Mutex* m;
+    m = sel.construct();
+
+    heterogeneous_thread_handler handler;
+    {
+        handler.run_on_first_thread(&Mutex::lock, m);
+        handler.syncthreads();
+
+        handler.run_on_second_thread(f1<Mutex>, m);
+        handler.sleep_first_thread(ms(250));
+        handler.run_on_first_thread(&Mutex::unlock, m);
+        handler.join_test_thread();
+    }
+
+    handler.syncthreads();
+
+    {
+        handler.run_on_first_thread(&Mutex::lock, m);
+        handler.syncthreads();
+
+        handler.run_on_second_thread(f2<Mutex>, m);
+        handler.sleep_first_thread(ms(250));
+        handler.run_on_first_thread(&Mutex::unlock, m);
+        handler.join_test_thread();
+    }
+}
+
+int main(int, char**)
+{
+#ifndef __CUDA_ARCH__
+    cuda_thread_count = 2;
+
+    test<cuda::std::timed_mutex, local_memory_selector>();
+#else
+    test<cuda::std::timed_mutex, shared_memory_selector>();
+    test<cuda::std::timed_mutex, global_memory_selector>();
+#endif
+
+    return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.once/thread.once.callonce/call_once.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.once/thread.once.callonce/call_once.pass.cpp
new file mode 100644
index 0000000000..88a76ae7cd
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.once/thread.once.callonce/call_once.pass.cpp
@@ -0,0 +1,303 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// struct once_flag;
+
+// template<class Callable, class ...Args>
+//   void call_once(once_flag& flag, Callable&& func, Args&&... args);
+
+#include<cuda/mutex>
+#include<cuda/std/cassert>
+#include<cuda/std/chrono>
+
+#include "test_macros.h"
+
+#include "cuda_space_selector.h"
+#include "heterogeneous_thread_handler.h"
+
+typedef cuda::std::chrono::milliseconds ms;
+
+__host__ __device__
+void init0(int& init0_called)
+{
+    test_sleep_thread(ms(250));
+    ++init0_called;
+}
+
+template<class OnceFlag>
+__host__ __device__
+void f0(OnceFlag* flg0, int* init0_called)
+{
+    cuda::std::call_once(*flg0, init0, *init0_called);
+}
+
+#ifndef TEST_HAS_NO_EXCEPTIONS
+__host__ __device__
+void init3(int& init3_called, int& init3_completed)
+{
+    ++init3_called;
+    test_sleep_thread(ms(250));
+    if (init3_called == 1)
+#ifdef __CUDA_ARCH__
+        _LIBCUDACXX_UNREACHABLE();
+#else
+        TEST_THROW(1);
+#endif
+    ++init3_completed;
+}
+
+template<class OnceFlag>
+__host__ __device__
+void f3(OnceFlag* flg3, int* init3_called, int* init3_completed)
+{
+    try
+    {
+        cuda::std::call_once(*flg3, init3, *init3_called, *init3_completed);
+    }
+    catch (...)
+    {
+    }
+}
+#endif // TEST_HAS_NO_EXCEPTIONS
+
+#if TEST_STD_VER >= 11
+struct init1
+{
+    int& init1_called;
+    __host__ __device__ void operator()(int i) { init1_called += i; }
+};
+
+template<class OnceFlag>
+__host__ __device__
+void f1(OnceFlag* flg1, int* init1_called)
+{
+    cuda::std::call_once(*flg1, init1{*init1_called}, 1);
+}
+
+struct init2
+{
+    int& init2_called;
+    __host__ __device__ void operator()(int i, int j) const {init2_called += i + j;}
+};
+
+template<class OnceFlag>
+__host__ __device__
+void f2(OnceFlag* flg2, int* init2_called)
+{
+    cuda::std::call_once(*flg2, init2{*init2_called}, 2, 3);
+    cuda::std::call_once(*flg2, init2{*init2_called}, 4, 5);
+}
+
+#endif // TEST_STD_VER >= 11
+__host__ __device__
+void init41(int& init41_called)
+{
+    test_sleep_thread(ms(250));
+    ++init41_called;
+}
+
+__host__ __device__
+void init42(int& init42_called)
+{
+    test_sleep_thread(ms(250));
+    ++init42_called;
+}
+
+template<class OnceFlag>
+__host__ __device__
+void f41(OnceFlag* flg41, OnceFlag* flg42, int* init41_called, int* init42_called)
+{
+    cuda::std::call_once(*flg41, init41, *init41_called);
+    cuda::std::call_once(*flg42, init42, *init42_called);
+}
+
+template<class OnceFlag>
+__host__ __device__
+void f42(OnceFlag* flg41, OnceFlag* flg42, int* init41_called, int* init42_called)
+{
+    cuda::std::call_once(*flg41, init42, *init42_called);
+    cuda::std::call_once(*flg42, init41, *init41_called);
+}
+
+#if TEST_STD_VER >= 11
+
+class MoveOnly
+{
+    __host__ __device__ MoveOnly(const MoveOnly&);
+public:
+    __host__ __device__ MoveOnly() {}
+    __host__ __device__ MoveOnly(MoveOnly&&) {}
+
+    __host__ __device__ void operator()(MoveOnly&&) {}
+};
+
+class NonCopyable
+{
+    __host__ __device__ NonCopyable(const NonCopyable&);
+public:
+    __host__ __device__ NonCopyable() {}
+
+    __host__ __device__ void operator()(int&) {}
+};
+
+// reference qualifiers on functions are a C++11 extension
+struct RefQual
+{
+    __host__ __device__ void operator()( int& lv_called) & { ++lv_called; }
+    __host__ __device__ void operator()( int& rv_called) && { ++rv_called; }
+};
+
+#endif // TEST_STD_VER >= 11
+
+template<class OnceFlag,
+    template<typename, typename> typename Selector,
+    typename Initializer = constructor_initializer>
+__host__ __device__ void test() {
+    Selector<OnceFlag, Initializer> sel;
+    Selector<int, Initializer> sel_count;
+    heterogeneous_thread_handler handler;
+
+    // check basic functionality
+    {
+        SHARED OnceFlag* flg0;
+        flg0 = sel.construct();
+
+        SHARED int* init0_called;
+        init0_called = sel_count.construct();
+
+        handler.run_on_first_thread(f0<OnceFlag>, flg0, init0_called);
+        handler.run_on_second_thread(f0<OnceFlag>, flg0, init0_called);
+        handler.join_test_thread();
+        handler.syncthreads();
+        assert(*init0_called == 1);
+    }
+
+#ifndef TEST_HAS_NO_EXCEPTIONS
+    // check basic exception safety
+    {
+        SHARED OnceFlag* flg0;
+        flg0 = sel.construct();
+
+        SHARED int* init3_called;
+        init3_called = sel_count.construct();
+        SHARED int* init3_completed;
+        init3_completed = sel_count.construct();
+
+        handler.run_on_first_thread( f3<OnceFlag>, flg3, init3_called, init3_completed);
+        handler.run_on_second_thread(f3<OnceFlag>, flg3, init3_called, init3_completed);
+        handler.join_test_thread();
+        handler.syncthreads();
+
+        assert(*init3_called == 2);
+        assert(*init3_completed == 1);
+    }
+#endif
+    // check deadlock avoidance
+    {
+        SHARED OnceFlag* flg41;
+        flg41 = sel.construct();
+        SHARED OnceFlag* flg42;
+        flg42 = sel.construct();
+
+        SHARED int* init41_called;
+        init41_called = sel_count.construct();
+        SHARED int* init42_called;
+        init42_called = sel_count.construct();
+
+        handler.run_on_first_thread( f41<OnceFlag>, flg41, flg42, init41_called, init42_called);
+        handler.run_on_second_thread(f42<OnceFlag>, flg41, flg42, init41_called, init42_called);
+        handler.join_test_thread();
+        handler.syncthreads();
+
+        assert(*init41_called == 1);
+        assert(*init42_called == 1);
+    }
+#if TEST_STD_VER >= 11
+    // check functors with 1 arg
+    {
+        SHARED OnceFlag* flg1;
+        flg1 = sel.construct();
+
+        SHARED int* init1_called;
+        init1_called = sel_count.construct();
+
+        handler.run_on_first_thread( f1<OnceFlag>, flg1, init1_called);
+        handler.run_on_second_thread(f1<OnceFlag>, flg1, init1_called);
+        handler.join_test_thread();
+        handler.syncthreads();
+        assert(*init1_called == 1);
+    }
+    // check functors with 2 args
+    {
+        SHARED OnceFlag* flg2;
+        flg2 = sel.construct();
+
+        SHARED int* init2_called;
+        init2_called = sel_count.construct();
+
+        handler.run_on_first_thread( f2<OnceFlag>, flg2, init2_called);
+        handler.run_on_second_thread(f2<OnceFlag>, flg2, init2_called);
+        handler.join_test_thread();
+        handler.syncthreads();
+        assert(*init2_called == 5);
+    }
+
+    {
+        SHARED OnceFlag* f;
+        f = sel.construct();
+
+        cuda::std::call_once(*f, MoveOnly(), MoveOnly());
+    }
+    // check LWG2442: call_once() shouldn't DECAY_COPY()
+    {
+        SHARED OnceFlag* f;
+        f = sel.construct();
+        int i = 0;
+
+        cuda::std::call_once(*f, NonCopyable(), i);
+    }
+    // reference qualifiers on functions are a C++11 extension
+    {
+        SHARED OnceFlag* f1, *f2;
+        f1 = sel.construct();
+        f2 = sel.construct();
+
+        SHARED int* lv_called;
+        lv_called = sel_count.construct();
+
+        SHARED int* rv_called;
+        rv_called = sel_count.construct();
+
+        RefQual rq;
+        cuda::std::call_once(*f1, rq, *lv_called);
+        assert(*lv_called == 1);
+        cuda::std::call_once(*f2, cuda::std::move(rq), *rv_called);
+        assert(*rv_called == 1);
+    }
+#endif // TEST_STD_VER >= 11
+}
+
+int main(int, char**)
+{
+#ifndef __CUDA_ARCH__
+    cuda_thread_count = 2;
+
+    test<cuda::std::once_flag, local_memory_selector>();
+#else
+    test<cuda::std::once_flag, shared_memory_selector>();
+    test<cuda::std::once_flag, global_memory_selector>();
+#endif
+
+    return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.once/thread.once.callonce/race.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.once/thread.once.callonce/race.pass.cpp
new file mode 100644
index 0000000000..bcd3bcf708
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.once/thread.once.callonce/race.pass.cpp
@@ -0,0 +1,80 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// struct once_flag;
+
+// template<class Callable, class ...Args>
+//   void call_once(once_flag& flag, Callable&& func, Args&&... args);
+
+// This test is supposed to be run with ThreadSanitizer and verifies that
+// call_once properly synchronizes user state, a data race that was fixed
+// in r280621.
+#include<cuda/std/mutex>
+#include<cuda/std/cassert>
+#include<cuda/std/chrono>
+
+#include "test_macros.h"
+
+#include "cuda_space_selector.h"
+#include "heterogeneous_thread_handler.h"
+
+typedef cuda::std::chrono::milliseconds ms;
+
+__host__ __device__
+void init0(int& global)
+{
+    ++global;
+}
+
+template<class OnceFlag>
+__host__ __device__
+void f0(OnceFlag* flg, int* global)
+{
+    cuda::std::call_once(*flg, init0, *global);
+    assert(*global == 1);
+}
+
+template<class OnceFlag,
+    template<typename, typename> typename Selector,
+    typename Initializer = constructor_initializer>
+__host__ __device__ void test() {
+    Selector<OnceFlag, Initializer> sel;
+    SHARED OnceFlag* flg;
+    flg = sel.construct();
+
+    Selector<int, Initializer> sel_count;
+    SHARED int* global;
+    global = sel_count.construct();
+
+    heterogeneous_thread_handler handler;
+    handler.run_on_first_thread(f0<OnceFlag>, flg, global);
+    handler.run_on_second_thread(f0<OnceFlag>, flg, global);
+    handler.syncthreads();
+    handler.join_test_thread();
+    assert(*global == 1);
+}
+
+int main(int, char**)
+{
+#ifndef __CUDA_ARCH__
+    cuda_thread_count = 2;
+
+    test<cuda::std::once_flag, local_memory_selector>();
+#else
+    test<cuda::std::once_flag, shared_memory_selector>();
+    test<cuda::std::once_flag, global_memory_selector>();
+#endif
+
+    return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.once/thread.once.onceflag/assign.compile.fail.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.once/thread.once.onceflag/assign.compile.fail.cpp
new file mode 100644
index 0000000000..d475288211
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.once/thread.once.onceflag/assign.compile.fail.cpp
@@ -0,0 +1,29 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: nvrtc
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// struct once_flag;
+
+// once_flag& operator=(const once_flag&) = delete;
+
+#include<cuda/std/mutex>
+
+int main(int, char**)
+{
+    cuda::std::once_flag f;
+    cuda::std::once_flag f2;
+    f2 = f;
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.once/thread.once.onceflag/copy.compile.fail.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.once/thread.once.onceflag/copy.compile.fail.cpp
new file mode 100644
index 0000000000..36dd4fd423
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.once/thread.once.onceflag/copy.compile.fail.cpp
@@ -0,0 +1,28 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: nvrtc
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// struct once_flag;
+
+// once_flag(const once_flag&) = delete;
+
+#include<cuda/std/mutex>
+
+int main(int, char**)
+{
+    cuda::std::once_flag f;
+    cuda::std::once_flag f2(f);
+
+  return 0;
+}
diff --git a/.upstream-tests/test/std/thread/thread.mutex/thread.once/thread.once.onceflag/default.pass.cpp b/.upstream-tests/test/std/thread/thread.mutex/thread.once/thread.once.onceflag/default.pass.cpp
new file mode 100644
index 0000000000..da94c0c8a4
--- /dev/null
+++ b/.upstream-tests/test/std/thread/thread.mutex/thread.once/thread.once.onceflag/default.pass.cpp
@@ -0,0 +1,36 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: pre-sm-70
+
+// <mutex>
+
+// struct once_flag;
+
+// constexpr once_flag() noexcept;
+
+#include<cuda/std/mutex>
+#include "test_macros.h"
+
+int main(int, char**)
+{
+    {
+    cuda::std::once_flag f;
+    unused(f);
+    }
+#if TEST_STD_VER >= 11
+    {
+    constexpr cuda::std::once_flag f;
+    unused(f);
+    }
+#endif
+
+  return 0;
+}
diff --git a/.upstream-tests/test/support/heterogeneous_thread_handler.h b/.upstream-tests/test/support/heterogeneous_thread_handler.h
new file mode 100644
index 0000000000..d3bddb333f
--- /dev/null
+++ b/.upstream-tests/test/support/heterogeneous_thread_handler.h
@@ -0,0 +1,91 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TEST_SUPPORT_HETEROGENEOUS_TEST_THREAD_H
+#define TEST_SUPPORT_HETEROGENEOUS_TEST_THREAD_H
+
+#include <nv/target>
+
+#include <cuda/std/chrono>
+
+#ifndef __CUDACC_RTC__
+#include <chrono>
+#include <thread>
+#endif // __CUDACC_RTC__
+
+__host__ __device__
+void test_sleep_thread(cuda::std::chrono::milliseconds dur) {
+    constexpr cuda::std::chrono::nanoseconds max_sleep_duration{500000};
+    cuda::std::chrono::nanoseconds dur_mu{dur};
+    cuda::std::chrono::nanoseconds waited{0};
+    const auto start = cuda::std::chrono::high_resolution_clock::now();
+    while (dur_mu > waited) {
+        NV_IF_TARGET(
+            NV_IS_DEVICE,
+            __libcpp_thread_sleep_for(dur_mu - waited < max_sleep_duration ? dur_mu - waited : max_sleep_duration);,
+            std::this_thread::sleep_for(std::chrono::nanoseconds{(dur_mu - waited).count()});
+        )
+        waited = cuda::std::chrono::high_resolution_clock::now() - start;
+    }
+}
+
+struct heterogeneous_thread_handler {
+#ifndef __CUDACC_RTC__
+    union { std::thread t_; };
+#endif // __CUDACC_RTC__
+
+    __host__ __device__
+    heterogeneous_thread_handler() noexcept {}
+    __host__ __device__
+    ~heterogeneous_thread_handler() noexcept {}
+
+    template <class F, class ...Args>
+    __host__ __device__
+    void run_on_first_thread(F&& f, Args&& ...args) {
+        NV_IF_TARGET(
+            NV_IS_DEVICE,
+            if (threadIdx.x == 0) { cuda::std::__invoke(cuda::std::forward<F>(f), cuda::std::forward<Args>(args)...); },
+            cuda::std::__invoke(cuda::std::forward<F>(f), cuda::std::forward<Args>(args)...);
+        )
+    }
+
+    template <class F, class ...Args>
+    __host__ __device__
+    void run_on_second_thread(F&& f, Args&& ...args) {
+        NV_IF_TARGET(
+            NV_IS_DEVICE,
+            if (threadIdx.x == 1) { cuda::std::__invoke(cuda::std::forward<F>(f), cuda::std::forward<Args>(args)...); },
+            ::new((void*)std::addressof(t_)) std::thread(std::forward<F>(f), std::forward<Args>(args)...);
+        )
+    }
+
+    __host__ __device__
+    void sleep_first_thread(cuda::std::chrono::milliseconds dur) {
+        test_sleep_thread(dur);
+    }
+
+    __host__ __device__
+    void syncthreads() const {
+        NV_IF_TARGET(
+            NV_IS_DEVICE,
+            __syncthreads();
+        )
+    }
+
+    __host__ __device__
+    void join_test_thread() {
+        NV_IF_TARGET(
+            NV_IS_DEVICE,
+            (),
+            t_.join();
+        )
+    }
+};
+
+#endif // TEST_SUPPORT_HETEROGENEOUS_TEST_THREAD_H
diff --git a/.upstream-tests/test/support/nasty_containers.h b/.upstream-tests/test/support/nasty_containers.h
index bfc89170f1..f361288773 100644
--- a/.upstream-tests/test/support/nasty_containers.h
+++ b/.upstream-tests/test/support/nasty_containers.h
@@ -295,26 +295,26 @@ bool operator==(const nasty_list<T>& x, const nasty_list<T>& y) { return x.l_ ==
 class nasty_mutex
 {
 public:
-     nasty_mutex() TEST_NOEXCEPT {}
-     ~nasty_mutex() {}
+     __host__ __device__ nasty_mutex() TEST_NOEXCEPT {}
+     __host__ __device__ ~nasty_mutex() {}
 
-    nasty_mutex *operator& ()   { assert(false); return nullptr; }
+    __host__ __device__ nasty_mutex *operator& ()   { assert(false); return nullptr; }
     template <typename T>
-    void operator, (const T &) { assert(false); }
+    __host__ __device__ void operator, (const T &) { assert(false); }
 
 private:
-    nasty_mutex(const nasty_mutex&)            { assert(false); }
-    nasty_mutex& operator=(const nasty_mutex&) { assert(false); return *this; }
+    __host__ __device__ nasty_mutex(const nasty_mutex&)            { assert(false); }
+    __host__ __device__ nasty_mutex& operator=(const nasty_mutex&) { assert(false); return *this; }
 
 public:
-    void lock()               {}
-    bool try_lock() TEST_NOEXCEPT { return true; }
-    void unlock() TEST_NOEXCEPT   {}
+    __host__ __device__ void lock()               {}
+    __host__ __device__ bool try_lock() TEST_NOEXCEPT { return true; }
+    __host__ __device__ void unlock() TEST_NOEXCEPT   {}
 
     // Shared ownership
-    void lock_shared()     {}
-    bool try_lock_shared() { return true; }
-    void unlock_shared()   {}
+    __host__ __device__ void lock_shared()     {}
+    __host__ __device__ bool try_lock_shared() { return true; }
+    __host__ __device__ void unlock_shared()   {}
 };
 
 #endif

From c69ec0b191d3612ae222994b6de9083658e448fd Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Thu, 30 Mar 2023 16:39:58 +0200
Subject: [PATCH 3/4] Increase the retry limit for flaky tests even more

---
 .upstream-tests/utils/libcudacxx/test/format.py | 2 +-
 libcxx/utils/libcxx/test/format.py              | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.upstream-tests/utils/libcudacxx/test/format.py b/.upstream-tests/utils/libcudacxx/test/format.py
index 9d1011adb0..d30206b838 100644
--- a/.upstream-tests/utils/libcudacxx/test/format.py
+++ b/.upstream-tests/utils/libcudacxx/test/format.py
@@ -200,7 +200,7 @@ def _evaluate_pass_test(self, test, tmpBase, lit_config,
             data_files = [os.path.join(local_cwd, f)
                           for f in os.listdir(local_cwd) if f.endswith('.dat')]
             is_flaky = self._get_parser('FLAKY_TEST.', parsers).getValue()
-            max_retry = 3 if is_flaky else 1
+            max_retry = 10 if is_flaky else 1
             for retry_count in range(max_retry):
                 cmd, out, err, rc = self.executor.run(exec_path, [exec_path],
                                                       local_cwd, data_files,
diff --git a/libcxx/utils/libcxx/test/format.py b/libcxx/utils/libcxx/test/format.py
index 31c54a4f65..ceb6ced7e5 100644
--- a/libcxx/utils/libcxx/test/format.py
+++ b/libcxx/utils/libcxx/test/format.py
@@ -200,7 +200,7 @@ def _evaluate_pass_test(self, test, tmpBase, lit_config,
             data_files = [os.path.join(local_cwd, f)
                           for f in os.listdir(local_cwd) if f.endswith('.dat')]
             is_flaky = self._get_parser('FLAKY_TEST.', parsers).getValue()
-            max_retry = 5 if is_flaky else 1
+            max_retry = 10 if is_flaky else 1
             for retry_count in range(max_retry):
                 cmd, out, err, rc = self.executor.run(exec_path, [exec_path],
                                                       local_cwd, data_files,

From 7ce3b1d29dcd3199afe627624c1e484e315e1f5e Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Fri, 14 Apr 2023 10:20:33 +0200
Subject: [PATCH 4/4] Fix thread api for `cuda/std`

---
 include/cuda/std/detail/__config | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/include/cuda/std/detail/__config b/include/cuda/std/detail/__config
index e7a68974db..0000e7bf31 100644
--- a/include/cuda/std/detail/__config
+++ b/include/cuda/std/detail/__config
@@ -109,11 +109,8 @@
 
 #include "libcxx/include/__config"
 
-#if defined(__CUDA_ARCH__)
-    #define _LIBCUDACXX_HAS_THREAD_API_CUDA
-#elif defined(__EMSCRIPTEN__)
-    #define _LIBCUDACXX_HAS_THREAD_API_CUDA
-#elif defined(_LIBCUDACXX_COMPILER_MSVC)
+#define _LIBCUDACXX_HAS_THREAD_API_CUDA
+#if defined(_LIBCUDACXX_COMPILER_MSVC)
     #define _LIBCUDACXX_HAS_THREAD_API_WIN32
 #endif