diff --git a/.upstream-tests/test/support/nasty_containers.h b/.upstream-tests/test/support/nasty_containers.h
index c6584ed76a..bfc89170f1 100644
--- a/.upstream-tests/test/support/nasty_containers.h
+++ b/.upstream-tests/test/support/nasty_containers.h
@@ -9,12 +9,17 @@
 #ifndef NASTY_CONTAINERS_H
 #define NASTY_CONTAINERS_H
 
-#include
-#include
-#include
+#include
+#if defined(_LIBCUDACXX_HAS_VECTOR)
+#include
+#endif
+#if defined(_LIBCUDACXX_HAS_LIST)
+#include
+#endif
 
 #include "test_macros.h"
 
+#if defined(_LIBCUDACXX_HAS_VECTOR)
 template
 class nasty_vector
 {
@@ -135,7 +140,9 @@ class nasty_vector
 
 template
 bool operator==(const nasty_vector& x, const nasty_vector& y) { return x.v_ == y.v_; }
+#endif
 
+#if defined(_LIBCUDACXX_HAS_LIST)
 template
 class nasty_list
 {
@@ -282,6 +289,7 @@ class nasty_list
 
 template
 bool operator==(const nasty_list& x, const nasty_list& y) { return x.l_ == y.l_; }
+#endif
 
 // Not really a mutex, but can play one in tests
 class nasty_mutex
diff --git a/include/cuda/mutex b/include/cuda/mutex
new file mode 100644
index 0000000000..532cc533c8
--- /dev/null
+++ b/include/cuda/mutex
@@ -0,0 +1,30 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_MUTEX
+#define _CUDA_MUTEX
+
+#include "std/mutex"
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA
+
+template
+using mutex = _CUDA_VSTD::__mutex_base<_Sco>;
+
+template
+using timed_mutex = _CUDA_VSTD::__mutex_base<_Sco>;
+
+template
+using once_flag = _CUDA_VSTD::__once_flag_base<_Sco>;
+
+using _CUDA_VSTD::call_once;
+
+_LIBCUDACXX_END_NAMESPACE_CUDA
+
+#endif //_CUDA_MUTEX
diff --git a/include/cuda/std/detail/__config b/include/cuda/std/detail/__config
index dfcf70f413..dce06ffb08 100644
--- a/include/cuda/std/detail/__config
+++ b/include/cuda/std/detail/__config
@@ -79,10 +79,12 @@
 #define _LIBCUDACXX_HAS_NO_PLATFORM_WAIT
 #define _LIBCUDACXX_HAS_NO_MONOTONIC_CLOCK
 #define _LIBCUDACXX_HAS_NO_TREE_BARRIER
+#define _LIBCUDACXX_HAS_THREAD_API_EXTERNAL
+#define _LIBCUDACXX_INLINE_THREADING
+
 #ifdef __CUDACC_RTC__
     #define __ELF__
     #define _LIBCUDACXX_DISABLE_PRAGMA_GCC_SYSTEM_HEADER
-    #define _LIBCUDACXX_HAS_THREAD_API_EXTERNAL
    #define __alignof(x) alignof(x)
    #define _LIBCUDACXX_LITTLE_ENDIAN
    #define _LIBCUDACXX_DISABLE_VISIBILITY_ANNOTATIONS
@@ -104,9 +106,8 @@
 
 #include "libcxx/include/__config"
 
-#if defined(__CUDA_ARCH__)
-    #define _LIBCUDACXX_HAS_THREAD_API_CUDA
-#elif defined(_LIBCUDACXX_COMPILER_MSVC)
+#define _LIBCUDACXX_HAS_THREAD_API_CUDA
+#if defined(_LIBCUDACXX_COMPILER_MSVC)
     #define _LIBCUDACXX_HAS_THREAD_API_WIN32
 #endif
 
diff --git a/include/cuda/std/detail/libcxx/include/CMakeLists.txt b/include/cuda/std/detail/libcxx/include/CMakeLists.txt
index fa0ebc938c..bfe83f0a68 100644
--- a/include/cuda/std/detail/libcxx/include/CMakeLists.txt
+++ b/include/cuda/std/detail/libcxx/include/CMakeLists.txt
@@ -117,6 +117,7 @@ set(files
   __mdspan/submdspan.hpp
   __mdspan/type_list.hpp
   __memory/addressof.h
+  __memory/atomic_load.h
   __memory/pointer_traits.h
   __mutex_base
   __node_handle
diff --git a/include/cuda/std/detail/libcxx/include/__memory/atomic_load.h b/include/cuda/std/detail/libcxx/include/__memory/atomic_load.h
new file mode 100644
index 0000000000..ae060162ad
--- /dev/null
+++ b/include/cuda/std/detail/libcxx/include/__memory/atomic_load.h
@@ -0,0 +1,69 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCUDACXX___MEMORY_ATOMIMC_LOAD_H
+#define _LIBCUDACXX___MEMORY_ATOMIMC_LOAD_H
+
+#ifndef __cuda_std__
+#include <__config>
+#endif //__cuda_std__
+
+#include "../atomic"
+
+#if defined(_LIBCUDACXX_USE_PRAGMA_GCC_SYSTEM_HEADER)
+#pragma GCC system_header
+#endif
+
+_LIBCUDACXX_BEGIN_NAMESPACE_STD
+
+#ifndef __cuda_std__
+
+template
+inline _LIBCUDACXX_INLINE_VISIBILITY
+_ValueType __libcpp_relaxed_load(_ValueType const* __value) {
+#if !defined(_LIBCUDACXX_HAS_NO_THREADS) && \
+    defined(__ATOMIC_RELAXED) && \
+    (__has_builtin(__atomic_load_n) || defined(_LIBCUDACXX_COMPILER_GCC))
+    return __atomic_load_n(__value, __ATOMIC_RELAXED);
+#else
+    return *__value;
+#endif
+}
+
+template
+inline _LIBCUDACXX_INLINE_VISIBILITY
+_ValueType __libcpp_acquire_load(_ValueType const* __value) {
+#if !defined(_LIBCUDACXX_HAS_NO_THREADS) && \
+    defined(__ATOMIC_ACQUIRE) && \
+    (__has_builtin(__atomic_load_n) || defined(_LIBCUDACXX_COMPILER_GCC))
+    return __atomic_load_n(__value, __ATOMIC_ACQUIRE);
+#else
+    return *__value;
+#endif
+}
+
+#else
+
+template
+inline _LIBCUDACXX_INLINE_VISIBILITY
+_ValueType __libcpp_relaxed_load(atomic<_ValueType> const* __value) {
+    return __value->load(memory_order_relaxed);
+}
+
+template
+inline _LIBCUDACXX_INLINE_VISIBILITY
+_ValueType __libcpp_acquire_load(atomic<_ValueType> const* __value) {
+    return __value->load(memory_order_acquire);
+}
+#endif // __cuda_std__
+
+_LIBCUDACXX_END_NAMESPACE_STD
+
+#endif // _LIBCUDACXX___MEMORY_ATOMIMC_LOAD_H
diff --git a/include/cuda/std/detail/libcxx/include/__mutex_base b/include/cuda/std/detail/libcxx/include/__mutex_base
index bae7c7c210..b8229c6672 100644
--- a/include/cuda/std/detail/libcxx/include/__mutex_base
+++ b/include/cuda/std/detail/libcxx/include/__mutex_base
@@ -10,23 +10,40 @@
 #ifndef _LIBCUDACXX___MUTEX_BASE
 #define _LIBCUDACXX___MUTEX_BASE
 
+#ifndef __cuda_std__
 #include <__config>
-#include
 #include
-#include <__threading_support>
-
-#include
+#endif // __cuda_std__
+
+#include "__memory/addressof.h"
+#include "__memory/atomic_load.h"
+#include "__threading_support"
+#include "__type_traits/enable_if.h"
+#include "__type_traits/is_floating_point.h"
+#include "__type_traits/is_nothrow_default_constructible.h"
+#include "__utility/unreachable.h"
+#include "chrono"
+#include "ctime"
+#include "semaphore"
+
+#ifndef __cuda_std__
+#include <__pragma_push>
+#endif // __cuda_std__
 
 #if defined(_LIBCUDACXX_USE_PRAGMA_GCC_SYSTEM_HEADER)
 #pragma GCC system_header
 #endif
 
-_LIBCUDACXX_PUSH_MACROS
-#include <__undef_macros>
-
-
 _LIBCUDACXX_BEGIN_NAMESPACE_STD
 
+#ifdef __cuda_std__
+_LIBCUDACXX_INLINE_VISIBILITY
+inline void __throw_system_error(int, const char*)
+{
+  __libcpp_unreachable();
+}
+#endif // __cuda_std__
+
 #ifndef _LIBCUDACXX_HAS_NO_THREADS
 
 #ifndef _LIBCUDACXX_THREAD_SAFETY_ANNOTATION
@@ -37,34 +54,73 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD
 # endif
 #endif // _LIBCUDACXX_THREAD_SAFETY_ANNOTATION
 
+#ifndef __cuda_std__
+template
+using __libcpp_mutex_base_t = __libcpp_mutex_t;
+#else
+template
+using __libcpp_mutex_base_t = __atomic_semaphore_base<_Sco,1>;
+
+#undef _LIBCUDACXX_MUTEX_INITIALIZER
+#define _LIBCUDACXX_MUTEX_INITIALIZER {1ll}
+#endif // __cuda_std__
 
-class _LIBCUDACXX_TYPE_VIS _LIBCUDACXX_THREAD_SAFETY_ANNOTATION(capability("mutex")) mutex
+template
+class _LIBCUDACXX_TYPE_VIS _LIBCUDACXX_THREAD_SAFETY_ANNOTATION(capability("mutex")) __mutex_base
 {
-    __libcpp_mutex_t __m_ = _LIBCUDACXX_MUTEX_INITIALIZER;
+    __libcpp_mutex_base_t<_Sco> __m_ = _LIBCUDACXX_MUTEX_INITIALIZER;
 
 public:
     _LIBCUDACXX_INLINE_VISIBILITY
-    _LIBCUDACXX_CONSTEXPR mutex() = default;
+    constexpr __mutex_base() noexcept {}
 
-    mutex(const mutex&) = delete;
-    mutex& operator=(const mutex&) = delete;
+    __mutex_base(const __mutex_base&) = delete;
+    __mutex_base& operator=(const __mutex_base&) = delete;
 
-#if defined(_LIBCUDACXX_HAS_TRIVIAL_MUTEX_DESTRUCTION)
-    ~mutex() = default;
+#if defined(_LIBCUDACXX_HAS_TRIVIAL_MUTEX_DESTRUCTION) || defined(__cuda_std__)
+    ~__mutex_base() = default;
 #else
-    ~mutex() _NOEXCEPT;
+    ~__mutex_base() _NOEXCEPT;
 #endif
 
+#ifndef _LIBCUDACXX_INLINE_THREADING
     void lock() _LIBCUDACXX_THREAD_SAFETY_ANNOTATION(acquire_capability());
     bool try_lock() _NOEXCEPT _LIBCUDACXX_THREAD_SAFETY_ANNOTATION(try_acquire_capability(true));
     void unlock() _NOEXCEPT _LIBCUDACXX_THREAD_SAFETY_ANNOTATION(release_capability());
+#else
+    _LIBCUDACXX_INLINE_VISIBILITY
+    void lock() _LIBCUDACXX_THREAD_SAFETY_ANNOTATION(acquire_capability()) {
+        __m_.acquire(); //while(!__m_.exchange(0));
+    }
+    _LIBCUDACXX_INLINE_VISIBILITY
+    bool try_lock() _NOEXCEPT _LIBCUDACXX_THREAD_SAFETY_ANNOTATION(try_acquire_capability(true)) {
+        return __m_.try_acquire();
+    }
+    _LIBCUDACXX_INLINE_VISIBILITY
+    void unlock() _NOEXCEPT _LIBCUDACXX_THREAD_SAFETY_ANNOTATION(release_capability()) {
+        __m_.release(); //__m_.store(1);
+    }
+
+    template
+    _LIBCUDACXX_INLINE_VISIBILITY
+    bool try_lock_for(const chrono::duration<_Rep, _Period>& __d) {
+        return __m_.try_acquire_for(__d);
+    }
 
-    typedef __libcpp_mutex_t* native_handle_type;
+    template
+    _LIBCUDACXX_INLINE_VISIBILITY
+    bool try_lock_until(const chrono::time_point<_Clock, _Duration>& __t) {
+        return __m_.try_acquire_until(__t);
+    }
+#endif
+
+    typedef __libcpp_mutex_base_t<_Sco>* native_handle_type;
     _LIBCUDACXX_INLINE_VISIBILITY native_handle_type native_handle() {return &__m_;}
 };
 
-static_assert(is_nothrow_default_constructible::value,
-              "the default constructor for std::mutex must be nothrow");
+using mutex = __mutex_base<0>;
+
+static_assert(is_nothrow_default_constructible::value, "the default constructor for std::mutex must be nothrow");
 
 struct _LIBCUDACXX_TYPE_VIS defer_lock_t { explicit defer_lock_t() = default; };
 struct _LIBCUDACXX_TYPE_VIS try_to_lock_t { explicit try_to_lock_t() = default; };
@@ -78,9 +134,9 @@ extern _LIBCUDACXX_EXPORTED_FROM_ABI const adopt_lock_t adopt_lock;
 
 #else
 
-/* _LIBCUDACXX_INLINE_VAR */ constexpr defer_lock_t defer_lock = defer_lock_t();
-/* _LIBCUDACXX_INLINE_VAR */ constexpr try_to_lock_t try_to_lock = try_to_lock_t();
-/* _LIBCUDACXX_INLINE_VAR */ constexpr adopt_lock_t adopt_lock = adopt_lock_t();
+_LIBCUDACXX_CPO_ACCESSIBILITY defer_lock_t defer_lock = defer_lock_t();
+_LIBCUDACXX_CPO_ACCESSIBILITY try_to_lock_t try_to_lock = try_to_lock_t();
+_LIBCUDACXX_CPO_ACCESSIBILITY adopt_lock_t adopt_lock = adopt_lock_t();
 
 #endif
 
@@ -151,8 +207,8 @@ public:
     }
 
 private:
-    unique_lock(unique_lock const&); // = delete;
-    unique_lock& operator=(unique_lock const&); // = delete;
+    unique_lock(unique_lock const&) = delete;
+    unique_lock& operator=(unique_lock const&) = delete;
 
 public:
 #ifndef _LIBCUDACXX_CXX03_LANG
@@ -174,14 +230,18 @@ public:
 
 #endif // _LIBCUDACXX_CXX03_LANG
 
-    void lock();
-    bool try_lock();
+    _LIBCUDACXX_INLINE_VISIBILITY void lock();
+    _LIBCUDACXX_INLINE_VISIBILITY bool try_lock();
 
     template
-        bool try_lock_for(const chrono::duration<_Rep, _Period>& __d);
+        _LIBCUDACXX_INLINE_VISIBILITY
+        bool try_lock_for(const chrono::duration<_Rep, _Period>& __d);
+
     template
-        bool try_lock_until(const chrono::time_point<_Clock, _Duration>& __t);
+        _LIBCUDACXX_INLINE_VISIBILITY
+        bool try_lock_until(const chrono::time_point<_Clock, _Duration>& __t);
+
+    _LIBCUDACXX_INLINE_VISIBILITY
     void unlock();
 
     _LIBCUDACXX_INLINE_VISIBILITY
@@ -201,6 +261,7 @@ public:
     _LIBCUDACXX_INLINE_VISIBILITY
     bool owns_lock() const _NOEXCEPT {return __owns_;}
 
+    _LIBCUDACXX_INLINE_VISIBILITY
     _LIBCUDACXX_EXPLICIT
         operator bool () const _NOEXCEPT {return __owns_;}
 
@@ -209,61 +270,71 @@ public:
 };
 
 template
-void
-unique_lock<_Mutex>::lock()
+_LIBCUDACXX_INLINE_VISIBILITY
+void unique_lock<_Mutex>::lock()
 {
+#ifndef _LIBCUDACXX_NO_EXCEPTIONS
     if (__m_ == nullptr)
         __throw_system_error(EPERM, "unique_lock::lock: references null mutex");
     if (__owns_)
         __throw_system_error(EDEADLK, "unique_lock::lock: already locked");
+#endif // _LIBCUDACXX_NO_EXCEPTIONS
     __m_->lock();
     __owns_ = true;
 }
 
 template
-bool
-unique_lock<_Mutex>::try_lock()
+_LIBCUDACXX_INLINE_VISIBILITY
+bool unique_lock<_Mutex>::try_lock()
 {
+#ifndef _LIBCUDACXX_NO_EXCEPTIONS
     if (__m_ == nullptr)
         __throw_system_error(EPERM, "unique_lock::try_lock: references null mutex");
     if (__owns_)
         __throw_system_error(EDEADLK, "unique_lock::try_lock: already locked");
+#endif // _LIBCUDACXX_NO_EXCEPTIONS
     __owns_ = __m_->try_lock();
     return __owns_;
 }
 
 template
 template
-bool
-unique_lock<_Mutex>::try_lock_for(const chrono::duration<_Rep, _Period>& __d)
+_LIBCUDACXX_INLINE_VISIBILITY
+bool unique_lock<_Mutex>::try_lock_for(const chrono::duration<_Rep, _Period>& __d)
 {
+#ifndef _LIBCUDACXX_NO_EXCEPTIONS
     if (__m_ == nullptr)
         __throw_system_error(EPERM, "unique_lock::try_lock_for: references null mutex");
     if (__owns_)
         __throw_system_error(EDEADLK, "unique_lock::try_lock_for: already locked");
+#endif // _LIBCUDACXX_NO_EXCEPTIONS
     __owns_ = __m_->try_lock_for(__d);
     return __owns_;
 }
 
 template
 template
-bool
+_LIBCUDACXX_INLINE_VISIBILITY bool
 unique_lock<_Mutex>::try_lock_until(const chrono::time_point<_Clock, _Duration>& __t)
 {
+#ifndef _LIBCUDACXX_NO_EXCEPTIONS
     if (__m_ == nullptr)
         __throw_system_error(EPERM, "unique_lock::try_lock_until: references null mutex");
     if (__owns_)
         __throw_system_error(EDEADLK, "unique_lock::try_lock_until: already locked");
+#endif // _LIBCUDACXX_NO_EXCEPTIONS
     __owns_ = __m_->try_lock_until(__t);
     return __owns_;
 }
 
-template
+template
 _LIBCUDACXX_INLINE_VISIBILITY
 void
 unique_lock<_Mutex>::unlock()
 {
+#ifndef _LIBCUDACXX_NO_EXCEPTIONS
     if (!__owns_)
         __throw_system_error(EPERM, "unique_lock::unlock: not locked");
+#endif // _LIBCUDACXX_NO_EXCEPTIONS
     __m_->unlock();
     __owns_ = false;
 }
 
@@ -274,6 +345,8 @@ void
 swap(unique_lock<_Mutex>& __x, unique_lock<_Mutex>& __y) _NOEXCEPT
     {__x.swap(__y);}
 
+#ifndef _LIBCUDACXX_HAS_THREAD_API_CUDA
+
 //enum class cv_status
 _LIBCUDACXX_DECLARE_STRONG_ENUM(cv_status)
 {
@@ -346,15 +419,17 @@ private:
     void __do_timed_wait(unique_lock& __lk,
                          chrono::time_point<_Clock, chrono::nanoseconds>) _NOEXCEPT;
 };
+
+#endif // _LIBCUDACXX_HAS_THREAD_API_CUDA
 
 #endif // !_LIBCUDACXX_HAS_NO_THREADS
 
 template
 inline _LIBCUDACXX_INLINE_VISIBILITY
-typename enable_if
+__enable_if_t
 <
     is_floating_point<_Rep>::value,
     chrono::nanoseconds
->::type
+>
 __safe_nanosecond_cast(chrono::duration<_Rep, _Period> __d)
 {
     using namespace chrono;
@@ -377,11 +452,11 @@ __safe_nanosecond_cast(chrono::duration<_Rep, _Period> __d)
 
 template
 inline _LIBCUDACXX_INLINE_VISIBILITY
-typename enable_if
+__enable_if_t
 <
     !is_floating_point<_Rep>::value,
     chrono::nanoseconds
->::type
+>
 __safe_nanosecond_cast(chrono::duration<_Rep, _Period> __d)
 {
     using namespace chrono;
@@ -410,6 +485,8 @@ __safe_nanosecond_cast(chrono::duration<_Rep, _Period> __d)
 }
 
 #ifndef _LIBCUDACXX_HAS_NO_THREADS
+#ifndef _LIBCUDACXX_HAS_THREAD_API_CUDA
+
 template
 void
 condition_variable::wait(unique_lock& __lk, _Predicate __pred)
@@ -532,10 +609,13 @@ condition_variable::__do_timed_wait(unique_lock& __lk,
     wait_for(__lk, __tp - _Clock::now());
 }
 
+#endif //_LIBCUDACXX_HAS_THREAD_API_CUDA
 #endif // !_LIBCUDACXX_HAS_NO_THREADS
 
 _LIBCUDACXX_END_NAMESPACE_STD
 
-_LIBCUDACXX_POP_MACROS
+#ifndef __cuda_std__
+#include <__pragma_pop>
+#endif // __cuda_std__
 
 #endif // _LIBCUDACXX___MUTEX_BASE
diff --git a/include/cuda/std/detail/libcxx/include/memory b/include/cuda/std/detail/libcxx/include/memory
index 1258a67e16..f59ce610bf 100644
--- a/include/cuda/std/detail/libcxx/include/memory
+++ b/include/cuda/std/detail/libcxx/include/memory
@@ -663,6 +663,7 @@ void* align(size_t alignment, size_t size, void*& ptr, size_t& space);
 #include "__iterator/iterator_traits.h"
 #include "__iterator/iterator.h"
 #include "__memory/addressof.h"
+#include "__memory/atomic_load.h"
 #include "__memory/pointer_traits.h"
 #include "__tuple_dir/tuple_indices.h"
 #include "__type_traits/decay.h"
@@ -707,33 +708,6 @@ void* align(size_t alignment, size_t size, void*& ptr, size_t& space);
 #endif
 
 _LIBCUDACXX_BEGIN_NAMESPACE_STD
-
-template
-inline _LIBCUDACXX_INLINE_VISIBILITY
-_ValueType __libcpp_relaxed_load(_ValueType const* __value) {
-#if !defined(_LIBCUDACXX_HAS_NO_THREADS) && \
-    defined(__ATOMIC_RELAXED) && \
-    (__has_builtin(__atomic_load_n) || defined(_LIBCUDACXX_COMPILER_GCC))
-    return __atomic_load_n(__value, __ATOMIC_RELAXED);
-#else
-    return *__value;
-#endif
-}
-
-template
-inline _LIBCUDACXX_INLINE_VISIBILITY
-_ValueType __libcpp_acquire_load(_ValueType const* __value) {
-#if !defined(_LIBCUDACXX_HAS_NO_THREADS) && \
-    defined(__ATOMIC_ACQUIRE) && \
-    (__has_builtin(__atomic_load_n) || defined(_LIBCUDACXX_COMPILER_GCC))
-    return __atomic_load_n(__value, __ATOMIC_ACQUIRE);
-#else
-    return *__value;
-#endif
-}
-
-// addressof moved to
-
 template class allocator;
 
 template <>
diff --git a/include/cuda/std/detail/libcxx/include/mutex b/include/cuda/std/detail/libcxx/include/mutex
index 7a454e5609..eab4957455 100644
--- a/include/cuda/std/detail/libcxx/include/mutex
+++ b/include/cuda/std/detail/libcxx/include/mutex
@@ -186,28 +186,31 @@ template
 
 */
 
+#ifndef __cuda_std__
 #include <__config>
-#include <__mutex_base>
-#include
-#include
-#include
-#ifndef _LIBCUDACXX_CXX03_LANG
-#include
-#endif
-#include
-#include <__threading_support>
+#include   // for __libcpp_acquire_load
+#endif // __cuda_std__
+
+#include "__mutex_base"
+#include "__threading_support"
+#include "__utility/forward.h"
+#include "cstdint"
+#include "functional"
+#include "tuple"
+#include "version"
+
+#ifndef __cuda_std__
+#include <__pragma_push>
+#endif // __cuda_std__
 
 #if defined(_LIBCUDACXX_USE_PRAGMA_GCC_SYSTEM_HEADER)
 #pragma GCC system_header
 #endif
 
-_LIBCUDACXX_PUSH_MACROS
-#include <__undef_macros>
-
-
 _LIBCUDACXX_BEGIN_NAMESPACE_STD
 
 #ifndef _LIBCUDACXX_HAS_NO_THREADS
+#ifndef _LIBCUDACXX_HAS_THREAD_API_CUDA
 
 class _LIBCUDACXX_TYPE_VIS recursive_mutex
 {
@@ -303,8 +306,7 @@ public:
 };
 
 template
-bool
-recursive_timed_mutex::try_lock_until(const chrono::time_point<_Clock, _Duration>& __t)
+bool recursive_timed_mutex::try_lock_until(const chrono::time_point<_Clock, _Duration>& __t)
 {
     using namespace chrono;
     __thread_id __id = this_thread::get_id();
@@ -327,10 +329,15 @@ recursive_timed_mutex::try_lock_until(const chrono::time_point<_Clock, _Duration
         }
         return false;
 }
+#else
+
+using timed_mutex = __mutex_base<0>;
+
+#endif // _LIBCUDACXX_HAS_THREAD_API_CUDA
 
 template
-int
-try_lock(_L0& __l0, _L1& __l1)
+_LIBCUDACXX_INLINE_VISIBILITY
+int try_lock(_L0& __l0, _L1& __l1)
 {
     unique_lock<_L0> __u0(__l0, try_to_lock);
     if (__u0.owns_lock())
@@ -346,11 +353,9 @@ try_lock(_L0& __l0, _L1& __l1)
     return 0;
 }
 
-#ifndef _LIBCUDACXX_CXX03_LANG
-
 template
-int
-try_lock(_L0& __l0, _L1& __l1, _L2& __l2, _L3&... __l3)
+_LIBCUDACXX_INLINE_VISIBILITY
+int try_lock(_L0& __l0, _L1& __l1, _L2& __l2, _L3&... __l3)
 {
     int __r = 0;
     unique_lock<_L0> __u0(__l0, try_to_lock);
@@ -365,11 +370,9 @@ try_lock(_L0& __l0, _L1& __l1, _L2& __l2, _L3&... __l3)
     return __r;
 }
 
-#endif // _LIBCUDACXX_CXX03_LANG
-
 template
-void
-lock(_L0& __l0, _L1& __l1)
+_LIBCUDACXX_INLINE_VISIBILITY
+void lock(_L0& __l0, _L1& __l1)
 {
     while (true)
     {
@@ -394,11 +397,9 @@ lock(_L0& __l0, _L1& __l1)
     }
 }
 
-#ifndef _LIBCUDACXX_CXX03_LANG
-
 template
-void
-__lock_first(int __i, _L0& __l0, _L1& __l1, _L2& __l2, _L3& ...__l3)
+_LIBCUDACXX_INLINE_VISIBILITY
+void __lock_first(int __i, _L0& __l0, _L1& __l1, _L2& __l2, _L3& ...__l3)
 {
     while (true)
     {
@@ -469,8 +470,6 @@ void
 __unlock(_L0& __l0, _L1& __l1, _L2& __l2, _L3&... __l3)
 {
     _CUDA_VSTD::__unlock(__l2, __l3...);
 }
 
-#endif // _LIBCUDACXX_CXX03_LANG
-
 #if _LIBCUDACXX_STD_VER > 14
 template
 class _LIBCUDACXX_TEMPLATE_VIS scoped_lock;
@@ -478,6 +477,7 @@ class _LIBCUDACXX_TEMPLATE_VIS scoped_lock;
 template <>
 class _LIBCUDACXX_TEMPLATE_VIS scoped_lock<> {
 public:
+    _LIBCUDACXX_INLINE_VISIBILITY
     explicit scoped_lock() {}
     ~scoped_lock() = default;
 
@@ -495,13 +495,13 @@ public:
 private:
     mutex_type& __m_;
 public:
-    explicit scoped_lock(mutex_type & __m) _LIBCUDACXX_THREAD_SAFETY_ANNOTATION(acquire_capability(__m))
+    _LIBCUDACXX_INLINE_VISIBILITY explicit scoped_lock(mutex_type & __m) _LIBCUDACXX_THREAD_SAFETY_ANNOTATION(acquire_capability(__m))
         : __m_(__m) {__m_.lock();}
-    ~scoped_lock() _LIBCUDACXX_THREAD_SAFETY_ANNOTATION(release_capability()) {__m_.unlock();}
+    _LIBCUDACXX_INLINE_VISIBILITY ~scoped_lock() _LIBCUDACXX_THREAD_SAFETY_ANNOTATION(release_capability()) {__m_.unlock();}
 
     _LIBCUDACXX_INLINE_VISIBILITY
-    explicit scoped_lock(adopt_lock_t, mutex_type& __m) _LIBCUDACXX_THREAD_SAFETY_ANNOTATION(requires_capability(__m))
+    _LIBCUDACXX_INLINE_VISIBILITY explicit scoped_lock(adopt_lock_t, mutex_type& __m) _LIBCUDACXX_THREAD_SAFETY_ANNOTATION(requires_capability(__m))
         : __m_(__m) {}
 
     scoped_lock(scoped_lock const&) = delete;
@@ -550,61 +550,39 @@ private:
 #endif // _LIBCUDACXX_STD_VER > 14
 #endif // !_LIBCUDACXX_HAS_NO_THREADS
 
-struct _LIBCUDACXX_TEMPLATE_VIS once_flag;
-
-#ifndef _LIBCUDACXX_CXX03_LANG
-
-template
-_LIBCUDACXX_INLINE_VISIBILITY
-void call_once(once_flag&, _Callable&&, _Args&&...);
-
-#else // _LIBCUDACXX_CXX03_LANG
+template
+struct _LIBCUDACXX_TEMPLATE_VIS __once_flag_base;
 
-template
+template
 _LIBCUDACXX_INLINE_VISIBILITY
-void call_once(once_flag&, _Callable&);
+void call_once(__once_flag_base<_Sco>&, _Callable&&, _Args&&...);
 
-template
-_LIBCUDACXX_INLINE_VISIBILITY
-void call_once(once_flag&, const _Callable&);
-
-#endif // _LIBCUDACXX_CXX03_LANG
-
-struct _LIBCUDACXX_TEMPLATE_VIS once_flag
+template
+struct _LIBCUDACXX_TEMPLATE_VIS __once_flag_base
 {
-    _LIBCUDACXX_INLINE_VISIBILITY
-    _LIBCUDACXX_CONSTEXPR
-    once_flag() _NOEXCEPT : __state_(0) {}
+    constexpr __once_flag_base() noexcept = default;
 
 #if defined(_LIBCUDACXX_ABI_MICROSOFT)
-    typedef uintptr_t _State_type;
+    typedef uintptr_t _State_data_type;
 #else
-    typedef unsigned long _State_type;
+    typedef unsigned long _State_data_type;
 #endif
+#ifndef _LIBCUDACXX_INLINE_THREADING
+    using _State_type = _State_data_type;
+#else
+    using _State_type = atomic<_State_data_type>;
+#endif // _LIBCUDACXX_INLINE_THREADING
+
+    _State_type __state_{0};
 
 private:
-    once_flag(const once_flag&); // = delete;
-    once_flag& operator=(const once_flag&); // = delete;
-
-    _State_type __state_;
-
-#ifndef _LIBCUDACXX_CXX03_LANG
-    template
-    friend
-    void call_once(once_flag&, _Callable&&, _Args&&...);
-#else // _LIBCUDACXX_CXX03_LANG
-    template
-    friend
-    void call_once(once_flag&, _Callable&);
-
-    template
-    friend
-    void call_once(once_flag&, const _Callable&);
-#endif // _LIBCUDACXX_CXX03_LANG
+
+    __once_flag_base(const __once_flag_base&) = delete;
+    __once_flag_base& operator=(const __once_flag_base&) = delete;
 };
 
-#ifndef _LIBCUDACXX_CXX03_LANG
+using once_flag = __once_flag_base<0>;
 
 template
 class __call_once_param
@@ -630,82 +608,52 @@ private:
     }
 };
 
-#else
-
-template
-class __call_once_param
-{
-    _Fp& __f_;
-public:
-    _LIBCUDACXX_INLINE_VISIBILITY
-    explicit __call_once_param(_Fp& __f) : __f_(__f) {}
-
-    _LIBCUDACXX_INLINE_VISIBILITY
-    void operator()()
-    {
-        __f_();
-    }
-};
-
-#endif
-
 template
-void
-__call_once_proxy(void* __vp)
+_LIBCUDACXX_INLINE_VISIBILITY
+void __call_once_proxy(void* __vp)
 {
     __call_once_param<_Fp>* __p = static_cast<__call_once_param<_Fp>*>(__vp);
     (*__p)();
 }
 
-_LIBCUDACXX_FUNC_VIS void __call_once(volatile once_flag::_State_type&, void*,
-                                      void (*)(void*));
-
-#ifndef _LIBCUDACXX_CXX03_LANG
-
-template
-inline _LIBCUDACXX_INLINE_VISIBILITY
-void
-call_once(once_flag& __flag, _Callable&& __func, _Args&&... __args)
-{
-    if (__libcpp_acquire_load(&__flag.__state_) != ~once_flag::_State_type(0))
-    {
-        typedef tuple<_Callable&&, _Args&&...> _Gp;
-        _Gp __f(_CUDA_VSTD::forward<_Callable>(__func), _CUDA_VSTD::forward<_Args>(__args)...);
-        __call_once_param<_Gp> __p(__f);
-        __call_once(__flag.__state_, &__p, &__call_once_proxy<_Gp>);
-    }
-}
-
-#else // _LIBCUDACXX_CXX03_LANG
-
-template
+#ifndef _LIBCUDACXX_INLINE_THREADING
+template
+_LIBCUDACXX_FUNC_VIS
+void __call_once(volatile typename __once_flag_base<_Sco>::_State_type&, void*, void (*)(void*));
+#else
+template
 inline _LIBCUDACXX_INLINE_VISIBILITY
-void
-call_once(once_flag& __flag, _Callable& __func)
-{
-    if (__libcpp_acquire_load(&__flag.__state_) != ~once_flag::_State_type(0))
+void __call_once(volatile typename __once_flag_base<_Sco>::_State_type& __s, void* __p, void (* __f)(void*))
 {
-        __call_once_param<_Callable> __p(__func);
-        __call_once(__flag.__state_, &__p, &__call_once_proxy<_Callable>);
+    typename __once_flag_base<_Sco>::_State_data_type __once_expect = 0;
+    if(__s.compare_exchange_strong(__once_expect, typename __once_flag_base<_Sco>::_State_data_type(1), memory_order_acquire))
+    {
+        __f(__p);
+        __s.store(~typename __once_flag_base<_Sco>::_State_data_type(0), memory_order_release);
+        __s.notify_all();
+    }
+    else if(__once_expect == 1)
+        __s.wait(__once_expect);
 }
-}
+#endif // _LIBCUDACXX_INLINE_THREADING
 
-template
+template
 inline _LIBCUDACXX_INLINE_VISIBILITY
-void
-call_once(once_flag& __flag, const _Callable& __func)
+void call_once(__once_flag_base<_Sco>& __flag, _Callable&& __func, _Args&&... __args)
 {
-    if (__libcpp_acquire_load(&__flag.__state_) != ~once_flag::_State_type(0))
+    if (__libcpp_acquire_load(&__flag.__state_) != ~typename __once_flag_base<_Sco>::_State_data_type(0))
     {
-        __call_once_param __p(__func);
-        __call_once(__flag.__state_, &__p, &__call_once_proxy);
+        typedef tuple<_Callable&&, _Args&&...> _Gp;
+        _Gp __f(_CUDA_VSTD::forward<_Callable>(__func), _CUDA_VSTD::forward<_Args>(__args)...);
+        __call_once_param<_Gp> __p(__f);
+        __call_once<_Sco>(__flag.__state_, &__p, &__call_once_proxy<_Gp>);
    }
 }
 
-#endif // _LIBCUDACXX_CXX03_LANG
-
 _LIBCUDACXX_END_NAMESPACE_STD
 
-_LIBCUDACXX_POP_MACROS
+#ifndef __cuda_std__
+#include <__pragma_pop>
+#endif // __cuda_std__
 
 #endif // _LIBCUDACXX_MUTEX
diff --git a/include/cuda/std/mutex b/include/cuda/std/mutex
new file mode 100644
index 0000000000..c60e649adc
--- /dev/null
+++ b/include/cuda/std/mutex
@@ -0,0 +1,26 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_STD_MUTEX
+#define _CUDA_STD_MUTEX
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700
+#  error "CUDA synchronization primitives are only supported for sm_70 and up."
+#endif
+
+#include "detail/__config"
+
+#include "detail/__pragma_push"
+
+#include "detail/libcxx/include/mutex"
+
+#include "detail/__pragma_pop"
+
+#endif //_CUDA_STD_MUTEX
diff --git a/libcxx/src/mutex.cpp b/libcxx/src/mutex.cpp
index 49352a005a..1744172f4d 100644
--- a/libcxx/src/mutex.cpp
+++ b/libcxx/src/mutex.cpp
@@ -27,6 +27,7 @@ const adopt_lock_t adopt_lock{};
 
 // ~mutex is defined elsewhere
 
+template<>
 void
 mutex::lock()
 {
@@ -35,12 +36,14 @@ mutex::lock()
         __throw_system_error(ec, "mutex lock failed");
 }
 
+template<>
 bool
 mutex::try_lock() _NOEXCEPT
 {
     return __libcpp_mutex_trylock(&__m_);
 }
 
+template<>
 void
 mutex::unlock() _NOEXCEPT
 {
@@ -200,7 +203,8 @@ _LIBCUDACXX_SAFE_STATIC static __libcpp_mutex_t mut = _LIBCUDACXX_MUTEX_INITIALI
 _LIBCUDACXX_SAFE_STATIC static __libcpp_condvar_t cv = _LIBCUDACXX_CONDVAR_INITIALIZER;
 #endif
 
-void __call_once(volatile once_flag::_State_type& flag, void* arg,
+template<>
+void __call_once<0>(volatile typename __once_flag_base<0>::_State_type& flag, void* arg,
                  void (*func)(void*))
 {
 #if defined(_LIBCUDACXX_HAS_NO_THREADS)
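Usage sketch (editor's note, not part of the patch): the headers added above make cuda::std::mutex a header-only __mutex_base<0> backed by __atomic_semaphore_base, and give cuda::std::once_flag / call_once an inline __call_once state machine. The snippet below is a minimal, assumed example of how those primitives could be exercised from a kernel; the names d_mtx, d_once, d_counter and increment are illustrative only, and it presumes an sm_70+ device as required by the #error guard in <cuda/std/mutex>.

#include <cuda/std/mutex>
#include <cstdio>

__device__ cuda::std::mutex d_mtx;       // constexpr-constructible per __mutex_base
__device__ cuda::std::once_flag d_once;  // __once_flag_base<0>, state starts at 0
__device__ int d_counter = 0;

__global__ void increment()
{
    // One thread wins the compare_exchange in the inline __call_once and runs
    // the initializer; the others wait until the state is set to ~0.
    cuda::std::call_once(d_once, [] { d_counter = 100; });

    // unique_lock drives the inline lock()/unlock(), i.e. acquire()/release()
    // on the semaphore that backs __mutex_base.
    cuda::std::unique_lock<cuda::std::mutex> guard(d_mtx);
    ++d_counter;
}

int main()
{
    increment<<<2, 32>>>();
    cudaDeviceSynchronize();
    std::printf("counter updated under the device mutex\n");
    return 0;
}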