This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Decouple Atomic from host on MSVC #43

Merged · 8 commits · Nov 14, 2020
60 changes: 34 additions & 26 deletions include/cuda/std/detail/__atomic
@@ -44,12 +44,11 @@
#define __ATOMIC_THREAD 10
#endif //__ATOMIC_BLOCK


_LIBCUDACXX_BEGIN_NAMESPACE_CUDA

namespace detail {

inline __device__ int __stronger_order_cuda(int __a, int __b) {
inline __host__ __device__ int __stronger_order_cuda(int __a, int __b) {
int const __max = __a > __b ? __a : __b;
if(__max != __ATOMIC_RELEASE)
return __max;
@@ -102,11 +101,22 @@ namespace detail {

_LIBCUDACXX_END_NAMESPACE_CUDA

#if defined(_LIBCUDACXX_COMPILER_MSVC)
// Inject atomic intrinsics built from MSVC compiler intrinsics
#include "libcxx/include/support/win32/atomic_msvc.h"
#endif
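
An aside for context: below is a minimal sketch of the kind of mapping a header like this provides, assuming MSVC's documented _Interlocked intrinsics. It is purely illustrative and not the actual contents of atomic_msvc.h:

// Hypothetical illustration only; not the real atomic_msvc.h.
#include <intrin.h>

// Express a GCC-style atomic exchange in terms of an MSVC compiler intrinsic.
inline long __msvc_exchange_seq_cst(long volatile* __ptr, long __val) {
    // _InterlockedExchange is a full-barrier atomic exchange on MSVC.
    return _InterlockedExchange(__ptr, __val);
}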

#include "__atomic_generated"
#include "__atomic_derived"

_LIBCUDACXX_BEGIN_NAMESPACE_STD

template <typename _Tp>
struct __skip_amt { enum {value = 1}; };

template <typename _Tp>
struct __skip_amt<_Tp*> { enum {value = sizeof(_Tp)}; };

// Forward-declare the function templates that are defined later in libcxx.
template <typename _Tp, typename _Tv> _LIBCUDACXX_INLINE_VISIBILITY
typename enable_if<is_assignable<_Tp&, _Tv>::value>::type
@@ -123,14 +133,14 @@ __host__ __device__ inline void __cxx_atomic_thread_fence(int __order) {
#ifdef __CUDA_ARCH__
detail::__atomic_thread_fence_cuda(__order, detail::__thread_scope_system_tag());
#else
::std::atomic_thread_fence((::std::memory_order)__order);
__atomic_thread_fence(__order);
#endif
}
__host__ __device__ inline void __cxx_atomic_signal_fence(int __order) {
#ifdef __CUDA_ARCH__
detail::__atomic_signal_fence_cuda(__order);
#else
::std::atomic_signal_fence((::std::memory_order)__order);
__atomic_signal_fence(__order);
#endif
}

@@ -190,46 +200,44 @@ struct __cxx_atomic_base_impl_default {
constexpr __cxx_atomic_base_impl_default() noexcept = default;
__host__ __device__ constexpr explicit __cxx_atomic_base_impl_default(_Tp __value) noexcept : __a_value(__value) {
}
#ifdef __CUDA_ARCH__
__cxx_atomic_alignment_wrapper_t<_Tp> __a_value;
#else
::std::atomic<__cxx_atomic_alignment_wrapper_t<_Tp>> __a_value;
#endif
};

template<class _Tp, int _Sco>
__host__ __device__ inline void __cxx_atomic_init(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp __val) {
#if defined(__CUDA_ARCH__)
auto __tmp = __cxx_atomic_alignment_wrap(__val);
__cxx_atomic_assign_volatile(__a->__a_value, __tmp);
#elif defined(_LIBCUDACXX_CUDA_HAS_NO_HOST_STD_ATOMIC_INIT)
__a->__a_value.store(__val, ::std::memory_order_relaxed);
#else
::std::atomic_init(&__a->__a_value, __cxx_atomic_alignment_wrap(__val));
#endif
}
template<class _Tp, int _Sco>
__host__ __device__ inline void __cxx_atomic_store(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp __val, int __order) {
#ifdef __CUDA_ARCH__
detail::__atomic_store_n_cuda(&__a->__a_value, __cxx_atomic_alignment_wrap(__val), __order, detail::__scope_tag<_Sco>());
#else
::std::atomic_store_explicit(&__a->__a_value, __cxx_atomic_alignment_wrap(__val), (::std::memory_order)__order);
auto __t = __cxx_atomic_alignment_wrap(__val);
__atomic_store(&__a->__a_value, &__t, __order);
#endif
}
template<class _Tp, int _Sco>
__host__ __device__ inline _Tp __cxx_atomic_load(__cxx_atomic_base_impl_default<_Tp, _Sco> const volatile* __a, int __order) {
#ifdef __CUDA_ARCH__
return __cxx_atomic_alignment_unwrap(detail::__atomic_load_n_cuda(&__a->__a_value, __order, detail::__scope_tag<_Sco>()));
#else
return __cxx_atomic_alignment_unwrap(::std::atomic_load_explicit(&__a->__a_value, (::std::memory_order)__order));
alignas(_Tp) unsigned char __buf[sizeof(_Tp)];
[Review thread on the staging buffer above]

Member Author: @griwes it looks like I tore off another alignment unwrap here.
Are these char* output buffers a code smell? This was done to fix initialization warnings.

Contributor: Why not use __atomic_load_n here instead?

Member Author: __atomic_load_n does not work for non-integral non-pointer types: https://gcc.godbolt.org/z/osrcs1
Maybe I'm mistaken in some way about its usage?

auto* __dest = reinterpret_cast<_Tp*>(__buf);
__atomic_load(&__a->__a_value, __dest, __order);
return __cxx_atomic_alignment_unwrap(*__dest);
#endif
}
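
As the thread above notes, GCC's __atomic_load_n is limited to integral and pointer types, which is why the host path stages the result in a suitably aligned char buffer and uses the generic __atomic_load. A minimal stand-alone sketch of the same pattern (the Widget type is hypothetical, for illustration only):

// Sketch only, assuming the GCC/Clang __atomic builtins.
struct Widget { short a; short b; };   // neither an integral nor a pointer type

Widget load_relaxed(Widget* __src) {
    // __atomic_load_n(__src, __ATOMIC_RELAXED) would be rejected here,
    // because the _n builtins accept only integral and pointer types.
    alignas(Widget) unsigned char __buf[sizeof(Widget)];
    auto* __dst = reinterpret_cast<Widget*>(__buf);
    // The generic form writes the loaded value through an output pointer.
    __atomic_load(__src, __dst, __ATOMIC_RELAXED);
    return *__dst;
}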
template<class _Tp, int _Sco>
__host__ __device__ inline _Tp __cxx_atomic_exchange(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp __val, int __order) {
#ifdef __CUDA_ARCH__
return __cxx_atomic_alignment_unwrap(detail::__atomic_exchange_n_cuda(&__a->__a_value, __cxx_atomic_alignment_wrap(__val), __order, detail::__scope_tag<_Sco>()));
#else
return __cxx_atomic_alignment_unwrap(::std::atomic_exchange_explicit(&__a->__a_value, __cxx_atomic_alignment_wrap(__val), (::std::memory_order)__order));
alignas(_Tp) unsigned char __buf[sizeof(_Tp)];
auto* __dest = reinterpret_cast<_Tp*>(__buf);
auto __t = __cxx_atomic_alignment_wrap(__val);
__atomic_exchange(&__a->__a_value, &__t, __dest, __order);
return __cxx_atomic_alignment_unwrap(*__dest);
#endif
}
template<class _Tp, int _Sco>
@@ -238,7 +246,7 @@ __host__ __device__ inline bool __cxx_atomic_compare_exchange_strong(__cxx_atomi
#ifdef __CUDA_ARCH__
bool __result = detail::__atomic_compare_exchange_n_cuda(&__a->__a_value, &__tmp, __cxx_atomic_alignment_wrap(__val), false, __success, __failure, detail::__scope_tag<_Sco>());
#else
bool __result = ::std::atomic_compare_exchange_strong_explicit(&__a->__a_value, &__tmp, __cxx_atomic_alignment_wrap(__val), (::std::memory_order)__success, (::std::memory_order)__failure);
bool __result = __atomic_compare_exchange(&__a->__a_value, &__tmp, &__val, false, __success, __failure);
#endif
*__expected = __cxx_atomic_alignment_unwrap(__tmp);
return __result;
@@ -249,7 +257,7 @@ __host__ __device__ inline bool __cxx_atomic_compare_exchange_weak(__cxx_atomic_
#ifdef __CUDA_ARCH__
bool __result = detail::__atomic_compare_exchange_n_cuda(&__a->__a_value, &__tmp, __cxx_atomic_alignment_wrap(__val), true, __success, __failure, detail::__scope_tag<_Sco>());
#else
bool __result = ::std::atomic_compare_exchange_strong_explicit(&__a->__a_value, &__tmp, __cxx_atomic_alignment_wrap(__val), (::std::memory_order)__success, (::std::memory_order)__failure);
bool __result = __atomic_compare_exchange(&__a->__a_value, &__tmp, &__val, true, __success, __failure);
#endif
*__expected = __cxx_atomic_alignment_unwrap(__tmp);
return __result;
@@ -259,55 +267,55 @@ __host__ __device__ inline _Tp __cxx_atomic_fetch_add(__cxx_atomic_base_impl_def
#ifdef __CUDA_ARCH__
return detail::__atomic_fetch_add_cuda(&__a->__a_value, __delta, __order, detail::__scope_tag<_Sco>());
#else
return ::std::atomic_fetch_add_explicit(&__a->__a_value, __delta, (::std::memory_order)__order);
return __atomic_fetch_add(&__a->__a_value, __delta, __order);
#endif
}
template<class _Tp, int _Sco>
__host__ __device__ inline _Tp* __cxx_atomic_fetch_add(__cxx_atomic_base_impl_default<_Tp*, _Sco> volatile* __a, ptrdiff_t __delta, int __order) {
#ifdef __CUDA_ARCH__
return detail::__atomic_fetch_add_cuda(&__a->__a_value, __delta, __order, detail::__scope_tag<_Sco>());
#else
return ::std::atomic_fetch_add_explicit(&__a->__a_value, __delta, (::std::memory_order)__order);
return __atomic_fetch_add(&__a->__a_value, __delta * __skip_amt<_Tp*>::value, __order);
[Review thread on the pointer fetch_add above]

Contributor: It doesn't seem right to me that we should need the skip amount in this layer. The layer below should be doing that.

Contributor: Basically, every deviation between the CUDA_ARCH side and this side looks like a bug to me.

Member Author:
> It doesn't seem right to me that we should need the skip amount in this layer. The layer below should be doing that.

I'd agree, but there is no layer at the compiler intrinsic level for GCC. At that point incrementing by sizeof(_Tp) is necessary. https://github.com/NVIDIA/libcudacxx/blob/main/libcxx/include/atomic#L846

> Basically, every deviation between the CUDA_ARCH side and this side looks like a bug to me.

@griwes, @jrhemstad, and I had a meeting today about how we could resolve some of this with a better platform layering framework. There are some neat ideas on the table for making this nesting-doll thing a bit cleaner.

It would be relevant to know what things are being done wrong ahead of time.

#endif
}
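
A small stand-alone sketch of why the scaling is needed: GCC's raw __atomic_fetch_add builtin treats a pointer operand as its raw address, so the delta is counted in bytes rather than elements, which is what __skip_amt compensates for. The function below is hypothetical, for illustration only:

// Sketch only, assuming the GCC/Clang __atomic builtins.
inline void advance_one_element(int** __pp) {
    // std::atomic<int*>::fetch_add(1) advances the stored pointer by one
    // element; the raw builtin adds raw bytes, so the element size must be
    // applied by hand.
    __atomic_fetch_add(__pp, 1 * sizeof(int), __ATOMIC_RELAXED);
}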
template<class _Tp, int _Sco>
__host__ __device__ inline _Tp __cxx_atomic_fetch_sub(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp __delta, int __order) {
#ifdef __CUDA_ARCH__
return detail::__atomic_fetch_sub_cuda(&__a->__a_value, __delta, __order, detail::__scope_tag<_Sco>());
#else
return ::std::atomic_fetch_sub_explicit(&__a->__a_value, __delta, (::std::memory_order)__order);
return __atomic_fetch_sub(&__a->__a_value, __delta, __order);
#endif
}
template<class _Tp, int _Sco>
__host__ __device__ inline _Tp* __cxx_atomic_fetch_sub(__cxx_atomic_base_impl_default<_Tp*, _Sco> volatile* __a, ptrdiff_t __delta, int __order) {
#ifdef __CUDA_ARCH__
return detail::__atomic_fetch_sub_cuda(&__a->__a_value, __delta, __order, detail::__scope_tag<_Sco>());
#else
return ::std::atomic_fetch_sub_explicit(&__a->__a_value, __delta, (::std::memory_order)__order);
return __atomic_fetch_sub(&__a->__a_value, __delta * __skip_amt<_Tp*>::value, __order);
#endif
}
template<class _Tp, int _Sco>
__host__ __device__ inline _Tp __cxx_atomic_fetch_and(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp __pattern, int __order) {
#ifdef __CUDA_ARCH__
return detail::__atomic_fetch_and_cuda(&__a->__a_value, __pattern, __order, detail::__scope_tag<_Sco>());
#else
return ::std::atomic_fetch_and_explicit(&__a->__a_value, __pattern, (::std::memory_order)__order);
return __atomic_fetch_and(&__a->__a_value, __pattern, __order);
#endif
}
template<class _Tp, int _Sco>
__host__ __device__ inline _Tp __cxx_atomic_fetch_or(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp __pattern, int __order) {
#ifdef __CUDA_ARCH__
return detail::__atomic_fetch_or_cuda(&__a->__a_value, __pattern, __order, detail::__scope_tag<_Sco>());
#else
return ::std::atomic_fetch_or_explicit(&__a->__a_value, __pattern, (::std::memory_order)__order);
return __atomic_fetch_or(&__a->__a_value, __pattern, __order);
#endif
}
template<class _Tp, int _Sco>
__host__ __device__ inline _Tp __cxx_atomic_fetch_xor(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp __pattern, int __order) {
#ifdef __CUDA_ARCH__
return detail::__atomic_fetch_xor_cuda(&__a->__a_value, __pattern, __order, detail::__scope_tag<_Sco>());
#else
return ::std::atomic_fetch_xor_explicit(&__a->__a_value, __pattern, (::std::memory_order)__order);
return __atomic_fetch_xor(&__a->__a_value, __pattern, __order);
#endif
}

28 changes: 14 additions & 14 deletions include/cuda/std/detail/__atomic_derived
@@ -43,23 +43,23 @@ template<class _Type, class _Delta, class _Scope, typename _CUDA_VSTD::enable_if
_Type __device__ __atomic_fetch_add_cuda(_Type volatile *__ptr, _Delta __val, int __memorder, _Scope __s) {

_Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s);
_Type const __desired = __expected + __val;
_Type __desired = __expected + __val;
while(!__atomic_compare_exchange_cuda(__ptr, &__expected, &__desired, true, __memorder, __memorder, __s))
;
__desired = __expected + __val;
return __expected;
}
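
The shape of the fix in this hunk is easier to see in isolation: after a failed compare-exchange, __expected is refreshed with the currently stored value, so __desired has to be recomputed before retrying, which the previous const qualifier made impossible. A minimal sketch with the GCC builtins, assuming an integral _Type:

// Sketch only; mirrors the emulated fetch_add pattern above.
template <class _Type>
_Type __emulated_fetch_add(_Type* __ptr, _Type __val) {
    _Type __expected = __atomic_load_n(__ptr, __ATOMIC_RELAXED);
    _Type __desired  = __expected + __val;
    while (!__atomic_compare_exchange_n(__ptr, &__expected, __desired,
                                        /*weak=*/true,
                                        __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
        __desired = __expected + __val;  // __expected was updated by the failed CAS
    return __expected;                   // the old value, as fetch_add returns
}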

template<class _Type, class _Delta, class _Scope, typename _CUDA_VSTD::enable_if<sizeof(_Type)<=2, int>::type = 0>
_Type __host__ __device__ __atomic_fetch_max_cuda(_Type volatile *__ptr, _Delta __val, int __memorder, _Scope __s) {

_Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s);
_Type const __desired = __expected > __val ? __expected : __val;
_Type __desired = __expected > __val ? __expected : __val;
#ifdef __CUDA_ARCH__
while(__desired == __val &&
!__atomic_compare_exchange_cuda(__ptr, &__expected, &__desired, true, __memorder, __memorder, __s)) {
#else
while(__desired == __val &&
!::std::atomic_compare_exchange_strong_explicit(__ptr, &__expected, __desired, __memorder, __memorder)) {
!__atomic_compare_exchange(__ptr, &__expected, &__desired, true, __memorder, __memorder)) {
#endif
__desired = __expected > __val ? __expected : __val;
}
@@ -70,13 +70,13 @@ template<class _Type, class _Delta, class _Scope, typename _CUDA_VSTD::enable_if
_Type __host__ __device__ __atomic_fetch_min_cuda(_Type volatile *__ptr, _Delta __val, int __memorder, _Scope __s) {

_Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s);
_Type const __desired = __expected < __val ? __expected : __val;
_Type __desired = __expected < __val ? __expected : __val;
#ifdef __CUDA_ARCH__
while(__desired != __val &&
!__atomic_compare_exchange_cuda(__ptr, &__expected, &__desired, true, __memorder, __memorder, __s)) {
#else
while(__desired != __val &&
!::std::atomic_compare_exchange_strong_explicit(__ptr, &__expected, __desired, __memorder, __memorder)) {
!__atomic_compare_exchange(__ptr, &__expected, &__desired, true, __memorder, __memorder)) {
#endif
__desired = __expected < __val ? __expected : __val;
}
@@ -87,39 +87,39 @@ template<class _Type, class _Delta, class _Scope, typename _CUDA_VSTD::enable_if
_Type __device__ __atomic_fetch_sub_cuda(_Type volatile *__ptr, _Delta __val, int __memorder, _Scope __s) {

_Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s);
_Type const __desired = __expected - __val;
_Type __desired = __expected - __val;
while(!__atomic_compare_exchange_cuda(__ptr, &__expected, &__desired, true, __memorder, __memorder, __s))
;
__desired = __expected - __val;
return __expected;
}

template<class _Type, class _Delta, class _Scope, typename _CUDA_VSTD::enable_if<sizeof(_Type)<=2, int>::type = 0>
_Type __device__ __atomic_fetch_and_cuda(_Type volatile *__ptr, _Delta __val, int __memorder, _Scope __s) {

_Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s);
_Type const __desired = __expected & __val;
_Type __desired = __expected & __val;
while(!__atomic_compare_exchange_cuda(__ptr, &__expected, &__desired, true, __memorder, __memorder, __s))
;
__desired = __expected & __val;
return __expected;
}

template<class _Type, class _Delta, class _Scope, typename _CUDA_VSTD::enable_if<sizeof(_Type)<=2, int>::type = 0>
_Type __device__ __atomic_fetch_xor_cuda(_Type volatile *__ptr, _Delta __val, int __memorder, _Scope __s) {

_Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s);
_Type const __desired = __expected ^ __val;
_Type __desired = __expected ^ __val;
while(!__atomic_compare_exchange_cuda(__ptr, &__expected, &__desired, true, __memorder, __memorder, __s))
;
__desired = __expected ^ __val;
return __expected;
}

template<class _Type, class _Delta, class _Scope, typename _CUDA_VSTD::enable_if<sizeof(_Type)<=2, int>::type = 0>
_Type __device__ __atomic_fetch_or_cuda(_Type volatile *__ptr, _Delta __val, int __memorder, _Scope __s) {

_Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s);
_Type const __desired = __expected | __val;
_Type __desired = __expected | __val;
while(!__atomic_compare_exchange_cuda(__ptr, &__expected, &__desired, true, __memorder, __memorder, __s))
;
__desired = __expected | __val;
return __expected;
}

5 changes: 5 additions & 0 deletions libcxx/include/atomic
@@ -555,6 +555,11 @@ void atomic_signal_fence(memory_order m) noexcept;
#include <type_traits>
#include <version>
#include <__pragma_push>

#if defined(_LIBCUDACXX_COMPILER_MSVC)
#include "support/win32/atomic_msvc.h"
#endif

#endif //__cuda_std__

#if defined(_LIBCUDACXX_USE_PRAGMA_GCC_SYSTEM_HEADER)