Decouple Atomic from host on MSVC #43
Changes from all commits: 6488a1a, 23d6f5f, 45c28ea, 0c7add5, 4b7dca9, 7b2a928, 2610560, 7d802a6
@@ -44,12 +44,11 @@
 #define __ATOMIC_THREAD 10
 #endif //__ATOMIC_BLOCK

 _LIBCUDACXX_BEGIN_NAMESPACE_CUDA

 namespace detail {

-inline __device__ int __stronger_order_cuda(int __a, int __b) {
+inline __host__ __device__ int __stronger_order_cuda(int __a, int __b) {
     int const __max = __a > __b ? __a : __b;
     if(__max != __ATOMIC_RELEASE)
         return __max;
@@ -102,11 +101,22 @@ namespace detail {

 _LIBCUDACXX_END_NAMESPACE_CUDA

+#if defined(_LIBCUDACXX_COMPILER_MSVC)
+// Inject atomic intrinsics built from MSVC compiler intrinsics
+#include "libcxx/include/support/win32/atomic_msvc.h"
+#endif
+
 #include "__atomic_generated"
 #include "__atomic_derived"

 _LIBCUDACXX_BEGIN_NAMESPACE_STD

+template <typename _Tp>
+struct __skip_amt { enum {value = 1}; };
+
+template <typename _Tp>
+struct __skip_amt<_Tp*> { enum {value = sizeof(_Tp)}; };
+
 // Forward-declare the function templates that are defined in libcxx later.
 template <typename _Tp, typename _Tv> _LIBCUDACXX_INLINE_VISIBILITY
 typename enable_if<is_assignable<_Tp&, _Tv>::value>::type
@@ -123,14 +133,14 @@ __host__ __device__ inline void __cxx_atomic_thread_fence(int __order) {
 #ifdef __CUDA_ARCH__
     detail::__atomic_thread_fence_cuda(__order, detail::__thread_scope_system_tag());
 #else
-    ::std::atomic_thread_fence((::std::memory_order)__order);
+    __atomic_thread_fence(__order);
 #endif
 }
 __host__ __device__ inline void __cxx_atomic_signal_fence(int __order) {
 #ifdef __CUDA_ARCH__
     detail::__atomic_signal_fence_cuda(__order);
 #else
-    ::std::atomic_signal_fence((::std::memory_order)__order);
+    __atomic_signal_fence(__order);
 #endif
 }
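A note on the host path: __atomic_thread_fence and __atomic_signal_fence are the GCC/Clang-style builtins, which accept the raw __ATOMIC_* constants directly; this is why the (::std::memory_order) casts disappear. A hedged usage sketch (assuming a GCC/Clang host compiler; publish, data, and flag are illustrative names):

void publish(int* data, int* flag) {
    *data = 42;                               // ordinary store
    __atomic_thread_fence(__ATOMIC_RELEASE);  // builtin takes a plain int order
    __atomic_store_n(flag, 1, __ATOMIC_RELAXED);
}

On MSVC the same spellings are expected to resolve to the intrinsics injected by atomic_msvc.h above.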
@@ -190,46 +200,44 @@ struct __cxx_atomic_base_impl_default {
     constexpr __cxx_atomic_base_impl_default() noexcept = default;
     __host__ __device__ constexpr explicit __cxx_atomic_base_impl_default(_Tp __value) noexcept : __a_value(__value) {
     }
-#ifdef __CUDA_ARCH__
     __cxx_atomic_alignment_wrapper_t<_Tp> __a_value;
-#else
-    ::std::atomic<__cxx_atomic_alignment_wrapper_t<_Tp>> __a_value;
-#endif
 };

 template<class _Tp, int _Sco>
 __host__ __device__ inline void __cxx_atomic_init(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp __val) {
-#if defined(__CUDA_ARCH__)
     auto __tmp = __cxx_atomic_alignment_wrap(__val);
     __cxx_atomic_assign_volatile(__a->__a_value, __tmp);
-#elif defined(_LIBCUDACXX_CUDA_HAS_NO_HOST_STD_ATOMIC_INIT)
-    __a->__a_value.store(__val, ::std::memory_order_relaxed);
-#else
-    ::std::atomic_init(&__a->__a_value, __cxx_atomic_alignment_wrap(__val));
-#endif
 }
 template<class _Tp, int _Sco>
 __host__ __device__ inline void __cxx_atomic_store(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp __val, int __order) {
 #ifdef __CUDA_ARCH__
     detail::__atomic_store_n_cuda(&__a->__a_value, __cxx_atomic_alignment_wrap(__val), __order, detail::__scope_tag<_Sco>());
 #else
-    ::std::atomic_store_explicit(&__a->__a_value, __cxx_atomic_alignment_wrap(__val), (::std::memory_order)__order);
+    auto __t = __cxx_atomic_alignment_wrap(__val);
+    __atomic_store(&__a->__a_value, &__t, __order);
 #endif
 }
 template<class _Tp, int _Sco>
 __host__ __device__ inline _Tp __cxx_atomic_load(__cxx_atomic_base_impl_default<_Tp, _Sco> const volatile* __a, int __order) {
 #ifdef __CUDA_ARCH__
     return __cxx_atomic_alignment_unwrap(detail::__atomic_load_n_cuda(&__a->__a_value, __order, detail::__scope_tag<_Sco>()));
 #else
-    return __cxx_atomic_alignment_unwrap(::std::atomic_load_explicit(&__a->__a_value, (::std::memory_order)__order));
+    alignas(_Tp) unsigned char __buf[sizeof(_Tp)];
+    auto* __dest = reinterpret_cast<_Tp*>(__buf);
+    __atomic_load(&__a->__a_value, __dest, __order);
+    return __cxx_atomic_alignment_unwrap(*__dest);
 #endif
 }
 template<class _Tp, int _Sco>
 __host__ __device__ inline _Tp __cxx_atomic_exchange(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp __val, int __order) {
 #ifdef __CUDA_ARCH__
     return __cxx_atomic_alignment_unwrap(detail::__atomic_exchange_n_cuda(&__a->__a_value, __cxx_atomic_alignment_wrap(__val), __order, detail::__scope_tag<_Sco>()));
 #else
-    return __cxx_atomic_alignment_unwrap(::std::atomic_exchange_explicit(&__a->__a_value, __cxx_atomic_alignment_wrap(__val), (::std::memory_order)__order));
+    alignas(_Tp) unsigned char __buf[sizeof(_Tp)];
+    auto* __dest = reinterpret_cast<_Tp*>(__buf);
+    auto __t = __cxx_atomic_alignment_wrap(__val);
+    __atomic_exchange(&__a->__a_value, &__t, __dest, __order);
+    return __cxx_atomic_alignment_unwrap(*__dest);
 #endif
 }
 template<class _Tp, int _Sco>
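The rewritten host paths use the generic, pointer-based forms of the builtins (__atomic_store, __atomic_load, __atomic_exchange) rather than the _n forms, so they also work for non-integral _Tp. A standalone sketch of the same output-buffer pattern (Payload and exchange_payload are hypothetical names; assuming a GCC/Clang host compiler):

struct Payload { int lo; int hi; };  // hypothetical non-integral type

Payload exchange_payload(Payload* obj, Payload desired) {
    // A raw byte buffer avoids default-constructing Payload, which mirrors
    // how the diff sidesteps initialization warnings.
    alignas(Payload) unsigned char buf[sizeof(Payload)];
    auto* result = reinterpret_cast<Payload*>(buf);
    __atomic_exchange(obj, &desired, result, __ATOMIC_SEQ_CST);
    return *result;
}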
@@ -238,7 +246,7 @@ __host__ __device__ inline bool __cxx_atomic_compare_exchange_strong(__cxx_atomi
 #ifdef __CUDA_ARCH__
     bool __result = detail::__atomic_compare_exchange_n_cuda(&__a->__a_value, &__tmp, __cxx_atomic_alignment_wrap(__val), false, __success, __failure, detail::__scope_tag<_Sco>());
 #else
-    bool __result = ::std::atomic_compare_exchange_strong_explicit(&__a->__a_value, &__tmp, __cxx_atomic_alignment_wrap(__val), (::std::memory_order)__success, (::std::memory_order)__failure);
+    bool __result = __atomic_compare_exchange(&__a->__a_value, &__tmp, &__val, false, __success, __failure);
 #endif
     *__expected = __cxx_atomic_alignment_unwrap(__tmp);
     return __result;
@@ -249,7 +257,7 @@ __host__ __device__ inline bool __cxx_atomic_compare_exchange_weak(__cxx_atomic_
 #ifdef __CUDA_ARCH__
     bool __result = detail::__atomic_compare_exchange_n_cuda(&__a->__a_value, &__tmp, __cxx_atomic_alignment_wrap(__val), true, __success, __failure, detail::__scope_tag<_Sco>());
 #else
-    bool __result = ::std::atomic_compare_exchange_strong_explicit(&__a->__a_value, &__tmp, __cxx_atomic_alignment_wrap(__val), (::std::memory_order)__success, (::std::memory_order)__failure);
+    bool __result = __atomic_compare_exchange(&__a->__a_value, &__tmp, &__val, true, __success, __failure);
 #endif
     *__expected = __cxx_atomic_alignment_unwrap(__tmp);
     return __result;
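For reference, the builtin now used on the host path has the shape __atomic_compare_exchange(ptr, expected, desired, weak, success_order, failure_order), passing desired by pointer and the orders as plain ints. A hedged sketch of a strong CAS increment (try_increment is an illustrative name; assuming a GCC/Clang host compiler):

bool try_increment(int* obj) {
    int expected = __atomic_load_n(obj, __ATOMIC_RELAXED);
    int desired  = expected + 1;
    // false selects the strong variant, matching the 'false' literal above.
    return __atomic_compare_exchange(obj, &expected, &desired,
                                     false, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE);
}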
@@ -259,55 +267,55 @@ __host__ __device__ inline _Tp __cxx_atomic_fetch_add(__cxx_atomic_base_impl_def
 #ifdef __CUDA_ARCH__
     return detail::__atomic_fetch_add_cuda(&__a->__a_value, __delta, __order, detail::__scope_tag<_Sco>());
 #else
-    return ::std::atomic_fetch_add_explicit(&__a->__a_value, __delta, (::std::memory_order)__order);
+    return __atomic_fetch_add(&__a->__a_value, __delta, __order);
 #endif
 }
 template<class _Tp, int _Sco>
 __host__ __device__ inline _Tp* __cxx_atomic_fetch_add(__cxx_atomic_base_impl_default<_Tp*, _Sco> volatile* __a, ptrdiff_t __delta, int __order) {
 #ifdef __CUDA_ARCH__
     return detail::__atomic_fetch_add_cuda(&__a->__a_value, __delta, __order, detail::__scope_tag<_Sco>());
 #else
-    return ::std::atomic_fetch_add_explicit(&__a->__a_value, __delta, (::std::memory_order)__order);
+    return __atomic_fetch_add(&__a->__a_value, __delta * __skip_amt<_Tp*>::value, __order);
Review thread on this line:

Reviewer: It doesn't seem right to me that we should need the skip amount in this layer. The layer below should be doing that.

Reviewer: Basically, every deviation between the __CUDA_ARCH__ side and this side looks like a bug to me.

Author: I'd agree, but there is no layer at the compiler-intrinsic level for GCC; at that point, incrementing by sizeof(_Tp) is necessary. See https://github.com/NVIDIA/libcudacxx/blob/main/libcxx/include/atomic#L846. @griwes, @jrhemstad, and I had a meeting today about how we could resolve some of this with a better platform-layering framework. There are some neat ideas on the table for making this nesting-doll arrangement a bit cleaner, and it would be helpful to know ahead of time which things are being done wrong.
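To make the constraint concrete: ::std::atomic_fetch_add_explicit on a _Tp* advances by whole elements, but GCC/Clang's __atomic_fetch_add takes the delta for a pointer object in bytes, which is the gap __skip_amt fills. A hedged sketch (assuming a GCC/Clang host compiler; arr, cursor, and advance_one_element are illustrative names):

int arr[4] = {10, 20, 30, 40};
int* cursor = arr;

int* advance_one_element() {
    // The builtin's delta is a byte count for pointer objects: without the
    // sizeof scaling, cursor would move by one byte and end up misaligned.
    return __atomic_fetch_add(&cursor, 1 * sizeof(int), __ATOMIC_SEQ_CST);
}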
 #endif
 }
 template<class _Tp, int _Sco>
 __host__ __device__ inline _Tp __cxx_atomic_fetch_sub(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp __delta, int __order) {
 #ifdef __CUDA_ARCH__
     return detail::__atomic_fetch_sub_cuda(&__a->__a_value, __delta, __order, detail::__scope_tag<_Sco>());
 #else
-    return ::std::atomic_fetch_sub_explicit(&__a->__a_value, __delta, (::std::memory_order)__order);
+    return __atomic_fetch_sub(&__a->__a_value, __delta, __order);
 #endif
 }
 template<class _Tp, int _Sco>
 __host__ __device__ inline _Tp* __cxx_atomic_fetch_sub(__cxx_atomic_base_impl_default<_Tp*, _Sco> volatile* __a, ptrdiff_t __delta, int __order) {
 #ifdef __CUDA_ARCH__
     return detail::__atomic_fetch_sub_cuda(&__a->__a_value, __delta, __order, detail::__scope_tag<_Sco>());
 #else
-    return ::std::atomic_fetch_sub_explicit(&__a->__a_value, __delta, (::std::memory_order)__order);
+    return __atomic_fetch_sub(&__a->__a_value, __delta * __skip_amt<_Tp*>::value, __order);
 #endif
 }
 template<class _Tp, int _Sco>
 __host__ __device__ inline _Tp __cxx_atomic_fetch_and(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp __pattern, int __order) {
 #ifdef __CUDA_ARCH__
     return detail::__atomic_fetch_and_cuda(&__a->__a_value, __pattern, __order, detail::__scope_tag<_Sco>());
 #else
-    return ::std::atomic_fetch_and_explicit(&__a->__a_value, __pattern, (::std::memory_order)__order);
+    return __atomic_fetch_and(&__a->__a_value, __pattern, __order);
 #endif
 }
 template<class _Tp, int _Sco>
 __host__ __device__ inline _Tp __cxx_atomic_fetch_or(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp __pattern, int __order) {
 #ifdef __CUDA_ARCH__
     return detail::__atomic_fetch_or_cuda(&__a->__a_value, __pattern, __order, detail::__scope_tag<_Sco>());
 #else
-    return ::std::atomic_fetch_or_explicit(&__a->__a_value, __pattern, (::std::memory_order)__order);
+    return __atomic_fetch_or(&__a->__a_value, __pattern, __order);
 #endif
 }
 template<class _Tp, int _Sco>
 __host__ __device__ inline _Tp __cxx_atomic_fetch_xor(__cxx_atomic_base_impl_default<_Tp, _Sco> volatile* __a, _Tp __pattern, int __order) {
 #ifdef __CUDA_ARCH__
     return detail::__atomic_fetch_xor_cuda(&__a->__a_value, __pattern, __order, detail::__scope_tag<_Sco>());
 #else
-    return ::std::atomic_fetch_xor_explicit(&__a->__a_value, __pattern, (::std::memory_order)__order);
+    return __atomic_fetch_xor(&__a->__a_value, __pattern, __order);
 #endif
 }
Review thread on the host-side load and exchange paths:

Author: @griwes, it looks like I tore off another alignment unwrap here. Are these char* output buffers a code smell? This was done to fix initialization warnings.

Reviewer: Why not use __atomic_load_n here instead?

Author: __atomic_load_n does not work for non-integral, non-pointer types: https://gcc.godbolt.org/z/osrcs1. Maybe I'm mistaken in some way about its usage?
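To make the distinction in this thread concrete: __atomic_load_n returns the value directly but is only defined for integral and pointer types, while the generic __atomic_load writes through an output pointer and works for any trivially copyable type. A hedged sketch (Pair, load_long, and load_pair are illustrative names; assuming a GCC/Clang host compiler):

struct Pair { int a; int b; };  // non-integral, non-pointer payload

long load_long(long* src) {
    return __atomic_load_n(src, __ATOMIC_ACQUIRE);  // fine for integral types
}

Pair load_pair(Pair* src) {
    // __atomic_load_n(src, ...) would fail to compile here; the generic
    // form with an output pointer is the portable spelling for structs.
    alignas(Pair) unsigned char buf[sizeof(Pair)];
    auto* out = reinterpret_cast<Pair*>(buf);
    __atomic_load(src, out, __ATOMIC_ACQUIRE);
    return *out;
}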