Skip to content

Commit

Permalink
Allow CUDA 12.2 to keep perf, this addresses earlier comments in #478
Browse files Browse the repository at this point in the history
  • Loading branch information
wmaxey committed Sep 10, 2024
1 parent eff2826 commit 0fe24d3
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 37 deletions.
110 changes: 73 additions & 37 deletions libcudacxx/include/cuda/std/__atomic/functions/cuda_local.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD

_CCCL_DEVICE inline bool __cuda_is_local(const void* __ptr)
{
#if defined(_LIBCUDACXX_CUDACC_BELOW_12_3)
#if defined(_CCCL_CUDACC_BELOW_12_3)
int __tmp = 0;
asm("{\n\t"
" .reg .pred p;\n\t"
Expand All @@ -49,9 +49,46 @@ _CCCL_DEVICE inline bool __cuda_is_local(const void* __ptr)
#endif
}

// Plain (non-atomic) bitwise AND: overwrites __atom with (__atom & __v).
// __atom is read exactly once and written exactly once.
template <class _Type>
_CCCL_DEVICE void __cuda_fetch_local_bop_and(volatile _Type& __atom, _Type const& __v)
{
  const _Type __combined = __atom & __v;
  __atom                 = __combined;
}
// Plain (non-atomic) bitwise OR: overwrites __atom with (__atom | __v).
// __atom is read exactly once and written exactly once.
template <class _Type>
_CCCL_DEVICE void __cuda_fetch_local_bop_or(volatile _Type& __atom, _Type const& __v)
{
  const _Type __combined = __atom | __v;
  __atom                 = __combined;
}
// Plain (non-atomic) bitwise XOR: overwrites __atom with (__atom ^ __v).
// __atom is read exactly once and written exactly once.
template <class _Type>
_CCCL_DEVICE void __cuda_fetch_local_bop_xor(volatile _Type& __atom, _Type const& __v)
{
  const _Type __combined = __atom ^ __v;
  __atom                 = __combined;
}
// Plain (non-atomic) addition: overwrites __atom with (__atom + __v).
// __atom is read exactly once and written exactly once.
template <class _Type>
_CCCL_DEVICE void __cuda_fetch_local_bop_add(volatile _Type& __atom, _Type const& __v)
{
  const _Type __sum = __atom + __v;
  __atom            = __sum;
}
// Plain (non-atomic) subtraction: overwrites __atom with (__atom - __v).
// __atom is read exactly once and written exactly once.
template <class _Type>
_CCCL_DEVICE void __cuda_fetch_local_bop_sub(volatile _Type& __atom, _Type const& __v)
{
  const _Type __difference = __atom - __v;
  __atom                   = __difference;
}
// Plain (non-atomic) maximum: stores __v into __atom when __atom < __v,
// otherwise writes the current value of __atom back to itself.
template <class _Type>
_CCCL_DEVICE void __cuda_fetch_local_bop_max(volatile _Type& __atom, _Type const& __v)
{
  if (__atom < __v)
  {
    __atom = __v;
    return;
  }
  // Not smaller than __v: re-read and store the value back, mirroring the
  // read/write sequence of the equivalent conditional expression.
  const _Type __unchanged = __atom;
  __atom                  = __unchanged;
}
// Plain (non-atomic) minimum: stores __v into __atom when __v < __atom,
// otherwise writes the current value of __atom back to itself.
template <class _Type>
_CCCL_DEVICE void __cuda_fetch_local_bop_min(volatile _Type& __atom, _Type const& __v)
{
  if (__v < __atom)
  {
    __atom = __v;
    return;
  }
  // Not larger than __v: re-read and store the value back, mirroring the
  // read/write sequence of the equivalent conditional expression.
  const _Type __unchanged = __atom;
  __atom                  = __unchanged;
}

template <class _Type>
_CCCL_DEVICE bool __cuda_load_weak_if_local(const volatile _Type* __ptr, _Type* __ret)
{
#if defined(_LIBCUDACXX_ATOMIC_UNSAFE_AUTOMATIC_STORAGE)
if (!__cuda_is_local((const void*) __ptr))
{
return false;
Expand All @@ -60,17 +97,24 @@ _CCCL_DEVICE bool __cuda_load_weak_if_local(const volatile _Type* __ptr, _Type*
// Required to workaround a compiler bug, see nvbug/4064730
__nanosleep(0);
return true;
#else
return false;
#endif
}

// Stores __val into *__ptr with a plain memcpy when __ptr refers to local storage.
//
// Returns true when the store was performed here, false when the caller must
// fall back to its regular store path.
//
// When _LIBCUDACXX_ATOMIC_UNSAFE_AUTOMATIC_STORAGE is defined the local-storage
// check is bypassed and the function always returns false, which matches the
// behaviour of the __cuda_fetch_*_weak_if_local helpers in this file. (The
// previous polarity performed the check only when the macro was defined and
// skipped it otherwise, the opposite of those helpers.)
template <class _Type>
_CCCL_DEVICE bool __cuda_store_weak_if_local(volatile _Type* __ptr, _Type __val)
{
#if defined(_LIBCUDACXX_ATOMIC_UNSAFE_AUTOMATIC_STORAGE)
  // Checks bypassed: report "not handled" so the caller uses its normal path.
  return false;
#else
  if (!__cuda_is_local((const void*) __ptr))
  {
    return false;
  }
  // Byte-wise copy into the destination; the casts drop the volatile qualifier for memcpy.
  memcpy((void*) __ptr, (void const*) &__val, sizeof(_Type));
  return true;
#endif
}

template <class _Type>
Expand Down Expand Up @@ -121,82 +165,74 @@ _CCCL_DEVICE bool __cuda_fetch_weak_if_local(volatile _Type* __ptr, _Type __val,
return true;
}

// Plain (non-atomic) bitwise AND: overwrites __atom with (__atom & __v).
// __atom is read exactly once and written exactly once.
template <class _Type>
_CCCL_DEVICE void __cuda_fetch_local_bop_and(volatile _Type& __atom, _Type const& __v)
{
  const _Type __combined = __atom & __v;
  __atom                 = __combined;
}
// Plain (non-atomic) bitwise OR: overwrites __atom with (__atom | __v).
// __atom is read exactly once and written exactly once.
template <class _Type>
_CCCL_DEVICE void __cuda_fetch_local_bop_or(volatile _Type& __atom, _Type const& __v)
{
  const _Type __combined = __atom | __v;
  __atom                 = __combined;
}
// Plain (non-atomic) bitwise XOR: overwrites __atom with (__atom ^ __v).
// __atom is read exactly once and written exactly once.
template <class _Type>
_CCCL_DEVICE void __cuda_fetch_local_bop_xor(volatile _Type& __atom, _Type const& __v)
{
  const _Type __combined = __atom ^ __v;
  __atom                 = __combined;
}
// Plain (non-atomic) addition: overwrites __atom with (__atom + __v).
// __atom is read exactly once and written exactly once.
template <class _Type>
_CCCL_DEVICE void __cuda_fetch_local_bop_add(volatile _Type& __atom, _Type const& __v)
{
  const _Type __sum = __atom + __v;
  __atom            = __sum;
}
// Plain (non-atomic) subtraction: overwrites __atom with (__atom - __v).
// __atom is read exactly once and written exactly once.
template <class _Type>
_CCCL_DEVICE void __cuda_fetch_local_bop_sub(volatile _Type& __atom, _Type const& __v)
{
  const _Type __difference = __atom - __v;
  __atom                   = __difference;
}
// Plain (non-atomic) maximum: stores __v into __atom when __atom < __v,
// otherwise writes the current value of __atom back to itself.
template <class _Type>
_CCCL_DEVICE void __cuda_fetch_local_bop_max(volatile _Type& __atom, _Type const& __v)
{
  if (__atom < __v)
  {
    __atom = __v;
    return;
  }
  // Not smaller than __v: re-read and store the value back, mirroring the
  // read/write sequence of the equivalent conditional expression.
  const _Type __unchanged = __atom;
  __atom                  = __unchanged;
}
// Plain (non-atomic) minimum: stores __v into __atom when __v < __atom,
// otherwise writes the current value of __atom back to itself.
template <class _Type>
_CCCL_DEVICE void __cuda_fetch_local_bop_min(volatile _Type& __atom, _Type const& __v)
{
  if (__v < __atom)
  {
    __atom = __v;
    return;
  }
  // Not larger than __v: re-read and store the value back, mirroring the
  // read/write sequence of the equivalent conditional expression.
  const _Type __unchanged = __atom;
  __atom                  = __unchanged;
}

// fetch_and entry point for the local-storage path.
// Forwards to __cuda_fetch_weak_if_local with the bitwise-AND helper and returns its result,
// unless _LIBCUDACXX_ATOMIC_UNSAFE_AUTOMATIC_STORAGE is defined, in which case it always returns false.
template <class _Type>
_CCCL_DEVICE bool __cuda_fetch_and_weak_if_local(volatile _Type* __ptr, _Type __val, _Type* __ret)
{
#ifndef _LIBCUDACXX_ATOMIC_UNSAFE_AUTOMATIC_STORAGE
  return __cuda_fetch_weak_if_local(__ptr, __val, __ret, __cuda_fetch_local_bop_and<_Type>);
#else
  return false;
#endif
}

// fetch_or entry point for the local-storage path.
// Forwards to __cuda_fetch_weak_if_local with the bitwise-OR helper and returns its result,
// unless _LIBCUDACXX_ATOMIC_UNSAFE_AUTOMATIC_STORAGE is defined, in which case it always returns false.
template <class _Type>
_CCCL_DEVICE bool __cuda_fetch_or_weak_if_local(volatile _Type* __ptr, _Type __val, _Type* __ret)
{
#ifndef _LIBCUDACXX_ATOMIC_UNSAFE_AUTOMATIC_STORAGE
  return __cuda_fetch_weak_if_local(__ptr, __val, __ret, __cuda_fetch_local_bop_or<_Type>);
#else
  return false;
#endif
}

// fetch_xor entry point for the local-storage path.
// Forwards to __cuda_fetch_weak_if_local with the bitwise-XOR helper and returns its result,
// unless _LIBCUDACXX_ATOMIC_UNSAFE_AUTOMATIC_STORAGE is defined, in which case it always returns false.
template <class _Type>
_CCCL_DEVICE bool __cuda_fetch_xor_weak_if_local(volatile _Type* __ptr, _Type __val, _Type* __ret)
{
#ifndef _LIBCUDACXX_ATOMIC_UNSAFE_AUTOMATIC_STORAGE
  return __cuda_fetch_weak_if_local(__ptr, __val, __ret, __cuda_fetch_local_bop_xor<_Type>);
#else
  return false;
#endif
}

// fetch_add entry point for the local-storage path.
// Forwards to __cuda_fetch_weak_if_local with the addition helper and returns its result,
// unless _LIBCUDACXX_ATOMIC_UNSAFE_AUTOMATIC_STORAGE is defined, in which case it always returns false.
template <class _Type>
_CCCL_DEVICE bool __cuda_fetch_add_weak_if_local(volatile _Type* __ptr, _Type __val, _Type* __ret)
{
#ifndef _LIBCUDACXX_ATOMIC_UNSAFE_AUTOMATIC_STORAGE
  return __cuda_fetch_weak_if_local(__ptr, __val, __ret, __cuda_fetch_local_bop_add<_Type>);
#else
  return false;
#endif
}

// fetch_sub entry point for the local-storage path.
// Forwards to __cuda_fetch_weak_if_local with the subtraction helper and returns its result,
// unless _LIBCUDACXX_ATOMIC_UNSAFE_AUTOMATIC_STORAGE is defined, in which case it always returns false.
template <class _Type>
_CCCL_DEVICE bool __cuda_fetch_sub_weak_if_local(volatile _Type* __ptr, _Type __val, _Type* __ret)
{
#ifndef _LIBCUDACXX_ATOMIC_UNSAFE_AUTOMATIC_STORAGE
  return __cuda_fetch_weak_if_local(__ptr, __val, __ret, __cuda_fetch_local_bop_sub<_Type>);
#else
  return false;
#endif
}

// fetch_max entry point for the local-storage path.
// Forwards to __cuda_fetch_weak_if_local with the maximum helper and returns its result,
// unless _LIBCUDACXX_ATOMIC_UNSAFE_AUTOMATIC_STORAGE is defined, in which case it always returns false.
template <class _Type>
_CCCL_DEVICE bool __cuda_fetch_max_weak_if_local(volatile _Type* __ptr, _Type __val, _Type* __ret)
{
#ifndef _LIBCUDACXX_ATOMIC_UNSAFE_AUTOMATIC_STORAGE
  return __cuda_fetch_weak_if_local(__ptr, __val, __ret, __cuda_fetch_local_bop_max<_Type>);
#else
  return false;
#endif
}

// fetch_min entry point for the local-storage path.
// Forwards to __cuda_fetch_weak_if_local with the minimum helper and returns its result,
// unless _LIBCUDACXX_ATOMIC_UNSAFE_AUTOMATIC_STORAGE is defined, in which case it always returns false.
template <class _Type>
_CCCL_DEVICE bool __cuda_fetch_min_weak_if_local(volatile _Type* __ptr, _Type __val, _Type* __ret)
{
#ifndef _LIBCUDACXX_ATOMIC_UNSAFE_AUTOMATIC_STORAGE
  return __cuda_fetch_weak_if_local(__ptr, __val, __ret, __cuda_fetch_local_bop_min<_Type>);
#else
  return false;
#endif
}

_LIBCUDACXX_END_NAMESPACE_STD
Expand Down
5 changes: 5 additions & 0 deletions libcudacxx/include/cuda/std/detail/libcxx/include/__config
Original file line number Diff line number Diff line change
Expand Up @@ -1246,6 +1246,11 @@ __sanitizer_annotate_contiguous_container(const void*, const void*, const void*,
# define _LIBCUDACXX_HAS_MSVC_ATOMIC_IMPL
# endif

// Enable bypassing automatic storage checks in atomics by defining
// _LIBCUDACXX_ATOMIC_UNSAFE_AUTOMATIC_STORAGE.
// The macro is defined only when _CCCL_CUDACC_BELOW_12_2 is defined and NDEBUG is NOT defined.
// NOTE(review): this comment previously read "when using CTK 12.2 and below and if NDEBUG is defined",
// which disagrees with the condition below on both the version bound and the NDEBUG polarity —
// confirm which one is intended.
# if defined(_CCCL_CUDACC_BELOW_12_2) && !defined(NDEBUG)
# define _LIBCUDACXX_ATOMIC_UNSAFE_AUTOMATIC_STORAGE
# endif // _CCCL_CUDACC_BELOW_12_2

// CUDA Atomics supersede host atomics in order to insert the host/device dispatch layer
# if defined(_CCCL_CUDA_COMPILER_NVCC) || defined(_CCCL_COMPILER_NVRTC) || defined(_CCCL_COMPILER_NVHPC) \
|| defined(_CCCL_CUDACC)
Expand Down

0 comments on commit 0fe24d3

Please sign in to comment.