[libcu++] Fix undefined behavior in atomics to automatic storage #478

Open - wants to merge 19 commits into main

Changes from 18 commits
126 changes: 126 additions & 0 deletions libcudacxx/.upstream-tests/test/cuda/atomics/atomic.local.pass.cpp
@@ -0,0 +1,126 @@
//===----------------------------------------------------------------------===//
//
// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// UNSUPPORTED: windows && pre-sm-70

#include <cuda/atomic>
#include <cuda/std/cassert>
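
// This test exercises every cuda::atomic operation on an object with automatic
// (thread-local stack) storage: each helper below constructs the atomic locally,
// applies one operation, and returns the observed value for the caller to check.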

template <typename T>
__device__ T store(T in) {
cuda::atomic<T> x = in;
x.store(in + 1, cuda::memory_order_relaxed);
return x.load(cuda::memory_order_relaxed);
}

template <typename T>
__device__ T compare_exchange_weak(T in) {
cuda::atomic<T> x = in;
T old = T(7);
x.compare_exchange_weak(old, T(42), cuda::memory_order_relaxed);
return x.load(cuda::memory_order_relaxed);
}

template <typename T>
__device__ T compare_exchange_strong(T in) {
cuda::atomic<T> x = in;
T old = T(7);
x.compare_exchange_strong(old, T(42), cuda::memory_order_relaxed);
return x.load(cuda::memory_order_relaxed);
}

template <typename T>
__device__ T exchange(T in) {
cuda::atomic<T> x = in;
T out = x.exchange(T(1), cuda::memory_order_relaxed);
return out + x.load(cuda::memory_order_relaxed);
}

template <typename T>
__device__ T fetch_add(T in) {
cuda::atomic<T> x = in;
x.fetch_add(T(1), cuda::memory_order_relaxed);
return x.load(cuda::memory_order_relaxed);
}

template <typename T>
__device__ T fetch_sub(T in) {
cuda::atomic<T> x = in;
x.fetch_sub(T(1), cuda::memory_order_relaxed);
return x.load(cuda::memory_order_relaxed);
}

template <typename T>
__device__ T fetch_and(T in) {
cuda::atomic<T> x = in;
x.fetch_and(T(1), cuda::memory_order_relaxed);
return x.load(cuda::memory_order_relaxed);
}

template <typename T>
__device__ T fetch_or(T in) {
cuda::atomic<T> x = in;
x.fetch_or(T(1), cuda::memory_order_relaxed);
return x.load(cuda::memory_order_relaxed);
}

template <typename T>
__device__ T fetch_xor(T in) {
cuda::atomic<T> x = in;
x.fetch_xor(T(1), cuda::memory_order_relaxed);
return x.load(cuda::memory_order_relaxed);
}

template <typename T>
__device__ T fetch_min(T in) {
cuda::atomic<T> x = in;
x.fetch_min(T(7), cuda::memory_order_relaxed);
return x.load(cuda::memory_order_relaxed);
}

template <typename T>
__device__ T fetch_max(T in) {
cuda::atomic<T> x = in;
x.fetch_max(T(7), cuda::memory_order_relaxed);
return x.load(cuda::memory_order_relaxed);
}

template <typename T>
__device__ inline void tests() {
const T tid = threadIdx.x;
assert(tid + T(1) == store(tid));
assert(T(1) + tid == exchange(tid));
assert((tid == T(7) ? T(42) : tid) == compare_exchange_weak(tid));
assert((tid == T(7) ? T(42) : tid) == compare_exchange_strong(tid));
assert((tid + T(1)) == fetch_add(tid));
assert((tid & T(1)) == fetch_and(tid));
assert((tid | T(1)) == fetch_or(tid));
assert((tid ^ T(1)) == fetch_xor(tid));
assert(min(tid, T(7)) == fetch_min(tid));
assert(max(tid, T(7)) == fetch_max(tid));
assert(T(tid - T(1)) == fetch_sub(tid));
}

int main(int argc, char ** argv)
{
NV_IF_ELSE_TARGET(
NV_IS_HOST, (
cuda_thread_count = 64;
),(
tests<uint8_t>();
tests<uint16_t>();
tests<uint32_t>();
tests<uint64_t>();
tests<int8_t>();
tests<int16_t>();
tests<int32_t>();
tests<int64_t>();
)
)
return 0;
}
11 changes: 10 additions & 1 deletion libcudacxx/codegen/codegen.cpp
@@ -83,6 +83,8 @@ int main() {
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#include "atomic_cuda_local.h"
)XXX" << "\n\n";

auto scopenametag = [&](auto scope) {
@@ -142,6 +144,7 @@ int main() {
for(auto& cv: cv_qualifier) {
out << "template<class _Type, _CUDA_VSTD::__enable_if_t<sizeof(_Type)==" << sz/8 << ", int> = 0>\n";
out << "_LIBCUDACXX_DEVICE void __atomic_load_cuda(const " << cv << "_Type *__ptr, _Type *__ret, int __memorder, " << scopenametag(s.first) << ") {\n";
out << " if (__cuda_load_weak_if_local(__ptr, __ret)) return;\n";

gonzalobg (Collaborator, Author) commented on Jul 18, 2024:

This should be weak_if_local_or_const_or_grid_param, since:

__constant__ cuda::atomic<int> x;
x.load(); // UB, should use weak load

and

__global__ void kernel(__grid_constant__ const cuda::atomic<int> x) { 
   x.load();
}

have the same issue.
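
For illustration, a minimal sketch of what such a broadened predicate could look like, built on CUDA's documented address-space predicates (__isLocal, __isConstant, and, on newer toolkits, __isGridConstant). The function name __cuda_is_weak_access, its exact signature, and the version gate are assumptions for this sketch; the actual helpers live in atomic_cuda_local.h, which is not shown in this diff:

template <class _Type>
_LIBCUDACXX_DEVICE bool __cuda_is_weak_access(const volatile _Type* __ptr) {
  // Strip volatile so the pointer can be handed to the address-space predicates.
  const void* __p = const_cast<const _Type*>(__ptr);
  // Automatic (local) storage and __constant__ objects both require plain (weak) accesses.
  bool __weak = __isLocal(__p) || __isConstant(__p);
#if __CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 7)
  // __grid_constant__ kernel parameters are only detectable on newer toolkits.
  __weak = __weak || __isGridConstant(__p);
#endif
  return __weak;
}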

out << " uint" << sz << "_t __tmp = 0;\n";
out << " NV_DISPATCH_TARGET(\n";
out << " NV_PROVIDES_SM_70, (\n";
@@ -178,6 +181,7 @@ int main() {
for(auto& cv: cv_qualifier) {
out << "template<class _Type, _CUDA_VSTD::__enable_if_t<sizeof(_Type)==" << sz/8 << ", int> = 0>\n";
out << "_LIBCUDACXX_DEVICE void __atomic_store_cuda(" << cv << "_Type *__ptr, _Type *__val, int __memorder, " << scopenametag(s.first) << ") {\n";
out << " if (__cuda_store_weak_if_local(__ptr, *__val)) return;\n";
out << " uint" << sz << "_t __tmp = 0;\n";
out << " memcpy(&__tmp, __val, " << sz/8 << ");\n";
out << " NV_DISPATCH_TARGET(\n";
@@ -239,6 +243,8 @@ int main() {
if(rmw.first == "compare_exchange") {
out << "template<class _Type, _CUDA_VSTD::__enable_if_t<sizeof(_Type)==" << sz/8 << ", int> = 0>\n";
out << "_LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(" << cv << "_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, " << scopenametag(s.first) << ") {\n";
out << " bool __tmp_out;\n";
out << " if (__cuda_compare_exchange_weak_if_local(__ptr, __expected, __desired, &__tmp_out)) return __tmp_out;\n";
out << " uint" << sz << "_t __tmp = 0, __old = 0, __old_tmp;\n";
out << " memcpy(&__tmp, __desired, " << sz/8 << ");\n";
out << " memcpy(&__old, __expected, " << sz/8 << ");\n";
@@ -277,7 +283,8 @@ int main() {
if(rmw.first == "exchange") {
out << ", int> = 0>\n";
out << "_LIBCUDACXX_DEVICE void __atomic_exchange_cuda(" << cv << "_Type *__ptr, _Type *__val, _Type *__ret, int __memorder, " << scopenametag(s.first) << ") {\n";
out << " uint" << sz << "_t __tmp = 0;\n";
out << " if (__cuda_exchange_weak_if_local(__ptr, __val, __ret)) return;\n";
out << " uint" << sz << "_t __tmp = 0;\n";
out << " memcpy(&__tmp, __val, " << sz/8 << ");\n";
}
else {
@@ -295,6 +302,7 @@ int main() {
out << ", int> = 0>\n";
out << "_LIBCUDACXX_DEVICE _Type __atomic_" << rmw.first << "_cuda(" << cv << "_Type *__ptr, _Type __val, int __memorder, " << scopenametag(s.first) << ") {\n";
out << " _Type __ret;\n";
out << " if (__cuda_" << rmw.first << "_weak_if_local(__ptr, __val, &__ret)) return __ret;\n";
if(type.first == "f" && sz == 32)
out << " float";
else if(type.first == "f" && sz == 64)
@@ -352,6 +360,7 @@ int main() {
if(op == "sub")
out << " __tmp = -__tmp;\n";
out << " __tmp *= sizeof(_Type);\n";
out << " if (__cuda_fetch_add_weak_if_local((uint64_t*)__ptr, __tmp, (uint64_t*)&__ret)) return __ret;\n";
gonzalobg marked this conversation as resolved.
out << " NV_DISPATCH_TARGET(\n";
out << " NV_PROVIDES_SM_70, (\n";
out << " switch (__memorder) {\n";
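
For concreteness, after this change the 4-byte system-scope load that this generator emits takes roughly the following shape (reconstructed from the out << statements above; the scope tag name and the elided dispatch body are illustrative, not copied from the generated header):

template<class _Type, _CUDA_VSTD::__enable_if_t<sizeof(_Type)==4, int> = 0>
_LIBCUDACXX_DEVICE void __atomic_load_cuda(const volatile _Type *__ptr, _Type *__ret, int __memorder, __thread_scope_system_tag) {
  // New in this PR: objects in automatic (local) storage are private to the
  // calling thread, so a plain (weak) load suffices and the PTX path is skipped.
  if (__cuda_load_weak_if_local(__ptr, __ret)) return;
  uint32_t __tmp = 0;
  NV_DISPATCH_TARGET(
    NV_PROVIDES_SM_70, (
      /* memory-order switch dispatching to inline PTX loads (unchanged by this PR) */
    )
  )
  memcpy(__ret, &__tmp, 4);
}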
@@ -7,6 +7,8 @@
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
#ifndef __LIBCUDACXX_ATOMIC_CUDA_H
#define __LIBCUDACXX_ATOMIC_CUDA_H

#if defined(__CUDA_MINIMUM_ARCH__) && ((!defined(_LIBCUDACXX_COMPILER_MSVC) && __CUDA_MINIMUM_ARCH__ < 600) || (defined(_LIBCUDACXX_COMPILER_MSVC) && __CUDA_MINIMUM_ARCH__ < 700))
# error "CUDA atomics are only supported for sm_60 and up on *nix and sm_70 and up on Windows."
@@ -398,25 +400,6 @@ template <typename _Tp, int _Sco>
_LIBCUDACXX_HOST_DEVICE inline _Tp __cxx_atomic_exchange(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __value, memory_order __order) {
return __cxx_small_from_32<_Tp>(__cxx_atomic_exchange(&__a->__a_value, __cxx_small_to_32(__value), __order));
}
_LIBCUDACXX_HOST_DEVICE
inline int __cuda_memcmp(void const * __lhs, void const * __rhs, size_t __count) {
NV_DISPATCH_TARGET(
NV_IS_DEVICE, (
auto __lhs_c = reinterpret_cast<unsigned char const *>(__lhs);
auto __rhs_c = reinterpret_cast<unsigned char const *>(__rhs);
while (__count--) {
auto const __lhs_v = *__lhs_c++;
auto const __rhs_v = *__rhs_c++;
if (__lhs_v < __rhs_v) { return -1; }
if (__lhs_v > __rhs_v) { return 1; }
}
return 0;
),
NV_IS_HOST, (
return memcmp(__lhs, __rhs, __count);
)
)
}

template <typename _Tp, int _Sco>
_LIBCUDACXX_HOST_DEVICE inline bool __cxx_atomic_compare_exchange_weak(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp* __expected, _Tp __value, memory_order __success, memory_order __failure) {
@@ -478,3 +461,5 @@ template <typename _Tp, typename _Delta, int _Sco>
_LIBCUDACXX_HOST_DEVICE inline _Tp __cxx_atomic_fetch_min(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Delta __val, memory_order __order) {
return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_min(&__a->__a_value, __cxx_small_to_32(__val), __order));
}

#endif // __LIBCUDACXX_ATOMIC_CUDA_H
@@ -8,8 +8,12 @@
//
//===----------------------------------------------------------------------===//

#include "atomic_cuda_local.h"

template<class _Type, class _Scope, typename _CUDA_VSTD::enable_if<sizeof(_Type) <= 2, int>::type = 0>
bool _LIBCUDACXX_DEVICE __atomic_compare_exchange_cuda(_Type volatile *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, _Scope __s) {
bool __ret;
if (__cuda_compare_exchange_weak_if_local(__ptr, __expected, __desired, &__ret)) return __ret;

auto const __aligned = (uint32_t*)((intptr_t)__ptr & ~(sizeof(uint32_t) - 1));
auto const __offset = uint32_t((intptr_t)__ptr & (sizeof(uint32_t) - 1)) * 8;
@@ -31,7 +35,7 @@ bool _LIBCUDACXX_DEVICE __atomic_compare_exchange_cuda(_Type volatile *__ptr, _T

template<class _Type, class _Scope, typename _CUDA_VSTD::enable_if<sizeof(_Type)<=2, int>::type = 0>
void _LIBCUDACXX_DEVICE __atomic_exchange_cuda(_Type volatile *__ptr, _Type *__val, _Type *__ret, int __memorder, _Scope __s) {

if (__cuda_exchange_weak_if_local(__ptr, __val, __ret)) return;
_Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s);
while(!__atomic_compare_exchange_cuda(__ptr, &__expected, __val, true, __memorder, __memorder, __s))
;
@@ -40,6 +44,8 @@ void _LIBCUDACXX_DEVICE __atomic_exchange_cuda(_Type volatile *__ptr, _Type *__v

template<class _Type, class _Delta, class _Scope, typename _CUDA_VSTD::enable_if<sizeof(_Type)<=2, int>::type = 0>
_Type _LIBCUDACXX_DEVICE __atomic_fetch_add_cuda(_Type volatile *__ptr, _Delta __val, int __memorder, _Scope __s) {
_Type __ret;
if (__cuda_fetch_add_weak_if_local(__ptr, __val, &__ret)) return __ret;

A collaborator commented:

important: the compiler is unable to see through the memory and identify that it's not local. This affects codegen and overall performance. Here's a simple kernel:

using device_atomic_t = cuda::atomic<int, cuda::thread_scope_device>;

__global__ void use(device_atomic_t *d_atomics) {
  d_atomics->fetch_add(threadIdx.x, cuda::memory_order_relaxed);
}

On RTX 6000 Ada the change leads to the following slowdown (up to ~3x slower):

[benchmark chart: device_scope_atomics]

In the case of the block-scope atomics the performance difference is even more pronounced:

// Assumed alias (not shown in the original comment): block-scope atomic counter.
using block_atomic_t = cuda::atomic<int, cuda::thread_scope_block>;

template <int BlockSize>
__launch_bounds__(BlockSize) __global__ void use(device_atomic_t *d_atomics, int mv) {
  __shared__ block_atomic_t b_atomics;

  if (threadIdx.x == 0) {
    new (&b_atomics) block_atomic_t{};
  }
  __syncthreads();

  b_atomics.fetch_add(threadIdx.x, cuda::memory_order_relaxed);
  __syncthreads();

  if (threadIdx.x == 0) {
    if (b_atomics.load(cuda::memory_order_relaxed) > mv) {
      d_atomics->fetch_add(1, cuda::memory_order_relaxed);
    }
  }
}

Results for RTX 6000 Ada illustrate up to ~4x slowdown:

[benchmark chart: block_scope_atomics]

I think I agree with:

Since this only impacts objects with automatic storage, the impact is not very widespread

Given this, I think we should explore options not to penalize widespread use cases. If the compiler is able to see through the local-space check, this would be a solution. Otherwise, we can consider refining the:

it affects an object in GPU memory and only GPU threads access it.

requirement to talk about global, cluster, or block memory, plus adding a check for automatic storage in debug builds.

gonzalobg (Collaborator, Author) replied on Oct 16, 2023:

This is known but the analysis is incomplete since:

  • this lands on CUDA CTK 12.4,
  • the impact is zero on CUDA CTK 12.3 and newer, and
  • the impact is zero on CUDA CTK 12.2 and older iff cuda atomics are used through the cuda::atomic bundled in the CTK, since those are not impacted by this.

The performance regression is scoped to:

  • users of CUDA 12.2 and older,
  • that are not using the CUDA C++ standard library bundled with their CTK, but are instead picking a different version from GitHub.

For those users, we could - in a subsequent PR - provide a way to opt back into the old (broken) behavior via a feature macro, e.g., LIBCUDACXX_UNSAFE_ATOMIC_AUTOMATIC_STORAGE, which users would define consistently before including the headers to avoid ODR issues:

#define LIBCUDACXX_UNSAFE_ATOMIC_AUTOMATIC_STORAGE
#include <cuda/atomic>

gonzalobg (Collaborator, Author) added:

From the Slack discussion, an alternative is to enable the check in CTK 12.2 and older only in debug mode, to avoid the perf hit.
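
A minimal sketch of how such a debug-only gate could be expressed, assuming the macro name and version threshold below (neither is part of this PR); it keeps the check unconditionally on CTK 12.3+, where it is reported above to be free, and restricts it to debug builds on older toolkits:

// Hypothetical gate: always check on CTK >= 12.3, otherwise only when NDEBUG is not defined.
#if !defined(NDEBUG) || __CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 3)
#  define _LIBCUDACXX_ATOMIC_CHECK_AUTOMATIC_STORAGE 1
#else
#  define _LIBCUDACXX_ATOMIC_CHECK_AUTOMATIC_STORAGE 0
#endif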

miscco (Collaborator) commented on Oct 17, 2023:

Is this something where we could work with attributes, e.g. [[likely]] / [[unlikely]]?
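
As a sketch of that idea, a C++20 attribute could be attached directly to the guard's branch (whether it measurably improves codegen here is exactly the open question; on older language standards the hint would have to go through compiler builtins such as __builtin_expect):

if (__cuda_fetch_add_weak_if_local(__ptr, __val, &__ret)) [[unlikely]] { return __ret; }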


_Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s);
_Type __desired = __expected + __val;
@@ -50,6 +56,9 @@ _Type _LIBCUDACXX_DEVICE __atomic_fetch_add_cuda(_Type volatile *__ptr, _Delta _

template<class _Type, class _Delta, class _Scope, typename _CUDA_VSTD::enable_if<sizeof(_Type)<=2 || _CUDA_VSTD::is_floating_point<_Type>::value, int>::type = 0>
_Type _LIBCUDACXX_HOST_DEVICE __atomic_fetch_max_cuda(_Type volatile *__ptr, _Delta __val, int __memorder, _Scope __s) {
_Type __ret;
if (__cuda_fetch_max_weak_if_local(__ptr, __val, &__ret)) return __ret;

_Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s);
_Type __desired = __expected > __val ? __expected : __val;

@@ -63,6 +72,9 @@ _Type _LIBCUDACXX_HOST_DEVICE __atomic_fetch_max_cuda(_Type volatile *__ptr, _De

template<class _Type, class _Delta, class _Scope, typename _CUDA_VSTD::enable_if<sizeof(_Type)<=2 || _CUDA_VSTD::is_floating_point<_Type>::value, int>::type = 0>
_Type _LIBCUDACXX_HOST_DEVICE __atomic_fetch_min_cuda(_Type volatile *__ptr, _Delta __val, int __memorder, _Scope __s) {
_Type __ret;
if (__cuda_fetch_min_weak_if_local(__ptr, __val, &__ret)) return __ret;

_Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s);
_Type __desired = __expected < __val ? __expected : __val;

@@ -76,6 +88,8 @@ _Type _LIBCUDACXX_HOST_DEVICE __atomic_fetch_min_cuda(_Type volatile *__ptr, _De

template<class _Type, class _Delta, class _Scope, typename _CUDA_VSTD::enable_if<sizeof(_Type)<=2, int>::type = 0>
_Type _LIBCUDACXX_DEVICE __atomic_fetch_sub_cuda(_Type volatile *__ptr, _Delta __val, int __memorder, _Scope __s) {
_Type __ret;
if (__cuda_fetch_sub_weak_if_local(__ptr, __val, &__ret)) return __ret;

_Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s);
_Type __desired = __expected - __val;
@@ -86,6 +100,8 @@ _Type _LIBCUDACXX_DEVICE __atomic_fetch_sub_cuda(_Type volatile *__ptr, _Delta _

template<class _Type, class _Delta, class _Scope, typename _CUDA_VSTD::enable_if<sizeof(_Type)<=2, int>::type = 0>
_Type _LIBCUDACXX_DEVICE __atomic_fetch_and_cuda(_Type volatile *__ptr, _Delta __val, int __memorder, _Scope __s) {
_Type __ret;
if (__cuda_fetch_and_weak_if_local(__ptr, __val, &__ret)) return __ret;

_Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s);
_Type __desired = __expected & __val;
@@ -96,6 +112,8 @@ _Type _LIBCUDACXX_DEVICE __atomic_fetch_and_cuda(_Type volatile *__ptr, _Delta _

template<class _Type, class _Delta, class _Scope, typename _CUDA_VSTD::enable_if<sizeof(_Type)<=2, int>::type = 0>
_Type _LIBCUDACXX_DEVICE __atomic_fetch_xor_cuda(_Type volatile *__ptr, _Delta __val, int __memorder, _Scope __s) {
_Type __ret;
if (__cuda_fetch_xor_weak_if_local(__ptr, __val, &__ret)) return __ret;

_Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s);
_Type __desired = __expected ^ __val;
@@ -106,6 +124,8 @@ _Type _LIBCUDACXX_DEVICE __atomic_fetch_xor_cuda(_Type volatile *__ptr, _Delta _

template<class _Type, class _Delta, class _Scope, typename _CUDA_VSTD::enable_if<sizeof(_Type)<=2, int>::type = 0>
_Type _LIBCUDACXX_DEVICE __atomic_fetch_or_cuda(_Type volatile *__ptr, _Delta __val, int __memorder, _Scope __s) {
_Type __ret;
if (__cuda_fetch_or_weak_if_local(__ptr, __val, &__ret)) return __ret;

_Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s);
_Type __desired = __expected | __val;