[libcu++] Fix undefined behavior in atomics to automatic storage #478
base: main
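For context: the undefined behavior addressed here is a cuda::atomic object with automatic storage duration in device code. Its storage lives in the thread's local memory, which hardware atomic instructions cannot target. A minimal sketch of the pattern (hypothetical example, mirroring the new test below, not taken from the PR description):

#include <cuda/atomic>

// Hypothetical example: the atomic lives in automatic (thread-local) storage
// inside a __device__ function.
__device__ int automatic_storage_counter(int v) {
  cuda::atomic<int> a = v;                      // automatic storage
  a.fetch_add(1, cuda::memory_order_relaxed);   // previously risked an atomic op on local memory
  return a.load(cuda::memory_order_relaxed);
}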
Changes from 18 commits
@@ -0,0 +1,126 @@
//===----------------------------------------------------------------------===//
//
// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// UNSUPPORTED: windows && pre-sm-70

#include <cuda/atomic>
#include <cuda/std/cassert>

template <typename T>
__device__ T store(T in) {
  cuda::atomic<T> x = in;
  x.store(in + 1, cuda::memory_order_relaxed);
  return x.load(cuda::memory_order_relaxed);
}

template <typename T>
__device__ T compare_exchange_weak(T in) {
  cuda::atomic<T> x = in;
  T old = T(7);
  x.compare_exchange_weak(old, T(42), cuda::memory_order_relaxed);
  return x.load(cuda::memory_order_relaxed);
}

template <typename T>
__device__ T compare_exchange_strong(T in) {
  cuda::atomic<T> x = in;
  T old = T(7);
  x.compare_exchange_strong(old, T(42), cuda::memory_order_relaxed);
  return x.load(cuda::memory_order_relaxed);
}

template <typename T>
__device__ T exchange(T in) {
  cuda::atomic<T> x = in;
  T out = x.exchange(T(1), cuda::memory_order_relaxed);
  return out + x.load(cuda::memory_order_relaxed);
}

template <typename T>
__device__ T fetch_add(T in) {
  cuda::atomic<T> x = in;
  x.fetch_add(T(1), cuda::memory_order_relaxed);
  return x.load(cuda::memory_order_relaxed);
}

template <typename T>
__device__ T fetch_sub(T in) {
  cuda::atomic<T> x = in;
  x.fetch_sub(T(1), cuda::memory_order_relaxed);
  return x.load(cuda::memory_order_relaxed);
}

template <typename T>
__device__ T fetch_and(T in) {
  cuda::atomic<T> x = in;
  x.fetch_and(T(1), cuda::memory_order_relaxed);
  return x.load(cuda::memory_order_relaxed);
}

template <typename T>
__device__ T fetch_or(T in) {
  cuda::atomic<T> x = in;
  x.fetch_or(T(1), cuda::memory_order_relaxed);
  return x.load(cuda::memory_order_relaxed);
}

template <typename T>
__device__ T fetch_xor(T in) {
  cuda::atomic<T> x = in;
  x.fetch_xor(T(1), cuda::memory_order_relaxed);
  return x.load(cuda::memory_order_relaxed);
}

template <typename T>
__device__ T fetch_min(T in) {
  cuda::atomic<T> x = in;
  x.fetch_min(T(7), cuda::memory_order_relaxed);
  return x.load(cuda::memory_order_relaxed);
}

template <typename T>
__device__ T fetch_max(T in) {
  cuda::atomic<T> x = in;
  x.fetch_max(T(7), cuda::memory_order_relaxed);
  return x.load(cuda::memory_order_relaxed);
}
__device__ inline void tests() { | ||
const T tid = threadIdx.x; | ||
assert(tid + T(1) == store(tid)); | ||
assert(T(1) + tid == exchange(tid)); | ||
assert(tid == T(7)? T(42) : tid == compare_exchange_weak(tid)); | ||
assert(tid == T(7)? T(42) : tid == compare_exchange_strong(tid)); | ||
assert((tid + T(1)) == fetch_add(tid)); | ||
assert((tid & T(1)) == fetch_and(tid)); | ||
assert((tid | T(1)) == fetch_or(tid)); | ||
assert((tid ^ T(1)) == fetch_xor(tid)); | ||
assert(min(tid, T(7)) == fetch_min(tid)); | ||
assert(max(tid, T(7)) == fetch_max(tid)); | ||
assert(T(tid - T(1)) == fetch_sub(tid)); | ||
} | ||
|
||
int main(int argc, char ** argv)
{
  NV_IF_ELSE_TARGET(
    NV_IS_HOST, (
      cuda_thread_count = 64;
    ),(
      tests<uint8_t>();
      tests<uint16_t>();
      tests<uint32_t>();
      tests<uint64_t>();
      tests<int8_t>();
      tests<int16_t>();
      tests<int32_t>();
      tests<int64_t>();
    )
  )
  return 0;
}
@@ -8,8 +8,12 @@
//
//===----------------------------------------------------------------------===//

#include "atomic_cuda_local.h"

template<class _Type, class _Scope, typename _CUDA_VSTD::enable_if<sizeof(_Type) <= 2, int>::type = 0>
bool _LIBCUDACXX_DEVICE __atomic_compare_exchange_cuda(_Type volatile *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, _Scope __s) {
    bool __ret;
    if (__cuda_compare_exchange_weak_if_local(__ptr, __expected, __desired, &__ret)) return __ret;

    auto const __aligned = (uint32_t*)((intptr_t)__ptr & ~(sizeof(uint32_t) - 1));
    auto const __offset = uint32_t((intptr_t)__ptr & (sizeof(uint32_t) - 1)) * 8;
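The sub-word path above emulates 8- and 16-bit atomics on top of a 32-bit word: the pointer is rounded down to the enclosing 4-byte word, and the bit offset of the sub-word within that word is computed. A standalone sketch of that address math (illustrative values only, not part of the diff):

#include <cstdint>
#include <cstdio>

int main() {
  std::uintptr_t ptr     = 0x1002;  // e.g. a 2-byte object at this made-up address
  std::uintptr_t aligned = ptr & ~std::uintptr_t(sizeof(std::uint32_t) - 1);  // 0x1000
  unsigned offset        = unsigned(ptr & (sizeof(std::uint32_t) - 1)) * 8;   // bit 16
  std::printf("enclosing word %#llx, sub-word starts at bit %u\n",
              static_cast<unsigned long long>(aligned), offset);
  return 0;
}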
@@ -31,7 +35,7 @@ bool _LIBCUDACXX_DEVICE __atomic_compare_exchange_cuda(_Type volatile *__ptr, _T

template<class _Type, class _Scope, typename _CUDA_VSTD::enable_if<sizeof(_Type)<=2, int>::type = 0>
void _LIBCUDACXX_DEVICE __atomic_exchange_cuda(_Type volatile *__ptr, _Type *__val, _Type *__ret, int __memorder, _Scope __s) {
    if (__cuda_exchange_weak_if_local(__ptr, __val, __ret)) return;
    _Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s);
    while(!__atomic_compare_exchange_cuda(__ptr, &__expected, __val, true, __memorder, __memorder, __s))
        ;

@@ -40,6 +44,8 @@ void _LIBCUDACXX_DEVICE __atomic_exchange_cuda(_Type volatile *__ptr, _Type *__v

template<class _Type, class _Delta, class _Scope, typename _CUDA_VSTD::enable_if<sizeof(_Type)<=2, int>::type = 0>
_Type _LIBCUDACXX_DEVICE __atomic_fetch_add_cuda(_Type volatile *__ptr, _Delta __val, int __memorder, _Scope __s) {
    _Type __ret;
    if (__cuda_fetch_add_weak_if_local(__ptr, __val, &__ret)) return __ret;
Review comment:

important: the compiler is unable to see through the memory and identify that it is not local. This affects codegen and overall performance. Here is a simple kernel:

using device_atomic_t = cuda::atomic<int, cuda::thread_scope_device>;

__global__ void use(device_atomic_t *d_atomics) {
  d_atomics->fetch_add(threadIdx.x, cuda::memory_order_relaxed);
}

On an RTX 6000 Ada the change leads to a slowdown of up to ~3x. In the case of block-scope atomics the performance difference is even more pronounced:

template <int BlockSize>
__launch_bounds__(BlockSize) __global__ void use(device_atomic_t *d_atomics, int mv) {
  __shared__ block_atomic_t b_atomics;
  if (threadIdx.x == 0) {
    new (&b_atomics) block_atomic_t{};
  }
  __syncthreads();
  b_atomics.fetch_add(threadIdx.x, cuda::memory_order_relaxed);
  __syncthreads();
  if (threadIdx.x == 0) {
    if (b_atomics.load(cuda::memory_order_relaxed) > mv) {
      d_atomics->fetch_add(1, cuda::memory_order_relaxed);
    }
  }
}

Results for the RTX 6000 Ada show up to a ~4x slowdown.

I think I agree with the quoted requirement. Given this, I think we should explore options that do not penalize widespread use cases. If the compiler is able to see through the local-space check, that would be a solution. Otherwise, we could consider refining the requirement to talk about global, cluster, or block memory, plus adding a check for automatic storage in debug builds.

Review comment:

This is known, but the analysis is incomplete. The performance regression is scoped to a limited set of cases; for those users we could, in a subsequent PR, provide a way to opt back into the old (broken) behavior via some feature macro, e.g.:

#define LIBCUDACXX_UNSAFE_ATOMIC_AUTOMATIC_STORAGE
#include <cuda/atomic>

Review comment:

From the Slack discussion, an alternative is to enable the check in CTK 12.2 and older only in debug mode, to avoid the perf hit.

Review comment:

Is this something where we could work with attributes, e.g.?
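To make the mechanism under discussion concrete: each __cuda_*_weak_if_local helper presumably checks whether the pointer refers to automatic (local) storage and, if so, performs the operation as a plain read-modify-write, which is safe because local memory is private to the calling thread. A hedged, standalone sketch of that shape, assuming CUDA's __isLocal() address-space predicate (this is not the actual code in atomic_cuda_local.h):

template <class _Type, class _Delta>
__device__ bool fetch_add_if_local_sketch(_Type volatile *ptr, _Delta val, _Type *ret) {
  if (!__isLocal(const_cast<_Type const *>(ptr))) {
    return false;                           // not local: caller uses the real atomic path
  }
  *ret = *ptr;                              // local memory is private to this thread,
  *ptr = static_cast<_Type>(*ret + val);    // so a non-atomic read-modify-write suffices
  return true;
}

The performance concern above hinges on this predicate: if the compiler can prove the address is not local, the branch folds away; if it cannot, every atomic call carries the extra check plus the fallback code.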
    _Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s);
    _Type __desired = __expected + __val;
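Each of these load/compute pairs presumably feeds the pre-existing compare-exchange retry loop that appears only as hunk context here (compare __atomic_exchange_cuda above). For reference, a generic sketch of that emulation pattern using std::atomic rather than the libcu++ internals:

#include <atomic>

// Illustration only: emulate fetch_add with a CAS loop. On failure,
// compare_exchange_weak refreshes 'expected' with the current value,
// so the desired result is recomputed on every iteration.
template <class T>
T fetch_add_via_cas(std::atomic<T> &obj, T val) {
  T expected = obj.load(std::memory_order_relaxed);
  while (!obj.compare_exchange_weak(expected, static_cast<T>(expected + val),
                                    std::memory_order_relaxed)) {
    // retry with the freshly observed value in 'expected'
  }
  return expected;  // the value observed immediately before the addition
}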
@@ -50,6 +56,9 @@ _Type _LIBCUDACXX_DEVICE __atomic_fetch_add_cuda(_Type volatile *__ptr, _Delta _

template<class _Type, class _Delta, class _Scope, typename _CUDA_VSTD::enable_if<sizeof(_Type)<=2 || _CUDA_VSTD::is_floating_point<_Type>::value, int>::type = 0>
_Type _LIBCUDACXX_HOST_DEVICE __atomic_fetch_max_cuda(_Type volatile *__ptr, _Delta __val, int __memorder, _Scope __s) {
    _Type __ret;
    if (__cuda_fetch_max_weak_if_local(__ptr, __val, &__ret)) return __ret;

    _Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s);
    _Type __desired = __expected > __val ? __expected : __val;

@@ -63,6 +72,9 @@ _Type _LIBCUDACXX_HOST_DEVICE __atomic_fetch_max_cuda(_Type volatile *__ptr, _De

template<class _Type, class _Delta, class _Scope, typename _CUDA_VSTD::enable_if<sizeof(_Type)<=2 || _CUDA_VSTD::is_floating_point<_Type>::value, int>::type = 0>
_Type _LIBCUDACXX_HOST_DEVICE __atomic_fetch_min_cuda(_Type volatile *__ptr, _Delta __val, int __memorder, _Scope __s) {
    _Type __ret;
    if (__cuda_fetch_min_weak_if_local(__ptr, __val, &__ret)) return __ret;

    _Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s);
    _Type __desired = __expected < __val ? __expected : __val;

@@ -76,6 +88,8 @@ _Type _LIBCUDACXX_HOST_DEVICE __atomic_fetch_min_cuda(_Type volatile *__ptr, _De

template<class _Type, class _Delta, class _Scope, typename _CUDA_VSTD::enable_if<sizeof(_Type)<=2, int>::type = 0>
_Type _LIBCUDACXX_DEVICE __atomic_fetch_sub_cuda(_Type volatile *__ptr, _Delta __val, int __memorder, _Scope __s) {
    _Type __ret;
    if (__cuda_fetch_sub_weak_if_local(__ptr, __val, &__ret)) return __ret;

    _Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s);
    _Type __desired = __expected - __val;

@@ -86,6 +100,8 @@ _Type _LIBCUDACXX_DEVICE __atomic_fetch_sub_cuda(_Type volatile *__ptr, _Delta _

template<class _Type, class _Delta, class _Scope, typename _CUDA_VSTD::enable_if<sizeof(_Type)<=2, int>::type = 0>
_Type _LIBCUDACXX_DEVICE __atomic_fetch_and_cuda(_Type volatile *__ptr, _Delta __val, int __memorder, _Scope __s) {
    _Type __ret;
    if (__cuda_fetch_and_weak_if_local(__ptr, __val, &__ret)) return __ret;

    _Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s);
    _Type __desired = __expected & __val;

@@ -96,6 +112,8 @@ _Type _LIBCUDACXX_DEVICE __atomic_fetch_and_cuda(_Type volatile *__ptr, _Delta _

template<class _Type, class _Delta, class _Scope, typename _CUDA_VSTD::enable_if<sizeof(_Type)<=2, int>::type = 0>
_Type _LIBCUDACXX_DEVICE __atomic_fetch_xor_cuda(_Type volatile *__ptr, _Delta __val, int __memorder, _Scope __s) {
    _Type __ret;
    if (__cuda_fetch_xor_weak_if_local(__ptr, __val, &__ret)) return __ret;

    _Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s);
    _Type __desired = __expected ^ __val;

@@ -106,6 +124,8 @@ _Type _LIBCUDACXX_DEVICE __atomic_fetch_xor_cuda(_Type volatile *__ptr, _Delta _

template<class _Type, class _Delta, class _Scope, typename _CUDA_VSTD::enable_if<sizeof(_Type)<=2, int>::type = 0>
_Type _LIBCUDACXX_DEVICE __atomic_fetch_or_cuda(_Type volatile *__ptr, _Delta __val, int __memorder, _Scope __s) {
    _Type __ret;
    if (__cuda_fetch_or_weak_if_local(__ptr, __val, &__ret)) return __ret;

    _Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s);
    _Type __desired = __expected | __val;
Review comment:

This should be weak_if_local_or_const_or_grid_param, since the two quoted locations have the same issue.
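To illustrate what such a broadened predicate could look like, a hedged sketch assuming CUDA's __isLocal() and __isConstant() address-space predicates; the helper name is hypothetical, and a grid-constant-parameter check would need an additional predicate that is not shown here:

// Hypothetical helper, not the PR's code: hardware atomics cannot operate on
// local or __constant__ memory, so both address spaces would need to take the
// non-atomic fallback path.
__device__ inline bool atomic_needs_weak_fallback(const void *ptr) {
  return __isLocal(ptr) || __isConstant(ptr);
}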