Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Draft: Revert/barrier parity #192

Closed
wants to merge 44 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
d39dda7
Added parity waiting
ogiroux Jan 30, 2021
a479b85
Added try_wait options
ogiroux Mar 9, 2021
b089ac2
Add a small parity wait test
wmaxey Jul 23, 2021
a424765
Extend the test to measure both phases, make barrier_(try_)wait_parit…
wmaxey Jul 24, 2021
0560b71
Re-add the __try_wait API as pipeline uses internal symbols of barrie…
wmaxey Jul 27, 2021
318c275
Remove all uses of non-compliant __CUDA_ARCH__/preprocessor macros fr…
wmaxey Mar 31, 2021
0179859
Fix an issue in NVRTC tests
wmaxey Mar 31, 2021
b457a75
Move files and implementation for atomic refactor
wmaxey Apr 14, 2021
ae8a1e2
WIP: single interface wrap/unwrap
wmaxey Apr 14, 2021
d6d0812
Finish atomic refactor, bones of atomic_ref are in place
wmaxey May 1, 2021
ed349b7
Refactor a few #ifdef __CUDA__ things and fix statics/shared memory u…
wmaxey May 1, 2021
e04d828
Implement CUDA atomic ref, implement MSVC atomics layer, fix several …
wmaxey May 13, 2021
8cedc23
Fix atomic_mscv header, fix macro processing in __threading_support o…
wmaxey May 14, 2021
f6145ca
Fix some local errors and warnings, put back the SHARED macro, fix pi…
wmaxey Jul 7, 2021
bed0287
Add symlink to nv/target within libcxx
wmaxey Jul 8, 2021
0072881
Fix CV handling of atomics on the libcxx/std layer
wmaxey Jul 9, 2021
a1328d7
Disable C11 atomics in __config
wmaxey Jul 9, 2021
1360f48
fixups for Clang specific issues in atomic, make sure <nv/target> is …
wmaxey Jul 10, 2021
5363202
Fix CUDA and MSVC atomic layers
wmaxey Jul 14, 2021
c7d913f
uncomment a couple tests from pipeline
wmaxey Jul 14, 2021
2da7c63
Revert tests, will <nv/target>-ify later
wmaxey Jul 14, 2021
79ceb8b
Rebuild atomic_cuda_generated
wmaxey Jul 14, 2021
6d0d094
Dedup MSVC by splitting the atomic base class into a seperate header …
wmaxey Jul 22, 2021
be9bacc
Missed grabbing important parts of the nvcxx-compatibility branch whe…
wmaxey Jul 24, 2021
03641c9
Pickup more nv/target specializations from nvcxx_compatibility branch
wmaxey Jul 24, 2021
1b64243
Uglify the atomic detail:: and host:: namespaces
wmaxey Jul 27, 2021
0d31a92
Rename __skip_amt to __atomic_ptr_inc
wmaxey Jul 27, 2021
28a7a2f
Refactor and dedup some code in the __cxx_atomic cuda layer, fix runt…
wmaxey Jul 27, 2021
4060036
Fix set-but-not-used warnings for atomic intrinsics in atomic_base.h
wmaxey Jul 28, 2021
239151f
Fix static_assert in bad_atomic_alignment test.
wmaxey Jul 28, 2021
7db3ff2
Suppress pointless comparison warnings where tests are impacted by th…
wmaxey Jul 28, 2021
b219567
Add a missing license header to the atomic_c11.h file
wmaxey Jul 28, 2021
1e57f8a
Fix pointless comparison warnings on two other pipeline tests
wmaxey Jul 28, 2021
65ab07b
Wrap/Unwrap store, exchange, and load to make sure the 'written to' p…
wmaxey Jul 29, 2021
560a8c6
Change method of ensuring that atomic types match
wmaxey Jul 29, 2021
572c286
Fix spurious warnings in atomic_base.h
wmaxey Aug 2, 2021
c0c861f
Rename __to_gcc_order to __cxx_atomic_order_to_int
wmaxey Aug 3, 2021
c9e07dc
Reset barrier/latch/semaphore to head, as those will be nv/target'd l…
wmaxey Aug 4, 2021
917101a
Fix a few includes occuring within internal namespaces
wmaxey Aug 4, 2021
1d1b6bb
Create changelog for release 1.6.0
wmaxey Jul 30, 2021
5385867
Bump libcudacxx API version to 1.6.0
wmaxey Jul 30, 2021
8ff18aa
Add clearer notification about ABI break. Fix capitalization.
wmaxey Aug 4, 2021
25e386c
Revert "Re-add the __try_wait API as pipeline uses internal symbols o…
wmaxey Aug 4, 2021
4ca86fd
Update Release 1.6.0?
wmaxey Aug 4, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions .upstream-tests/test/cuda/bad_atomic_alignment.pass.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: libcpp-has-no-threads, pre-sm-60
// UNSUPPORTED: windows && pre-sm-70

// <cuda/atomic>

// cuda::atomic<key>

// Original test issue:
// https://github.com/NVIDIA/libcudacxx/issues/160

#include <cuda/atomic>

// Swallows its argument so otherwise-unused results do not trigger
// "set but not used" compiler diagnostics; always yields true and is
// usable in constant expressions on both host and device.
template <typename _Up>
__host__ __device__
constexpr bool unused(_Up&&)
{
    return true;
}

// Regression test for https://github.com/NVIDIA/libcudacxx/issues/160:
// cuda::atomic must work for user-defined aggregates regardless of whether
// their alignment is the natural one or forced wider with alignas.
int main(int argc, char ** argv)
{
    // Case 1: naturally (4-byte) aligned two-field aggregate.
    {
        struct key {
            int32_t a;
            int32_t b;
        };
        static_assert(alignof(key) == 4, "");
        cuda::atomic<key> atom;
        key observed = atom.load();
        atom.store(observed);
        (void)atom.exchange(observed);
        unused(observed);
    }
    // Case 2: the same aggregate forced to 8-byte alignment.
    {
        struct alignas(8) key {
            int32_t a;
            int32_t b;
        };
        static_assert(alignof(key) == 8, "");
        cuda::atomic<key> atom;
        key observed = atom.load();
        atom.store(observed);
        (void)atom.exchange(observed);
        unused(observed);
    }
    return 0;
}
2 changes: 2 additions & 0 deletions .upstream-tests/test/cuda/pipeline_arrive_on.pass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@

// Remove after bump to version 4
#define _LIBCUDACXX_CUDA_ABI_VERSION 3
// TODO: Remove pointless comparison suppression when compiler fixes short-circuiting
#pragma nv_diag_suppress 186

#pragma nv_diag_suppress static_var_with_dynamic_init
#pragma nv_diag_suppress declared_but_not_referenced
Expand Down
3 changes: 3 additions & 0 deletions .upstream-tests/test/cuda/pipeline_arrive_on_abi_v2.pass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@

#define _LIBCUDACXX_CUDA_ABI_VERSION 2

// TODO: Remove pointless comparison suppression when compiler fixes short-circuiting
#pragma nv_diag_suppress 186

#pragma nv_diag_suppress static_var_with_dynamic_init
#pragma nv_diag_suppress declared_but_not_referenced

Expand Down
3 changes: 3 additions & 0 deletions .upstream-tests/test/cuda/pipeline_group_concept.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@

// UNSUPPORTED: pre-sm-70

// TODO: Remove pointless comparison suppression when compiler fixes short-circuiting
#pragma nv_diag_suppress 186

#include <cuda/pipeline>

template <typename T_size, typename T_thread_rank>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@ struct TestFn {
__host__ __device__
void operator()() const {
typedef cuda::std::atomic<T> A;
A t;
A t{};
bool b1 = cuda::std::atomic_is_lock_free(static_cast<const A*>(&t));
volatile A vt;
volatile A vt{};
bool b2 = cuda::std::atomic_is_lock_free(static_cast<const volatile A*>(&vt));
assert(b1 == b2);
}
Expand Down
2 changes: 2 additions & 0 deletions .upstream-tests/test/std/utilities/time/time.cal/euclidian.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//

#pragma nv_diag_suppress 186

#include <type_traits>


Expand Down
238 changes: 133 additions & 105 deletions codegen/codegen.cpp

Large diffs are not rendered by default.

27 changes: 26 additions & 1 deletion docs/releases/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,34 @@ It pulls in the latest version of upstream libc++ and marks the beginning of

!-->

## libcu++ 1.6.0 (CUDA Toolkit 11.5)

libcu++ 1.6.0 is a major release. It adds two new functions to the `cuda::std::barrier` API and
uses `<nv/target>` as the primary dispatch mechanism for `cuda::std::atomic`.

This release introduces ABI version 4, which is now the default.

Supported ABI Versions: 4 (default), 3, and 2.

Included in: CUDA Toolkit 11.5.

### Issues Fixed

- #179: Refactors the atomic layer to allow for layering the device/host abstractions.
- #189: Changed pragmas for silencing chrono long double warnings.
- #186: Allows `<nv/target>` to be used under NVRTC.
- #177: Allows `<nv/target>` to build when compiled under C and C++98.
- Thanks to David Olsen for this contribution.
- #172: Introduces ABI version 4.
- Forces `cuda::std::complex` alignment for enhanced performance.
- Sets the internal representation of `cuda::std::chrono` literals to `double`.
- #165: For tests on some older distributions keep using Python 3, but downgrade lit.
- #164: Fixes testing issues related to Python 2/3 switch for lit.
- Thanks to Royil Damer for this contribution.

## libcu++ 1.5.0 (CUDA Toolkit 11.4)

libcu++ 1.5.0 is a major release. It adds `<nv/target>`,
libcu++ 1.5.0 is a major release. It adds `<nv/target>`,
the library support header for the new `if target`
target specialization mechanism.

Expand Down
79 changes: 48 additions & 31 deletions include/cuda/std/atomic
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
#undef ATOMIC_VAR_INIT
#endif //__CUDACC_RTC__

#include "cassert"
#include "cstddef"
#include "cstdint"
#include "type_traits"
Expand All @@ -46,16 +47,26 @@

#include "detail/__pragma_push"

#include "detail/__atomic"
#include "detail/__threading_support"

#undef _LIBCUDACXX_HAS_GCC_ATOMIC_IMP
#undef _LIBCUDACXX_HAS_C_ATOMIC_IMP

#include "detail/libcxx/include/atomic"

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA

using std::__detail::thread_scope;
using std::__detail::thread_scope_system;
using std::__detail::thread_scope_device;
using std::__detail::thread_scope_block;
using std::__detail::thread_scope_thread;

namespace __detail {
using std::__detail::__thread_scope_block_tag;
using std::__detail::__thread_scope_device_tag;
using std::__detail::__thread_scope_system_tag;
using std::__detail::__atomic_signal_fence_cuda;
using std::__detail::__atomic_thread_fence_cuda;
}

using memory_order = std::memory_order;

constexpr memory_order memory_order_relaxed = std::memory_order_relaxed;
Expand All @@ -67,7 +78,7 @@ constexpr memory_order memory_order_seq_cst = std::memory_order_seq_cst;

// atomic<T>

template <class _Tp, thread_scope _Sco = thread_scope_system>
template <class _Tp, thread_scope _Sco = thread_scope::thread_scope_system>
struct atomic
: public std::__atomic_base<_Tp, _Sco>
{
Expand All @@ -87,15 +98,15 @@ struct atomic
__host__ __device__
_Tp fetch_max(const _Tp & __op, memory_order __m = memory_order_seq_cst) volatile noexcept
{
return detail::__atomic_fetch_max_cuda(&this->__a_.__a_value, __op,
__m, detail::__scope_tag<_Sco>());
return std::__detail::__atomic_fetch_max_cuda(&this->__a_.__a_value, __op,
__m, std::__detail::__scope_tag<_Sco>());
}

__host__ __device__
_Tp fetch_min(const _Tp & __op, memory_order __m = memory_order_seq_cst) volatile noexcept
{
return detail::__atomic_fetch_min_cuda(&this->__a_.__a_value, __op,
__m, detail::__scope_tag<_Sco>());
return std::__detail::__atomic_fetch_min_cuda(&this->__a_.__a_value, __op,
__m, std::__detail::__scope_tag<_Sco>());
}
};

Expand Down Expand Up @@ -159,31 +170,37 @@ struct atomic<_Tp*, _Sco>
_Tp* operator-=(ptrdiff_t __op) noexcept {return fetch_sub(__op) - __op;}
};

inline __host__ __device__ void atomic_thread_fence(memory_order __m, thread_scope _Scope = thread_scope_system) {
#ifdef __CUDA_ARCH__
switch(_Scope) {
case thread_scope_system:
detail::__atomic_thread_fence_cuda((int)__m, detail::__thread_scope_system_tag());
break;
case thread_scope_device:
detail::__atomic_thread_fence_cuda((int)__m, detail::__thread_scope_device_tag());
break;
case thread_scope_block:
detail::__atomic_thread_fence_cuda((int)__m, detail::__thread_scope_block_tag());
break;
}
#else
(void) _Scope;
::std::atomic_thread_fence((::std::memory_order)__m);
#endif
// Establishes a memory-ordering fence of strength __m at the requested
// thread scope (defaults to system-wide). Dispatches via <nv/target>:
// the device path maps the runtime scope enum onto the compile-time tag
// types expected by __atomic_thread_fence_cuda; the host path ignores the
// scope and forwards to the standard library's atomic_thread_fence.
inline __host__ __device__ void atomic_thread_fence(memory_order __m, thread_scope _Scope = thread_scope::thread_scope_system) {
NV_DISPATCH_TARGET(
NV_IS_DEVICE, (
// Translate the runtime scope value into a tag-dispatched call.
switch(_Scope) {
case thread_scope::thread_scope_system:
__detail::__atomic_thread_fence_cuda((int)__m, __detail::__thread_scope_system_tag());
break;
case thread_scope::thread_scope_device:
__detail::__atomic_thread_fence_cuda((int)__m, __detail::__thread_scope_device_tag());
break;
case thread_scope::thread_scope_block:
__detail::__atomic_thread_fence_cuda((int)__m, __detail::__thread_scope_block_tag());
break;
// NOTE(review): thread_scope_thread has no case, so a thread-scoped
// fence emits nothing on device — presumably intentional (a single
// thread needs no inter-thread fence), but worth confirming.
}
),
NV_IS_HOST, (
// Host execution has a single scope; only the ordering matters.
(void) _Scope;
::std::atomic_thread_fence((::std::memory_order)__m);
)
)
}

// Compiler-only fence with ordering __m (no hardware fence between
// threads). Dispatches via <nv/target>: the device path calls the CUDA
// detail implementation, the host path forwards to the standard library's
// atomic_signal_fence.
inline __host__ __device__ void atomic_signal_fence(memory_order __m) {
NV_DISPATCH_TARGET(
NV_IS_DEVICE, (
__detail::__atomic_signal_fence_cuda((int)__m);
),
NV_IS_HOST, (
::std::atomic_signal_fence((::std::memory_order)__m);
)
)
}

_LIBCUDACXX_END_NAMESPACE_CUDA
Expand Down
Loading