From 6e72dc6f676fe06bffb85fc572f1bbfe7f711efa Mon Sep 17 00:00:00 2001 From: Olivier Giroux Date: Tue, 9 Mar 2021 08:40:04 -0800 Subject: [PATCH] Added try_wait options --- include/cuda/std/barrier | 19 +++++++++++------ libcxx/include/barrier | 46 ++++++++++++++++++++++++---------------- 2 files changed, 40 insertions(+), 25 deletions(-) diff --git a/include/cuda/std/barrier b/include/cuda/std/barrier index 781be24296..f9f31b93ab 100644 --- a/include/cuda/std/barrier +++ b/include/cuda/std/barrier @@ -40,8 +40,6 @@ class barrier : public std::__barrier_base<_CompletionF, _Sco> { template friend class pipeline; - using std::__barrier_base<_CompletionF, _Sco>::__try_wait; - public: barrier() = default; @@ -77,6 +75,13 @@ _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE _LIBCUDACXX_BEGIN_NAMESPACE_CUDA +template +inline _LIBCUDACXX_INLINE_VISIBILITY +bool barrier_try_wait_parity(__Barrier const* __this, bool __parity) +{ + return __this->__try_wait_parity(__parity); +} + template struct __barrier_poll_tester_parity { __Barrier const* __this; @@ -91,15 +96,15 @@ struct __barrier_poll_tester_parity { inline _LIBCUDACXX_INLINE_VISIBILITY bool operator()() const { - return __this->__try_wait_parity(__parity); + return barrier_try_wait_parity(__this, __parity); } }; template inline _LIBCUDACXX_INLINE_VISIBILITY -void barrier_wait_for_parity(__Barrier const* __self, bool __parity) +void barrier_wait_parity(__Barrier const* __this, bool __parity) { - _CUDA_VSTD::__libcpp_thread_poll_with_backoff(__barrier_poll_tester_parity<__Barrier>(__self, __parity)); + _CUDA_VSTD::__libcpp_thread_poll_with_backoff(__barrier_poll_tester_parity<__Barrier>(__this, __parity)); } template<> @@ -114,7 +119,7 @@ public: using arrival_token = typename __barrier_base::arrival_token; _LIBCUDACXX_INLINE_VISIBILITY - bool __try_wait(arrival_token __phase) const { + bool try_wait(arrival_token __phase) const { #if __CUDA_ARCH__ >= 800 if (__isShared(&__barrier)) { int __ready = 0; @@ -131,7 +136,7 @@ public: else #endif { - return __barrier.__try_wait(std::move(__phase)); + return __barrier.try_wait(std::move(__phase)); } } diff --git a/libcxx/include/barrier b/libcxx/include/barrier index 9b4a240894..125fa6625d 100644 --- a/libcxx/include/barrier +++ b/libcxx/include/barrier @@ -209,6 +209,12 @@ class __barrier_base { _LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_base __expected, __arrived; _LIBCUDACXX_BARRIER_ALIGNMENTS _CompletionF __completion; _LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_base __phase; + + _LIBCUDACXX_INLINE_VISIBILITY + bool __try_wait_phase(bool __old_phase) const + { + return __phase.load(memory_order_acquire) != __old_phase; + } public: using arrival_token = bool; @@ -241,11 +247,15 @@ public: return __old_phase; } _LIBCUDACXX_INLINE_VISIBILITY - bool __try_wait(arrival_token __old_phase) const + bool try_wait(arrival_token __old) const { - return __phase != __old_phase; + return __try_wait_phase(__old); + } + _LIBCUDACXX_INLINE_VISIBILITY + bool __try_wait_parity(bool __parity) const + { + return __try_wait_phase(__parity); } - _LIBCUDACXX_INLINE_VISIBILITY void wait(arrival_token&& __old_phase) const { @@ -281,10 +291,10 @@ struct __barrier_poll_tester { , __phase(_CUDA_VSTD::move(__phase_)) {} - inline _LIBCUDACXX_INLINE_VISIBILITY + _LIBCUDACXX_INLINE_VISIBILITY bool operator()() const { - return __this->__try_wait(__phase); + return __this->try_wait(__phase); } }; @@ -303,12 +313,18 @@ public: using arrival_token = uint64_t; private: - static inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR + static _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR uint64_t __init(ptrdiff_t __count) _NOEXCEPT { return (((1u << 31) - __count) << 32) | ((1u << 31) - __count); } + _LIBCUDACXX_INLINE_VISIBILITY + bool __try_wait_phase(uint64_t __phase) const + { + uint64_t const __current = __phase_arrived_expected.load(memory_order_acquire); + return ((__current & __phase_bit) != __phase); + } public: __barrier_base() = default; @@ -323,19 +339,13 @@ public: __barrier_base(__barrier_base const&) = delete; __barrier_base& operator=(__barrier_base const&) = delete; - inline _LIBCUDACXX_INLINE_VISIBILITY - bool __try_wait_phase(uint64_t __phase) const - { - uint64_t const __current = __phase_arrived_expected.load(memory_order_acquire); - return ((__current & __phase_bit) != __phase); - } - inline _LIBCUDACXX_INLINE_VISIBILITY + _LIBCUDACXX_INLINE_VISIBILITY bool __try_wait_parity(bool __parity) const { return __try_wait_phase(__parity ? __phase_bit : 0); } - inline _LIBCUDACXX_INLINE_VISIBILITY - bool __try_wait(arrival_token __old) const + _LIBCUDACXX_INLINE_VISIBILITY + bool try_wait(arrival_token __old) const { return __try_wait_phase(__old & __phase_bit); } @@ -351,17 +361,17 @@ public: } return __old & __phase_bit; } - inline _LIBCUDACXX_INLINE_VISIBILITY + _LIBCUDACXX_INLINE_VISIBILITY void wait(arrival_token&& __phase) const { __libcpp_thread_poll_with_backoff(__barrier_poll_tester<__barrier_base<__empty_completion, _Sco>>(this, _CUDA_VSTD::move(__phase))); } - inline _LIBCUDACXX_INLINE_VISIBILITY + _LIBCUDACXX_INLINE_VISIBILITY void arrive_and_wait() { wait(arrive()); } - inline _LIBCUDACXX_INLINE_VISIBILITY + _LIBCUDACXX_INLINE_VISIBILITY void arrive_and_drop() { __phase_arrived_expected.fetch_add(__expected_unit, memory_order_relaxed);