diff --git a/libc/src/__support/threads/CMakeLists.txt b/libc/src/__support/threads/CMakeLists.txt
index 9ea0b59befe7ac..d2e46b8e2574e9 100644
--- a/libc/src/__support/threads/CMakeLists.txt
+++ b/libc/src/__support/threads/CMakeLists.txt
@@ -10,6 +10,15 @@ add_header_library(
     sleep.h
 )
 
+add_header_library(
+  spin_lock
+  HDRS
+    spin_lock.h
+  DEPENDS
+    .sleep
+    libc.src.__support.CPP.atomic
+)
+
 if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS})
   add_subdirectory(${LIBC_TARGET_OS})
 endif()
diff --git a/libc/src/__support/threads/spin_lock.h b/libc/src/__support/threads/spin_lock.h
new file mode 100644
index 00000000000000..8a365505684644
--- /dev/null
+++ b/libc/src/__support/threads/spin_lock.h
@@ -0,0 +1,97 @@
+//===-- TTAS Spin Lock ----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_THREADS_SPIN_LOCK_H
+#define LLVM_LIBC_SRC___SUPPORT_THREADS_SPIN_LOCK_H
+
+#include "src/__support/CPP/atomic.h"
+#include "src/__support/macros/attributes.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/architectures.h"
+#include "src/__support/threads/sleep.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace spinlock {
+// Pointer-to-member type of the cpp::Atomic<LockWord> operations used to
+// acquire and release the lock.
+template <typename LockWord, typename Return>
+using AtomicOp = Return (cpp::Atomic<LockWord>::*)(LockWord, cpp::MemoryOrder,
+                                                   cpp::MemoryScope);
+} // namespace spinlock
+
+// A TTAS (test and test-and-set) lock, parameterized over the lock word type
+// and the atomic operations used to acquire and release the lock. Default
+// arguments are not available when calling through pointers to members, so
+// the memory scope is passed explicitly at each call site.
+template <typename LockWord, typename Return,
+          spinlock::AtomicOp<LockWord, LockWord> Acquire,
+          spinlock::AtomicOp<LockWord, Return> Release>
+class SpinLockAdaptor {
+  cpp::Atomic<LockWord> flag;
+
+public:
+  LIBC_INLINE constexpr SpinLockAdaptor() : flag{false} {}
+  LIBC_INLINE bool try_lock() {
+    return !(flag.*Acquire)(static_cast<LockWord>(1),
+                            cpp::MemoryOrder::ACQUIRE,
+                            cpp::MemoryScope::DEVICE);
+  }
+  LIBC_INLINE void lock() {
+    // clang-format off
+    // For normal TTAS, this compiles to the following on armv9a and x86_64:
+    //         mov     w8, #1           | .LBB0_1:
+    // .LBB0_1:                         |         mov     al, 1
+    //         swpab   w8, w9, [x0]     |         xchg    byte ptr [rdi], al
+    //         tbnz    w9, #0, .LBB0_3  |         test    al, 1
+    //         b       .LBB0_4          |         jne     .LBB0_3
+    // .LBB0_2:                         |         jmp     .LBB0_4
+    //         isb                      | .LBB0_2:
+    // .LBB0_3:                         |         pause
+    //         ldrb    w9, [x0]         | .LBB0_3:
+    //         tbnz    w9, #0, .LBB0_2  |         movzx   eax, byte ptr [rdi]
+    //         b       .LBB0_1          |         test    al, 1
+    // .LBB0_4:                         |         jne     .LBB0_2
+    //         ret                      |         jmp     .LBB0_1
+    //                                  | .LBB0_4:
+    //                                  |         ret
+    // clang-format on
+    // Notice that inside the busy loop .LBB0_2/.LBB0_3, only instructions
+    // with load semantics are used; swpab/xchg is issued only in the outer
+    // loop .LBB0_1. This avoids extra write traffic while the lock is held
+    // by another thread. Cache coherence guarantees "write propagation", so
+    // even though the inner loop reads with relaxed ordering, this thread
+    // will eventually observe the unlocking write.
+    while (!try_lock())
+      while (flag.load(cpp::MemoryOrder::RELAXED))
+        sleep_briefly();
+  }
+  LIBC_INLINE void unlock() {
+    (flag.*Release)(static_cast<LockWord>(0), cpp::MemoryOrder::RELEASE,
+                    cpp::MemoryScope::DEVICE);
+  }
+};
+
+// It is reported that atomic operations with higher-level semantics
+// (fetch_or/fetch_and rather than exchange/store) lead to better
+// performance on GPUs.
+#ifdef LIBC_TARGET_ARCH_IS_GPU
+using SpinLock =
+    SpinLockAdaptor<unsigned int, unsigned int,
+                    &cpp::Atomic<unsigned int>::fetch_or,
+                    &cpp::Atomic<unsigned int>::fetch_and>;
+#else
+using SpinLock =
+    SpinLockAdaptor<unsigned char, void,
+                    &cpp::Atomic<unsigned char>::exchange,
+                    &cpp::Atomic<unsigned char>::store>;
#endif
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_THREADS_SPIN_LOCK_H
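For readers outside the libc tree, the locking discipline in `lock()` above is plain TTAS and can be reproduced with standard C++ atomics. Below is a minimal sketch assuming only `<atomic>`; the name `TtasLock` is illustrative and not part of this patch:

```cpp
#include <atomic>

// Minimal TTAS ("test and test-and-set") lock. The outer exchange is the
// test-and-set; the inner relaxed-load loop is the test that spins without
// generating write traffic.
class TtasLock {
  std::atomic<unsigned char> flag{0};

public:
  bool try_lock() {
    // exchange returns the previous value: 0 means we took the lock.
    return !flag.exchange(1, std::memory_order_acquire);
  }
  void lock() {
    while (!try_lock())
      while (flag.load(std::memory_order_relaxed)) {
        // Spin on loads only; retry the exchange once the lock looks free.
      }
  }
  void unlock() { flag.store(0, std::memory_order_release); }
};
```

The acquire ordering on the exchange pairs with the release ordering on the store, so everything written inside the critical section is visible to the next thread that acquires the lock.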
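Usage of the new `SpinLock` is the usual `lock()`/`unlock()` pairing. A hypothetical call site, with illustrative names that are not part of this patch:

```cpp
#include "src/__support/threads/spin_lock.h"

namespace LIBC_NAMESPACE_DECL {

// Hypothetical example: serialize updates to a shared counter.
static SpinLock counter_lock;
static int counter = 0;

int bump_counter() {
  counter_lock.lock();
  int result = ++counter;
  counter_lock.unlock();
  return result;
}

} // namespace LIBC_NAMESPACE_DECL
```

A scoped RAII guard would remove the risk of a missed `unlock()` on an early return; plain calls are shown here only to keep the sketch minimal.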