diff --git a/api/include/opentelemetry/common/spin_lock_mutex.h b/api/include/opentelemetry/common/spin_lock_mutex.h
index 48c7ecbeff..cf47a4fd07 100644
--- a/api/include/opentelemetry/common/spin_lock_mutex.h
+++ b/api/include/opentelemetry/common/spin_lock_mutex.h
@@ -10,7 +10,7 @@
 # ifndef NOMINMAX
 #   define NOMINMAX
 # endif
-# include <windows.h>
+# include <Windows.h>
 #elif defined(__i386__) || defined(__x86_64__)
 # if defined(__clang__)
 #  include <emmintrin.h>
@@ -35,6 +35,11 @@ constexpr int SPINLOCK_SLEEP_MS = 1;
- * This is meant to give a good balance of perofrmance and CPU consumption in
+ * This mutex uses an incremental back-off strategy with the following phases:
+ * 1. A tight spin-lock loop (pending: using hardware PAUSE/YIELD instructions)
+ * 2. A loop where the current thread yields control after checking the lock.
+ * 3. Issuing a thread-sleep call before starting back in phase 1.
+ *
+ * This is meant to give a good balance of performance and CPU consumption in
  * practice.
  *
  * This class implements the `BasicLockable` specification:
  * https://en.cppreference.com/w/cpp/named_req/BasicLockable
  */
@@ -79,8 +84,22 @@
       {
         return;
       }
-      // TODO: Issue PAUSE/YIELD instruction to reduce contention.
-      // e.g. __builtin_ia32_pause() / YieldProcessor() / _mm_pause();
+// Issue a Pause/Yield instruction while spinning.
+#if defined(_MSC_VER)
+      YieldProcessor();
+#elif defined(__i386__) || defined(__x86_64__)
+#  if defined(__clang__)
+      _mm_pause();
+#  else
+      __builtin_ia32_pause();
+#  endif
+#elif defined(__arm__)
+      // This intrinsic should fail to be found if YIELD is not supported on the current
+      // processor.
+      __yield();
+#else
+      // TODO: Issue PAUSE/YIELD on other architectures.
+#endif
     }
     // Yield then try again (goal ~100ns)
     std::this_thread::yield();
diff --git a/api/test/common/BUILD b/api/test/common/BUILD
new file mode 100644
index 0000000000..3297a9565c
--- /dev/null
+++ b/api/test/common/BUILD
@@ -0,0 +1,7 @@
+load("//bazel:otel_cc_benchmark.bzl", "otel_cc_benchmark")
+
+otel_cc_benchmark(
+    name = "spinlock_benchmark",
+    srcs = ["spinlock_benchmark.cc"],
+    deps = ["//api"],
+)
diff --git a/api/test/common/spinlock_benchmark.cc b/api/test/common/spinlock_benchmark.cc
new file mode 100644
index 0000000000..3291fad06c
--- /dev/null
+++ b/api/test/common/spinlock_benchmark.cc
@@ -0,0 +1,150 @@
+#include "opentelemetry/common/spin_lock_mutex.h"
+
+#include <benchmark/benchmark.h>
+#include <cstdint>
+#include <thread>
+#include <vector>
+
+namespace
+{
+using opentelemetry::common::SpinLockMutex;
+
+constexpr int TightLoopLocks = 10000;
+
+// Runs a thrash-test where we spin up N threads, each of which will
+// attempt to lock-mutate-unlock a total of `TightLoopLocks` times.
+//
+// lock: A lambda denoting how to lock.  Accepts a reference to `SpinLockType`.
+// unlock: A lambda denoting how to unlock.  Accepts a reference to `SpinLockType`.
+template <typename SpinLockType, typename LockF, typename UnlockF>
+inline void SpinThrash(benchmark::State &s, SpinLockType &spinlock, LockF lock, UnlockF unlock)
+{
+  auto num_threads = s.range(0);
+  // Value we will increment, fighting over a spinlock.
+  // The contention is meant to be brief, as close to our expected
+  // use cases of "updating pointers" or "pushing an event onto a buffer".
+  std::int64_t value = 0;
+
+  std::vector<std::thread> threads;
+  threads.reserve(num_threads);
+
+  // Timing loop
+  for (auto _ : s)
+  {
+    for (auto i = 0; i < num_threads; i++)
+    {
+      threads.emplace_back([&] {
+        // Increment value once each time the lock is acquired.  Spin a few times
+        // to ensure maximum thread contention.
+        for (int i = 0; i < TightLoopLocks; i++)
+        {
+          lock(spinlock);
+          value++;
+          unlock(spinlock);
+        }
+      });
+    }
+    // Join threads
+    for (auto &thread : threads)
+      thread.join();
+    threads.clear();
+  }
+}
+
+// Benchmark of full spin-lock implementation.
+static void BM_SpinLockThrashing(benchmark::State &s)
+{
+  SpinLockMutex spinlock;
+  SpinThrash(
+      s, spinlock, [](SpinLockMutex &m) { m.lock(); }, [](SpinLockMutex &m) { m.unlock(); });
+}
+
+// Naive `while(!try_lock()) {}` implementation of lock.
+static void BM_NaiveSpinLockThrashing(benchmark::State &s)
+{
+  SpinLockMutex spinlock;
+  SpinThrash(
+      s, spinlock,
+      [](SpinLockMutex &m) {
+        while (!m.try_lock())
+        {
+        }
+      },
+      [](SpinLockMutex &m) { m.unlock(); });
+}
+
+// Simple `while(!try_lock()) { yield-processor }`
+static void BM_ProcYieldSpinLockThrashing(benchmark::State &s)
+{
+  SpinLockMutex spinlock;
+  SpinThrash(
+      s, spinlock,
+      [](SpinLockMutex &m) {
+        while (!m.try_lock())
+        {
+#if defined(_MSC_VER)
+          YieldProcessor();
+#elif defined(__i386__) || defined(__x86_64__)
+#  if defined(__clang__)
+          _mm_pause();
+#  else
+          __builtin_ia32_pause();
+#  endif
+#elif defined(__arm__)
+          __yield();
+#endif
+        }
+      },
+      [](SpinLockMutex &m) { m.unlock(); });
+}
+
+// SpinLock thrashing with thread::yield() after N spins.
+static void BM_ThreadYieldSpinLockThrashing(benchmark::State &s)
+{
+  std::atomic<bool> mutex(false);
+  SpinThrash<std::atomic<bool>>(
+      s, mutex,
+      [](std::atomic<bool> &l) {
+        // Keep retrying until the lock is actually acquired: a bounded spin,
+        // then a scheduler yield, then start over.  (Returning without the
+        // lock held would race on `value` and corrupt the benchmark.)
+        while (true)
+        {
+          if (!l.exchange(true, std::memory_order_acquire))
+          {
+            return;
+          }
+          for (std::size_t i = 0; i < 100; ++i)
+          {
+            if (!l.load(std::memory_order_acquire) && !l.exchange(true, std::memory_order_acquire))
+            {
+              return;
+            }
+          }
+          std::this_thread::yield();
+        }
+      },
+      [](std::atomic<bool> &l) { l.store(false, std::memory_order_release); });
+}
+
+// Run the benchmarks up to the available hardware thread count and measure the
+// amount of time to thrash around.
+BENCHMARK(BM_SpinLockThrashing)
+    ->RangeMultiplier(2)
+    ->Range(1, std::thread::hardware_concurrency())
+    ->MeasureProcessCPUTime()
+    ->UseRealTime()
+    ->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_ProcYieldSpinLockThrashing)
+    ->RangeMultiplier(2)
+    ->Range(1, std::thread::hardware_concurrency())
+    ->MeasureProcessCPUTime()
+    ->UseRealTime()
+    ->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_NaiveSpinLockThrashing)
+    ->RangeMultiplier(2)
+    ->Range(1, std::thread::hardware_concurrency())
+    ->MeasureProcessCPUTime()
+    ->UseRealTime()
+    ->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_ThreadYieldSpinLockThrashing)
+    ->RangeMultiplier(2)
+    ->Range(1, std::thread::hardware_concurrency())
+    ->MeasureProcessCPUTime()
+    ->UseRealTime()
+    ->Unit(benchmark::kMillisecond);
+
+}  // namespace
+
+BENCHMARK_MAIN();