Skip to content

Commit

Permalink
Update spin-lock implementation to use incremental back-off strategy …
Browse files Browse the repository at this point in the history
…and be single-CPU friendly(ish) (#443)
  • Loading branch information
jsuereth authored Jan 5, 2021
1 parent c786881 commit 3cb5d26
Show file tree
Hide file tree
Showing 3 changed files with 182 additions and 3 deletions.
28 changes: 25 additions & 3 deletions api/include/opentelemetry/common/spin_lock_mutex.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
# ifndef NOMINMAX
# define NOMINMAX
# endif
# include <Windows.h>
# include <windows.h>
#elif defined(__i386__) || defined(__x86_64__)
# if defined(__clang__)
# include <emmintrin.h>
Expand All @@ -35,6 +35,14 @@ constexpr int SPINLOCK_SLEEP_MS = 1;
 * This is meant to give a good balance of performance and CPU consumption in
* practice.
*
* This mutex uses an incremental back-off strategy with the following phases:
* 1. A tight spin-lock loop (pending: using hardware PAUSE/YIELD instructions)
* 2. A loop where the current thread yields control after checking the lock.
* 3. Issuing a thread-sleep call before starting back in phase 1.
*
 * This is meant to give a good balance of performance and CPU consumption in
* practice.
*
* This class implements the `BasicLockable` specification:
* https://en.cppreference.com/w/cpp/named_req/BasicLockable
*/
Expand Down Expand Up @@ -79,8 +87,22 @@ class SpinLockMutex
{
return;
}
// TODO: Issue PAUSE/YIELD instruction to reduce contention.
// e.g. __builtin_ia32_pause() / YieldProcessor() / _mm_pause();
// Issue a Pause/Yield instruction while spinning.
#if defined(_MSC_VER)
YieldProcessor();
#elif defined(__i386__) || defined(__x86_64__)
# if defined(__clang__)
_mm_pause();
# else
__builtin_ia32_pause();
# endif
#elif defined(__arm__)
// This intrinsic should fail to be found if YIELD is not supported on the current
// processor.
__yield();
#else
// TODO: Issue PAUSE/YIELD on other architectures.
#endif
}
// Yield then try again (goal ~100ns)
std::this_thread::yield();
Expand Down
7 changes: 7 additions & 0 deletions api/test/common/BUILD
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
load("//bazel:otel_cc_benchmark.bzl", "otel_cc_benchmark")

# Micro-benchmark that thrashes SpinLockMutex from many threads.
# NOTE(review): presumably otel_cc_benchmark wraps cc_binary and adds the
# google-benchmark dependency — confirm in //bazel:otel_cc_benchmark.bzl.
otel_cc_benchmark(
    name = "spinlock_benchmark",
    srcs = ["spinlock_benchmark.cc"],
    deps = ["//api"],
)
150 changes: 150 additions & 0 deletions api/test/common/spinlock_benchmark.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
#include "opentelemetry/common/spin_lock_mutex.h"

#include <benchmark/benchmark.h>

#include <atomic>
#include <cstdint>
#include <mutex>
#include <thread>
#include <vector>

namespace
{
using opentelemetry::common::SpinLockMutex;

constexpr int TightLoopLocks = 10000;

// Runs a thrash-test where we spin up N threads, each of which will
// attempt to lock-mutate-unlock a total of `TightLoopLocks` times.
//
// s: benchmark state; s.range(0) selects the number of threads.
// spinlock: the lock object shared by all threads.
// lock: A lambda denoting how to lock. Accepts a reference to `SpinLockType`.
// unlock: A lambda denoting how to unlock. Accepts a reference to `SpinLockType`.
template <typename SpinLockType, typename LockF, typename UnlockF>
inline void SpinThrash(benchmark::State &s, SpinLockType &spinlock, LockF lock, UnlockF unlock)
{
  auto num_threads = s.range(0);
  // Value we will increment, fighting over a spinlock.
  // The contention is meant to be brief, as close to our expected
  // use cases of "updating pointers" or "pushing an event onto a buffer".
  std::int64_t value = 0;

  std::vector<std::thread> threads;
  threads.reserve(num_threads);

  // Timing loop
  for (auto _ : s)
  {
    for (auto i = 0; i < num_threads; i++)
    {
      threads.emplace_back([&] {
        // Increment value once each time the lock is acquired. Spin a few times
        // to ensure maximum thread contention. (Loop variable renamed to `j`
        // so it no longer shadows the captured outer `i`.)
        for (int j = 0; j < TightLoopLocks; j++)
        {
          lock(spinlock);
          value++;
          unlock(spinlock);
        }
      });
    }
    // Join threads
    for (auto &thread : threads)
      thread.join();
    threads.clear();
  }
  // Keep the contended counter observable so the compiler cannot treat the
  // guarded increments as dead stores and elide them.
  benchmark::DoNotOptimize(value);
}

// Benchmark of full spin-lock implementation.
static void BM_SpinLockThrashing(benchmark::State &s)
{
  SpinLockMutex spinlock;
  auto acquire = [](SpinLockMutex &m) { m.lock(); };
  auto release = [](SpinLockMutex &m) { m.unlock(); };
  SpinThrash(s, spinlock, acquire, release);
}

// Naive `while(try_lock()) {}` implementation of lock.
static void BM_NaiveSpinLockThrashing(benchmark::State &s)
{
  SpinLockMutex spinlock;
  // Busy-wait on try_lock with no pause/yield of any kind.
  auto busy_acquire = [](SpinLockMutex &m) {
    while (!m.try_lock())
      ;
  };
  auto release = [](SpinLockMutex &m) { m.unlock(); };
  SpinThrash(s, spinlock, busy_acquire, release);
}

// Simple `while(try_lock()) { yield-processor }`
// Measures a spin strategy that issues a CPU-level pause/yield hint between
// try_lock attempts but never yields the thread to the OS scheduler.
static void BM_ProcYieldSpinLockThrashing(benchmark::State &s)
{
  SpinLockMutex spinlock;
  SpinThrash<SpinLockMutex>(
      s, spinlock,
      [](SpinLockMutex &m) {
        while (!m.try_lock())
        {
// Platform-specific spin hint: MSVC's YieldProcessor macro, the x86
// PAUSE intrinsic (clang vs gcc spellings), or ARM's __yield().
#if defined(_MSC_VER)
          YieldProcessor();
#elif defined(__i386__) || defined(__x86_64__)
# if defined(__clang__)
          _mm_pause();
# else
          __builtin_ia32_pause();
# endif
#elif defined(__arm__)
          // NOTE(review): presumably __yield() is only available on ACLE-capable
          // ARM toolchains and fails to compile elsewhere — confirm.
          __yield();
#endif
          // No #else branch: other architectures fall back to a bare spin.
        }
      },
      [](SpinLockMutex &m) { m.unlock(); });
}

// SpinLock thrashing with thread::yield() after N spins.
// Lock strategy: fast-path exchange; then a bounded spin of cheap loads
// followed by an exchange; then a scheduler yield — and RETRY. The original
// version fell through after yield() and returned WITHOUT acquiring the
// lock, so unlock() released a lock it never held and the protected counter
// raced; the outer retry loop fixes that.
static void BM_ThreadYieldSpinLockThrashing(benchmark::State &s)
{
  std::atomic<bool> mutex(false);
  SpinThrash<std::atomic<bool>>(
      s, mutex,
      [](std::atomic<bool> &l) {
        for (;;)
        {
          // Fast path: uncontended acquire.
          if (!l.exchange(true, std::memory_order_acquire))
          {
            return;
          }
          // Bounded spin: read-only probe first so contended spinning does
          // not ping-pong the cache line; only exchange when it looks free.
          for (std::size_t i = 0; i < 100; ++i)
          {
            if (!l.load(std::memory_order_acquire) && !l.exchange(true, std::memory_order_acquire))
            {
              return;
            }
          }
          // Spin budget exhausted: give up the time slice, then retry.
          std::this_thread::yield();
        }
      },
      [](std::atomic<bool> &l) { l.store(false, std::memory_order_release); });
}

// Register each benchmark across a range of thread counts, doubling from 1
// up to std::thread::hardware_concurrency(), measuring both wall time and
// process CPU time in milliseconds.
// NOTE(review): hardware_concurrency() may return 0 on some platforms,
// which would make Range(1, 0) degenerate — confirm Benchmark's handling.
BENCHMARK(BM_SpinLockThrashing)
    ->RangeMultiplier(2)
    ->Range(1, std::thread::hardware_concurrency())
    ->MeasureProcessCPUTime()
    ->UseRealTime()
    ->Unit(benchmark::kMillisecond);
BENCHMARK(BM_ProcYieldSpinLockThrashing)
    ->RangeMultiplier(2)
    ->Range(1, std::thread::hardware_concurrency())
    ->MeasureProcessCPUTime()
    ->UseRealTime()
    ->Unit(benchmark::kMillisecond);
BENCHMARK(BM_NaiveSpinLockThrashing)
    ->RangeMultiplier(2)
    ->Range(1, std::thread::hardware_concurrency())
    ->MeasureProcessCPUTime()
    ->UseRealTime()
    ->Unit(benchmark::kMillisecond);
BENCHMARK(BM_ThreadYieldSpinLockThrashing)
    ->RangeMultiplier(2)
    ->Range(1, std::thread::hardware_concurrency())
    ->MeasureProcessCPUTime()
    ->UseRealTime()
    ->Unit(benchmark::kMillisecond);

} // namespace

BENCHMARK_MAIN();

0 comments on commit 3cb5d26

Please sign in to comment.