Skip to content

Commit

Permalink
Update spin-lock implementation to use incremental back-off strategy …
Browse files Browse the repository at this point in the history
…and be single-CPU friendly(ish) (#443)
  • Loading branch information
jsuereth authored Jan 5, 2021
1 parent c786881 commit 3cb5d26
Show file tree
Hide file tree
Showing 3 changed files with 182 additions and 3 deletions.
28 changes: 25 additions & 3 deletions api/include/opentelemetry/common/spin_lock_mutex.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
# ifndef NOMINMAX
# define NOMINMAX
# endif
# include <Windows.h>
# include <windows.h>
#elif defined(__i386__) || defined(__x86_64__)
# if defined(__clang__)
# include <emmintrin.h>
Expand All @@ -35,6 +35,14 @@ constexpr int SPINLOCK_SLEEP_MS = 1;
 * This is meant to give a good balance of performance and CPU consumption in
* practice.
*
* This mutex uses an incremental back-off strategy with the following phases:
* 1. A tight spin-lock loop (pending: using hardware PAUSE/YIELD instructions)
* 2. A loop where the current thread yields control after checking the lock.
* 3. Issuing a thread-sleep call before starting back in phase 1.
*
 * This is meant to give a good balance of performance and CPU consumption in
* practice.
*
* This class implements the `BasicLockable` specification:
* https://en.cppreference.com/w/cpp/named_req/BasicLockable
*/
Expand Down Expand Up @@ -79,8 +87,22 @@ class SpinLockMutex
{
return;
}
// TODO: Issue PAUSE/YIELD instruction to reduce contention.
// e.g. __builtin_ia32_pause() / YieldProcessor() / _mm_pause();
// Issue a Pause/Yield instruction while spinning.
#if defined(_MSC_VER)
YieldProcessor();
#elif defined(__i386__) || defined(__x86_64__)
# if defined(__clang__)
_mm_pause();
# else
__builtin_ia32_pause();
# endif
#elif defined(__arm__)
// This intrinsic should fail to be found if YIELD is not supported on the current
// processor.
__yield();
#else
// TODO: Issue PAUSE/YIELD on other architectures.
#endif
}
// Yield then try again (goal ~100ns)
std::this_thread::yield();
Expand Down
7 changes: 7 additions & 0 deletions api/test/common/BUILD
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
load("//bazel:otel_cc_benchmark.bzl", "otel_cc_benchmark")

# Micro-benchmark that thrashes SpinLockMutex from many threads.
# NOTE(review): presumably otel_cc_benchmark wraps cc_binary and adds the
# google-benchmark dependency — confirm in //bazel:otel_cc_benchmark.bzl.
otel_cc_benchmark(
    name = "spinlock_benchmark",
    srcs = ["spinlock_benchmark.cc"],
    deps = ["//api"],
)
150 changes: 150 additions & 0 deletions api/test/common/spinlock_benchmark.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
#include "opentelemetry/common/spin_lock_mutex.h"

#include <benchmark/benchmark.h>

#include <atomic>
#include <cstdint>
#include <mutex>
#include <thread>
#include <vector>

namespace
{
using opentelemetry::common::SpinLockMutex;

constexpr int TightLoopLocks = 10000;

// Runs a thrash-test where we spin up N threads, each of which will
// attempt to lock-mutate-unlock a total of `TightLoopLocks` times.
//
// s: benchmark state; s.range(0) selects the number of threads.
// spinlock: the lock object shared by all threads.
// lock: A lambda denoting how to lock. Accepts a reference to `SpinLockType`.
// unlock: A lambda denoting how to unlock. Accepts a reference to `SpinLockType`.
template <typename SpinLockType, typename LockF, typename UnlockF>
inline void SpinThrash(benchmark::State &s, SpinLockType &spinlock, LockF lock, UnlockF unlock)
{
  auto num_threads = s.range(0);
  // Value we will increment, fighting over a spinlock.
  // The contention is meant to be brief, as close to our expected
  // use cases of "updating pointers" or "pushing an event onto a buffer".
  std::int64_t value = 0;

  std::vector<std::thread> threads;
  threads.reserve(num_threads);

  // Timing loop
  for (auto _ : s)
  {
    for (auto i = 0; i < num_threads; i++)
    {
      threads.emplace_back([&] {
        // Increment value once each time the lock is acquired. Spin a few times
        // to ensure maximum thread contention. (Loop variable renamed to `j`
        // so it no longer shadows the captured outer `i`.)
        for (int j = 0; j < TightLoopLocks; j++)
        {
          lock(spinlock);
          value++;
          unlock(spinlock);
        }
      });
    }
    // Join threads
    for (auto &thread : threads)
      thread.join();
    threads.clear();
  }
  // Keep the contended counter observable so the compiler cannot treat the
  // guarded increments as dead stores and elide them.
  benchmark::DoNotOptimize(value);
}

// Benchmark of full spin-lock implementation.
static void BM_SpinLockThrashing(benchmark::State &s)
{
  SpinLockMutex spinlock;
  auto acquire = [](SpinLockMutex &m) { m.lock(); };
  auto release = [](SpinLockMutex &m) { m.unlock(); };
  SpinThrash(s, spinlock, acquire, release);
}

// Naive `while(try_lock()) {}` implementation of lock.
static void BM_NaiveSpinLockThrashing(benchmark::State &s)
{
  SpinLockMutex spinlock;
  // Busy-wait on try_lock with no pause/yield of any kind.
  auto busy_acquire = [](SpinLockMutex &m) {
    while (!m.try_lock())
      ;
  };
  auto release = [](SpinLockMutex &m) { m.unlock(); };
  SpinThrash(s, spinlock, busy_acquire, release);
}

// Simple `while(try_lock()) { yield-processor }`
// Measures a spin strategy that issues a CPU-level pause/yield hint between
// try_lock attempts but never yields the thread to the OS scheduler.
static void BM_ProcYieldSpinLockThrashing(benchmark::State &s)
{
  SpinLockMutex spinlock;
  SpinThrash<SpinLockMutex>(
      s, spinlock,
      [](SpinLockMutex &m) {
        while (!m.try_lock())
        {
// Platform-specific spin hint: MSVC's YieldProcessor macro, the x86
// PAUSE intrinsic (clang vs gcc spellings), or ARM's __yield().
#if defined(_MSC_VER)
          YieldProcessor();
#elif defined(__i386__) || defined(__x86_64__)
# if defined(__clang__)
          _mm_pause();
# else
          __builtin_ia32_pause();
# endif
#elif defined(__arm__)
          // NOTE(review): presumably __yield() is only available on ACLE-capable
          // ARM toolchains and fails to compile elsewhere — confirm.
          __yield();
#endif
          // No #else branch: other architectures fall back to a bare spin.
        }
      },
      [](SpinLockMutex &m) { m.unlock(); });
}

// SpinLock thrashing with thread::yield() after N spins.
// Lock strategy: fast-path exchange; then a bounded spin of cheap loads
// followed by an exchange; then a scheduler yield — and RETRY. The original
// version fell through after yield() and returned WITHOUT acquiring the
// lock, so unlock() released a lock it never held and the protected counter
// raced; the outer retry loop fixes that.
static void BM_ThreadYieldSpinLockThrashing(benchmark::State &s)
{
  std::atomic<bool> mutex(false);
  SpinThrash<std::atomic<bool>>(
      s, mutex,
      [](std::atomic<bool> &l) {
        for (;;)
        {
          // Fast path: uncontended acquire.
          if (!l.exchange(true, std::memory_order_acquire))
          {
            return;
          }
          // Bounded spin: read-only probe first so contended spinning does
          // not ping-pong the cache line; only exchange when it looks free.
          for (std::size_t i = 0; i < 100; ++i)
          {
            if (!l.load(std::memory_order_acquire) && !l.exchange(true, std::memory_order_acquire))
            {
              return;
            }
          }
          // Spin budget exhausted: give up the time slice, then retry.
          std::this_thread::yield();
        }
      },
      [](std::atomic<bool> &l) { l.store(false, std::memory_order_release); });
}

// Register each benchmark across a range of thread counts, doubling from 1
// up to std::thread::hardware_concurrency(), measuring both wall time and
// process CPU time in milliseconds.
// NOTE(review): hardware_concurrency() may return 0 on some platforms,
// which would make Range(1, 0) degenerate — confirm Benchmark's handling.
BENCHMARK(BM_SpinLockThrashing)
    ->RangeMultiplier(2)
    ->Range(1, std::thread::hardware_concurrency())
    ->MeasureProcessCPUTime()
    ->UseRealTime()
    ->Unit(benchmark::kMillisecond);
BENCHMARK(BM_ProcYieldSpinLockThrashing)
    ->RangeMultiplier(2)
    ->Range(1, std::thread::hardware_concurrency())
    ->MeasureProcessCPUTime()
    ->UseRealTime()
    ->Unit(benchmark::kMillisecond);
BENCHMARK(BM_NaiveSpinLockThrashing)
    ->RangeMultiplier(2)
    ->Range(1, std::thread::hardware_concurrency())
    ->MeasureProcessCPUTime()
    ->UseRealTime()
    ->Unit(benchmark::kMillisecond);
BENCHMARK(BM_ThreadYieldSpinLockThrashing)
    ->RangeMultiplier(2)
    ->Range(1, std::thread::hardware_concurrency())
    ->MeasureProcessCPUTime()
    ->UseRealTime()
    ->Unit(benchmark::kMillisecond);

} // namespace

BENCHMARK_MAIN();

0 comments on commit 3cb5d26

Please sign in to comment.