diff --git a/examples/nproc.c b/examples/nproc.c
new file mode 100644
index 00000000000..73ad91934e4
--- /dev/null
+++ b/examples/nproc.c
@@ -0,0 +1,15 @@
+#if 0
+/*─────────────────────────────────────────────────────────────────╗
+│ To the extent possible under law, Justine Tunney has waived      │
+│ all copyright and related or neighboring rights to this file,    │
+│ as it is written in the following disclaimers:                   │
+│   • http://unlicense.org/                                        │
+│   • http://creativecommons.org/publicdomain/zero/1.0/            │
+╚─────────────────────────────────────────────────────────────────*/
+#endif
+#include <cosmo.h>
+#include <stdio.h>
+
+int main(int argc, char *argv[]) {
+  printf("%d\n", __get_cpu_count());  // one line: whatever the runtime reports
+}
diff --git a/libc/calls/getcpu.c b/libc/calls/getcpu.c
index bdc97089e5e..b689f43fc9e 100644
--- a/libc/calls/getcpu.c
+++ b/libc/calls/getcpu.c
@@ -30,39 +30,63 @@
 
 int sys_getcpu(unsigned *opt_cpu, unsigned *opt_node, void *tcache);
 
+/**
+ * Determines ID of CPU on which thread is currently scheduled.
+ *
+ * This is the same as sched_getcpu(), except it can also report the ID
+ * of the current NUMA node. On platforms where that isn't available,
+ * `out_opt_node` is always set to 0. Either pointer may be null.
+ */
 int getcpu(unsigned *out_opt_cpu, unsigned *out_opt_node) {
-  unsigned cpu;
-  unsigned node;
-  if (X86_HAVE(RDTSCP)) {
+
+  if (IsWindows()) {
+    struct NtProcessorNumber pn;
+    GetCurrentProcessorNumberEx(&pn);  // always: the node lookup reads pn too
+    if (out_opt_cpu) {
+      *out_opt_cpu = 64 * pn.Group + pn.Number;
+    }
+    if (out_opt_node) {
+      unsigned short node16;
+      if (GetNumaProcessorNodeEx(&pn, &node16)) {
+        *out_opt_node = node16;
+      } else {
+        return __winerr();
+      }
+    }
+    return 0;
+  }
+
+#ifdef __x86_64__
+  if (X86_HAVE(RDTSCP) && (IsLinux() || IsFreebsd())) {
     unsigned tsc_aux;
     rdtscp(&tsc_aux);
-    cpu = TSC_AUX_CORE(tsc_aux);
-    node = TSC_AUX_NODE(tsc_aux);
-  } else if (IsWindows()) {
-    struct NtProcessorNumber pn;
-    GetCurrentProcessorNumberEx(&pn);
-    cpu = 64 * pn.Group + pn.Number;
-    unsigned short node16;
-    if (GetNumaProcessorNodeEx(&pn, &node16)) {
-      node = node16;
-    } else {
-      return __winerr();
+    if (out_opt_cpu)
+      *out_opt_cpu = TSC_AUX_CORE(tsc_aux);
+    if (out_opt_node)
+      *out_opt_node = TSC_AUX_NODE(tsc_aux);
+    return 0;
+  }
+#endif
+
+  if (IsXnu() || IsOpenbsd() || IsNetbsd() || IsFreebsd()) {
+    if (out_opt_cpu) {
+      int rc = sched_getcpu();
+      if (rc == -1)
+        return -1;
+      *out_opt_cpu = rc;
     }
-  } else if (IsAarch64()) {
-    long tpidr_el0;
-    asm("mrs\t%0,tpidr_el0" : "=r"(tpidr_el0));
-    cpu = tpidr_el0 & 255;
-    node = 0;
-  } else {
-    int rc = sys_getcpu(&cpu, &node, 0);
-    if (rc == -1)
-      return -1;
+    if (out_opt_node)
+      *out_opt_node = 0;  // this path has no node lookup; report node 0
+    return 0;
   }
-  if (out_opt_cpu) {
+
+  unsigned cpu, node;
+  int rc = sys_getcpu(&cpu, &node, 0);
+  if (rc == -1)
+    return -1;
+  if (out_opt_cpu)
     *out_opt_cpu = cpu;
-  }
-  if (out_opt_node) {
+  if (out_opt_node)
     *out_opt_node = node;
-  }
   return 0;
 }
diff --git a/libc/calls/sched_getcpu.c b/libc/calls/sched_getcpu.c
index 12a0a832b26..e671e80ca6b 100644
--- a/libc/calls/sched_getcpu.c
+++ b/libc/calls/sched_getcpu.c
@@ -23,32 +23,82 @@
 #include "libc/nexgen32e/x86feature.h"
 #include "libc/nt/struct/processornumber.h"
 #include "libc/nt/synchronization.h"
+#include "libc/runtime/syslib.internal.h"
 #include "libc/sysv/errfuns.h"
 
 int sys_getcpu(unsigned *opt_cpu, unsigned *opt_node, void *tcache);
 
 /**
  * Returns ID of CPU on which thread is currently scheduled.
+ *
+ * This function is supported on the following platforms:
+ *
+ * - x86-64
+ *
+ *   - Linux: rdtscp
+ *   - FreeBSD: rdtscp
+ *   - Windows: win32
+ *   - OpenBSD: unsupported
+ *   - NetBSD: unsupported
+ *   - MacOS: unsupported
+ *
+ * - aarch64
+ *
+ *   - Linux: syscall
+ *   - FreeBSD: syscall
+ *   - MacOS: pthread_cpu_number_np() via ape loader (syslib v9+)
+ *
  * @return cpu number on success, or -1 w/ errno
  */
 int sched_getcpu(void) {
-  if (X86_HAVE(RDTSCP)) {
-    unsigned tsc_aux;
-    rdtscp(&tsc_aux);
-    return TSC_AUX_CORE(tsc_aux);
-  } else if (IsAarch64()) {
-    long tpidr_el0;
-    asm("mrs\t%0,tpidr_el0" : "=r"(tpidr_el0));
-    return tpidr_el0 & 255;
-  } else if (IsWindows()) {
+
+  if (IsWindows()) {
     struct NtProcessorNumber pn;
     GetCurrentProcessorNumberEx(&pn);
     return 64 * pn.Group + pn.Number;
-  } else {
-    unsigned cpu = 0;
-    int rc = sys_getcpu(&cpu, 0, 0);
-    if (rc == -1)
-      return -1;
-    return cpu;
   }
+
+#ifdef __x86_64__
+  if (X86_HAVE(RDTSCP) && (IsLinux() || IsFreebsd())) {
+    // Only the Linux, FreeBSD, and Windows kernels can be counted upon
+    // to populate the TSC_AUX register with the current cpu number.
+    unsigned tsc_aux;
+    rdtscp(&tsc_aux);
+    return TSC_AUX_CORE(tsc_aux);
+  }
+#endif
+
+#ifdef __aarch64__
+  if (IsXnu()) {
+    // pthread_cpu_number_np() is defined by MacOS 11.0+ (Big Sur) in
+    // the SDK pthread.h header file, even though there's no man page
+    if (__syslib && __syslib->__version >= 9) {
+      errno_t err;
+      size_t out = 0;
+      if ((err = __syslib->__pthread_cpu_number_np(&out))) {
+        errno = err;  // nonzero return is the error code
+        return -1;
+      }
+      return out;
+    } else {
+      errno = ENOSYS;  // upgrade your ape loader
+      return -1;       // cc -o /usr/local/bin/ape ape/ape-m1.c
+    }
+  }
+#endif
+
+#ifdef __aarch64__
+  if (IsFreebsd()) {
+    register int x0 asm("x0");  // receives the system call's result
+    register int x8 asm("x8") = 581;  // sched_getcpu
+    asm volatile("svc\t0" : "=r"(x0) : "r"(x8) : "memory");
+    return x0;
+  }
+#endif
+
+  unsigned cpu = 0;
+  int rc = sys_getcpu(&cpu, 0, 0);  // every remaining platform lands here
+  if (rc == -1)
+    return -1;
+  return cpu;
 }
diff --git a/libc/intrin/atomic.h b/libc/intrin/atomic.h
index 3d503d37f62..a2d93df8a3a 100644
--- a/libc/intrin/atomic.h
+++ b/libc/intrin/atomic.h
@@ -13,48 +13,26 @@
  */
 
 typedef enum {
-  memory_order_relaxed,
-  memory_order_consume,
-  memory_order_acquire,
-  memory_order_release,
-  memory_order_acq_rel,
-  memory_order_seq_cst,
+  memory_order_relaxed = __ATOMIC_RELAXED,  // each enumerator takes the value
+  memory_order_consume = __ATOMIC_CONSUME,  // of the matching __ATOMIC_* macro
+  memory_order_acquire = __ATOMIC_ACQUIRE,
+  memory_order_release = __ATOMIC_RELEASE,
+  memory_order_acq_rel = __ATOMIC_ACQ_REL,
+  memory_order_seq_cst = __ATOMIC_SEQ_CST
 } memory_order;
 
-#define ATOMIC_VAR_INIT(...)     __VA_ARGS__
+#if !(defined __STDC_VERSION__ && __STDC_VERSION__ > 201710L)
+#define ATOMIC_VAR_INIT(...) __VA_ARGS__
+#endif
+
 #define atomic_is_lock_free(obj) ((void)(obj), sizeof(obj) <= sizeof(void *))
 
 #define atomic_flag      atomic_bool
-#define ATOMIC_FLAG_INIT ATOMIC_VAR_INIT(0)
+#define ATOMIC_FLAG_INIT false
 #define atomic_flag_test_and_set_explicit(x, order) \
   atomic_exchange_explicit(x, 1, order)
 #define atomic_flag_clear_explicit(x, order) atomic_store_explicit(x, 0, order)
 
-#define atomic_compare_exchange_strong(pObject, pExpected, desired) \
-  atomic_compare_exchange_strong_explicit(                          \
-      pObject, pExpected, desired, memory_order_seq_cst, memory_order_seq_cst)
-#define atomic_compare_exchange_weak(pObject, pExpected, desired) \
-  atomic_compare_exchange_weak_explicit(                          \
-      pObject, pExpected, desired, memory_order_seq_cst, memory_order_seq_cst)
-#define atomic_exchange(pObject, desired) \
-  atomic_exchange_explicit(pObject, desired, memory_order_seq_cst)
-#define atomic_fetch_add(pObject, operand) \
-  atomic_fetch_add_explicit(pObject, operand, memory_order_seq_cst)
-#define atomic_fetch_and(pObject, operand) \
-  atomic_fetch_and_explicit(pObject, operand, memory_order_seq_cst)
-#define atomic_fetch_or(pObject, operand) \
-  atomic_fetch_or_explicit(pObject, operand, memory_order_seq_cst)
-#define atomic_fetch_sub(pObject, operand) \
-  atomic_fetch_sub_explicit(pObject, operand, memory_order_seq_cst)
-#define atomic_fetch_xor(pObject, operand) \
-  atomic_fetch_xor_explicit(pObject, operand, memory_order_seq_cst)
-#define atomic_load(pObject) atomic_load_explicit(pObject, memory_order_seq_cst)
-#define atomic_store(pObject, desired) \
-  atomic_store_explicit(pObject, desired, memory_order_seq_cst)
-#define atomic_flag_test_and_set(x) \
-  atomic_flag_test_and_set_explicit(x, memory_order_seq_cst)
-#define atomic_flag_clear(x) atomic_flag_clear_explicit(x, memory_order_seq_cst)
-
 #if defined(__CLANG_ATOMIC_BOOL_LOCK_FREE)
 
 #define atomic_init(obj, value)    __c11_atomic_init(obj, value)
@@ -84,9 +62,35 @@ typedef enum {
 #define atomic_store_explicit(object, desired, order) \
   __c11_atomic_store(object, desired, order)
 
+#define atomic_compare_exchange_strong(pObject, pExpected, desired) \
+  atomic_compare_exchange_strong_explicit(                          \
+      pObject, pExpected, desired, memory_order_seq_cst, memory_order_seq_cst)
+#define atomic_compare_exchange_weak(pObject, pExpected, desired) \
+  atomic_compare_exchange_weak_explicit(                          \
+      pObject, pExpected, desired, memory_order_seq_cst, memory_order_seq_cst)
+#define atomic_exchange(pObject, desired) \
+  atomic_exchange_explicit(pObject, desired, memory_order_seq_cst)
+#define atomic_fetch_add(pObject, operand) \
+  atomic_fetch_add_explicit(pObject, operand, memory_order_seq_cst)
+#define atomic_fetch_and(pObject, operand) \
+  atomic_fetch_and_explicit(pObject, operand, memory_order_seq_cst)
+#define atomic_fetch_or(pObject, operand) \
+  atomic_fetch_or_explicit(pObject, operand, memory_order_seq_cst)
+#define atomic_fetch_sub(pObject, operand) \
+  atomic_fetch_sub_explicit(pObject, operand, memory_order_seq_cst)
+#define atomic_fetch_xor(pObject, operand) \
+  atomic_fetch_xor_explicit(pObject, operand, memory_order_seq_cst)
+#define atomic_load(pObject) atomic_load_explicit(pObject, memory_order_seq_cst)
+#define atomic_store(pObject, desired) \
+  atomic_store_explicit(pObject, desired, memory_order_seq_cst)
+#define atomic_flag_test_and_set(x) \
+  atomic_flag_test_and_set_explicit(x, memory_order_seq_cst)
+#define atomic_flag_clear(x) atomic_flag_clear_explicit(x, memory_order_seq_cst)
+
 #elif (__GNUC__ + 0) * 100 + (__GNUC_MINOR__ + 0) >= 407
 
-#define atomic_init(obj, value)    ((void)(*(obj) = (value)))
+#define atomic_init(obj, value) \
+  atomic_store_explicit(obj, value, __ATOMIC_RELAXED)
 #define atomic_thread_fence(order) __atomic_thread_fence(order)
 #define atomic_signal_fence(order) __atomic_signal_fence(order)
 #define atomic_compare_exchange_strong_explicit(pObject, pExpected, desired, \
@@ -111,6 +115,31 @@ typedef enum {
 #define atomic_store_explicit(pObject, desired, order) \
   __atomic_store_n(pObject, desired, order)
 
+#define atomic_compare_exchange_strong(pObject, pExpected, desired)    \
+  atomic_compare_exchange_strong_explicit(pObject, pExpected, desired, \
+                                          __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
+#define atomic_compare_exchange_weak(pObject, pExpected, desired)    \
+  atomic_compare_exchange_weak_explicit(pObject, pExpected, desired, \
+                                        __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
+#define atomic_exchange(pObject, desired) \
+  atomic_exchange_explicit(pObject, desired, __ATOMIC_SEQ_CST)
+#define atomic_fetch_add(pObject, operand) \
+  atomic_fetch_add_explicit(pObject, operand, __ATOMIC_SEQ_CST)
+#define atomic_fetch_and(pObject, operand) \
+  atomic_fetch_and_explicit(pObject, operand, __ATOMIC_SEQ_CST)
+#define atomic_fetch_or(pObject, operand) \
+  atomic_fetch_or_explicit(pObject, operand, __ATOMIC_SEQ_CST)
+#define atomic_fetch_sub(pObject, operand) \
+  atomic_fetch_sub_explicit(pObject, operand, __ATOMIC_SEQ_CST)
+#define atomic_fetch_xor(pObject, operand) \
+  atomic_fetch_xor_explicit(pObject, operand, __ATOMIC_SEQ_CST)
+#define atomic_load(pObject) atomic_load_explicit(pObject, __ATOMIC_SEQ_CST)
+#define atomic_store(pObject, desired) \
+  atomic_store_explicit(pObject, desired, __ATOMIC_SEQ_CST)
+#define atomic_flag_test_and_set(x) \
+  atomic_flag_test_and_set_explicit(x, __ATOMIC_SEQ_CST)
+#define atomic_flag_clear(x) atomic_flag_clear_explicit(x, __ATOMIC_SEQ_CST)
+
 #elif (__GNUC__ + 0) * 100 + (__GNUC_MINOR__ + 0) >= 401
 
 #define atomic_init(obj, value)    ((void)(*(obj) = (value)))
@@ -210,6 +239,31 @@ typedef enum {
 #define atomic_store_explicit(object, desired, order) \
   ((void)atomic_exchange_explicit(object, desired, order))
 
+#define atomic_compare_exchange_strong(pObject, pExpected, desired) \
+  atomic_compare_exchange_strong_explicit(                          \
+      pObject, pExpected, desired, memory_order_seq_cst, memory_order_seq_cst)
+#define atomic_compare_exchange_weak(pObject, pExpected, desired) \
+  atomic_compare_exchange_weak_explicit(                          \
+      pObject, pExpected, desired, memory_order_seq_cst, memory_order_seq_cst)
+#define atomic_exchange(pObject, desired) \
+  atomic_exchange_explicit(pObject, desired, memory_order_seq_cst)
+#define atomic_fetch_add(pObject, operand) \
+  atomic_fetch_add_explicit(pObject, operand, memory_order_seq_cst)
+#define atomic_fetch_and(pObject, operand) \
+  atomic_fetch_and_explicit(pObject, operand, memory_order_seq_cst)
+#define atomic_fetch_or(pObject, operand) \
+  atomic_fetch_or_explicit(pObject, operand, memory_order_seq_cst)
+#define atomic_fetch_sub(pObject, operand) \
+  atomic_fetch_sub_explicit(pObject, operand, memory_order_seq_cst)
+#define atomic_fetch_xor(pObject, operand) \
+  atomic_fetch_xor_explicit(pObject, operand, memory_order_seq_cst)
+#define atomic_load(pObject) atomic_load_explicit(pObject, memory_order_seq_cst)
+#define atomic_store(pObject, desired) \
+  atomic_store_explicit(pObject, desired, memory_order_seq_cst)
+#define atomic_flag_test_and_set(x) \
+  atomic_flag_test_and_set_explicit(x, memory_order_seq_cst)
+#define atomic_flag_clear(x) atomic_flag_clear_explicit(x, memory_order_seq_cst)
+
 #else /* non-gcc or old gcc w/o x86 */
 #error "atomic operations not supported with this compiler and/or architecture"
 #endif
diff --git a/libc/runtime/syslib.internal.h b/libc/runtime/syslib.internal.h
index 90ed2994fcb..424034537b6 100644
--- a/libc/runtime/syslib.internal.h
+++ b/libc/runtime/syslib.internal.h
@@ -82,6 +82,7 @@ struct Syslib {
   char *(*__dlerror)(void);
   /* v9 (2024-01-31) */
   int (*__pthread_cpu_number_np)(size_t *);
+  /* v10 (2024-05-02) */
   long (*__sysctl)(int *, unsigned, void *, size_t *, void *, size_t);
   long (*__sysctlbyname)(const char *, void *, size_t *, void *, size_t);
   long (*__sysctlnametomib)(const char *, int *, size_t *);
diff --git a/test/libc/calls/sched_getcpu_test.c b/test/libc/calls/sched_getcpu_test.c
new file mode 100644
index 00000000000..72c85ee0547
--- /dev/null
+++ b/test/libc/calls/sched_getcpu_test.c
@@ -0,0 +1,113 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/atomic.h"
+#include "libc/calls/calls.h"
+#include "libc/dce.h"
+#include "libc/intrin/atomic.h"
+#include "libc/macros.h"
+#include "libc/runtime/runtime.h"
+#include "libc/testlib/subprocess.h"
+#include "libc/testlib/testlib.h"
+#include "libc/thread/thread.h"
+#include "libc/thread/thread2.h"
+
+int cpu_count;
+
+void SetUpOnce(void) {
+  cpu_count = __get_cpu_count();  // cached for the tests below
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// AFFINITY TEST
+
+TEST(sched_getcpu, affinity_test) {
+  // not exercised on XNU, NetBSD, or OpenBSD
+  if (IsXnu())
+    return;
+  if (IsNetbsd())
+    return;
+  if (IsOpenbsd())
+    return;
+
+  SPAWN(fork);
+  int n = cpu_count;
+  for (int i = 0; i < n; ++i) {
+    cpu_set_t affinity;
+    CPU_ZERO(&affinity);
+    CPU_SET(i, &affinity);  // restrict the calling thread to cpu i alone
+    ASSERT_EQ(
+        0, pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity));
+    EXPECT_EQ(i, sched_getcpu());  // so sched_getcpu() must now report i
+  }
+  EXITS(0);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// KLUDGE TEST
+
+#define THREADS    2
+#define ITERATIONS 10000
+
+int g_hits[256];
+atomic_int g_sync;
+
+int call_sched_getcpu(void) {
+  int res = sched_getcpu();
+  ASSERT_NE(-1, res);  // -1 is the failure return
+  ASSERT_GE(res, 0);
+  ASSERT_LT(res, cpu_count);  // ids must stay below the reported cpu count
+  return res;
+}
+
+void *worker(void *arg) {
+  int ith = (long)arg;  // this thread's index, smuggled through the void *
+  int nth = THREADS;
+  for (int i = 0; i < ITERATIONS; ++i) {
+    // taking turns on a shared counter helps interleave the threads
+    int sync = atomic_fetch_add(&g_sync, 1);
+    if (sync % nth == ith) {
+      g_hits[call_sched_getcpu() % ARRAYLEN(g_hits)]++;  // unsynchronized ++
+    }
+  }
+  return 0;
+}
+
+TEST(sched_getcpu, kludge_test) {
+  // not exercised on x86-64 XNU, NetBSD, or OpenBSD
+#ifdef __x86_64__
+  if (IsXnu())
+    return;
+#endif
+  if (IsNetbsd())
+    return;
+  if (IsOpenbsd())
+    return;
+
+  if (cpu_count < THREADS)
+    return;  // fewer cpus than threads: nothing to check
+  pthread_t th[THREADS];
+  for (int i = 0; i < THREADS; ++i)
+    ASSERT_EQ(0, pthread_create(th + i, 0, worker, (void *)(long)i));
+  for (int i = 0; i < THREADS; ++i)
+    ASSERT_EQ(0, pthread_join(th[i], 0));
+  int hit = 0;
+  for (int i = 0; i < ARRAYLEN(g_hits); ++i)
+    hit += !!g_hits[i];  // count slots that were hit at least once
+  ASSERT_GE(hit, THREADS);
+}
diff --git a/third_party/dlmalloc/threaded.inc b/third_party/dlmalloc/threaded.inc
index 2454742cd0b..7c725346120 100644
--- a/third_party/dlmalloc/threaded.inc
+++ b/third_party/dlmalloc/threaded.inc
@@ -21,12 +21,9 @@
 #include "libc/intrin/strace.h"
 #include "libc/intrin/weaken.h"
 #include "libc/macros.h"
-#include "libc/nexgen32e/rdtscp.h"
-#include "libc/nexgen32e/x86feature.h"
 #include "libc/runtime/runtime.h"
 #include "libc/thread/thread.h"
-#include "libc/runtime/runtime.h"
-#include "libc/intrin/weaken.h"
+#include "libc/thread/threads.h"
 #include "third_party/dlmalloc/dlmalloc.h"
 
 #if !FOOTERS || !MSPACES
@@ -34,6 +31,7 @@
 #endif
 
 static struct magicu magiu;
+static unsigned g_cpucount;
 static unsigned g_heapslen;
 static mstate g_heaps[128];
 
@@ -90,18 +88,29 @@ void dlmalloc_inspect_all(void handler(void *start, void *end,
   }
 }
 
-forceinline mstate get_arena(void) {
-  unsigned cpu;
-#ifdef __x86_64__
-  unsigned tsc_aux;
-  rdtscp(&tsc_aux);
-  cpu = TSC_AUX_CORE(tsc_aux);
-#else
-  long tpidr_el0;
-  asm("mrs\t%0,tpidr_el0" : "=r"(tpidr_el0));
-  cpu = tpidr_el0 & 255;
-#endif
-  return g_heaps[__magicu_div(cpu, magiu) % g_heapslen];
+// we make malloc() scalable basically by
+//
+//     return g_heaps[sched_getcpu() / 2];
+//
+// except we cache the result in thread-local storage. on platforms
+// where sched_getcpu() fails, threads are instead handed arbitrary
+// round-robin slots, which helps scalability but may not be optimal
+static mstate get_arena(void) {
+  static atomic_uint assign;       // shared counter for the fallback path
+  static thread_local unsigned i;  // this thread's cached heap index
+  static thread_local unsigned n;  // calls since the index was computed
+  if (n == 50)  // recompute the index every 50 calls
+    n = 0;
+  if (!n) {
+    i = sched_getcpu();
+    if (i == -1) {  // failure; -1 converts to UINT_MAX for this compare
+      i = atomic_fetch_add_explicit(&assign, 1, memory_order_relaxed);
+      i %= g_cpucount;
+    }
+    i = __magicu_div(i, magiu) % g_heapslen;
+  }
+  ++n;
+  return g_heaps[i];
 }
 
 static void *dlmalloc_single(size_t n) {
@@ -174,19 +183,18 @@ static void threaded_dlmalloc(void) {
   if (!_weaken(pthread_create))
     return use_single_heap(false);
 
-  if (!IsAarch64() && !X86_HAVE(RDTSCP))
-    return use_single_heap(true);
-
   // determine how many independent heaps we should install
   // by default we do an approximation of one heap per core
   // this code makes the c++ stl go 164x faster on my ryzen
-  cpus = __get_cpu_count();
-  if (cpus == -1)
+  g_cpucount = cpus = __get_cpu_count();
+  if (cpus == -1) {
     heaps = 1;
-  else if ((var = getenv("COSMOPOLITAN_HEAP_COUNT")))
+    g_cpucount = 1;
+  } else if ((var = getenv("COSMOPOLITAN_HEAP_COUNT"))) {
     heaps = dlmalloc_atoi(var);
-  else
+  } else {
     heaps = cpus >> 1;
+  }
   if (heaps <= 1)
     return use_single_heap(true);
   if (heaps > ARRAYLEN(g_heaps))
diff --git a/tool/viz/malloc_scalability.c b/tool/viz/malloc_scalability.c
new file mode 100644
index 00000000000..434be2123d4
--- /dev/null
+++ b/tool/viz/malloc_scalability.c
@@ -0,0 +1,55 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/calls/struct/timespec.h"
+#include "libc/mem/mem.h"
+#include "libc/runtime/runtime.h"
+#include "libc/thread/thread.h"
+
+#define ALLOCATIONS 1000
+
+void *worker(void *arg) {
+  void **ptrs = malloc(ALLOCATIONS * sizeof(void *));  // result is unchecked
+  for (int i = 0; i < ALLOCATIONS; ++i)
+    ptrs[i] = malloc(1);  // ALLOCATIONS one-byte blocks per thread
+  for (int i = 0; i < ALLOCATIONS; ++i)
+    free(ptrs[i]);
+  free(ptrs);
+  return 0;
+}
+
+void test(int n) {
+  struct timespec start = timespec_real();  // timed span: create through join
+  pthread_t *th = malloc(sizeof(pthread_t) * n);
+  for (int i = 0; i < n; ++i)
+    pthread_create(th + i, 0, worker, 0);  // return value is not checked
+  for (int i = 0; i < n; ++i)
+    pthread_join(th[i], 0);
+  free(th);
+  struct timespec end = timespec_real();
+  printf("%2d threads * %d allocs = %ld us\n", n, ALLOCATIONS,
+         timespec_tomicros(timespec_sub(end, start)));
+}
+
+int main(int argc, char *argv[]) {
+  int n = __get_cpu_count();
+  if (n < 8)  // always sweep up to at least 8 threads
+    n = 8;
+  for (int i = 1; i <= n; ++i)
+    test(i);  // one timing line per thread count
+}
diff --git a/tool/viz/vdsodump.c b/tool/viz/vdsodump.c
new file mode 100644
index 00000000000..22174a323f5
--- /dev/null
+++ b/tool/viz/vdsodump.c
@@ -0,0 +1,40 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/calls/calls.h"
+#include "libc/intrin/getauxval.h"
+#include "libc/runtime/runtime.h"
+#include "libc/sysv/consts/auxv.h"
+
+int main(int argc, char *argv[]) {
+  struct AuxiliaryValue av;
+  av = __getauxval(AT_SYSINFO_EHDR);
+  if (!av.isfound)
+    return 2;  // no AT_SYSINFO_EHDR entry was found
+  int fd = creat("vdso.so", 0644);
+  if (fd == -1)
+    return 3;
+  int i;
+  for (i = 0;; i += getpagesize())  // copy page by page until write() fails
+    if (write(fd, (char *)av.value + i, getpagesize()) == -1)
+      break;
+  if (!i)
+    return 4;  // not even the first page could be written
+  if (close(fd))
+    return 5;
+}