From 0a79c6961ffdcf123bf1a4783dc893343f039674 Mon Sep 17 00:00:00 2001
From: Justine Tunney <jtunney@gmail.com>
Date: Thu, 15 Aug 2024 21:32:30 -0700
Subject: [PATCH] Make malloc scalable on all platforms

It turns out sched_getcpu() didn't work on many platforms. So the system
call now has tests and is well documented. We now employ new workarounds
on platforms where it isn't supported in our malloc() implementation. It
was previously the case that malloc() was only scalable on Linux/Windows
for x86-64. Now the other platforms are scalable too.
---
 examples/nproc.c                    |  15 ++++
 libc/calls/getcpu.c                 |  78 ++++++++++++------
 libc/calls/sched_getcpu.c           |  80 ++++++++++++++----
 libc/intrin/atomic.h                | 122 ++++++++++++++++++++--------
 libc/runtime/syslib.internal.h      |   1 +
 test/libc/calls/sched_getcpu_test.c | 113 ++++++++++++++++++++++++++
 third_party/dlmalloc/threaded.inc   |  54 ++++++------
 tool/viz/malloc_scalability.c       |  55 +++++++++++++
 tool/viz/vdsodump.c                 |  40 +++++++++
 9 files changed, 459 insertions(+), 99 deletions(-)
 create mode 100644 examples/nproc.c
 create mode 100644 test/libc/calls/sched_getcpu_test.c
 create mode 100644 tool/viz/malloc_scalability.c
 create mode 100644 tool/viz/vdsodump.c

diff --git a/examples/nproc.c b/examples/nproc.c
new file mode 100644
index 00000000000..73ad91934e4
--- /dev/null
+++ b/examples/nproc.c
@@ -0,0 +1,15 @@
+#if 0
+/*─────────────────────────────────────────────────────────────────╗
+│ To the extent possible under law, Justine Tunney has waived      │
+│ all copyright and related or neighboring rights to this file,    │
+│ as it is written in the following disclaimers:                   │
+│   • http://unlicense.org/                                        │
+│   • http://creativecommons.org/publicdomain/zero/1.0/            │
+╚─────────────────────────────────────────────────────────────────*/
+#endif
+#include <cosmo.h>
+#include <stdio.h>
+
+int main(int argc, char *argv[]) {
+  printf("%d\n", __get_cpu_count());
+}
diff --git a/libc/calls/getcpu.c b/libc/calls/getcpu.c
index bdc97089e5e..b689f43fc9e 100644
--- a/libc/calls/getcpu.c
+++ b/libc/calls/getcpu.c
@@ -30,39 +30,63 @@
 
 int sys_getcpu(unsigned *opt_cpu, unsigned *opt_node, void *tcache);
 
+/**
+ * Determines ID of CPU on which thread is currently scheduled.
+ *
+ * This is the same as sched_getcpu(), except it also supports returning
+ * the ID of the current NUMA node. On some platforms this functionality
+ * isn't available, in which case `out_opt_node` is always set to 0.
+ */
 int getcpu(unsigned *out_opt_cpu, unsigned *out_opt_node) {
-  unsigned cpu;
-  unsigned node;
-  if (X86_HAVE(RDTSCP)) {
+
+  if (IsWindows()) {
+    struct NtProcessorNumber pn;
+    GetCurrentProcessorNumberEx(&pn);  // needed by both outputs below
+    if (out_opt_cpu) {
+      *out_opt_cpu = 64 * pn.Group + pn.Number;
+    }
+    if (out_opt_node) {
+      unsigned short node16;
+      if (GetNumaProcessorNodeEx(&pn, &node16)) {
+        *out_opt_node = node16;
+      } else {
+        return __winerr();
+      }
+    }
+    return 0;
+  }
+
+#ifdef __x86_64__
+  if (X86_HAVE(RDTSCP) && (IsLinux() || IsFreebsd())) {
     unsigned tsc_aux;
     rdtscp(&tsc_aux);
-    cpu = TSC_AUX_CORE(tsc_aux);
-    node = TSC_AUX_NODE(tsc_aux);
-  } else if (IsWindows()) {
-    struct NtProcessorNumber pn;
-    GetCurrentProcessorNumberEx(&pn);
-    cpu = 64 * pn.Group + pn.Number;
-    unsigned short node16;
-    if (GetNumaProcessorNodeEx(&pn, &node16)) {
-      node = node16;
-    } else {
-      return __winerr();
+    if (out_opt_cpu)
+      *out_opt_cpu = TSC_AUX_CORE(tsc_aux);
+    if (out_opt_node)
+      *out_opt_node = TSC_AUX_NODE(tsc_aux);
+    return 0;
+  }
+#endif
+
+  if (IsXnu() || IsOpenbsd() || IsNetbsd() || IsFreebsd()) {
+    if (out_opt_cpu) {
+      int rc = sched_getcpu();
+      if (rc == -1)
+        return -1;
+      *out_opt_cpu = rc;
     }
-  } else if (IsAarch64()) {
-    long tpidr_el0;
-    asm("mrs\t%0,tpidr_el0" : "=r"(tpidr_el0));
-    cpu = tpidr_el0 & 255;
-    node = 0;
-  } else {
-    int rc = sys_getcpu(&cpu, &node, 0);
-    if (rc == -1)
-      return -1;
+    if (out_opt_node)
+      *out_opt_node = 0;
+    return 0;
   }
-  if (out_opt_cpu) {
+
+  unsigned cpu, node;
+  int rc = sys_getcpu(&cpu, &node, 0);
+  if (rc == -1)
+    return -1;
+  if (out_opt_cpu)
     *out_opt_cpu = cpu;
-  }
-  if (out_opt_node) {
+  if (out_opt_node)
     *out_opt_node = node;
-  }
   return 0;
 }
diff --git a/libc/calls/sched_getcpu.c b/libc/calls/sched_getcpu.c
index 12a0a832b26..e671e80ca6b 100644
--- a/libc/calls/sched_getcpu.c
+++ b/libc/calls/sched_getcpu.c
@@ -23,32 +23,82 @@
 #include "libc/nexgen32e/x86feature.h"
 #include "libc/nt/struct/processornumber.h"
 #include "libc/nt/synchronization.h"
+#include "libc/runtime/syslib.internal.h"
 #include "libc/sysv/errfuns.h"
 
 int sys_getcpu(unsigned *opt_cpu, unsigned *opt_node, void *tcache);
 
 /**
  * Returns ID of CPU on which thread is currently scheduled.
+ *
+ * This function is supported on the following platforms:
+ *
+ * - x86-64
+ *
+ *   - Linux: rdtscp
+ *   - FreeBSD: rdtscp
+ *   - Windows: win32
+ *   - OpenBSD: unsupported
+ *   - NetBSD: unsupported
+ *   - MacOS: unsupported
+ *
+ * - aarch64
+ *
+ *   - Linux: syscall
+ *   - FreeBSD: syscall
+ *   - MacOS: supported
+ *
  * @return cpu number on success, or -1 w/ errno
  */
 int sched_getcpu(void) {
-  if (X86_HAVE(RDTSCP)) {
-    unsigned tsc_aux;
-    rdtscp(&tsc_aux);
-    return TSC_AUX_CORE(tsc_aux);
-  } else if (IsAarch64()) {
-    long tpidr_el0;
-    asm("mrs\t%0,tpidr_el0" : "=r"(tpidr_el0));
-    return tpidr_el0 & 255;
-  } else if (IsWindows()) {
+
+  if (IsWindows()) {
     struct NtProcessorNumber pn;
     GetCurrentProcessorNumberEx(&pn);
     return 64 * pn.Group + pn.Number;
-  } else {
-    unsigned cpu = 0;
-    int rc = sys_getcpu(&cpu, 0, 0);
-    if (rc == -1)
-      return -1;
-    return cpu;
   }
+
+#ifdef __x86_64__
+  if (X86_HAVE(RDTSCP) && (IsLinux() || IsFreebsd())) {
+    // Only the Linux, FreeBSD, and Windows kernels can be counted upon
+    // to populate the TSC_AUX register with the current CPU number.
+    unsigned tsc_aux;
+    rdtscp(&tsc_aux);
+    return TSC_AUX_CORE(tsc_aux);
+  }
+#endif
+
+#ifdef __aarch64__
+  if (IsXnu()) {
+    // pthread_cpu_number_np() is defined by MacOS 11.0+ (Big Sur) in
+    // the SDK pthread.h header file, even though there's no man page
+    if (__syslib && __syslib->__version >= 9) {
+      errno_t err;
+      size_t out = 0;
+      if ((err = __syslib->__pthread_cpu_number_np(&out))) {
+        errno = err;
+        return -1;
+      }
+      return out;
+    } else {
+      errno = ENOSYS;  // upgrade your ape loader
+      return -1;       // cc -o /usr/local/bin/ape ape/ape-m1.c
+    }
+  }
+#endif
+
+#ifdef __aarch64__
+  if (IsFreebsd()) {
+    register int x0 asm("x0");
+    register int x8 asm("x8") = 581;  // sched_getcpu
+    asm volatile("svc\t0" : "=r"(x0) : "r"(x8) : "memory");
+    return x0;
+  }
+#endif
+
+  unsigned cpu = 0;
+  int rc = sys_getcpu(&cpu, 0, 0);
+  if (rc == -1)
+    return -1;
+  return cpu;
 }
diff --git a/libc/intrin/atomic.h b/libc/intrin/atomic.h
index 3d503d37f62..a2d93df8a3a 100644
--- a/libc/intrin/atomic.h
+++ b/libc/intrin/atomic.h
@@ -13,48 +13,26 @@
  */
 
 typedef enum {
-  memory_order_relaxed,
-  memory_order_consume,
-  memory_order_acquire,
-  memory_order_release,
-  memory_order_acq_rel,
-  memory_order_seq_cst,
+  memory_order_relaxed = __ATOMIC_RELAXED,
+  memory_order_consume = __ATOMIC_CONSUME,
+  memory_order_acquire = __ATOMIC_ACQUIRE,
+  memory_order_release = __ATOMIC_RELEASE,
+  memory_order_acq_rel = __ATOMIC_ACQ_REL,
+  memory_order_seq_cst = __ATOMIC_SEQ_CST
 } memory_order;
 
-#define ATOMIC_VAR_INIT(...)     __VA_ARGS__
+#if !(defined __STDC_VERSION__ && __STDC_VERSION__ > 201710L)
+#define ATOMIC_VAR_INIT(...) __VA_ARGS__
+#endif
+
 #define atomic_is_lock_free(obj) ((void)(obj), sizeof(obj) <= sizeof(void *))
 
 #define atomic_flag      atomic_bool
-#define ATOMIC_FLAG_INIT ATOMIC_VAR_INIT(0)
+#define ATOMIC_FLAG_INIT false
 #define atomic_flag_test_and_set_explicit(x, order) \
   atomic_exchange_explicit(x, 1, order)
 #define atomic_flag_clear_explicit(x, order) atomic_store_explicit(x, 0, order)
 
-#define atomic_compare_exchange_strong(pObject, pExpected, desired) \
-  atomic_compare_exchange_strong_explicit(                          \
-      pObject, pExpected, desired, memory_order_seq_cst, memory_order_seq_cst)
-#define atomic_compare_exchange_weak(pObject, pExpected, desired) \
-  atomic_compare_exchange_weak_explicit(                          \
-      pObject, pExpected, desired, memory_order_seq_cst, memory_order_seq_cst)
-#define atomic_exchange(pObject, desired) \
-  atomic_exchange_explicit(pObject, desired, memory_order_seq_cst)
-#define atomic_fetch_add(pObject, operand) \
-  atomic_fetch_add_explicit(pObject, operand, memory_order_seq_cst)
-#define atomic_fetch_and(pObject, operand) \
-  atomic_fetch_and_explicit(pObject, operand, memory_order_seq_cst)
-#define atomic_fetch_or(pObject, operand) \
-  atomic_fetch_or_explicit(pObject, operand, memory_order_seq_cst)
-#define atomic_fetch_sub(pObject, operand) \
-  atomic_fetch_sub_explicit(pObject, operand, memory_order_seq_cst)
-#define atomic_fetch_xor(pObject, operand) \
-  atomic_fetch_xor_explicit(pObject, operand, memory_order_seq_cst)
-#define atomic_load(pObject) atomic_load_explicit(pObject, memory_order_seq_cst)
-#define atomic_store(pObject, desired) \
-  atomic_store_explicit(pObject, desired, memory_order_seq_cst)
-#define atomic_flag_test_and_set(x) \
-  atomic_flag_test_and_set_explicit(x, memory_order_seq_cst)
-#define atomic_flag_clear(x) atomic_flag_clear_explicit(x, memory_order_seq_cst)
-
 #if defined(__CLANG_ATOMIC_BOOL_LOCK_FREE)
 
 #define atomic_init(obj, value)    __c11_atomic_init(obj, value)
@@ -84,9 +62,35 @@ typedef enum {
 #define atomic_store_explicit(object, desired, order) \
   __c11_atomic_store(object, desired, order)
 
+#define atomic_compare_exchange_strong(pObject, pExpected, desired) \
+  atomic_compare_exchange_strong_explicit(                          \
+      pObject, pExpected, desired, memory_order_seq_cst, memory_order_seq_cst)
+#define atomic_compare_exchange_weak(pObject, pExpected, desired) \
+  atomic_compare_exchange_weak_explicit(                          \
+      pObject, pExpected, desired, memory_order_seq_cst, memory_order_seq_cst)
+#define atomic_exchange(pObject, desired) \
+  atomic_exchange_explicit(pObject, desired, memory_order_seq_cst)
+#define atomic_fetch_add(pObject, operand) \
+  atomic_fetch_add_explicit(pObject, operand, memory_order_seq_cst)
+#define atomic_fetch_and(pObject, operand) \
+  atomic_fetch_and_explicit(pObject, operand, memory_order_seq_cst)
+#define atomic_fetch_or(pObject, operand) \
+  atomic_fetch_or_explicit(pObject, operand, memory_order_seq_cst)
+#define atomic_fetch_sub(pObject, operand) \
+  atomic_fetch_sub_explicit(pObject, operand, memory_order_seq_cst)
+#define atomic_fetch_xor(pObject, operand) \
+  atomic_fetch_xor_explicit(pObject, operand, memory_order_seq_cst)
+#define atomic_load(pObject) atomic_load_explicit(pObject, memory_order_seq_cst)
+#define atomic_store(pObject, desired) \
+  atomic_store_explicit(pObject, desired, memory_order_seq_cst)
+#define atomic_flag_test_and_set(x) \
+  atomic_flag_test_and_set_explicit(x, memory_order_seq_cst)
+#define atomic_flag_clear(x) atomic_flag_clear_explicit(x, memory_order_seq_cst)
+
 #elif (__GNUC__ + 0) * 100 + (__GNUC_MINOR__ + 0) >= 407
 
-#define atomic_init(obj, value)    ((void)(*(obj) = (value)))
+#define atomic_init(obj, value) \
+  atomic_store_explicit(obj, value, __ATOMIC_RELAXED)
 #define atomic_thread_fence(order) __atomic_thread_fence(order)
 #define atomic_signal_fence(order) __atomic_signal_fence(order)
 #define atomic_compare_exchange_strong_explicit(pObject, pExpected, desired, \
@@ -111,6 +115,31 @@ typedef enum {
 #define atomic_store_explicit(pObject, desired, order) \
   __atomic_store_n(pObject, desired, order)
 
+#define atomic_compare_exchange_strong(pObject, pExpected, desired)    \
+  atomic_compare_exchange_strong_explicit(pObject, pExpected, desired, \
+                                          __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
+#define atomic_compare_exchange_weak(pObject, pExpected, desired)    \
+  atomic_compare_exchange_weak_explicit(pObject, pExpected, desired, \
+                                        __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
+#define atomic_exchange(pObject, desired) \
+  atomic_exchange_explicit(pObject, desired, __ATOMIC_SEQ_CST)
+#define atomic_fetch_add(pObject, operand) \
+  atomic_fetch_add_explicit(pObject, operand, __ATOMIC_SEQ_CST)
+#define atomic_fetch_and(pObject, operand) \
+  atomic_fetch_and_explicit(pObject, operand, __ATOMIC_SEQ_CST)
+#define atomic_fetch_or(pObject, operand) \
+  atomic_fetch_or_explicit(pObject, operand, __ATOMIC_SEQ_CST)
+#define atomic_fetch_sub(pObject, operand) \
+  atomic_fetch_sub_explicit(pObject, operand, __ATOMIC_SEQ_CST)
+#define atomic_fetch_xor(pObject, operand) \
+  atomic_fetch_xor_explicit(pObject, operand, __ATOMIC_SEQ_CST)
+#define atomic_load(pObject) atomic_load_explicit(pObject, __ATOMIC_SEQ_CST)
+#define atomic_store(pObject, desired) \
+  atomic_store_explicit(pObject, desired, __ATOMIC_SEQ_CST)
+#define atomic_flag_test_and_set(x) \
+  atomic_flag_test_and_set_explicit(x, __ATOMIC_SEQ_CST)
+#define atomic_flag_clear(x) atomic_flag_clear_explicit(x, __ATOMIC_SEQ_CST)
+
 #elif (__GNUC__ + 0) * 100 + (__GNUC_MINOR__ + 0) >= 401
 
 #define atomic_init(obj, value)    ((void)(*(obj) = (value)))
@@ -210,6 +239,31 @@ typedef enum {
 #define atomic_store_explicit(object, desired, order) \
   ((void)atomic_exchange_explicit(object, desired, order))
 
+#define atomic_compare_exchange_strong(pObject, pExpected, desired) \
+  atomic_compare_exchange_strong_explicit(                          \
+      pObject, pExpected, desired, memory_order_seq_cst, memory_order_seq_cst)
+#define atomic_compare_exchange_weak(pObject, pExpected, desired) \
+  atomic_compare_exchange_weak_explicit(                          \
+      pObject, pExpected, desired, memory_order_seq_cst, memory_order_seq_cst)
+#define atomic_exchange(pObject, desired) \
+  atomic_exchange_explicit(pObject, desired, memory_order_seq_cst)
+#define atomic_fetch_add(pObject, operand) \
+  atomic_fetch_add_explicit(pObject, operand, memory_order_seq_cst)
+#define atomic_fetch_and(pObject, operand) \
+  atomic_fetch_and_explicit(pObject, operand, memory_order_seq_cst)
+#define atomic_fetch_or(pObject, operand) \
+  atomic_fetch_or_explicit(pObject, operand, memory_order_seq_cst)
+#define atomic_fetch_sub(pObject, operand) \
+  atomic_fetch_sub_explicit(pObject, operand, memory_order_seq_cst)
+#define atomic_fetch_xor(pObject, operand) \
+  atomic_fetch_xor_explicit(pObject, operand, memory_order_seq_cst)
+#define atomic_load(pObject) atomic_load_explicit(pObject, memory_order_seq_cst)
+#define atomic_store(pObject, desired) \
+  atomic_store_explicit(pObject, desired, memory_order_seq_cst)
+#define atomic_flag_test_and_set(x) \
+  atomic_flag_test_and_set_explicit(x, memory_order_seq_cst)
+#define atomic_flag_clear(x) atomic_flag_clear_explicit(x, memory_order_seq_cst)
+
 #else /* non-gcc or old gcc w/o x86 */
 #error "atomic operations not supported with this compiler and/or architecture"
 #endif
diff --git a/libc/runtime/syslib.internal.h b/libc/runtime/syslib.internal.h
index 90ed2994fcb..424034537b6 100644
--- a/libc/runtime/syslib.internal.h
+++ b/libc/runtime/syslib.internal.h
@@ -82,6 +82,7 @@ struct Syslib {
   char *(*__dlerror)(void);
   /* v9 (2024-01-31) */
   int (*__pthread_cpu_number_np)(size_t *);
+  /* v10 (2024-05-02) */
   long (*__sysctl)(int *, unsigned, void *, size_t *, void *, size_t);
   long (*__sysctlbyname)(const char *, void *, size_t *, void *, size_t);
   long (*__sysctlnametomib)(const char *, int *, size_t *);
diff --git a/test/libc/calls/sched_getcpu_test.c b/test/libc/calls/sched_getcpu_test.c
new file mode 100644
index 00000000000..72c85ee0547
--- /dev/null
+++ b/test/libc/calls/sched_getcpu_test.c
@@ -0,0 +1,113 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/atomic.h"
+#include "libc/calls/calls.h"
+#include "libc/dce.h"
+#include "libc/intrin/atomic.h"
+#include "libc/macros.h"
+#include "libc/runtime/runtime.h"
+#include "libc/testlib/subprocess.h"
+#include "libc/testlib/testlib.h"
+#include "libc/thread/thread.h"
+#include "libc/thread/thread2.h"
+
+int cpu_count;
+
+void SetUpOnce(void) {
+  cpu_count = __get_cpu_count();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// AFFINITY TEST
+
+TEST(sched_getcpu, affinity_test) {
+
+  if (IsXnu())
+    return;
+  if (IsNetbsd())
+    return;
+  if (IsOpenbsd())
+    return;
+
+  SPAWN(fork);
+  int n = cpu_count;
+  for (int i = 0; i < n; ++i) {
+    cpu_set_t affinity;
+    CPU_ZERO(&affinity);
+    CPU_SET(i, &affinity);
+    ASSERT_EQ(
+        0, pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity));
+    EXPECT_EQ(i, sched_getcpu());
+  }
+  EXITS(0);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// KLUDGE TEST
+
+#define THREADS    2
+#define ITERATIONS 10000
+
+int g_hits[256];
+atomic_int g_sync;
+
+int call_sched_getcpu(void) {
+  int res = sched_getcpu();
+  ASSERT_NE(-1, res);
+  ASSERT_GE(res, 0);
+  ASSERT_LT(res, cpu_count);
+  return res;
+}
+
+void *worker(void *arg) {
+  int ith = (long)arg;
+  int nth = THREADS;
+  for (int i = 0; i < ITERATIONS; ++i) {
+    // help execution of threads be interleaved
+    int sync = atomic_fetch_add(&g_sync, 1);
+    if (sync % nth == ith) {
+      g_hits[call_sched_getcpu() % ARRAYLEN(g_hits)]++;
+    }
+  }
+  return 0;
+}
+
+TEST(sched_getcpu, kludge_test) {
+
+#ifdef __x86_64__
+  if (IsXnu())
+    return;
+#endif
+  if (IsNetbsd())
+    return;
+  if (IsOpenbsd())
+    return;
+
+  if (cpu_count < THREADS)
+    return;
+  pthread_t th[THREADS];
+  for (int i = 0; i < THREADS; ++i)
+    ASSERT_EQ(0, pthread_create(th + i, 0, worker, (void *)(long)i));
+  for (int i = 0; i < THREADS; ++i)
+    ASSERT_EQ(0, pthread_join(th[i], 0));
+  int hit = 0;
+  for (int i = 0; i < ARRAYLEN(g_hits); ++i)
+    hit += !!g_hits[i];
+  ASSERT_GE(hit, THREADS);
+}
diff --git a/third_party/dlmalloc/threaded.inc b/third_party/dlmalloc/threaded.inc
index 2454742cd0b..7c725346120 100644
--- a/third_party/dlmalloc/threaded.inc
+++ b/third_party/dlmalloc/threaded.inc
@@ -21,12 +21,9 @@
 #include "libc/intrin/strace.h"
 #include "libc/intrin/weaken.h"
 #include "libc/macros.h"
-#include "libc/nexgen32e/rdtscp.h"
-#include "libc/nexgen32e/x86feature.h"
 #include "libc/runtime/runtime.h"
 #include "libc/thread/thread.h"
-#include "libc/runtime/runtime.h"
-#include "libc/intrin/weaken.h"
+#include "libc/thread/threads.h"
 #include "third_party/dlmalloc/dlmalloc.h"
 
 #if !FOOTERS || !MSPACES
@@ -34,6 +31,7 @@
 #endif
 
 static struct magicu magiu;
+static unsigned g_cpucount;
 static unsigned g_heapslen;
 static mstate g_heaps[128];
 
@@ -90,18 +88,29 @@ void dlmalloc_inspect_all(void handler(void *start, void *end,
   }
 }
 
-forceinline mstate get_arena(void) {
-  unsigned cpu;
-#ifdef __x86_64__
-  unsigned tsc_aux;
-  rdtscp(&tsc_aux);
-  cpu = TSC_AUX_CORE(tsc_aux);
-#else
-  long tpidr_el0;
-  asm("mrs\t%0,tpidr_el0" : "=r"(tpidr_el0));
-  cpu = tpidr_el0 & 255;
-#endif
-  return g_heaps[__magicu_div(cpu, magiu) % g_heapslen];
+// we make malloc() scalable basically by
+//
+//     return g_heaps[sched_getcpu() / 2];
+//
+// except we cache the syscall result using thread-local storage. on
+// some platforms, it's not possible to use sched_getcpu() so we use
+// arbitrary assignments to help scalability, but may not be optimal
+static mstate get_arena(void) {
+  static atomic_uint assign;
+  static thread_local unsigned i;
+  static thread_local unsigned n;
+  if (n == 50)
+    n = 0;
+  if (!n) {
+    i = sched_getcpu();
+    if (i == -1) {
+      i = atomic_fetch_add_explicit(&assign, 1, memory_order_relaxed);
+      i %= g_cpucount;
+    }
+    i = __magicu_div(i, magiu) % g_heapslen;
+  }
+  ++n;
+  return g_heaps[i];
 }
 
 static void *dlmalloc_single(size_t n) {
@@ -174,19 +183,18 @@ static void threaded_dlmalloc(void) {
   if (!_weaken(pthread_create))
     return use_single_heap(false);
 
-  if (!IsAarch64() && !X86_HAVE(RDTSCP))
-    return use_single_heap(true);
-
   // determine how many independent heaps we should install
   // by default we do an approximation of one heap per core
   // this code makes the c++ stl go 164x faster on my ryzen
-  cpus = __get_cpu_count();
-  if (cpus == -1)
+  g_cpucount = cpus = __get_cpu_count();
+  if (cpus == -1) {
     heaps = 1;
-  else if ((var = getenv("COSMOPOLITAN_HEAP_COUNT")))
+    g_cpucount = 1;
+  } else if ((var = getenv("COSMOPOLITAN_HEAP_COUNT"))) {
     heaps = dlmalloc_atoi(var);
-  else
+  } else {
     heaps = cpus >> 1;
+  }
   if (heaps <= 1)
     return use_single_heap(true);
   if (heaps > ARRAYLEN(g_heaps))
diff --git a/tool/viz/malloc_scalability.c b/tool/viz/malloc_scalability.c
new file mode 100644
index 00000000000..434be2123d4
--- /dev/null
+++ b/tool/viz/malloc_scalability.c
@@ -0,0 +1,55 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/calls/struct/timespec.h"
+#include "libc/mem/mem.h"
+#include "libc/runtime/runtime.h"
+#include "libc/thread/thread.h"
+
+#define ALLOCATIONS 1000
+
+void *worker(void *arg) {
+  void **ptrs = malloc(ALLOCATIONS * sizeof(void *));
+  for (int i = 0; i < ALLOCATIONS; ++i)
+    ptrs[i] = malloc(1);
+  for (int i = 0; i < ALLOCATIONS; ++i)
+    free(ptrs[i]);
+  free(ptrs);
+  return 0;
+}
+
+void test(int n) {
+  struct timespec start = timespec_real();
+  pthread_t *th = malloc(sizeof(pthread_t) * n);
+  for (int i = 0; i < n; ++i)
+    pthread_create(th + i, 0, worker, 0);
+  for (int i = 0; i < n; ++i)
+    pthread_join(th[i], 0);
+  free(th);
+  struct timespec end = timespec_real();
+  printf("%2d threads * %d allocs = %ld us\n", n, ALLOCATIONS,
+         timespec_tomicros(timespec_sub(end, start)));
+}
+
+int main(int argc, char *argv[]) {
+  int n = __get_cpu_count();
+  if (n < 8)
+    n = 8;
+  for (int i = 1; i <= n; ++i)
+    test(i);
+}
diff --git a/tool/viz/vdsodump.c b/tool/viz/vdsodump.c
new file mode 100644
index 00000000000..22174a323f5
--- /dev/null
+++ b/tool/viz/vdsodump.c
@@ -0,0 +1,40 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/calls/calls.h"
+#include "libc/intrin/getauxval.h"
+#include "libc/runtime/runtime.h"
+#include "libc/sysv/consts/auxv.h"
+
+int main(int argc, char *argv[]) {
+  struct AuxiliaryValue av;
+  av = __getauxval(AT_SYSINFO_EHDR);
+  if (!av.isfound)
+    return 2;
+  int fd = creat("vdso.so", 0644);
+  if (fd == -1)
+    return 3;
+  int i;
+  for (i = 0;; i += getpagesize())
+    if (write(fd, (char *)av.value + i, getpagesize()) == -1)
+      break;
+  if (!i)
+    return 4;
+  if (close(fd))
+    return 5;
+}