From cdfcee51ca5e90c626f975bc66c52f6a58da6af3 Mon Sep 17 00:00:00 2001 From: Justine Tunney Date: Sat, 27 Jul 2024 08:20:42 -0700 Subject: [PATCH] Properly serialize fork() operations This change solves an issue where many threads attempting to spawn forks at once would cause fork() performance to degrade with the thread count. Things got real nasty on NetBSD, which slowed down the whole test fleet, because there's no vfork() and we're forced to use fork() in our server. threads count task 1 1062 fork+exit+wait 2 668 fork+exit+wait 4 66 fork+exit+wait 8 19 fork+exit+wait 16 22 fork+exit+wait 32 16 fork+exit+wait Things are now much less bad on NetBSD, but not great, since it does not have futexes; we rely on its semaphore file descriptors to do conditions threads count task 1 1085 fork+exit+wait 2 842 fork+exit+wait 4 532 fork+exit+wait 8 400 fork+exit+wait 16 276 fork+exit+wait 32 66 fork+exit+wait With OpenBSD which also lacks vfork(), things were just as bad as NetBSD threads count task 1 584 fork+exit+wait 2 687 fork+exit+wait 4 206 fork+exit+wait 8 24 fork+exit+wait 16 33 fork+exit+wait 32 26 fork+exit+wait But since OpenBSD has futexes fork() works terrifically thanks to *NSYNC threads count task 1 525 fork+exit+wait 2 580 fork+exit+wait 4 451 fork+exit+wait 8 479 fork+exit+wait 16 408 fork+exit+wait 32 373 fork+exit+wait This issue would most likely only manifest itself, when pthread_atfork() callers manage to slip a spin lock into the outermost position of fork's list of locks. Since fork() is very slow, a spin lock can be devastating Needless to say vfork() rules and anyone who says differently is kidding themselves. Look at what a FreeBSD 14.1 virtual machine with equal specs can do over the course of three hundred milliseconds. threads count task 1 2559 vfork+exit+wait 2 5389 vfork+exit+wait 4 34933 vfork+exit+wait 8 43273 vfork+exit+wait 16 49648 vfork+exit+wait 32 40247 vfork+exit+wait So it's a shame that so few OSes support vfork(). It creates an unsavory situation, where someone wanting to build a server that spawns processes would be better served to not use threads and favor a multiprocess model --- libc/proc/fork.c | 34 ++++++++++++++++++++++++--------- third_party/dlmalloc/dlmalloc.c | 2 +- 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/libc/proc/fork.c b/libc/proc/fork.c index 79ab56012b6..8d9ec733acc 100644 --- a/libc/proc/fork.c +++ b/libc/proc/fork.c @@ -51,6 +51,13 @@ __static_yoink("_pthread_atfork"); extern pthread_mutex_t _rand64_lock_obj; extern pthread_mutex_t _pthread_lock_obj; +// fork needs to lock every lock, which makes it very single-threaded in +// nature. the outermost lock matters the most because it serializes all +// threads attempting to spawn processes. the outer lock can't be a spin +// lock that a pthread_atfork() caller slipped in. to ensure it's a fair +// lock, we add an additional one of our own, which protects other locks +static pthread_mutex_t _fork_gil = PTHREAD_MUTEX_INITIALIZER; + static void _onfork_prepare(void) { if (_weaken(_pthread_onfork_prepare)) _weaken(_pthread_onfork_prepare)(); @@ -85,16 +92,14 @@ static void _onfork_child(void) { _weaken(_pthread_onfork_child)(); } -int _fork(uint32_t dwCreationFlags) { +static int _forker(uint32_t dwCreationFlags) { long micros; struct Dll *e; struct timespec started; int ax, dx, tid, parent; parent = __pid; - BLOCK_SIGNALS; - if (__threaded) - _onfork_prepare(); started = timespec_real(); + _onfork_prepare(); if (!IsWindows()) { ax = sys_fork(); } else { @@ -145,19 +150,30 @@ int _fork(uint32_t dwCreationFlags) { atomic_store_explicit(&pt->pt_canceled, false, memory_order_relaxed); // run user fork callbacks - if (__threaded) - _onfork_child(); + _onfork_child(); STRACE("fork() → 0 (child of %d; took %ld us)", parent, micros); } else { // this is the parent process - if (__threaded) - _onfork_parent(); + _onfork_parent(); STRACE("fork() → %d% m (took %ld us)", ax, micros); } - ALLOW_SIGNALS; return ax; } +int _fork(uint32_t dwCreationFlags) { + int rc; + BLOCK_SIGNALS; + pthread_mutex_lock(&_fork_gil); + rc = _forker(dwCreationFlags); + if (!rc) { + pthread_mutex_init(&_fork_gil, 0); + } else { + pthread_mutex_unlock(&_fork_gil); + } + ALLOW_SIGNALS; + return rc; +} + /** * Creates new process. * diff --git a/third_party/dlmalloc/dlmalloc.c b/third_party/dlmalloc/dlmalloc.c index fa546f8cc03..b79113311cf 100644 --- a/third_party/dlmalloc/dlmalloc.c +++ b/third_party/dlmalloc/dlmalloc.c @@ -31,6 +31,7 @@ #define FOOTERS 1 #define MSPACES 1 #define ONLY_MSPACES 1 // enables scalable multi-threaded malloc +#define USE_SPIN_LOCKS 1 // only profitable using sched_getcpu() #else #define INSECURE 1 #define PROCEED_ON_ERROR 1 @@ -43,7 +44,6 @@ #define HAVE_MREMAP 1 #define HAVE_MORECORE 0 #define USE_LOCKS 2 -#define USE_SPIN_LOCKS 1 #define MORECORE_CONTIGUOUS 0 #define MALLOC_INSPECT_ALL 1 #define ABORT_ON_ASSERT_FAILURE 0