diff --git a/examples/greenbean.c b/examples/greenbean.c index d91cac66520..e92ceef8ffb 100644 --- a/examples/greenbean.c +++ b/examples/greenbean.c @@ -18,6 +18,7 @@ #include "libc/fmt/conv.h" #include "libc/fmt/itoa.h" #include "libc/intrin/kprintf.h" +#include "libc/intrin/wait0.internal.h" #include "libc/limits.h" #include "libc/log/check.h" #include "libc/log/log.h" @@ -333,11 +334,17 @@ int main(int argc, char *argv[]) { // clean up terminal line kprintf("\r\e[K"); - // clean up memory - for (i = 0; i < threads; ++i) { - if (stack) munmap(stack[i], GetStackSize()); - if (tls) free(tls[i]); + // join the workers + // this is how we guarantee stacks are safe to free + if (tls && stack) { + for (i = 0; i < threads; ++i) { + _wait0((int *)(tls[i] + 0x38)); + munmap(stack[i], GetStackSize()); + free(tls[i]); + } } + + // clean up memory free(hostips); free(stack); free(tls); diff --git a/libc/calls/getloadavg-nt.c b/libc/calls/getloadavg-nt.c index aad45d40966..234bd45fcfb 100644 --- a/libc/calls/getloadavg-nt.c +++ b/libc/calls/getloadavg-nt.c @@ -29,7 +29,7 @@ static int cpus; static double load; -_Alignas(64) static int lock; +_Alignas(64) static char lock; static struct NtFileTime idle1, kern1, user1; textwindows int sys_getloadavg_nt(double *a, int n) { diff --git a/libc/intrin/once.h b/libc/intrin/once.h index 8aa755ad9dc..52eb74f2de3 100644 --- a/libc/intrin/once.h +++ b/libc/intrin/once.h @@ -2,20 +2,20 @@ #define COSMOPOLITAN_LIBC_INTRIN_ONCE_H_ #include "libc/intrin/spinlock.h" -#define _once(x) \ - ({ \ - typeof(x) oncerc; \ - static bool once; \ - static typeof(oncerc) onceresult; \ - _Alignas(64) static int oncelock; \ - _spinlock(&oncelock); \ - if (once) { \ - oncerc = onceresult; \ - } else { \ - oncerc = onceresult = x; \ - } \ - _spunlock(&oncelock); \ - oncerc; \ +#define _once(x) \ + ({ \ + typeof(x) oncerc; \ + static bool once; \ + static typeof(oncerc) onceresult; \ + _Alignas(64) static char oncelock; \ + _spinlock(&oncelock); \ + if (once) { \ + oncerc = onceresult; \ + } else { \ + oncerc = onceresult = x; \ + } \ + _spunlock(&oncelock); \ + oncerc; \ }) #endif /* COSMOPOLITAN_LIBC_INTRIN_ONCE_H_ */ diff --git a/libc/intrin/pthread.h b/libc/intrin/pthread.h index 1c426742483..ac96d8f2350 100644 --- a/libc/intrin/pthread.h +++ b/libc/intrin/pthread.h @@ -7,9 +7,9 @@ COSMOPOLITAN_C_START_ #define PTHREAD_ONCE_INIT 0 -#define PTHREAD_MUTEX_DEFAULT PTHREAD_MUTEX_RECURSIVE -#define PTHREAD_MUTEX_NORMAL 0 -#define PTHREAD_MUTEX_RECURSIVE 1 +#define PTHREAD_MUTEX_DEFAULT PTHREAD_MUTEX_NORMAL +#define PTHREAD_MUTEX_RECURSIVE 0 +#define PTHREAD_MUTEX_NORMAL 1 #define PTHREAD_MUTEX_ERRORCHECK 2 #define PTHREAD_MUTEX_STALLED 0 #define PTHREAD_MUTEX_ROBUST 1 @@ -26,7 +26,7 @@ typedef int pthread_once_t; typedef struct { int attr; int reent; - _Atomic(int) owner; + _Atomic(int) lock; _Atomic(int) waits; } pthread_mutex_t; diff --git a/libc/intrin/pthread_mutex_lock.c b/libc/intrin/pthread_mutex_lock.c index c2fc26d00f0..049cb8200ec 100644 --- a/libc/intrin/pthread_mutex_lock.c +++ b/libc/intrin/pthread_mutex_lock.c @@ -29,11 +29,12 @@ * @return 0 on success, or error number on failure */ int pthread_mutex_lock(pthread_mutex_t *mutex) { - int me, owner; - unsigned tries; + int i, me, owner, tries; for (tries = 0, me = gettid();;) { - owner = 0; - if (atomic_compare_exchange_strong(&mutex->owner, &owner, me)) { + owner = atomic_load_explicit(&mutex->lock, memory_order_relaxed); + if (!owner && atomic_compare_exchange_weak_explicit( + &mutex->lock, &owner, me, memory_order_acquire, + memory_order_relaxed)) { break; } else if (owner == me) { if (mutex->attr != PTHREAD_MUTEX_ERRORCHECK) { @@ -42,15 +43,17 @@ int pthread_mutex_lock(pthread_mutex_t *mutex) { return EDEADLK; } } - atomic_fetch_add(&mutex->waits, 1); - if (!IsLinux() || LinuxFutexWait((void *)&mutex->owner, owner, 0)) { - if (++tries & 7) { - __builtin_ia32_pause(); - } else { - sched_yield(); + if (tries < 7) { + for (i = 0; i != 1 << tries; i++) { } + tries++; + } else if (IsLinux()) { + atomic_fetch_add(&mutex->waits, 1); + LinuxFutexWait(&mutex->lock, owner, 0); + atomic_fetch_sub(&mutex->waits, 1); + } else { + sched_yield(); } - atomic_fetch_sub(&mutex->waits, 1); } ++mutex->reent; return 0; diff --git a/libc/intrin/pthread_mutex_trylock.c b/libc/intrin/pthread_mutex_trylock.c index 20063a8567e..8e995a4d47c 100644 --- a/libc/intrin/pthread_mutex_trylock.c +++ b/libc/intrin/pthread_mutex_trylock.c @@ -29,7 +29,7 @@ int pthread_mutex_trylock(pthread_mutex_t *mutex) { int rc, me, owner; me = gettid(); owner = 0; - if (!atomic_compare_exchange_strong(&mutex->owner, &owner, me) && + if (!atomic_compare_exchange_strong(&mutex->lock, &owner, me) && owner == me) { rc = 0; ++mutex->reent; diff --git a/libc/intrin/pthread_mutex_unlock.c b/libc/intrin/pthread_mutex_unlock.c index 3a9238cd82a..265c507c0b0 100644 --- a/libc/intrin/pthread_mutex_unlock.c +++ b/libc/intrin/pthread_mutex_unlock.c @@ -31,14 +31,14 @@ */ int pthread_mutex_unlock(pthread_mutex_t *mutex) { int owner; - if (mutex->attr == PTHREAD_MUTEX_ERRORCHECK && mutex->owner != gettid()) { + if (mutex->attr == PTHREAD_MUTEX_ERRORCHECK && mutex->lock != gettid()) { return EPERM; } if (!--mutex->reent) { - atomic_store_explicit(&mutex->owner, 0, memory_order_relaxed); + atomic_store_explicit(&mutex->lock, 0, memory_order_relaxed); if (IsLinux() && atomic_load_explicit(&mutex->waits, memory_order_acquire)) { - LinuxFutexWake(&mutex->owner, 1); + LinuxFutexWake(&mutex->lock, 1); } } return 0; diff --git a/libc/intrin/spinlock.h b/libc/intrin/spinlock.h index 8ee2690cb6a..9294432b4ab 100644 --- a/libc/intrin/spinlock.h +++ b/libc/intrin/spinlock.h @@ -20,20 +20,19 @@ __atomic_store(__lock, &__x, __ATOMIC_RELEASE); \ }) -#define _spinlock_tiny(lock) \ - ({ \ - autotype(lock) __lock = (lock); \ - while (_trylock(__lock)) { \ - __builtin_ia32_pause(); \ - } \ - 0; \ +#define _spinlock_tiny(lock) \ + ({ \ + while (_trylock(lock)) { \ + __builtin_ia32_pause(); \ + } \ + 0; \ }) #define _spinlock_cooperative(lock) \ ({ \ - autotype(lock) __lock = (lock); \ - typeof(*__lock) __x; \ + char __x; \ unsigned __tries = 0; \ + char *__lock = (lock); \ for (;;) { \ __atomic_load(__lock, &__x, __ATOMIC_RELAXED); \ if (!__x && !_trylock(__lock)) { \ @@ -47,6 +46,27 @@ 0; \ }) +#define _spinlock_cooperative_(lock) \ + ({ \ + char __x; \ + volatile int __i; \ + unsigned __tries = 0; \ + char *__lock = (lock); \ + for (;;) { \ + __atomic_load(__lock, &__x, __ATOMIC_RELAXED); \ + if (!__x && !_trylock(__lock)) { \ + break; \ + } else if (__tries < 7) { \ + for (__i = 0; __i != 1 << __tries; __i++) { \ + } \ + __tries++; \ + } else { \ + _spinlock_yield(); \ + } \ + } \ + 0; \ + }) + #define _trylock(lock) __atomic_test_and_set(lock, __ATOMIC_SEQ_CST) void _spinlock_yield(void); diff --git a/libc/intrin/wait0.internal.h b/libc/intrin/wait0.internal.h new file mode 100644 index 00000000000..28d104629be --- /dev/null +++ b/libc/intrin/wait0.internal.h @@ -0,0 +1,20 @@ +#ifndef COSMOPOLITAN_LIBC_INTRIN_WAIT0_H_ +#define COSMOPOLITAN_LIBC_INTRIN_WAIT0_H_ +#include "libc/bits/atomic.h" +#include "libc/calls/calls.h" +#include "libc/dce.h" +#include "libc/linux/futex.h" + +#define _wait0(ptid) \ + do { \ + int x; \ + if (!(x = atomic_load_explicit(ptid, memory_order_relaxed))) { \ + break; \ + } else if (IsLinux()) { \ + LinuxFutexWait(ptid, x, 0); \ + } else { \ + sched_yield(); \ + } \ + } while (1) + +#endif /* COSMOPOLITAN_LIBC_INTRIN_WAIT0_H_ */ diff --git a/libc/runtime/clone.c b/libc/runtime/clone.c index 88f7b87b694..73a8e746a55 100644 --- a/libc/runtime/clone.c +++ b/libc/runtime/clone.c @@ -63,7 +63,7 @@ struct CloneArgs { int64_t tid64; }; union { - int lock; + char lock; void *pstack; }; int *ctid; diff --git a/libc/testlib/showerror.c b/libc/testlib/showerror.c index 680ee7ef6e4..320e2faccf2 100644 --- a/libc/testlib/showerror.c +++ b/libc/testlib/showerror.c @@ -32,7 +32,7 @@ const char *testlib_showerror_func; const char *testlib_showerror_isfatal; const char *testlib_showerror_macro; const char *testlib_showerror_symbol; -_Alignas(64) static int testlib_showerror_lock; +_Alignas(64) static char testlib_showerror_lock; testonly void testlib_showerror(const char *file, int line, const char *func, const char *method, const char *symbol, diff --git a/libc/thread/join.c b/libc/thread/join.c index c07e98482fe..05dedb13421 100644 --- a/libc/thread/join.c +++ b/libc/thread/join.c @@ -22,7 +22,6 @@ #include "libc/dce.h" #include "libc/errno.h" #include "libc/intrin/asan.internal.h" -#include "libc/intrin/spinlock.h" #include "libc/runtime/runtime.h" #include "libc/str/str.h" #include "libc/sysv/consts/futex.h" diff --git a/test/libc/calls/reservefd_test.c b/test/libc/calls/reservefd_test.c index 5c2a314f791..ace2bb5be05 100644 --- a/test/libc/calls/reservefd_test.c +++ b/test/libc/calls/reservefd_test.c @@ -25,6 +25,7 @@ #include "libc/errno.h" #include "libc/intrin/kprintf.h" #include "libc/intrin/spinlock.h" +#include "libc/intrin/wait0.internal.h" #include "libc/macros.internal.h" #include "libc/nexgen32e/threaded.h" #include "libc/rand/rand.h" @@ -128,7 +129,7 @@ TEST(reservefd, tortureTest) { (int *)(tls[i] + 0x38)); } for (i = 0; i < THREADS; ++i) { - _spinlock((int *)(tls[i] + 0x38)); + _wait0((int *)(tls[i] + 0x38)); } // EXPECT_SYS(0, 0, sigaction(SIGALRM, &oldsa, 0)); // EXPECT_SYS(0, 0, setitimer(ITIMER_REAL, &oldit, 0)); diff --git a/test/libc/intrin/pthread_mutex_lock_test.c b/test/libc/intrin/pthread_mutex_lock_test.c index 79d0a6c36fb..4cfc7ea3fc9 100644 --- a/test/libc/intrin/pthread_mutex_lock_test.c +++ b/test/libc/intrin/pthread_mutex_lock_test.c @@ -22,7 +22,9 @@ #include "libc/intrin/kprintf.h" #include "libc/intrin/pthread.h" #include "libc/intrin/spinlock.h" +#include "libc/intrin/wait0.internal.h" #include "libc/nexgen32e/threaded.h" +#include "libc/runtime/internal.h" #include "libc/runtime/runtime.h" #include "libc/runtime/stack.h" #include "libc/sysv/consts/clone.h" @@ -32,24 +34,23 @@ #include "libc/testlib/testlib.h" #include "libc/thread/thread.h" -static char tls[64]; +#define THREADS 8 +#define ITERATIONS 512 -int PutProcessInThreadingMode(void *p) { - return 0; -} +char *stack[THREADS]; +char tls[THREADS][64]; -void SetUp(void) { - clone(PutProcessInThreadingMode, - mmap(0, GetStackSize(), PROT_READ | PROT_WRITE, - MAP_STACK | MAP_ANONYMOUS, -1, 0), - GetStackSize(), - CLONE_THREAD | CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | - CLONE_SETTLS, - 0, 0, __initialize_tls(tls), sizeof(tls), 0); +__attribute__((__constructor__)) void init(void) { + __enable_tls(); + __enable_threads(); } TEST(pthread_mutex_lock, normal) { pthread_mutex_t lock; + pthread_mutexattr_t attr; + ASSERT_EQ(0, pthread_mutexattr_init(&attr)); + ASSERT_EQ(0, pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_NORMAL)); + ASSERT_EQ(0, pthread_mutex_init(&lock, &attr)); ASSERT_EQ(0, pthread_mutex_init(&lock, 0)); ASSERT_EQ(0, pthread_mutex_lock(&lock)); ASSERT_EQ(0, pthread_mutex_unlock(&lock)); @@ -89,6 +90,78 @@ TEST(pthread_mutex_lock, errorcheck) { ASSERT_EQ(0, pthread_mutexattr_destroy(&attr)); } +int count; +_Atomic(int) finished; +_Alignas(64) char slock; +pthread_mutex_t lock; + +int MutexWorker(void *p) { + int i; + for (i = 0; i < ITERATIONS; ++i) { + pthread_mutex_lock(&lock); + ++count; + pthread_mutex_unlock(&lock); + } + return 0; +} + +TEST(pthread_mutex_lock, contention) { + int i; + count = 0; + for (i = 0; i < THREADS; ++i) { + clone(MutexWorker, + (stack[i] = mmap(0, GetStackSize(), PROT_READ | PROT_WRITE, + MAP_STACK | MAP_ANONYMOUS, -1, 0)), + GetStackSize(), + CLONE_THREAD | CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | + CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID | CLONE_SETTLS, + 0, 0, __initialize_tls(tls[i]), sizeof(tls[i]), + (int *)(tls[i] + 0x38)); + } + for (i = 0; i < THREADS; ++i) { + _wait0((int *)(tls[i] + 0x38)); + } + ASSERT_EQ(THREADS * ITERATIONS, count); + for (i = 0; i < THREADS; ++i) { + munmap(stack[i], GetStackSize()); + } +} + +int SpinlockWorker(void *p) { + int i; + for (i = 0; i < ITERATIONS; ++i) { + _spinlock(&slock); + ++count; + _spunlock(&slock); + } + ++finished; + return 0; +} + +TEST(_spinlock, contention) { + int i; + count = 0; + finished = 0; + for (i = 0; i < THREADS; ++i) { + clone(SpinlockWorker, + (stack[i] = mmap(0, GetStackSize(), PROT_READ | PROT_WRITE, + MAP_STACK | MAP_ANONYMOUS, -1, 0)), + GetStackSize(), + CLONE_THREAD | CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | + CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID | CLONE_SETTLS, + 0, 0, __initialize_tls(tls[i]), sizeof(tls[i]), + (int *)(tls[i] + 0x38)); + } + for (i = 0; i < THREADS; ++i) { + _wait0((int *)(tls[i] + 0x38)); + } + ASSERT_EQ(THREADS, finished); + ASSERT_EQ(THREADS * ITERATIONS, count); + for (i = 0; i < THREADS; ++i) { + munmap(stack[i], GetStackSize()); + } +} + BENCH(pthread_mutex_lock, bench) { char schar = 0; pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; @@ -100,4 +173,6 @@ BENCH(pthread_mutex_lock, bench) { (_spinlock_tiny(&schar), _spunlock(&schar))); EZBENCH2("_spinlock_coop", donothing, (_spinlock_cooperative(&schar), _spunlock(&schar))); + EZBENCH2("content mut", donothing, pthread_mutex_lock_contention()); + EZBENCH2("content spin", donothing, _spinlock_contention()); } diff --git a/test/libc/runtime/clone_test.c b/test/libc/runtime/clone_test.c index 5fd0384a18e..25fcb6b2da9 100644 --- a/test/libc/runtime/clone_test.c +++ b/test/libc/runtime/clone_test.c @@ -21,6 +21,7 @@ #include "libc/errno.h" #include "libc/intrin/kprintf.h" #include "libc/intrin/spinlock.h" +#include "libc/intrin/wait0.internal.h" #include "libc/log/backtrace.internal.h" #include "libc/mem/mem.h" #include "libc/nexgen32e/gettls.h" @@ -102,14 +103,13 @@ int CloneTest1(void *arg) { TEST(clone, test1) { int ptid = 0; *childetid = -1; - _seizelock(childetid, -1); ASSERT_NE(-1, (tid = clone(CloneTest1, stack, GetStackSize(), CLONE_THREAD | CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID | CLONE_SETTLS, (void *)23, &ptid, tls, 64, childetid))); - _spinlock(childetid); // CLONE_CHILD_CLEARTID + _wait0(childetid); // CLONE_CHILD_CLEARTID ASSERT_NE(gettid(), tid); ASSERT_EQ(tid, ptid); ASSERT_EQ(42, x); @@ -174,7 +174,7 @@ TEST(clone, tlsSystemCallsErrno_wontClobberMainThreadBecauseTls) { } sysbarrier = 1; for (i = 0; i < 8; ++i) { - _spinlock((int *)(tls[i] + 0x38)); + _wait0((int *)(tls[i] + 0x38)); free(tls[i]); munmap(stack[i], GetStackSize()); } diff --git a/test/libc/stdio/dtoa_test.c b/test/libc/stdio/dtoa_test.c index 8d2a315d677..1ccd2a6d904 100644 --- a/test/libc/stdio/dtoa_test.c +++ b/test/libc/stdio/dtoa_test.c @@ -19,6 +19,7 @@ #include "libc/calls/calls.h" #include "libc/fmt/fmt.h" #include "libc/intrin/spinlock.h" +#include "libc/intrin/wait0.internal.h" #include "libc/math.h" #include "libc/runtime/stack.h" #include "libc/stdio/stdio.h" @@ -71,7 +72,7 @@ TEST(dtoa, test) { (int *)(tls[i] + 0x38)); } for (i = 0; i < THREADS; ++i) { - _spinlock((int *)(tls[i] + 0x38)); + _wait0((int *)(tls[i] + 0x38)); } } diff --git a/tool/build/mkdeps.c b/tool/build/mkdeps.c index a27647aef49..7501a3cfcac 100644 --- a/tool/build/mkdeps.c +++ b/tool/build/mkdeps.c @@ -30,6 +30,7 @@ #include "libc/fmt/fmt.h" #include "libc/intrin/kprintf.h" #include "libc/intrin/spinlock.h" +#include "libc/intrin/wait0.internal.h" #include "libc/log/check.h" #include "libc/log/log.h" #include "libc/macros.internal.h" @@ -131,10 +132,10 @@ struct Sauce *sauces; struct Strings strings; struct Sources sources; const char *buildroot; -_Alignas(64) int galock; -_Alignas(64) int readlock; -_Alignas(64) int writelock; -_Alignas(64) int reportlock; +_Alignas(64) char galock; +_Alignas(64) char readlock; +_Alignas(64) char writelock; +_Alignas(64) char reportlock; unsigned Hash(const void *s, size_t l) { return max(1, crc32c(0, s, l)); @@ -316,7 +317,7 @@ void LoadRelationships(int argc, char *argv[]) { } } for (i = 0; i < threads; ++i) { - _spinlock((int *)(tls[i] + 0x38)); + _wait0((int *)(tls[i] + 0x38)); } getargs_destroy(&ga); } @@ -431,7 +432,7 @@ void Explore(void) { } } for (i = 0; i < threads; ++i) { - _spinlock((int *)(tls[i] + 0x38)); + _wait0((int *)(tls[i] + 0x38)); } } diff --git a/tool/build/wastecpu.c b/tool/build/wastecpu.c index bce1ea198a5..fbb1d86e106 100644 --- a/tool/build/wastecpu.c +++ b/tool/build/wastecpu.c @@ -18,6 +18,7 @@ ╚─────────────────────────────────────────────────────────────────────────────*/ #include "libc/calls/struct/sigaction.h" #include "libc/intrin/spinlock.h" +#include "libc/intrin/wait0.internal.h" #include "libc/log/log.h" #include "libc/mem/mem.h" #include "libc/nexgen32e/threaded.h" @@ -71,7 +72,7 @@ int main(int argc, char *argv[]) { usleep(1000); } for (i = 0; i < n; ++i) { - _spinlock((int *)(tls[i] + 0x38)); + _wait0((int *)(tls[i] + 0x38)); free(tls[i]); } } diff --git a/tool/net/redbean.c b/tool/net/redbean.c index 739afabb499..db0646c3d04 100644 --- a/tool/net/redbean.c +++ b/tool/net/redbean.c @@ -36,6 +36,7 @@ #include "libc/fmt/itoa.h" #include "libc/intrin/kprintf.h" #include "libc/intrin/nomultics.internal.h" +#include "libc/intrin/wait0.internal.h" #include "libc/log/check.h" #include "libc/log/log.h" #include "libc/macros.internal.h" @@ -6357,7 +6358,7 @@ static int ExitWorker(void) { } if (monitortty) { terminatemonitor = true; - _spinlock(monitortid); + _wait0(monitortid); } _Exit(0); } @@ -7295,7 +7296,7 @@ void RedBean(int argc, char *argv[]) { if (!IsTiny()) { if (monitortty) { terminatemonitor = true; - _spinlock(monitortid); + _wait0(monitortid); munmap(monitorstack, GetStackSize()); free(monitortls); }