Skip to content

Commit

Permalink
Make malloc scalable on all platforms
Browse files Browse the repository at this point in the history
It turns out sched_getcpu() didn't work on many platforms. So the system
call now has tests and is well documented. We now employ new workarounds
on platforms where it isn't supported in our malloc() implementation. It
was previously the case that malloc() was only scalable on Linux/Windows
for x86-64. Now the other platforms are scalable too.
  • Loading branch information
jart committed Aug 16, 2024
1 parent 3fd275f commit 0a79c69
Show file tree
Hide file tree
Showing 9 changed files with 459 additions and 99 deletions.
15 changes: 15 additions & 0 deletions examples/nproc.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#if 0
/*─────────────────────────────────────────────────────────────────╗
│ To the extent possible under law, Justine Tunney has waived │
│ all copyright and related or neighboring rights to this file, │
│ as it is written in the following disclaimers: │
│ • http://unlicense.org/ │
│ • http://creativecommons.org/publicdomain/zero/1.0/ │
╚─────────────────────────────────────────────────────────────────*/
#endif
#include <cosmo.h>
#include <stdio.h>

/**
 * Prints the value returned by __get_cpu_count() as a decimal integer
 * followed by a newline (presumably the number of CPUs; see <cosmo.h>
 * to confirm).
 *
 * @param argc is unused
 * @param argv is unused
 * @return 0 always
 */
int main(int argc, char *argv[]) {
  int count = __get_cpu_count();
  printf("%d\n", count);
  return 0;
}
78 changes: 51 additions & 27 deletions libc/calls/getcpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,39 +30,63 @@

int sys_getcpu(unsigned *opt_cpu, unsigned *opt_node, void *tcache);

/**
* Determines ID of CPU on which thread is currently scheduled.
*
* This is the same as sched_getcpu(), except it also supports returning
* the ID of the current NUMA node. On some platforms this functionality
* isn't available, in which case `out_opt_node` is always set to 0.
*/
int getcpu(unsigned *out_opt_cpu, unsigned *out_opt_node) {
unsigned cpu;
unsigned node;
if (X86_HAVE(RDTSCP)) {

if (IsWindows()) {
struct NtProcessorNumber pn;
if (out_opt_cpu) {
GetCurrentProcessorNumberEx(&pn);
*out_opt_cpu = 64 * pn.Group + pn.Number;
}
if (out_opt_node) {
unsigned short node16;
if (GetNumaProcessorNodeEx(&pn, &node16)) {
*out_opt_node = node16;
} else {
return __winerr();
}
}
return 0;
}

#ifdef __x86_64__
if (X86_HAVE(RDTSCP) && (IsLinux() || IsFreebsd())) {
unsigned tsc_aux;
rdtscp(&tsc_aux);
cpu = TSC_AUX_CORE(tsc_aux);
node = TSC_AUX_NODE(tsc_aux);
} else if (IsWindows()) {
struct NtProcessorNumber pn;
GetCurrentProcessorNumberEx(&pn);
cpu = 64 * pn.Group + pn.Number;
unsigned short node16;
if (GetNumaProcessorNodeEx(&pn, &node16)) {
node = node16;
} else {
return __winerr();
if (out_opt_cpu)
*out_opt_cpu = TSC_AUX_CORE(tsc_aux);
if (out_opt_node)
*out_opt_node = TSC_AUX_NODE(tsc_aux);
return 0;
}
#endif

if (IsXnu() || IsOpenbsd() || IsNetbsd() || IsFreebsd()) {
if (out_opt_cpu) {
int rc = sched_getcpu();
if (rc == -1)
return -1;
*out_opt_cpu = rc;
}
} else if (IsAarch64()) {
long tpidr_el0;
asm("mrs\t%0,tpidr_el0" : "=r"(tpidr_el0));
cpu = tpidr_el0 & 255;
node = 0;
} else {
int rc = sys_getcpu(&cpu, &node, 0);
if (rc == -1)
return -1;
if (out_opt_node)
*out_opt_node = 0;
return 0;
}
if (out_opt_cpu) {

unsigned cpu, node;
int rc = sys_getcpu(&cpu, &node, 0);
if (rc == -1)
return -1;
if (out_opt_cpu)
*out_opt_cpu = cpu;
}
if (out_opt_node) {
if (out_opt_node)
*out_opt_node = node;
}
return 0;
}
80 changes: 65 additions & 15 deletions libc/calls/sched_getcpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,32 +23,82 @@
#include "libc/nexgen32e/x86feature.h"
#include "libc/nt/struct/processornumber.h"
#include "libc/nt/synchronization.h"
#include "libc/runtime/syslib.internal.h"
#include "libc/sysv/errfuns.h"

int sys_getcpu(unsigned *opt_cpu, unsigned *opt_node, void *tcache);

/**
* Returns ID of CPU on which thread is currently scheduled.
*
* This function is supported on the following platforms:
*
* - x86-64
*
* - Linux: rdtscp
* - FreeBSD: rdtscp
* - Windows: win32
* - OpenBSD: unsupported
* - NetBSD: unsupported
* - MacOS: unsupported
*
* - aarch64
*
* - Linux: syscall
* - FreeBSD: syscall
* - MacOS: supported
*
* @return cpu number on success, or -1 w/ errno
*/
int sched_getcpu(void) {
if (X86_HAVE(RDTSCP)) {
unsigned tsc_aux;
rdtscp(&tsc_aux);
return TSC_AUX_CORE(tsc_aux);
} else if (IsAarch64()) {
long tpidr_el0;
asm("mrs\t%0,tpidr_el0" : "=r"(tpidr_el0));
return tpidr_el0 & 255;
} else if (IsWindows()) {

if (IsWindows()) {
struct NtProcessorNumber pn;
GetCurrentProcessorNumberEx(&pn);
return 64 * pn.Group + pn.Number;
} else {
unsigned cpu = 0;
int rc = sys_getcpu(&cpu, 0, 0);
if (rc == -1)
return -1;
return cpu;
}

#ifdef __x86_64__
if (X86_HAVE(RDTSCP) && (IsLinux() || IsFreebsd())) {
// Only the Linux, FreeBSD, and Windows kernels can be counted upon
// to populate the TSC_AUX register with the current cpu number.
unsigned tsc_aux;
rdtscp(&tsc_aux);
return TSC_AUX_CORE(tsc_aux);
}
#endif

#ifdef __aarch64__
if (IsXnu()) {
// pthread_cpu_number_np() is defined by MacOS 11.0+ (Big Sur) in
// the SDK pthread.h header file, even though there's no man page
if (__syslib && __syslib->__version >= 9) {
errno_t err;
size_t out = 0;
if ((err = __syslib->__pthread_cpu_number_np(&out))) {
errno = err;
return -1;
}
return out;
} else {
errno = ENOSYS; // upgrade your ape loader
return -1; // cc -o /usr/local/bin/ape ape/ape-m1.c
}
}
#endif

#ifdef __aarch64__
if (IsFreebsd()) {
register int x0 asm("x0");
register int x8 asm("x8") = 581; // sched_getcpu
asm volatile("svc\t0" : "=r"(x0) : "r"(x8) : "memory");
return x0;
}
#endif

unsigned cpu = 0;
int rc = sys_getcpu(&cpu, 0, 0);
if (rc == -1)
return -1;
return cpu;
}
122 changes: 88 additions & 34 deletions libc/intrin/atomic.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,48 +13,26 @@
*/

typedef enum {
memory_order_relaxed,
memory_order_consume,
memory_order_acquire,
memory_order_release,
memory_order_acq_rel,
memory_order_seq_cst,
memory_order_relaxed = __ATOMIC_RELAXED,
memory_order_consume = __ATOMIC_CONSUME,
memory_order_acquire = __ATOMIC_ACQUIRE,
memory_order_release = __ATOMIC_RELEASE,
memory_order_acq_rel = __ATOMIC_ACQ_REL,
memory_order_seq_cst = __ATOMIC_SEQ_CST
} memory_order;

#define ATOMIC_VAR_INIT(...) __VA_ARGS__
#if !(defined __STDC_VERSION__ && __STDC_VERSION__ > 201710L)
#define ATOMIC_VAR_INIT(...) __VA_ARGS__
#endif

#define atomic_is_lock_free(obj) ((void)(obj), sizeof(obj) <= sizeof(void *))

#define atomic_flag atomic_bool
#define ATOMIC_FLAG_INIT ATOMIC_VAR_INIT(0)
#define ATOMIC_FLAG_INIT false
#define atomic_flag_test_and_set_explicit(x, order) \
atomic_exchange_explicit(x, 1, order)
#define atomic_flag_clear_explicit(x, order) atomic_store_explicit(x, 0, order)

#define atomic_compare_exchange_strong(pObject, pExpected, desired) \
atomic_compare_exchange_strong_explicit( \
pObject, pExpected, desired, memory_order_seq_cst, memory_order_seq_cst)
#define atomic_compare_exchange_weak(pObject, pExpected, desired) \
atomic_compare_exchange_weak_explicit( \
pObject, pExpected, desired, memory_order_seq_cst, memory_order_seq_cst)
#define atomic_exchange(pObject, desired) \
atomic_exchange_explicit(pObject, desired, memory_order_seq_cst)
#define atomic_fetch_add(pObject, operand) \
atomic_fetch_add_explicit(pObject, operand, memory_order_seq_cst)
#define atomic_fetch_and(pObject, operand) \
atomic_fetch_and_explicit(pObject, operand, memory_order_seq_cst)
#define atomic_fetch_or(pObject, operand) \
atomic_fetch_or_explicit(pObject, operand, memory_order_seq_cst)
#define atomic_fetch_sub(pObject, operand) \
atomic_fetch_sub_explicit(pObject, operand, memory_order_seq_cst)
#define atomic_fetch_xor(pObject, operand) \
atomic_fetch_xor_explicit(pObject, operand, memory_order_seq_cst)
#define atomic_load(pObject) atomic_load_explicit(pObject, memory_order_seq_cst)
#define atomic_store(pObject, desired) \
atomic_store_explicit(pObject, desired, memory_order_seq_cst)
#define atomic_flag_test_and_set(x) \
atomic_flag_test_and_set_explicit(x, memory_order_seq_cst)
#define atomic_flag_clear(x) atomic_flag_clear_explicit(x, memory_order_seq_cst)

#if defined(__CLANG_ATOMIC_BOOL_LOCK_FREE)

#define atomic_init(obj, value) __c11_atomic_init(obj, value)
Expand Down Expand Up @@ -84,9 +62,35 @@ typedef enum {
#define atomic_store_explicit(object, desired, order) \
__c11_atomic_store(object, desired, order)

#define atomic_compare_exchange_strong(pObject, pExpected, desired) \
atomic_compare_exchange_strong_explicit( \
pObject, pExpected, desired, memory_order_seq_cst, memory_order_seq_cst)
#define atomic_compare_exchange_weak(pObject, pExpected, desired) \
atomic_compare_exchange_weak_explicit( \
pObject, pExpected, desired, memory_order_seq_cst, memory_order_seq_cst)
#define atomic_exchange(pObject, desired) \
atomic_exchange_explicit(pObject, desired, memory_order_seq_cst)
#define atomic_fetch_add(pObject, operand) \
atomic_fetch_add_explicit(pObject, operand, memory_order_seq_cst)
#define atomic_fetch_and(pObject, operand) \
atomic_fetch_and_explicit(pObject, operand, memory_order_seq_cst)
#define atomic_fetch_or(pObject, operand) \
atomic_fetch_or_explicit(pObject, operand, memory_order_seq_cst)
#define atomic_fetch_sub(pObject, operand) \
atomic_fetch_sub_explicit(pObject, operand, memory_order_seq_cst)
#define atomic_fetch_xor(pObject, operand) \
atomic_fetch_xor_explicit(pObject, operand, memory_order_seq_cst)
#define atomic_load(pObject) atomic_load_explicit(pObject, memory_order_seq_cst)
#define atomic_store(pObject, desired) \
atomic_store_explicit(pObject, desired, memory_order_seq_cst)
#define atomic_flag_test_and_set(x) \
atomic_flag_test_and_set_explicit(x, memory_order_seq_cst)
#define atomic_flag_clear(x) atomic_flag_clear_explicit(x, memory_order_seq_cst)

#elif (__GNUC__ + 0) * 100 + (__GNUC_MINOR__ + 0) >= 407

#define atomic_init(obj, value) ((void)(*(obj) = (value)))
#define atomic_init(obj, value) \
atomic_store_explicit(obj, value, __ATOMIC_RELAXED)
#define atomic_thread_fence(order) __atomic_thread_fence(order)
#define atomic_signal_fence(order) __atomic_signal_fence(order)
#define atomic_compare_exchange_strong_explicit(pObject, pExpected, desired, \
Expand All @@ -111,6 +115,31 @@ typedef enum {
#define atomic_store_explicit(pObject, desired, order) \
__atomic_store_n(pObject, desired, order)

#define atomic_compare_exchange_strong(pObject, pExpected, desired) \
atomic_compare_exchange_strong_explicit(pObject, pExpected, desired, \
__ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
#define atomic_compare_exchange_weak(pObject, pExpected, desired) \
atomic_compare_exchange_weak_explicit(pObject, pExpected, desired, \
__ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
#define atomic_exchange(pObject, desired) \
atomic_exchange_explicit(pObject, desired, __ATOMIC_SEQ_CST)
#define atomic_fetch_add(pObject, operand) \
atomic_fetch_add_explicit(pObject, operand, __ATOMIC_SEQ_CST)
#define atomic_fetch_and(pObject, operand) \
atomic_fetch_and_explicit(pObject, operand, __ATOMIC_SEQ_CST)
#define atomic_fetch_or(pObject, operand) \
atomic_fetch_or_explicit(pObject, operand, __ATOMIC_SEQ_CST)
#define atomic_fetch_sub(pObject, operand) \
atomic_fetch_sub_explicit(pObject, operand, __ATOMIC_SEQ_CST)
#define atomic_fetch_xor(pObject, operand) \
atomic_fetch_xor_explicit(pObject, operand, __ATOMIC_SEQ_CST)
#define atomic_load(pObject) atomic_load_explicit(pObject, __ATOMIC_SEQ_CST)
#define atomic_store(pObject, desired) \
atomic_store_explicit(pObject, desired, __ATOMIC_SEQ_CST)
#define atomic_flag_test_and_set(x) \
atomic_flag_test_and_set_explicit(x, __ATOMIC_SEQ_CST)
#define atomic_flag_clear(x) atomic_flag_clear_explicit(x, __ATOMIC_SEQ_CST)

#elif (__GNUC__ + 0) * 100 + (__GNUC_MINOR__ + 0) >= 401

#define atomic_init(obj, value) ((void)(*(obj) = (value)))
Expand Down Expand Up @@ -210,6 +239,31 @@ typedef enum {
#define atomic_store_explicit(object, desired, order) \
((void)atomic_exchange_explicit(object, desired, order))

#define atomic_compare_exchange_strong(pObject, pExpected, desired) \
atomic_compare_exchange_strong_explicit( \
pObject, pExpected, desired, memory_order_seq_cst, memory_order_seq_cst)
#define atomic_compare_exchange_weak(pObject, pExpected, desired) \
atomic_compare_exchange_weak_explicit( \
pObject, pExpected, desired, memory_order_seq_cst, memory_order_seq_cst)
#define atomic_exchange(pObject, desired) \
atomic_exchange_explicit(pObject, desired, memory_order_seq_cst)
#define atomic_fetch_add(pObject, operand) \
atomic_fetch_add_explicit(pObject, operand, memory_order_seq_cst)
#define atomic_fetch_and(pObject, operand) \
atomic_fetch_and_explicit(pObject, operand, memory_order_seq_cst)
#define atomic_fetch_or(pObject, operand) \
atomic_fetch_or_explicit(pObject, operand, memory_order_seq_cst)
#define atomic_fetch_sub(pObject, operand) \
atomic_fetch_sub_explicit(pObject, operand, memory_order_seq_cst)
#define atomic_fetch_xor(pObject, operand) \
atomic_fetch_xor_explicit(pObject, operand, memory_order_seq_cst)
#define atomic_load(pObject) atomic_load_explicit(pObject, memory_order_seq_cst)
#define atomic_store(pObject, desired) \
atomic_store_explicit(pObject, desired, memory_order_seq_cst)
#define atomic_flag_test_and_set(x) \
atomic_flag_test_and_set_explicit(x, memory_order_seq_cst)
#define atomic_flag_clear(x) atomic_flag_clear_explicit(x, memory_order_seq_cst)

#else /* non-gcc or old gcc w/o x86 */
#error "atomic operations not supported with this compiler and/or architecture"
#endif
Expand Down
1 change: 1 addition & 0 deletions libc/runtime/syslib.internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ struct Syslib {
char *(*__dlerror)(void);
/* v9 (2024-01-31) */
int (*__pthread_cpu_number_np)(size_t *);
/* v10 (2024-05-02) */
long (*__sysctl)(int *, unsigned, void *, size_t *, void *, size_t);
long (*__sysctlbyname)(const char *, void *, size_t *, void *, size_t);
long (*__sysctlnametomib)(const char *, int *, size_t *);
Expand Down
Loading

0 comments on commit 0a79c69

Please sign in to comment.