From a2b1b4e335a9cc97e502369840fa1a47ed44bf2a Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Tue, 20 Aug 2024 16:42:30 -0400 Subject: [PATCH] Reduce size of Task object (#55515) Move the registers onto the stack, so that they only are present when the Task is actually switched out, saving memory when the Task is not running yet or already finished. It makes this mostly just a huge renaming job. On Linux x86_64 this reduces it from 376 bytes to 184 bytes. Has some additional advantages too, such as copy_stack tasks (e.g. with always_copy_stacks) can migrate to other threads before starting if they are not sticky. Also fixes a variable that got mixed up by #54639 and caused always_copy_stacks to abort, since the stack limits were wrong. Also now fixes https://github.com/JuliaLang/julia/issues/43124, though I am not quite confident enough in it to re-enable that test right now. --- src/gc-debug.c | 8 +- src/gc-stacks.c | 22 +- src/gc-stock.c | 10 +- src/init.c | 21 +- src/julia.h | 6 +- src/julia_internal.h | 3 +- src/julia_threads.h | 14 +- src/signals-unix.c | 6 +- src/stackwalk.c | 65 +++--- src/task.c | 503 +++++++++++++++++++++++-------------------- 10 files changed, 342 insertions(+), 316 deletions(-) diff --git a/src/gc-debug.c b/src/gc-debug.c index ec3c8d731edd8..19dd93af5f236 100644 --- a/src/gc-debug.c +++ b/src/gc-debug.c @@ -537,13 +537,13 @@ static void gc_scrub_task(jl_task_t *ta) char *low; char *high; - if (ta->copy_stack && ptls2 && ta == jl_atomic_load_relaxed(&ptls2->current_task)) { + if (ta->ctx.copy_stack && ptls2 && ta == jl_atomic_load_relaxed(&ptls2->current_task)) { low = (char*)ptls2->stackbase - ptls2->stacksize; high = (char*)ptls2->stackbase; } - else if (ta->stkbuf) { - low = (char*)ta->stkbuf; - high = (char*)ta->stkbuf + ta->bufsz; + else if (ta->ctx.stkbuf) { + low = (char*)ta->ctx.stkbuf; + high = (char*)ta->ctx.stkbuf + ta->ctx.bufsz; } else return; diff --git a/src/gc-stacks.c b/src/gc-stacks.c index 5706f4ce67c1d..65173d3a8df37 100644 --- a/src/gc-stacks.c +++ b/src/gc-stacks.c @@ -186,14 +186,14 @@ JL_DLLEXPORT void jl_free_stack(void *stkbuf, size_t bufsz) void jl_release_task_stack(jl_ptls_t ptls, jl_task_t *task) { // avoid adding an original thread stack to the free list - if (task == ptls->root_task && !task->copy_stack) + if (task == ptls->root_task && !task->ctx.copy_stack) return; - void *stkbuf = task->stkbuf; - size_t bufsz = task->bufsz; + void *stkbuf = task->ctx.stkbuf; + size_t bufsz = task->ctx.bufsz; if (bufsz <= pool_sizes[JL_N_STACK_POOLS - 1]) { unsigned pool_id = select_pool(bufsz); if (pool_sizes[pool_id] == bufsz) { - task->stkbuf = NULL; + task->ctx.stkbuf = NULL; #ifdef _COMPILER_ASAN_ENABLED_ __asan_unpoison_stack_memory((uintptr_t)stkbuf, bufsz); #endif @@ -296,17 +296,17 @@ void sweep_stack_pools(void) JL_NOTSAFEPOINT jl_task_t *t = (jl_task_t*)lst[n]; assert(jl_is_task(t)); if (gc_marked(jl_astaggedvalue(t)->bits.gc)) { - if (t->stkbuf == NULL) + if (t->ctx.stkbuf == NULL) ndel++; // jl_release_task_stack called else n++; } else { ndel++; - void *stkbuf = t->stkbuf; - size_t bufsz = t->bufsz; + void *stkbuf = t->ctx.stkbuf; + size_t bufsz = t->ctx.bufsz; if (stkbuf) { - t->stkbuf = NULL; + t->ctx.stkbuf = NULL; _jl_free_stack(ptls2, stkbuf, bufsz); } #ifdef _COMPILER_TSAN_ENABLED_ @@ -338,7 +338,7 @@ JL_DLLEXPORT jl_array_t *jl_live_tasks(void) continue; small_arraylist_t *live_tasks = &ptls2->gc_tls.heap.live_tasks; size_t n = mtarraylist_length(live_tasks); - l += n + (ptls2->root_task->stkbuf != NULL); + l += n + (ptls2->root_task->ctx.stkbuf != NULL); } l += l / 20; // add 5% for margin of estimation error jl_array_t *a = jl_alloc_vec_any(l); // may gc, changing the number of tasks and forcing us to reload everything @@ -350,7 +350,7 @@ JL_DLLEXPORT jl_array_t *jl_live_tasks(void) if (ptls2 == NULL) continue; jl_task_t *t = ptls2->root_task; - if (t->stkbuf != NULL) { + if (t->ctx.stkbuf != NULL) { if (j == l) goto restart; jl_array_data(a,void*)[j++] = t; @@ -359,7 +359,7 @@ JL_DLLEXPORT jl_array_t *jl_live_tasks(void) size_t n = mtarraylist_length(live_tasks); for (size_t i = 0; i < n; i++) { jl_task_t *t = (jl_task_t*)mtarraylist_get(live_tasks, i); - if (t->stkbuf != NULL) { + if (t->ctx.stkbuf != NULL) { if (j == l) goto restart; jl_array_data(a,void*)[j++] = t; diff --git a/src/gc-stock.c b/src/gc-stock.c index 1fc2087da6503..3ae14f378a2e7 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -2144,9 +2144,9 @@ FORCE_INLINE void gc_mark_outrefs(jl_ptls_t ptls, jl_gc_markqueue_t *mq, void *_ (ta, tid != -1 && ta == gc_all_tls_states[tid]->root_task)); } #ifdef COPY_STACKS - void *stkbuf = ta->stkbuf; - if (stkbuf && ta->copy_stack) { - gc_setmark_buf_(ptls, stkbuf, bits, ta->bufsz); + void *stkbuf = ta->ctx.stkbuf; + if (stkbuf && ta->ctx.copy_stack) { + gc_setmark_buf_(ptls, stkbuf, bits, ta->ctx.bufsz); // For gc_heap_snapshot_record: // TODO: attribute size of stack // TODO: edge to stack data @@ -2159,12 +2159,12 @@ FORCE_INLINE void gc_mark_outrefs(jl_ptls_t ptls, jl_gc_markqueue_t *mq, void *_ uintptr_t lb = 0; uintptr_t ub = (uintptr_t)-1; #ifdef COPY_STACKS - if (stkbuf && ta->copy_stack && !ta->ptls) { + if (stkbuf && ta->ctx.copy_stack && !ta->ptls) { int16_t tid = jl_atomic_load_relaxed(&ta->tid); assert(tid >= 0); jl_ptls_t ptls2 = gc_all_tls_states[tid]; ub = (uintptr_t)ptls2->stackbase; - lb = ub - ta->copy_stack; + lb = ub - ta->ctx.copy_stack; offset = (uintptr_t)stkbuf - lb; } #endif diff --git a/src/init.c b/src/init.c index eff786b564e54..9e6a695c71eb0 100644 --- a/src/init.c +++ b/src/init.c @@ -64,15 +64,19 @@ void jl_init_stack_limits(int ismaster, void **stack_lo, void **stack_hi) // threads since it seems to return bogus values for master thread on Linux // and possibly OSX. if (!ismaster) { -# if defined(_OS_LINUX_) +# if defined(_OS_LINUX_) || defined(_OS_FREEBSD_) pthread_attr_t attr; +#if defined(_OS_FREEBSD_) + pthread_attr_get_np(pthread_self(), &attr); +#else pthread_getattr_np(pthread_self(), &attr); +#endif void *stackaddr; size_t stacksize; pthread_attr_getstack(&attr, &stackaddr, &stacksize); pthread_attr_destroy(&attr); - *stack_hi = stackaddr; - *stack_lo = (char*)stackaddr - stacksize; + *stack_lo = stackaddr; + *stack_hi = (char*)stackaddr + stacksize; return; # elif defined(_OS_DARWIN_) extern void *pthread_get_stackaddr_np(pthread_t thread); @@ -80,19 +84,8 @@ void jl_init_stack_limits(int ismaster, void **stack_lo, void **stack_hi) pthread_t thread = pthread_self(); void *stackaddr = pthread_get_stackaddr_np(thread); size_t stacksize = pthread_get_stacksize_np(thread); - *stack_hi = stackaddr; *stack_lo = (char*)stackaddr - stacksize; - return; -# elif defined(_OS_FREEBSD_) - pthread_attr_t attr; - pthread_attr_init(&attr); - pthread_attr_get_np(pthread_self(), &attr); - void *stackaddr; - size_t stacksize; - pthread_attr_getstack(&attr, &stackaddr, &stacksize); - pthread_attr_destroy(&attr); *stack_hi = stackaddr; - *stack_lo = (char*)stackaddr - stacksize; return; # else # warning "Getting precise stack size for thread is not supported." diff --git a/src/julia.h b/src/julia.h index e211f31c6512c..074c50fd0aa21 100644 --- a/src/julia.h +++ b/src/julia.h @@ -2231,11 +2231,7 @@ typedef struct _jl_task_t { // current exception handler jl_handler_t *eh; // saved thread state - jl_ucontext_t ctx; - void *stkbuf; // malloc'd memory (either copybuf or stack) - size_t bufsz; // actual sizeof stkbuf - unsigned int copy_stack:31; // sizeof stack for copybuf - unsigned int started:1; + jl_ucontext_t ctx; // pointer into stkbuf, if suspended } jl_task_t; #define JL_TASK_STATE_RUNNABLE 0 diff --git a/src/julia_internal.h b/src/julia_internal.h index d4d1a3239785c..23a9c90edf8aa 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -65,7 +65,8 @@ static inline void asan_unpoison_task_stack(jl_task_t *ct, jl_jmp_buf *buf) that we're resetting to. The idea is to remove the poison from the frames that we're skipping over, since they won't be unwound. */ uintptr_t top = jmpbuf_sp(buf); - uintptr_t bottom = (uintptr_t)ct->stkbuf; + uintptr_t bottom = (uintptr_t)(ct->ctx.copy_stack ? (char*)ct->ptls->stackbase - ct->ptls->stacksize : (char*)ct->ctx.stkbuf); + //uintptr_t bottom = (uintptr_t)⊤ __asan_unpoison_stack_memory(bottom, top - bottom); } static inline void asan_unpoison_stack_memory(uintptr_t addr, size_t size) { diff --git a/src/julia_threads.h b/src/julia_threads.h index 9a2a8cec375f5..e56ff5edd6176 100644 --- a/src/julia_threads.h +++ b/src/julia_threads.h @@ -86,9 +86,13 @@ typedef ucontext_t _jl_ucontext_t; typedef struct { union { - _jl_ucontext_t ctx; - jl_stack_context_t copy_ctx; + _jl_ucontext_t *ctx; + jl_stack_context_t *copy_ctx; }; + void *stkbuf; // malloc'd memory (either copybuf or stack) + size_t bufsz; // actual sizeof stkbuf + unsigned int copy_stack:31; // sizeof stack for copybuf + unsigned int started:1; #if defined(_COMPILER_TSAN_ENABLED_) void *tsan_state; #endif @@ -155,13 +159,9 @@ typedef struct _jl_tls_states_t { struct _jl_task_t *previous_task; struct _jl_task_t *root_task; struct _jl_timing_block_t *timing_stack; + // This is the location of our copy_stack void *stackbase; size_t stacksize; - union { - _jl_ucontext_t base_ctx; // base context of stack - // This hack is needed to support always_copy_stacks: - jl_stack_context_t copy_stack_ctx; - }; // Temp storage for exception thrown in signal handler. Not rooted. struct _jl_value_t *sig_exception; // Temporary backtrace buffer. Scanned for gc roots when bt_size > 0. diff --git a/src/signals-unix.c b/src/signals-unix.c index edca523bed6d1..005422bea03d3 100644 --- a/src/signals-unix.c +++ b/src/signals-unix.c @@ -230,13 +230,13 @@ static pthread_t signals_thread; static int is_addr_on_stack(jl_task_t *ct, void *addr) { - if (ct->copy_stack) { + if (ct->ctx.copy_stack) { jl_ptls_t ptls = ct->ptls; return ((char*)addr > (char*)ptls->stackbase - ptls->stacksize && (char*)addr < (char*)ptls->stackbase); } - return ((char*)addr > (char*)ct->stkbuf && - (char*)addr < (char*)ct->stkbuf + ct->bufsz); + return ((char*)addr > (char*)ct->ctx.stkbuf && + (char*)addr < (char*)ct->ctx.stkbuf + ct->ctx.bufsz); } static void sigdie_handler(int sig, siginfo_t *info, void *context) diff --git a/src/stackwalk.c b/src/stackwalk.c index a63694e7c3b0c..15a9fddeac9a4 100644 --- a/src/stackwalk.c +++ b/src/stackwalk.c @@ -83,7 +83,7 @@ static int jl_unw_stepn(bt_cursor_t *cursor, jl_bt_element_t *bt_data, size_t *b skip--; } #endif -#if !defined(_OS_WINDOWS_) +#if !defined(_OS_WINDOWS_) // no point on windows, since RtlVirtualUnwind won't give us a second chance if the segfault happens in ntdll jl_jmp_buf *old_buf = jl_get_safe_restore(); jl_jmp_buf buf; jl_set_safe_restore(&buf); @@ -921,14 +921,12 @@ _os_ptr_munge(uintptr_t ptr) JL_NOTSAFEPOINT extern bt_context_t *jl_to_bt_context(void *sigctx); -static void jl_rec_backtrace(jl_task_t *t) JL_NOTSAFEPOINT +JL_DLLEXPORT size_t jl_record_backtrace(jl_task_t *t, jl_bt_element_t *bt_data, size_t max_bt_size) JL_NOTSAFEPOINT { jl_task_t *ct = jl_current_task; jl_ptls_t ptls = ct->ptls; - ptls->bt_size = 0; if (t == ct) { - ptls->bt_size = rec_backtrace(ptls->bt_data, JL_MAX_BT_SIZE, 0); - return; + return rec_backtrace(bt_data, max_bt_size, 0); } bt_context_t *context = NULL; bt_context_t c; @@ -936,9 +934,11 @@ static void jl_rec_backtrace(jl_task_t *t) JL_NOTSAFEPOINT while (!jl_atomic_cmpswap(&t->tid, &old, ptls->tid) && old != ptls->tid) { int lockret = jl_lock_stackwalk(); // if this task is already running somewhere, we need to stop the thread it is running on and query its state - if (!jl_thread_suspend_and_get_state(old, 0, &c)) { + if (!jl_thread_suspend_and_get_state(old, 1, &c)) { jl_unlock_stackwalk(lockret); - return; + if (jl_atomic_load_relaxed(&t->tid) != old) + continue; + return 0; } jl_unlock_stackwalk(lockret); if (jl_atomic_load_relaxed(&t->tid) == old) { @@ -953,11 +953,11 @@ static void jl_rec_backtrace(jl_task_t *t) JL_NOTSAFEPOINT // got the wrong thread stopped, try again jl_thread_resume(old); } - if (context == NULL && (!t->copy_stack && t->started && t->stkbuf != NULL)) { + if (context == NULL && (!t->ctx.copy_stack && t->ctx.started && t->ctx.ctx != NULL)) { // need to read the context from the task stored state #if defined(_OS_WINDOWS_) memset(&c, 0, sizeof(c)); - _JUMP_BUFFER *mctx = (_JUMP_BUFFER*)&t->ctx.ctx.uc_mcontext; + _JUMP_BUFFER *mctx = (_JUMP_BUFFER*)&t->ctx.ctx->uc_mcontext; #if defined(_CPU_X86_64_) c.Rbx = mctx->Rbx; c.Rsp = mctx->Rsp; @@ -979,13 +979,13 @@ static void jl_rec_backtrace(jl_task_t *t) JL_NOTSAFEPOINT #endif context = &c; #elif defined(JL_HAVE_UNW_CONTEXT) - context = &t->ctx.ctx; + context = t->ctx.ctx; #elif defined(JL_HAVE_UCONTEXT) - context = jl_to_bt_context(&t->ctx.ctx); + context = jl_to_bt_context(t->ctx.ctx); #elif defined(JL_HAVE_ASM) memset(&c, 0, sizeof(c)); #if defined(_OS_LINUX_) && defined(__GLIBC__) - __jmp_buf *mctx = &t->ctx.ctx.uc_mcontext->__jmpbuf; + __jmp_buf *mctx = &t->ctx.ctx->uc_mcontext->__jmpbuf; mcontext_t *mc = &c.uc_mcontext; #if defined(_CPU_X86_) // https://github.com/bminor/glibc/blame/master/sysdeps/i386/__longjmp.S @@ -1071,13 +1071,13 @@ static void jl_rec_backtrace(jl_task_t *t) JL_NOTSAFEPOINT mc->pc = mc->regs[30]; context = &c; #else - #pragma message("jl_rec_backtrace not defined for ASM/SETJMP on unknown linux") + #pragma message("jl_record_backtrace not defined for ASM/SETJMP on unknown linux") (void)mc; (void)c; (void)mctx; #endif #elif defined(_OS_DARWIN_) - sigjmp_buf *mctx = &t->ctx.ctx.uc_mcontext; + sigjmp_buf *mctx = &t->ctx.ctx->uc_mcontext; #if defined(_CPU_X86_64_) // from https://github.com/apple/darwin-libplatform/blob/main/src/setjmp/x86_64/_setjmp.s x86_thread_state64_t *mc = (x86_thread_state64_t*)&c; @@ -1133,12 +1133,12 @@ static void jl_rec_backtrace(jl_task_t *t) JL_NOTSAFEPOINT mc->__pad = 0; // aka __ra_sign_state = not signed context = &c; #else - #pragma message("jl_rec_backtrace not defined for ASM/SETJMP on unknown darwin") + #pragma message("jl_record_backtrace not defined for ASM/SETJMP on unknown darwin") (void)mctx; (void)c; #endif #elif defined(_OS_FREEBSD_) - sigjmp_buf *mctx = &t->ctx.ctx.uc_mcontext; + sigjmp_buf *mctx = &t->ctx.ctx->uc_mcontext; mcontext_t *mc = &c.uc_mcontext; #if defined(_CPU_X86_64_) // https://github.com/freebsd/freebsd-src/blob/releng/13.1/lib/libc/amd64/gen/_setjmp.S @@ -1175,24 +1175,26 @@ static void jl_rec_backtrace(jl_task_t *t) JL_NOTSAFEPOINT mc->mc_fpregs.fp_q[14] = ((long*)mctx)[20]; context = &c; #else - #pragma message("jl_rec_backtrace not defined for ASM/SETJMP on unknown freebsd") + #pragma message("jl_record_backtrace not defined for ASM/SETJMP on unknown freebsd") (void)mctx; (void)c; #endif #else - #pragma message("jl_rec_backtrace not defined for ASM/SETJMP on unknown system") + #pragma message("jl_record_backtrace not defined for ASM/SETJMP on unknown system") (void)c; #endif #else - #pragma message("jl_rec_backtrace not defined for unknown task system") + #pragma message("jl_record_backtrace not defined for unknown task system") #endif } + size_t bt_size = 0; if (context) - ptls->bt_size = rec_backtrace_ctx(ptls->bt_data, JL_MAX_BT_SIZE, context, t->gcstack); + bt_size = rec_backtrace_ctx(bt_data, max_bt_size, context, t->gcstack); if (old == -1) jl_atomic_store_relaxed(&t->tid, old); else if (old != ptls->tid) jl_thread_resume(old); + return bt_size; } //-------------------------------------------------- @@ -1224,12 +1226,15 @@ JL_DLLEXPORT void jlbacktracet(jl_task_t *t) JL_NOTSAFEPOINT { jl_task_t *ct = jl_current_task; jl_ptls_t ptls = ct->ptls; - jl_rec_backtrace(t); - size_t i, bt_size = ptls->bt_size; + ptls->bt_size = 0; jl_bt_element_t *bt_data = ptls->bt_data; + size_t bt_size = jl_record_backtrace(t, bt_data, JL_MAX_BT_SIZE); + size_t i; for (i = 0; i < bt_size; i += jl_bt_entry_size(bt_data + i)) { jl_print_bt_entry_codeloc(bt_data + i); } + if (bt_size == 0) + jl_safe_printf(" no backtrace recorded\n"); } JL_DLLEXPORT void jl_print_backtrace(void) JL_NOTSAFEPOINT @@ -1269,14 +1274,9 @@ JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT jl_safe_printf(" ---- Root task (%p)\n", ptls2->root_task); if (t != NULL) { jl_safe_printf(" (sticky: %d, started: %d, state: %d, tid: %d)\n", - t->sticky, t->started, t_state, + t->sticky, t->ctx.started, t_state, jl_atomic_load_relaxed(&t->tid) + 1); - if (t->stkbuf != NULL) { - jlbacktracet(t); - } - else { - jl_safe_printf(" no stack\n"); - } + jlbacktracet(t); } jl_safe_printf(" ---- End root task\n"); } @@ -1291,12 +1291,9 @@ JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT jl_safe_printf(" ---- Task %zu (%p)\n", j + 1, t); // n.b. this information might not be consistent with the stack printing after it, since it could start running or change tid, etc. jl_safe_printf(" (sticky: %d, started: %d, state: %d, tid: %d)\n", - t->sticky, t->started, t_state, + t->sticky, t->ctx.started, t_state, jl_atomic_load_relaxed(&t->tid) + 1); - if (t->stkbuf != NULL) - jlbacktracet(t); - else - jl_safe_printf(" no stack\n"); + jlbacktracet(t); jl_safe_printf(" ---- End task %zu\n", j + 1); } jl_safe_printf("==== End thread %d\n", ptls2->tid + 1); diff --git a/src/task.c b/src/task.c index 1f41ebd8cd2f8..a88f3b55fc419 100644 --- a/src/task.c +++ b/src/task.c @@ -49,27 +49,27 @@ extern "C" { // c.f. interceptor in jl_dlopen as well void (*real_siglongjmp)(jmp_buf _Buf, int _Value) = NULL; #endif -static inline void sanitizer_start_switch_fiber(jl_ptls_t ptls, jl_task_t *from, jl_task_t *to) { +static inline void sanitizer_start_switch_fiber(jl_ptls_t ptls, jl_ucontext_t *from, jl_ucontext_t *to) { if (to->copy_stack) - __sanitizer_start_switch_fiber(&from->ctx.asan_fake_stack, (char*)ptls->stackbase-ptls->stacksize, ptls->stacksize); + __sanitizer_start_switch_fiber(&from->asan_fake_stack, (char*)ptls->stackbase - ptls->stacksize, ptls->stacksize); else - __sanitizer_start_switch_fiber(&from->ctx.asan_fake_stack, to->stkbuf, to->bufsz); + __sanitizer_start_switch_fiber(&from->asan_fake_stack, to->stkbuf, to->bufsz); } -static inline void sanitizer_start_switch_fiber_killed(jl_ptls_t ptls, jl_task_t *to) { +static inline void sanitizer_start_switch_fiber_killed(jl_ptls_t ptls, jl_ucontext_t *to) { if (to->copy_stack) - __sanitizer_start_switch_fiber(NULL, (char*)ptls->stackbase-ptls->stacksize, ptls->stacksize); + __sanitizer_start_switch_fiber(NULL, (char*)ptls->stackbase - ptls->stacksize, ptls->stacksize); else __sanitizer_start_switch_fiber(NULL, to->stkbuf, to->bufsz); } -static inline void sanitizer_finish_switch_fiber(jl_task_t *last, jl_task_t *current) { - __sanitizer_finish_switch_fiber(current->ctx.asan_fake_stack, NULL, NULL); +static inline void sanitizer_finish_switch_fiber(jl_ucontext_t *last, jl_ucontext_t *current) { + __sanitizer_finish_switch_fiber(current->asan_fake_stack, NULL, NULL); //(const void**)&last->stkbuf, //&last->bufsz); } #else -static inline void sanitizer_start_switch_fiber(jl_ptls_t ptls, jl_task_t *from, jl_task_t *to) JL_NOTSAFEPOINT {} -static inline void sanitizer_start_switch_fiber_killed(jl_ptls_t ptls, jl_task_t *to) JL_NOTSAFEPOINT {} -static inline void sanitizer_finish_switch_fiber(jl_task_t *last, jl_task_t *current) JL_NOTSAFEPOINT {} +static inline void sanitizer_start_switch_fiber(jl_ptls_t ptls, jl_ucontext_t *from, jl_ucontext_t *to) JL_NOTSAFEPOINT {} +static inline void sanitizer_start_switch_fiber_killed(jl_ptls_t ptls, jl_ucontext_t *to) JL_NOTSAFEPOINT {} +static inline void sanitizer_finish_switch_fiber(jl_ucontext_t *last, jl_ucontext_t *current) JL_NOTSAFEPOINT {} #endif #if defined(_COMPILER_TSAN_ENABLED_) @@ -85,19 +85,6 @@ static inline void sanitizer_finish_switch_fiber(jl_task_t *last, jl_task_t *cur jl_ucontext_t *_tsan_macro_ctx = (_ctx); \ __tsan_switch_to_fiber(_tsan_macro_ctx->tsan_state, 0); \ } while (0) -#ifdef COPY_STACKS -#define tsan_destroy_copyctx(_ptls, _ctx) do { \ - jl_ucontext_t *_tsan_macro_ctx = (_ctx); \ - if (_tsan_macro_ctx != &(_ptls)->root_task->ctx) { \ - __tsan_destroy_fiber(_tsan_macro_ctx->tsan_state); \ - } \ - _tsan_macro_ctx->tsan_state = NULL; \ - } while (0) -#define tsan_switch_to_copyctx(_ctx) do { \ - struct jl_stack_context_t *_tsan_macro_ctx = (_ctx); \ - __tsan_switch_to_fiber(_tsan_macro_ctx->tsan_state, 0); \ - } while (0) -#endif #else // just do minimal type-checking on the arguments #define tsan_destroy_ctx(_ptls, _ctx) do { \ @@ -108,16 +95,6 @@ static inline void sanitizer_finish_switch_fiber(jl_task_t *last, jl_task_t *cur jl_ucontext_t *_tsan_macro_ctx = (_ctx); \ (void)_tsan_macro_ctx; \ } while (0) -#ifdef COPY_STACKS -#define tsan_destroy_copyctx(_ptls, _ctx) do { \ - jl_ucontext_t *_tsan_macro_ctx = (_ctx); \ - (void)_tsan_macro_ctx; \ - } while (0) -#define tsan_switch_to_copyctx(_ctx) do { \ - jl_ucontext_t *_tsan_macro_ctx = (_ctx); \ - (void)_tsan_macro_ctx; \ - } while (0) -#endif #endif // empirically, jl_finish_task needs about 64k stack space to infer/run @@ -134,7 +111,6 @@ static inline void sanitizer_finish_switch_fiber(jl_task_t *last, jl_task_t *cur #define ROOT_TASK_STACK_ADJUSTMENT 3000000 #endif -static char *jl_alloc_fiber(_jl_ucontext_t *t, size_t *ssize, jl_task_t *owner) JL_NOTSAFEPOINT; static void jl_set_fiber(jl_ucontext_t *t); static void jl_swap_fiber(jl_ucontext_t *lastt, jl_ucontext_t *t); static void jl_start_fiber_swap(jl_ucontext_t *savet, jl_ucontext_t *t); @@ -214,17 +190,17 @@ static void NOINLINE save_stack(jl_ptls_t ptls, jl_task_t *lastt, jl_task_t **pt assert(stackbase > frame_addr); size_t nb = stackbase - frame_addr; void *buf; - if (lastt->bufsz < nb) { - asan_free_copy_stack(lastt->stkbuf, lastt->bufsz); + if (lastt->ctx.bufsz < nb) { + asan_free_copy_stack(lastt->ctx.stkbuf, lastt->ctx.bufsz); buf = (void*)jl_gc_alloc_buf(ptls, nb); - lastt->stkbuf = buf; - lastt->bufsz = nb; + lastt->ctx.stkbuf = buf; + lastt->ctx.bufsz = nb; } else { - buf = lastt->stkbuf; + buf = lastt->ctx.stkbuf; } *pt = NULL; // clear the gc-root for the target task before copying the stack for saving - lastt->copy_stack = nb; + lastt->ctx.copy_stack = nb; lastt->sticky = 1; memcpy_stack_a16((uint64_t*)buf, (uint64_t*)frame_addr, nb); // this task's stack could have been modified after @@ -233,58 +209,97 @@ static void NOINLINE save_stack(jl_ptls_t ptls, jl_task_t *lastt, jl_task_t **pt jl_gc_wb_back(lastt); } -JL_NO_ASAN static void NOINLINE JL_NORETURN restore_stack(jl_task_t *t, jl_ptls_t ptls, char *p) +JL_NO_ASAN static void NOINLINE JL_NORETURN restore_stack(jl_ucontext_t *t, jl_ptls_t ptls, char *p) { size_t nb = t->copy_stack; char *_x = (char*)ptls->stackbase - nb; if (!p) { // switch to a stackframe that's beyond the bounds of the last switch - p = _x; - if ((char*)&_x > _x) { - p = (char*)alloca((char*)&_x - _x); + p = _x - 4096; + if ((char*)&_x > p) { + p = (char*)alloca((char*)&_x - p); } restore_stack(t, ptls, p); // pass p to ensure the compiler can't tailcall this or avoid the alloca } void *_y = t->stkbuf; assert(_x != NULL && _y != NULL); +#if defined(_CPU_X86_) || defined(_CPU_X86_64_) + void *volatile *return_address = (void *volatile *)__builtin_frame_address(0) + 1; + assert(*return_address == __builtin_return_address(0)); + *return_address = NULL; +#else +#pragma message("warning: CFI_NORETURN not implemented for this platform, so profiling of copy_stacks may segfault in this build") +#endif +CFI_NORETURN memcpy_stack_a16((uint64_t*)_x, (uint64_t*)_y, nb); // destroys all but the current stackframe #if defined(_OS_WINDOWS_) - jl_setcontext(&t->ctx.copy_ctx); + jl_setcontext(t->copy_ctx); #else - jl_longjmp(t->ctx.copy_ctx.uc_mcontext, 1); + jl_longjmp(t->copy_ctx->uc_mcontext, 1); #endif abort(); // unreachable } -JL_NO_ASAN static void restore_stack2(jl_task_t *t, jl_ptls_t ptls, jl_task_t *lastt) +JL_NO_ASAN static void restore_stack2(jl_ucontext_t *t, jl_ptls_t ptls, jl_ucontext_t *lastt) { assert(t->copy_stack && !lastt->copy_stack); size_t nb = t->copy_stack; - char *_x = (char*)ptls->stackbase - nb; - void *_y = t->stkbuf; - assert(_x != NULL && _y != NULL); - memcpy_stack_a16((uint64_t*)_x, (uint64_t*)_y, nb); // destroys all but the current stackframe + if (nb > 1) { + char *_x = (char*)ptls->stackbase - nb; + void *_y = t->stkbuf; + assert(_x != NULL && _y != NULL); + memcpy_stack_a16((uint64_t*)_x, (uint64_t*)_y, nb); + } +#if defined(_OS_WINDOWS_) + // jl_swapcontext and setjmp are the same on Windows, so we can just use swapcontext directly + tsan_switch_to_ctx(t); + jl_swapcontext(lastt->ctx, t->copy_ctx); +#else #if defined(JL_HAVE_UNW_CONTEXT) volatile int returns = 0; - int r = unw_getcontext(&lastt->ctx.ctx); + int r = unw_getcontext(lastt->ctx); if (++returns == 2) // r is garbage after the first return return; if (r != 0 || returns != 1) abort(); -#elif defined(JL_HAVE_ASM) || defined(_OS_WINDOWS_) - if (jl_setjmp(lastt->ctx.copy_ctx.uc_mcontext, 0)) +#elif defined(JL_HAVE_ASM) + if (jl_setjmp(lastt->ctx->uc_mcontext, 0)) return; #else #error COPY_STACKS is incompatible with this platform #endif - tsan_switch_to_copyctx(&t->ctx); -#if defined(_OS_WINDOWS_) - jl_setcontext(&t->ctx.copy_ctx); + tsan_switch_to_ctx(t); + jl_longjmp(t->copy_ctx->uc_mcontext, 1); +#endif +} + +JL_NO_ASAN static void NOINLINE restore_stack3(jl_ucontext_t *t, jl_ptls_t ptls, char *p) +{ +#if !defined(JL_HAVE_ASM) + char *_x = (char*)ptls->stackbase; + if (!p) { + // switch to a stackframe that's well beyond the bounds of the next switch + p = _x - 4096; + if ((char*)&_x > p) { + p = (char*)alloca((char*)&_x - p); + } + restore_stack3(t, ptls, p); // pass p to ensure the compiler can't tailcall this or avoid the alloca + } +#endif +#if defined(_CPU_X86_) || defined(_CPU_X86_64_) + void *volatile *return_address = (void *volatile *)__builtin_frame_address(0) + 1; + assert(*return_address == __builtin_return_address(0)); + *return_address = NULL; #else - jl_longjmp(t->ctx.copy_ctx.uc_mcontext, 1); +#pragma message("warning: CFI_NORETURN not implemented for this platform, so profiling of copy_stacks may segfault in this build") #endif +CFI_NORETURN + tsan_switch_to_ctx(t); + jl_start_fiber_set(t); // (doesn't return) + abort(); } + #endif /* Rooted by the base module */ @@ -298,9 +313,9 @@ void JL_NORETURN jl_finish_task(jl_task_t *ct) jl_atomic_store_release(&ct->_state, JL_TASK_STATE_FAILED); else jl_atomic_store_release(&ct->_state, JL_TASK_STATE_DONE); - if (ct->copy_stack) { // early free of stkbuf - asan_free_copy_stack(ct->stkbuf, ct->bufsz); - ct->stkbuf = NULL; + if (ct->ctx.copy_stack) { // early free of stkbuf + asan_free_copy_stack(ct->ctx.stkbuf, ct->ctx.bufsz); + ct->ctx.stkbuf = NULL; } // ensure that state is cleared ct->ptls->in_finalizer = 0; @@ -344,33 +359,33 @@ JL_DLLEXPORT void *jl_task_stack_buffer(jl_task_t *task, size_t *size, int *ptid if (ptls2) { *ptid = jl_atomic_load_relaxed(&task->tid); #ifdef COPY_STACKS - if (task->copy_stack) { + if (task->ctx.copy_stack) { *size = ptls2->stacksize; return (char *)ptls2->stackbase - *size; } #endif } - *size = task->bufsz - off; - return (void *)((char *)task->stkbuf + off); + *size = task->ctx.bufsz - off; + return (void *)((char *)task->ctx.stkbuf + off); } JL_DLLEXPORT void jl_active_task_stack(jl_task_t *task, char **active_start, char **active_end, char **total_start, char **total_end) { - if (!task->started) { + if (!task->ctx.started) { *total_start = *active_start = 0; *total_end = *active_end = 0; return; } jl_ptls_t ptls2 = task->ptls; - if (task->copy_stack && ptls2) { + if (task->ctx.copy_stack && ptls2) { *total_start = *active_start = (char*)ptls2->stackbase - ptls2->stacksize; *total_end = *active_end = (char*)ptls2->stackbase; } - else if (task->stkbuf) { - *total_start = *active_start = (char*)task->stkbuf; + else if (task->ctx.stkbuf) { + *total_start = *active_start = (char*)task->ctx.stkbuf; #ifndef _OS_WINDOWS_ jl_ptls_t ptls0 = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; if (ptls0->root_task == task) { @@ -383,12 +398,12 @@ JL_DLLEXPORT void jl_active_task_stack(jl_task_t *task, } #endif - *total_end = *active_end = (char*)task->stkbuf + task->bufsz; + *total_end = *active_end = (char*)task->ctx.stkbuf + task->ctx.bufsz; #ifdef COPY_STACKS // save_stack stores the stack of an inactive task in stkbuf, and the // actual number of used bytes in copy_stack. - if (task->copy_stack > 1) - *active_end = (char*)task->stkbuf + task->copy_stack; + if (task->ctx.copy_stack > 1) + *active_end = (char*)task->ctx.stkbuf + task->ctx.copy_stack; #endif } else { @@ -449,20 +464,16 @@ JL_NO_ASAN static void ctx_switch(jl_task_t *lastt) #endif int killed = jl_atomic_load_relaxed(&lastt->_state) != JL_TASK_STATE_RUNNABLE; - if (!t->started && !t->copy_stack) { + if (!t->ctx.started && !t->ctx.copy_stack) { // may need to allocate the stack - if (t->stkbuf == NULL) { - t->stkbuf = jl_alloc_fiber(&t->ctx.ctx, &t->bufsz, t); - if (t->stkbuf == NULL) { + if (t->ctx.stkbuf == NULL) { + t->ctx.stkbuf = jl_malloc_stack(&t->ctx.bufsz, t); + if (t->ctx.stkbuf == NULL) { #ifdef COPY_STACKS // fall back to stack copying if mmap fails - t->copy_stack = 1; + t->ctx.copy_stack = 1; + t->ctx.bufsz = 0; t->sticky = 1; - t->bufsz = 0; - if (always_copy_stacks) - memcpy(&t->ctx.copy_ctx, &ptls->copy_stack_ctx, sizeof(t->ctx.copy_ctx)); - else - memcpy(&t->ctx.ctx, &ptls->base_ctx, sizeof(t->ctx.ctx)); #else jl_throw(jl_memory_exception); #endif @@ -470,28 +481,45 @@ JL_NO_ASAN static void ctx_switch(jl_task_t *lastt) } } + union { + _jl_ucontext_t ctx; + jl_stack_context_t copy_ctx; + } lasttstate; + if (killed) { *pt = NULL; // can't fail after here: clear the gc-root for the target task now lastt->gcstack = NULL; lastt->eh = NULL; - if (!lastt->copy_stack && lastt->stkbuf) { + if (!lastt->ctx.copy_stack && lastt->ctx.stkbuf) { // early free of stkbuf back to the pool jl_release_task_stack(ptls, lastt); } } else { + if (lastt->ctx.copy_stack) { // save the old copy-stack +#ifdef _OS_WINDOWS_ + lasttstate.copy_ctx.uc_stack.ss_sp = (char*)ptls->stackbase - ptls->stacksize; + lasttstate.copy_ctx.uc_stack.ss_size = ptls->stacksize; +#endif #ifdef COPY_STACKS - if (lastt->copy_stack) { // save the old copy-stack - save_stack(ptls, lastt, pt); // allocates (gc-safepoint, and can also fail) - if (jl_setjmp(lastt->ctx.copy_ctx.uc_mcontext, 0)) { - sanitizer_finish_switch_fiber(ptls->previous_task, jl_atomic_load_relaxed(&ptls->current_task)); - // TODO: mutex unlock the thread we just switched from + if (jl_setjmp(lasttstate.copy_ctx.uc_mcontext, 0)) { +#ifdef MIGRATE_TASKS + ptls = lastt->ptls; +#endif + lastt->ctx.copy_ctx = NULL; + sanitizer_finish_switch_fiber(&ptls->previous_task->ctx, &lastt->ctx); return; } - } - else + save_stack(ptls, lastt, pt); // allocates (gc-safepoint, and can also fail) + lastt->ctx.copy_ctx = &lasttstate.copy_ctx; +#else + abort(); #endif - *pt = NULL; // can't fail after here: clear the gc-root for the target task now + } + else { + *pt = NULL; // can't fail after here: clear the gc-root for the target task now + lastt->ctx.ctx = &lasttstate.ctx; + } } // set up global state for new task and clear global state for old task @@ -506,41 +534,44 @@ JL_NO_ASAN static void ctx_switch(jl_task_t *lastt) ptls->previous_task = lastt; #endif - if (t->started) { + if (t->ctx.started) { + if (t->ctx.copy_stack) { #ifdef COPY_STACKS - if (t->copy_stack) { - if (lastt->copy_stack) { + if (lastt->ctx.copy_stack) { // Switching from copystack to copystack. Clear any shadow stack // memory above the saved shadow stack. - uintptr_t stacktop = (uintptr_t)ptls->stackbase - t->copy_stack; + uintptr_t stacktop = (uintptr_t)ptls->stackbase - t->ctx.copy_stack; uintptr_t stackbottom = ((uintptr_t)jl_get_frame_addr() & ~15); if (stackbottom < stacktop) - asan_unpoison_stack_memory(stackbottom, stacktop-stackbottom); + asan_unpoison_stack_memory(stackbottom, stacktop - stackbottom); } - if (!killed && !lastt->copy_stack) { - sanitizer_start_switch_fiber(ptls, lastt, t); - restore_stack2(t, ptls, lastt); - } else { - tsan_switch_to_copyctx(&t->ctx); + if (!killed && !lastt->ctx.copy_stack) { + sanitizer_start_switch_fiber(ptls, &lastt->ctx, &t->ctx); + restore_stack2(&t->ctx, ptls, &lastt->ctx); // half jl_swap_fiber and half restore_stack + } + else { + tsan_switch_to_ctx(&t->ctx); if (killed) { - sanitizer_start_switch_fiber_killed(ptls, t); - tsan_destroy_copyctx(ptls, &lastt->ctx); - } else { - sanitizer_start_switch_fiber(ptls, lastt, t); + sanitizer_start_switch_fiber_killed(ptls, &t->ctx); + tsan_destroy_ctx(ptls, &lastt->ctx); + } + else { + sanitizer_start_switch_fiber(ptls, &lastt->ctx, &t->ctx); } - if (lastt->copy_stack) { - restore_stack(t, ptls, NULL); // (doesn't return) + if (lastt->ctx.copy_stack) { + restore_stack(&t->ctx, ptls, NULL); // (doesn't return) + abort(); } else { - restore_stack(t, ptls, (char*)1); // (doesn't return) + restore_stack(&t->ctx, ptls, (char*)1); // (doesn't return) + abort(); } } - } - else #endif - { - if (lastt->copy_stack) { + } + else { + if (lastt->ctx.copy_stack) { // Switching away from a copystack to a non-copystack. Clear // the whole shadow stack now, because otherwise we won't know // how much stack memory to clear the next time we switch to @@ -549,22 +580,23 @@ JL_NO_ASAN static void ctx_switch(jl_task_t *lastt) uintptr_t stackbottom = ((uintptr_t)jl_get_frame_addr() & ~15); // We're not restoring the stack, but we still need to unpoison the // stack, so it starts with a pristine stack. - asan_unpoison_stack_memory(stackbottom, stacktop-stackbottom); + asan_unpoison_stack_memory(stackbottom, stacktop - stackbottom); } if (killed) { - sanitizer_start_switch_fiber_killed(ptls, t); + sanitizer_start_switch_fiber_killed(ptls, &t->ctx); tsan_switch_to_ctx(&t->ctx); tsan_destroy_ctx(ptls, &lastt->ctx); jl_set_fiber(&t->ctx); // (doesn't return) abort(); // unreachable } else { - sanitizer_start_switch_fiber(ptls, lastt, t); - if (lastt->copy_stack) { + sanitizer_start_switch_fiber(ptls, &lastt->ctx, &t->ctx); + if (lastt->ctx.copy_stack) { // Resume at the jl_setjmp earlier in this function, // don't do a full task swap tsan_switch_to_ctx(&t->ctx); jl_set_fiber(&t->ctx); // (doesn't return) + abort(); } else { jl_swap_fiber(&lastt->ctx, &t->ctx); @@ -573,41 +605,58 @@ JL_NO_ASAN static void ctx_switch(jl_task_t *lastt) } } else { - if (lastt->copy_stack) { +#ifdef _COMPILER_TSAN_ENABLED_ + t->ctx.tsan_state = __tsan_create_fiber(0); +#endif + if (lastt->ctx.copy_stack) { uintptr_t stacktop = (uintptr_t)ptls->stackbase; uintptr_t stackbottom = ((uintptr_t)jl_get_frame_addr() & ~15); // We're not restoring the stack, but we still need to unpoison the // stack, so it starts with a pristine stack. - asan_unpoison_stack_memory(stackbottom, stacktop-stackbottom); + asan_unpoison_stack_memory(stackbottom, stacktop - stackbottom); } - if (t->copy_stack && always_copy_stacks) { + if (t->ctx.copy_stack) { +#ifdef COPY_STACKS tsan_switch_to_ctx(&t->ctx); + // create a temporary non-copy_stack context for starting this fiber + jl_ucontext_t ctx = t->ctx; + ctx.ctx = NULL; + ctx.stkbuf = (char*)ptls->stackbase - ptls->stacksize; + ctx.bufsz = ptls->stacksize; + ctx.copy_stack = 0; + ctx.started = 0; if (killed) { - sanitizer_start_switch_fiber_killed(ptls, t); + sanitizer_start_switch_fiber_killed(ptls, &t->ctx); tsan_destroy_ctx(ptls, &lastt->ctx); - } else { - sanitizer_start_switch_fiber(ptls, lastt, t); + if (lastt->ctx.copy_stack) + restore_stack3(&ctx, ptls, NULL); // (doesn't return) + else + jl_start_fiber_set(&ctx); + abort(); + } + sanitizer_start_switch_fiber(ptls, &lastt->ctx, &t->ctx); + if (lastt->ctx.copy_stack) { + restore_stack3(&ctx, ptls, NULL); // (doesn't return) + abort(); + } + else { + jl_start_fiber_swap(&lastt->ctx, &ctx); } -#ifdef COPY_STACKS -#if defined(_OS_WINDOWS_) - jl_setcontext(&t->ctx.copy_ctx); #else - jl_longjmp(t->ctx.copy_ctx.uc_mcontext, 1); + abort(); #endif -#endif - abort(); // unreachable } else { if (killed) { - sanitizer_start_switch_fiber_killed(ptls, t); + sanitizer_start_switch_fiber_killed(ptls, &t->ctx); tsan_switch_to_ctx(&t->ctx); tsan_destroy_ctx(ptls, &lastt->ctx); jl_start_fiber_set(&t->ctx); // (doesn't return) abort(); } - sanitizer_start_switch_fiber(ptls, lastt, t); - if (lastt->copy_stack) { - // Resume at the jl_setjmp earlier in this function + sanitizer_start_switch_fiber(ptls, &lastt->ctx, &t->ctx); + if (lastt->ctx.copy_stack) { + // copy_stack resumes at the jl_setjmp earlier in this function, so don't swap here tsan_switch_to_ctx(&t->ctx); jl_start_fiber_set(&t->ctx); // (doesn't return) abort(); @@ -617,7 +666,14 @@ JL_NO_ASAN static void ctx_switch(jl_task_t *lastt) } } } - sanitizer_finish_switch_fiber(ptls->previous_task, jl_atomic_load_relaxed(&ptls->current_task)); + +#ifdef MIGRATE_TASKS + ptls = lastt->ptls; +#endif + assert(ptls); + assert(lastt == jl_atomic_load_relaxed(&ptls->current_task)); + lastt->ctx.ctx = NULL; + sanitizer_finish_switch_fiber(&ptls->previous_task->ctx, &lastt->ctx); } JL_DLLEXPORT void jl_switch(void) JL_NOTSAFEPOINT_LEAVE JL_NOTSAFEPOINT_ENTER @@ -629,7 +685,7 @@ JL_DLLEXPORT void jl_switch(void) JL_NOTSAFEPOINT_LEAVE JL_NOTSAFEPOINT_ENTER return; } int8_t gc_state = jl_gc_unsafe_enter(ptls); - if (t->started && t->stkbuf == NULL) + if (t->ctx.started && t->ctx.stkbuf == NULL) jl_error("attempt to switch to exited task"); if (ptls->in_finalizer) jl_error("task switch not allowed from inside gc finalizer"); @@ -654,7 +710,7 @@ JL_DLLEXPORT void jl_switch(void) JL_NOTSAFEPOINT_LEAVE JL_NOTSAFEPOINT_ENTER ptls->previous_task = NULL; assert(t != ct); assert(jl_atomic_load_relaxed(&t->tid) == ptls->tid); - if (!t->sticky && !t->copy_stack) + if (!t->sticky && !t->ctx.copy_stack) jl_atomic_store_release(&t->tid, -1); #else assert(ptls == ct->ptls); @@ -1071,26 +1127,28 @@ JL_DLLEXPORT jl_task_t *jl_new_task(jl_function_t *start, jl_value_t *completion jl_task_t *t = (jl_task_t*)jl_gc_alloc(ct->ptls, sizeof(jl_task_t), jl_task_type); jl_set_typetagof(t, jl_task_tag, 0); JL_PROBE_RT_NEW_TASK(ct, t); - t->copy_stack = 0; + t->ctx.copy_stack = 0; if (ssize == 0) { // stack size unspecified; use default if (always_copy_stacks) { - t->copy_stack = 1; - t->bufsz = 0; + t->ctx.copy_stack = 1; + t->ctx.bufsz = 0; } else { - t->bufsz = JL_STACK_SIZE; + t->ctx.bufsz = JL_STACK_SIZE; } - t->stkbuf = NULL; + t->ctx.stkbuf = NULL; } else { // user requested dedicated stack of a certain size if (ssize < MINSTKSZ) ssize = MINSTKSZ; - t->bufsz = ssize; - t->stkbuf = jl_alloc_fiber(&t->ctx.ctx, &t->bufsz, t); - if (t->stkbuf == NULL) + t->ctx.bufsz = ssize; + t->ctx.stkbuf = jl_malloc_stack(&t->ctx.bufsz, t); + if (t->ctx.stkbuf == NULL) { + t->ctx.bufsz = 0; jl_throw(jl_memory_exception); + } } t->next = jl_nothing; t->queue = jl_nothing; @@ -1109,30 +1167,21 @@ JL_DLLEXPORT jl_task_t *jl_new_task(jl_function_t *start, jl_value_t *completion t->sticky = 1; t->gcstack = NULL; t->excstack = NULL; - t->started = 0; + t->ctx.started = 0; t->priority = 0; - jl_atomic_store_relaxed(&t->tid, t->copy_stack ? jl_atomic_load_relaxed(&ct->tid) : -1); // copy_stacks are always pinned since they can't be moved + jl_atomic_store_relaxed(&t->tid, -1); t->threadpoolid = ct->threadpoolid; t->ptls = NULL; t->world_age = ct->world_age; t->reentrant_timing = 0; jl_timing_task_init(t); -#ifdef COPY_STACKS - if (!t->copy_stack) { -#if defined(JL_DEBUG_BUILD) - memset(&t->ctx, 0, sizeof(t->ctx)); -#endif - } - else { - if (always_copy_stacks) - memcpy(&t->ctx.copy_ctx, &ct->ptls->copy_stack_ctx, sizeof(t->ctx.copy_ctx)); - else - memcpy(&t->ctx.ctx, &ct->ptls->base_ctx, sizeof(t->ctx.ctx)); - } -#endif + if (t->ctx.copy_stack) + t->ctx.copy_ctx = NULL; + else + t->ctx.ctx = NULL; #ifdef _COMPILER_TSAN_ENABLED_ - t->ctx.tsan_state = __tsan_create_fiber(0); + t->ctx.tsan_state = NULL; #endif #ifdef _COMPILER_ASAN_ENABLED_ t->ctx.asan_fake_stack = NULL; @@ -1196,7 +1245,7 @@ CFI_NORETURN jl_task_t *ct = jl_current_task; #endif jl_ptls_t ptls = ct->ptls; - sanitizer_finish_switch_fiber(ptls->previous_task, ct); + sanitizer_finish_switch_fiber(&ptls->previous_task->ctx, &ct->ctx); _start_task(); } @@ -1210,6 +1259,7 @@ CFI_NORETURN #else jl_task_t *ct = jl_current_task; #endif + ct->ctx.ctx = NULL; jl_ptls_t ptls = ct->ptls; jl_value_t *res; assert(ptls->finalizers_inhibited == 0); @@ -1217,11 +1267,11 @@ CFI_NORETURN #ifdef MIGRATE_TASKS jl_task_t *pt = ptls->previous_task; ptls->previous_task = NULL; - if (!pt->sticky && !pt->copy_stack) + if (!pt->sticky && !pt->ctx.copy_stack) jl_atomic_store_release(&pt->tid, -1); #endif - ct->started = 1; + ct->ctx.started = 1; JL_PROBE_RT_START_TASK(ct); jl_timing_block_task_enter(ct, ptls, NULL); if (jl_atomic_load_relaxed(&ct->_isexception)) { @@ -1258,64 +1308,52 @@ skip_pop_exception:; #ifdef _OS_WINDOWS_ #define setcontext jl_setcontext #define swapcontext jl_swapcontext -#define makecontext jl_makecontext #endif -static char *jl_alloc_fiber(_jl_ucontext_t *t, size_t *ssize, jl_task_t *owner) JL_NOTSAFEPOINT +static int make_fiber(jl_ucontext_t *t, _jl_ucontext_t *ctx) { #ifndef _OS_WINDOWS_ - int r = getcontext(t); - if (r != 0) - jl_error("getcontext failed"); + int r = getcontext(ctx); + if (r != 0) abort(); #endif - void *stk = jl_malloc_stack(ssize, owner); - if (stk == NULL) - return NULL; - t->uc_stack.ss_sp = stk; - t->uc_stack.ss_size = *ssize; + ctx->uc_stack.ss_sp = (char*)t->stkbuf; + ctx->uc_stack.ss_size = t->bufsz; #ifdef _OS_WINDOWS_ - makecontext(t, &start_task); + jl_makecontext(ctx, &start_task); #else - t->uc_link = NULL; - makecontext(t, &start_task, 0); + ctx->uc_link = NULL; + makecontext(ctx, &start_task, 0); #endif - return (char*)stk; + return 1; } static void jl_start_fiber_set(jl_ucontext_t *t) { - setcontext(&t->ctx); + _jl_ucontext_t ctx; + make_fiber(t, &ctx); + setcontext(&ctx); } static void jl_start_fiber_swap(jl_ucontext_t *lastt, jl_ucontext_t *t) { + _jl_ucontext_t ctx; + make_fiber(t, &ctx); assert(lastt); tsan_switch_to_ctx(t); - swapcontext(&lastt->ctx, &t->ctx); + swapcontext(lastt->ctx, &ctx); } static void jl_swap_fiber(jl_ucontext_t *lastt, jl_ucontext_t *t) { tsan_switch_to_ctx(t); - swapcontext(&lastt->ctx, &t->ctx); + swapcontext(lastt->ctx, t->ctx); } static void jl_set_fiber(jl_ucontext_t *t) { - setcontext(&t->ctx); -} -#endif - -#if defined(JL_HAVE_UNW_CONTEXT) || defined(JL_HAVE_ASM) -static char *jl_alloc_fiber(_jl_ucontext_t *t, size_t *ssize, jl_task_t *owner) -{ - char *stkbuf = (char*)jl_malloc_stack(ssize, owner); - if (stkbuf == NULL) - return NULL; -#ifndef __clang_gcanalyzer__ - ((char**)t)[0] = stkbuf; // stash the stack pointer somewhere for start_fiber - ((size_t*)t)[1] = *ssize; // stash the stack size somewhere for start_fiber -#endif - return stkbuf; + setcontext(t->ctx); } #endif #if defined(JL_HAVE_UNW_CONTEXT) +#ifdef _OS_WINDOWS_ +#error unw_context_t not defined in Windows +#endif static inline void jl_unw_swapcontext(unw_context_t *old, unw_cursor_t *c) { volatile int returns = 0; @@ -1329,15 +1367,15 @@ static inline void jl_unw_swapcontext(unw_context_t *old, unw_cursor_t *c) static void jl_swap_fiber(jl_ucontext_t *lastt, jl_ucontext_t *t) { unw_cursor_t c; - int r = unw_init_local(&c, &t->ctx); + int r = unw_init_local(&c, t->ctx); if (r < 0) abort(); - jl_unw_swapcontext(&lastt->ctx, &c); + jl_unw_swapcontext(lastt->ctx, &c); } static void jl_set_fiber(jl_ucontext_t *t) { unw_cursor_t c; - int r = unw_init_local(&c, &t->ctx); + int r = unw_init_local(&c, t->ctx); if (r < 0) abort(); unw_resume(&c); @@ -1345,14 +1383,14 @@ static void jl_set_fiber(jl_ucontext_t *t) #elif defined(JL_HAVE_ASM) static void jl_swap_fiber(jl_ucontext_t *lastt, jl_ucontext_t *t) { - if (jl_setjmp(lastt->ctx.uc_mcontext, 0)) + if (jl_setjmp(lastt->ctx->uc_mcontext, 0)) return; tsan_switch_to_ctx(t); jl_set_fiber(t); // doesn't return } static void jl_set_fiber(jl_ucontext_t *t) { - jl_longjmp(t->ctx.uc_mcontext, 1); + jl_longjmp(t->ctx->uc_mcontext, 1); } #endif @@ -1373,14 +1411,14 @@ static void jl_set_fiber(jl_ucontext_t *t) static void jl_start_fiber_set(jl_ucontext_t *t) { unw_cursor_t c; - char *stk = ((char**)&t->ctx)[0]; - size_t ssize = ((size_t*)&t->ctx)[1]; + char *stk = (char*)t->stkbuf; + size_t ssize = t->bufsz; uintptr_t fn = (uintptr_t)&start_task; stk += ssize; - int r = unw_getcontext(&t->ctx); + int r = unw_getcontext(t->ctx); if (r) abort(); - if (unw_init_local(&c, &t->ctx)) + if (unw_init_local(&c, t->ctx)) abort(); PUSH_RET(&c, stk); #if defined __linux__ @@ -1396,43 +1434,46 @@ static void jl_start_fiber_swap(jl_ucontext_t *lastt, jl_ucontext_t *t) { assert(lastt); unw_cursor_t c; - char *stk = ((char**)&t->ctx)[0]; - size_t ssize = ((size_t*)&t->ctx)[1]; + char *stk = (char*)t->stkbuf; + size_t ssize = t->bufsz; uintptr_t fn = (uintptr_t)&start_task; stk += ssize; volatile int returns = 0; - int r = unw_getcontext(&lastt->ctx); + int r = unw_getcontext(lastt->ctx); if (++returns == 2) // r is garbage after the first return return; if (r != 0 || returns != 1) abort(); - r = unw_getcontext(&t->ctx); + r = unw_getcontext(t->ctx); if (r != 0) abort(); - if (unw_init_local(&c, &t->ctx)) + if (unw_init_local(&c, t->ctx)) abort(); PUSH_RET(&c, stk); if (unw_set_reg(&c, UNW_REG_SP, (uintptr_t)stk)) abort(); if (unw_set_reg(&c, UNW_REG_IP, fn)) abort(); - jl_unw_swapcontext(&lastt->ctx, &c); + jl_unw_swapcontext(lastt->ctx, &c); } #endif #if defined(JL_HAVE_ASM) +#ifdef _OS_WINDOWS_ +#error JL_HAVE_ASM not defined in Windows +#endif JL_NO_ASAN static void jl_start_fiber_swap(jl_ucontext_t *lastt, jl_ucontext_t *t) { assert(lastt); #ifdef JL_HAVE_UNW_CONTEXT volatile int returns = 0; - int r = unw_getcontext(&lastt->ctx); + int r = unw_getcontext(lastt->ctx); if (++returns == 2) // r is garbage after the first return return; if (r != 0 || returns != 1) abort(); #else - if (jl_setjmp(lastt->ctx.uc_mcontext, 0)) + if (jl_setjmp(lastt->ctx->uc_mcontext, 0)) return; #endif tsan_switch_to_ctx(t); @@ -1440,8 +1481,9 @@ JL_NO_ASAN static void jl_start_fiber_swap(jl_ucontext_t *lastt, jl_ucontext_t * } JL_NO_ASAN static void jl_start_fiber_set(jl_ucontext_t *t) { - char *stk = ((char**)&t->ctx)[0]; - size_t ssize = ((size_t*)&t->ctx)[1]; +CFI_NORETURN + char *stk = (char*)t->stkbuf; + size_t ssize = t->bufsz; uintptr_t fn = (uintptr_t)&start_task; stk += ssize; #ifdef _CPU_X86_64_ @@ -1539,14 +1581,14 @@ jl_task_t *jl_init_root_task(jl_ptls_t ptls, void *stack_lo, void *stack_hi) } #endif if (always_copy_stacks) { - ct->copy_stack = 1; - ct->stkbuf = NULL; - ct->bufsz = 0; + ct->ctx.copy_stack = 1; + ct->ctx.stkbuf = NULL; + ct->ctx.bufsz = 0; } else { - ct->copy_stack = 0; - ct->stkbuf = stack; - ct->bufsz = ssize; + ct->ctx.copy_stack = 0; + ct->ctx.stkbuf = stack; + ct->ctx.bufsz = ssize; } #ifdef USE_TRACY @@ -1554,7 +1596,7 @@ jl_task_t *jl_init_root_task(jl_ptls_t ptls, void *stack_lo, void *stack_hi) strcpy(unique_string, "Root"); ct->name = unique_string; #endif - ct->started = 1; + ct->ctx.started = 1; ct->next = jl_nothing; ct->queue = jl_nothing; ct->tls = jl_nothing; @@ -1594,21 +1636,18 @@ jl_task_t *jl_init_root_task(jl_ptls_t ptls, void *stack_lo, void *stack_hi) if (always_copy_stacks) { // when this is set, we will attempt to corrupt the process stack to switch tasks, // although this is unreliable, and thus not recommended - ptls->stackbase = stack_hi; - ptls->stacksize = ssize; -#ifdef _OS_WINDOWS_ - ptls->copy_stack_ctx.uc_stack.ss_sp = stack_hi; - ptls->copy_stack_ctx.uc_stack.ss_size = ssize; -#endif - if (jl_setjmp(ptls->copy_stack_ctx.uc_mcontext, 0)) - start_task(); // sanitizer_finish_switch_fiber is part of start_task + ptls->stackbase = jl_get_frame_addr(); + ptls->stacksize = (char*)ptls->stackbase - (char*)stack_lo; } else { - ssize = JL_STACK_SIZE; - char *stkbuf = jl_alloc_fiber(&ptls->base_ctx, &ssize, NULL); + size_t bufsz = JL_STACK_SIZE; + void *stkbuf = jl_malloc_stack(&bufsz, NULL); if (stkbuf != NULL) { - ptls->stackbase = stkbuf + ssize; - ptls->stacksize = ssize; + ptls->stackbase = (char*)stkbuf + bufsz; + ptls->stacksize = bufsz; + } + else { + ptls->stacksize = 0; } } #endif @@ -1621,7 +1660,7 @@ jl_task_t *jl_init_root_task(jl_ptls_t ptls, void *stack_lo, void *stack_hi) JL_DLLEXPORT int jl_is_task_started(jl_task_t *t) JL_NOTSAFEPOINT { - return t->started; + return t->ctx.started; } JL_DLLEXPORT int16_t jl_get_task_tid(jl_task_t *t) JL_NOTSAFEPOINT