Skip to content

Commit

Permalink
Reduce size of Task object (#55515)
Browse files Browse the repository at this point in the history
Move the registers onto the stack, so that they are only present when
the Task is actually switched out, saving memory when the Task is not
running yet or has already finished. This makes the change mostly just a
huge renaming job.

On Linux x86_64 this reduces it from 376 bytes to 184 bytes.

This has some additional advantages too: for example, copy_stack tasks
(e.g. with always_copy_stacks) can migrate to other threads before
starting if they are not sticky.

Also fixes a variable that got mixed up by #54639 and caused
always_copy_stacks to abort, since the stack limits were wrong.

This also now fixes #43124, though I am not quite confident enough in
the fix to re-enable that test right now.
  • Loading branch information
vtjnash authored Aug 20, 2024
1 parent 7b8dd90 commit a2b1b4e
Show file tree
Hide file tree
Showing 10 changed files with 342 additions and 316 deletions.
8 changes: 4 additions & 4 deletions src/gc-debug.c
Original file line number Diff line number Diff line change
Expand Up @@ -537,13 +537,13 @@ static void gc_scrub_task(jl_task_t *ta)

char *low;
char *high;
if (ta->copy_stack && ptls2 && ta == jl_atomic_load_relaxed(&ptls2->current_task)) {
if (ta->ctx.copy_stack && ptls2 && ta == jl_atomic_load_relaxed(&ptls2->current_task)) {
low = (char*)ptls2->stackbase - ptls2->stacksize;
high = (char*)ptls2->stackbase;
}
else if (ta->stkbuf) {
low = (char*)ta->stkbuf;
high = (char*)ta->stkbuf + ta->bufsz;
else if (ta->ctx.stkbuf) {
low = (char*)ta->ctx.stkbuf;
high = (char*)ta->ctx.stkbuf + ta->ctx.bufsz;
}
else
return;
Expand Down
22 changes: 11 additions & 11 deletions src/gc-stacks.c
Original file line number Diff line number Diff line change
Expand Up @@ -186,14 +186,14 @@ JL_DLLEXPORT void jl_free_stack(void *stkbuf, size_t bufsz)
void jl_release_task_stack(jl_ptls_t ptls, jl_task_t *task)
{
// avoid adding an original thread stack to the free list
if (task == ptls->root_task && !task->copy_stack)
if (task == ptls->root_task && !task->ctx.copy_stack)
return;
void *stkbuf = task->stkbuf;
size_t bufsz = task->bufsz;
void *stkbuf = task->ctx.stkbuf;
size_t bufsz = task->ctx.bufsz;
if (bufsz <= pool_sizes[JL_N_STACK_POOLS - 1]) {
unsigned pool_id = select_pool(bufsz);
if (pool_sizes[pool_id] == bufsz) {
task->stkbuf = NULL;
task->ctx.stkbuf = NULL;
#ifdef _COMPILER_ASAN_ENABLED_
__asan_unpoison_stack_memory((uintptr_t)stkbuf, bufsz);
#endif
Expand Down Expand Up @@ -296,17 +296,17 @@ void sweep_stack_pools(void) JL_NOTSAFEPOINT
jl_task_t *t = (jl_task_t*)lst[n];
assert(jl_is_task(t));
if (gc_marked(jl_astaggedvalue(t)->bits.gc)) {
if (t->stkbuf == NULL)
if (t->ctx.stkbuf == NULL)
ndel++; // jl_release_task_stack called
else
n++;
}
else {
ndel++;
void *stkbuf = t->stkbuf;
size_t bufsz = t->bufsz;
void *stkbuf = t->ctx.stkbuf;
size_t bufsz = t->ctx.bufsz;
if (stkbuf) {
t->stkbuf = NULL;
t->ctx.stkbuf = NULL;
_jl_free_stack(ptls2, stkbuf, bufsz);
}
#ifdef _COMPILER_TSAN_ENABLED_
Expand Down Expand Up @@ -338,7 +338,7 @@ JL_DLLEXPORT jl_array_t *jl_live_tasks(void)
continue;
small_arraylist_t *live_tasks = &ptls2->gc_tls.heap.live_tasks;
size_t n = mtarraylist_length(live_tasks);
l += n + (ptls2->root_task->stkbuf != NULL);
l += n + (ptls2->root_task->ctx.stkbuf != NULL);
}
l += l / 20; // add 5% for margin of estimation error
jl_array_t *a = jl_alloc_vec_any(l); // may gc, changing the number of tasks and forcing us to reload everything
Expand All @@ -350,7 +350,7 @@ JL_DLLEXPORT jl_array_t *jl_live_tasks(void)
if (ptls2 == NULL)
continue;
jl_task_t *t = ptls2->root_task;
if (t->stkbuf != NULL) {
if (t->ctx.stkbuf != NULL) {
if (j == l)
goto restart;
jl_array_data(a,void*)[j++] = t;
Expand All @@ -359,7 +359,7 @@ JL_DLLEXPORT jl_array_t *jl_live_tasks(void)
size_t n = mtarraylist_length(live_tasks);
for (size_t i = 0; i < n; i++) {
jl_task_t *t = (jl_task_t*)mtarraylist_get(live_tasks, i);
if (t->stkbuf != NULL) {
if (t->ctx.stkbuf != NULL) {
if (j == l)
goto restart;
jl_array_data(a,void*)[j++] = t;
Expand Down
10 changes: 5 additions & 5 deletions src/gc-stock.c
Original file line number Diff line number Diff line change
Expand Up @@ -2144,9 +2144,9 @@ FORCE_INLINE void gc_mark_outrefs(jl_ptls_t ptls, jl_gc_markqueue_t *mq, void *_
(ta, tid != -1 && ta == gc_all_tls_states[tid]->root_task));
}
#ifdef COPY_STACKS
void *stkbuf = ta->stkbuf;
if (stkbuf && ta->copy_stack) {
gc_setmark_buf_(ptls, stkbuf, bits, ta->bufsz);
void *stkbuf = ta->ctx.stkbuf;
if (stkbuf && ta->ctx.copy_stack) {
gc_setmark_buf_(ptls, stkbuf, bits, ta->ctx.bufsz);
// For gc_heap_snapshot_record:
// TODO: attribute size of stack
// TODO: edge to stack data
Expand All @@ -2159,12 +2159,12 @@ FORCE_INLINE void gc_mark_outrefs(jl_ptls_t ptls, jl_gc_markqueue_t *mq, void *_
uintptr_t lb = 0;
uintptr_t ub = (uintptr_t)-1;
#ifdef COPY_STACKS
if (stkbuf && ta->copy_stack && !ta->ptls) {
if (stkbuf && ta->ctx.copy_stack && !ta->ptls) {
int16_t tid = jl_atomic_load_relaxed(&ta->tid);
assert(tid >= 0);
jl_ptls_t ptls2 = gc_all_tls_states[tid];
ub = (uintptr_t)ptls2->stackbase;
lb = ub - ta->copy_stack;
lb = ub - ta->ctx.copy_stack;
offset = (uintptr_t)stkbuf - lb;
}
#endif
Expand Down
21 changes: 7 additions & 14 deletions src/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -64,35 +64,28 @@ void jl_init_stack_limits(int ismaster, void **stack_lo, void **stack_hi)
// threads since it seems to return bogus values for master thread on Linux
// and possibly OSX.
if (!ismaster) {
# if defined(_OS_LINUX_)
# if defined(_OS_LINUX_) || defined(_OS_FREEBSD_)
pthread_attr_t attr;
#if defined(_OS_FREEBSD_)
pthread_attr_get_np(pthread_self(), &attr);
#else
pthread_getattr_np(pthread_self(), &attr);
#endif
void *stackaddr;
size_t stacksize;
pthread_attr_getstack(&attr, &stackaddr, &stacksize);
pthread_attr_destroy(&attr);
*stack_hi = stackaddr;
*stack_lo = (char*)stackaddr - stacksize;
*stack_lo = stackaddr;
*stack_hi = (char*)stackaddr + stacksize;
return;
# elif defined(_OS_DARWIN_)
extern void *pthread_get_stackaddr_np(pthread_t thread);
extern size_t pthread_get_stacksize_np(pthread_t thread);
pthread_t thread = pthread_self();
void *stackaddr = pthread_get_stackaddr_np(thread);
size_t stacksize = pthread_get_stacksize_np(thread);
*stack_hi = stackaddr;
*stack_lo = (char*)stackaddr - stacksize;
return;
# elif defined(_OS_FREEBSD_)
pthread_attr_t attr;
pthread_attr_init(&attr);
pthread_attr_get_np(pthread_self(), &attr);
void *stackaddr;
size_t stacksize;
pthread_attr_getstack(&attr, &stackaddr, &stacksize);
pthread_attr_destroy(&attr);
*stack_hi = stackaddr;
*stack_lo = (char*)stackaddr - stacksize;
return;
# else
# warning "Getting precise stack size for thread is not supported."
Expand Down
6 changes: 1 addition & 5 deletions src/julia.h
Original file line number Diff line number Diff line change
Expand Up @@ -2231,11 +2231,7 @@ typedef struct _jl_task_t {
// current exception handler
jl_handler_t *eh;
// saved thread state
jl_ucontext_t ctx;
void *stkbuf; // malloc'd memory (either copybuf or stack)
size_t bufsz; // actual sizeof stkbuf
unsigned int copy_stack:31; // sizeof stack for copybuf
unsigned int started:1;
jl_ucontext_t ctx; // pointer into stkbuf, if suspended
} jl_task_t;

#define JL_TASK_STATE_RUNNABLE 0
Expand Down
3 changes: 2 additions & 1 deletion src/julia_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,8 @@ static inline void asan_unpoison_task_stack(jl_task_t *ct, jl_jmp_buf *buf)
that we're resetting to. The idea is to remove the poison from the frames
that we're skipping over, since they won't be unwound. */
uintptr_t top = jmpbuf_sp(buf);
uintptr_t bottom = (uintptr_t)ct->stkbuf;
uintptr_t bottom = (uintptr_t)(ct->ctx.copy_stack ? (char*)ct->ptls->stackbase - ct->ptls->stacksize : (char*)ct->ctx.stkbuf);
//uintptr_t bottom = (uintptr_t)&top;
__asan_unpoison_stack_memory(bottom, top - bottom);
}
static inline void asan_unpoison_stack_memory(uintptr_t addr, size_t size) {
Expand Down
14 changes: 7 additions & 7 deletions src/julia_threads.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,9 +86,13 @@ typedef ucontext_t _jl_ucontext_t;

typedef struct {
union {
_jl_ucontext_t ctx;
jl_stack_context_t copy_ctx;
_jl_ucontext_t *ctx;
jl_stack_context_t *copy_ctx;
};
void *stkbuf; // malloc'd memory (either copybuf or stack)
size_t bufsz; // actual sizeof stkbuf
unsigned int copy_stack:31; // sizeof stack for copybuf
unsigned int started:1;
#if defined(_COMPILER_TSAN_ENABLED_)
void *tsan_state;
#endif
Expand Down Expand Up @@ -155,13 +159,9 @@ typedef struct _jl_tls_states_t {
struct _jl_task_t *previous_task;
struct _jl_task_t *root_task;
struct _jl_timing_block_t *timing_stack;
// This is the location of our copy_stack
void *stackbase;
size_t stacksize;
union {
_jl_ucontext_t base_ctx; // base context of stack
// This hack is needed to support always_copy_stacks:
jl_stack_context_t copy_stack_ctx;
};
// Temp storage for exception thrown in signal handler. Not rooted.
struct _jl_value_t *sig_exception;
// Temporary backtrace buffer. Scanned for gc roots when bt_size > 0.
Expand Down
6 changes: 3 additions & 3 deletions src/signals-unix.c
Original file line number Diff line number Diff line change
Expand Up @@ -230,13 +230,13 @@ static pthread_t signals_thread;

static int is_addr_on_stack(jl_task_t *ct, void *addr)
{
if (ct->copy_stack) {
if (ct->ctx.copy_stack) {
jl_ptls_t ptls = ct->ptls;
return ((char*)addr > (char*)ptls->stackbase - ptls->stacksize &&
(char*)addr < (char*)ptls->stackbase);
}
return ((char*)addr > (char*)ct->stkbuf &&
(char*)addr < (char*)ct->stkbuf + ct->bufsz);
return ((char*)addr > (char*)ct->ctx.stkbuf &&
(char*)addr < (char*)ct->ctx.stkbuf + ct->ctx.bufsz);
}

static void sigdie_handler(int sig, siginfo_t *info, void *context)
Expand Down
Loading

0 comments on commit a2b1b4e

Please sign in to comment.