From f290b154143125dfe14b56de7140775cf24c4905 Mon Sep 17 00:00:00 2001
From: d-netto
Date: Wed, 17 Jan 2024 12:28:51 -0300
Subject: [PATCH] reduce contention on page metadata lists during the sweeping
 phase

---
 src/gc.c            | 134 ++++++++++++++++++++++++++++++++++++++++----
 src/gc.h            |  37 +++++++++++-
 src/julia_threads.h |   6 ++
 src/scheduler.c     |   2 +-
 4 files changed, 165 insertions(+), 14 deletions(-)

diff --git a/src/gc.c b/src/gc.c
index ca53132fcbc160..972e5b76812f91 100644
--- a/src/gc.c
+++ b/src/gc.c
@@ -1541,7 +1541,8 @@ static void gc_sweep_page(gc_page_profiler_serializer_t *s, jl_gc_pool_t *p, jl_
 done:
     if (re_use_page) {
-        push_lf_back(allocd, pg);
+        // we're pushing into a local page stack to reduce contention
+        push_lf_back_nosync(allocd, pg);
     }
     else {
         gc_alloc_map_set(pg->data, GC_PAGE_LAZILY_FREED);
@@ -1596,8 +1597,71 @@ static void gc_pool_sync_nfree(jl_gc_pagemeta_t *pg, jl_taggedvalue_t *last) JL_
     pg->nfree = nfree;
 }
 
-void gc_sweep_wake_all(void)
+int gc_sweep_prescan(jl_ptls_t ptls)
 {
+    // parallel sweeping becomes worthwhile once we have at least 4MB worth of pages
+    const int n_pages_worth_parallel_sweep = (int)(4 * (1 << 20) / GC_PAGE_SZ);
+    int n_pages_to_scan = 0;
+    gc_page_profiler_serializer_t serializer = gc_page_serializer_create();
+    // push into a local page stack; we'll merge the stacks later
+    jl_gc_page_stack_t *allocd_scratch = jl_atomic_load(&gc_allocd_scratch);
+    assert(allocd_scratch != NULL);
+    jl_gc_page_stack_t *dest = &allocd_scratch[ptls->tid];
+    for (int t_i = 0; t_i < gc_n_threads; t_i++) {
+        jl_ptls_t ptls2 = gc_all_tls_states[t_i];
+        if (ptls2 == NULL) {
+            continue;
+        }
+        jl_gc_page_stack_t tmp;
+        jl_gc_pagemeta_t *tail = NULL;
+        memset(&tmp, 0, sizeof(tmp));
+        while (1) {
+            jl_gc_pagemeta_t *pg = pop_lf_back_nosync(&ptls2->page_metadata_allocd);
+            if (pg == NULL) {
+                break;
+            }
+            int should_scan = 1;
+            if (!pg->has_marked) {
+                should_scan = 0;
+            }
+            if (!current_sweep_full && !pg->has_young) {
+                assert(!prev_sweep_full || pg->prev_nold >= pg->nold);
+                if (!prev_sweep_full || pg->prev_nold == pg->nold) {
+                    should_scan = 0;
+                }
+            }
+            if (should_scan) {
+                if (tail == NULL) {
+                    tail = pg;
+                }
+                n_pages_to_scan++;
+                push_lf_back_nosync(&tmp, pg);
+            }
+            else {
+                gc_sweep_pool_page(&serializer, dest, &ptls2->page_metadata_buffered, pg);
+            }
+            if (n_pages_to_scan >= n_pages_worth_parallel_sweep) {
+                break;
+            }
+        }
+        if (tail != NULL) {
+            tail->next = ptls2->page_metadata_allocd.bottom;
+        }
+        ptls2->page_metadata_allocd = tmp;
+        if (n_pages_to_scan >= n_pages_worth_parallel_sweep) {
+            break;
+        }
+    }
+    gc_page_serializer_destroy(&serializer);
+    return n_pages_to_scan >= n_pages_worth_parallel_sweep;
+}
+
+void gc_sweep_wake_all(jl_ptls_t ptls)
+{
+    int parallel_sweep_worthwhile = gc_sweep_prescan(ptls);
+    if (!parallel_sweep_worthwhile) {
+        return;
+    }
     uv_mutex_lock(&gc_threads_lock);
     for (int i = gc_first_tid; i < gc_first_tid + jl_n_markthreads; i++) {
         jl_ptls_t ptls2 = gc_all_tls_states[i];
@@ -1615,30 +1679,52 @@ void gc_sweep_wait_for_all(void)
     }
 }
 
-void gc_sweep_pool_parallel(void)
+void gc_sweep_pool_parallel(jl_ptls_t ptls)
 {
     jl_atomic_fetch_add(&gc_n_threads_sweeping, 1);
     jl_gc_page_stack_t *allocd_scratch = jl_atomic_load(&gc_allocd_scratch);
     if (allocd_scratch != NULL) {
+        // push into a local page stack to reduce contention;
+        // we'll merge the stacks later
+        jl_gc_page_stack_t *dest = &allocd_scratch[ptls->tid];
         gc_page_profiler_serializer_t serializer = gc_page_serializer_create();
         while (1) {
             int found_pg = 0;
+            // sequentially walk the threads and sweep the pages
             for (int t_i = 0; t_i < gc_n_threads; t_i++) {
                 jl_ptls_t ptls2 = gc_all_tls_states[t_i];
+                // skip foreign threads that already exited
                 if (ptls2 == NULL) {
                     continue;
                 }
-                jl_gc_page_stack_t *allocd = &allocd_scratch[t_i];
-                jl_gc_pagemeta_t *pg = pop_lf_back(&ptls2->page_metadata_allocd);
+                jl_gc_pagemeta_t *pg = try_pop_lf_back(&ptls2->page_metadata_allocd);
+                // failed steal attempt
                 if (pg == NULL) {
                     continue;
                 }
-                gc_sweep_pool_page(&serializer, allocd, &ptls2->page_metadata_buffered, pg);
+                gc_sweep_pool_page(&serializer, dest, &ptls2->page_metadata_buffered, pg);
                 found_pg = 1;
             }
             if (!found_pg) {
-                break;
+                // check for termination
+                int no_more_work = 1;
+                for (int t_i = 0; t_i < gc_n_threads; t_i++) {
+                    jl_ptls_t ptls2 = gc_all_tls_states[t_i];
+                    // skip foreign threads that already exited
+                    if (ptls2 == NULL) {
+                        continue;
+                    }
+                    jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->page_metadata_allocd.bottom);
+                    if (pg != NULL) {
+                        no_more_work = 0;
+                        break;
+                    }
+                }
+                if (no_more_work) {
+                    break;
+                }
             }
+            jl_cpu_pause();
         }
         gc_page_serializer_destroy(&serializer);
     }
@@ -1669,7 +1755,7 @@ static void gc_sweep_pool(void)
 
     // allocate enough space to hold the end of the free list chain
     // for every thread and pool size
-    jl_taggedvalue_t ***pfl = (jl_taggedvalue_t ***) alloca(n_threads * JL_GC_N_POOLS * sizeof(jl_taggedvalue_t**));
+    jl_taggedvalue_t ***pfl = (jl_taggedvalue_t ***) malloc_s(n_threads * JL_GC_N_POOLS * sizeof(jl_taggedvalue_t**));
 
     // update metadata of pages that were pointed to by freelist or newpages from a pool
     // i.e. pages being the current allocation target
@@ -1711,17 +1797,37 @@ static void gc_sweep_pool(void)
     }
 
     // the actual sweeping
-    jl_gc_page_stack_t *tmp = (jl_gc_page_stack_t *)alloca(n_threads * sizeof(jl_gc_page_stack_t));
+    jl_gc_page_stack_t *tmp = (jl_gc_page_stack_t *)jl_malloc_aligned(n_threads * sizeof(jl_gc_page_stack_t), 128);
+    if (tmp == NULL) {
+        abort();
+    }
     memset(tmp, 0, n_threads * sizeof(jl_gc_page_stack_t));
     jl_atomic_store(&gc_allocd_scratch, tmp);
-    gc_sweep_wake_all();
-    gc_sweep_pool_parallel();
+    jl_ptls_t ptls = jl_current_task->ptls;
+    gc_sweep_wake_all(ptls);
+    gc_sweep_pool_parallel(ptls);
     gc_sweep_wait_for_all();
 
+    // merge the page metadata lists
+    for (int t_i = 0; t_i < n_threads; t_i++) {
+        jl_ptls_t ptls2 = gc_all_tls_states[t_i];
+        if (ptls2 == NULL) {
+            continue;
+        }
+        while (1) {
+            jl_gc_pagemeta_t *pg = pop_lf_back_nosync(&tmp[t_i]);
+            if (pg == NULL) {
+                break;
+            }
+            jl_ptls_t ptls3 = gc_all_tls_states[pg->thread_n];
+            push_lf_back_nosync(&ptls3->page_metadata_allocd, pg);
+        }
+    }
+
+    // reset half-pages pointers
     for (int t_i = 0; t_i < n_threads; t_i++) {
         jl_ptls_t ptls2 = gc_all_tls_states[t_i];
         if (ptls2 != NULL) {
-            ptls2->page_metadata_allocd = tmp[t_i];
             for (int i = 0; i < JL_GC_N_POOLS; i++) {
                 jl_gc_pool_t *p = &ptls2->heap.norm_pools[i];
                 p->newpages = NULL;
@@ -1759,6 +1865,10 @@ static void gc_sweep_pool(void)
         }
     }
 
+    // cleanup
+    free(pfl);
+    free(tmp);
+
 #ifdef _P64 // only enable concurrent sweeping on 64bit
     // wake thread up to sweep concurrently
     if (jl_n_sweepthreads > 0) {
diff --git a/src/gc.h b/src/gc.h
index 9de67f6e7c679f..235c90c0c6dc87 100644
--- a/src/gc.h
+++ b/src/gc.h
@@ -199,6 +199,24 @@ extern jl_gc_page_stack_t global_page_pool_freed;
 // in the sweeping phase, which also doesn't push a node into the
 // same stack after it's popped
 
+STATIC_INLINE void push_lf_back_nosync(jl_gc_page_stack_t *pool, jl_gc_pagemeta_t *elt) JL_NOTSAFEPOINT
+{
+    jl_gc_pagemeta_t *old_back = jl_atomic_load_relaxed(&pool->bottom);
+    elt->next = old_back;
+    jl_atomic_store_relaxed(&pool->bottom, elt);
+}
+
+STATIC_INLINE jl_gc_pagemeta_t *pop_lf_back_nosync(jl_gc_page_stack_t *pool) JL_NOTSAFEPOINT
+{
+    jl_gc_pagemeta_t *old_back = jl_atomic_load_relaxed(&pool->bottom);
+    if (old_back == NULL) {
+        return NULL;
+    }
+    jl_gc_pagemeta_t *new_back = old_back->next;
+    jl_atomic_store_relaxed(&pool->bottom, new_back);
+    return old_back;
+}
+
 STATIC_INLINE void push_lf_back(jl_gc_page_stack_t *pool, jl_gc_pagemeta_t *elt) JL_NOTSAFEPOINT
 {
     while (1) {
@@ -211,6 +229,23 @@ STATIC_INLINE void push_lf_back(jl_gc_page_stack_t *pool, jl_gc_pagemeta_t *elt)
     }
 }
 
+#define MAX_POP_ATTEMPTS (1 << 10)
+
+STATIC_INLINE jl_gc_pagemeta_t *try_pop_lf_back(jl_gc_page_stack_t *pool) JL_NOTSAFEPOINT
+{
+    for (int i = 0; i < MAX_POP_ATTEMPTS; i++) {
+        jl_gc_pagemeta_t *old_back = jl_atomic_load_relaxed(&pool->bottom);
+        if (old_back == NULL) {
+            return NULL;
+        }
+        if (jl_atomic_cmpswap(&pool->bottom, &old_back, old_back->next)) {
+            return old_back;
+        }
+        jl_cpu_pause();
+    }
+    return NULL;
+}
+
 STATIC_INLINE jl_gc_pagemeta_t *pop_lf_back(jl_gc_page_stack_t *pool) JL_NOTSAFEPOINT
 {
     while (1) {
@@ -473,7 +508,7 @@ void gc_mark_finlist(jl_gc_markqueue_t *mq, arraylist_t *list, size_t start) JL_
 void gc_mark_loop_serial_(jl_ptls_t ptls, jl_gc_markqueue_t *mq);
 void gc_mark_loop_serial(jl_ptls_t ptls);
 void gc_mark_loop_parallel(jl_ptls_t ptls, int master);
-void gc_sweep_pool_parallel(void);
+void gc_sweep_pool_parallel(jl_ptls_t ptls);
 void gc_free_pages(void);
 void sweep_stack_pools(void);
 void jl_gc_debug_init(void);
diff --git a/src/julia_threads.h b/src/julia_threads.h
index 292c11f61d60d0..fcc4457e5f83c4 100644
--- a/src/julia_threads.h
+++ b/src/julia_threads.h
@@ -197,6 +197,12 @@ struct _jl_gc_pagemeta_t;
 
 typedef struct {
     _Atomic(struct _jl_gc_pagemeta_t *) bottom;
+    // pad to 128 bytes to avoid false sharing
+#ifdef _P64
+    void *_pad[15];
+#else
+    void *_pad[31];
+#endif
 } jl_gc_page_stack_t;
 
 // This includes all the thread local states we care about for a thread.
diff --git a/src/scheduler.c b/src/scheduler.c
index 50e15b286a8eba..2af88db89ca14e 100644
--- a/src/scheduler.c
+++ b/src/scheduler.c
@@ -147,7 +147,7 @@ void jl_parallel_gc_threadfun(void *arg)
             gc_mark_loop_parallel(ptls, 0);
         }
         if (may_sweep(ptls)) { // not an else!
-            gc_sweep_pool_parallel();
+            gc_sweep_pool_parallel(ptls);
             jl_atomic_fetch_add(&ptls->gc_sweeps_requested, -1);
         }
     }
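
A standalone sketch of the pattern, not part of the patch: it combines three ingredients — unsynchronized push/pop on per-thread scratch stacks (the *_nosync helpers), a bounded-retry CAS pop for stealing from other threads' stacks (try_pop_lf_back), and padding each stack head to avoid false sharing. The code below illustrates the same ingredients with portable C11 atomics instead of Julia's internal jl_atomic_* wrappers; node_t, padded_stack_t, push_owned, push_shared, and try_pop_shared are illustrative names and assumptions, not Julia APIs.

#include <stdatomic.h>
#include <stddef.h>

typedef struct node {
    struct node *next;
} node_t;

typedef struct {
    _Atomic(node_t *) bottom;
    // pad the head to 128 bytes so an array of per-thread stacks
    // doesn't put two heads on the same cache line
    // (cf. the jl_gc_page_stack_t padding above)
    char _pad[128 - sizeof(_Atomic(node_t *))];
} padded_stack_t;

// single-owner push: plain relaxed load/store, valid only while no
// other thread touches this stack (the role of push_lf_back_nosync)
static void push_owned(padded_stack_t *s, node_t *elt)
{
    elt->next = atomic_load_explicit(&s->bottom, memory_order_relaxed);
    atomic_store_explicit(&s->bottom, elt, memory_order_relaxed);
}

// shared push: CAS loop, safe under concurrent pushers and poppers
static void push_shared(padded_stack_t *s, node_t *elt)
{
    node_t *old = atomic_load_explicit(&s->bottom, memory_order_relaxed);
    do {
        // on CAS failure, old is refreshed and we re-link and retry
        elt->next = old;
    } while (!atomic_compare_exchange_weak(&s->bottom, &old, elt));
}

// bounded steal: give up after max_attempts failed CASes instead of
// spinning forever, mirroring try_pop_lf_back/MAX_POP_ATTEMPTS, so a
// sweeper thread moves on to another thread's stack under contention.
// NB: a general-purpose lock-free stack would need ABA protection here;
// the GC gets away without it because, as the gc.h comment above notes,
// a page popped during sweeping is never pushed back onto the same
// stack within that phase.
static node_t *try_pop_shared(padded_stack_t *s, int max_attempts)
{
    for (int i = 0; i < max_attempts; i++) {
        node_t *old = atomic_load_explicit(&s->bottom, memory_order_relaxed);
        if (old == NULL) {
            return NULL;
        }
        if (atomic_compare_exchange_weak(&s->bottom, &old, old->next)) {
            return old;
        }
    }
    return NULL;
}

In the patch's terms: each sweeper fills its own padded slot of the gc_allocd_scratch array with the owned (nosync) push, so the hot path never CASes on a shared stack head; only the steal path and the final merge in gc_sweep_pool touch stacks across threads.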