Skip to content

Commit

Permalink
reduce contention on page metadata lists during the sweeping phase
Browse files Browse the repository at this point in the history
  • Loading branch information
d-netto committed Jan 19, 2024
1 parent fb2d946 commit e76984f
Show file tree
Hide file tree
Showing 4 changed files with 162 additions and 14 deletions.
131 changes: 119 additions & 12 deletions src/gc.c
Original file line number Diff line number Diff line change
Expand Up @@ -1541,7 +1541,8 @@ static void gc_sweep_page(gc_page_profiler_serializer_t *s, jl_gc_pool_t *p, jl_

done:
if (re_use_page) {
push_lf_back(allocd, pg);
// we're pushing into a local page stack to reduce contention
push_lf_back_nosync(allocd, pg);
}
else {
gc_alloc_map_set(pg->data, GC_PAGE_LAZILY_FREED);
Expand Down Expand Up @@ -1596,8 +1597,68 @@ static void gc_pool_sync_nfree(jl_gc_pagemeta_t *pg, jl_taggedvalue_t *last) JL_
pg->nfree = nfree;
}

void gc_sweep_wake_all(void)
// Pre-scan of the per-thread page metadata lists, run before waking the
// parallel sweep threads: pages that don't need a linear scan are swept
// eagerly here, and the remaining pages are counted to decide whether
// parallel sweeping is worth the wake-up cost.
// Returns 1 if at least `n_pages_worth_parallel_sweep` pages remain to scan.
int gc_sweep_prescan(jl_ptls_t ptls)
{
    // 4MB worth of pages is worth parallelizing
    const int n_pages_worth_parallel_sweep = (int)(4 * (1 << 20) / GC_PAGE_SZ);
    int n_pages_to_scan = 0;
    gc_page_profiler_serializer_t serializer = gc_page_serializer_create();
    // push into local page stack. we'll merge them later...
    jl_gc_page_stack_t *allocd_scratch = jl_atomic_load(&gc_allocd_scratch);
    assert(allocd_scratch != NULL);
    jl_gc_page_stack_t *dest = &allocd_scratch[ptls->tid];
    for (int t_i = 0; t_i < gc_n_threads; t_i++) {
        jl_ptls_t ptls2 = gc_all_tls_states[t_i];
        if (ptls2 == NULL) {
            continue;
        }
        jl_gc_page_stack_t tmp;
        jl_gc_pagemeta_t *tail = NULL;
        memset(&tmp, 0, sizeof(tmp));
        while (1) {
            jl_gc_pagemeta_t *pg = try_pop_lf_back(&ptls2->page_metadata_allocd);
            if (pg == NULL) {
                break;
            }
            int should_scan = 1;
            if (!pg->has_marked) {
                should_scan = 0;
            }
            if (!current_sweep_full && !pg->has_young) {
                assert(!prev_sweep_full || pg->prev_nold >= pg->nold);
                if (!prev_sweep_full || pg->prev_nold == pg->nold) {
                    should_scan = 0;
                }
            }
            if (should_scan) {
                // `tail` must only track pages kept in `tmp`: it is the
                // splice point used below to re-attach the unvisited
                // remainder of the list. A page that was eagerly swept
                // already belongs to another list and must not be linked.
                if (tail == NULL) {
                    tail = pg;
                }
                n_pages_to_scan++;
                push_lf_back_nosync(&tmp, pg);
            }
            else {
                // page doesn't need a linear scan: sweep it eagerly now
                gc_sweep_pool_page(&serializer, dest, &ptls2->page_metadata_buffered, pg);
            }
            if (n_pages_to_scan >= n_pages_worth_parallel_sweep) {
                // enough work found: stop draining this thread's list so the
                // remainder can be swept in parallel
                break;
            }
        }
        // splice whatever we did not pop back under the pages kept in `tmp`
        // (done once, after the loop — `tail` is the deepest element of `tmp`)
        if (tail != NULL) {
            tail->next = jl_atomic_load_relaxed(&ptls2->page_metadata_allocd.bottom);
        }
        ptls2->page_metadata_allocd = tmp;
        if (n_pages_to_scan >= n_pages_worth_parallel_sweep) {
            break;
        }
    }
    gc_page_serializer_destroy(&serializer);
    return n_pages_to_scan >= n_pages_worth_parallel_sweep;
}

void gc_sweep_wake_all(jl_ptls_t ptls)
{
int parallel_sweep_worthwhile = gc_sweep_prescan(ptls);
if (!parallel_sweep_worthwhile) {
return;
}
uv_mutex_lock(&gc_threads_lock);
for (int i = gc_first_tid; i < gc_first_tid + jl_n_markthreads; i++) {
jl_ptls_t ptls2 = gc_all_tls_states[i];
Expand All @@ -1615,30 +1676,52 @@ void gc_sweep_wait_for_all(void)
}
}

void gc_sweep_pool_parallel(void)
void gc_sweep_pool_parallel(jl_ptls_t ptls)
{
jl_atomic_fetch_add(&gc_n_threads_sweeping, 1);
jl_gc_page_stack_t *allocd_scratch = jl_atomic_load(&gc_allocd_scratch);
if (allocd_scratch != NULL) {
// push into local page stack to reduce contention
// we'll merge them later...
jl_gc_page_stack_t *dest = &allocd_scratch[ptls->tid];
gc_page_profiler_serializer_t serializer = gc_page_serializer_create();
while (1) {
int found_pg = 0;
// sequentially walk the threads and sweep the pages
for (int t_i = 0; t_i < gc_n_threads; t_i++) {
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
// skip foreign threads that already exited
if (ptls2 == NULL) {
continue;
}
jl_gc_page_stack_t *allocd = &allocd_scratch[t_i];
jl_gc_pagemeta_t *pg = pop_lf_back(&ptls2->page_metadata_allocd);
jl_gc_pagemeta_t *pg = try_pop_lf_back(&ptls2->page_metadata_allocd);
// failed steal attempt
if (pg == NULL) {
continue;
}
gc_sweep_pool_page(&serializer, allocd, &ptls2->page_metadata_buffered, pg);
gc_sweep_pool_page(&serializer, dest, &ptls2->page_metadata_buffered, pg);
found_pg = 1;
}
if (!found_pg) {
break;
// check for termination
int no_more_work = 1;
for (int t_i = 0; t_i < gc_n_threads; t_i++) {
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
// skip foreign threads that already exited
if (ptls2 == NULL) {
continue;
}
jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->page_metadata_allocd.bottom);
if (pg != NULL) {
no_more_work = 0;
break;
}
}
if (no_more_work) {
break;
}
}
jl_cpu_pause();
}
gc_page_serializer_destroy(&serializer);
}
Expand Down Expand Up @@ -1669,7 +1752,7 @@ static void gc_sweep_pool(void)

// allocate enough space to hold the end of the free list chain
// for every thread and pool size
jl_taggedvalue_t ***pfl = (jl_taggedvalue_t ***) alloca(n_threads * JL_GC_N_POOLS * sizeof(jl_taggedvalue_t**));
jl_taggedvalue_t ***pfl = (jl_taggedvalue_t ***) malloc_s(n_threads * JL_GC_N_POOLS * sizeof(jl_taggedvalue_t**));

// update metadata of pages that were pointed to by freelist or newpages from a pool
// i.e. pages being the current allocation target
Expand Down Expand Up @@ -1711,17 +1794,37 @@ static void gc_sweep_pool(void)
}

// the actual sweeping
jl_gc_page_stack_t *tmp = (jl_gc_page_stack_t *)alloca(n_threads * sizeof(jl_gc_page_stack_t));
jl_gc_page_stack_t *tmp = (jl_gc_page_stack_t *)jl_malloc_aligned(n_threads * sizeof(jl_gc_page_stack_t), 128);
if (tmp == NULL) {
abort();
}
memset(tmp, 0, n_threads * sizeof(jl_gc_page_stack_t));
jl_atomic_store(&gc_allocd_scratch, tmp);
gc_sweep_wake_all();
gc_sweep_pool_parallel();
jl_ptls_t ptls = jl_current_task->ptls;
gc_sweep_wake_all(ptls);
gc_sweep_pool_parallel(ptls);
gc_sweep_wait_for_all();

// merge the page metadata lists
for (int t_i = 0; t_i < n_threads; t_i++) {
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
if (ptls2 == NULL) {
continue;
}
while (1) {
jl_gc_pagemeta_t *pg = pop_lf_back_nosync(&tmp[t_i]);
if (pg == NULL) {
break;
}
jl_ptls_t ptls3 = gc_all_tls_states[pg->thread_n];
push_lf_back_nosync(&ptls3->page_metadata_allocd, pg);
}
}

// reset half-pages pointers
for (int t_i = 0; t_i < n_threads; t_i++) {
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
if (ptls2 != NULL) {
ptls2->page_metadata_allocd = tmp[t_i];
for (int i = 0; i < JL_GC_N_POOLS; i++) {
jl_gc_pool_t *p = &ptls2->heap.norm_pools[i];
p->newpages = NULL;
Expand Down Expand Up @@ -1759,6 +1862,10 @@ static void gc_sweep_pool(void)
}
}

// cleanup
free(pfl);
free(tmp);

#ifdef _P64 // only enable concurrent sweeping on 64bit
// wake thread up to sweep concurrently
if (jl_n_sweepthreads > 0) {
Expand Down
37 changes: 36 additions & 1 deletion src/gc.h
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,24 @@ extern jl_gc_page_stack_t global_page_pool_freed;
// in the sweeping phase, which also doesn't push a node into the
// same stack after it's popped

// Push `elt` onto the page stack without synchronization.
// Only valid when the caller is the sole thread touching `pool`
// (e.g. a thread-local scratch stack), so no CAS retry loop is needed.
STATIC_INLINE void push_lf_back_nosync(jl_gc_page_stack_t *pool, jl_gc_pagemeta_t *elt) JL_NOTSAFEPOINT
{
    elt->next = jl_atomic_load_relaxed(&pool->bottom);
    jl_atomic_store_relaxed(&pool->bottom, elt);
}

// Pop the most recently pushed page from the stack without synchronization.
// Returns NULL when the stack is empty. Only valid when the caller is the
// sole thread touching `pool`, so plain relaxed load/store suffice.
STATIC_INLINE jl_gc_pagemeta_t *pop_lf_back_nosync(jl_gc_page_stack_t *pool) JL_NOTSAFEPOINT
{
    jl_gc_pagemeta_t *top = jl_atomic_load_relaxed(&pool->bottom);
    if (top != NULL) {
        jl_atomic_store_relaxed(&pool->bottom, top->next);
    }
    return top;
}

STATIC_INLINE void push_lf_back(jl_gc_page_stack_t *pool, jl_gc_pagemeta_t *elt) JL_NOTSAFEPOINT
{
while (1) {
Expand All @@ -211,6 +229,23 @@ STATIC_INLINE void push_lf_back(jl_gc_page_stack_t *pool, jl_gc_pagemeta_t *elt)
}
}

#define MAX_POP_ATTEMPTS (1 << 10)

// Bounded-retry pop from the lock-free page stack: attempts the CAS up to
// MAX_POP_ATTEMPTS times and returns NULL on exhaustion (or when the stack
// is empty), so a caller under heavy contention can go look for work on
// another thread's list instead of spinning indefinitely.
STATIC_INLINE jl_gc_pagemeta_t *try_pop_lf_back(jl_gc_page_stack_t *pool) JL_NOTSAFEPOINT
{
    for (int attempt = 0; attempt < MAX_POP_ATTEMPTS; attempt++) {
        jl_gc_pagemeta_t *top = jl_atomic_load_relaxed(&pool->bottom);
        if (top == NULL) {
            return NULL;
        }
        if (jl_atomic_cmpswap(&pool->bottom, &top, top->next)) {
            return top;
        }
        // lost the race — back off briefly before retrying
        jl_cpu_pause();
    }
    return NULL;
}

STATIC_INLINE jl_gc_pagemeta_t *pop_lf_back(jl_gc_page_stack_t *pool) JL_NOTSAFEPOINT
{
while (1) {
Expand Down Expand Up @@ -473,7 +508,7 @@ void gc_mark_finlist(jl_gc_markqueue_t *mq, arraylist_t *list, size_t start) JL_
void gc_mark_loop_serial_(jl_ptls_t ptls, jl_gc_markqueue_t *mq);
void gc_mark_loop_serial(jl_ptls_t ptls);
void gc_mark_loop_parallel(jl_ptls_t ptls, int master);
void gc_sweep_pool_parallel(void);
void gc_sweep_pool_parallel(jl_ptls_t ptls);
void gc_free_pages(void);
void sweep_stack_pools(void);
void jl_gc_debug_init(void);
Expand Down
6 changes: 6 additions & 0 deletions src/julia_threads.h
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,12 @@ struct _jl_gc_pagemeta_t;

typedef struct {
    // top of a lock-free stack of page metadata nodes; pushes and pops
    // all happen at this end (see push_lf_back/pop_lf_back in gc.h)
    _Atomic(struct _jl_gc_pagemeta_t *) bottom;
    // pad to 128 bytes to avoid false-sharing
    // (8B pointer + 15*8B = 128 on 64-bit; 4B + 31*4B = 128 on 32-bit)
#ifdef _P64
    void *_pad[15];
#else
    void *_pad[31];
#endif
} jl_gc_page_stack_t;

// This includes all the thread local states we care about for a thread.
Expand Down
2 changes: 1 addition & 1 deletion src/scheduler.c
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ void jl_parallel_gc_threadfun(void *arg)
gc_mark_loop_parallel(ptls, 0);
}
if (may_sweep(ptls)) { // not an else!
gc_sweep_pool_parallel();
gc_sweep_pool_parallel(ptls);
jl_atomic_fetch_add(&ptls->gc_sweeps_requested, -1);
}
}
Expand Down

0 comments on commit e76984f

Please sign in to comment.