reduce contention on page metadata lists during the sweeping phase (JuliaLang#52943)

**EDIT**: fixes JuliaLang#52937 by
decreasing contention on the page metadata lists and only waking the GC
threads up if there is a sufficiently large number of pages to sweep.
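
In short, the sweep entry point now pre-scans the per-thread page lists,
sweeps the trivial pages inline, and wakes the parallel GC threads only when
at least ~4MB worth of pages still need a linear scan. A condensed sketch of
the gating logic (the full version is in the diff below):

```c
// condensed from gc_sweep_wake_all() in this PR
void gc_sweep_wake_all(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_scratch)
{
    // the prescan sweeps pages that need no linear scan and reports whether
    // >= 4MB worth of pages (n_pages_worth_parallel_sweep) remain to be swept
    int parallel_sweep_worthwhile = gc_sweep_prescan(ptls, new_gc_allocd_scratch);
    jl_atomic_store(&gc_allocd_scratch, new_gc_allocd_scratch);
    if (!parallel_sweep_worthwhile) {
        // too few pages: sweep serially on this thread instead of paying the
        // wakeup and synchronization cost for the GC threads
        return;
    }
    // ... wake every GC thread under gc_threads_lock ...
}
```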

Seems to address the regression from the MWE of
JuliaLang#52937:

- master:
```
../julia-master/julia --project=. run_benchmarks.jl serial obj_arrays issue-52937 -n5 --gcthreads=1
bench = "issue-52937.jl"
┌─────────┬────────────┬─────────┬───────────┬────────────┬──────────────┬───────────────────┬──────────┬────────────┐
│         │ total time │ gc time │ mark time │ sweep time │ max GC pause │ time to safepoint │ max heap │ percent gc │
│         │         ms │      ms │        ms │         ms │           ms │                us │       MB │          % │
├─────────┼────────────┼─────────┼───────────┼────────────┼──────────────┼───────────────────┼──────────┼────────────┤
│ minimum │      24841 │     818 │        78 │        740 │           44 │             10088 │       96 │          3 │
│  median │      24881 │     834 │        83 │        751 │           45 │             10738 │       97 │          3 │
│ maximum │      25002 │     891 │        87 │        803 │           48 │             11074 │      112 │          4 │
│   stdev │         78 │      29 │         4 │         26 │            1 │               393 │        7 │          0 │
└─────────┴────────────┴─────────┴───────────┴────────────┴──────────────┴───────────────────┴──────────┴────────────┘
../julia-master/julia --project=. run_benchmarks.jl serial obj_arrays issue-52937 -n5 --gcthreads=8
bench = "issue-52937.jl"
┌─────────┬────────────┬─────────┬───────────┬────────────┬──────────────┬───────────────────┬──────────┬────────────┐
│         │ total time │ gc time │ mark time │ sweep time │ max GC pause │ time to safepoint │ max heap │ percent gc │
│         │         ms │      ms │        ms │         ms │           ms │                us │       MB │          % │
├─────────┼────────────┼─────────┼───────────┼────────────┼──────────────┼───────────────────┼──────────┼────────────┤
│ minimum │      29113 │    5200 │        68 │       5130 │           12 │              9724 │       95 │         18 │
│  median │      29354 │    5274 │        69 │       5204 │           12 │             10456 │       96 │         18 │
│ maximum │      29472 │    5333 │        70 │       5264 │           14 │             11913 │       97 │         18 │
│   stdev │        138 │      54 │         1 │         55 │            1 │               937 │        1 │          0 │
└─────────┴────────────┴─────────┴───────────┴────────────┴──────────────┴───────────────────┴──────────┴────────────┘
```

- PR:
```
../julia-master/julia --project=. run_benchmarks.jl serial obj_arrays issue-52937 -n5 --gcthreads=1
bench = "issue-52937.jl"
┌─────────┬────────────┬─────────┬───────────┬────────────┬──────────────┬───────────────────┬──────────┬────────────┐
│         │ total time │ gc time │ mark time │ sweep time │ max GC pause │ time to safepoint │ max heap │ percent gc │
│         │         ms │      ms │        ms │         ms │           ms │                us │       MB │          % │
├─────────┼────────────┼─────────┼───────────┼────────────┼──────────────┼───────────────────┼──────────┼────────────┤
│ minimum │      24475 │     761 │        77 │        681 │           40 │              9499 │       94 │          3 │
│  median │      24845 │     775 │        80 │        698 │           43 │             10793 │       97 │          3 │
│ maximum │      25128 │     811 │        85 │        726 │           47 │             12820 │      113 │          3 │
│   stdev │        240 │      22 │         3 │         21 │            3 │              1236 │        8 │          0 │
└─────────┴────────────┴─────────┴───────────┴────────────┴──────────────┴───────────────────┴──────────┴────────────┘
../julia-master/julia --project=. run_benchmarks.jl serial obj_arrays issue-52937 -n5 --gcthreads=8
bench = "issue-52937.jl"
┌─────────┬────────────┬─────────┬───────────┬────────────┬──────────────┬───────────────────┬──────────┬────────────┐
│         │ total time │ gc time │ mark time │ sweep time │ max GC pause │ time to safepoint │ max heap │ percent gc │
│         │         ms │      ms │        ms │         ms │           ms │                us │       MB │          % │
├─────────┼────────────┼─────────┼───────────┼────────────┼──────────────┼───────────────────┼──────────┼────────────┤
│ minimum │      24709 │     679 │        70 │        609 │           11 │              9981 │       95 │          3 │
│  median │      24869 │     702 │        70 │        631 │           12 │             10705 │       96 │          3 │
│ maximum │      24911 │     708 │        72 │        638 │           13 │             10820 │       98 │          3 │
│   stdev │         79 │      12 │         1 │         12 │            1 │               401 │        1 │          0 │
└─────────┴────────────┴─────────┴───────────┴────────────┴──────────────┴───────────────────┴──────────┴────────────┘
```

Also, performance on `objarray.jl` (an example of a benchmark in which
sweeping parallelizes well under the current implementation) seems fine:

- master:
```
../julia-master/julia --project=. run_benchmarks.jl multithreaded bigarrays -n5 --gcthreads=1      
bench = "objarray.jl"
┌─────────┬────────────┬─────────┬───────────┬────────────┬──────────────┬───────────────────┬──────────┬────────────┐
│         │ total time │ gc time │ mark time │ sweep time │ max GC pause │ time to safepoint │ max heap │ percent gc │
│         │         ms │      ms │        ms │         ms │           ms │                us │       MB │          % │
├─────────┼────────────┼─────────┼───────────┼────────────┼──────────────┼───────────────────┼──────────┼────────────┤
│ minimum │      19301 │   10792 │      7485 │       3307 │         1651 │               196 │     4519 │         56 │
│  median │      21415 │   12646 │      9094 │       3551 │         1985 │               241 │     6576 │         59 │
│ maximum │      21873 │   13118 │      9353 │       3765 │         2781 │               330 │     8793 │         60 │
│   stdev │       1009 │     932 │       757 │        190 │          449 │                50 │     1537 │          2 │
└─────────┴────────────┴─────────┴───────────┴────────────┴──────────────┴───────────────────┴──────────┴────────────┘
../julia-master/julia --project=. run_benchmarks.jl multithreaded bigarrays -n5 --gcthreads=8
bench = "objarray.jl"
┌─────────┬────────────┬─────────┬───────────┬────────────┬──────────────┬───────────────────┬──────────┬────────────┐
│         │ total time │ gc time │ mark time │ sweep time │ max GC pause │ time to safepoint │ max heap │ percent gc │
│         │         ms │      ms │        ms │         ms │           ms │                us │       MB │          % │
├─────────┼────────────┼─────────┼───────────┼────────────┼──────────────┼───────────────────┼──────────┼────────────┤
│ minimum │      13135 │    4377 │      3350 │       1007 │          491 │               231 │     6062 │         33 │
│  median │      13164 │    4540 │      3370 │       1177 │          669 │               256 │     6383 │         35 │
│ maximum │      13525 │    4859 │      3675 │       1184 │          748 │               320 │     7528 │         36 │
│   stdev │        183 │     189 │       146 │         77 │          129 │                42 │      584 │          1 │
└─────────┴────────────┴─────────┴───────────┴────────────┴──────────────┴───────────────────┴──────────┴────────────┘
```

- PR:
```
../julia-master/julia --project=. run_benchmarks.jl multithreaded bigarrays -n5 --gcthreads=1    
bench = "objarray.jl"
┌─────────┬────────────┬─────────┬───────────┬────────────┬──────────────┬───────────────────┬──────────┬────────────┐
│         │ total time │ gc time │ mark time │ sweep time │ max GC pause │ time to safepoint │ max heap │ percent gc │
│         │         ms │      ms │        ms │         ms │           ms │                us │       MB │          % │
├─────────┼────────────┼─────────┼───────────┼────────────┼──────────────┼───────────────────┼──────────┼────────────┤
│ minimum │      19642 │   10931 │      7566 │       3365 │         1653 │               204 │     5688 │         56 │
│  median │      21441 │   12717 │      8948 │       3770 │         1796 │               217 │     6972 │         59 │
│ maximum │      23494 │   14643 │     10576 │       4067 │         2513 │               248 │     8229 │         62 │
│   stdev │       1408 │    1339 │      1079 │        267 │          393 │                19 │      965 │          2 │
└─────────┴────────────┴─────────┴───────────┴────────────┴──────────────┴───────────────────┴──────────┴────────────┘
../julia-master/julia --project=. run_benchmarks.jl multithreaded bigarrays -n5 --gcthreads=8
bench = "objarray.jl"
┌─────────┬────────────┬─────────┬───────────┬────────────┬──────────────┬───────────────────┬──────────┬────────────┐
│         │ total time │ gc time │ mark time │ sweep time │ max GC pause │ time to safepoint │ max heap │ percent gc │
│         │         ms │      ms │        ms │         ms │           ms │                us │       MB │          % │
├─────────┼────────────┼─────────┼───────────┼────────────┼──────────────┼───────────────────┼──────────┼────────────┤
│ minimum │      13365 │    4544 │      3389 │       1104 │          516 │               255 │     6349 │         34 │
│  median │      13445 │    4624 │      3404 │       1233 │          578 │               275 │     6385 │         34 │
│ maximum │      14413 │    5278 │      3837 │       1441 │          753 │               300 │     7547 │         37 │
│   stdev │        442 │     303 │       194 │        121 │           89 │                18 │      522 │          1 │
└─────────┴────────────┴─────────┴───────────┴────────────┴──────────────┴───────────────────┴──────────┴────────────┘
```
d-netto authored and RAI CI (GitHub Action Automation) committed Jan 27, 2024
1 parent a5dd1ab commit c69f6e1
Showing 3 changed files with 154 additions and 18 deletions.
124 changes: 108 additions & 16 deletions src/gc.c
@@ -21,8 +21,8 @@ int jl_n_sweepthreads;
_Atomic(int) gc_n_threads_marking;
// Number of threads sweeping
_Atomic(int) gc_n_threads_sweeping;
// Temporary for the `ptls->page_metadata_allocd` used during parallel sweeping
_Atomic(jl_gc_page_stack_t *) gc_allocd_scratch;
// Temporary for the `ptls->page_metadata_allocd` used during parallel sweeping (padded to avoid false sharing)
_Atomic(jl_gc_padded_page_stack_t *) gc_allocd_scratch;
// `tid` of mutator thread that triggered GC
_Atomic(int) gc_master_tid;
// `tid` of first GC thread
@@ -1586,8 +1586,72 @@ static void gc_pool_sync_nfree(jl_gc_pagemeta_t *pg, jl_taggedvalue_t *last) JL_
pg->nfree = nfree;
}

void gc_sweep_wake_all(void)
// pre-scan pages to check whether there are enough of them to make parallel sweeping worthwhile;
// also sweeps pages that don't need to be linearly scanned
int gc_sweep_prescan(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_scratch)
{
// 4MB worth of pages is worth parallelizing
const int n_pages_worth_parallel_sweep = (int)(4 * (1 << 20) / GC_PAGE_SZ);
int n_pages_to_scan = 0;
gc_page_profiler_serializer_t serializer = gc_page_serializer_create();
for (int t_i = 0; t_i < gc_n_threads; t_i++) {
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
if (ptls2 == NULL) {
continue;
}
jl_gc_page_stack_t *dest = &new_gc_allocd_scratch[ptls2->tid].stack;
jl_gc_page_stack_t tmp;
jl_gc_pagemeta_t *tail = NULL;
memset(&tmp, 0, sizeof(tmp));
while (1) {
jl_gc_pagemeta_t *pg = pop_lf_back_nosync(&ptls2->page_metadata_allocd);
if (pg == NULL) {
break;
}
int should_scan = 1;
if (!pg->has_marked) {
should_scan = 0;
}
if (!current_sweep_full && !pg->has_young) {
assert(!prev_sweep_full || pg->prev_nold >= pg->nold);
if (!prev_sweep_full || pg->prev_nold == pg->nold) {
should_scan = 0;
}
}
if (should_scan) {
if (tail == NULL) {
tail = pg;
}
n_pages_to_scan++;
push_lf_back_nosync(&tmp, pg);
}
else {
gc_sweep_pool_page(&serializer, dest, &ptls2->page_metadata_buffered, pg);
}
if (n_pages_to_scan >= n_pages_worth_parallel_sweep) {
break;
}
}
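// splice the pages kept for scanning back on top of any pages left
// unprocessed, so each thread still ends up with a single page list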
if (tail != NULL) {
tail->next = jl_atomic_load_relaxed(&ptls2->page_metadata_allocd.bottom);
}
ptls2->page_metadata_allocd = tmp;
if (n_pages_to_scan >= n_pages_worth_parallel_sweep) {
break;
}
}
gc_page_serializer_destroy(&serializer);
return n_pages_to_scan >= n_pages_worth_parallel_sweep;
}

// wake up all threads to sweep the pages
void gc_sweep_wake_all(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_scratch)
{
int parallel_sweep_worthwhile = gc_sweep_prescan(ptls, new_gc_allocd_scratch);
jl_atomic_store(&gc_allocd_scratch, new_gc_allocd_scratch);
if (!parallel_sweep_worthwhile) {
return;
}
uv_mutex_lock(&gc_threads_lock);
for (int i = gc_first_tid; i < gc_first_tid + jl_n_markthreads; i++) {
jl_ptls_t ptls2 = gc_all_tls_states[i];
@@ -1597,6 +1661,7 @@ void gc_sweep_wake_all(void)
uv_mutex_unlock(&gc_threads_lock);
}

// wait for all threads to finish sweeping
void gc_sweep_wait_for_all(void)
{
jl_atomic_store(&gc_allocd_scratch, NULL);
@@ -1605,36 +1670,58 @@ void gc_sweep_wait_for_all(void)
}
}

void gc_sweep_pool_parallel(void)
// sweep all pools
void gc_sweep_pool_parallel(jl_ptls_t ptls)
{
jl_atomic_fetch_add(&gc_n_threads_sweeping, 1);
jl_gc_page_stack_t *allocd_scratch = jl_atomic_load(&gc_allocd_scratch);
jl_gc_padded_page_stack_t *allocd_scratch = jl_atomic_load(&gc_allocd_scratch);
if (allocd_scratch != NULL) {
gc_page_profiler_serializer_t serializer = gc_page_serializer_create();
while (1) {
int found_pg = 0;
// sequentially walk the threads and sweep the pages
for (int t_i = 0; t_i < gc_n_threads; t_i++) {
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
// skip foreign threads that already exited
if (ptls2 == NULL) {
continue;
}
jl_gc_page_stack_t *allocd = &allocd_scratch[t_i];
jl_gc_pagemeta_t *pg = pop_lf_back(&ptls2->page_metadata_allocd);
jl_gc_page_stack_t *dest = &allocd_scratch[ptls2->tid].stack;
jl_gc_pagemeta_t *pg = try_pop_lf_back(&ptls2->page_metadata_allocd);
// failed steal attempt
if (pg == NULL) {
continue;
}
gc_sweep_pool_page(&serializer, allocd, &ptls2->page_metadata_buffered, pg);
gc_sweep_pool_page(&serializer, dest, &ptls2->page_metadata_buffered, pg);
found_pg = 1;
}
if (!found_pg) {
break;
// check for termination
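// (try_pop_lf_back can fail on a contended stack even when pages remain,
// so confirm that every list is actually drained before exiting)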
int no_more_work = 1;
for (int t_i = 0; t_i < gc_n_threads; t_i++) {
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
// skip foreign threads that already exited
if (ptls2 == NULL) {
continue;
}
jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->page_metadata_allocd.bottom);
if (pg != NULL) {
no_more_work = 0;
break;
}
}
if (no_more_work) {
break;
}
}
jl_cpu_pause();
}
gc_page_serializer_destroy(&serializer);
}
jl_atomic_fetch_add(&gc_n_threads_sweeping, -1);
}

// free all pages (e.g. through `madvise` on Linux) that were lazily freed
void gc_free_pages(void)
{
while (1) {
@@ -1659,7 +1746,7 @@ static void gc_sweep_pool(void)

// allocate enough space to hold the end of the free list chain
// for every thread and pool size
jl_taggedvalue_t ***pfl = (jl_taggedvalue_t ***) alloca(n_threads * JL_GC_N_POOLS * sizeof(jl_taggedvalue_t**));
jl_taggedvalue_t ***pfl = (jl_taggedvalue_t ***) malloc_s(n_threads * JL_GC_N_POOLS * sizeof(jl_taggedvalue_t**));
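// (heap allocation instead of the previous alloca: the buffer scales with
// the number of threads, so this presumably keeps large frames off the
// stack; it is freed at the end of gc_sweep_pool)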

// update metadata of pages that were pointed to by freelist or newpages from a pool
// i.e. pages being the current allocation target
@@ -1701,17 +1788,18 @@ static void gc_sweep_pool(void)
}

// the actual sweeping
jl_gc_page_stack_t *tmp = (jl_gc_page_stack_t *)alloca(n_threads * sizeof(jl_gc_page_stack_t));
memset(tmp, 0, n_threads * sizeof(jl_gc_page_stack_t));
jl_atomic_store(&gc_allocd_scratch, tmp);
gc_sweep_wake_all();
gc_sweep_pool_parallel();
jl_gc_padded_page_stack_t *new_gc_allocd_scratch = (jl_gc_padded_page_stack_t *) malloc_s(n_threads * sizeof(jl_gc_padded_page_stack_t));
memset(new_gc_allocd_scratch, 0, n_threads * sizeof(jl_gc_padded_page_stack_t));
jl_ptls_t ptls = jl_current_task->ptls;
gc_sweep_wake_all(ptls, new_gc_allocd_scratch);
gc_sweep_pool_parallel(ptls);
gc_sweep_wait_for_all();

// reset half-page pointers
for (int t_i = 0; t_i < n_threads; t_i++) {
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
if (ptls2 != NULL) {
ptls2->page_metadata_allocd = tmp[t_i];
ptls2->page_metadata_allocd = new_gc_allocd_scratch[t_i].stack;
for (int i = 0; i < JL_GC_N_POOLS; i++) {
jl_gc_pool_t *p = &ptls2->heap.norm_pools[i];
p->newpages = NULL;
@@ -1749,6 +1837,10 @@ static void gc_sweep_pool(void)
}
}

// cleanup
free(pfl);
free(new_gc_allocd_scratch);

#ifdef _P64 // only enable concurrent sweeping on 64bit
// wake thread up to sweep concurrently
if (jl_n_sweepthreads > 0) {
46 changes: 45 additions & 1 deletion src/gc.h
@@ -195,6 +195,23 @@ extern jl_gc_page_stack_t global_page_pool_freed;
// in the sweeping phase, which also doesn't push a node into the
// same stack after it's popped

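// "nosync" variants: single-threaded push/pop, safe only while no other
// thread can access the stack concurrently (e.g. in gc_sweep_prescan, which
// runs before the parallel GC threads are woken)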
STATIC_INLINE void push_lf_back_nosync(jl_gc_page_stack_t *pool, jl_gc_pagemeta_t *elt) JL_NOTSAFEPOINT
{
jl_gc_pagemeta_t *old_back = jl_atomic_load_relaxed(&pool->bottom);
elt->next = old_back;
jl_atomic_store_relaxed(&pool->bottom, elt);
}

STATIC_INLINE jl_gc_pagemeta_t *pop_lf_back_nosync(jl_gc_page_stack_t *pool) JL_NOTSAFEPOINT
{
jl_gc_pagemeta_t *old_back = jl_atomic_load_relaxed(&pool->bottom);
if (old_back == NULL) {
return NULL;
}
jl_atomic_store_relaxed(&pool->bottom, old_back->next);
return old_back;
}

STATIC_INLINE void push_lf_back(jl_gc_page_stack_t *pool, jl_gc_pagemeta_t *elt) JL_NOTSAFEPOINT
{
while (1) {
@@ -207,6 +224,23 @@ STATIC_INLINE void push_lf_back(jl_gc_page_stack_t *pool, jl_gc_pagemeta_t *elt)
}
}

#define MAX_POP_ATTEMPTS (1 << 10)

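// bounded variant of pop_lf_back: returns NULL after MAX_POP_ATTEMPTS failed
// CAS attempts, so a sweeper can move on to another thread's page list
// instead of spinning on a heavily contended stack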
STATIC_INLINE jl_gc_pagemeta_t *try_pop_lf_back(jl_gc_page_stack_t *pool) JL_NOTSAFEPOINT
{
for (int i = 0; i < MAX_POP_ATTEMPTS; i++) {
jl_gc_pagemeta_t *old_back = jl_atomic_load_relaxed(&pool->bottom);
if (old_back == NULL) {
return NULL;
}
if (jl_atomic_cmpswap(&pool->bottom, &old_back, old_back->next)) {
return old_back;
}
jl_cpu_pause();
}
return NULL;
}

STATIC_INLINE jl_gc_pagemeta_t *pop_lf_back(jl_gc_page_stack_t *pool) JL_NOTSAFEPOINT
{
while (1) {
@@ -220,6 +254,16 @@ STATIC_INLINE jl_gc_pagemeta_t *pop_lf_back(jl_gc_page_stack_t *pool) JL_NOTSAFE
jl_cpu_pause();
}
}
typedef struct {
jl_gc_page_stack_t stack;
// pad to 128 bytes to avoid false-sharing
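// (128 bytes covers two 64-byte cache lines, so neighboring stacks stay out
// of each other's lines even with adjacent-cache-line hardware prefetching)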
#ifdef _P64
void *_pad[15];
#else
void *_pad[31];
#endif
} jl_gc_padded_page_stack_t;
static_assert(sizeof(jl_gc_padded_page_stack_t) == 128, "jl_gc_padded_page_stack_t is not 128 bytes");

typedef struct {
_Atomic(size_t) n_freed_objs;
@@ -461,7 +505,7 @@ void gc_mark_finlist(jl_gc_markqueue_t *mq, arraylist_t *list, size_t start) JL_
void gc_mark_loop_serial_(jl_ptls_t ptls, jl_gc_markqueue_t *mq);
void gc_mark_loop_serial(jl_ptls_t ptls);
void gc_mark_loop_parallel(jl_ptls_t ptls, int master);
void gc_sweep_pool_parallel(void);
void gc_sweep_pool_parallel(jl_ptls_t ptls);
void gc_free_pages(void);
void sweep_stack_pools(void);
void jl_gc_debug_init(void);
2 changes: 1 addition & 1 deletion src/partr.c
@@ -143,7 +143,7 @@ void jl_parallel_gc_threadfun(void *arg)
gc_mark_loop_parallel(ptls, 0);
}
if (may_sweep(ptls)) { // not an else!
gc_sweep_pool_parallel();
gc_sweep_pool_parallel(ptls);
jl_atomic_fetch_add(&ptls->gc_sweeps_requested, -1);
}
}
