From c14f6dea59ce6ef0d90a49f717c83047fac698ac Mon Sep 17 00:00:00 2001 From: Diogo Netto <61364108+d-netto@users.noreply.github.com> Date: Fri, 22 Sep 2023 15:48:34 -0300 Subject: [PATCH] parallelize sweeping of object pools (#51282) Sweeping of object pools will either construct a free list through dead objects (if there is at least one live object in a given page) or return the page to the OS (if there are no live objects whatsoever). With this PR, we're basically constructing the free-lists for each GC page in parallel. --- src/gc-debug.c | 4 +- src/gc-pages.c | 12 +-- src/gc.c | 188 ++++++++++++++++++++++++++++++-------------- src/gc.h | 33 ++++---- src/julia_threads.h | 9 ++- src/partr.c | 33 ++++---- src/threading.c | 4 +- src/threading.h | 4 +- 8 files changed, 187 insertions(+), 100 deletions(-) diff --git a/src/gc-debug.c b/src/gc-debug.c index 3f0f3368533a42..12acb1cfbe7cb9 100644 --- a/src/gc-debug.c +++ b/src/gc-debug.c @@ -115,7 +115,7 @@ static void gc_clear_mark_outer(int bits) { for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - jl_gc_pagemeta_t *pg = ptls2->page_metadata_allocd; + jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->page_metadata_allocd.bottom); while (pg != NULL) { gc_clear_mark_page(pg, bits); pg = pg->next; @@ -1129,7 +1129,7 @@ static void gc_count_pool_pagetable(void) { for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - jl_gc_pagemeta_t *pg = ptls2->page_metadata_allocd; + jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->page_metadata_allocd.bottom); while (pg != NULL) { if (gc_alloc_map_is_set(pg->data)) { gc_count_pool_page(pg); diff --git a/src/gc-pages.c b/src/gc-pages.c index 7a32a7f4bf9753..40e54541361488 100644 --- a/src/gc-pages.c +++ b/src/gc-pages.c @@ -129,7 +129,7 @@ NOINLINE jl_gc_pagemeta_t *jl_gc_alloc_page(void) JL_NOTSAFEPOINT jl_gc_pagemeta_t *meta = NULL; // try to get page from `pool_lazily_freed` - meta = pop_lf_page_metadata_back(&global_page_pool_lazily_freed); + meta = pop_lf_back(&global_page_pool_lazily_freed); if (meta != NULL) { gc_alloc_map_set(meta->data, GC_PAGE_ALLOCATED); // page is already mapped @@ -137,14 +137,14 @@ NOINLINE jl_gc_pagemeta_t *jl_gc_alloc_page(void) JL_NOTSAFEPOINT } // try to get page from `pool_clean` - meta = pop_lf_page_metadata_back(&global_page_pool_clean); + meta = pop_lf_back(&global_page_pool_clean); if (meta != NULL) { gc_alloc_map_set(meta->data, GC_PAGE_ALLOCATED); goto exit; } // try to get page from `pool_freed` - meta = pop_lf_page_metadata_back(&global_page_pool_freed); + meta = pop_lf_back(&global_page_pool_freed); if (meta != NULL) { gc_alloc_map_set(meta->data, GC_PAGE_ALLOCATED); goto exit; @@ -152,7 +152,7 @@ NOINLINE jl_gc_pagemeta_t *jl_gc_alloc_page(void) JL_NOTSAFEPOINT uv_mutex_lock(&gc_perm_lock); // another thread may have allocated a large block while we were waiting... 
- meta = pop_lf_page_metadata_back(&global_page_pool_clean); + meta = pop_lf_back(&global_page_pool_clean); if (meta != NULL) { uv_mutex_unlock(&gc_perm_lock); gc_alloc_map_set(meta->data, 1); @@ -166,10 +166,10 @@ NOINLINE jl_gc_pagemeta_t *jl_gc_alloc_page(void) JL_NOTSAFEPOINT pg->data = data + GC_PAGE_SZ * i; gc_alloc_map_maybe_create(pg->data); if (i == 0) { - gc_alloc_map_set(pg->data, 1); + gc_alloc_map_set(pg->data, GC_PAGE_ALLOCATED); } else { - push_lf_page_metadata_back(&global_page_pool_clean, pg); + push_lf_back(&global_page_pool_clean, pg); } } uv_mutex_unlock(&gc_perm_lock); diff --git a/src/gc.c b/src/gc.c index 355147935e881b..f935b68635dd60 100644 --- a/src/gc.c +++ b/src/gc.c @@ -18,6 +18,10 @@ int jl_n_markthreads; int jl_n_sweepthreads; // Number of threads currently running the GC mark-loop _Atomic(int) gc_n_threads_marking; +// Number of threads sweeping +_Atomic(int) gc_n_threads_sweeping; +// Temporary for the `ptls->page_metadata_allocd` used during parallel sweeping +_Atomic(jl_gc_page_stack_t *) gc_allocd_scratch; // `tid` of mutator thread that triggered GC _Atomic(int) gc_master_tid; // `tid` of first GC thread @@ -743,6 +747,7 @@ static int mark_reset_age = 0; static int64_t scanned_bytes; // young bytes scanned while marking static int64_t perm_scanned_bytes; // old bytes scanned while marking int prev_sweep_full = 1; +int current_sweep_full = 0; int under_pressure = 0; // Full collection heuristics @@ -1257,9 +1262,9 @@ STATIC_INLINE jl_taggedvalue_t *gc_reset_page(jl_ptls_t ptls2, const jl_gc_pool_ return beg; } -jl_gc_global_page_pool_t global_page_pool_lazily_freed; -jl_gc_global_page_pool_t global_page_pool_clean; -jl_gc_global_page_pool_t global_page_pool_freed; +jl_gc_page_stack_t global_page_pool_lazily_freed; +jl_gc_page_stack_t global_page_pool_clean; +jl_gc_page_stack_t global_page_pool_freed; pagetable_t alloc_map; // Add a new page to the pool. Discards any pages in `p->newpages` before. @@ -1268,7 +1273,7 @@ static NOINLINE jl_taggedvalue_t *gc_add_page(jl_gc_pool_t *p) JL_NOTSAFEPOINT // Do not pass in `ptls` as argument. This slows down the fast path // in pool_alloc significantly jl_ptls_t ptls = jl_current_task->ptls; - jl_gc_pagemeta_t *pg = pop_page_metadata_back(&ptls->page_metadata_lazily_freed); + jl_gc_pagemeta_t *pg = pop_lf_back(&ptls->page_metadata_buffered); if (pg != NULL) { gc_alloc_map_set(pg->data, GC_PAGE_ALLOCATED); } @@ -1278,7 +1283,7 @@ static NOINLINE jl_taggedvalue_t *gc_add_page(jl_gc_pool_t *p) JL_NOTSAFEPOINT pg->osize = p->osize; pg->thread_n = ptls->tid; set_page_metadata(pg); - push_page_metadata_back(&ptls->page_metadata_allocd, pg); + push_lf_back(&ptls->page_metadata_allocd, pg); jl_taggedvalue_t *fl = gc_reset_page(ptls, p, pg); p->newpages = fl; return fl; @@ -1376,11 +1381,11 @@ int jl_gc_classify_pools(size_t sz, int *osize) // sweep phase -int64_t lazy_freed_pages = 0; +int64_t buffered_pages = 0; // Returns pointer to terminal pointer of list rooted at *pfl. 
-static jl_taggedvalue_t **gc_sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t **allocd, - jl_gc_pagemeta_t **lazily_freed, jl_gc_pagemeta_t *pg, jl_taggedvalue_t **pfl, int sweep_full, int osize) JL_NOTSAFEPOINT +static void gc_sweep_page(jl_gc_pool_t *p, jl_gc_page_stack_t *allocd, jl_gc_page_stack_t *buffered, + jl_gc_pagemeta_t *pg, int osize) JL_NOTSAFEPOINT { char *data = pg->data; jl_taggedvalue_t *v = (jl_taggedvalue_t*)(data + GC_PAGE_OFFSET); @@ -1393,7 +1398,7 @@ static jl_taggedvalue_t **gc_sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t **allo size_t nfree; int re_use_page = 1; - int freed_lazily = 0; + int keep_as_local_buffer = 0; int freedall = 1; int pg_skpd = 1; if (!pg->has_marked) { @@ -1404,9 +1409,9 @@ static jl_taggedvalue_t **gc_sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t **allo // the eager one uses less memory. // FIXME - need to do accounting on a per-thread basis // on quick sweeps, keep a few pages empty but allocated for performance - if (!sweep_full && lazy_freed_pages <= default_collect_interval / GC_PAGE_SZ) { - lazy_freed_pages++; - freed_lazily = 1; + if (!current_sweep_full && buffered_pages <= default_collect_interval / GC_PAGE_SZ) { + buffered_pages++; + keep_as_local_buffer = 1; } #endif nfree = (GC_PAGE_SZ - GC_PAGE_OFFSET) / osize; @@ -1414,15 +1419,9 @@ static jl_taggedvalue_t **gc_sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t **allo } // For quick sweep, we might be able to skip the page if the page doesn't // have any young live cell before marking. - if (!sweep_full && !pg->has_young) { + if (!current_sweep_full && !pg->has_young) { assert(!prev_sweep_full || pg->prev_nold >= pg->nold); if (!prev_sweep_full || pg->prev_nold == pg->nold) { - // the position of the freelist begin/end in this page - // is stored in its metadata - if (pg->fl_begin_offset != (uint16_t)-1) { - *pfl = page_pfl_beg(pg); - pfl = (jl_taggedvalue_t**)page_pfl_end(pg); - } freedall = 0; nfree = pg->nfree; goto done; @@ -1435,6 +1434,8 @@ static jl_taggedvalue_t **gc_sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t **allo int has_young = 0; int16_t prev_nold = 0; int pg_nfree = 0; + jl_taggedvalue_t *fl = NULL; + jl_taggedvalue_t **pfl = &fl; jl_taggedvalue_t **pfl_begin = NULL; while ((char*)v <= lim) { int bits = v->bits.gc; @@ -1446,7 +1447,7 @@ static jl_taggedvalue_t **gc_sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t **allo pg_nfree++; } else { // marked young or old - if (sweep_full || bits == GC_MARKED) { // old enough + if (current_sweep_full || bits == GC_MARKED) { // old enough bits = v->bits.gc = GC_OLD; // promote } prev_nold++; @@ -1468,7 +1469,7 @@ static jl_taggedvalue_t **gc_sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t **allo } pg->nfree = pg_nfree; - if (sweep_full) { + if (current_sweep_full) { pg->nold = 0; pg->prev_nold = prev_nold; } @@ -1477,43 +1478,32 @@ static jl_taggedvalue_t **gc_sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t **allo done: if (re_use_page) { - push_page_metadata_back(allocd, pg); - } - else if (freed_lazily) { - gc_alloc_map_set(pg->data, GC_PAGE_LAZILY_FREED); - push_page_metadata_back(lazily_freed, pg); + push_lf_back(allocd, pg); } else { - #ifdef _P64 // only enable concurrent sweeping on 64bit - if (jl_n_sweepthreads == 0) { - jl_gc_free_page(pg); - push_lf_page_metadata_back(&global_page_pool_freed, pg); + gc_alloc_map_set(pg->data, GC_PAGE_LAZILY_FREED); + if (keep_as_local_buffer) { + push_lf_back(buffered, pg); } else { - gc_alloc_map_set(pg->data, GC_PAGE_LAZILY_FREED); - push_lf_page_metadata_back(&global_page_pool_lazily_freed, pg); + 
push_lf_back(&global_page_pool_lazily_freed, pg); } - #else - jl_gc_free_page(pg); - push_lf_page_metadata_back(&global_page_pool_freed, pg); - #endif } gc_time_count_page(freedall, pg_skpd); - gc_num.freed += (nfree - old_nfree) * osize; - pool_live_bytes += GC_PAGE_SZ - GC_PAGE_OFFSET - nfree * osize; - return pfl; + jl_atomic_fetch_add((_Atomic(int64_t) *)&pool_live_bytes, GC_PAGE_SZ - GC_PAGE_OFFSET - nfree * osize); + jl_atomic_fetch_add((_Atomic(int64_t) *)&gc_num.freed, (nfree - old_nfree) * osize); } // the actual sweeping over all allocated pages in a memory pool -STATIC_INLINE void gc_sweep_pool_page(jl_taggedvalue_t ***pfl, jl_gc_pagemeta_t **allocd, - jl_gc_pagemeta_t **lazily_freed, jl_gc_pagemeta_t *pg, int sweep_full) JL_NOTSAFEPOINT +STATIC_INLINE void gc_sweep_pool_page(jl_gc_page_stack_t *allocd, jl_gc_page_stack_t *lazily_freed, + jl_gc_pagemeta_t *pg) JL_NOTSAFEPOINT { int p_n = pg->pool_n; int t_n = pg->thread_n; jl_ptls_t ptls2 = gc_all_tls_states[t_n]; jl_gc_pool_t *p = &ptls2->heap.norm_pools[p_n]; int osize = pg->osize; - pfl[t_n * JL_GC_N_POOLS + p_n] = gc_sweep_page(p, allocd, lazily_freed, pg, pfl[t_n * JL_GC_N_POOLS + p_n], sweep_full, osize); + gc_sweep_page(p, allocd, lazily_freed, pg, osize); } // sweep over all memory that is being used and not in a pool @@ -1539,11 +1529,70 @@ static void gc_pool_sync_nfree(jl_gc_pagemeta_t *pg, jl_taggedvalue_t *last) JL_ pg->nfree = nfree; } +void gc_sweep_wake_all(void) +{ + uv_mutex_lock(&gc_threads_lock); + for (int i = gc_first_tid; i < gc_first_tid + jl_n_markthreads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + jl_atomic_fetch_add(&ptls2->gc_sweeps_requested, 1); + } + uv_cond_broadcast(&gc_threads_cond); + uv_mutex_unlock(&gc_threads_lock); +} + +void gc_sweep_pool_parallel(void) +{ + jl_atomic_fetch_add(&gc_n_threads_sweeping, 1); + jl_gc_page_stack_t *allocd_scratch = jl_atomic_load(&gc_allocd_scratch); + if (allocd_scratch != NULL) { + while (1) { + int found_pg = 0; + for (int t_i = 0; t_i < gc_n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; + if (ptls2 == NULL) { + continue; + } + jl_gc_page_stack_t *allocd = &allocd_scratch[t_i]; + jl_gc_pagemeta_t *pg = pop_lf_back(&ptls2->page_metadata_allocd); + if (pg == NULL) { + continue; + } + gc_sweep_pool_page(allocd, &ptls2->page_metadata_buffered, pg); + found_pg = 1; + } + if (!found_pg) { + break; + } + } + } + jl_atomic_fetch_add(&gc_n_threads_sweeping, -1); +} + +void gc_sweep_wait_for_all(void) +{ + jl_atomic_store(&gc_allocd_scratch, NULL); + while (jl_atomic_load_relaxed(&gc_n_threads_sweeping) != 0) { + jl_cpu_pause(); + } +} + +void gc_free_pages(void) +{ + while (1) { + jl_gc_pagemeta_t *pg = pop_lf_back(&global_page_pool_lazily_freed); + if (pg == NULL) { + break; + } + jl_gc_free_page(pg); + push_lf_back(&global_page_pool_freed, pg); + } +} + // setup the data-structures for a sweep over all memory pools -static void gc_sweep_pool(int sweep_full) +static void gc_sweep_pool(void) { gc_time_pool_start(); - lazy_freed_pages = 0; + buffered_pages = 0; // For the benefit of the analyzer, which doesn't know that gc_n_threads // doesn't change over the course of this function @@ -1583,26 +1632,26 @@ static void gc_sweep_pool(int sweep_full) pg->has_young = 1; } } - jl_gc_pagemeta_t *pg = ptls2->page_metadata_lazily_freed; + jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->page_metadata_buffered.bottom); while (pg != NULL) { jl_gc_pagemeta_t *pg2 = pg->next; - lazy_freed_pages++; + buffered_pages++; pg = pg2; } } // the actual 
sweeping + jl_gc_page_stack_t *tmp = (jl_gc_page_stack_t *)alloca(n_threads * sizeof(jl_gc_page_stack_t)); + memset(tmp, 0, n_threads * sizeof(jl_gc_page_stack_t)); + jl_atomic_store(&gc_allocd_scratch, tmp); + gc_sweep_wake_all(); + gc_sweep_pool_parallel(); + gc_sweep_wait_for_all(); + for (int t_i = 0; t_i < n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; if (ptls2 != NULL) { - jl_gc_pagemeta_t *allocd = NULL; - jl_gc_pagemeta_t *pg = ptls2->page_metadata_allocd; - while (pg != NULL) { - jl_gc_pagemeta_t *pg2 = pg->next; - gc_sweep_pool_page(pfl, &allocd, &ptls2->page_metadata_lazily_freed, pg, sweep_full); - pg = pg2; - } - ptls2->page_metadata_allocd = allocd; + ptls2->page_metadata_allocd = tmp[t_i]; for (int i = 0; i < JL_GC_N_POOLS; i++) { jl_gc_pool_t *p = &ptls2->heap.norm_pools[i]; p->newpages = NULL; @@ -1610,6 +1659,26 @@ static void gc_sweep_pool(int sweep_full) } } + // merge free lists + for (int t_i = 0; t_i < n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; + if (ptls2 == NULL) { + continue; + } + jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->page_metadata_allocd.bottom); + while (pg != NULL) { + jl_gc_pagemeta_t *pg2 = pg->next; + if (pg->fl_begin_offset != UINT16_MAX) { + char *cur_pg = pg->data; + jl_taggedvalue_t *fl_beg = (jl_taggedvalue_t*)(cur_pg + pg->fl_begin_offset); + jl_taggedvalue_t *fl_end = (jl_taggedvalue_t*)(cur_pg + pg->fl_end_offset); + *pfl[t_i * JL_GC_N_POOLS + pg->pool_n] = fl_beg; + pfl[t_i * JL_GC_N_POOLS + pg->pool_n] = &fl_end->next; + } + pg = pg2; + } + } + // null out terminal pointers of free lists for (int t_i = 0; t_i < n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; @@ -1625,9 +1694,13 @@ static void gc_sweep_pool(int sweep_full) if (jl_n_sweepthreads > 0) { uv_sem_post(&gc_sweep_assists_needed); } + else { + gc_free_pages(); + } +#else + gc_free_pages(); #endif - - gc_time_pool_end(sweep_full); + gc_time_pool_end(current_sweep_full); } static void gc_sweep_perm_alloc(void) @@ -3320,13 +3393,14 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) #ifdef USE_TRACY TracyCZoneColor(full_timing_block.tracy_ctx, 0xFFA500); #endif + current_sweep_full = sweep_full; sweep_weak_refs(); sweep_stack_pools(); gc_sweep_foreign_objs(); gc_sweep_other(ptls, sweep_full); gc_scrub(); gc_verify_tags(); - gc_sweep_pool(sweep_full); + gc_sweep_pool(); if (sweep_full) gc_sweep_perm_alloc(); } diff --git a/src/gc.h b/src/gc.h index ed733551b0e57f..40aabeb79a2d99 100644 --- a/src/gc.h +++ b/src/gc.h @@ -180,13 +180,9 @@ typedef struct _jl_gc_pagemeta_t { char *data; } jl_gc_pagemeta_t; -typedef struct { - _Atomic(jl_gc_pagemeta_t *) page_metadata_back; -} jl_gc_global_page_pool_t; - -extern jl_gc_global_page_pool_t global_page_pool_lazily_freed; -extern jl_gc_global_page_pool_t global_page_pool_clean; -extern jl_gc_global_page_pool_t global_page_pool_freed; +extern jl_gc_page_stack_t global_page_pool_lazily_freed; +extern jl_gc_page_stack_t global_page_pool_clean; +extern jl_gc_page_stack_t global_page_pool_freed; #define GC_BACKOFF_MIN 4 #define GC_BACKOFF_MAX 12 @@ -203,27 +199,33 @@ STATIC_INLINE void gc_backoff(int *i) JL_NOTSAFEPOINT // Lock-free stack implementation taken // from Herlihy's "The Art of Multiprocessor Programming" +// XXX: this is not a general-purpose lock-free stack. 
We can +// get away with just using a CAS and not implementing some ABA +// prevention mechanism since once a node is popped from the +// `jl_gc_global_page_pool_t`, it may only be pushed back to them +// in the sweeping phase, which also doesn't push a node into the +// same stack after it's popped -STATIC_INLINE void push_lf_page_metadata_back(jl_gc_global_page_pool_t *pool, jl_gc_pagemeta_t *elt) JL_NOTSAFEPOINT +STATIC_INLINE void push_lf_back(jl_gc_page_stack_t *pool, jl_gc_pagemeta_t *elt) JL_NOTSAFEPOINT { while (1) { - jl_gc_pagemeta_t *old_back = jl_atomic_load_relaxed(&pool->page_metadata_back); + jl_gc_pagemeta_t *old_back = jl_atomic_load_relaxed(&pool->bottom); elt->next = old_back; - if (jl_atomic_cmpswap(&pool->page_metadata_back, &old_back, elt)) { + if (jl_atomic_cmpswap(&pool->bottom, &old_back, elt)) { break; } jl_cpu_pause(); } } -STATIC_INLINE jl_gc_pagemeta_t *pop_lf_page_metadata_back(jl_gc_global_page_pool_t *pool) JL_NOTSAFEPOINT +STATIC_INLINE jl_gc_pagemeta_t *pop_lf_back(jl_gc_page_stack_t *pool) JL_NOTSAFEPOINT { while (1) { - jl_gc_pagemeta_t *old_back = jl_atomic_load_relaxed(&pool->page_metadata_back); + jl_gc_pagemeta_t *old_back = jl_atomic_load_relaxed(&pool->bottom); if (old_back == NULL) { return NULL; } - if (jl_atomic_cmpswap(&pool->page_metadata_back, &old_back, old_back->next)) { + if (jl_atomic_cmpswap(&pool->bottom, &old_back, old_back->next)) { return old_back; } jl_cpu_pause(); @@ -372,7 +374,7 @@ extern jl_gc_num_t gc_num; extern bigval_t *big_objects_marked; extern arraylist_t finalizer_list_marked; extern arraylist_t to_finalize; -extern int64_t lazy_freed_pages; +extern int64_t buffered_pages; extern int gc_first_tid; extern int gc_n_threads; extern jl_ptls_t* gc_all_tls_states; @@ -440,12 +442,15 @@ extern uv_mutex_t gc_threads_lock; extern uv_cond_t gc_threads_cond; extern uv_sem_t gc_sweep_assists_needed; extern _Atomic(int) gc_n_threads_marking; +extern _Atomic(int) gc_n_threads_sweeping; void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_markqueue_t *mq); void gc_mark_finlist_(jl_gc_markqueue_t *mq, jl_value_t **fl_begin, jl_value_t **fl_end) JL_NOTSAFEPOINT; void gc_mark_finlist(jl_gc_markqueue_t *mq, arraylist_t *list, size_t start) JL_NOTSAFEPOINT; void gc_mark_loop_serial_(jl_ptls_t ptls, jl_gc_markqueue_t *mq); void gc_mark_loop_serial(jl_ptls_t ptls); void gc_mark_loop_parallel(jl_ptls_t ptls, int master); +void gc_sweep_pool_parallel(void); +void gc_free_pages(void); void sweep_stack_pools(void); void jl_gc_debug_init(void); diff --git a/src/julia_threads.h b/src/julia_threads.h index 8acbf9b53d90ca..025c5707e5507b 100644 --- a/src/julia_threads.h +++ b/src/julia_threads.h @@ -200,6 +200,10 @@ typedef struct { struct _jl_bt_element_t; struct _jl_gc_pagemeta_t; +typedef struct { + _Atomic(struct _jl_gc_pagemeta_t *) bottom; +} jl_gc_page_stack_t; + // This includes all the thread local states we care about for a thread. // Changes to TLS field types must be reflected in codegen. #define JL_MAX_BT_SIZE 80000 @@ -261,11 +265,12 @@ typedef struct _jl_tls_states_t { #endif jl_thread_t system_id; arraylist_t finalizers; - struct _jl_gc_pagemeta_t *page_metadata_allocd; - struct _jl_gc_pagemeta_t *page_metadata_lazily_freed; + jl_gc_page_stack_t page_metadata_allocd; + jl_gc_page_stack_t page_metadata_buffered; jl_gc_markqueue_t mark_queue; jl_gc_mark_cache_t gc_cache; arraylist_t sweep_objs; + _Atomic(int64_t) gc_sweeps_requested; // Saved exception for previous *external* API call or NULL if cleared. 
// Access via jl_exception_occurred(). struct _jl_value_t *previous_exception; diff --git a/src/partr.c b/src/partr.c index 0f3b581f5122f6..23a252b537f992 100644 --- a/src/partr.c +++ b/src/partr.c @@ -108,14 +108,18 @@ void jl_init_threadinginfra(void) void JL_NORETURN jl_finish_task(jl_task_t *t); - static inline int may_mark(void) JL_NOTSAFEPOINT { return (jl_atomic_load(&gc_n_threads_marking) > 0); } -// gc thread mark function -void jl_gc_mark_threadfun(void *arg) +static inline int may_sweep(jl_ptls_t ptls) JL_NOTSAFEPOINT +{ + return (jl_atomic_load(&ptls->gc_sweeps_requested) > 0); +} + +// parallel gc thread function +void jl_parallel_gc_threadfun(void *arg) { jl_threadarg_t *targ = (jl_threadarg_t*)arg; @@ -131,16 +135,22 @@ void jl_gc_mark_threadfun(void *arg) while (1) { uv_mutex_lock(&gc_threads_lock); - while (!may_mark()) { + while (!may_mark() && !may_sweep(ptls)) { uv_cond_wait(&gc_threads_cond, &gc_threads_lock); } uv_mutex_unlock(&gc_threads_lock); - gc_mark_loop_parallel(ptls, 0); + if (may_mark()) { + gc_mark_loop_parallel(ptls, 0); + } + if (may_sweep(ptls)) { // not an else! + gc_sweep_pool_parallel(); + jl_atomic_fetch_add(&ptls->gc_sweeps_requested, -1); + } } } -// gc thread sweep function -void jl_gc_sweep_threadfun(void *arg) +// concurrent gc thread function +void jl_concurrent_gc_threadfun(void *arg) { jl_threadarg_t *targ = (jl_threadarg_t*)arg; @@ -156,14 +166,7 @@ void jl_gc_sweep_threadfun(void *arg) while (1) { uv_sem_wait(&gc_sweep_assists_needed); - while (1) { - jl_gc_pagemeta_t *pg = pop_lf_page_metadata_back(&global_page_pool_lazily_freed); - if (pg == NULL) { - break; - } - jl_gc_free_page(pg); - push_lf_page_metadata_back(&global_page_pool_freed, pg); - } + gc_free_pages(); } } diff --git a/src/threading.c b/src/threading.c index acdd5770098f11..2ec9c220fbafb3 100644 --- a/src/threading.c +++ b/src/threading.c @@ -754,10 +754,10 @@ void jl_start_threads(void) } } else if (i == nthreads - 1 && jl_n_sweepthreads == 1) { - uv_thread_create(&uvtid, jl_gc_sweep_threadfun, t); + uv_thread_create(&uvtid, jl_concurrent_gc_threadfun, t); } else { - uv_thread_create(&uvtid, jl_gc_mark_threadfun, t); + uv_thread_create(&uvtid, jl_parallel_gc_threadfun, t); } uv_thread_detach(&uvtid); } diff --git a/src/threading.h b/src/threading.h index 73d2cd73fb70d2..260ecffa30dd55 100644 --- a/src/threading.h +++ b/src/threading.h @@ -25,8 +25,8 @@ jl_ptls_t jl_init_threadtls(int16_t tid) JL_NOTSAFEPOINT; // provided by a threading infrastructure void jl_init_threadinginfra(void); -void jl_gc_mark_threadfun(void *arg); -void jl_gc_sweep_threadfun(void *arg); +void jl_parallel_gc_threadfun(void *arg); +void jl_concurrent_gc_threadfun(void *arg); void jl_threadfun(void *arg); #ifdef __cplusplus
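
Illustrative sketch (not part of the patch): the coordination primitive this change leans on is the lock-free page stack (`jl_gc_page_stack_t` with `push_lf_back`/`pop_lf_back` in src/gc.h), which GC threads pop from concurrently inside `gc_sweep_pool_parallel` so that each page's free list is built in parallel, exactly as the commit message describes. The standalone C program below shows the same Treiber-stack claim pattern with C11 atomics and pthreads; every name in it (page_stack_t, node_t, push_back, pop_back, sweeper, page_id) is hypothetical and chosen only for illustration, and the "sweep" step is reduced to a counter increment rather than the real free-list construction.

    /* Minimal sketch of the claim pattern used by the parallel sweep.
     * Build with: cc -std=c11 -pthread sketch.c
     * Hypothetical names throughout; cf. push_lf_back/pop_lf_back and
     * gc_sweep_pool_parallel in the patch. */
    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <pthread.h>

    typedef struct node {
        struct node *next;
        int page_id;              /* stand-in for jl_gc_pagemeta_t payload */
    } node_t;

    typedef struct {
        _Atomic(node_t *) bottom; /* mirrors jl_gc_page_stack_t's single field */
    } page_stack_t;

    static void push_back(page_stack_t *s, node_t *elt)
    {
        node_t *old = atomic_load_explicit(&s->bottom, memory_order_relaxed);
        do {
            elt->next = old; /* on CAS failure, `old` is refreshed and we retry */
        } while (!atomic_compare_exchange_weak(&s->bottom, &old, elt));
    }

    static node_t *pop_back(page_stack_t *s)
    {
        node_t *old = atomic_load_explicit(&s->bottom, memory_order_relaxed);
        while (old != NULL) {
            if (atomic_compare_exchange_weak(&s->bottom, &old, old->next))
                return old;
        }
        return NULL; /* stack drained */
    }

    static page_stack_t work = { NULL };
    static _Atomic int swept_count;

    /* Each thread repeatedly claims a node and "sweeps" it, mirroring the
     * loop in gc_sweep_pool_parallel that pops from the per-thread
     * page_metadata_allocd stacks until no page is found. */
    static void *sweeper(void *arg)
    {
        (void)arg;
        node_t *pg;
        while ((pg = pop_back(&work)) != NULL) {
            atomic_fetch_add(&swept_count, 1); /* real code builds the page's free list here */
            free(pg);
        }
        return NULL;
    }

    int main(void)
    {
        for (int i = 0; i < 1000; i++) {
            node_t *n = malloc(sizeof *n);
            n->page_id = i;
            push_back(&work, n);
        }
        pthread_t t[4];
        for (int i = 0; i < 4; i++)
            pthread_create(&t[i], NULL, sweeper, NULL);
        for (int i = 0; i < 4; i++)
            pthread_join(t[i], NULL);
        printf("swept %d pages\n", atomic_load(&swept_count));
        return 0;
    }

As in the patch's XXX comment, a bare CAS is enough here because pushes and pops never interleave in a way that re-inserts a node into the same stack while another thread still holds a stale pointer to it; this toy only pops concurrently, so no ABA-prevention mechanism (tags, hazard pointers) is needed either.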