From f290b154143125dfe14b56de7140775cf24c4905 Mon Sep 17 00:00:00 2001
From: d-netto
Date: Wed, 17 Jan 2024 12:28:51 -0300
Subject: [PATCH] reduce contention on page metadata lists during the sweeping
 phase

---
 src/gc.c            | 134 ++++++++++++++++++++++++++++++++++++++++----
 src/gc.h            |  37 +++++++++++-
 src/julia_threads.h |   6 ++
 src/scheduler.c     |   2 +-
 4 files changed, 165 insertions(+), 14 deletions(-)

diff --git a/src/gc.c b/src/gc.c
index ca53132fcbc160..972e5b76812f91 100644
--- a/src/gc.c
+++ b/src/gc.c
@@ -1541,7 +1541,8 @@ static void gc_sweep_page(gc_page_profiler_serializer_t *s, jl_gc_pool_t *p, jl_
 done:
     if (re_use_page) {
-        push_lf_back(allocd, pg);
+        // we're pushing into a local page stack to reduce contention
+        push_lf_back_nosync(allocd, pg);
     }
     else {
         gc_alloc_map_set(pg->data, GC_PAGE_LAZILY_FREED);
@@ -1596,8 +1597,71 @@ static void gc_pool_sync_nfree(jl_gc_pagemeta_t *pg, jl_taggedvalue_t *last) JL_
     pg->nfree = nfree;
 }
 
-void gc_sweep_wake_all(void)
+int gc_sweep_prescan(jl_ptls_t ptls)
 {
+    // parallel sweeping becomes worthwhile once we have at least 4MB worth of pages
+    const int n_pages_worth_parallel_sweep = (int)(4 * (1 << 20) / GC_PAGE_SZ);
+    int n_pages_to_scan = 0;
+    gc_page_profiler_serializer_t serializer = gc_page_serializer_create();
+    // push into a local page stack; we'll merge the stacks later
+    jl_gc_page_stack_t *allocd_scratch = jl_atomic_load(&gc_allocd_scratch);
+    assert(allocd_scratch != NULL);
+    jl_gc_page_stack_t *dest = &allocd_scratch[ptls->tid];
+    for (int t_i = 0; t_i < gc_n_threads; t_i++) {
+        jl_ptls_t ptls2 = gc_all_tls_states[t_i];
+        if (ptls2 == NULL) {
+            continue;
+        }
+        jl_gc_page_stack_t tmp;
+        jl_gc_pagemeta_t *tail = NULL;
+        memset(&tmp, 0, sizeof(tmp));
+        while (1) {
+            jl_gc_pagemeta_t *pg = pop_lf_back_nosync(&ptls2->page_metadata_allocd);
+            if (pg == NULL) {
+                break;
+            }
+            int should_scan = 1;
+            if (!pg->has_marked) {
+                should_scan = 0;
+            }
+            if (!current_sweep_full && !pg->has_young) {
+                assert(!prev_sweep_full || pg->prev_nold >= pg->nold);
+                if (!prev_sweep_full || pg->prev_nold == pg->nold) {
+                    should_scan = 0;
+                }
+            }
+            if (should_scan) {
+                if (tail == NULL) {
+                    tail = pg;
+                }
+                n_pages_to_scan++;
+                push_lf_back_nosync(&tmp, pg);
+            }
+            else {
+                gc_sweep_pool_page(&serializer, dest, &ptls2->page_metadata_buffered, pg);
+            }
+            if (n_pages_to_scan >= n_pages_worth_parallel_sweep) {
+                break;
+            }
+        }
+        if (tail != NULL) {
+            tail->next = ptls2->page_metadata_allocd.bottom;
+        }
+        ptls2->page_metadata_allocd = tmp;
+        if (n_pages_to_scan >= n_pages_worth_parallel_sweep) {
+            break;
+        }
+    }
+    gc_page_serializer_destroy(&serializer);
+    return n_pages_to_scan >= n_pages_worth_parallel_sweep;
+}
+
+void gc_sweep_wake_all(jl_ptls_t ptls)
+{
+    int parallel_sweep_worthwhile = gc_sweep_prescan(ptls);
+    if (!parallel_sweep_worthwhile) {
+        return;
+    }
     uv_mutex_lock(&gc_threads_lock);
     for (int i = gc_first_tid; i < gc_first_tid + jl_n_markthreads; i++) {
         jl_ptls_t ptls2 = gc_all_tls_states[i];
@@ -1615,30 +1679,52 @@ void gc_sweep_wait_for_all(void)
     }
 }
 
-void gc_sweep_pool_parallel(void)
+void gc_sweep_pool_parallel(jl_ptls_t ptls)
 {
     jl_atomic_fetch_add(&gc_n_threads_sweeping, 1);
     jl_gc_page_stack_t *allocd_scratch = jl_atomic_load(&gc_allocd_scratch);
     if (allocd_scratch != NULL) {
+        // push into a local page stack to reduce contention;
+        // we'll merge the stacks later
+        jl_gc_page_stack_t *dest = &allocd_scratch[ptls->tid];
         gc_page_profiler_serializer_t serializer = gc_page_serializer_create();
         while (1) {
             int found_pg = 0;
+            // sequentially walk the threads and sweep the pages
             for (int t_i = 0; t_i < gc_n_threads; t_i++) {
                 jl_ptls_t ptls2 = gc_all_tls_states[t_i];
+                // skip foreign threads that already exited
                 if (ptls2 == NULL) {
                     continue;
                 }
-                jl_gc_page_stack_t *allocd = &allocd_scratch[t_i];
-                jl_gc_pagemeta_t *pg = pop_lf_back(&ptls2->page_metadata_allocd);
+                jl_gc_pagemeta_t *pg = try_pop_lf_back(&ptls2->page_metadata_allocd);
+                // failed steal attempt
                 if (pg == NULL) {
                     continue;
                 }
-                gc_sweep_pool_page(&serializer, allocd, &ptls2->page_metadata_buffered, pg);
+                gc_sweep_pool_page(&serializer, dest, &ptls2->page_metadata_buffered, pg);
                 found_pg = 1;
             }
             if (!found_pg) {
-                break;
+                // check for termination
+                int no_more_work = 1;
+                for (int t_i = 0; t_i < gc_n_threads; t_i++) {
+                    jl_ptls_t ptls2 = gc_all_tls_states[t_i];
+                    // skip foreign threads that already exited
+                    if (ptls2 == NULL) {
+                        continue;
+                    }
+                    jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->page_metadata_allocd.bottom);
+                    if (pg != NULL) {
+                        no_more_work = 0;
+                        break;
+                    }
+                }
+                if (no_more_work) {
+                    break;
+                }
             }
+            jl_cpu_pause();
         }
         gc_page_serializer_destroy(&serializer);
     }
@@ -1669,7 +1755,7 @@ static void gc_sweep_pool(void)
 
     // allocate enough space to hold the end of the free list chain
     // for every thread and pool size
-    jl_taggedvalue_t ***pfl = (jl_taggedvalue_t ***) alloca(n_threads * JL_GC_N_POOLS * sizeof(jl_taggedvalue_t**));
+    jl_taggedvalue_t ***pfl = (jl_taggedvalue_t ***) malloc_s(n_threads * JL_GC_N_POOLS * sizeof(jl_taggedvalue_t**));
 
     // update metadata of pages that were pointed to by freelist or newpages from a pool
     // i.e. pages being the current allocation target
@@ -1711,17 +1797,37 @@ static void gc_sweep_pool(void)
     }
 
     // the actual sweeping
-    jl_gc_page_stack_t *tmp = (jl_gc_page_stack_t *)alloca(n_threads * sizeof(jl_gc_page_stack_t));
+    jl_gc_page_stack_t *tmp = (jl_gc_page_stack_t *)jl_malloc_aligned(n_threads * sizeof(jl_gc_page_stack_t), 128);
+    if (tmp == NULL) {
+        abort();
+    }
     memset(tmp, 0, n_threads * sizeof(jl_gc_page_stack_t));
     jl_atomic_store(&gc_allocd_scratch, tmp);
-    gc_sweep_wake_all();
-    gc_sweep_pool_parallel();
+    jl_ptls_t ptls = jl_current_task->ptls;
+    gc_sweep_wake_all(ptls);
+    gc_sweep_pool_parallel(ptls);
     gc_sweep_wait_for_all();
 
+    // merge the page metadata lists
+    for (int t_i = 0; t_i < n_threads; t_i++) {
+        jl_ptls_t ptls2 = gc_all_tls_states[t_i];
+        if (ptls2 == NULL) {
+            continue;
+        }
+        while (1) {
+            jl_gc_pagemeta_t *pg = pop_lf_back_nosync(&tmp[t_i]);
+            if (pg == NULL) {
+                break;
+            }
+            jl_ptls_t ptls3 = gc_all_tls_states[pg->thread_n];
+            push_lf_back_nosync(&ptls3->page_metadata_allocd, pg);
+        }
+    }
+
+    // reset half-pages pointers
     for (int t_i = 0; t_i < n_threads; t_i++) {
         jl_ptls_t ptls2 = gc_all_tls_states[t_i];
         if (ptls2 != NULL) {
-            ptls2->page_metadata_allocd = tmp[t_i];
             for (int i = 0; i < JL_GC_N_POOLS; i++) {
                 jl_gc_pool_t *p = &ptls2->heap.norm_pools[i];
                 p->newpages = NULL;
@@ -1759,6 +1865,10 @@ static void gc_sweep_pool(void)
         }
     }
 
+    // cleanup
+    free(pfl);
+    free(tmp);
+
 #ifdef _P64 // only enable concurrent sweeping on 64bit
     // wake thread up to sweep concurrently
     if (jl_n_sweepthreads > 0) {
diff --git a/src/gc.h b/src/gc.h
index 9de67f6e7c679f..235c90c0c6dc87 100644
--- a/src/gc.h
+++ b/src/gc.h
@@ -199,6 +199,24 @@ extern jl_gc_page_stack_t global_page_pool_freed;
 // in the sweeping phase, which also doesn't push a node into the
 // same stack after it's popped
 
+STATIC_INLINE void push_lf_back_nosync(jl_gc_page_stack_t *pool, jl_gc_pagemeta_t *elt) JL_NOTSAFEPOINT
+{
+    jl_gc_pagemeta_t *old_back = jl_atomic_load_relaxed(&pool->bottom);
+    elt->next = old_back;
+    jl_atomic_store_relaxed(&pool->bottom, elt);
+}
+
+STATIC_INLINE jl_gc_pagemeta_t *pop_lf_back_nosync(jl_gc_page_stack_t *pool) JL_NOTSAFEPOINT
+{
+    jl_gc_pagemeta_t *old_back = jl_atomic_load_relaxed(&pool->bottom);
+    if (old_back == NULL) {
+        return NULL;
+    }
+    jl_gc_pagemeta_t *new_back = old_back->next;
+    jl_atomic_store_relaxed(&pool->bottom, new_back);
+    return old_back;
+}
+
 STATIC_INLINE void push_lf_back(jl_gc_page_stack_t *pool, jl_gc_pagemeta_t *elt) JL_NOTSAFEPOINT
 {
     while (1) {
@@ -211,6 +229,23 @@ STATIC_INLINE void push_lf_back(jl_gc_page_stack_t *pool, jl_gc_pagemeta_t *elt)
     }
 }
 
+#define MAX_POP_ATTEMPTS (1 << 10)
+
+STATIC_INLINE jl_gc_pagemeta_t *try_pop_lf_back(jl_gc_page_stack_t *pool) JL_NOTSAFEPOINT
+{
+    for (int i = 0; i < MAX_POP_ATTEMPTS; i++) {
+        jl_gc_pagemeta_t *old_back = jl_atomic_load_relaxed(&pool->bottom);
+        if (old_back == NULL) {
+            return NULL;
+        }
+        if (jl_atomic_cmpswap(&pool->bottom, &old_back, old_back->next)) {
+            return old_back;
+        }
+        jl_cpu_pause();
+    }
+    return NULL;
+}
+
 STATIC_INLINE jl_gc_pagemeta_t *pop_lf_back(jl_gc_page_stack_t *pool) JL_NOTSAFEPOINT
 {
     while (1) {
@@ -473,7 +508,7 @@ void gc_mark_finlist(jl_gc_markqueue_t *mq, arraylist_t *list, size_t start) JL_
 void gc_mark_loop_serial_(jl_ptls_t ptls, jl_gc_markqueue_t *mq);
 void gc_mark_loop_serial(jl_ptls_t ptls);
 void gc_mark_loop_parallel(jl_ptls_t ptls, int master);
-void gc_sweep_pool_parallel(void);
+void gc_sweep_pool_parallel(jl_ptls_t ptls);
 void gc_free_pages(void);
 void sweep_stack_pools(void);
 void jl_gc_debug_init(void);
diff --git a/src/julia_threads.h b/src/julia_threads.h
index 292c11f61d60d0..fcc4457e5f83c4 100644
--- a/src/julia_threads.h
+++ b/src/julia_threads.h
@@ -197,6 +197,12 @@ struct _jl_gc_pagemeta_t;
 
 typedef struct {
     _Atomic(struct _jl_gc_pagemeta_t *) bottom;
+    // pad to 128 bytes to avoid false sharing
+#ifdef _P64
+    void *_pad[15];
+#else
+    void *_pad[31];
+#endif
 } jl_gc_page_stack_t;
 
 // This includes all the thread local states we care about for a thread.
diff --git a/src/scheduler.c b/src/scheduler.c
index 50e15b286a8eba..2af88db89ca14e 100644
--- a/src/scheduler.c
+++ b/src/scheduler.c
@@ -147,7 +147,7 @@ void jl_parallel_gc_threadfun(void *arg)
             gc_mark_loop_parallel(ptls, 0);
         }
         if (may_sweep(ptls)) { // not an else!
-            gc_sweep_pool_parallel();
+            gc_sweep_pool_parallel(ptls);
             jl_atomic_fetch_add(&ptls->gc_sweeps_requested, -1);
         }
     }
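
A standalone sketch of the pattern, not part of the patch: it combines three ingredients — unsynchronized push/pop on per-thread scratch stacks (the *_nosync helpers), a bounded-retry CAS pop for stealing from other threads' stacks (try_pop_lf_back), and padding each stack head to avoid false sharing. The code below illustrates the same ingredients with portable C11 atomics instead of Julia's internal jl_atomic_* wrappers; node_t, padded_stack_t, push_owned, push_shared, and try_pop_shared are illustrative names and assumptions, not Julia APIs.

#include <stdatomic.h>
#include <stddef.h>

typedef struct node {
    struct node *next;
} node_t;

typedef struct {
    _Atomic(node_t *) bottom;
    // pad the head to 128 bytes so an array of per-thread stacks
    // doesn't put two heads on the same cache line
    // (cf. the jl_gc_page_stack_t padding above)
    char _pad[128 - sizeof(_Atomic(node_t *))];
} padded_stack_t;

// single-owner push: plain relaxed load/store, valid only while no
// other thread touches this stack (the role of push_lf_back_nosync)
static void push_owned(padded_stack_t *s, node_t *elt)
{
    elt->next = atomic_load_explicit(&s->bottom, memory_order_relaxed);
    atomic_store_explicit(&s->bottom, elt, memory_order_relaxed);
}

// shared push: CAS loop, safe under concurrent pushers and poppers
static void push_shared(padded_stack_t *s, node_t *elt)
{
    node_t *old = atomic_load_explicit(&s->bottom, memory_order_relaxed);
    do {
        // on CAS failure, old is refreshed and we re-link and retry
        elt->next = old;
    } while (!atomic_compare_exchange_weak(&s->bottom, &old, elt));
}

// bounded steal: give up after max_attempts failed CASes instead of
// spinning forever, mirroring try_pop_lf_back/MAX_POP_ATTEMPTS, so a
// sweeper thread moves on to another thread's stack under contention.
// NB: a general-purpose lock-free stack would need ABA protection here;
// the GC gets away without it because, as the gc.h comment above notes,
// a page popped during sweeping is never pushed back onto the same
// stack within that phase.
static node_t *try_pop_shared(padded_stack_t *s, int max_attempts)
{
    for (int i = 0; i < max_attempts; i++) {
        node_t *old = atomic_load_explicit(&s->bottom, memory_order_relaxed);
        if (old == NULL) {
            return NULL;
        }
        if (atomic_compare_exchange_weak(&s->bottom, &old, old->next)) {
            return old;
        }
    }
    return NULL;
}

In the patch's terms: each sweeper fills its own padded slot of the gc_allocd_scratch array with the owned (nosync) push, so the hot path never CASes on a shared stack head; only the steal path and the final merge in gc_sweep_pool touch stacks across threads.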