Ports upstream #12212 (#2118)

ocaml-flambda · Dec 4, 2023 · 395cc30 · 395cc30
1 parent f513da6
commit 395cc30
Show file tree

Hide file tree

Showing 10 changed files with 109 additions and 13 deletions.
diff --git a/ocaml/runtime/alloc.c b/ocaml/runtime/alloc.c
@@ -396,3 +396,13 @@ CAMLexport value caml_alloc_some(value v)
   Field(some, 0) = v;
   CAMLreturn(some);
 }
+
+CAMLprim value caml_atomic_make_contended(value v)
+{
+  CAMLparam1(v);
+  const mlsize_t sz = Wosize_bhsize(Cache_line_bsize);
+  value res = caml_alloc_shr(sz, 0);
+  caml_initialize(&Field(res, 0), v);
+  for (mlsize_t i = 1; i < sz; i++) Field(res, i) = Val_unit;
+  CAMLreturn(res);
+}
diff --git a/ocaml/runtime/caml/config.h b/ocaml/runtime/caml/config.h
@@ -272,4 +272,17 @@ typedef uint64_t uintnat;
 /* Default size of runtime_events ringbuffers, in words, in powers of two */
 #define Default_runtime_events_log_wsize 16
 
+/* Assumed size of cache line. This value can be bigger than the actual L1
+   cache line size. Atomics allocated with aligned constructor are
+   memory-aligned this value to avoid false sharing of cache line. */
+#if defined(TARGET_s390x)
+   #define Cache_line_bsize 256
+#elif defined(TARGET_arm64) || defined(TARGET_power)
+   #define Cache_line_bsize 128
+#elif defined(TARGET_amd64) || defined(TARGET_riscv)
+   #define Cache_line_bsize 64
+#elif (!defined(NATIVE_CODE))
+   #define Cache_line_bsize 64
+#endif
+
 #endif /* CAML_CONFIG_H */
diff --git a/ocaml/runtime/caml/sizeclasses.h b/ocaml/runtime/caml/sizeclasses.h
@@ -4,14 +4,14 @@
 #define SIZECLASS_MAX 128
 #define NUM_SIZECLASSES 32
 static const unsigned int wsize_sizeclass[NUM_SIZECLASSES] =
-{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 23, 26, 29, 33, 37, 42,
+{ 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 17, 19, 22, 25, 28, 32, 33, 37, 42,
   47, 53, 59, 65, 73, 81, 89, 99, 108, 118, 128 };
 static const unsigned char wastage_sizeclass[NUM_SIZECLASSES] =
-{ 0, 0, 0, 0, 2, 0, 4, 4, 6, 2, 0, 4, 12, 6, 12, 21, 10, 3, 0, 22, 18, 3, 11,
+{ 0, 0, 0, 0, 2, 0, 4, 4, 2, 0, 4, 12, 12, 7, 0, 17, 4, 28, 0, 22, 18, 3, 11,
   21, 62, 4, 42, 87, 33, 96, 80, 124 };
 static const unsigned char sizeclass_wsize[SIZECLASS_MAX + 1] =
-{ 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14,
-  15, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, 20, 20,
+{ 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 13, 13, 14,
+  14, 14, 15, 15, 15, 16, 16, 16, 17, 17, 17, 17, 18, 19, 19, 19, 19, 20, 20,
   20, 20, 20, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23,
   23, 24, 24, 24, 24, 24, 24, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26,
   26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, 28, 28,

diff --git a/ocaml/runtime/shared_heap.c b/ocaml/runtime/shared_heap.c
@@ -53,8 +53,6 @@ typedef struct pool {
   sizeclass sz;
 } pool;
 CAML_STATIC_ASSERT(sizeof(pool) == Bsize_wsize(POOL_HEADER_WSIZE));
-#define POOL_HEADER_SZ sizeof(pool)
-
 #define POOL_SLAB_WOFFSET(sz) (POOL_HEADER_WSIZE + wastage_sizeclass[sz])
 #define POOL_FIRST_BLOCK(p, sz) ((header_t*)(p) + POOL_SLAB_WOFFSET(sz))
 #define POOL_END(p) ((header_t*)(p) + POOL_WSIZE)
@@ -238,7 +236,7 @@ static void calc_pool_stats(pool* a, sizeclass sz, struct heap_stats* s)
   header_t* p = POOL_FIRST_BLOCK(a, sz);
   header_t* end = POOL_END(a);
   mlsize_t wh = wsize_sizeclass[sz];
-  s->pool_frag_words += Wsize_bsize(POOL_HEADER_SZ);
+  s->pool_frag_words += POOL_SLAB_WOFFSET(sz);
 
   while (p + wh <= end) {
     header_t hd = (header_t)atomic_load_relaxed((atomic_uintnat*)p);
@@ -250,8 +248,7 @@ static void calc_pool_stats(pool* a, sizeclass sz, struct heap_stats* s)
 
     p += wh;
   }
-  CAMLassert(end - p == wastage_sizeclass[sz]);
-  s->pool_frag_words += end - p;
+  CAMLassert(end == p);
   s->pool_words += POOL_WSIZE;
 }
 
@@ -283,6 +280,8 @@ Caml_inline void pool_initialize(pool* r,
     #endif
     p += wh;
   }
+  CAMLassert(p == end);
+  CAMLassert((uintptr_t)end % Cache_line_bsize == 0);
   r->next_obj = (value*)(p - wh);
 }
 
@@ -1311,7 +1310,7 @@ static void verify_pool(pool* a, sizeclass sz, struct mem_stats* s) {
     header_t* p = POOL_FIRST_BLOCK(a, sz);
     header_t* end = POOL_END(a);
     mlsize_t wh = wsize_sizeclass[sz];
-    s->overhead += Wsize_bsize(POOL_HEADER_SZ);
+    s->overhead += POOL_SLAB_WOFFSET(sz);
 
     while (p + wh <= end) {
       header_t hd = (header_t)*p;
@@ -1325,8 +1324,7 @@ static void verify_pool(pool* a, sizeclass sz, struct mem_stats* s) {
       }
       p += wh;
     }
-    CAMLassert(end - p == wastage_sizeclass[sz]);
-    s->overhead += end - p;
+    CAMLassert(end == p);
     s->alloced += POOL_WSIZE;
   }
 }

diff --git a/ocaml/runtime4/alloc.c b/ocaml/runtime4/alloc.c
@@ -289,3 +289,8 @@ CAMLexport value caml_alloc_some(value v)
   Field(some, 0) = v;
   CAMLreturn(some);
 }
+
+CAMLprim value caml_atomic_make_contended(value v)
+{
+  caml_fatal_error("Atomic.make_contended is not supported by runtime4");
+}
diff --git a/ocaml/stdlib/atomic.ml b/ocaml/stdlib/atomic.ml
@@ -15,6 +15,7 @@
 type !'a t
 
 external make : 'a -> 'a t = "%makemutable"
+external make_contended : 'a -> 'a t = "caml_atomic_make_contended"
 external get : 'a t -> 'a = "%atomic_load"
 external exchange : 'a t -> 'a -> 'a = "%atomic_exchange"
 external compare_and_set : 'a t -> 'a -> 'a -> bool = "%atomic_cas"

diff --git a/ocaml/stdlib/atomic.mli b/ocaml/stdlib/atomic.mli
@@ -29,6 +29,20 @@ type !'a t
 (** Create an atomic reference. *)
 val make : 'a -> 'a t
 
+(** Create an atomic reference that is alone on a cache line. It occupies 4-16x
+    the memory of one allocated with [make v].
+    The primary purpose is to prevent false-sharing and the resulting
+    performance degradation. When a CPU performs an atomic operation, it
+    temporarily takes ownership of an entire cache line that contains the
+    atomic reference. If multiple atomic references share the same cache line,
+    modifying these disjoint memory regions simultaneously becomes impossible,
+    which can create a bottleneck. Hence, as a general guideline, if an atomic
+    reference is experiencing contention, assigning it its own cache line may
+    enhance performance.
+
+    CR ocaml 5 all-runtime5: does not support runtime4 *)
+val make_contended : 'a -> 'a t
+
 (** Get the current value of the atomic reference. *)
 val get : 'a t -> 'a
 

diff --git a/ocaml/testsuite/tests/c-api/aligned_alloc.ml b/ocaml/testsuite/tests/c-api/aligned_alloc.ml
@@ -0,0 +1,28 @@
+(* TEST
+  modules = "aligned_alloc_stubs.c"
+  * runtime4
+  ** skip
+  * runtime5
+  ** native
+*)
+
+external is_aligned : 'a Atomic.t -> bool = "caml_atomic_is_aligned"
+let test_is_aligned () =
+  let l = List.init 100 Atomic.make in
+  let all_aligned =
+    List.for_all is_aligned l
+  in
+  assert (not all_aligned)
+;;
+
+let test_make_contended () =
+  let l = List.init 100 Atomic.make_contended in
+  List.iteri (fun i atomic ->
+    assert (Atomic.get atomic == i);
+    assert (is_aligned atomic)) l
+;;
+
+let () =
+  test_is_aligned ();
+  test_make_contended ();
+;;
diff --git a/ocaml/testsuite/tests/c-api/aligned_alloc_stubs.c b/ocaml/testsuite/tests/c-api/aligned_alloc_stubs.c
@@ -0,0 +1,12 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "caml/alloc.h"
+
+CAMLprim value caml_atomic_is_aligned(value val)
+{
+  if ((uintptr_t)Hp_val(val) % Cache_line_bsize == 0) {
+    return Val_true;
+  } else {
+    return Val_false;
+  }
+}
diff --git a/ocaml/tools/gen_sizeclasses.ml b/ocaml/tools/gen_sizeclasses.ml
@@ -15,12 +15,27 @@
 let overhead block slot obj =
   1. -. float_of_int((block / slot) * obj) /. float_of_int block
 
-let max_overhead = 0.10
+let max_overhead = 0.101
+
+(*
+  Prevention of false sharing requires certain sizeclasses to be present. This
+  ensures they are generated.
+  Runtime has a constructor for atomics (`caml_atomic_make_contended`), which
+  aligns them with cache lines to avoid false sharing. The implementation
+  relies on the fact that pools are cache-aligned by design and slots of
+  appropriate size maintain this property. To be precise, slots whose size is a
+  multiple of cache line are laid out in such a way, that their boundaries
+  coincide with boundaries between cache lines.
+*)
+let required_for_contended_atomic = function
+  | 16 | 32 -> true
+  | _ -> false
 
 let rec blocksizes block slot = function
   | 0 -> []
   | obj ->
     if overhead block slot obj > max_overhead
+      || required_for_contended_atomic obj
     then
       if overhead block obj obj < max_overhead then
         obj :: blocksizes block obj (obj - 1)