flambda-backend: Ports upstream #12212 (ocaml-flambda#2118)
riaqn authored Dec 4, 2023
1 parent 5aeb364 commit 9248fea
Showing 10 changed files with 109 additions and 13 deletions.
10 changes: 10 additions & 0 deletions runtime/alloc.c
@@ -396,3 +396,13 @@ CAMLexport value caml_alloc_some(value v)
  Field(some, 0) = v;
  CAMLreturn(some);
}

CAMLprim value caml_atomic_make_contended(value v)
{
  CAMLparam1(v);
  const mlsize_t sz = Wosize_bhsize(Cache_line_bsize);
  value res = caml_alloc_shr(sz, 0);
  caml_initialize(&Field(res, 0), v);
  for (mlsize_t i = 1; i < sz; i++) Field(res, i) = Val_unit;
  CAMLreturn(res);
}
13 changes: 13 additions & 0 deletions runtime/caml/config.h
@@ -272,4 +272,17 @@ typedef uint64_t uintnat;
/* Default size of runtime_events ringbuffers, in words, in powers of two */
#define Default_runtime_events_log_wsize 16

/* Assumed cache line size, in bytes. This value may be larger than the actual
   L1 cache line size. Atomics allocated with the contended constructor are
   aligned to this value to avoid false sharing of cache lines. */
#if defined(TARGET_s390x)
#define Cache_line_bsize 256
#elif defined(TARGET_arm64) || defined(TARGET_power)
#define Cache_line_bsize 128
#elif defined(TARGET_amd64) || defined(TARGET_riscv)
#define Cache_line_bsize 64
#elif (!defined(NATIVE_CODE))
#define Cache_line_bsize 64
#endif

#endif /* CAML_CONFIG_H */
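As a rough illustration (not part of this change, and assuming 8-byte words), a contended atomic is allocated with Wosize_bhsize(Cache_line_bsize) fields, so the header plus payload fill exactly one assumed cache line, while a plain Atomic.make cell is 2 words; that is where the 4-16x size difference quoted in atomic.mli below comes from. The helper name here is invented for the sketch.

(* Sketch only: payload size, in fields, of a contended atomic on a 64-bit
   target, mirroring Wosize_bhsize(Cache_line_bsize) above. *)
let fields_per_contended_atomic cache_line_bsize =
  cache_line_bsize / 8 - 1  (* words per cache line, minus the header word *)

let () =
  assert (fields_per_contended_atomic 64 = 7);   (* amd64/riscv: 8-word block, 4x a plain atomic *)
  assert (fields_per_contended_atomic 128 = 15); (* arm64/power: 16-word block, 8x *)
  assert (fields_per_contended_atomic 256 = 31)  (* s390x: 32-word block, 16x *)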
8 changes: 4 additions & 4 deletions runtime/caml/sizeclasses.h
@@ -4,14 +4,14 @@
#define SIZECLASS_MAX 128
#define NUM_SIZECLASSES 32
static const unsigned int wsize_sizeclass[NUM_SIZECLASSES] =
{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 23, 26, 29, 33, 37, 42,
{ 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 17, 19, 22, 25, 28, 32, 33, 37, 42,
47, 53, 59, 65, 73, 81, 89, 99, 108, 118, 128 };
static const unsigned char wastage_sizeclass[NUM_SIZECLASSES] =
{ 0, 0, 0, 0, 2, 0, 4, 4, 6, 2, 0, 4, 12, 6, 12, 21, 10, 3, 0, 22, 18, 3, 11,
{ 0, 0, 0, 0, 2, 0, 4, 4, 2, 0, 4, 12, 12, 7, 0, 17, 4, 28, 0, 22, 18, 3, 11,
21, 62, 4, 42, 87, 33, 96, 80, 124 };
static const unsigned char sizeclass_wsize[SIZECLASS_MAX + 1] =
{ 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14,
15, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, 20, 20,
{ 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 13, 13, 14,
14, 14, 15, 15, 15, 16, 16, 16, 17, 17, 17, 17, 18, 19, 19, 19, 19, 20, 20,
20, 20, 20, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23,
23, 24, 24, 24, 24, 24, 24, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26,
26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, 28, 28,
14 changes: 6 additions & 8 deletions runtime/shared_heap.c
@@ -53,8 +53,6 @@ typedef struct pool {
  sizeclass sz;
} pool;
CAML_STATIC_ASSERT(sizeof(pool) == Bsize_wsize(POOL_HEADER_WSIZE));
#define POOL_HEADER_SZ sizeof(pool)

#define POOL_SLAB_WOFFSET(sz) (POOL_HEADER_WSIZE + wastage_sizeclass[sz])
#define POOL_FIRST_BLOCK(p, sz) ((header_t*)(p) + POOL_SLAB_WOFFSET(sz))
#define POOL_END(p) ((header_t*)(p) + POOL_WSIZE)
@@ -238,7 +236,7 @@ static void calc_pool_stats(pool* a, sizeclass sz, struct heap_stats* s)
  header_t* p = POOL_FIRST_BLOCK(a, sz);
  header_t* end = POOL_END(a);
  mlsize_t wh = wsize_sizeclass[sz];
  s->pool_frag_words += Wsize_bsize(POOL_HEADER_SZ);
  s->pool_frag_words += POOL_SLAB_WOFFSET(sz);

  while (p + wh <= end) {
    header_t hd = (header_t)atomic_load_relaxed((atomic_uintnat*)p);
@@ -250,8 +248,7 @@ static void calc_pool_stats(pool* a, sizeclass sz, struct heap_stats* s)

    p += wh;
  }
  CAMLassert(end - p == wastage_sizeclass[sz]);
  s->pool_frag_words += end - p;
  CAMLassert(end == p);
  s->pool_words += POOL_WSIZE;
}

@@ -283,6 +280,8 @@ Caml_inline void pool_initialize(pool* r,
#endif
    p += wh;
  }
  CAMLassert(p == end);
  CAMLassert((uintptr_t)end % Cache_line_bsize == 0);
  r->next_obj = (value*)(p - wh);
}

@@ -1311,7 +1310,7 @@ static void verify_pool(pool* a, sizeclass sz, struct mem_stats* s) {
    header_t* p = POOL_FIRST_BLOCK(a, sz);
    header_t* end = POOL_END(a);
    mlsize_t wh = wsize_sizeclass[sz];
    s->overhead += Wsize_bsize(POOL_HEADER_SZ);
    s->overhead += POOL_SLAB_WOFFSET(sz);

    while (p + wh <= end) {
      header_t hd = (header_t)*p;
@@ -1325,8 +1324,7 @@ static void verify_pool(pool* a, sizeclass sz, struct mem_stats* s) {
      }
      p += wh;
    }
    CAMLassert(end - p == wastage_sizeclass[sz]);
    s->overhead += end - p;
    CAMLassert(end == p);
    s->alloced += POOL_WSIZE;
  }
}
5 changes: 5 additions & 0 deletions runtime4/alloc.c
@@ -289,3 +289,8 @@ CAMLexport value caml_alloc_some(value v)
  Field(some, 0) = v;
  CAMLreturn(some);
}

CAMLprim value caml_atomic_make_contended(value v)
{
  caml_fatal_error("Atomic.make_contended is not supported by runtime4");
}
1 change: 1 addition & 0 deletions stdlib/atomic.ml
@@ -15,6 +15,7 @@
type !'a t

external make : 'a -> 'a t = "%makemutable"
external make_contended : 'a -> 'a t = "caml_atomic_make_contended"
external get : 'a t -> 'a = "%atomic_load"
external exchange : 'a t -> 'a -> 'a = "%atomic_exchange"
external compare_and_set : 'a t -> 'a -> 'a -> bool = "%atomic_cas"
14 changes: 14 additions & 0 deletions stdlib/atomic.mli
@@ -29,6 +29,20 @@ type !'a t
(** Create an atomic reference. *)
val make : 'a -> 'a t

(** Create an atomic reference that is alone on a cache line. It occupies 4-16x
    the memory of one allocated with [make v].
    The primary purpose is to prevent false sharing and the resulting
    performance degradation. When a CPU performs an atomic operation, it
    temporarily takes ownership of an entire cache line that contains the
    atomic reference. If multiple atomic references share the same cache line,
    modifying these disjoint memory regions simultaneously becomes impossible,
    which can create a bottleneck. Hence, as a general guideline, if an atomic
    reference is experiencing contention, assigning it its own cache line may
    enhance performance.
    CR ocaml 5 all-runtime5: does not support runtime4 *)
val make_contended : 'a -> 'a t

(** Get the current value of the atomic reference. *)
val get : 'a t -> 'a

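A hypothetical usage sketch (not part of this commit) of the documented behaviour: two domains each hammer their own counter, and [make_contended] guarantees the two counters do not share a cache line. The counter names and iteration count are invented for illustration.

(* Sketch only: give each domain's counter its own cache line. *)
let counter_a = Atomic.make_contended 0
let counter_b = Atomic.make_contended 0

let bump c = for _ = 1 to 1_000_000 do Atomic.incr c done

let () =
  let d1 = Domain.spawn (fun () -> bump counter_a) in
  let d2 = Domain.spawn (fun () -> bump counter_b) in
  Domain.join d1;
  Domain.join d2;
  assert (Atomic.get counter_a = 1_000_000);
  assert (Atomic.get counter_b = 1_000_000)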
28 changes: 28 additions & 0 deletions testsuite/tests/c-api/aligned_alloc.ml
@@ -0,0 +1,28 @@
(* TEST
modules = "aligned_alloc_stubs.c"
* runtime4
** skip
* runtime5
** native
*)

external is_aligned : 'a Atomic.t -> bool = "caml_atomic_is_aligned"
let test_is_aligned () =
  let l = List.init 100 Atomic.make in
  let all_aligned =
    List.for_all is_aligned l
  in
  assert (not all_aligned)
;;

let test_make_contended () =
  let l = List.init 100 Atomic.make_contended in
  List.iteri (fun i atomic ->
    assert (Atomic.get atomic == i);
    assert (is_aligned atomic)) l
;;

let () =
  test_is_aligned ();
  test_make_contended ();
;;
12 changes: 12 additions & 0 deletions testsuite/tests/c-api/aligned_alloc_stubs.c
@@ -0,0 +1,12 @@
#include <stdio.h>
#include <stdlib.h>
#include "caml/alloc.h"

CAMLprim value caml_atomic_is_aligned(value val)
{
  if ((uintptr_t)Hp_val(val) % Cache_line_bsize == 0) {
    return Val_true;
  } else {
    return Val_false;
  }
}
17 changes: 16 additions & 1 deletion tools/gen_sizeclasses.ml
@@ -15,12 +15,27 @@
let overhead block slot obj =
  1. -. float_of_int((block / slot) * obj) /. float_of_int block

let max_overhead = 0.10
let max_overhead = 0.101

(*
  Prevention of false sharing requires certain size classes to be present.
  This ensures they are generated.
  The runtime has a constructor for atomics (`caml_atomic_make_contended`)
  that aligns them to cache lines to avoid false sharing. The implementation
  relies on the fact that pools are cache-aligned by design and that slots of
  appropriate size preserve this property. To be precise, slots whose size is
  a multiple of the cache line size are laid out so that their boundaries
  coincide with cache line boundaries.
*)
let required_for_contended_atomic = function
  | 16 | 32 -> true
  | _ -> false

let rec blocksizes block slot = function
  | 0 -> []
  | obj ->
    if overhead block slot obj > max_overhead
       || required_for_contended_atomic obj
    then
      if overhead block obj obj < max_overhead then
        obj :: blocksizes block obj (obj - 1)
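To make the 16/32 requirement concrete (an illustration, not part of the generator): a contended atomic occupies a whole block of Cache_line_bsize bytes, header included, so the corresponding size class in words must exist for its slot boundaries to coincide with cache line boundaries. The sketch below assumes 8-byte words and an invented helper name.

(* Sketch only: whole-block size, in words (header included), of a contended
   atomic; these are the size classes that must always be generated. *)
let contended_block_wsize cache_line_bsize = cache_line_bsize / 8

let () =
  assert (contended_block_wsize 128 = 16);  (* arm64 / power *)
  assert (contended_block_wsize 256 = 32);  (* s390x *)
  assert (contended_block_wsize 64 = 8)     (* amd64 / riscv: 8 is always a size class *)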
