From 2a11d34be21f1d298d068995eabdc2553b782d37 Mon Sep 17 00:00:00 2001 From: Alexandre Fonseca Date: Tue, 21 Nov 2023 15:09:25 +0000 Subject: [PATCH] [PROF-8543] Address comments --- benchmarks/profiler_sample_loop_v2.rb | 4 +- .../collectors_cpu_and_wall_time_worker.c | 76 +++++----- .../collectors_thread_context.c | 5 +- .../extconf.rb | 2 +- .../heap_recorder.c | 134 +++++++++++------- .../heap_recorder.h | 6 +- .../ruby_helpers.c | 10 ++ .../ruby_helpers.h | 8 ++ .../stack_recorder.c | 44 +++--- .../stack_recorder.h | 2 +- lib/datadog/core/configuration/settings.rb | 56 ++++---- .../collectors/cpu_and_wall_time_worker.rb | 22 +-- lib/datadog/profiling/component.rb | 58 +++++++- lib/datadog/profiling/ext.rb | 24 ++++ 14 files changed, 285 insertions(+), 166 deletions(-) diff --git a/benchmarks/profiler_sample_loop_v2.rb b/benchmarks/profiler_sample_loop_v2.rb index 953da0a4990..4722a6c050f 100644 --- a/benchmarks/profiler_sample_loop_v2.rb +++ b/benchmarks/profiler_sample_loop_v2.rb @@ -18,8 +18,8 @@ class ProfilerSampleLoopBenchmark def create_profiler @recorder = Datadog::Profiling::StackRecorder.new( cpu_time_enabled: true, - alloc_samples_enabled: true, - heap_samples_enabled: true + alloc_samples_enabled: false, + heap_samples_enabled: false ) @collector = Datadog::Profiling::Collectors::ThreadContext.new( recorder: @recorder, max_frames: 400, tracer: nil, endpoint_collection_enabled: false, timeline_enabled: false diff --git a/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c b/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c index 1c13a452d35..aa32adf8764 100644 --- a/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c +++ b/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c @@ -81,10 +81,11 @@ struct cpu_and_wall_time_worker_state { bool gc_profiling_enabled; bool allocation_counting_enabled; - bool heap_counting_enabled; bool no_signals_workaround_enabled; bool dynamic_sampling_rate_enabled; - int allocation_sample_every; // Temporarily used for development/testing of allocation profiling + int allocation_sample_every; + bool allocation_profiling_enabled; + bool heap_profiling_enabled; VALUE self_instance; VALUE thread_context_collector_instance; VALUE idle_sampling_helper_instance; @@ -152,10 +153,11 @@ static VALUE _native_initialize( VALUE gc_profiling_enabled, VALUE idle_sampling_helper_instance, VALUE allocation_counting_enabled, - VALUE heap_counting_enabled, VALUE no_signals_workaround_enabled, VALUE dynamic_sampling_rate_enabled, - VALUE allocation_sample_every + VALUE allocation_sample_every, + VALUE allocation_profiling_enabled, + VALUE heap_profiling_enabled ); static void cpu_and_wall_time_worker_typed_data_mark(void *state_ptr); static VALUE _native_sampling_loop(VALUE self, VALUE instance); @@ -193,7 +195,6 @@ static void on_freeobj_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused) static void disable_tracepoints(struct cpu_and_wall_time_worker_state *state); static VALUE _native_with_blocked_sigprof(DDTRACE_UNUSED VALUE self); static VALUE rescued_sample_allocation(VALUE tracepoint_data); -static VALUE rescued_sample_free(VALUE tracepoint_data); // Note on sampler global state safety: // @@ -231,7 +232,7 @@ void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module) { // https://bugs.ruby-lang.org/issues/18007 for a discussion around this. 
rb_define_alloc_func(collectors_cpu_and_wall_time_worker_class, _native_new); - rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_initialize", _native_initialize, 9); + rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_initialize", _native_initialize, 10); rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_sampling_loop", _native_sampling_loop, 1); rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_stop", _native_stop, 2); rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_reset_after_fork", _native_reset_after_fork, 1); @@ -270,10 +271,11 @@ static VALUE _native_new(VALUE klass) { state->gc_profiling_enabled = false; state->allocation_counting_enabled = false; - state->heap_counting_enabled = false; state->no_signals_workaround_enabled = false; state->dynamic_sampling_rate_enabled = true; state->allocation_sample_every = 0; + state->allocation_profiling_enabled = false; + state->heap_profiling_enabled = false; state->thread_context_collector_instance = Qnil; state->idle_sampling_helper_instance = Qnil; state->owner_thread = Qnil; @@ -300,30 +302,37 @@ static VALUE _native_initialize( VALUE gc_profiling_enabled, VALUE idle_sampling_helper_instance, VALUE allocation_counting_enabled, - VALUE heap_counting_enabled, VALUE no_signals_workaround_enabled, VALUE dynamic_sampling_rate_enabled, - VALUE allocation_sample_every + VALUE allocation_sample_every, + VALUE allocation_profiling_enabled, + VALUE heap_profiling_enabled ) { ENFORCE_BOOLEAN(gc_profiling_enabled); ENFORCE_BOOLEAN(allocation_counting_enabled); - ENFORCE_BOOLEAN(heap_counting_enabled); ENFORCE_BOOLEAN(no_signals_workaround_enabled); ENFORCE_BOOLEAN(dynamic_sampling_rate_enabled); ENFORCE_TYPE(allocation_sample_every, T_FIXNUM); + ENFORCE_BOOLEAN(allocation_profiling_enabled); + ENFORCE_BOOLEAN(heap_profiling_enabled); struct cpu_and_wall_time_worker_state *state; TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state); state->gc_profiling_enabled = (gc_profiling_enabled == Qtrue); state->allocation_counting_enabled = (allocation_counting_enabled == Qtrue); - state->heap_counting_enabled = state->allocation_counting_enabled && (heap_counting_enabled == Qtrue); state->no_signals_workaround_enabled = (no_signals_workaround_enabled == Qtrue); state->dynamic_sampling_rate_enabled = (dynamic_sampling_rate_enabled == Qtrue); state->allocation_sample_every = NUM2INT(allocation_sample_every); + state->allocation_profiling_enabled = (allocation_profiling_enabled == Qtrue); + state->heap_profiling_enabled = (heap_profiling_enabled == Qtrue); - if (state->allocation_sample_every < 0) { - rb_raise(rb_eArgError, "Unexpected value for allocation_sample_every: %d. This value must be >= 0.", state->allocation_sample_every); + if (state->allocation_sample_every <= 0) { + rb_raise(rb_eArgError, "Unexpected value for allocation_sample_every: %d. This value must be > 0.", state->allocation_sample_every); + } + + if (state->heap_profiling_enabled && !state->allocation_profiling_enabled) { + rb_raise(rb_eArgError, "Heap profiling requires allocation profiling to be enabled but it isn't."); } state->thread_context_collector_instance = enforce_thread_context_collector_instance(thread_context_collector_instance); @@ -644,8 +653,8 @@ static VALUE release_gvl_and_run_sampling_trigger_loop(VALUE instance) { // because they may raise exceptions. 
install_sigprof_signal_handler(handle_sampling_signal, "handle_sampling_signal"); if (state->gc_profiling_enabled) rb_tracepoint_enable(state->gc_tracepoint); - if (state->allocation_counting_enabled) rb_tracepoint_enable(state->object_allocation_tracepoint); - if (state->heap_counting_enabled) rb_tracepoint_enable(state->object_free_tracepoint); + if (state->allocation_counting_enabled || state->allocation_profiling_enabled) rb_tracepoint_enable(state->object_allocation_tracepoint); + if (state->heap_profiling_enabled) rb_tracepoint_enable(state->object_free_tracepoint); rb_thread_call_without_gvl(run_sampling_trigger_loop, state, interrupt_sampling_trigger_loop, state); @@ -929,15 +938,11 @@ static void on_newobj_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused) return; } - // @ivoanjo: Strictly speaking, this is not needed because Ruby should not call the same tracepoint while a previous - // invocation is still pending, (e.g. it wouldn't call `on_newobj_event` while it's already running), but I decided - // to keep this here for consistency -- every call to the thread context (other than the special gc calls which are - // defined as not being able to allocate) sets this. state->during_sample = true; // TODO: This is a placeholder sampling decision strategy. We plan to replace it with a better one soon (e.g. before // beta), and having something here allows us to test the rest of feature, sampling decision aside. - if (state->allocation_sample_every > 0 && ((allocation_count % state->allocation_sample_every) == 0)) { + if (state->allocation_profiling_enabled && state->allocation_sample_every > 0 && ((allocation_count % state->allocation_sample_every) == 0)) { // Rescue against any exceptions that happen during sampling safely_call(rescued_sample_allocation, tracepoint_data, state->self_instance); } @@ -945,6 +950,9 @@ static void on_newobj_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused) state->during_sample = false; } +// Safety: This function may get called while Ruby is doing garbage collection. While Ruby is doing garbage collection, +// *NO ALLOCATION* is allowed. This function, and any it calls must never trigger memory or object allocation. +// This includes exceptions and use of ruby_xcalloc (because xcalloc can trigger GC)! static void on_freeobj_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused) { struct cpu_and_wall_time_worker_state *state = active_sampler_instance_state; // Read from global variable, see "sampler global state safety" note above @@ -952,15 +960,14 @@ static void on_freeobj_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused) // and disabled before it is cleared, but just in case... if (state == NULL) return; - // @ivoanjo: Strictly speaking, this is not needed because Ruby should not call the same tracepoint while a previous - // invocation is still pending, (e.g. it wouldn't call `on_newobj_event` while it's already running), but I decided - // to keep this here for consistency -- every call to the thread context (other than the special gc calls which are - // defined as not being able to allocate) sets this. - state->during_sample = true; + // NOTE: Because this is likely to be happening during GC, handling of this tracepoint does not do any allocation. + // We also do not want to lose any frees as that would affect the accuracy of our live heap tracking so we skip + // the typical `state->during_sample` dropping that other sampling tracepoints have. 
- safely_call(rescued_sample_free, tracepoint_data, state->self_instance); + rb_trace_arg_t *data = rb_tracearg_from_tracepoint(tracepoint_data); + VALUE freed_object = rb_tracearg_object(data); - state->during_sample = false; + thread_context_collector_sample_free(state->thread_context_collector_instance, freed_object); } static void disable_tracepoints(struct cpu_and_wall_time_worker_state *state) { @@ -996,18 +1003,3 @@ static VALUE rescued_sample_allocation(VALUE tracepoint_data) { // Return a dummy VALUE because we're called from rb_rescue2 which requires it return Qnil; } - -static VALUE rescued_sample_free(VALUE tracepoint_data) { - struct cpu_and_wall_time_worker_state *state = active_sampler_instance_state; // Read from global variable, see "sampler global state safety" note above - - // This should not happen in a normal situation because on_newobj_event already checked for this, but just in case... - if (state == NULL) return Qnil; - - rb_trace_arg_t *data = rb_tracearg_from_tracepoint(tracepoint_data); - VALUE freed_object = rb_tracearg_object(data); - - thread_context_collector_sample_free(state->thread_context_collector_instance, freed_object); - - // Return a dummy VALUE because we're called from rb_rescue2 which requires it - return Qnil; -} diff --git a/ext/ddtrace_profiling_native_extension/collectors_thread_context.c b/ext/ddtrace_profiling_native_extension/collectors_thread_context.c index 84f485382e7..8183825ade7 100644 --- a/ext/ddtrace_profiling_native_extension/collectors_thread_context.c +++ b/ext/ddtrace_profiling_native_extension/collectors_thread_context.c @@ -1203,7 +1203,7 @@ void thread_context_collector_sample_allocation(VALUE self_instance, unsigned in } } - record_obj_allocation(state->recorder_instance, new_object, sample_weight, optional_class_name); + track_obj_allocation(state->recorder_instance, new_object, sample_weight); trigger_sample_for_thread( state, @@ -1218,6 +1218,9 @@ void thread_context_collector_sample_allocation(VALUE self_instance, unsigned in ); } +// Safety: This function may get called while Ruby is doing garbage collection. While Ruby is doing garbage collection, +// *NO ALLOCATION* is allowed. This function, and any it calls must never trigger memory or object allocation. +// This includes exceptions and use of ruby_xcalloc (because xcalloc can trigger GC)! 
void thread_context_collector_sample_free(VALUE self_instance, VALUE freed_object) { struct thread_context_collector_state *state; TypedData_Get_Struct(self_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state); diff --git a/ext/ddtrace_profiling_native_extension/extconf.rb b/ext/ddtrace_profiling_native_extension/extconf.rb index 44c224ee527..064b36e7445 100644 --- a/ext/ddtrace_profiling_native_extension/extconf.rb +++ b/ext/ddtrace_profiling_native_extension/extconf.rb @@ -120,7 +120,7 @@ def add_compiler_flag(flag) add_compiler_flag '-Wall' add_compiler_flag '-Wextra' -if ENV['DEBUG'] +if ENV['DDTRACE_DEBUG'] CONFIG['optflags'] = '-O0' CONFIG['debugflags'] = '-ggdb3' end diff --git a/ext/ddtrace_profiling_native_extension/heap_recorder.c b/ext/ddtrace_profiling_native_extension/heap_recorder.c index 37d3f4ff971..09a453676e1 100644 --- a/ext/ddtrace_profiling_native_extension/heap_recorder.c +++ b/ext/ddtrace_profiling_native_extension/heap_recorder.c @@ -22,6 +22,9 @@ static st_index_t heap_frame_hash(heap_frame*, st_index_t seed); typedef struct { heap_frame *frames; uint64_t frames_len; + st_index_t hash; + st_index_t hash_seed; + bool hash_calculated; } heap_stack; static heap_stack* heap_stack_init(ddog_prof_Slice_Location); static void heap_stack_free(heap_stack*); @@ -57,7 +60,6 @@ static void object_record_free(object_record*); typedef struct { VALUE obj; unsigned int weight; - ddog_CharSlice *class_name; } partial_heap_recording; typedef struct sample { @@ -166,14 +168,14 @@ static int st_heap_records_iterate(st_data_t key, st_data_t value, st_data_t ext return ST_CONTINUE; } -void heap_recorder_iterate_stacks(heap_recorder *heap_recorder, void (*for_each_callback)(stack_iteration_data stack_data, void *extra_arg), void *for_each_callback_extra_arg) { - pthread_mutex_lock(&heap_recorder->records_mutex); +void heap_recorder_iterate_stacks_without_gvl(heap_recorder *heap_recorder, void (*for_each_callback)(stack_iteration_data stack_data, void *extra_arg), void *for_each_callback_extra_arg) { + ENFORCE_SUCCESS_NO_GVL(pthread_mutex_lock(&heap_recorder->records_mutex)); internal_iteration_data internal_iteration_data; internal_iteration_data.for_each_callback = for_each_callback; internal_iteration_data.for_each_callback_extra_arg = for_each_callback_extra_arg; internal_iteration_data.heap_recorder = heap_recorder; st_foreach(heap_recorder->heap_records, st_heap_records_iterate, (st_data_t) &internal_iteration_data); - pthread_mutex_unlock(&heap_recorder->records_mutex); + ENFORCE_SUCCESS_NO_GVL(pthread_mutex_unlock(&heap_recorder->records_mutex)); } void commit_allocation(heap_recorder *heap_recorder, heap_stack *heap_stack, VALUE obj, unsigned int weight) { @@ -181,8 +183,8 @@ void commit_allocation(heap_recorder *heap_recorder, heap_stack *heap_stack, VAL if (!st_lookup(heap_recorder->heap_records, (st_data_t) heap_stack, (st_data_t*) &heap_record)) { heap_record = heap_record_init(heap_stack); if (st_insert(heap_recorder->heap_records, (st_data_t) heap_stack, (st_data_t) heap_record)) { + // This should not be possible but just in case something bugs out, lets error out rb_raise(rb_eRuntimeError, "Duplicate heap stack tracking: %p", heap_stack); - return; }; } else { // FIXME: Figure out a way to not have to instantiate a new stack only to free it if it's @@ -194,9 +196,9 @@ void commit_allocation(heap_recorder *heap_recorder, heap_stack *heap_stack, VAL object_record *object_record = object_record_init(obj, weight, heap_record); if 
(st_insert(heap_recorder->object_records, (st_data_t) obj, (st_data_t) object_record) != 0) { // Object already tracked?
+ // FIXME: This seems to happen in practice. Research how/why and handle differently.
 object_record_free(object_record);
 rb_raise(rb_eRuntimeError, "Duplicate heap object tracking: %lu", obj);
- return;
 }
 fprintf(stderr, "Committed allocation of %lu (heap_record=%p, object_record=%p)\n", obj, heap_record, object_record);
@@ -209,9 +211,9 @@ void commit_free(heap_recorder *heap_recorder, VALUE obj) {
 st_data_t key = (st_data_t) obj;
 object_record *object_record = NULL;
 if (!st_delete(heap_recorder->object_records, (st_data_t*) &key, (st_data_t*) &object_record)) {
- // Object not tracked?
+ // This should not be possible since we're already checking for tracked objects during the free
+ // tracepoint but just in case something bugs out, let's error out
 rb_raise(rb_eRuntimeError, "Committing free of untracked object");
- return;
 }
 heap_record *heap_record = object_record->heap_record;
@@ -223,6 +225,7 @@ void commit_free(heap_recorder *heap_recorder, VALUE obj) {
 object_record_free(object_record);
 }
+// NOTE: Must be holding the records_mutex lock
 static void flush_queue(heap_recorder *heap_recorder) {
 for (size_t i = 0; i < heap_recorder->queued_samples_len; i++) {
 sample *queued_sample = &heap_recorder->queued_samples[i];
@@ -241,9 +244,28 @@ static void flush_queue(heap_recorder *heap_recorder) {
 heap_recorder->queued_samples_len = 0;
 }
+void heap_recorder_flush(heap_recorder *heap_recorder) {
+ int error = pthread_mutex_lock(&heap_recorder->records_mutex);
+ if (!error) {
+ // We were able to get a lock on heap_records, so let's flush any samples that were queued up while the
+ // lock was being held elsewhere.
+ flush_queue(heap_recorder);
+ } else {
+ ENFORCE_SUCCESS_GVL(error);
+ return;
+ }
+
+ pthread_mutex_unlock(&heap_recorder->records_mutex);
+}
+
+// Safety: This function may get called while Ruby is doing garbage collection. While Ruby is doing garbage collection,
+// *NO ALLOCATION* is allowed. This function, and any it calls must never trigger memory or object allocation.
+// This includes exceptions and use of ruby_xcalloc (because xcalloc can trigger GC)!
 static void enqueue_sample(heap_recorder *heap_recorder, sample new_sample) {
 fprintf(stderr, "Enqueuing sample for %lu (weight=%u free=%i)\n", new_sample.obj, new_sample.weight, new_sample.free);
 if (heap_recorder->queued_samples_len >= MAX_QUEUE_LIMIT) {
+ // FIXME: If we're dropping a free sample here, the accuracy of our heap profiles will be affected.
+ // Should we completely give up or should we trigger a flag that we can then use to add a warning in the UI?
 fprintf(stderr, "Dropping sample on the floor.\n");
 return;
 }
@@ -262,6 +284,9 @@ static void enqueue_allocation(heap_recorder *heap_recorder, heap_stack *heap_st
 });
 }
+// Safety: This function may get called while Ruby is doing garbage collection. While Ruby is doing garbage collection,
+// *NO ALLOCATION* is allowed. This function, and any it calls must never trigger memory or object allocation.
+// This includes exceptions and use of ruby_xcalloc (because xcalloc can trigger GC)!
static void enqueue_free(heap_recorder *heap_recorder, VALUE obj) { enqueue_sample(heap_recorder, (sample) { .stack = NULL, @@ -272,38 +297,32 @@ static void enqueue_free(heap_recorder *heap_recorder, VALUE obj) { }); } -void start_heap_allocation_recording(heap_recorder* heap_recorder, VALUE new_obj, unsigned int weight, ddog_CharSlice *class_name) { +void start_heap_allocation_recording(heap_recorder* heap_recorder, VALUE new_obj, unsigned int weight) { fprintf(stderr, "Started recording allocation of %lu with weight %u\n", new_obj, weight); - partial_heap_recording *active_recording = &heap_recorder->active_recording; - active_recording->obj = new_obj; - active_recording->weight = weight; - active_recording->class_name = class_name; + heap_recorder->active_recording = (partial_heap_recording) { + .obj = new_obj, + .weight = weight, + }; } void end_heap_allocation_recording(struct heap_recorder *heap_recorder, ddog_prof_Slice_Location locations) { - // TODO: Make use of active_recording->class_name partial_heap_recording *active_recording = &heap_recorder->active_recording; VALUE new_obj = active_recording->obj; if (!new_obj) { // Recording ended without having been started? rb_raise(rb_eRuntimeError, "Ended a heap recording that was not started"); - return; } int weight = active_recording->weight; // From now on, mark active recording as invalid so we can short-circuit at any point and // not end up with a still active recording. new_obj still holds the object for this recording - active_recording->obj = 0; + active_recording->obj = Qnil; heap_stack *heap_stack = heap_stack_init(locations); int error = pthread_mutex_trylock(&heap_recorder->records_mutex); - if (!error) { - // We were able to get a lock to heap_records so lets flush any pending samples - // that might have been queued previously before adding this new one. - flush_queue(heap_recorder); - } else { + if (error) { // We weren't able to get a lock, so enqueue this sample for later processing // and end early if (error == EBUSY) { @@ -314,14 +333,23 @@ void end_heap_allocation_recording(struct heap_recorder *heap_recorder, ddog_pro return; } - // If we got this far, we got a write lock so we can commit the record + // We were able to get a lock to heap_records so lets flush any pending samples + // that might have been queued previously before adding this new one. + flush_queue(heap_recorder); + + // And then add the new allocation commit_allocation(heap_recorder, heap_stack, new_obj, weight); - pthread_mutex_unlock(&heap_recorder->records_mutex); + ENFORCE_SUCCESS_GVL(pthread_mutex_unlock(&heap_recorder->records_mutex)); } +// Safety: This function can get called while Ruby is doing garbage collection. While Ruby is doing garbage collection, +// *NO ALLOCATION* is allowed. This function, and any it calls must never trigger memory or object allocation. +// This includes exceptions and use of ruby_xcalloc (because xcalloc can trigger GC)! void record_heap_free(heap_recorder *heap_recorder, VALUE obj) { object_record *object_record = NULL; + // lookups require hashing and traversal over hash buckets but should not require doing any allocations + // and should thus be safe to run in GC. 
st_lookup(heap_recorder->object_records, (st_data_t) obj, (st_data_t*) &object_record); if (object_record == NULL) { @@ -329,7 +357,7 @@ void record_heap_free(heap_recorder *heap_recorder, VALUE obj) { // check if the allocation sample is in the queue for (size_t i = 0; i < heap_recorder->queued_samples_len; i++) { sample *queued_sample = &heap_recorder->queued_samples[i]; - if (queued_sample->obj == obj) { + if (queued_sample->obj == obj && !queued_sample->skip) { queued_sample->skip = true; break; } @@ -339,23 +367,14 @@ void record_heap_free(heap_recorder *heap_recorder, VALUE obj) { return; } - // if we got this far, we freed a tracked object so need to update records! - int error = pthread_mutex_trylock(&heap_recorder->records_mutex); - if (error) { - // We weren't able to get a lock, so enqueue this sample for later processing - // and exit early - if (error == EBUSY) { - enqueue_free(heap_recorder, obj); - } else { - ENFORCE_SUCCESS_GVL(error) - } - return; - } - - // If we got this far, we got a write lock so we can commit the record - commit_free(heap_recorder, obj); - - pthread_mutex_unlock(&heap_recorder->records_mutex); + // If we got this far, we freed a tracked object so we need to update and remove records! + // However, there's a caveat: we're under tight constraints and may be running during a GC where we are forbidden + // to do any more allocations. In certain situations, even calling ruby_xfree on an object_record may trigger + // such allocations (https://github.com/ruby/ruby/blob/ffb1eb37e74334ae85d6bfee07d784a145e23dd8/gc.c#L12599). + // We also do not want to risk triggering reentrant free sampling. Therefore, we take the extremely cautious + // approach of enqueuing this free to be applied at next allocation recording or flush with no explicit heap + // allocations or frees, direct or otherwise, happening during the execution of this method. 
+ enqueue_free(heap_recorder, obj);
 }
 // ===============
@@ -392,15 +411,15 @@ void object_record_free(object_record *record) {
 // Heap Frame API
 // ==============
 int heap_frame_cmp(heap_frame *f1, heap_frame *f2) {
- int cmp = strcmp(f1->name, f2->name);
- if (cmp != 0) {
- return cmp;
+ int line_diff = (int) (f1->line - f2->line);
+ if (line_diff != 0) {
+ return line_diff;
 }
- cmp = strcmp(f1->filename, f2->filename);
+ int cmp = strcmp(f1->name, f2->name);
 if (cmp != 0) {
 return cmp;
 }
- return (int) (f1->line - f2->line);
+ return strcmp(f1->filename, f2->filename);
 }
 st_index_t string_hash(char *str, st_index_t seed) {
@@ -424,14 +443,20 @@ st_index_t char_slice_hash(ddog_CharSlice char_slice, st_index_t seed) {
 // ==============
 heap_stack* heap_stack_init(ddog_prof_Slice_Location locations) {
 heap_stack *stack = ruby_xcalloc(1, sizeof(heap_stack));
- stack->frames = ruby_xcalloc(locations.len, sizeof(heap_frame));
- stack->frames_len = locations.len;
+ *stack = (heap_stack) {
+ .frames = ruby_xcalloc(locations.len, sizeof(heap_frame)),
+ .frames_len = locations.len,
+ .hash = 0,
+ .hash_seed = 0,
+ .hash_calculated = false
+ };
 for (uint64_t i = 0; i < locations.len; i++) {
 const ddog_prof_Location *location = &locations.ptr[i];
- heap_frame *frame = &stack->frames[i];
- frame->name = ruby_strdup(location->function.name.ptr);
- frame->filename = ruby_strdup(location->function.filename.ptr);
- frame->line = location->line;
+ stack->frames[i] = (heap_frame) {
+ .name = ruby_strndup(location->function.name.ptr, location->function.name.len),
+ .filename = ruby_strndup(location->function.filename.ptr, location->function.filename.len),
+ .line = location->line,
+ };
 }
 return stack;
 }
@@ -477,6 +502,11 @@ int heap_stack_cmp_st(st_data_t key1, st_data_t key2) {
 }
 st_index_t heap_stack_hash(heap_stack *stack, st_index_t seed) {
+ if (stack->hash_calculated && stack->hash_seed == seed) {
+ // fast path, hash is already known
+ return stack->hash;
+ }
+
 st_index_t hash = seed;
 for (uint64_t i = 0; i < stack->frames_len; i++) {
 hash = heap_frame_hash(&stack->frames[i], hash);
diff --git a/ext/ddtrace_profiling_native_extension/heap_recorder.h b/ext/ddtrace_profiling_native_extension/heap_recorder.h
index 7b25780c8d4..f45b87492b6 100644
--- a/ext/ddtrace_profiling_native_extension/heap_recorder.h
+++ b/ext/ddtrace_profiling_native_extension/heap_recorder.h
@@ -1,6 +1,5 @@
 #pragma once
-#include "stack_recorder.h"
 #include <datadog/profiling.h>
 #include <ruby.h>
@@ -13,7 +12,8 @@ typedef struct {
 heap_recorder* heap_recorder_init(void);
 void heap_recorder_free(heap_recorder *heap_recorder);
-void heap_recorder_iterate_stacks(heap_recorder *heap_recorder, void (*for_each_callback)(stack_iteration_data stack_data, void* extra_arg), void *for_each_callback_extra_arg);
-void start_heap_allocation_recording(heap_recorder *heap_recorder, VALUE new_obj, unsigned int weight, ddog_CharSlice *class_name);
+void heap_recorder_flush(heap_recorder *heap_recorder);
+void heap_recorder_iterate_stacks_without_gvl(heap_recorder *heap_recorder, void (*for_each_callback)(stack_iteration_data stack_data, void* extra_arg), void *for_each_callback_extra_arg);
+void start_heap_allocation_recording(heap_recorder *heap_recorder, VALUE new_obj, unsigned int weight);
 void end_heap_allocation_recording(heap_recorder *heap_recorder, ddog_prof_Slice_Location locations);
 void record_heap_free(heap_recorder *heap_recorder, VALUE obj);
diff --git a/ext/ddtrace_profiling_native_extension/ruby_helpers.c b/ext/ddtrace_profiling_native_extension/ruby_helpers.c
index b874d1f249a..1d0f95e23dc 100644 --- a/ext/ddtrace_profiling_native_extension/ruby_helpers.c +++ b/ext/ddtrace_profiling_native_extension/ruby_helpers.c @@ -108,3 +108,13 @@ void raise_syserr( grab_gvl_and_raise_syserr(syserr_errno, "Failure returned by '%s' at %s:%d:in `%s'", expression, file, line, function_name); } } + +char* ruby_strndup(const char *str, size_t size) { + char *tmp; + + tmp = xmalloc(size + 1); + memcpy(tmp, str, size); + tmp[size] = '\0'; + + return tmp; +} diff --git a/ext/ddtrace_profiling_native_extension/ruby_helpers.h b/ext/ddtrace_profiling_native_extension/ruby_helpers.h index 84889fb83dd..157edeee9d4 100644 --- a/ext/ddtrace_profiling_native_extension/ruby_helpers.h +++ b/ext/ddtrace_profiling_native_extension/ruby_helpers.h @@ -87,3 +87,11 @@ NORETURN(void raise_syserr( int line, const char *function_name )); + +// Alternative to ruby_strdup that takes a size argument. +// Similar to C's strndup but slightly less smart as size is expected to +// be smaller or equal to the real size of str (minus null termination if it +// exists). +// A new string will be returned with size+1 bytes and last byte set to '\0'. +// The returned string must be freed explicitly. +char* ruby_strndup(const char *str, size_t size); diff --git a/ext/ddtrace_profiling_native_extension/stack_recorder.c b/ext/ddtrace_profiling_native_extension/stack_recorder.c index 0a51b8961e7..2e9d4add4cf 100644 --- a/ext/ddtrace_profiling_native_extension/stack_recorder.c +++ b/ext/ddtrace_profiling_native_extension/stack_recorder.c @@ -278,6 +278,8 @@ static VALUE _native_new(VALUE klass) { VALUE stack_recorder = TypedData_Wrap_Struct(klass, &stack_recorder_typed_data, state); + state->heap_recorder = heap_recorder_init(); + // Note: Don't raise exceptions after this point, since it'll lead to libdatadog memory leaking! initialize_profiles(state, sample_types); @@ -338,8 +340,6 @@ static VALUE _native_initialize(DDTRACE_UNUSED VALUE _self, VALUE recorder_insta struct stack_recorder_state *state; TypedData_Get_Struct(recorder_instance, struct stack_recorder_state, &stack_recorder_typed_data, state); - state->heap_recorder = heap_recorder_init(); - if (cpu_time_enabled == Qtrue && alloc_samples_enabled == Qtrue) return Qtrue; // Nothing to do, this is the default // When some sample types are disabled, we need to reconfigure libdatadog to record less types, @@ -374,7 +374,7 @@ static VALUE _native_initialize(DDTRACE_UNUSED VALUE _self, VALUE recorder_insta state->position_for[ALLOC_SAMPLES_VALUE_ID] = next_disabled_pos++; } - if (alloc_samples_enabled == Qtrue && heap_samples_enabled == Qtrue) { + if (heap_samples_enabled == Qtrue) { enabled_value_types[next_enabled_pos] = (ddog_prof_ValueType) HEAP_SAMPLES_VALUE; state->position_for[HEAP_SAMPLES_VALUE_ID] = next_enabled_pos++; } else { @@ -398,6 +398,11 @@ static VALUE _native_serialize(DDTRACE_UNUSED VALUE _self, VALUE recorder_instan // Need to do this while still holding on to the Global VM Lock; see comments on method for why serializer_set_start_timestamp_for_next_profile(state, finish_timestamp); + // Flush any pending data in the heap recorder prior to doing the iteration during serialization + // This needs to happen while holding on to the Global VM Lock as flushing may do allocations, + // frees and complex hash table rebalancings. 
+ heap_recorder_flush(state->heap_recorder); + // We'll release the Global VM Lock while we're calling serialize, so that the Ruby VM can continue to work while this // is pending struct call_serialize_without_gvl_arguments args = {.state = state, .finish_timestamp = finish_timestamp, .serialize_ran = false}; @@ -460,6 +465,9 @@ void record_sample(VALUE recorder_instance, ddog_prof_Slice_Location locations, metric_values[position_for[ALLOC_SAMPLES_VALUE_ID]] = values.alloc_samples; if (values.alloc_samples != 0) { + // FIXME: Heap sampling is currently being done in 2 parts because the construction of locations is happening + // very late in the allocation-sampling path (which is shared with the cpu sampling path). This can + // be fixed with some refactoring but for now this is a less impactful change. end_heap_allocation_recording(state->heap_recorder, locations); } @@ -480,12 +488,15 @@ void record_sample(VALUE recorder_instance, ddog_prof_Slice_Location locations, } } -void record_obj_allocation(VALUE recorder_instance, VALUE new_object, unsigned int sample_weight, ddog_CharSlice *optional_class_name) { +void track_obj_allocation(VALUE recorder_instance, VALUE new_object, unsigned int sample_weight) { struct stack_recorder_state *state; TypedData_Get_Struct(recorder_instance, struct stack_recorder_state, &stack_recorder_typed_data, state); - start_heap_allocation_recording(state->heap_recorder, new_object, sample_weight, optional_class_name); + start_heap_allocation_recording(state->heap_recorder, new_object, sample_weight); } +// Safety: This function can get called while Ruby is doing garbage collection. While Ruby is doing garbage collection, +// *NO ALLOCATION* is allowed. This function, and any it calls must never trigger memory or object allocation. +// This includes exceptions and use of ruby_xcalloc (because xcalloc can trigger GC)! void record_obj_free(VALUE recorder_instance, VALUE freed_object) { struct stack_recorder_state *state; TypedData_Get_Struct(recorder_instance, struct stack_recorder_state, &stack_recorder_typed_data, state); @@ -512,13 +523,9 @@ typedef struct stack_iteration_context { ddog_prof_Profile *profile; } stack_iteration_context; -static void add_heap_sample_to_active_profile(stack_iteration_data stack_data, void *extra_arg) { +static void add_heap_sample_to_active_profile_without_gvl(stack_iteration_data stack_data, void *extra_arg) { stack_iteration_context *context = (stack_iteration_context*) extra_arg; - // Note: We initialize this array to have ALL_VALUE_TYPES_COUNT but only tell libdatadog to use the first - // state->enabled_values_count values. This simplifies handling disabled value types -- we still put them on the - // array, but in _native_initialize we arrange so their position starts from state->enabled_values_count and thus - // libdatadog doesn't touch them. 
int64_t metric_values[ALL_VALUE_TYPES_COUNT] = {0}; uint8_t *position_for = context->state->position_for; @@ -534,15 +541,18 @@ static void add_heap_sample_to_active_profile(stack_iteration_data stack_data, v ); if (result.tag == DDOG_PROF_PROFILE_RESULT_ERR) { - rb_raise(rb_eArgError, "Failed to record sample: %"PRIsVALUE, get_error_details_and_drop(&result.err)); + // NOTE: Can't use get_error_details_and_drop since it builds new ruby strings and we're outside the GVL + ddog_CharSlice errorMsg = ddog_Error_message(&result.err); + grab_gvl_and_raise(rb_eArgError, "Failed to record sample: %.*s", (int) errorMsg.len, errorMsg.ptr); } } -static void build_heap_profile(struct stack_recorder_state *state, ddog_prof_Profile *profile) { - stack_iteration_context iteration_context; - iteration_context.state = state; - iteration_context.profile = profile; - heap_recorder_iterate_stacks(state->heap_recorder, add_heap_sample_to_active_profile, (void*) &iteration_context); +static void build_heap_profile_without_gvl(struct stack_recorder_state *state, ddog_prof_Profile *profile) { + stack_iteration_context iteration_context = { + .state = state, + .profile = profile + }; + heap_recorder_iterate_stacks_without_gvl(state->heap_recorder, add_heap_sample_to_active_profile_without_gvl, (void*) &iteration_context); } static void *call_serialize_without_gvl(void *call_args) { @@ -552,7 +562,7 @@ static void *call_serialize_without_gvl(void *call_args) { // Now that we have the inactive profile with all but heap samples, lets fill it with heap data // without needing to race with the active sampler - build_heap_profile(args->state, args->profile); + build_heap_profile_without_gvl(args->state, args->profile); // Note: The profile gets reset by the serialize call args->result = ddog_prof_Profile_serialize(args->profile, &args->finish_timestamp, NULL /* duration_nanos is optional */, NULL /* start_time is optional */); diff --git a/ext/ddtrace_profiling_native_extension/stack_recorder.h b/ext/ddtrace_profiling_native_extension/stack_recorder.h index 2f8acbb3b92..aa492cacfa9 100644 --- a/ext/ddtrace_profiling_native_extension/stack_recorder.h +++ b/ext/ddtrace_profiling_native_extension/stack_recorder.h @@ -22,6 +22,6 @@ typedef struct sample_labels { void record_sample(VALUE recorder_instance, ddog_prof_Slice_Location locations, sample_values values, sample_labels labels); void record_endpoint(VALUE recorder_instance, uint64_t local_root_span_id, ddog_CharSlice endpoint); -void record_obj_allocation(VALUE recorder_instance, VALUE new_object, unsigned int sample_weight, ddog_CharSlice *optional_class_name); +void track_obj_allocation(VALUE recorder_instance, VALUE new_object, unsigned int sample_weight); void record_obj_free(VALUE recorder_instance, VALUE freed_object); VALUE enforce_recorder_instance(VALUE object); diff --git a/lib/datadog/core/configuration/settings.rb b/lib/datadog/core/configuration/settings.rb index 981996997f2..bbb071b8c40 100644 --- a/lib/datadog/core/configuration/settings.rb +++ b/lib/datadog/core/configuration/settings.rb @@ -314,45 +314,53 @@ def initialize(*_) # Can be used to enable/disable the Datadog::Profiling.allocation_count feature. # - # This feature is safe and enabled by default on Ruby 2.x, but has a few caveats on Ruby 3.x. 
- #
- # Caveat 1 (severe):
- # On Ruby versions 3.0 (all), 3.1.0 to 3.1.3, and 3.2.0 to 3.2.2 this is disabled by default because it
- # can trigger a VM bug that causes a segmentation fault during garbage collection of Ractors
- # (https://bugs.ruby-lang.org/issues/18464). We don't recommend using this feature on such Rubies.
- # This bug is fixed on Ruby versions 3.1.4, 3.2.3 and 3.3.0.
- #
- # Caveat 2 (annoyance):
- # On all known versions of Ruby 3.x, due to https://bugs.ruby-lang.org/issues/19112, when a ractor gets
- # garbage collected, Ruby will disable all active tracepoints, which this feature internally relies on.
- # Thus this feature is only usable if you're not using Ractors.
- #
- # Caveat 3 (severe):
- # Ruby 3.2.0 to 3.2.2 have a bug in the newobj tracepoint (https://bugs.ruby-lang.org/issues/19482,
- # https://github.com/ruby/ruby/pull/7464) so that's an extra reason why it's not safe on those Rubies.
- # This bug is fixed on Ruby versions 3.2.3 and 3.3.0.
+ # This feature is safe and enabled by default only on Rubies where we haven't identified issues.
+ # Refer to {Datadog::Profiling::Ext::IS_ALLOCATION_SAMPLING_SUPPORTED} for the details.
 #
 # @default `true` on Ruby 2.x and 3.1.4+, 3.2.3+ and 3.3.0+; `false` for Ruby 3.0 and unpatched Rubies.
 option :allocation_counting_enabled do |o|
 o.type :bool
- o.env 'DD_PROFILING_EXPERIMENTAL_ALLOCATION_ENABLED'
 o.default do
- RUBY_VERSION.start_with?('2.') ||
- (RUBY_VERSION.start_with?('3.1.') && RUBY_VERSION >= '3.1.4') ||
- (RUBY_VERSION.start_with?('3.2.') && RUBY_VERSION >= '3.2.3') ||
- RUBY_VERSION >= '3.3.'
+ Profiling::Ext::IS_ALLOCATION_SAMPLING_SUPPORTED
 end
 end
- # Can be used to enable/disable the Datadog::Profiling.heap_count feature.
+ # Can be used to enable/disable collection of allocation profiles.
 #
 # This feature is alpha and disabled by default.
- option :heap_counting_enabled do |o|
+ #
+ # @default `DD_PROFILING_EXPERIMENTAL_ALLOCATION_ENABLED` environment variable as a boolean, otherwise `false`
+ option :experimental_allocation_enabled do |o|
+ o.type :bool
+ o.env 'DD_PROFILING_EXPERIMENTAL_ALLOCATION_ENABLED'
+ o.default false
+ end
+
+ # Can be used to enable/disable the collection of heap profiles.
+ #
+ # This feature is alpha and disabled by default.
+ #
+ # @default `DD_PROFILING_EXPERIMENTAL_HEAP_ENABLED` environment variable as a boolean, otherwise `false`
+ option :experimental_heap_enabled do |o|
 o.type :bool
 o.env 'DD_PROFILING_EXPERIMENTAL_HEAP_ENABLED'
 o.default false
 end
+ # Can be used to configure the allocation sampling rate: a sample will be collected once every `x` allocations.
+ #
+ # The lower the value, the more accurate allocation and heap tracking becomes, but also the bigger the overhead.
+ # In particular, a value of 1 will sample ALL allocations.
+ #
+ # This feature is not supported in all Rubies. Refer to {Datadog::Profiling::Ext::IS_ALLOCATION_SAMPLING_SUPPORTED}
+ # for the details.
+ #
+ # @default `DD_PROFILING_EXPERIMENTAL_ALLOCATION_SAMPLE_RATE` environment variable, otherwise `50`.
+ option :experimental_allocation_sample_rate do |o|
+ o.type :int
+ o.env 'DD_PROFILING_EXPERIMENTAL_ALLOCATION_SAMPLE_RATE'
+ o.default 50
+ end
+
 # Can be used to disable checking which version of `libmysqlclient` is being used by the `mysql2` gem.
 #
 # This setting is only used when the `mysql2` gem is installed.
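For reference, a minimal sketch of how the options added above could be opted into from application code, assuming the usual `Datadog.configure` block exposed by ddtrace (the option paths mirror the `profiling.advanced` settings defined in this file, and the environment variables listed in each `o.env` are equivalent):

    Datadog.configure do |c|
      c.profiling.enabled = true
      # Same as DD_PROFILING_EXPERIMENTAL_ALLOCATION_ENABLED=true
      c.profiling.advanced.experimental_allocation_enabled = true
      # Heap profiling additionally requires allocation profiling to be enabled.
      # Same as DD_PROFILING_EXPERIMENTAL_HEAP_ENABLED=true
      c.profiling.advanced.experimental_heap_enabled = true
      # Sample once every 10 allocations; the default of 50 keeps roughly 2% of allocations.
      c.profiling.advanced.experimental_allocation_sample_rate = 10
    end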
diff --git a/lib/datadog/profiling/collectors/cpu_and_wall_time_worker.rb b/lib/datadog/profiling/collectors/cpu_and_wall_time_worker.rb index c1669e28f9c..cb3182aab00 100644 --- a/lib/datadog/profiling/collectors/cpu_and_wall_time_worker.rb +++ b/lib/datadog/profiling/collectors/cpu_and_wall_time_worker.rb @@ -16,14 +16,15 @@ class CpuAndWallTimeWorker def initialize( gc_profiling_enabled:, allocation_counting_enabled:, - heap_counting_enabled:, no_signals_workaround_enabled:, thread_context_collector:, idle_sampling_helper: IdleSamplingHelper.new, # **NOTE**: This should only be used for testing; disabling the dynamic sampling rate will increase the # profiler overhead! dynamic_sampling_rate_enabled: true, - allocation_sample_every: 0 # Currently only for testing; Setting this to > 0 can add a lot of overhead! + allocation_sample_every:, + allocation_profiling_enabled:, + heap_profiling_enabled: ) unless dynamic_sampling_rate_enabled Datadog.logger.warn( @@ -31,30 +32,17 @@ def initialize( ) end - if allocation_counting_enabled && allocation_sample_every > 0 - Datadog.logger.warn( - "Enabled experimental allocation profiling: allocation_sample_every=#{allocation_sample_every}. This is " \ - 'experimental, not recommended, and will increase overhead!' - ) - - if heap_counting_enabled - Datadog.logger.warn( - "Enabled experimental heap profiling: allocation_sample_every=#{allocation_sample_every}. This is " \ - 'experimental, not recommended, and will increase overhead!' - ) - end - end - self.class._native_initialize( self, thread_context_collector, gc_profiling_enabled, idle_sampling_helper, allocation_counting_enabled, - heap_counting_enabled, no_signals_workaround_enabled, dynamic_sampling_rate_enabled, allocation_sample_every, + allocation_profiling_enabled, + heap_profiling_enabled, ) @worker_thread = nil @failure_exception = nil diff --git a/lib/datadog/profiling/component.rb b/lib/datadog/profiling/component.rb index 10ae9e909b5..9f16d3cc163 100644 --- a/lib/datadog/profiling/component.rb +++ b/lib/datadog/profiling/component.rb @@ -41,12 +41,14 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:) no_signals_workaround_enabled = no_signals_workaround_enabled?(settings) timeline_enabled = settings.profiling.advanced.experimental_timeline_enabled + allocation_sample_every = get_allocation_sample_every(settings) + allocation_profiling_enabled = enable_allocation_profiling?(settings, allocation_sample_every) + heap_profiling_enabled = enable_heap_profiling?(settings, allocation_profiling_enabled) recorder = Datadog::Profiling::StackRecorder.new( cpu_time_enabled: RUBY_PLATFORM.include?('linux'), # Only supported on Linux currently - # FIXME: Don't hardcode this - alloc_samples_enabled: true, - heap_samples_enabled: true + alloc_samples_enabled: allocation_profiling_enabled, + heap_samples_enabled: heap_profiling_enabled, ) thread_context_collector = Datadog::Profiling::Collectors::ThreadContext.new( recorder: recorder, @@ -58,11 +60,11 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:) worker = Datadog::Profiling::Collectors::CpuAndWallTimeWorker.new( gc_profiling_enabled: enable_gc_profiling?(settings), allocation_counting_enabled: settings.profiling.advanced.allocation_counting_enabled, - heap_counting_enabled: settings.profiling.advanced.heap_counting_enabled, no_signals_workaround_enabled: no_signals_workaround_enabled, thread_context_collector: thread_context_collector, - # FIXME: Don't hardcode this - 
allocation_sample_every: 50,
+ allocation_sample_every: allocation_sample_every,
+ allocation_profiling_enabled: allocation_profiling_enabled,
+ heap_profiling_enabled: heap_profiling_enabled,
 )
 internal_metadata = {
@@ -114,6 +116,50 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:)
 end
 end
+ private_class_method def self.get_allocation_sample_every(settings)
+ allocation_sample_rate = settings.profiling.advanced.experimental_allocation_sample_rate
+
+ if allocation_sample_rate <= 0
+ raise("Allocation sample rate must be a positive integer. Was #{allocation_sample_rate}")
+ end
+
+ allocation_sample_rate
+ end
+
+ private_class_method def self.enable_allocation_profiling?(settings, allocation_sample_every)
+ allocation_profiling_enabled = settings.profiling.advanced.experimental_allocation_enabled
+
+ if allocation_profiling_enabled
+ Datadog.logger.warn(
+ "Enabled experimental allocation profiling: allocation_sample_rate=#{allocation_sample_every}. This is " \
+ 'experimental, not recommended, and will increase overhead!'
+ )
+ end
+
+ if allocation_profiling_enabled && !Ext::IS_ALLOCATION_SAMPLING_SUPPORTED
+ Datadog.logger.warn(
+ "Current Ruby version (#{RUBY_VERSION}) does not officially support allocation profiling but it was " \
+ 'requested. There may be unexpected problems during execution.'
+ )
+ end
+
+ allocation_profiling_enabled
+ end
+
+ private_class_method def self.enable_heap_profiling?(settings, allocation_profiling_enabled)
+ heap_profiling_enabled = settings.profiling.advanced.experimental_heap_enabled
+
+ if heap_profiling_enabled && !allocation_profiling_enabled
+ raise('Heap profiling requires allocation profiling to be enabled')
+ end
+
+ if heap_profiling_enabled
+ Datadog.logger.warn(
+ 'Enabled experimental heap profiling. This is experimental, not recommended, and will increase overhead!'
+ )
+ end
+
+ heap_profiling_enabled
+ end
+
 private_class_method def self.no_signals_workaround_enabled?(settings) # rubocop:disable Metrics/MethodLength
 setting_value = settings.profiling.advanced.no_signals_workaround_enabled
 legacy_ruby_that_should_use_workaround = RUBY_VERSION.start_with?('2.3.', '2.4.', '2.5.')
diff --git a/lib/datadog/profiling/ext.rb b/lib/datadog/profiling/ext.rb
index 2122ec7a95f..b3836e2e6f4 100644
--- a/lib/datadog/profiling/ext.rb
+++ b/lib/datadog/profiling/ext.rb
@@ -9,6 +9,30 @@ module Ext
 ENV_AGENTLESS = 'DD_PROFILING_AGENTLESS'
 ENV_ENDPOINT_COLLECTION_ENABLED = 'DD_PROFILING_ENDPOINT_COLLECTION_ENABLED'
+ # Allocation sampling is safe and supported on Ruby 2.x, but has a few caveats on Ruby 3.x.
+ #
+ # TL;DR: Supported on 2.x, 3.1.4+, 3.2.3+, and 3.3.0+.
+ #
+ # Caveat 1 (severe):
+ # On Ruby versions 3.0 (all), 3.1.0 to 3.1.3, and 3.2.0 to 3.2.2 this is disabled by default because it
+ # can trigger a VM bug that causes a segmentation fault during garbage collection of Ractors
+ # (https://bugs.ruby-lang.org/issues/18464). We don't recommend using this feature on such Rubies.
+ # This bug is fixed on Ruby versions 3.1.4, 3.2.3 and 3.3.0.
+ #
+ # Caveat 2 (annoyance):
+ # On all known versions of Ruby 3.x, due to https://bugs.ruby-lang.org/issues/19112, when a ractor gets
+ # garbage collected, Ruby will disable all active tracepoints, which this feature internally relies on.
+ # Thus this feature is only usable if you're not using Ractors.
+ # + # Caveat 3 (severe): + # Ruby 3.2.0 to 3.2.2 have a bug in the newobj tracepoint (https://bugs.ruby-lang.org/issues/19482, + # https://github.com/ruby/ruby/pull/7464) so that's an extra reason why it's not safe on those Rubies. + # This bug is fixed on Ruby versions 3.2.3 and 3.3.0. + IS_ALLOCATION_SAMPLING_SUPPORTED = RUBY_VERSION.start_with?('2.') || + (RUBY_VERSION.start_with?('3.1.') && RUBY_VERSION >= '3.1.4') || + (RUBY_VERSION.start_with?('3.2.') && RUBY_VERSION >= '3.2.3') || + RUBY_VERSION >= '3.3.' + module Transport module HTTP FORM_FIELD_TAG_ENV = 'env'
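As a quick sanity check of the version gate above, the same expression can be evaluated against a few representative Ruby versions. The `supported` lambda below is a hypothetical stand-in used only for illustration; the library itself just evaluates the constant once against `RUBY_VERSION`:

    supported = lambda do |ruby_version|
      ruby_version.start_with?('2.') ||
        (ruby_version.start_with?('3.1.') && ruby_version >= '3.1.4') ||
        (ruby_version.start_with?('3.2.') && ruby_version >= '3.2.3') ||
        ruby_version >= '3.3.'
    end

    supported.call('2.7.8') # => true
    supported.call('3.0.6') # => false (caveat 1: Ractor GC segfault)
    supported.call('3.1.3') # => false (caveat 1)
    supported.call('3.1.4') # => true
    supported.call('3.2.2') # => false (caveats 1 and 3)
    supported.call('3.2.3') # => true
    supported.call('3.3.0') # => true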