diff --git a/benchmarks/profiler_sample_loop_v2.rb b/benchmarks/profiler_sample_loop_v2.rb index 4722a6c050f..43a9e85e148 100644 --- a/benchmarks/profiler_sample_loop_v2.rb +++ b/benchmarks/profiler_sample_loop_v2.rb @@ -19,7 +19,8 @@ def create_profiler @recorder = Datadog::Profiling::StackRecorder.new( cpu_time_enabled: true, alloc_samples_enabled: false, - heap_samples_enabled: false + heap_samples_enabled: false, + timeline_enabled: false, ) @collector = Datadog::Profiling::Collectors::ThreadContext.new( recorder: @recorder, max_frames: 400, tracer: nil, endpoint_collection_enabled: false, timeline_enabled: false diff --git a/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c b/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c index af9b0127e18..876dbd4afee 100644 --- a/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c +++ b/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c @@ -718,28 +718,11 @@ static void on_gc_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused) { if (event == RUBY_INTERNAL_EVENT_GC_ENTER) { thread_context_collector_on_gc_start(state->thread_context_collector_instance); } else if (event == RUBY_INTERNAL_EVENT_GC_EXIT) { - // Design: In an earlier iteration of this feature (see https://github.com/DataDog/dd-trace-rb/pull/2308) we - // actually had a single method to implement the behavior of both thread_context_collector_on_gc_finish - // and thread_context_collector_sample_after_gc (the latter is called via after_gc_from_postponed_job). - // - // Unfortunately, then we discovered the safety issue around no allocations, and thus decided to separate them -- so that - // the sampling could run outside the tight safety constraints of the garbage collection process. - // - // There is a downside: The sample is now taken very very shortly afterwards the GC finishes, and not immediately - // as the GC finishes, which means the stack captured may by affected by "skid", e.g. point slightly after where - // it should be pointing at. - // Alternatives to solve this would be to capture no stack for garbage collection (as we do for Java and .net); - // making the sampling process allocation-safe (very hard); or separate stack sampling from sample recording, - // e.g. enabling us to capture the stack in thread_context_collector_on_gc_finish and do the rest later - // (medium hard). - - thread_context_collector_on_gc_finish(state->thread_context_collector_instance); - // We use rb_postponed_job_register_one to ask Ruby to run thread_context_collector_sample_after_gc after if - // fully finishes the garbage collection, so that one is allowed to do allocations and throw exceptions as usual. - // - // Note: If we ever want to get rid of rb_postponed_job_register_one, remember not to clobber Ruby exceptions, as - // this function does this helpful job for us now -- https://github.com/ruby/ruby/commit/a98e343d39c4d7bf1e2190b076720f32d9f298b3. - rb_postponed_job_register_one(0, after_gc_from_postponed_job, NULL); + bool should_flush = thread_context_collector_on_gc_finish(state->thread_context_collector_instance); + + // We use rb_postponed_job_register_one to ask Ruby to run thread_context_collector_sample_after_gc when the + // thread collector flags it's time to flush. 
+ if (should_flush) rb_postponed_job_register_one(0, after_gc_from_postponed_job, NULL); } } diff --git a/ext/ddtrace_profiling_native_extension/collectors_gc_profiling_helper.c b/ext/ddtrace_profiling_native_extension/collectors_gc_profiling_helper.c new file mode 100644 index 00000000000..e9dd6c727d9 --- /dev/null +++ b/ext/ddtrace_profiling_native_extension/collectors_gc_profiling_helper.c @@ -0,0 +1,156 @@ +#include <ruby.h> +#include <datadog/profiling.h> + +#include "collectors_gc_profiling_helper.h" + +// This helper is used by the Datadog::Profiling::Collectors::ThreadContext to profile garbage collection. +// It's tested through that class' interfaces. +// --- + +// Used when retrieving GC information from the VM. +// All these are symbols, but we don't need to mark them since we ask for them to be interned (and thus live forever) +static VALUE state_sym; +static VALUE marking_sym; +static VALUE sweeping_sym; +static VALUE none_sym; +static VALUE gc_by_sym; +static VALUE newobj_sym; +static VALUE malloc_sym; +static VALUE method_sym; +static VALUE capi_sym; +static VALUE stress_sym; +static VALUE major_by_sym; +static VALUE nofree_sym; +static VALUE oldgen_sym; +static VALUE shady_sym; +static VALUE force_sym; +static VALUE oldmalloc_sym; + +static ddog_CharSlice major_gc_reason_pretty(VALUE major_gc_reason); +static ddog_CharSlice gc_cause_pretty(VALUE gc_cause); +static ddog_CharSlice gc_type_pretty(VALUE major_gc_reason, VALUE gc_state); + +void gc_profiling_init(void) { + // This function lazy-interns a few constants, which may trigger allocations. Since we want to call it during GC as + // well, when allocations are not allowed, we call it once here so that the constants get defined ahead of time. + rb_gc_latest_gc_info(rb_hash_new()); + + // Used to query and look up the results of GC information + state_sym = ID2SYM(rb_intern_const("state")); + marking_sym = ID2SYM(rb_intern_const("marking")); + sweeping_sym = ID2SYM(rb_intern_const("sweeping")); + none_sym = ID2SYM(rb_intern_const("none")); + gc_by_sym = ID2SYM(rb_intern_const("gc_by")); + newobj_sym = ID2SYM(rb_intern_const("newobj")); + malloc_sym = ID2SYM(rb_intern_const("malloc")); + method_sym = ID2SYM(rb_intern_const("method")); + capi_sym = ID2SYM(rb_intern_const("capi")); + stress_sym = ID2SYM(rb_intern_const("stress")); + major_by_sym = ID2SYM(rb_intern_const("major_by")); + nofree_sym = ID2SYM(rb_intern_const("nofree")); + oldgen_sym = ID2SYM(rb_intern_const("oldgen")); + shady_sym = ID2SYM(rb_intern_const("shady")); + force_sym = ID2SYM(rb_intern_const("force")); + oldmalloc_sym = ID2SYM(rb_intern_const("oldmalloc")); + state_sym = ID2SYM(rb_intern_const("state")); + none_sym = ID2SYM(rb_intern_const("none")); +} + +bool gc_profiling_has_major_gc_finished(void) { + return rb_gc_latest_gc_info(state_sym) == none_sym && rb_gc_latest_gc_info(major_by_sym) != Qnil; +} + +uint8_t gc_profiling_set_metadata(ddog_prof_Label *labels, int labels_length) { + uint8_t max_label_count = + 1 + // thread id + 1 + // thread name + 1 + // state + 1 + // event + 1 + // gc reason + 1 + // gc cause + 1; // gc type + + if (max_label_count > labels_length) { + rb_raise(rb_eArgError, "BUG: gc_profiling_set_metadata invalid labels_length (%d) < max_label_count (%d)", labels_length, max_label_count); + } + + uint8_t label_pos = 0; + + labels[label_pos++] = (ddog_prof_Label) { + .key = DDOG_CHARSLICE_C("thread id"), + .str = DDOG_CHARSLICE_C("GC"), + .num = 0, // This shouldn't be needed but the tracer-2.7 docker image ships a buggy gcc that complains about this + }; + 
+ labels[label_pos++] = (ddog_prof_Label) { + .key = DDOG_CHARSLICE_C("thread name"), + .str = DDOG_CHARSLICE_C("Garbage Collection"), + .num = 0, // Workaround, same as above + }; + + labels[label_pos++] = (ddog_prof_Label) { + .key = DDOG_CHARSLICE_C("state"), + .str = DDOG_CHARSLICE_C("had cpu"), + .num = 0, // Workaround, same as above + }; + + labels[label_pos++] = (ddog_prof_Label) { + .key = DDOG_CHARSLICE_C("event"), + .str = DDOG_CHARSLICE_C("gc"), + .num = 0, // Workaround, same as above + }; + + VALUE major_by = rb_gc_latest_gc_info(major_by_sym); + if (major_by != Qnil) { + labels[label_pos++] = (ddog_prof_Label) { + .key = DDOG_CHARSLICE_C("gc reason"), + .str = major_gc_reason_pretty(major_by), + }; + } + + labels[label_pos++] = (ddog_prof_Label) { + .key = DDOG_CHARSLICE_C("gc cause"), + .str = gc_cause_pretty(rb_gc_latest_gc_info(gc_by_sym)), + }; + + labels[label_pos++] = (ddog_prof_Label) { + .key = DDOG_CHARSLICE_C("gc type"), + .str = gc_type_pretty(major_by, rb_gc_latest_gc_info(state_sym)), + }; + + if (label_pos > max_label_count) { + rb_raise(rb_eRuntimeError, "BUG: gc_profiling_set_metadata unexpected label_pos (%d) > max_label_count (%d)", label_pos, max_label_count); + } + + return label_pos; +} + +static ddog_CharSlice major_gc_reason_pretty(VALUE major_gc_reason) { + if (major_gc_reason == nofree_sym ) return DDOG_CHARSLICE_C("not enough free slots (NOFREE)"); + if (major_gc_reason == oldgen_sym ) return DDOG_CHARSLICE_C("old generation full (OLDGEN)"); + if (major_gc_reason == shady_sym ) return DDOG_CHARSLICE_C("too many objects without write barriers (SHADY)"); + if (major_gc_reason == force_sym ) return DDOG_CHARSLICE_C("requested (FORCE)"); + if (major_gc_reason == oldmalloc_sym) return DDOG_CHARSLICE_C("heap bytes allocated threshold (OLDMALLOC)"); + return DDOG_CHARSLICE_C("unknown"); +} + +static ddog_CharSlice gc_cause_pretty(VALUE gc_cause) { + if (gc_cause == newobj_sym) return DDOG_CHARSLICE_C("object allocation"); + if (gc_cause == malloc_sym) return DDOG_CHARSLICE_C("malloc()"); + if (gc_cause == method_sym) return DDOG_CHARSLICE_C("GC.start()"); + if (gc_cause == capi_sym ) return DDOG_CHARSLICE_C("rb_gc()"); + if (gc_cause == stress_sym) return DDOG_CHARSLICE_C("stress"); + return DDOG_CHARSLICE_C("unknown"); +} + +static ddog_CharSlice gc_type_pretty(VALUE major_gc_reason, VALUE gc_state) { + if (major_gc_reason != Qnil) { + if (gc_state == marking_sym ) return DDOG_CHARSLICE_C("major (ongoing, marking)"); + if (gc_state == sweeping_sym) return DDOG_CHARSLICE_C("major (ongoing, sweeping)"); + return DDOG_CHARSLICE_C("major"); + } else { + // As we delay flushing events when a minor GC finishes, it's not relevant to include the observed state of the + // minor GC, as we often won't record a marking -> sweeping -> done cycle, as it happens too quickly. 
+ return DDOG_CHARSLICE_C("minor"); + } +} diff --git a/ext/ddtrace_profiling_native_extension/collectors_gc_profiling_helper.h b/ext/ddtrace_profiling_native_extension/collectors_gc_profiling_helper.h new file mode 100644 index 00000000000..7a5096624c6 --- /dev/null +++ b/ext/ddtrace_profiling_native_extension/collectors_gc_profiling_helper.h @@ -0,0 +1,5 @@ +#pragma once + +void gc_profiling_init(void); +bool gc_profiling_has_major_gc_finished(void); +uint8_t gc_profiling_set_metadata(ddog_prof_Label *labels, int labels_length); diff --git a/ext/ddtrace_profiling_native_extension/collectors_stack.c b/ext/ddtrace_profiling_native_extension/collectors_stack.c index 7611f13e41e..9bc0d11e0cb 100644 --- a/ext/ddtrace_profiling_native_extension/collectors_stack.c +++ b/ext/ddtrace_profiling_native_extension/collectors_stack.c @@ -36,22 +36,13 @@ static VALUE _native_sample( VALUE in_gc ); static void maybe_add_placeholder_frames_omitted(VALUE thread, sampling_buffer* buffer, char *frames_omitted_message, int frames_omitted_message_size); -static void record_placeholder_stack_in_native_code( - sampling_buffer* buffer, - VALUE recorder_instance, - sample_values values, - sample_labels labels, - sampling_buffer *record_buffer, - int extra_frames_in_record_buffer -); +static void record_placeholder_stack_in_native_code(sampling_buffer* buffer, VALUE recorder_instance, sample_values values, sample_labels labels); static void sample_thread_internal( VALUE thread, sampling_buffer* buffer, VALUE recorder_instance, sample_values values, - sample_labels labels, - sampling_buffer *record_buffer, - int extra_frames_in_record_buffer + sample_labels labels ); void collectors_stack_init(VALUE profiling_module) { @@ -88,6 +79,7 @@ static VALUE _native_sample( .cpu_or_wall_samples = NUM2UINT(rb_hash_lookup2(metric_values_hash, rb_str_new_cstr("cpu-samples"), zero)), .wall_time_ns = NUM2UINT(rb_hash_lookup2(metric_values_hash, rb_str_new_cstr("wall-time"), zero)), .alloc_samples = NUM2UINT(rb_hash_lookup2(metric_values_hash, rb_str_new_cstr("alloc-samples"), zero)), + .timeline_wall_time_ns = NUM2UINT(rb_hash_lookup2(metric_values_hash, rb_str_new_cstr("timeline"), zero)), }; long labels_count = RARRAY_LEN(labels_array) + RARRAY_LEN(numeric_labels_array); @@ -122,62 +114,29 @@ static VALUE _native_sample( ddog_prof_Slice_Label slice_labels = {.ptr = labels, .len = labels_count}; - sample_thread( - thread, - buffer, - recorder_instance, - values, - (sample_labels) {.labels = slice_labels, .state_label = state_label}, - RTEST(in_gc) ? 
SAMPLE_IN_GC : SAMPLE_REGULAR - ); + if (in_gc == Qtrue) { + record_placeholder_stack( + buffer, + recorder_instance, + values, + (sample_labels) {.labels = slice_labels, .state_label = state_label}, + DDOG_CHARSLICE_C("Garbage Collection") + ); + } else { + sample_thread( + thread, + buffer, + recorder_instance, + values, + (sample_labels) {.labels = slice_labels, .state_label = state_label} + ); + } sampling_buffer_free(buffer); return Qtrue; } -void sample_thread( - VALUE thread, - sampling_buffer* buffer, - VALUE recorder_instance, - sample_values values, - sample_labels labels, - sample_type type -) { - // Samples thread into recorder - if (type == SAMPLE_REGULAR) { - sampling_buffer *record_buffer = buffer; - int extra_frames_in_record_buffer = 0; - sample_thread_internal(thread, buffer, recorder_instance, values, labels, record_buffer, extra_frames_in_record_buffer); - return; - } - - // Samples thread into recorder, including as a top frame in the stack a frame named "Garbage Collection" - if (type == SAMPLE_IN_GC) { - ddog_CharSlice function_name = DDOG_CHARSLICE_C(""); - ddog_CharSlice function_filename = DDOG_CHARSLICE_C("Garbage Collection"); - buffer->locations[0] = (ddog_prof_Location) { - .function = (ddog_prof_Function) {.name = function_name, .filename = function_filename}, - .line = 0 - }; - // To avoid changing sample_thread_internal, we just prepare a new buffer struct that uses the same underlying storage as the - // original buffer, but has capacity one less, so that we can keep the above Garbage Collection frame untouched. - sampling_buffer thread_in_gc_buffer = (struct sampling_buffer) { - .max_frames = buffer->max_frames - 1, - .stack_buffer = buffer->stack_buffer + 1, - .lines_buffer = buffer->lines_buffer + 1, - .is_ruby_frame = buffer->is_ruby_frame + 1, - .locations = buffer->locations + 1, - }; - sampling_buffer *record_buffer = buffer; // We pass in the original buffer as the record_buffer, but not as the regular buffer - int extra_frames_in_record_buffer = 1; - sample_thread_internal(thread, &thread_in_gc_buffer, recorder_instance, values, labels, record_buffer, extra_frames_in_record_buffer); - return; - } - - rb_raise(rb_eArgError, "Unexpected value for sample_type: %d", type); -} - #define CHARSLICE_EQUALS(must_be_a_literal, charslice) (strlen("" must_be_a_literal) == charslice.len && strncmp(must_be_a_literal, charslice.ptr, charslice.len) == 0) // Idea: Should we release the global vm lock (GVL) after we get the data from `rb_profile_frames`? That way other Ruby threads @@ -189,24 +148,12 @@ void sample_thread( // * Should we move this into a different thread entirely? // * If we don't move it into a different thread, does releasing the GVL on a Ruby thread mean that we're introducing // a new thread switch point where there previously was none? -// -// --- -// -// Why the weird extra record_buffer and extra_frames_in_record_buffer? -// The answer is: to support both sample_thread() and sample_thread_in_gc(). -// -// For sample_thread(), buffer == record_buffer and extra_frames_in_record_buffer == 0, so it's a no-op. -// For sample_thread_in_gc(), the buffer is a special buffer that is the same as the record_buffer, but with every -// pointer shifted forward extra_frames_in_record_buffer elements, so that the caller can actually inject those extra -// frames, and this function doesn't have to care about it. 
-static void sample_thread_internal( +void sample_thread( VALUE thread, sampling_buffer* buffer, VALUE recorder_instance, sample_values values, - sample_labels labels, - sampling_buffer *record_buffer, - int extra_frames_in_record_buffer + sample_labels labels ) { int captured_frames = ddtrace_rb_profile_frames( thread, @@ -218,14 +165,7 @@ static void sample_thread_internal( ); if (captured_frames == PLACEHOLDER_STACK_IN_NATIVE_CODE) { - record_placeholder_stack_in_native_code( - buffer, - recorder_instance, - values, - labels, - record_buffer, - extra_frames_in_record_buffer - ); + record_placeholder_stack_in_native_code(buffer, recorder_instance, values, labels); return; } @@ -332,7 +272,7 @@ static void sample_thread_internal( record_sample( recorder_instance, - (ddog_prof_Slice_Location) {.ptr = record_buffer->locations, .len = captured_frames + extra_frames_in_record_buffer}, + (ddog_prof_Slice_Location) {.ptr = buffer->locations, .len = captured_frames}, values, labels ); @@ -379,24 +319,35 @@ static void maybe_add_placeholder_frames_omitted(VALUE thread, sampling_buffer* // // To give customers visibility into these threads, rather than reporting an empty stack, we replace the empty stack // with one containing a placeholder frame, so that these threads are properly represented in the UX. + static void record_placeholder_stack_in_native_code( + sampling_buffer* buffer, + VALUE recorder_instance, + sample_values values, + sample_labels labels +) { + record_placeholder_stack( + buffer, + recorder_instance, + values, + labels, + DDOG_CHARSLICE_C("In native code") + ); +} + +void record_placeholder_stack( sampling_buffer* buffer, VALUE recorder_instance, sample_values values, sample_labels labels, - sampling_buffer *record_buffer, - int extra_frames_in_record_buffer + ddog_CharSlice placeholder_stack ) { - ddog_CharSlice function_name = DDOG_CHARSLICE_C(""); - ddog_CharSlice function_filename = DDOG_CHARSLICE_C("In native code"); - buffer->locations[0] = (ddog_prof_Location) { - .function = (ddog_prof_Function) {.name = function_name, .filename = function_filename}, - .line = 0 - }; + ddog_prof_Function placeholder = {.name = DDOG_CHARSLICE_C(""), .filename = placeholder_stack}; + buffer->locations[0] = (ddog_prof_Location) {.function = placeholder, .line = 0}; record_sample( recorder_instance, - (ddog_prof_Slice_Location) {.ptr = record_buffer->locations, .len = 1 + extra_frames_in_record_buffer}, + (ddog_prof_Slice_Location) {.ptr = buffer->locations, .len = 1}, values, labels ); diff --git a/ext/ddtrace_profiling_native_extension/collectors_stack.h b/ext/ddtrace_profiling_native_extension/collectors_stack.h index e4175ae35e8..b026bbbdeb8 100644 --- a/ext/ddtrace_profiling_native_extension/collectors_stack.h +++ b/ext/ddtrace_profiling_native_extension/collectors_stack.h @@ -6,15 +6,19 @@ typedef struct sampling_buffer sampling_buffer; -typedef enum { SAMPLE_REGULAR, SAMPLE_IN_GC } sample_type; - void sample_thread( VALUE thread, + sampling_buffer* buffer, + VALUE recorder_instance, + sample_values values, + sample_labels labels +); +void record_placeholder_stack( sampling_buffer* buffer, VALUE recorder_instance, sample_values values, sample_labels labels, - sample_type type + ddog_CharSlice placeholder_stack ); sampling_buffer *sampling_buffer_new(unsigned int max_frames); void sampling_buffer_free(sampling_buffer *buffer); diff --git a/ext/ddtrace_profiling_native_extension/collectors_thread_context.c b/ext/ddtrace_profiling_native_extension/collectors_thread_context.c index 
618cdb23f5d..a86ce1dba9d 100644 --- a/ext/ddtrace_profiling_native_extension/collectors_thread_context.c +++ b/ext/ddtrace_profiling_native_extension/collectors_thread_context.c @@ -3,6 +3,7 @@ #include "collectors_thread_context.h" #include "clock_id.h" #include "collectors_stack.h" +#include "collectors_gc_profiling_helper.h" #include "helpers.h" #include "libdatadog_helpers.h" #include "private_vm_api_access.h" @@ -37,24 +38,29 @@ // When `thread_context_collector_on_gc_start` gets called, the current cpu and wall-time get recorded to the thread // context: `cpu_time_at_gc_start_ns` and `wall_time_at_gc_start_ns`. // -// While these fields are set, regular samples (if any) do not account for any time that passes after these two -// timestamps. +// While `cpu_time_at_gc_start_ns` is set, regular samples (if any) do not account for any cpu-time that passes +// after this timestamp. The idea is that this cpu-time will be blamed separately on GC, and not on the user thread. +// Wall-time accounting is not affected by this (e.g. we still record 60 seconds every 60 seconds). // -// (Regular samples can still account for the time between the previous sample and the start of GC.) +// (Regular samples can still account for the cpu-time between the previous sample and the start of GC.) // -// When `thread_context_collector_on_gc_finish` gets called, the current cpu and wall-time again get recorded to the -// thread context: `cpu_time_at_gc_finish_ns` and `wall_time_at_gc_finish_ns`. +// When `thread_context_collector_on_gc_finish` gets called, the cpu-time and wall-time spent during GC gets recorded +// into the global gc_tracking structure, and further samples are not affected. (The `cpu_time_at_previous_sample_ns` +// of the thread that did GC also gets adjusted to avoid double-accounting.) // -// Finally, when `thread_context_collector_sample_after_gc` gets called, the following happens: +// Finally, when `thread_context_collector_sample_after_gc` gets called, a sample gets recorded with a stack having +// a single placeholder `Garbage Collection` frame. This sample gets +// assigned the cpu-time and wall-time that was recorded between calls to `on_gc_start` and `on_gc_finish`, as well +// as metadata for the last GC. // -// 1. A sample gets taken, using the special `SAMPLE_IN_GC` sample type, which produces a stack with a placeholder -// `Garbage Collection` frame as the latest frame. This sample gets assigned the cpu-time and wall-time period that was -// recorded between calls to `on_gc_start` and `on_gc_finish`. -// -// 2. The thread is no longer marked as being in gc (all gc tracking fields get reset back to `INVALID_TIME`). -// -// 3. The `cpu_time_at_previous_sample_ns` and `wall_time_at_previous_sample_ns` get updated with the elapsed time in -// GC, so that all time is accounted for -- e.g. the next sample will not get "blamed" by time spent in GC. +// Note that the Ruby GC does not usually do all of the GC work in one go. Instead, it breaks it up into smaller steps +// so that the application can keep doing user work in between GC steps. +// The `on_gc_start` / `on_gc_finish` hooks will trigger each time the VM executes one of these smaller steps, and on a benchmark +// that executes `Object.new` in a loop, I measured more than 50k of these steps per second (!!). +// Creating this many events for every GC step is a lot of overhead, so instead `on_gc_finish` coalesces time +// spent in GC and only flushes it at most every 10 ms or on every complete GC cycle. 
This reduces the amount of +// individual GC events we need to record. We use the latest GC metadata for this event, reflecting the last GC that +// happened in the coalesced period. // // In an earlier attempt at implementing this functionality (https://github.com/DataDog/dd-trace-rb/pull/2308), we // discovered that we needed to factor the sampling work away from `thread_context_collector_on_gc_finish` and into a @@ -68,6 +74,7 @@ #define IS_WALL_TIME true #define IS_NOT_WALL_TIME false #define MISSING_TRACER_CONTEXT_KEY 0 +#define TIME_BETWEEN_GC_EVENTS_NS MILLIS_AS_NS(10) static ID at_active_span_id; // id of :@active_span in Ruby static ID at_active_trace_id; // id of :@active_trace in Ruby @@ -114,6 +121,14 @@ struct thread_context_collector_state { // See thread_context_collector_on_gc_start for details unsigned int gc_samples_missed_due_to_missing_context; } stats; + + struct { + unsigned long accumulated_cpu_time_ns; + unsigned long accumulated_wall_time_ns; + + long wall_time_at_previous_gc_ns; // Will be INVALID_TIME unless there's accumulated time above + long wall_time_at_last_flushed_gc_event_ns; // Starts at 0 and then will always be valid + } gc_tracking; }; // Tracks per-thread state @@ -127,15 +142,10 @@ struct per_thread_context { long wall_time_at_previous_sample_ns; // Can be INVALID_TIME until initialized struct { - // Both of these fields are set by on_gc_start and kept until sample_after_gc is called. + // Both of these fields are set by on_gc_start and kept until on_gc_finish is called. // Outside of this window, they will be INVALID_TIME. long cpu_time_at_start_ns; long wall_time_at_start_ns; - - // Both of these fields are set by on_gc_finish and kept until sample_after_gc is called. - // Outside of this window, they will be INVALID_TIME. 
- long cpu_time_at_finish_ns; - long wall_time_at_finish_ns; } gc_tracking; }; @@ -180,7 +190,6 @@ static void trigger_sample_for_thread( VALUE stack_from_thread, struct per_thread_context *thread_context, sample_values values, - sample_type type, long current_monotonic_wall_time_ns, ddog_CharSlice *ruby_vm_type, ddog_CharSlice *class_name @@ -193,6 +202,7 @@ static VALUE _native_inspect(VALUE self, VALUE collector_instance); static VALUE per_thread_context_st_table_as_ruby_hash(struct thread_context_collector_state *state); static int per_thread_context_as_ruby_hash(st_data_t key_thread, st_data_t value_context, st_data_t result_hash); static VALUE stats_as_ruby_hash(struct thread_context_collector_state *state); +static VALUE gc_tracking_as_ruby_hash(struct thread_context_collector_state *state); static void remove_context_for_dead_threads(struct thread_context_collector_state *state); static int remove_if_dead_thread(st_data_t key_thread, st_data_t value_context, st_data_t _argument); static VALUE _native_per_thread_context(VALUE self, VALUE collector_instance); @@ -200,13 +210,14 @@ static long update_time_since_previous_sample(long *time_at_previous_sample_ns, static long cpu_time_now_ns(struct per_thread_context *thread_context); static long thread_id_for(VALUE thread); static VALUE _native_stats(VALUE self, VALUE collector_instance); +static VALUE _native_gc_tracking(VALUE self, VALUE collector_instance); static void trace_identifiers_for(struct thread_context_collector_state *state, VALUE thread, struct trace_identifiers *trace_identifiers_result); static bool should_collect_resource(VALUE root_span_type); static VALUE _native_reset_after_fork(DDTRACE_UNUSED VALUE self, VALUE collector_instance); static VALUE thread_list(struct thread_context_collector_state *state); static VALUE _native_sample_allocation(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE sample_weight, VALUE new_object); static VALUE _native_new_empty_thread(VALUE self); -ddog_CharSlice ruby_value_type_to_class_name(enum ruby_value_type type); +static ddog_CharSlice ruby_value_type_to_class_name(enum ruby_value_type type); void collectors_thread_context_init(VALUE profiling_module) { VALUE collectors_module = rb_define_module_under(profiling_module, "Collectors"); @@ -235,6 +246,7 @@ void collectors_thread_context_init(VALUE profiling_module) { rb_define_singleton_method(testing_module, "_native_thread_list", _native_thread_list, 0); rb_define_singleton_method(testing_module, "_native_per_thread_context", _native_per_thread_context, 1); rb_define_singleton_method(testing_module, "_native_stats", _native_stats, 1); + rb_define_singleton_method(testing_module, "_native_gc_tracking", _native_gc_tracking, 1); rb_define_singleton_method(testing_module, "_native_new_empty_thread", _native_new_empty_thread, 0); at_active_span_id = rb_intern_const("@active_span"); @@ -243,6 +255,8 @@ void collectors_thread_context_init(VALUE profiling_module) { at_resource_id = rb_intern_const("@resource"); at_root_span_id = rb_intern_const("@root_span"); at_type_id = rb_intern_const("@type"); + + gc_profiling_init(); } // This structure is used to define a Ruby object that stores a pointer to a struct thread_context_collector_state @@ -320,6 +334,8 @@ static VALUE _native_new(VALUE klass) { state->allocation_type_enabled = true; state->time_converter_state = (monotonic_to_system_epoch_state) MONOTONIC_TO_SYSTEM_EPOCH_INITIALIZER; state->main_thread = rb_thread_main(); + state->gc_tracking.wall_time_at_previous_gc_ns = 
INVALID_TIME; + state->gc_tracking.wall_time_at_last_flushed_gc_event_ns = 0; return TypedData_Wrap_Struct(klass, &thread_context_collector_typed_data, state); } @@ -465,7 +481,11 @@ void update_metrics_and_sample( long wall_time_elapsed_ns = update_time_since_previous_sample( &thread_context->wall_time_at_previous_sample_ns, current_monotonic_wall_time_ns, - thread_context->gc_tracking.wall_time_at_start_ns, + // We explicitly pass in `INVALID_TIME` as an argument for `gc_start_time_ns` here because we don't want wall-time + // accounting to change during GC. + // E.g. if 60 seconds pass in the real world, 60 seconds of wall-time are recorded, regardless of the thread doing + // GC or not. + INVALID_TIME, IS_WALL_TIME ); @@ -475,7 +495,6 @@ void update_metrics_and_sample( stack_from_thread, thread_context, (sample_values) {.cpu_time_ns = cpu_time_elapsed_ns, .cpu_or_wall_samples = 1, .wall_time_ns = wall_time_elapsed_ns}, - SAMPLE_REGULAR, current_monotonic_wall_time_ns, NULL, NULL @@ -484,7 +503,7 @@ void update_metrics_and_sample( // This function gets called when Ruby is about to start running the Garbage Collector on the current thread. // It updates the per_thread_context of the current thread to include the current cpu/wall times, to be used to later -// create a stack sample that blames the cpu/wall time spent from now until the end of the garbage collector work. +// create an event including the cpu/wall time spent in garbage collector work. // // Safety: This function gets called while Ruby is doing garbage collection. While Ruby is doing garbage collection, // *NO ALLOCATION* is allowed. This function, and any it calls must never trigger memory or object allocation. @@ -509,27 +528,14 @@ void thread_context_collector_on_gc_start(VALUE self_instance) { return; } - // If these fields are set, there's an existing GC sample that still needs to be written out by `sample_after_gc`. - // - // When can this happen? Because we don't have precise control over when `sample_after_gc` gets called (it will be - // called sometime after GC finishes), there is no way to guarantee that Ruby will not trigger more than one GC cycle - // before we can actually run that method. - // - // We handle this by collapsing multiple GC cycles into one. That is, if the following happens: - // `on_gc_start` (time=0) -> `on_gc_finish` (time=1) -> `on_gc_start` (time=2) -> `on_gc_finish` (time=3) -> `sample_after_gc` - // then we just use time=0 from the first on_gc_start and time=3 from the last on_gc_finish, e.g. we behave as if - // there was a single, longer GC period. - if (thread_context->gc_tracking.cpu_time_at_finish_ns != INVALID_TIME && - thread_context->gc_tracking.wall_time_at_finish_ns != INVALID_TIME) return; - - // Here we record the wall-time first and in on_gc_finish we record it second to avoid having wall-time be slightly < cpu-time + // Here we record the wall-time first and in on_gc_finish we record it second to try to avoid having wall-time be slightly < cpu-time thread_context->gc_tracking.wall_time_at_start_ns = monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE); thread_context->gc_tracking.cpu_time_at_start_ns = cpu_time_now_ns(thread_context); } // This function gets called when Ruby has finished running the Garbage Collector on the current thread. -// It updates the per_thread_context of the current thread to include the current cpu/wall times, to be used to later -// create a stack sample that blames the cpu/wall time spent from the start of garbage collector work until now. 
+// It records the cpu/wall-time observed during GC, which will be used to later +// create an event including the cpu/wall time spent from the start of garbage collector work until now. // // Safety: This function gets called while Ruby is doing garbage collection. While Ruby is doing garbage collection, // *NO ALLOCATION* is allowed. This function, and any it calls must never trigger memory or object allocation. @@ -537,9 +543,9 @@ void thread_context_collector_on_gc_start(VALUE self_instance) { // // Assumption 1: This function is called in a thread that is holding the Global VM Lock. Caller is responsible for enforcing this. // Assumption 2: This function is called from the main Ractor (if Ruby has support for Ractors). -void thread_context_collector_on_gc_finish(VALUE self_instance) { +bool thread_context_collector_on_gc_finish(VALUE self_instance) { struct thread_context_collector_state *state; - if (!rb_typeddata_is_kind_of(self_instance, &thread_context_collector_typed_data)) return; + if (!rb_typeddata_is_kind_of(self_instance, &thread_context_collector_typed_data)) return false; // This should never fail if the above check passes TypedData_Get_Struct(self_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state); @@ -547,29 +553,67 @@ // If there was no previously-existing context for this thread, we won't allocate one (see safety). We keep a metric for // how often this happens -- see on_gc_start. - if (thread_context == NULL) return; + if (thread_context == NULL) return false; - if (thread_context->gc_tracking.cpu_time_at_start_ns == INVALID_TIME && - thread_context->gc_tracking.wall_time_at_start_ns == INVALID_TIME) { + long cpu_time_at_start_ns = thread_context->gc_tracking.cpu_time_at_start_ns; + long wall_time_at_start_ns = thread_context->gc_tracking.wall_time_at_start_ns; + + if (cpu_time_at_start_ns == INVALID_TIME && wall_time_at_start_ns == INVALID_TIME) { // If this happened, it means that on_gc_start was either never called for the thread OR it was called but no thread // context existed at the time. The former can be the result of a bug, but since we can't distinguish them, we just // do nothing. - return; + return false; + } + + // Mark thread as no longer in GC + thread_context->gc_tracking.cpu_time_at_start_ns = INVALID_TIME; + thread_context->gc_tracking.wall_time_at_start_ns = INVALID_TIME; + + // Here we record the wall-time second and in on_gc_start we record it first to try to avoid having wall-time be slightly < cpu-time + long cpu_time_at_finish_ns = cpu_time_now_ns(thread_context); + long wall_time_at_finish_ns = monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE); + + // If our end timestamp is not OK, we bail out + if (wall_time_at_finish_ns == 0) return false; + + long gc_cpu_time_elapsed_ns = cpu_time_at_finish_ns - cpu_time_at_start_ns; + long gc_wall_time_elapsed_ns = wall_time_at_finish_ns - wall_time_at_start_ns; + + // Wall-time can go backwards if the system clock gets changed (and we observed spurious jumps back on macOS as well) + // so let's ensure we don't get negative values for time deltas. 
+ gc_cpu_time_elapsed_ns = long_max_of(gc_cpu_time_elapsed_ns, 0); + gc_wall_time_elapsed_ns = long_max_of(gc_wall_time_elapsed_ns, 0); + + if (state->gc_tracking.wall_time_at_previous_gc_ns == INVALID_TIME) { + state->gc_tracking.accumulated_cpu_time_ns = 0; + state->gc_tracking.accumulated_wall_time_ns = 0; } - // Here we record the wall-time second and in on_gc_start we record it first to avoid having wall-time be slightly < cpu-time - thread_context->gc_tracking.cpu_time_at_finish_ns = cpu_time_now_ns(thread_context); - thread_context->gc_tracking.wall_time_at_finish_ns = monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE); + state->gc_tracking.accumulated_cpu_time_ns += gc_cpu_time_elapsed_ns; + state->gc_tracking.accumulated_wall_time_ns += gc_wall_time_elapsed_ns; + state->gc_tracking.wall_time_at_previous_gc_ns = wall_time_at_finish_ns; + + // Update cpu-time accounting so it doesn't include the cpu-time spent in GC during the next sample + // We don't update the wall-time because we don't subtract the wall-time spent in GC (see call to + // `update_time_since_previous_sample` for wall-time in `update_metrics_and_sample`). + if (thread_context->cpu_time_at_previous_sample_ns != INVALID_TIME) { + thread_context->cpu_time_at_previous_sample_ns += gc_cpu_time_elapsed_ns; + } + + // Let the caller know if it should schedule a flush or not. Returning true every time would cause a lot of overhead + // on the application (see GC tracking introduction at the top of the file), so instead we try to accumulate a few + // samples first. + bool finished_major_gc = gc_profiling_has_major_gc_finished(); + bool over_flush_time_threshold = + (wall_time_at_finish_ns - state->gc_tracking.wall_time_at_last_flushed_gc_event_ns) >= TIME_BETWEEN_GC_EVENTS_NS; + + return finished_major_gc || over_flush_time_threshold; } -// This function gets called shortly after Ruby has finished running the Garbage Collector. +// This function gets called after one or more GC work steps (calls to on_gc_start/on_gc_finish). // It creates a new sample including the cpu and wall-time spent on garbage collector work, and resets any // GC-related tracking. // -// Specifically, it will search for thread(s) which have gone through a cycle of on_gc_start/on_gc_finish -// and thus have cpu_time_at_start_ns, cpu_time_at_finish_ns, wall_time_at_start_ns, wall_time_at_finish_ns -// set on their context. -// // Assumption 1: This function is called in a thread that is holding the Global VM Lock. Caller is responsible for enforcing this. // Assumption 2: This function is allowed to raise exceptions. Caller is responsible for handling them, if needed. // Assumption 3: Unlike `on_gc_start` and `on_gc_finish`, this method is allowed to allocate memory as needed. 
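To make the coalescing above concrete, here is a minimal plain-Ruby sketch of the flush decision that `thread_context_collector_on_gc_finish` implements. It is illustrative only, not part of the patch: the `GcCoalescer` name is hypothetical, but the 10 ms threshold mirrors `TIME_BETWEEN_GC_EVENTS_NS`, and the major-GC check mirrors `gc_profiling_has_major_gc_finished`, which reads the same GC information this sketch queries via `GC.latest_gc_info`.

```ruby
# Illustrative-only Ruby model of the coalescing done in C; GcCoalescer is a
# hypothetical name, not something this patch adds.
class GcCoalescer
  TIME_BETWEEN_GC_EVENTS_NS = 10 * 1_000_000 # 10 ms, as in the C code

  def initialize
    @accumulated_cpu_time_ns = 0
    @accumulated_wall_time_ns = 0
    @wall_time_at_last_flushed_gc_event_ns = 0
  end

  # Called at the end of every GC step; returns true when the caller should
  # schedule a flush (the equivalent of registering sample_after_gc).
  def on_gc_finish(gc_cpu_time_elapsed_ns, gc_wall_time_elapsed_ns, wall_time_at_finish_ns)
    # Clamp deltas so clock adjustments never produce negative durations
    @accumulated_cpu_time_ns += [gc_cpu_time_elapsed_ns, 0].max
    @accumulated_wall_time_ns += [gc_wall_time_elapsed_ns, 0].max

    # A major GC has fully finished when the VM reports no GC in progress and
    # the latest GC had a major_by reason
    finished_major_gc =
      GC.latest_gc_info(:state) == :none && !GC.latest_gc_info(:major_by).nil?

    over_flush_time_threshold =
      (wall_time_at_finish_ns - @wall_time_at_last_flushed_gc_event_ns) >= TIME_BETWEEN_GC_EVENTS_NS

    finished_major_gc || over_flush_time_threshold
  end
end
```

When the flush finally happens, `thread_context_collector_sample_after_gc` attaches the accumulated cpu/wall-time to a single placeholder `Garbage Collection` sample, as seen in the hunk below.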
@@ -578,70 +622,45 @@ VALUE thread_context_collector_sample_after_gc(VALUE self_instance) { struct thread_context_collector_state *state; TypedData_Get_Struct(self_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state); - VALUE threads = thread_list(state); - bool sampled_any_thread = false; - - const long thread_count = RARRAY_LEN(threads); - for (long i = 0; i < thread_count; i++) { - VALUE thread = RARRAY_AREF(threads, i); - struct per_thread_context *thread_context = get_or_create_context_for(thread, state); + if (state->gc_tracking.wall_time_at_previous_gc_ns == INVALID_TIME) { + rb_raise(rb_eRuntimeError, "BUG: Unexpected call to sample_after_gc without valid GC information available"); + } - if ( - thread_context->gc_tracking.cpu_time_at_start_ns == INVALID_TIME || - thread_context->gc_tracking.cpu_time_at_finish_ns == INVALID_TIME || - thread_context->gc_tracking.wall_time_at_start_ns == INVALID_TIME || - thread_context->gc_tracking.wall_time_at_finish_ns == INVALID_TIME - ) continue; // Ignore threads with no/incomplete garbage collection data - - sampled_any_thread = true; - - long gc_cpu_time_elapsed_ns = - thread_context->gc_tracking.cpu_time_at_finish_ns - thread_context->gc_tracking.cpu_time_at_start_ns; - long gc_wall_time_elapsed_ns = - thread_context->gc_tracking.wall_time_at_finish_ns - thread_context->gc_tracking.wall_time_at_start_ns; - - // We don't expect non-wall time to go backwards, so let's flag this as a bug - if (gc_cpu_time_elapsed_ns < 0) rb_raise(rb_eRuntimeError, "BUG: Unexpected negative gc_cpu_time_elapsed_ns between samples"); - // Wall-time can actually go backwards (e.g. when the system clock gets set) so we can't assume time going backwards - // was a bug. - // @ivoanjo: I've also observed time going backwards spuriously on macOS, see discussion on - // https://github.com/DataDog/dd-trace-rb/pull/2336. 
- if (gc_wall_time_elapsed_ns < 0) gc_wall_time_elapsed_ns = 0; - - if (thread_context->gc_tracking.wall_time_at_start_ns == 0 && thread_context->gc_tracking.wall_time_at_finish_ns != 0) { - // Avoid using wall-clock if we got 0 for a start (meaning there was an error) but not 0 for finish so we don't - // come up with a crazy value for the frame - rb_raise(rb_eRuntimeError, "BUG: Unexpected zero value for gc_tracking.wall_time_at_start_ns"); - } + int max_labels_needed_for_gc = 7; // Magic number gets validated inside gc_profiling_set_metadata + ddog_prof_Label labels[max_labels_needed_for_gc]; + uint8_t label_pos = gc_profiling_set_metadata(labels, max_labels_needed_for_gc); - trigger_sample_for_thread( - state, - /* thread: */ thread, - /* stack_from_thread: */ thread, - thread_context, - (sample_values) {.cpu_time_ns = gc_cpu_time_elapsed_ns, .cpu_or_wall_samples = 1, .wall_time_ns = gc_wall_time_elapsed_ns}, - SAMPLE_IN_GC, - INVALID_TIME, // For now we're not collecting timestamps for these events - NULL, - NULL - ); + ddog_prof_Slice_Label slice_labels = {.ptr = labels, .len = label_pos}; - // Mark thread as no longer in GC - thread_context->gc_tracking.cpu_time_at_start_ns = INVALID_TIME; - thread_context->gc_tracking.cpu_time_at_finish_ns = INVALID_TIME; - thread_context->gc_tracking.wall_time_at_start_ns = INVALID_TIME; - thread_context->gc_tracking.wall_time_at_finish_ns = INVALID_TIME; + // The end_timestamp_ns is treated specially by libdatadog and that's why it's not added as a ddog_prof_Label + int64_t end_timestamp_ns = 0; - // Update counters so that they won't include the time in GC during the next sample - if (thread_context->cpu_time_at_previous_sample_ns != INVALID_TIME) { - thread_context->cpu_time_at_previous_sample_ns += gc_cpu_time_elapsed_ns; - } - if (thread_context->wall_time_at_previous_sample_ns != INVALID_TIME) { - thread_context->wall_time_at_previous_sample_ns += gc_wall_time_elapsed_ns; - } + if (state->timeline_enabled) { + end_timestamp_ns = monotonic_to_system_epoch_ns(&state->time_converter_state, state->gc_tracking.wall_time_at_previous_gc_ns); } - if (sampled_any_thread) state->stats.gc_samples++; + record_placeholder_stack( + state->sampling_buffer, + state->recorder_instance, + (sample_values) { + // This event gets both a regular cpu/wall-time duration, as a normal cpu/wall-time sample would, as well as a + // timeline duration. 
+ // This is done to enable two use-cases: + // * regular cpu/wall-time makes this event show up as a regular stack in the flamegraph + // * the timeline duration is used when the event shows up in the timeline + .cpu_time_ns = state->gc_tracking.accumulated_cpu_time_ns, + .cpu_or_wall_samples = 1, + .wall_time_ns = state->gc_tracking.accumulated_wall_time_ns, + .timeline_wall_time_ns = state->gc_tracking.accumulated_wall_time_ns, + }, + (sample_labels) {.labels = slice_labels, .state_label = NULL, .end_timestamp_ns = end_timestamp_ns}, + DDOG_CHARSLICE_C("Garbage Collection") + ); + + state->gc_tracking.wall_time_at_last_flushed_gc_event_ns = state->gc_tracking.wall_time_at_previous_gc_ns; + state->gc_tracking.wall_time_at_previous_gc_ns = INVALID_TIME; + + state->stats.gc_samples++; // Return a VALUE to make it easier to call this function from Ruby APIs that expect a return value (such as rb_rescue2) return Qnil; @@ -653,7 +672,6 @@ static void trigger_sample_for_thread( VALUE stack_from_thread, // This can be different when attributing profiler overhead using a different stack struct per_thread_context *thread_context, sample_values values, - sample_type type, long current_monotonic_wall_time_ns, // These two labels are only used for allocation profiling; @ivoanjo: may want to refactor this at some point? ddog_CharSlice *ruby_vm_type, @@ -776,8 +794,7 @@ static void trigger_sample_for_thread( state->sampling_buffer, state->recorder_instance, values, - (sample_labels) {.labels = slice_labels, .state_label = state_label, .end_timestamp_ns = end_timestamp_ns}, - type + (sample_labels) {.labels = slice_labels, .state_label = state_label, .end_timestamp_ns = end_timestamp_ns} ); } @@ -874,9 +891,7 @@ static void initialize_context(VALUE thread, struct per_thread_context *thread_c // These will only be used during a GC operation thread_context->gc_tracking.cpu_time_at_start_ns = INVALID_TIME; - thread_context->gc_tracking.cpu_time_at_finish_ns = INVALID_TIME; thread_context->gc_tracking.wall_time_at_start_ns = INVALID_TIME; - thread_context->gc_tracking.wall_time_at_finish_ns = INVALID_TIME; } static VALUE _native_inspect(DDTRACE_UNUSED VALUE _self, VALUE collector_instance) { @@ -901,6 +916,7 @@ static VALUE _native_inspect(DDTRACE_UNUSED VALUE _self, VALUE collector_instanc state->time_converter_state.delta_to_epoch_ns )); rb_str_concat(result, rb_sprintf(" main_thread=%"PRIsVALUE, state->main_thread)); + rb_str_concat(result, rb_sprintf(" gc_tracking=%"PRIsVALUE, gc_tracking_as_ruby_hash(state))); return result; } @@ -927,9 +943,7 @@ static int per_thread_context_as_ruby_hash(st_data_t key_thread, st_data_t value ID2SYM(rb_intern("wall_time_at_previous_sample_ns")), /* => */ LONG2NUM(thread_context->wall_time_at_previous_sample_ns), ID2SYM(rb_intern("gc_tracking.cpu_time_at_start_ns")), /* => */ LONG2NUM(thread_context->gc_tracking.cpu_time_at_start_ns), - ID2SYM(rb_intern("gc_tracking.cpu_time_at_finish_ns")), /* => */ LONG2NUM(thread_context->gc_tracking.cpu_time_at_finish_ns), ID2SYM(rb_intern("gc_tracking.wall_time_at_start_ns")), /* => */ LONG2NUM(thread_context->gc_tracking.wall_time_at_start_ns), - ID2SYM(rb_intern("gc_tracking.wall_time_at_finish_ns")), /* => */ LONG2NUM(thread_context->gc_tracking.wall_time_at_finish_ns) }; for (long unsigned int i = 0; i < VALUE_COUNT(arguments); i += 2) rb_hash_aset(context_as_hash, arguments[i], arguments[i+1]); @@ -947,6 +961,19 @@ static VALUE stats_as_ruby_hash(struct thread_context_collector_state *state) { return stats_as_hash; } +static 
VALUE gc_tracking_as_ruby_hash(struct thread_context_collector_state *state) { + // Update this when modifying state struct (gc_tracking inner struct) + VALUE result = rb_hash_new(); + VALUE arguments[] = { + ID2SYM(rb_intern("accumulated_cpu_time_ns")), /* => */ ULONG2NUM(state->gc_tracking.accumulated_cpu_time_ns), + ID2SYM(rb_intern("accumulated_wall_time_ns")), /* => */ ULONG2NUM(state->gc_tracking.accumulated_wall_time_ns), + ID2SYM(rb_intern("wall_time_at_previous_gc_ns")), /* => */ LONG2NUM(state->gc_tracking.wall_time_at_previous_gc_ns), + ID2SYM(rb_intern("wall_time_at_last_flushed_gc_event_ns")), /* => */ LONG2NUM(state->gc_tracking.wall_time_at_last_flushed_gc_event_ns), + }; + for (long unsigned int i = 0; i < VALUE_COUNT(arguments); i += 2) rb_hash_aset(result, arguments[i], arguments[i+1]); + return result; +} + static void remove_context_for_dead_threads(struct thread_context_collector_state *state) { st_foreach(state->hash_map_per_thread_context, remove_if_dead_thread, 0 /* unused */); } @@ -1049,8 +1076,6 @@ VALUE enforce_thread_context_collector_instance(VALUE object) { // This method exists only to enable testing Datadog::Profiling::Collectors::ThreadContext behavior using RSpec. // It SHOULD NOT be used for other purposes. -// -// Returns the whole contents of the per_thread_context structs being tracked. static VALUE _native_stats(DDTRACE_UNUSED VALUE _self, VALUE collector_instance) { struct thread_context_collector_state *state; TypedData_Get_Struct(collector_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state); @@ -1058,6 +1083,15 @@ static VALUE _native_stats(DDTRACE_UNUSED VALUE _self, VALUE collector_instance) return stats_as_ruby_hash(state); } +// This method exists only to enable testing Datadog::Profiling::Collectors::ThreadContext behavior using RSpec. +// It SHOULD NOT be used for other purposes. +static VALUE _native_gc_tracking(DDTRACE_UNUSED VALUE _self, VALUE collector_instance) { + struct thread_context_collector_state *state; + TypedData_Get_Struct(collector_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state); + + return gc_tracking_as_ruby_hash(state); +} + // Assumption 1: This function is called in a thread that is holding the Global VM Lock. Caller is responsible for enforcing this. 
static void trace_identifiers_for(struct thread_context_collector_state *state, VALUE thread, struct trace_identifiers *trace_identifiers_result) { if (state->tracer_context_key == MISSING_TRACER_CONTEXT_KEY) return; @@ -1218,7 +1252,6 @@ void thread_context_collector_sample_allocation(VALUE self_instance, unsigned in /* stack_from_thread: */ current_thread, get_or_create_context_for(current_thread, state), (sample_values) {.alloc_samples = sample_weight}, - SAMPLE_REGULAR, INVALID_TIME, // For now we're not collecting timestamps for allocation events, as per profiling team internal discussions &ruby_vm_type, optional_class_name @@ -1241,7 +1274,7 @@ static VALUE _native_new_empty_thread(DDTRACE_UNUSED VALUE self) { return rb_thread_create(new_empty_thread_inner, NULL); } -ddog_CharSlice ruby_value_type_to_class_name(enum ruby_value_type type) { +static ddog_CharSlice ruby_value_type_to_class_name(enum ruby_value_type type) { switch (type) { case(RUBY_T_OBJECT ): return DDOG_CHARSLICE_C("Object"); case(RUBY_T_CLASS ): return DDOG_CHARSLICE_C("Class"); diff --git a/ext/ddtrace_profiling_native_extension/collectors_thread_context.h b/ext/ddtrace_profiling_native_extension/collectors_thread_context.h index 88dfabe9615..6299d96b43e 100644 --- a/ext/ddtrace_profiling_native_extension/collectors_thread_context.h +++ b/ext/ddtrace_profiling_native_extension/collectors_thread_context.h @@ -1,6 +1,7 @@ #pragma once #include <ruby.h> +#include <stdbool.h> void thread_context_collector_sample( VALUE self_instance, @@ -10,5 +11,5 @@ void thread_context_collector_sample_allocation(VALUE self_instance, unsigned int sample_weight, VALUE new_object); VALUE thread_context_collector_sample_after_gc(VALUE self_instance); void thread_context_collector_on_gc_start(VALUE self_instance); -void thread_context_collector_on_gc_finish(VALUE self_instance); +bool thread_context_collector_on_gc_finish(VALUE self_instance); VALUE enforce_thread_context_collector_instance(VALUE object); diff --git a/ext/ddtrace_profiling_native_extension/helpers.h b/ext/ddtrace_profiling_native_extension/helpers.h index f06c96710ec..d71a513de9f 100644 --- a/ext/ddtrace_profiling_native_extension/helpers.h +++ b/ext/ddtrace_profiling_native_extension/helpers.h @@ -15,3 +15,5 @@ // don't like C and I just implemented this as a function. inline static uint64_t uint64_max_of(uint64_t a, uint64_t b) { return a > b ? a : b; } inline static uint64_t uint64_min_of(uint64_t a, uint64_t b) { return a > b ? b : a; } +inline static long long_max_of(long a, long b) { return a > b ? a : b; } +inline static long long_min_of(long a, long b) { return a > b ? 
b : a; } diff --git a/ext/ddtrace_profiling_native_extension/stack_recorder.c b/ext/ddtrace_profiling_native_extension/stack_recorder.c index 4918f8caea8..55ac0b25a1a 100644 --- a/ext/ddtrace_profiling_native_extension/stack_recorder.c +++ b/ext/ddtrace_profiling_native_extension/stack_recorder.c @@ -151,15 +151,19 @@ static VALUE error_symbol = Qnil; // :error in Ruby #define WALL_TIME_VALUE_ID 2 #define ALLOC_SAMPLES_VALUE {.type_ = VALUE_STRING("alloc-samples"), .unit = VALUE_STRING("count")} #define ALLOC_SAMPLES_VALUE_ID 3 -#define HEAP_SAMPLES_VALUE {.type_ = VALUE_STRING("heap-live-samples"), .unit = VALUE_STRING("count")} +#define HEAP_SAMPLES_VALUE {.type_ = VALUE_STRING("heap-live-samples"), .unit = VALUE_STRING("count")} #define HEAP_SAMPLES_VALUE_ID 4 +#define TIMELINE_VALUE {.type_ = VALUE_STRING("timeline"), .unit = VALUE_STRING("nanoseconds")} +#define TIMELINE_VALUE_ID 5 -static const ddog_prof_ValueType all_value_types[] = {CPU_TIME_VALUE, CPU_SAMPLES_VALUE, WALL_TIME_VALUE, ALLOC_SAMPLES_VALUE, HEAP_SAMPLES_VALUE}; +static const ddog_prof_ValueType all_value_types[] = + {CPU_TIME_VALUE, CPU_SAMPLES_VALUE, WALL_TIME_VALUE, ALLOC_SAMPLES_VALUE, HEAP_SAMPLES_VALUE, TIMELINE_VALUE}; // This array MUST be kept in sync with all_value_types above and is intended to act as a "hashmap" between VALUE_ID and the position it // occupies on the all_value_types array. // E.g. all_value_types_positions[CPU_TIME_VALUE_ID] => 0, means that CPU_TIME_VALUE was declared at position 0 of all_value_types. -static const uint8_t all_value_types_positions[] = {CPU_TIME_VALUE_ID, CPU_SAMPLES_VALUE_ID, WALL_TIME_VALUE_ID, ALLOC_SAMPLES_VALUE_ID, HEAP_SAMPLES_VALUE_ID}; +static const uint8_t all_value_types_positions[] = + {CPU_TIME_VALUE_ID, CPU_SAMPLES_VALUE_ID, WALL_TIME_VALUE_ID, ALLOC_SAMPLES_VALUE_ID, HEAP_SAMPLES_VALUE_ID, TIMELINE_VALUE_ID}; #define ALL_VALUE_TYPES_COUNT (sizeof(all_value_types) / sizeof(ddog_prof_ValueType)) @@ -203,7 +207,14 @@ static VALUE _native_new(VALUE klass); static void initialize_slot_concurrency_control(struct stack_recorder_state *state); static void initialize_profiles(struct stack_recorder_state *state, ddog_prof_Slice_ValueType sample_types); static void stack_recorder_typed_data_free(void *data); -static VALUE _native_initialize(DDTRACE_UNUSED VALUE _self, VALUE recorder_instance, VALUE cpu_time_enabled, VALUE alloc_samples_enabled, VALUE heap_samples_enabled); +static VALUE _native_initialize( + DDTRACE_UNUSED VALUE _self, + VALUE recorder_instance, + VALUE cpu_time_enabled, + VALUE alloc_samples_enabled, + VALUE heap_samples_enabled, + VALUE timeline_enabled +); static VALUE _native_serialize(VALUE self, VALUE recorder_instance); static VALUE ruby_time_from(ddog_Timespec ddprof_time); static void *call_serialize_without_gvl(void *call_args); @@ -237,7 +248,7 @@ void stack_recorder_init(VALUE profiling_module) { // https://bugs.ruby-lang.org/issues/18007 for a discussion around this. 
rb_define_alloc_func(stack_recorder_class, _native_new); - rb_define_singleton_method(stack_recorder_class, "_native_initialize", _native_initialize, 4); + rb_define_singleton_method(stack_recorder_class, "_native_initialize", _native_initialize, 5); rb_define_singleton_method(stack_recorder_class, "_native_serialize", _native_serialize, 1); rb_define_singleton_method(stack_recorder_class, "_native_reset_after_fork", _native_reset_after_fork, 1); rb_define_singleton_method(testing_module, "_native_active_slot", _native_active_slot, 1); @@ -338,28 +349,39 @@ static void stack_recorder_typed_data_free(void *state_ptr) { ruby_xfree(state); } -static VALUE _native_initialize(DDTRACE_UNUSED VALUE _self, VALUE recorder_instance, VALUE cpu_time_enabled, VALUE alloc_samples_enabled, VALUE heap_samples_enabled) { +static VALUE _native_initialize( + DDTRACE_UNUSED VALUE _self, + VALUE recorder_instance, + VALUE cpu_time_enabled, + VALUE alloc_samples_enabled, + VALUE heap_samples_enabled, + VALUE timeline_enabled +) { ENFORCE_BOOLEAN(cpu_time_enabled); ENFORCE_BOOLEAN(alloc_samples_enabled); ENFORCE_BOOLEAN(heap_samples_enabled); + ENFORCE_BOOLEAN(timeline_enabled); struct stack_recorder_state *state; TypedData_Get_Struct(recorder_instance, struct stack_recorder_state, &stack_recorder_typed_data, state); - if (cpu_time_enabled == Qtrue && alloc_samples_enabled == Qtrue && heap_samples_enabled == Qtrue) return Qtrue; // Nothing to do, this is the default + uint8_t requested_values_count = ALL_VALUE_TYPES_COUNT - + (cpu_time_enabled == Qtrue ? 0 : 1) - + (alloc_samples_enabled == Qtrue? 0 : 1) - + (heap_samples_enabled == Qtrue ? 0 : 1) - + (timeline_enabled == Qtrue ? 0 : 1); + + if (requested_values_count == ALL_VALUE_TYPES_COUNT) return Qtrue; // Nothing to do, this is the default // When some sample types are disabled, we need to reconfigure libdatadog to record less types, // as well as reconfigure the position_for array to push the disabled types to the end so they don't get recorded. // See record_sample for details on the use of position_for. - state->enabled_values_count = ALL_VALUE_TYPES_COUNT - - (cpu_time_enabled == Qtrue ? 0 : 1) - - (alloc_samples_enabled == Qtrue? 0 : 1) - - (heap_samples_enabled == Qtrue ? 
0 : 1); + state->enabled_values_count = requested_values_count; ddog_prof_ValueType enabled_value_types[ALL_VALUE_TYPES_COUNT]; uint8_t next_enabled_pos = 0; - uint8_t next_disabled_pos = state->enabled_values_count; + uint8_t next_disabled_pos = requested_values_count; // CPU_SAMPLES_VALUE is always enabled enabled_value_types[next_enabled_pos] = (ddog_prof_ValueType) CPU_SAMPLES_VALUE; @@ -395,6 +417,13 @@ static VALUE _native_initialize(DDTRACE_UNUSED VALUE _self, VALUE recorder_insta state->heap_recorder = NULL; } + if (timeline_enabled == Qtrue) { + enabled_value_types[next_enabled_pos] = (ddog_prof_ValueType) TIMELINE_VALUE; + state->position_for[TIMELINE_VALUE_ID] = next_enabled_pos++; + } else { + state->position_for[TIMELINE_VALUE_ID] = next_disabled_pos++; + } + ddog_prof_Profile_drop(&state->slot_one_profile); ddog_prof_Profile_drop(&state->slot_two_profile); @@ -476,6 +505,7 @@ void record_sample(VALUE recorder_instance, ddog_prof_Slice_Location locations, metric_values[position_for[CPU_SAMPLES_VALUE_ID]] = values.cpu_or_wall_samples; metric_values[position_for[WALL_TIME_VALUE_ID]] = values.wall_time_ns; metric_values[position_for[ALLOC_SAMPLES_VALUE_ID]] = values.alloc_samples; + metric_values[position_for[TIMELINE_VALUE_ID]] = values.timeline_wall_time_ns; if (values.alloc_samples != 0) { // If we got an allocation sample end the heap allocation recording to commit the heap sample. diff --git a/ext/ddtrace_profiling_native_extension/stack_recorder.h b/ext/ddtrace_profiling_native_extension/stack_recorder.h index 9190cc89a25..4a9f2ead7b2 100644 --- a/ext/ddtrace_profiling_native_extension/stack_recorder.h +++ b/ext/ddtrace_profiling_native_extension/stack_recorder.h @@ -8,6 +8,7 @@ typedef struct { int64_t wall_time_ns; uint32_t cpu_or_wall_samples; uint32_t alloc_samples; + int64_t timeline_wall_time_ns; } sample_values; typedef struct sample_labels { diff --git a/lib/datadog/profiling/component.rb b/lib/datadog/profiling/component.rb index 228fd01f7a9..d0083a583f4 100644 --- a/lib/datadog/profiling/component.rb +++ b/lib/datadog/profiling/component.rb @@ -48,7 +48,11 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:) overhead_target_percentage = valid_overhead_target(settings.profiling.advanced.overhead_target_percentage) upload_period_seconds = [60, settings.profiling.advanced.upload_period_seconds].max - recorder = build_recorder(allocation_profiling_enabled, heap_profiling_enabled) + recorder = build_recorder( + allocation_profiling_enabled: allocation_profiling_enabled, + heap_profiling_enabled: heap_profiling_enabled, + timeline_enabled: timeline_enabled, + ) thread_context_collector = build_thread_context_collector(settings, recorder, optional_tracer, timeline_enabled) worker = Datadog::Profiling::Collectors::CpuAndWallTimeWorker.new( gc_profiling_enabled: enable_gc_profiling?(settings), @@ -72,11 +76,16 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:) Profiling::Profiler.new(worker: worker, scheduler: scheduler) end - private_class_method def self.build_recorder(allocation_profiling_enabled, heap_profiling_enabled) + private_class_method def self.build_recorder( + allocation_profiling_enabled:, + heap_profiling_enabled:, + timeline_enabled: + ) Datadog::Profiling::StackRecorder.new( cpu_time_enabled: RUBY_PLATFORM.include?('linux'), # Only supported on Linux currently alloc_samples_enabled: allocation_profiling_enabled, heap_samples_enabled: heap_profiling_enabled, + timeline_enabled: timeline_enabled, 
) end diff --git a/lib/datadog/profiling/stack_recorder.rb b/lib/datadog/profiling/stack_recorder.rb index 3153a4982c5..6337d3ec588 100644 --- a/lib/datadog/profiling/stack_recorder.rb +++ b/lib/datadog/profiling/stack_recorder.rb @@ -4,7 +4,7 @@ module Profiling # Note that `record_sample` is only accessible from native code. # Methods prefixed with _native_ are implemented in `stack_recorder.c` class StackRecorder - def initialize(cpu_time_enabled:, alloc_samples_enabled:, heap_samples_enabled:) + def initialize(cpu_time_enabled:, alloc_samples_enabled:, heap_samples_enabled:, timeline_enabled:) # This mutex works in addition to the fancy C-level mutexes we have in the native side (see the docs there). # It prevents multiple Ruby threads calling serialize at the same time -- something like # `10.times { Thread.new { stack_recorder.serialize } }`. @@ -13,7 +13,13 @@ def initialize(cpu_time_enabled:, alloc_samples_enabled:, heap_samples_enabled:) # accidentally happening. @no_concurrent_synchronize_mutex = Mutex.new - self.class._native_initialize(self, cpu_time_enabled, alloc_samples_enabled, heap_samples_enabled) + self.class._native_initialize( + self, + cpu_time_enabled, + alloc_samples_enabled, + heap_samples_enabled, + timeline_enabled, + ) end def serialize diff --git a/sig/datadog/profiling/component.rbs b/sig/datadog/profiling/component.rbs index 2b32dbe88fe..5de8b61eb6c 100644 --- a/sig/datadog/profiling/component.rbs +++ b/sig/datadog/profiling/component.rbs @@ -8,8 +8,9 @@ module Datadog ) -> Datadog::Profiling::Profiler? def self.build_recorder: ( - bool allocation_profiling_enabled, - bool heap_profiling_enabled, + allocation_profiling_enabled: bool, + heap_profiling_enabled: bool, + timeline_enabled: bool, ) -> Datadog::Profiling::StackRecorder def self.build_thread_context_collector: ( diff --git a/sig/datadog/profiling/stack_recorder.rbs b/sig/datadog/profiling/stack_recorder.rbs index 0f61f9c8ab9..f0681354e95 100644 --- a/sig/datadog/profiling/stack_recorder.rbs +++ b/sig/datadog/profiling/stack_recorder.rbs @@ -3,13 +3,14 @@ module Datadog class StackRecorder @no_concurrent_synchronize_mutex: ::Thread::Mutex - def initialize: (cpu_time_enabled: bool, alloc_samples_enabled: bool, heap_samples_enabled: bool) -> void + def initialize: (cpu_time_enabled: bool, alloc_samples_enabled: bool, heap_samples_enabled: bool, timeline_enabled: bool) -> void def self._native_initialize: ( Datadog::Profiling::StackRecorder recorder_instance, bool cpu_time_enabled, bool alloc_samples_enabled, bool heap_samples_enabled, + bool timeline_enabled, ) -> true def serialize: () -> untyped diff --git a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb index 21888900e7c..11a2bdee3fb 100644 --- a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb +++ b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb @@ -256,28 +256,28 @@ described_class::Testing._native_trigger_sample - invoke_gc_times = 5 - - invoke_gc_times.times do + 5.times do Thread.pass GC.start + Thread.pass end cpu_and_wall_time_worker.stop all_samples = samples_from_pprof(recorder.serialize!) - current_thread_gc_samples = - samples_for_thread(all_samples, Thread.current) - .select { |it| it.locations.first.path == 'Garbage Collection' } - - # NOTE: In some cases, Ruby may actually call two GC's back-to-back without us having the possibility to take - # a sample. 
I don't expect this to happen for this test (that's what the `Thread.pass` above is trying to avoid) - # but if this spec turns out to be flaky, that is probably the issue, and that would mean we'd need to relax the - # check. - expect( - current_thread_gc_samples.inject(0) { |sum, sample| sum + sample.values.fetch(:'cpu-samples') } - ).to be >= invoke_gc_times + gc_sample = all_samples.find { |sample| sample.labels[:'gc cause'] == 'GC.start()' } + + expect(gc_sample.labels).to match a_hash_including( + :state => 'had cpu', + :'thread id' => 'GC', + :'thread name' => 'Garbage Collection', + :event => 'gc', + :'gc reason' => an_instance_of(String), + :'gc cause' => 'GC.start()', + :'gc type' => 'major', + ) + expect(gc_sample.locations.first.path).to eq 'Garbage Collection' end context 'when the background thread dies without cleaning up (after Ruby forks)' do diff --git a/spec/datadog/profiling/collectors/stack_spec.rb b/spec/datadog/profiling/collectors/stack_spec.rb index aee41a3f3f5..38c851cbe0d 100644 --- a/spec/datadog/profiling/collectors/stack_spec.rb +++ b/spec/datadog/profiling/collectors/stack_spec.rb @@ -60,14 +60,8 @@ def sample(thread, recorder_instance, metric_values_hash, labels_array, max_fram context 'when marking sample as being in garbage collection' do let(:in_gc) { true } - it 'includes a placeholder frame for garbage collection' do - expect(stacks.fetch(:gathered)[0]).to have_attributes(base_label: '', path: 'Garbage Collection', lineno: 0) - end - - it 'matches the Ruby backtrace API' do - # We skip 4 frames here -- the garbage collection placeholder, as well as the 3 top stacks that differ from the - # reference stack (see the `let(:gathered_stack)` above for details) - expect(stacks.fetch(:gathered)[4..-1]).to eq reference_stack + it 'gathers a one-element stack with a "Garbage Collection" placeholder' do + expect(stacks.fetch(:gathered)).to contain_exactly(have_attributes(base_label: '', path: 'Garbage Collection', lineno: 0)) end end end @@ -491,38 +485,6 @@ def call_sleep expect(gathered_stack).to eq reference_stack end end - - context 'when marking sample as being in garbage collection' do - let(:in_gc) { true } - - it 'gathers exactly max_frames frames' do - expect(gathered_stack.size).to be max_frames - end - - it 'matches the Ruby backtrace API, up to max_frames - 2' do - garbage_collection = 1 - expect(gathered_stack[(0 + garbage_collection)...(max_frames - 1)]).to eq reference_stack[0...(max_frames - 1 - garbage_collection)] - end - - it 'includes two placeholder frames: one for garbage collection and another for including the number of skipped frames' do - garbage_collection = 1 - placeholder = 1 - omitted_frames = target_stack_depth - max_frames + placeholder + garbage_collection - - expect(omitted_frames).to be 97 - expect(gathered_stack.last).to have_attributes(base_label: '', path: '97 frames omitted', lineno: 0) - expect(gathered_stack.first).to have_attributes(base_label: '', path: 'Garbage Collection', lineno: 0) - end - - context 'when stack is exactly one item less as deep as the configured max_frames' do - let(:target_stack_depth) { 4 } - - it 'includes a placeholder frame for garbage collection and matches the Ruby backtrace API' do - garbage_collection = 1 - expect(gathered_stack[(0 + garbage_collection)..-1]).to eq reference_stack - end - end - end end context 'when sampling a dead thread' do @@ -600,11 +562,8 @@ def call_sleep context 'when marking sample as being in garbage collection' do let(:in_gc) { true } - it 'gathers a two-element 
stack with a placeholder for "In native code" and another for garbage collection' do - expect(gathered_stack).to contain_exactly( - have_attributes(base_label: '', path: 'Garbage Collection', lineno: 0), - have_attributes(base_label: '', path: 'In native code', lineno: 0), - ) + it 'gathers a one-element stack with a "Garbage Collection" placeholder' do + expect(stacks.fetch(:gathered)).to contain_exactly(have_attributes(base_label: '', path: 'Garbage Collection', lineno: 0)) end end end diff --git a/spec/datadog/profiling/collectors/thread_context_spec.rb b/spec/datadog/profiling/collectors/thread_context_spec.rb index 88e32c6a089..6ebfd62cea1 100644 --- a/spec/datadog/profiling/collectors/thread_context_spec.rb +++ b/spec/datadog/profiling/collectors/thread_context_spec.rb @@ -9,7 +9,7 @@ expect(Thread.list).to include(Thread.main, t1, t2, t3) end - let(:recorder) { build_stack_recorder } + let(:recorder) { build_stack_recorder(timeline_enabled: timeline_enabled) } let(:ready_queue) { Queue.new } let(:t1) do Thread.new(ready_queue) do |ready_queue| @@ -91,6 +91,10 @@ def stats described_class::Testing._native_stats(cpu_and_wall_time_collector) end + def gc_tracking + described_class::Testing._native_gc_tracking(cpu_and_wall_time_collector) + end + # This method exists only so we can look for its name in the stack trace in a few tests def inside_t1 yield @@ -183,50 +187,25 @@ def another_way_of_calling_sample(profiler_overhead_stack_thread: Thread.current expect(t1_samples.map(&:values).map { |it| it.fetch(:'cpu-samples') }.reduce(:+)).to eq 5 end - [:before, :after].each do |on_gc_finish_order| - context "when a thread is marked as being in garbage collection, #{on_gc_finish_order} on_gc_finish" do - # Until sample_after_gc gets called, the state left over by both on_gc_start and on_gc_finish "blocks" time - # from being assigned to further samples. Note this is expected to be very rare in practice, otherwise we would - # probably want to look into skipping these samples entirely. - it 'records the wall-time between a previous sample and the start of garbage collection, and no further time' do - sample - wall_time_at_first_sample = per_thread_context.fetch(Thread.current).fetch(:wall_time_at_previous_sample_ns) - - on_gc_start - on_gc_finish if on_gc_finish_order == :after - - wall_time_at_gc_start = per_thread_context.fetch(Thread.current).fetch(:'gc_tracking.wall_time_at_start_ns') - - # Even though we keep calling sample, the result only includes the time until we called on_gc_start - 5.times { another_way_of_calling_sample } - - total_wall_for_rspec_thread = - samples_for_thread(samples, Thread.current) - .select { |it| it.locations.find { |frame| frame.base_label == 'another_way_of_calling_sample' } } - .map { |it| it.values.fetch(:'wall-time') } - .reduce(:+) - - expect(total_wall_for_rspec_thread).to be(wall_time_at_gc_start - wall_time_at_first_sample) - end + context 'when a thread is marked as being in garbage collection by on_gc_start' do + # @ivoanjo: This spec exists because for cpu-time the behavior is different (we stop recording + cpu-time once GC starts), and I wanted to validate that this different behavior does not get applied to wall-time. + it 'keeps recording the wall-time after every sample' do + sample + wall_time_at_first_sample = per_thread_context.fetch(Thread.current).fetch(:wall_time_at_previous_sample_ns) - # The whole point of wall_time_at_previous_sample_ns is to track the past point in time that we use as start of
- # BUT, we can't let it advance during GC as it should only get accounted for when GC finishes. - it 'does not advance wall_time_at_previous_sample_ns for the thread beyond gc_tracking.wall_time_at_start_ns' do - sample + on_gc_start - on_gc_start - on_gc_finish if on_gc_finish_order == :after + 5.times { sample } - wall_time_at_gc_start = per_thread_context.fetch(Thread.current).fetch(:'gc_tracking.wall_time_at_start_ns') + time_after = Datadog::Core::Utils::Time.get_time(:nanosecond) - 5.times { sample } + sample - wall_time_at_previous_sample_ns = - per_thread_context.fetch(Thread.current).fetch(:wall_time_at_previous_sample_ns) + wall_time_at_last_sample = per_thread_context.fetch(Thread.current).fetch(:wall_time_at_previous_sample_ns) - expect(wall_time_at_previous_sample_ns).to be wall_time_at_gc_start - end + expect(wall_time_at_last_sample).to be >= wall_time_at_first_sample + expect(wall_time_at_last_sample).to be >= time_after end end @@ -267,47 +246,61 @@ def another_way_of_calling_sample(profiler_overhead_stack_thread: Thread.current expect(total_cpu_for_rspec_thread).to be_between(1, rspec_thread_spent_time) end - [:before, :after].each do |on_gc_finish_order| - context "when a thread is marked as being in garbage collection, #{on_gc_finish_order} on_gc_finish" do - it 'records the cpu-time between a previous sample and the start of garbage collection, and no further time' do - sample - cpu_time_at_first_sample = per_thread_context.fetch(Thread.current).fetch(:cpu_time_at_previous_sample_ns) + context 'when a thread is marked as being in garbage collection by on_gc_start' do + it 'records the cpu-time between a previous sample and the start of garbage collection, and no further time' do + sample + cpu_time_at_first_sample = per_thread_context.fetch(Thread.current).fetch(:cpu_time_at_previous_sample_ns) - on_gc_start - on_gc_finish if on_gc_finish_order == :after + on_gc_start - cpu_time_at_gc_start = per_thread_context.fetch(Thread.current).fetch(:'gc_tracking.cpu_time_at_start_ns') + cpu_time_at_gc_start = per_thread_context.fetch(Thread.current).fetch(:'gc_tracking.cpu_time_at_start_ns') - # Even though we keep calling sample, the result only includes the time until we called on_gc_start - 5.times { another_way_of_calling_sample } + # Even though we keep calling sample, the result only includes the time until we called on_gc_start + 5.times { another_way_of_calling_sample } - total_cpu_for_rspec_thread = - samples_for_thread(samples, Thread.current) - .select { |it| it.locations.find { |frame| frame.base_label == 'another_way_of_calling_sample' } } - .map { |it| it.values.fetch(:'cpu-time') } - .reduce(:+) + total_cpu_for_rspec_thread = + samples_for_thread(samples, Thread.current) + .select { |it| it.locations.find { |frame| frame.base_label == 'another_way_of_calling_sample' } } + .map { |it| it.values.fetch(:'cpu-time') } + .reduce(:+) - expect(total_cpu_for_rspec_thread).to be(cpu_time_at_gc_start - cpu_time_at_first_sample) - end + expect(total_cpu_for_rspec_thread).to be(cpu_time_at_gc_start - cpu_time_at_first_sample) + end - # The whole point of cpu_time_at_previous_sample_ns is to track the past point in time that we use as start of - # the time range for a sample. - # BUT, we can't let it advance during GC as it should only get accounted for when GC finishes. 
- it 'does not advance cpu_time_at_previous_sample_ns for the thread beyond gc_tracking.cpu_time_at_start_ns' do - sample + # When a thread is marked as being in GC, the cpu_time_at_previous_sample_ns is not allowed to advance until + the GC finishes. + it 'does not advance cpu_time_at_previous_sample_ns for the thread beyond gc_tracking.cpu_time_at_start_ns' do + sample + + on_gc_start - on_gc_start - on_gc_finish if on_gc_finish_order == :after + cpu_time_at_gc_start = per_thread_context.fetch(Thread.current).fetch(:'gc_tracking.cpu_time_at_start_ns') - cpu_time_at_gc_start = per_thread_context.fetch(Thread.current).fetch(:'gc_tracking.cpu_time_at_start_ns') + 5.times { sample } - 5.times { sample } + cpu_time_at_previous_sample_ns = + per_thread_context.fetch(Thread.current).fetch(:cpu_time_at_previous_sample_ns) - cpu_time_at_previous_sample_ns = - per_thread_context.fetch(Thread.current).fetch(:cpu_time_at_previous_sample_ns) + expect(cpu_time_at_previous_sample_ns).to be cpu_time_at_gc_start + end + end - expect(cpu_time_at_previous_sample_ns).to be cpu_time_at_gc_start - end + context 'when a thread is unmarked as being in garbage collection by on_gc_finish' do + it 'lets cpu_time_at_previous_sample_ns advance again' do + sample + + on_gc_start + + cpu_time_at_gc_start = per_thread_context.fetch(Thread.current).fetch(:'gc_tracking.cpu_time_at_start_ns') + + on_gc_finish + + 5.times { sample } + + cpu_time_at_previous_sample_ns = + per_thread_context.fetch(Thread.current).fetch(:cpu_time_at_previous_sample_ns) + + expect(cpu_time_at_previous_sample_ns).to be > cpu_time_at_gc_start end end end @@ -653,26 +646,11 @@ def another_way_of_calling_sample(profiler_overhead_stack_thread: Thread.current it "records the cpu-time when garbage collection started in the caller thread's context" do on_gc_start - expect(per_thread_context.fetch(Thread.current)).to include(:'gc_tracking.cpu_time_at_start_ns' => be > 0) - end - end - end - - context 'when called again after on_gc_finish but before sample_after_gc' do - before do - on_gc_start - on_gc_finish - end + cpu_time_at_previous_sample_ns = per_thread_context.fetch(Thread.current).fetch(:cpu_time_at_previous_sample_ns) - it 'does not change the gc start times' do - start_times = proc do - cpu_time = per_thread_context.fetch(Thread.current).fetch(:'gc_tracking.cpu_time_at_start_ns') - wall_time = per_thread_context.fetch(Thread.current).fetch(:'gc_tracking.wall_time_at_start_ns') - - [cpu_time, wall_time] + expect(per_thread_context.fetch(Thread.current)) + .to include(:'gc_tracking.cpu_time_at_start_ns' => (be > cpu_time_at_previous_sample_ns)) end - - expect { on_gc_start }.to_not change(&start_times) end end end @@ -693,38 +671,54 @@ def another_way_of_calling_sample(profiler_overhead_stack_thread: Thread.current context 'when on_gc_start was not called before' do # See comment in the actual implementation on when/why this can happen - it 'does not change the gc finish times' do + it 'does not change the wall_time_at_previous_gc_ns' do on_gc_finish - expect(per_thread_context.fetch(Thread.current).fetch(:'gc_tracking.cpu_time_at_finish_ns')).to be invalid_time - expect(per_thread_context.fetch(Thread.current).fetch(:'gc_tracking.wall_time_at_finish_ns')).to be invalid_time + expect(gc_tracking.fetch(:wall_time_at_previous_gc_ns)).to be invalid_time end end context 'when on_gc_start was previously called' do before { on_gc_start } - it "records the wall-time when garbage collection finished in the caller thread's context" do + it 'records the
wall-time when garbage collection finished in the gc_tracking' do wall_time_before_on_gc_finish_ns = Datadog::Core::Utils::Time.get_time(:nanosecond) on_gc_finish wall_time_after_on_gc_finish_ns = Datadog::Core::Utils::Time.get_time(:nanosecond) + expect(gc_tracking.fetch(:wall_time_at_previous_gc_ns)) + .to be_between(wall_time_before_on_gc_finish_ns, wall_time_after_on_gc_finish_ns) + end + + it 'resets the gc tracking fields back to invalid_time' do + on_gc_finish + expect(per_thread_context.fetch(Thread.current)).to include( - :'gc_tracking.wall_time_at_finish_ns' => - be_between(wall_time_before_on_gc_finish_ns, wall_time_after_on_gc_finish_ns) + :'gc_tracking.cpu_time_at_start_ns' => invalid_time, + :'gc_tracking.wall_time_at_start_ns' => invalid_time, ) end + it 'records the wall-time spent between calls to on_gc_start and on_gc_finish' do + wall_time_at_start_ns = per_thread_context.fetch(Thread.current).fetch(:'gc_tracking.wall_time_at_start_ns') + + wall_time_before_on_gc_finish_ns = Datadog::Core::Utils::Time.get_time(:nanosecond) + on_gc_finish + + expect(gc_tracking.fetch(:accumulated_wall_time_ns)) + .to be >= (wall_time_before_on_gc_finish_ns - wall_time_at_start_ns) + end + context 'cpu-time behavior' do context 'when not on Linux' do before do skip 'The fallback behavior only applies when not on Linux' if PlatformHelpers.linux? end - it "records the cpu-time when garbage collection finished in the caller thread's context as zero" do + it 'records the accumulated_cpu_time_ns as zero' do on_gc_finish - expect(per_thread_context.fetch(Thread.current)).to include(:'gc_tracking.cpu_time_at_finish_ns' => 0) + expect(gc_tracking.fetch(:accumulated_cpu_time_ns)).to be 0 end end @@ -733,187 +727,152 @@ def another_way_of_calling_sample(profiler_overhead_stack_thread: Thread.current skip 'Test only runs on Linux' unless PlatformHelpers.linux?
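Piecing together the fields asserted across these specs, the gc_tracking testing helper surfaces the collector-level (not per-thread) GC state. Its rough shape is sketched below, with placeholder values for illustration only -- the real values come from the native collector:

    {
      accumulated_cpu_time_ns: 120_000,           # cpu-time summed across GC cycles since the last flush
      accumulated_wall_time_ns: 450_000,          # wall-time summed across GC cycles since the last flush
      wall_time_at_previous_gc_ns: 1_000_000_500, # set by on_gc_finish; reset to invalid_time by sample_after_gc
      wall_time_at_last_flushed_gc_event_ns: 999_999_000, # updated by sample_after_gc from the field above
    }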
end - it "records the cpu-time when garbage collection finished in the caller thread's context" do + it 'records the cpu-time spent between calls to on_gc_start and on_gc_finish' do on_gc_finish - expect(per_thread_context.fetch(Thread.current)).to include(:'gc_tracking.cpu_time_at_finish_ns' => be > 0) + expect(gc_tracking.fetch(:accumulated_cpu_time_ns)).to be > 0 + end + + it 'advances the cpu_time_at_previous_sample_ns for the sampled thread by the time spent in GC' do + cpu_time_at_previous_sample_ns_before = + per_thread_context.fetch(Thread.current).fetch(:cpu_time_at_previous_sample_ns) + + on_gc_finish + + expect(per_thread_context.fetch(Thread.current)).to include( + cpu_time_at_previous_sample_ns: be > cpu_time_at_previous_sample_ns_before + ) end end end end context 'when going through multiple cycles of on_gc_start/on_gc_finish without sample_after_gc getting called' do - it 'keeps the cpu-time and wall-time at finish from the LAST on_gc_finish' do - context_tracking = [] + let(:context_tracking) { [] } + before do 5.times do on_gc_start on_gc_finish - context_tracking << per_thread_context.fetch(Thread.current) + context_tracking << gc_tracking end - - cpu_time_from_last_on_gc_finish = context_tracking.last.fetch(:'gc_tracking.cpu_time_at_finish_ns') - wall_time_from_last_on_gc_finish = context_tracking.last.fetch(:'gc_tracking.wall_time_at_finish_ns') - - expect(context_tracking.first) - .to include(:'gc_tracking.wall_time_at_finish_ns' => be < wall_time_from_last_on_gc_finish) - - # This always advances: all_but_last <= the last one - # (Needs the <= because unfortunately we may not get enough precision, otherwise it would be <) - all_but_last = context_tracking[0..-2] - expect( - all_but_last.map { |it| it.fetch(:'gc_tracking.cpu_time_at_finish_ns') } - ).to all be <= cpu_time_from_last_on_gc_finish - expect( - all_but_last.map { |it| it.fetch(:'gc_tracking.wall_time_at_finish_ns') } - ).to all be <= wall_time_from_last_on_gc_finish end - end - end - end - describe '#sample_after_gc' do - let(:gc_samples) { samples.select { |it| it.locations.first.path == 'Garbage Collection' } } + it 'accumulates the cpu-time and wall-time from the multiple GCs' do + all_accumulated_wall_time = context_tracking.map { |it| it.fetch(:accumulated_wall_time_ns) } - before { sample } + expect(all_accumulated_wall_time).to eq all_accumulated_wall_time.sort + expect(all_accumulated_wall_time.first).to be <= all_accumulated_wall_time.last - context 'when there is no thread with gc time to be sampled' do - it 'does not sample any threads' do - sample_after_gc + all_accumulated_cpu_time = context_tracking.map { |it| it.fetch(:accumulated_cpu_time_ns) } + expect(all_accumulated_cpu_time).to eq all_accumulated_cpu_time.sort - expect(gc_samples).to be_empty - end + expect(all_accumulated_cpu_time.first).to be < all_accumulated_cpu_time.last if all_accumulated_cpu_time.first > 0 + end - it 'does not increment the gc_samples stat' do - sample_after_gc + it 'updates the wall_time_at_previous_gc_ns with the latest one' do + all_wall_time_at_previous_gc_ns = context_tracking.map { |it| it.fetch(:wall_time_at_previous_gc_ns) } - expect(stats.fetch(:gc_samples)).to be 0 + expect(all_wall_time_at_previous_gc_ns.last).to be all_wall_time_at_previous_gc_ns.max + end end end + end - context 'when there is a thread with gc start time but no finish time' do - before { on_gc_start } - - it 'does not sample any threads' do - sample_after_gc - - expect(gc_samples).to be_empty - end - - it 'does not increment the 
gc_samples stat' do - sample_after_gc + describe '#sample_after_gc' do + before { sample } - expect(stats.fetch(:gc_samples)).to be 0 + context 'when called before on_gc_start/on_gc_finish' do + it do + expect { sample_after_gc }.to raise_error(RuntimeError, /Unexpected call to sample_after_gc/) end end - context 'when there is a thread with a recorded gc start and finish time' do - let(:gc_sample) do - expect(gc_samples.size).to be 1 - gc_samples.first - end + context 'when there is gc information to record' do + let(:gc_sample) { samples.find { |it| it.labels.fetch(:'thread name') == 'Garbage Collection' } } before do on_gc_start + @time_before = Datadog::Core::Utils::Time.as_utc_epoch_ns(Time.now) on_gc_finish + @time_after = Datadog::Core::Utils::Time.as_utc_epoch_ns(Time.now) end - it 'samples the thread with recorded gc start and finish time, marking it as being in Garbage Collection' do - sample_after_gc + context 'when called more than once in a row' do + it do + sample_after_gc + + expect { sample_after_gc }.to raise_error(RuntimeError, /Unexpected call to sample_after_gc/) + end + end - expect(object_id_from(gc_sample.labels.fetch(:'thread id'))).to eq Thread.current.object_id + it 'increments the gc_samples counter' do + expect { sample_after_gc }.to change { stats.fetch(:gc_samples) }.from(0).to(1) end - it 'samples the thread with recorded gc start and finish time, recording the times between gc start and finish' do - cpu_time_at_start_ns = per_thread_context.fetch(Thread.current).fetch(:'gc_tracking.cpu_time_at_start_ns') - cpu_time_at_finish_ns = per_thread_context.fetch(Thread.current).fetch(:'gc_tracking.cpu_time_at_finish_ns') - wall_time_at_start_ns = per_thread_context.fetch(Thread.current).fetch(:'gc_tracking.wall_time_at_start_ns') - wall_time_at_finish_ns = per_thread_context.fetch(Thread.current).fetch(:'gc_tracking.wall_time_at_finish_ns') + it 'sets the wall_time_at_last_flushed_gc_event_ns from the wall_time_at_previous_gc_ns' do + wall_time_at_previous_gc_ns = gc_tracking.fetch(:wall_time_at_previous_gc_ns) sample_after_gc - expect(gc_sample.values).to include( - :"cpu-samples" => 1, - :'cpu-time' => cpu_time_at_finish_ns - cpu_time_at_start_ns, - :"wall-time" => wall_time_at_finish_ns - wall_time_at_start_ns, - ) + expect(gc_tracking.fetch(:wall_time_at_last_flushed_gc_event_ns)).to be wall_time_at_previous_gc_ns end - it 'resets the gc tracking fields back to invalid_time' do + it 'resets the wall_time_at_previous_gc_ns to invalid_time' do sample_after_gc - expect(per_thread_context.fetch(Thread.current)).to include( - :'gc_tracking.cpu_time_at_start_ns' => invalid_time, - :'gc_tracking.cpu_time_at_finish_ns' => invalid_time, - :'gc_tracking.wall_time_at_start_ns' => invalid_time, - :'gc_tracking.wall_time_at_finish_ns' => invalid_time, - ) - end - - it 'increments the gc_samples stat' do - expect { sample_after_gc }.to change { stats.fetch(:gc_samples) }.from(0).to(1) + expect(gc_tracking.fetch(:wall_time_at_previous_gc_ns)).to be invalid_time end - it 'does not sample any other threads' do + it 'creates a Garbage Collection sample' do sample_after_gc - expect(gc_samples.size).to be 1 + expect(gc_sample.values.fetch(:'cpu-samples')).to be 1 + expect(gc_sample.labels).to match a_hash_including( + :state => 'had cpu', + :'thread id' => 'GC', + :'thread name' => 'Garbage Collection', + :event => 'gc', + :'gc cause' => an_instance_of(String), + :'gc type' => an_instance_of(String), + ) + expect(gc_sample.locations.first.path).to eq 'Garbage Collection' end - it 
'advances the wall_time_at_previous_sample_ns for the sampled thread by the time spent in garbage collection' do - wall_time_at_previous_sample_ns_before = - per_thread_context.fetch(Thread.current).fetch(:wall_time_at_previous_sample_ns) + it 'creates a Garbage Collection sample using the accumulated_cpu_time_ns and accumulated_wall_time_ns' do + accumulated_cpu_time_ns = gc_tracking.fetch(:accumulated_cpu_time_ns) + accumulated_wall_time_ns = gc_tracking.fetch(:accumulated_wall_time_ns) sample_after_gc - wall_time_spent_in_gc = gc_sample.values.fetch(:'wall-time') - - expect(per_thread_context.fetch(Thread.current)).to include( - wall_time_at_previous_sample_ns: wall_time_at_previous_sample_ns_before + wall_time_spent_in_gc + expect(gc_sample.values).to match a_hash_including( + :'cpu-time' => accumulated_cpu_time_ns, + :'wall-time' => accumulated_wall_time_ns, ) end - context 'cpu-time behavior' do - context 'when not on Linux' do - before do - skip 'The fallback behavior only applies when not on Linux' if PlatformHelpers.linux? - end - - it 'keeps the cpu_time_at_previous_sample_ns as invalid_time' do - sample_after_gc - - expect(per_thread_context.fetch(Thread.current)).to include(cpu_time_at_previous_sample_ns: invalid_time) - end - end + it 'does not include the timeline timestamp' do + sample_after_gc - context 'on Linux' do - before do - skip 'Test only runs on Linux' unless PlatformHelpers.linux? - end + expect(gc_sample.labels.keys).to_not include(:end_timestamp_ns) + end - it 'advances the cpu_time_at_previous_sample_ns for the sampled thread by the time spent in garbage collection' do - cpu_time_at_previous_sample_ns_before = - per_thread_context.fetch(Thread.current).fetch(:cpu_time_at_previous_sample_ns) + context 'when timeline is enabled' do + let(:timeline_enabled) { true } - sample_after_gc + it 'creates a Garbage Collection sample using the accumulated_wall_time_ns as the timeline duration' do + accumulated_wall_time_ns = gc_tracking.fetch(:accumulated_wall_time_ns) - cpu_time_spent_in_gc = gc_sample.values.fetch(:'cpu-time') + sample_after_gc - expect(per_thread_context.fetch(Thread.current)).to include( - cpu_time_at_previous_sample_ns: cpu_time_at_previous_sample_ns_before + cpu_time_spent_in_gc - ) - end + expect(gc_sample.values.fetch(:timeline)).to be accumulated_wall_time_ns end - end - - context 'when timeline is enabled' do - let(:timeline_enabled) { true } - it 'does not include end_timestamp_ns labels in GC samples' do + it 'creates a Garbage Collection sample using the timestamp set by on_gc_finish, converted to epoch ns' do sample_after_gc - expect(gc_samples.first.labels.keys).to_not include(:end_timestamp_ns) + expect(gc_sample.labels.fetch(:end_timestamp_ns)).to be_between(@time_before, @time_after) end end end diff --git a/spec/datadog/profiling/component_spec.rb b/spec/datadog/profiling/component_spec.rb index 77ac09732b4..efd37d42efd 100644 --- a/spec/datadog/profiling/component_spec.rb +++ b/spec/datadog/profiling/component_spec.rb @@ -53,6 +53,8 @@ context 'when using the new CPU Profiling 2.0 profiler' do it 'initializes a ThreadContext collector' do allow(Datadog::Profiling::Collectors::CpuAndWallTimeWorker).to receive(:new) + dummy_stack_recorder = instance_double(Datadog::Profiling::StackRecorder, 'dummy_stack_recorder') + allow(Datadog::Profiling::StackRecorder).to receive(:new).and_return(dummy_stack_recorder) expect(settings.profiling.advanced).to receive(:max_frames).and_return(:max_frames_config) expect(settings.profiling.advanced) @@ -61,7 
+63,7 @@ .to receive(:enabled).and_return(:endpoint_collection_enabled_config) expect(Datadog::Profiling::Collectors::ThreadContext).to receive(:new).with( - recorder: instance_of(Datadog::Profiling::StackRecorder), + recorder: dummy_stack_recorder, max_frames: :max_frames_config, tracer: tracer, endpoint_collection_enabled: :endpoint_collection_enabled_config, @@ -276,6 +278,28 @@ end end + context 'when timeline is enabled' do + before { settings.profiling.advanced.experimental_timeline_enabled = true } + + it 'sets up the StackRecorder with timeline_enabled: true' do + expect(Datadog::Profiling::StackRecorder) + .to receive(:new).with(hash_including(timeline_enabled: true)).and_call_original + + build_profiler_component + end + end + + context 'when timeline is disabled' do + before { settings.profiling.advanced.experimental_timeline_enabled = false } + + it 'sets up the StackRecorder with timeline_enabled: false' do + expect(Datadog::Profiling::StackRecorder) + .to receive(:new).with(hash_including(timeline_enabled: false)).and_call_original + + build_profiler_component + end + end + it 'sets up the Profiler with the CpuAndWallTimeWorker collector' do expect(Datadog::Profiling::Profiler).to receive(:new).with( worker: instance_of(Datadog::Profiling::Collectors::CpuAndWallTimeWorker), @@ -295,11 +319,11 @@ it 'sets up the Exporter internal_metadata with relevant settings' do allow(Datadog::Profiling::Collectors::ThreadContext).to receive(:new) allow(Datadog::Profiling::Collectors::CpuAndWallTimeWorker).to receive(:new) + allow(Datadog::Profiling::StackRecorder).to receive(:new) expect(described_class).to receive(:no_signals_workaround_enabled?).and_return(:no_signals_result) expect(settings.profiling.advanced).to receive(:experimental_timeline_enabled).and_return(:timeline_result) - expect(settings.profiling.advanced).to receive(:experimental_allocation_sample_rate) - .and_return(123) + expect(settings.profiling.advanced).to receive(:experimental_allocation_sample_rate).and_return(123) expect(Datadog::Profiling::Exporter).to receive(:new).with( hash_including( internal_metadata: { diff --git a/spec/datadog/profiling/spec_helper.rb b/spec/datadog/profiling/spec_helper.rb index e02c1af213b..fcb084106b9 100644 --- a/spec/datadog/profiling/spec_helper.rb +++ b/spec/datadog/profiling/spec_helper.rb @@ -65,7 +65,11 @@ def decode_frame_from_pprof(decoded_profile, location_id) end def object_id_from(thread_id) - Integer(thread_id.match(/\d+ \((?\d+)\)/)[:object_id]) + if thread_id != 'GC' + Integer(thread_id.match(/\d+ \((?\d+)\)/)[:object_id]) + else + -1 + end end def samples_for_thread(samples, thread) @@ -74,11 +78,12 @@ def samples_for_thread(samples, thread) # We disable heap_sample collection by default in tests since it requires some extra mocking/ # setup for it to properly work. 
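As the component specs just above exercise, the recorder's new flag is driven end-to-end by the existing experimental timeline setting. From user code, enabling it would look roughly like this (sketch, assuming profiling is otherwise set up):

    Datadog.configure do |c|
      c.profiling.enabled = true
      # Opt into the experimental timeline profiling
      c.profiling.advanced.experimental_timeline_enabled = true
    end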
- def build_stack_recorder(heap_samples_enabled: false) + def build_stack_recorder(heap_samples_enabled: false, timeline_enabled: false) Datadog::Profiling::StackRecorder.new( cpu_time_enabled: true, alloc_samples_enabled: true, - heap_samples_enabled: heap_samples_enabled + heap_samples_enabled: heap_samples_enabled, + timeline_enabled: timeline_enabled, ) end end diff --git a/spec/datadog/profiling/stack_recorder_spec.rb b/spec/datadog/profiling/stack_recorder_spec.rb index 8652ceb5e05..73f73ae4874 100644 --- a/spec/datadog/profiling/stack_recorder_spec.rb +++ b/spec/datadog/profiling/stack_recorder_spec.rb @@ -10,12 +10,14 @@ # Disabling these by default since they require some extra setup and produce separate samples. # Enabling this is tested in a particular context below. let(:heap_samples_enabled) { false } + let(:timeline_enabled) { true } subject(:stack_recorder) do described_class.new( cpu_time_enabled: cpu_time_enabled, alloc_samples_enabled: alloc_samples_enabled, - heap_samples_enabled: heap_samples_enabled + heap_samples_enabled: heap_samples_enabled, + timeline_enabled: timeline_enabled, ) end @@ -118,77 +120,76 @@ def slot_two_mutex_locked? expect(start).to be <= finish end - context 'when all profile types are enabled' do + describe 'profile types configuration' do let(:cpu_time_enabled) { true } let(:alloc_samples_enabled) { true } let(:heap_samples_enabled) { true } - - it 'returns a pprof with the configured sample types' do - expect(sample_types_from(decoded_profile)).to eq( + let(:timeline_enabled) { true } + let(:all_profile_types) do + { 'cpu-time' => 'nanoseconds', 'cpu-samples' => 'count', 'wall-time' => 'nanoseconds', 'alloc-samples' => 'count', 'heap-live-samples' => 'count', - ) + 'timeline' => 'nanoseconds', + } end - end - context 'when cpu-time is disabled' do - let(:cpu_time_enabled) { false } - let(:alloc_samples_enabled) { true } - let(:heap_samples_enabled) { true } + def profile_types_without(type) + all_profile_types.dup.tap { |it| it.delete(type) { raise 'Missing key' } } + end - it 'returns a pprof without the cpu-type type' do - expect(sample_types_from(decoded_profile)).to eq( - 'cpu-samples' => 'count', - 'wall-time' => 'nanoseconds', - 'alloc-samples' => 'count', - 'heap-live-samples' => 'count', - ) + context 'when all profile types are enabled' do + it 'returns a pprof with the configured sample types' do + expect(sample_types_from(decoded_profile)).to eq(all_profile_types) + end end - end - context 'when alloc-samples is disabled' do - let(:cpu_time_enabled) { true } - let(:alloc_samples_enabled) { false } - let(:heap_samples_enabled) { true } + context 'when cpu-time is disabled' do + let(:cpu_time_enabled) { false } - it 'returns a pprof without the alloc-samples type' do - expect(sample_types_from(decoded_profile)).to eq( - 'cpu-time' => 'nanoseconds', - 'cpu-samples' => 'count', - 'wall-time' => 'nanoseconds', - 'heap-live-samples' => 'count', - ) + it 'returns a pprof without the cpu-type type' do + expect(sample_types_from(decoded_profile)).to eq(profile_types_without('cpu-time')) + end end - end - context 'when heap-live-samples is disabled' do - let(:cpu_time_enabled) { true } - let(:alloc_samples_enabled) { true } - let(:heap_samples_enabled) { false } + context 'when alloc-samples is disabled' do + let(:alloc_samples_enabled) { false } - it 'returns a pprof without the heap-live-samples type' do - expect(sample_types_from(decoded_profile)).to eq( - 'cpu-time' => 'nanoseconds', - 'cpu-samples' => 'count', - 'wall-time' => 
'nanoseconds', - 'alloc-samples' => 'count', - ) + it 'returns a pprof without the alloc-samples type' do + expect(sample_types_from(decoded_profile)).to eq(profile_types_without('alloc-samples')) + end end - end - context 'when all optional types are disabled' do - let(:cpu_time_enabled) { false } - let(:alloc_samples_enabled) { false } - let(:heap_samples_enabled) { false } + context 'when heap-live-samples is disabled' do + let(:heap_samples_enabled) { false } - it 'returns a pprof without the optional types' do - expect(sample_types_from(decoded_profile)).to eq( - 'cpu-samples' => 'count', - 'wall-time' => 'nanoseconds', - ) + it 'returns a pprof without the heap-live-samples type' do + expect(sample_types_from(decoded_profile)).to eq(profile_types_without('heap-live-samples')) + end + end + + context 'when timeline is disabled' do + let(:timeline_enabled) { false } + + it 'returns a pprof without the timeline type' do + expect(sample_types_from(decoded_profile)).to eq(profile_types_without('timeline')) + end + end + + context 'when all optional types are disabled' do + let(:cpu_time_enabled) { false } + let(:alloc_samples_enabled) { false } + let(:heap_samples_enabled) { false } + let(:timeline_enabled) { false } + + it 'returns a pprof without the optional types' do + expect(sample_types_from(decoded_profile)).to eq( + 'cpu-samples' => 'count', + 'wall-time' => 'nanoseconds', + ) + end end end @@ -214,7 +215,9 @@ def sample_types_from(decoded_profile) end context 'when profile has a sample' do - let(:metric_values) { { 'cpu-time' => 123, 'cpu-samples' => 456, 'wall-time' => 789, 'alloc-samples' => 4242 } } + let(:metric_values) do + { 'cpu-time' => 123, 'cpu-samples' => 456, 'wall-time' => 789, 'alloc-samples' => 4242, 'timeline' => 1111 } + end let(:labels) { { 'label_a' => 'value_a', 'label_b' => 'value_b', 'state' => 'unknown' }.to_a } let(:samples) { samples_from_pprof(encoded_pprof) } @@ -232,6 +235,7 @@ def sample_types_from(decoded_profile) :'cpu-samples' => 456, :'wall-time' => 789, :'alloc-samples' => 4242, + :timeline => 1111, ) end @@ -240,7 +244,7 @@ def sample_types_from(decoded_profile) it 'encodes the sample with the metrics provided, ignoring the disabled ones' do expect(samples.first.values) - .to eq(:'cpu-samples' => 456, :'wall-time' => 789, :'alloc-samples' => 4242) + .to eq(:'cpu-samples' => 456, :'wall-time' => 789, :'alloc-samples' => 4242, :timeline => 1111) end end @@ -338,7 +342,9 @@ def sample_types_from(decoded_profile) describe 'heap samples' do let(:sample_rate) { 50 } - let(:metric_values) { { 'cpu-time' => 101, 'cpu-samples' => 1, 'wall-time' => 789, 'alloc-samples' => sample_rate } } + let(:metric_values) do + { 'cpu-time' => 101, 'cpu-samples' => 1, 'wall-time' => 789, 'alloc-samples' => sample_rate, 'timeline' => 42 } + end let(:labels) { { 'label_a' => 'value_a', 'label_b' => 'value_b', 'state' => 'unknown' }.to_a } let(:a_string) { 'a beautiful string' }
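Stepping back to the stack_recorder.c hunk earlier in this diff: record_sample writes every value type unconditionally, and position_for routes disabled types into slots at or past enabled_values_count, which are simply never reported. A rough Ruby rendering of that branch-free routing (illustrative only; the real implementation is the C code above):

    VALUE_IDS = [:'cpu-time', :'cpu-samples', :'wall-time', :'alloc-samples', :'heap-live-samples', :timeline].freeze

    def build_position_for(enabled)
      position_for = {}
      next_enabled_pos = 0
      next_disabled_pos = enabled.size # disabled slots start right after the enabled ones
      VALUE_IDS.each do |id|
        if enabled.include?(id)
          position_for[id] = next_enabled_pos
          next_enabled_pos += 1
        else
          position_for[id] = next_disabled_pos
          next_disabled_pos += 1
        end
      end
      position_for
    end

    def record_sample(position_for, enabled_count, values)
      metric_values = Array.new(VALUE_IDS.size, 0)
      # Unconditional writes: disabled types land in scratch slots past enabled_count
      VALUE_IDS.each { |id| metric_values[position_for[id]] = values.fetch(id, 0) }
      metric_values.first(enabled_count) # only the enabled slots are reported
    end

With timeline disabled, the timeline value is still written, but into a slot the serialized profile never sees -- which is why the 'when timeline is disabled' context above expects the 'timeline' type to simply vanish from the pprof.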