diff --git a/benchmarks/profiler_sample_loop_v2.rb b/benchmarks/profiler_sample_loop_v2.rb
index 0ed11b59c68..0292892804a 100644
--- a/benchmarks/profiler_sample_loop_v2.rb
+++ b/benchmarks/profiler_sample_loop_v2.rb
@@ -13,6 +13,10 @@
 # This benchmark measures the performance of the main stack sampling loop of the profiler
 
 class ProfilerSampleLoopBenchmark
+  # This is needed because we're directly invoking the CpuAndWallTime collector through a testing interface; in normal
+  # use a profiler thread is automatically used.
+  PROFILER_OVERHEAD_STACK_THREAD = Thread.new { sleep }
+
   def create_profiler
     @recorder = Datadog::Profiling::StackRecorder.new
     @collector = Datadog::Profiling::Collectors::CpuAndWallTime.new(recorder: @recorder, max_frames: 400, tracer: nil)
@@ -36,7 +40,7 @@ def run_benchmark
       x.config(**benchmark_time, suite: report_to_dogstatsd_if_enabled_via_environment_variable(benchmark_name: 'profiler_sample_loop_v2'))
 
       x.report("stack collector #{ENV['CONFIG']}") do
-        Datadog::Profiling::Collectors::CpuAndWallTime::Testing._native_sample(@collector)
+        Datadog::Profiling::Collectors::CpuAndWallTime::Testing._native_sample(@collector, PROFILER_OVERHEAD_STACK_THREAD)
       end
 
       x.save! 'profiler-sample-loop-v2-results.json' unless VALIDATE_BENCHMARK_MODE
@@ -48,7 +52,7 @@ def run_benchmark
 
   def run_forever
     while true
-      1000.times { Datadog::Profiling::Collectors::CpuAndWallTime::Testing._native_sample(@collector) }
+      1000.times { Datadog::Profiling::Collectors::CpuAndWallTime::Testing._native_sample(@collector, PROFILER_OVERHEAD_STACK_THREAD) }
       @recorder.serialize
       print '.'
     end
diff --git a/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.c b/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.c
index 26debd3d2fc..77848ee375a 100644
--- a/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.c
+++ b/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.c
@@ -136,13 +136,22 @@ static int hash_map_per_thread_context_mark(st_data_t key_thread, st_data_t _val
 static int hash_map_per_thread_context_free_values(st_data_t _thread, st_data_t value_per_thread_context, st_data_t _argument);
 static VALUE _native_new(VALUE klass);
 static VALUE _native_initialize(VALUE self, VALUE collector_instance, VALUE recorder_instance, VALUE max_frames, VALUE tracer_context_key);
-static VALUE _native_sample(VALUE self, VALUE collector_instance);
+static VALUE _native_sample(VALUE self, VALUE collector_instance, VALUE profiler_overhead_stack_thread);
 static VALUE _native_on_gc_start(VALUE self, VALUE collector_instance);
 static VALUE _native_on_gc_finish(VALUE self, VALUE collector_instance);
 static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance);
+void update_metrics_and_sample(
+  struct cpu_and_wall_time_collector_state *state,
+  VALUE thread_being_sampled,
+  VALUE profiler_overhead_stack_thread,
+  struct per_thread_context *thread_context,
+  long current_cpu_time_ns,
+  long current_monotonic_wall_time_ns
+);
 static void trigger_sample_for_thread(
   struct cpu_and_wall_time_collector_state *state,
   VALUE thread,
+  VALUE stack_from_thread,
   struct per_thread_context *thread_context,
   ddog_Slice_I64 metric_values_slice,
   sample_type type
@@ -185,7 +194,7 @@ void collectors_cpu_and_wall_time_init(VALUE profiling_module) {
   rb_define_singleton_method(collectors_cpu_and_wall_time_class, "_native_initialize", _native_initialize, 4);
   rb_define_singleton_method(collectors_cpu_and_wall_time_class, "_native_inspect", _native_inspect, 1);
   rb_define_singleton_method(collectors_cpu_and_wall_time_class, "_native_reset_after_fork", _native_reset_after_fork, 1);
-  rb_define_singleton_method(testing_module, "_native_sample", _native_sample, 1);
+  rb_define_singleton_method(testing_module, "_native_sample", _native_sample, 2);
   rb_define_singleton_method(testing_module, "_native_on_gc_start", _native_on_gc_start, 1);
   rb_define_singleton_method(testing_module, "_native_on_gc_finish", _native_on_gc_finish, 1);
   rb_define_singleton_method(testing_module, "_native_sample_after_gc", _native_sample_after_gc, 1);
@@ -294,8 +303,10 @@ static VALUE _native_initialize(DDTRACE_UNUSED VALUE _self, VALUE collector_inst
 
 // This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTime behavior using RSpec.
 // It SHOULD NOT be used for other purposes.
-static VALUE _native_sample(DDTRACE_UNUSED VALUE _self, VALUE collector_instance) {
-  cpu_and_wall_time_collector_sample(collector_instance, monotonic_wall_time_now_ns(RAISE_ON_FAILURE));
+static VALUE _native_sample(DDTRACE_UNUSED VALUE _self, VALUE collector_instance, VALUE profiler_overhead_stack_thread) {
+  if (!is_thread_alive(profiler_overhead_stack_thread)) rb_raise(rb_eArgError, "Unexpected: profiler_overhead_stack_thread is not alive");
+
+  cpu_and_wall_time_collector_sample(collector_instance, monotonic_wall_time_now_ns(RAISE_ON_FAILURE), profiler_overhead_stack_thread);
   return Qtrue;
 }
 
@@ -327,10 +338,17 @@ static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_
 // Assumption 3: This function IS NOT called from a signal handler. This function is not async-signal-safe.
 // Assumption 4: This function IS NOT called in a reentrant way.
 // Assumption 5: This function is called from the main Ractor (if Ruby has support for Ractors).
-void cpu_and_wall_time_collector_sample(VALUE self_instance, long current_monotonic_wall_time_ns) {
+//
+// The `profiler_overhead_stack_thread` is used to attribute the profiler overhead to a stack borrowed from a different thread
+// (belonging to ddtrace), so that the overhead is visible in the profile rather than blamed on user code.
+void cpu_and_wall_time_collector_sample(VALUE self_instance, long current_monotonic_wall_time_ns, VALUE profiler_overhead_stack_thread) {
   struct cpu_and_wall_time_collector_state *state;
   TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_collector_state, &cpu_and_wall_time_collector_typed_data, state);
 
+  VALUE current_thread = rb_thread_current();
+  struct per_thread_context *current_thread_context = get_or_create_context_for(current_thread, state);
+  long cpu_time_at_sample_start_for_current_thread = cpu_time_now_ns(current_thread_context);
+
   VALUE threads = ddtrace_thread_list();
 
   const long thread_count = RARRAY_LEN(threads);
@@ -338,33 +356,17 @@ void cpu_and_wall_time_collector_sample(VALUE self_instance, long current_monoto
     VALUE thread = RARRAY_AREF(threads, i);
     struct per_thread_context *thread_context = get_or_create_context_for(thread, state);
 
-    long current_cpu_time_ns = cpu_time_now_ns(thread_context);
+    // We account for cpu-time for the current thread in a different way -- we use the cpu-time at sampling start, to avoid
+    // blaming the time the profiler took on whatever's running on the thread right now
+    long current_cpu_time_ns = thread != current_thread ? cpu_time_now_ns(thread_context) : cpu_time_at_sample_start_for_current_thread;
 
-    long cpu_time_elapsed_ns = update_time_since_previous_sample(
-      &thread_context->cpu_time_at_previous_sample_ns,
-      current_cpu_time_ns,
-      thread_context->gc_tracking.cpu_time_at_start_ns,
-      IS_NOT_WALL_TIME
-    );
-    long wall_time_elapsed_ns = update_time_since_previous_sample(
-      &thread_context->wall_time_at_previous_sample_ns,
-      current_monotonic_wall_time_ns,
-      thread_context->gc_tracking.wall_time_at_start_ns,
-      IS_WALL_TIME
-    );
-
-    int64_t metric_values[ENABLED_VALUE_TYPES_COUNT] = {0};
-
-    metric_values[CPU_TIME_VALUE_POS] = cpu_time_elapsed_ns;
-    metric_values[CPU_SAMPLES_VALUE_POS] = 1;
-    metric_values[WALL_TIME_VALUE_POS] = wall_time_elapsed_ns;
-
-    trigger_sample_for_thread(
+    update_metrics_and_sample(
       state,
-      thread,
+      /* thread_being_sampled: */ thread,
+      /* stack_from_thread: */ thread,
       thread_context,
-      (ddog_Slice_I64) {.ptr = metric_values, .len = ENABLED_VALUE_TYPES_COUNT},
-      SAMPLE_REGULAR
+      current_cpu_time_ns,
+      current_monotonic_wall_time_ns
     );
   }
 
@@ -373,6 +375,52 @@ void cpu_and_wall_time_collector_sample(VALUE self_instance, long current_monoto
   // TODO: This seems somewhat overkill and inefficient to do often; right now we just do it every few samples
   // but there's probably a better way to do this if we actually track when threads finish
   if (state->sample_count % 100 == 0) remove_context_for_dead_threads(state);
+
+  update_metrics_and_sample(
+    state,
+    /* thread_being_sampled: */ current_thread,
+    /* stack_from_thread: */ profiler_overhead_stack_thread,
+    current_thread_context,
+    cpu_time_now_ns(current_thread_context),
+    monotonic_wall_time_now_ns(RAISE_ON_FAILURE)
+  );
+}
+
+void update_metrics_and_sample(
+  struct cpu_and_wall_time_collector_state *state,
+  VALUE thread_being_sampled,
+  VALUE stack_from_thread, // This can be different when attributing profiler overhead using a different stack
+  struct per_thread_context *thread_context,
+  long current_cpu_time_ns,
+  long current_monotonic_wall_time_ns
+) {
+  long cpu_time_elapsed_ns = update_time_since_previous_sample(
+    &thread_context->cpu_time_at_previous_sample_ns,
+    current_cpu_time_ns,
+    thread_context->gc_tracking.cpu_time_at_start_ns,
+    IS_NOT_WALL_TIME
+  );
+  long wall_time_elapsed_ns = update_time_since_previous_sample(
+    &thread_context->wall_time_at_previous_sample_ns,
+    current_monotonic_wall_time_ns,
+    thread_context->gc_tracking.wall_time_at_start_ns,
+    IS_WALL_TIME
+  );
+
+  int64_t metric_values[ENABLED_VALUE_TYPES_COUNT] = {0};
+
+  metric_values[CPU_TIME_VALUE_POS] = cpu_time_elapsed_ns;
+  metric_values[CPU_SAMPLES_VALUE_POS] = 1;
+  metric_values[WALL_TIME_VALUE_POS] = wall_time_elapsed_ns;
+
+  trigger_sample_for_thread(
+    state,
+    thread_being_sampled,
+    stack_from_thread,
+    thread_context,
+    (ddog_Slice_I64) {.ptr = metric_values, .len = ENABLED_VALUE_TYPES_COUNT},
+    SAMPLE_REGULAR
+  );
 }
 
 // This function gets called when Ruby is about to start running the Garbage Collector on the current thread.
@@ -515,7 +563,8 @@ VALUE cpu_and_wall_time_collector_sample_after_gc(VALUE self_instance) {
 
     trigger_sample_for_thread(
       state,
-      thread,
+      /* thread: */  thread,
+      /* stack_from_thread: */ thread,
       thread_context,
       (ddog_Slice_I64) {.ptr = metric_values, .len = ENABLED_VALUE_TYPES_COUNT},
       SAMPLE_IN_GC
@@ -545,6 +594,7 @@ VALUE cpu_and_wall_time_collector_sample_after_gc(VALUE self_instance) {
 static void trigger_sample_for_thread(
   struct cpu_and_wall_time_collector_state *state,
   VALUE thread,
+  VALUE stack_from_thread, // This can be different when attributing profiler overhead using a different stack
   struct per_thread_context *thread_context,
   ddog_Slice_I64 metric_values_slice,
   sample_type type
@@ -552,6 +602,7 @@ static void trigger_sample_for_thread(
   int max_label_count =
     1 + // thread id
     1 + // thread name
+    1 + // profiler overhead
     2;  // local root span id and span id
   ddog_prof_Label labels[max_label_count];
   int label_pos = 0;
@@ -594,6 +645,13 @@ static void trigger_sample_for_thread(
     }
   }
 
+  if (thread != stack_from_thread) {
+    labels[label_pos++] = (ddog_prof_Label) {
+      .key = DDOG_CHARSLICE_C("profiler overhead"),
+      .num = 1
+    };
+  }
+
   // The number of times `label_pos++` shows up in this function needs to match `max_label_count`. To avoid "oops I
   // forgot to update max_label_count" in the future, we've also added this validation.
   // @ivoanjo: I wonder if C compilers are smart enough to statically prove when this check never triggers happens and
@@ -603,7 +661,7 @@ static void trigger_sample_for_thread(
   }
 
   sample_thread(
-    thread,
+    stack_from_thread,
     state->sampling_buffer,
     state->recorder_instance,
     metric_values_slice,
diff --git a/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.h b/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.h
index 609a3d4757f..688d8389987 100644
--- a/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.h
+++ b/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.h
@@ -2,7 +2,11 @@
 
 #include <ruby.h>
 
-void cpu_and_wall_time_collector_sample(VALUE self_instance, long current_monotonic_wall_time_ns);
+void cpu_and_wall_time_collector_sample(
+  VALUE self_instance,
+  long current_monotonic_wall_time_ns,
+  VALUE profiler_overhead_stack_thread
+);
 VALUE cpu_and_wall_time_collector_sample_after_gc(VALUE self_instance);
 void cpu_and_wall_time_collector_on_gc_start(VALUE self_instance);
 void cpu_and_wall_time_collector_on_gc_finish(VALUE self_instance);
diff --git a/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c b/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c
index 422bd376486..a5d56a67e02 100644
--- a/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c
+++ b/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c
@@ -481,7 +481,8 @@ static VALUE rescued_sample_from_postponed_job(VALUE self_instance) {
 
   state->stats.sampled++;
 
-  cpu_and_wall_time_collector_sample(state->cpu_and_wall_time_collector_instance, wall_time_ns_before_sample);
+  VALUE profiler_overhead_stack_thread = state->owner_thread; // Used to attribute profiler overhead to a different stack
+  cpu_and_wall_time_collector_sample(state->cpu_and_wall_time_collector_instance, wall_time_ns_before_sample, profiler_overhead_stack_thread);
 
   long wall_time_ns_after_sample = monotonic_wall_time_now_ns(RAISE_ON_FAILURE);
   long delta_ns = wall_time_ns_after_sample - wall_time_ns_before_sample;
diff --git a/spec/datadog/profiling/collectors/cpu_and_wall_time_spec.rb b/spec/datadog/profiling/collectors/cpu_and_wall_time_spec.rb
index 2211fb0c59b..d38f361df84 100644
--- a/spec/datadog/profiling/collectors/cpu_and_wall_time_spec.rb
+++ b/spec/datadog/profiling/collectors/cpu_and_wall_time_spec.rb
@@ -15,8 +15,10 @@
   let(:ready_queue) { Queue.new }
   let(:t1) do
     Thread.new(ready_queue) do |ready_queue|
-      ready_queue << true
-      sleep
+      inside_t1 do
+        ready_queue << true
+        sleep
+      end
     end
   end
   let(:t2) do
@@ -47,8 +49,8 @@
     end
   end
 
-  def sample
-    described_class::Testing._native_sample(cpu_and_wall_time_collector)
+  def sample(profiler_overhead_stack_thread: Thread.current)
+    described_class::Testing._native_sample(cpu_and_wall_time_collector, profiler_overhead_stack_thread)
   end
 
   def on_gc_start
@@ -75,6 +77,15 @@ def stats
     described_class::Testing._native_stats(cpu_and_wall_time_collector)
   end
 
+  def inside_t1
+    yield
+  end
+
+  # This method exists only so we can look for its name in the stack trace in a few tests
+  def another_way_of_calling_sample(profiler_overhead_stack_thread: Thread.current)
+    sample(profiler_overhead_stack_thread: profiler_overhead_stack_thread)
+  end
+
   describe '#sample' do
     it 'samples all threads' do
       all_threads = Thread.list
@@ -88,7 +99,7 @@ def stats
     it 'tags the samples with the object ids of the Threads they belong to' do
       sample
 
-      expect(samples.map { |it| object_id_from(it.fetch(:labels).fetch(:'thread id')) })
+      expect(samples.map { |it| object_id_from(it.labels.fetch(:'thread id')) })
         .to include(*[Thread.main, t1, t2, t3].map(&:object_id))
     end
 
@@ -103,9 +114,9 @@ def stats
       t2_sample = samples_for_thread(samples, t2).first
       t3_sample = samples_for_thread(samples, t3).first
 
-      expect(t1_sample).to include(labels: include(:'thread name' => 'thread t1'))
-      expect(t2_sample.fetch(:labels).keys).to_not include(:'thread name')
-      expect(t3_sample).to include(labels: include(:'thread name' => 'thread t3'))
+      expect(t1_sample.labels).to include(:'thread name' => 'thread t1')
+      expect(t2_sample.labels.keys).to_not include(:'thread name')
+      expect(t3_sample.labels).to include(:'thread name' => 'thread t3')
     end
 
     it 'includes the wall-time elapsed between samples' do
@@ -118,7 +129,7 @@ def stats
         per_thread_context.fetch(t1).fetch(:wall_time_at_previous_sample_ns)
 
       t1_samples = samples_for_thread(samples, t1)
-      wall_time = t1_samples.first.fetch(:values).fetch(:'wall-time')
+      wall_time = t1_samples.first.values.fetch(:'wall-time')
 
       expect(t1_samples.size)
         .to be(1), "Expected thread t1 to always have same stack trace (because it's sleeping), got #{t1_samples.inspect}"
@@ -131,7 +142,7 @@ def stats
 
       t1_sample = samples_for_thread(samples, t1).first
 
-      expect(t1_sample).to include(values: include(:'cpu-samples' => 5))
+      expect(t1_sample.values).to include(:'cpu-samples' => 5)
     end
 
     [:before, :after].each do |on_gc_finish_order|
@@ -148,11 +159,13 @@ def stats
 
           wall_time_at_gc_start = per_thread_context.fetch(Thread.current).fetch(:'gc_tracking.wall_time_at_start_ns')
 
-          5.times { sample } # Even though we keep sampling, the result only includes the time until we called on_gc_start
+          # Even though we keep calling sample, the result only includes the time until we called on_gc_start
+          5.times { another_way_of_calling_sample }
 
           total_wall_for_rspec_thread =
             samples_for_thread(samples, Thread.current)
-              .map { |it| it.fetch(:values).fetch(:'wall-time') }
+              .select { |it| it.locations.find { |frame| frame.base_label == 'another_way_of_calling_sample' } }
+              .map { |it| it.values.fetch(:'wall-time') }
               .reduce(:+)
 
           expect(total_wall_for_rspec_thread).to be(wall_time_at_gc_start - wall_time_at_first_sample)
@@ -207,7 +220,7 @@ def stats
           # some data for it
           total_cpu_for_rspec_thread =
             samples_for_thread(samples, Thread.current)
-              .map { |it| it.fetch(:values).fetch(:'cpu-time') }
+              .map { |it| it.values.fetch(:'cpu-time') }
               .reduce(:+)
 
           # The **wall-clock time** spent by the rspec thread is going to be an upper bound for the cpu time spent,
@@ -227,12 +240,13 @@ def stats
 
               cpu_time_at_gc_start = per_thread_context.fetch(Thread.current).fetch(:'gc_tracking.cpu_time_at_start_ns')
 
-              # Even though we keep sampling, the result only includes the time until we called on_gc_start
-              5.times { sample }
+              # Even though we keep calling sample, the result only includes the time until we called on_gc_start
+              5.times { another_way_of_calling_sample }
 
               total_cpu_for_rspec_thread =
                 samples_for_thread(samples, Thread.current)
-                  .map { |it| it.fetch(:values).fetch(:'cpu-time') }
+                  .select { |it| it.locations.find { |frame| frame.base_label == 'another_way_of_calling_sample' } }
+                  .map { |it| it.values.fetch(:'cpu-time') }
                   .reduce(:+)
 
               expect(total_cpu_for_rspec_thread).to be(cpu_time_at_gc_start - cpu_time_at_first_sample)
@@ -274,7 +288,7 @@ def stats
         it 'does not include "local root span id" nor "span id" labels in the samples' do
           sample
 
-          found_labels = t1_sample.fetch(:labels).keys
+          found_labels = t1_sample.labels.keys
 
           expect(found_labels).to_not include(:'local root span id')
           expect(found_labels).to_not include(:'span id')
@@ -347,7 +361,7 @@ def stats
           it 'includes "local root span id" and "span id" labels in the samples' do
             sample
 
-            expect(t1_sample.fetch(:labels)).to include(
+            expect(t1_sample.labels).to include(
               :'local root span id' => @t1_local_root_span_id.to_i,
               :'span id' => @t1_span_id.to_i,
             )
@@ -356,7 +370,7 @@ def stats
           it 'does not include the "trace endpoint" label' do
             sample
 
-            expect(t1_sample.fetch(:labels).keys).to_not include(:'trace endpoint')
+            expect(t1_sample.labels.keys).to_not include(:'trace endpoint')
           end
 
           context 'when local root span type is web' do
@@ -365,7 +379,7 @@ def stats
             it 'includes the "trace endpoint" label in the samples' do
               sample
 
-              expect(t1_sample.fetch(:labels)).to include(:'trace endpoint' => 'profiler.test')
+              expect(t1_sample.labels).to include(:'trace endpoint' => 'profiler.test')
             end
 
             describe 'trace vs root span resource mutation' do
@@ -392,7 +406,7 @@ def stats
                 it 'includes the "trace endpoint" label in the samples with the root span resource' do
                   sample
 
-                  expect(t1_sample.fetch(:labels)).to include(:'trace endpoint' => 'root_span_resource')
+                  expect(t1_sample.labels).to include(:'trace endpoint' => 'root_span_resource')
                 end
               end
 
@@ -403,7 +417,7 @@ def stats
                 it 'includes the "trace endpoint" label in the samples with the trace resource' do
                   sample
 
-                  expect(t1_sample.fetch(:labels)).to include(:'trace endpoint' => 'trace_resource')
+                  expect(t1_sample.labels).to include(:'trace endpoint' => 'trace_resource')
                 end
               end
 
@@ -414,7 +428,7 @@ def stats
                 it 'does not include the "trace endpoint" label' do
                   sample
 
-                  expect(t1_sample.fetch(:labels).keys).to_not include(:'trace endpoint')
+                  expect(t1_sample.labels.keys).to_not include(:'trace endpoint')
                 end
               end
             end
@@ -431,8 +445,8 @@ def stats
                 t1_samples = samples_for_thread(samples, t1)
 
                 expect(t1_samples).to have(1).item
-                expect(t1_samples.first.fetch(:labels)).to include(:'trace endpoint' => 'changed_after_first_sample')
-                expect(t1_samples.first.fetch(:values)).to include(:'cpu-samples' => 2)
+                expect(t1_samples.first.labels).to include(:'trace endpoint' => 'changed_after_first_sample')
+                expect(t1_samples.first.values).to include(:'cpu-samples' => 2)
               end
 
               context 'when the resource is changed multiple times' do
@@ -446,8 +460,8 @@ def stats
                   t1_samples = samples_for_thread(samples, t1)
 
                   expect(t1_samples).to have(1).item
-                  expect(t1_samples.first.fetch(:labels)).to include(:'trace endpoint' => 'changed_after_second_sample')
-                  expect(t1_samples.first.fetch(:values)).to include(:'cpu-samples' => 3)
+                  expect(t1_samples.first.labels).to include(:'trace endpoint' => 'changed_after_second_sample')
+                  expect(t1_samples.first.values).to include(:'cpu-samples' => 3)
                 end
               end
             end
@@ -455,6 +469,48 @@ def stats
         end
       end
     end
+
+    # This is a bit weird, but what we're doing here is using the stack from a different thread to represent the
+    # profiler overhead. In practice, the "different thread" will be the Collectors::CpuAndWallTimeWorker thread.
+    #
+    # Thus, what happens is, when we sample _once_, two samples will show up for the thread **that calls sample**:
+    # * The regular stack
+    # * The stack from the other thread
+    #
+    # E.g. if 1s elapsed since the last sample, and sampling takes 500ms:
+    # * The regular stack will have 1s attributed to it
+    # * The stack from the other thread will have 500ms attributed to it.
+    #
+    # This way it's clear what overhead comes from profiling. Without this feature (aka if profiler_overhead_stack_thread
+    # is set to Thread.current), then all 1.5s get attributed to the current stack, and the profiler overhead would be
+    # invisible.
+    it 'attributes the time sampling to the stack of the worker_thread_to_blame' do
+      sample
+      wall_time_at_first_sample = per_thread_context.fetch(Thread.current).fetch(:wall_time_at_previous_sample_ns)
+
+      another_way_of_calling_sample(profiler_overhead_stack_thread: t1)
+      wall_time_at_second_sample = per_thread_context.fetch(Thread.current).fetch(:wall_time_at_previous_sample_ns)
+
+      second_sample_stack =
+        samples_for_thread(samples, Thread.current)
+          .select { |it| it.locations.find { |frame| frame.base_label == 'another_way_of_calling_sample' } }
+
+      # The stack from the profiler_overhead_stack_thread (t1) above has showed up attributed to Thread.current, as we
+      # are using it to represent the profiler overhead.
+      profiler_overhead_stack =
+        samples_for_thread(samples, Thread.current)
+          .select { |it| it.locations.find { |frame| frame.base_label == 'inside_t1' } }
+
+      expect(second_sample_stack.size).to be 1
+      expect(profiler_overhead_stack.size).to be 1
+
+      expect(
+        second_sample_stack.first.values.fetch(:'wall-time') + profiler_overhead_stack.first.values.fetch(:'wall-time')
+      ).to be wall_time_at_second_sample - wall_time_at_first_sample
+
+      expect(second_sample_stack.first.labels).to_not include(:'profiler overhead' => anything)
+      expect(profiler_overhead_stack.first.labels).to include(:'profiler overhead' => 1)
+    end
   end
 
   describe '#on_gc_start' do
@@ -626,7 +682,7 @@ def stats
   end
 
   describe '#sample_after_gc' do
-    let(:gc_samples) { samples.select { |it| it.fetch(:locations).first.fetch(:path) == 'Garbage Collection' } }
+    let(:gc_samples) { samples.select { |it| it.locations.first.path == 'Garbage Collection' } }
 
     before { sample }
 
@@ -674,7 +730,7 @@ def stats
       it 'samples the thread with recorded gc start and finish time, marking it as being in Garbage Collection' do
         sample_after_gc
 
-        expect(object_id_from(gc_sample.fetch(:labels).fetch(:'thread id'))).to eq Thread.current.object_id
+        expect(object_id_from(gc_sample.labels.fetch(:'thread id'))).to eq Thread.current.object_id
       end
 
       it 'samples the thread with recorded gc start and finish time, recording the times between gc start and finish' do
@@ -685,7 +741,7 @@ def stats
 
         sample_after_gc
 
-        expect(gc_sample.fetch(:values)).to include(
+        expect(gc_sample.values).to include(
           :"cpu-samples" => 1,
           :'cpu-time' => cpu_time_at_finish_ns - cpu_time_at_start_ns,
           :"wall-time" => wall_time_at_finish_ns - wall_time_at_start_ns,
@@ -719,7 +775,7 @@ def stats
 
         sample_after_gc
 
-        wall_time_spent_in_gc = gc_sample.fetch(:values).fetch(:'wall-time')
+        wall_time_spent_in_gc = gc_sample.values.fetch(:'wall-time')
 
         expect(per_thread_context.fetch(Thread.current)).to include(
           wall_time_at_previous_sample_ns: wall_time_at_previous_sample_ns_before + wall_time_spent_in_gc
@@ -750,7 +806,7 @@ def stats
 
             sample_after_gc
 
-            cpu_time_spent_in_gc = gc_sample.fetch(:values).fetch(:'cpu-time')
+            cpu_time_spent_in_gc = gc_sample.values.fetch(:'cpu-time')
 
             expect(per_thread_context.fetch(Thread.current)).to include(
               cpu_time_at_previous_sample_ns: cpu_time_at_previous_sample_ns_before + cpu_time_spent_in_gc
diff --git a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb
index 1b85376412f..f4763a41f4a 100644
--- a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb
+++ b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb
@@ -129,7 +129,7 @@
       start
 
       all_samples = try_wait_until do
-        samples = samples_from_pprof_without_gc(recorder.serialize!)
+        samples = samples_from_pprof_without_gc_and_overhead(recorder.serialize!)
         samples if samples.any?
       end
 
@@ -143,14 +143,16 @@
       start
 
       all_samples = try_wait_until do
-        samples = samples_from_pprof_without_gc(recorder.serialize!)
+        samples = samples_from_pprof_without_gc_and_overhead(recorder.serialize!)
         samples if samples.any?
       end
 
       cpu_and_wall_time_worker.stop
 
       sample_count =
-        samples_for_thread(all_samples, Thread.current).map { |it| it.fetch(:values).fetch(:'cpu-samples') }.reduce(:+)
+        samples_for_thread(all_samples, Thread.current)
+          .map { |it| it.values.fetch(:'cpu-samples') }
+          .reduce(:+)
 
       stats = cpu_and_wall_time_worker.stats
 
@@ -163,7 +165,7 @@
       start
 
       try_wait_until do
-        samples = samples_from_pprof_without_gc(recorder.serialize!)
+        samples = samples_from_pprof_without_gc_and_overhead(recorder.serialize!)
         samples if samples.any?
       end
 
@@ -210,14 +212,14 @@
 
       current_thread_gc_samples =
         samples_for_thread(all_samples, Thread.current)
-          .select { |it| it.fetch(:locations).first.fetch(:path) == 'Garbage Collection' }
+          .select { |it| it.locations.first.path == 'Garbage Collection' }
 
       # NOTE: In some cases, Ruby may actually call two GC's back-to-back without us having the possibility to take
       # a sample. I don't expect this to happen for this test (that's what the `Thread.pass` above is trying to avoid)
       # but if this spec turns out to be flaky, that is probably the issue, and that would mean we'd need to relax the
       # check.
       expect(
-        current_thread_gc_samples.inject(0) { |sum, sample| sum + sample.fetch(:values).fetch(:'cpu-samples') }
+        current_thread_gc_samples.inject(0) { |sum, sample| sum + sample.values.fetch(:'cpu-samples') }
       ).to be >= invoke_gc_times
     end
 
@@ -294,8 +296,8 @@
         cpu_and_wall_time_worker.stop
         background_thread.kill
 
-        result = samples_for_thread(samples_from_pprof_without_gc(recorder.serialize!), Thread.current)
-        sample_count = result.map { |it| it.fetch(:values).fetch(:'cpu-samples') }.reduce(:+)
+        result = samples_for_thread(samples_from_pprof_without_gc_and_overhead(recorder.serialize!), Thread.current)
+        sample_count = result.map { |it| it.values.fetch(:'cpu-samples') }.reduce(:+)
 
         stats = cpu_and_wall_time_worker.stats
 
@@ -323,8 +325,8 @@
 
         cpu_and_wall_time_worker.stop
 
-        result = samples_for_thread(samples_from_pprof_without_gc(recorder.serialize!), Thread.current)
-        sample_count = result.map { |it| it.fetch(:values).fetch(:'cpu-samples') }.reduce(:+)
+        result = samples_for_thread(samples_from_pprof_without_gc_and_overhead(recorder.serialize!), Thread.current)
+        sample_count = result.map { |it| it.values.fetch(:'cpu-samples') }.reduce(:+)
 
         stats = cpu_and_wall_time_worker.stats
         debug_failures = { thread_list: Thread.list, result: result }
@@ -364,7 +366,7 @@
 
         samples_from_ractor =
           samples_from_pprof(recorder.serialize!)
-            .select { |it| it.fetch(:labels)[:'thread name'] == 'background ractor' }
+            .select { |it| it.labels[:'thread name'] == 'background ractor' }
 
         expect(samples_from_ractor).to be_empty
       end
@@ -537,11 +539,13 @@ def wait_until_running
     try_wait_until(backoff: 0.01) { described_class::Testing._native_is_running?(cpu_and_wall_time_worker) }
   end
 
-  # This is useful because in a bunch of tests above we want to assert on properties of the period sampling, and having
-  # a random GC in the middle of the spec contribute a sample can throw off the expected values and counts.
+  # This is useful because in a bunch of tests above we want to assert on properties of the samples, and having GC
+  # and profiler overhead samples is a source of randomness which causes flakiness in the assertions.
   #
-  # We have separate specs that assert on the GC behaviors.
-  def samples_from_pprof_without_gc(pprof_data)
-    samples_from_pprof(pprof_data).reject { |it| it.fetch(:locations).first.fetch(:path) == 'Garbage Collection' }
+  # We have separate specs that assert on these behaviors.
+  def samples_from_pprof_without_gc_and_overhead(pprof_data)
+    samples_from_pprof(pprof_data)
+      .reject { |it| it.locations.first.path == 'Garbage Collection' }
+      .reject { |it| it.labels.include?(:'profiler overhead') }
   end
 end
diff --git a/spec/datadog/profiling/collectors/stack_spec.rb b/spec/datadog/profiling/collectors/stack_spec.rb
index 024ae7128ab..ecd5b6389d4 100644
--- a/spec/datadog/profiling/collectors/stack_spec.rb
+++ b/spec/datadog/profiling/collectors/stack_spec.rb
@@ -34,7 +34,7 @@ def sample(thread, recorder_instance, metric_values_hash, labels_array, max_fram
     let(:reference_stack) do
       # To make the stacks comparable we slice off the actual Ruby `Thread#backtrace_locations` frame since that part
       # will necessarily be different
-      expect(super().first).to match(hash_including(base_label: 'backtrace_locations'))
+      expect(super().first.base_label).to eq 'backtrace_locations'
       super()[1..-1]
     end
 
@@ -43,9 +43,9 @@ def sample(thread, recorder_instance, metric_values_hash, labels_array, max_fram
       # also necessarily be different
       expect(super()[0..2]).to match(
         [
-          hash_including(base_label: '_native_sample'),
-          hash_including(base_label: 'sample'),
-          hash_including(base_label: 'sample_and_decode'),
+          have_attributes(base_label: '_native_sample'),
+          have_attributes(base_label: 'sample'),
+          have_attributes(base_label: 'sample_and_decode'),
         ]
       )
       super()[3..-1]
@@ -63,7 +63,7 @@ def sample(thread, recorder_instance, metric_values_hash, labels_array, max_fram
       let(:in_gc) { true }
 
       it 'includes a placeholder frame for garbage collection' do
-        expect(stacks.fetch(:gathered)[0]).to eq({ base_label: '', path: 'Garbage Collection', lineno: 0 })
+        expect(stacks.fetch(:gathered)[0]).to have_attributes(base_label: '', path: 'Garbage Collection', lineno: 0)
       end
 
       it 'matches the Ruby backtrace API' do
@@ -105,7 +105,7 @@ def sample(thread, recorder_instance, metric_values_hash, labels_array, max_fram
       end
 
       it 'has a sleeping frame at the top of the stack' do
-        expect(reference_stack.first).to match(hash_including(base_label: 'sleep'))
+        expect(reference_stack.first.base_label).to eq 'sleep'
       end
     end
 
@@ -128,9 +128,9 @@ def sample(thread, recorder_instance, metric_values_hash, labels_array, max_fram
 
       it 'has eval frames on the stack' do
         expect(reference_stack[0..2]).to contain_exactly(
-          hash_including(base_label: 'sleep', path: '(eval)'),
-          hash_including(base_label: '<top (required)>', path: '(eval)'),
-          hash_including(base_label: 'eval', path: end_with('stack_spec.rb')),
+          have_attributes(base_label: 'sleep', path: '(eval)'),
+          have_attributes(base_label: '<top (required)>', path: '(eval)'),
+          have_attributes(base_label: 'eval', path: end_with('stack_spec.rb')),
         )
       end
     end
@@ -172,8 +172,8 @@ def call_sleep
           # These two frames are the frames that get created with the evaluation of the string, e.g. if instead of
           # `eval("foo")` we did `eval { foo }` then it is the block containing foo; eval with a string works similarly,
           # although you don't see a block there.
-          hash_including(base_label: 'call_eval', path: '(eval)', lineno: 1),
-          hash_including(base_label: 'call_instance_eval', path: '(eval)', lineno: 1),
+          have_attributes(base_label: 'call_eval', path: '(eval)', lineno: 1),
+          have_attributes(base_label: 'call_instance_eval', path: '(eval)', lineno: 1),
         )
       end
     end
@@ -191,7 +191,7 @@ def call_sleep
 
       it 'has a frame with the custom file and line provided on the stack' do
         expect(reference_stack).to include(
-          hash_including(path: '/this/is/a/fake_file_.rb', lineno: -123456789),
+          have_attributes(path: '/this/is/a/fake_file_.rb', lineno: -123456789),
         )
       end
     end
@@ -214,7 +214,7 @@ def call_sleep
       # I opted to join these two expects to avoid running the `load` above more than once
       it 'matches the Ruby backtrace API AND has a sleeping frame at the top of the stack' do
         expect(gathered_stack).to eq reference_stack
-        expect(reference_stack.first).to match(hash_including(base_label: 'sleep'))
+        expect(reference_stack.first.base_label).to eq 'sleep'
       end
     end
   end
@@ -245,8 +245,7 @@ def call_sleep
       omitted_frames = target_stack_depth - max_frames + placeholder
 
       expect(omitted_frames).to be 96
-      expect(gathered_stack.last)
-        .to match(hash_including({ base_label: '', path: '96 frames omitted', lineno: 0 }))
+      expect(gathered_stack.last).to have_attributes(base_label: '', path: '96 frames omitted', lineno: 0)
     end
 
     context 'when stack is exactly 1 item deeper than the configured max_frames' do
@@ -255,8 +254,7 @@ def call_sleep
       it 'includes a placeholder frame stating that 2 frames were omitted' do
         # Why 2 frames omitted and not 1? That's because the placeholder takes over 1 space in the buffer, so
         # if there were 6 frames on the stack and the limit is 5, then 4 of those frames will be present in the output
-        expect(gathered_stack.last)
-          .to match(hash_including({ base_label: '', path: '2 frames omitted', lineno: 0 }))
+        expect(gathered_stack.last).to have_attributes(base_label: '', path: '2 frames omitted', lineno: 0)
       end
     end
 
@@ -286,9 +284,8 @@ def call_sleep
         omitted_frames = target_stack_depth - max_frames + placeholder + garbage_collection
 
         expect(omitted_frames).to be 97
-        expect(gathered_stack.last)
-          .to match(hash_including({ base_label: '', path: '97 frames omitted', lineno: 0 }))
-        expect(gathered_stack.first).to match(hash_including(base_label: '', path: 'Garbage Collection', lineno: 0))
+        expect(gathered_stack.last).to have_attributes(base_label: '', path: '97 frames omitted', lineno: 0)
+        expect(gathered_stack.first).to have_attributes(base_label: '', path: 'Garbage Collection', lineno: 0)
       end
 
       context 'when stack is exactly one item less as deep as the configured max_frames' do
@@ -319,7 +316,7 @@ def call_sleep
         # @ivoanjo: I... don't think this can happen in practice. It's debatable if we should still have the placeholder
         # frame or not, but for ease of implementation I chose this path, and I added this spec just to get coverage on
         # this corner case.
-        expect(gathered_stack).to contain_exactly({ base_label: '', path: 'Garbage Collection', lineno: 0 })
+        expect(gathered_stack).to contain_exactly(have_attributes(base_label: '', path: 'Garbage Collection', lineno: 0))
       end
     end
   end
@@ -371,7 +368,7 @@ def call_sleep
     end
 
     it 'gathers a one-element stack with a "In native code" placeholder' do
-      expect(gathered_stack).to contain_exactly({ base_label: '', path: 'In native code', lineno: 0 })
+      expect(gathered_stack).to contain_exactly(have_attributes(base_label: '', path: 'In native code', lineno: 0))
     end
 
     context 'when marking sample as being in garbage collection' do
@@ -379,8 +376,8 @@ def call_sleep
 
       it 'gathers a two-element stack with a placeholder for "In native code" and another for garbage collection' do
         expect(gathered_stack).to contain_exactly(
-          { base_label: '', path: 'Garbage Collection', lineno: 0 },
-          { base_label: '', path: 'In native code', lineno: 0 }
+          have_attributes(base_label: '', path: 'Garbage Collection', lineno: 0),
+          have_attributes(base_label: '', path: 'In native code', lineno: 0),
         )
       end
     end
@@ -412,7 +409,7 @@ def call_sleep
 
   def convert_reference_stack(raw_reference_stack)
     raw_reference_stack.map do |location|
-      { base_label: location.base_label, path: location.path, lineno: location.lineno }
+      ProfileHelpers::Frame.new(location.base_label, location.path, location.lineno).freeze
     end
   end
 
@@ -422,7 +419,7 @@ def sample_and_decode(thread, max_frames: 400, recorder: Datadog::Profiling::Sta
     samples = samples_from_pprof(recorder.serialize!)
 
     expect(samples.size).to be 1
-    samples.first.fetch(:locations)
+    samples.first.locations
   end
 end
 
diff --git a/spec/datadog/profiling/spec_helper.rb b/spec/datadog/profiling/spec_helper.rb
index d2bfea17863..2bae601930a 100644
--- a/spec/datadog/profiling/spec_helper.rb
+++ b/spec/datadog/profiling/spec_helper.rb
@@ -5,6 +5,9 @@
 module ProfileHelpers
   include Kernel
 
+  Sample = Struct.new(:locations, :values, :labels) # rubocop:disable Lint/StructNewOverride
+  Frame = Struct.new(:base_label, :path, :lineno)
+
   def build_stack_sample(
     locations: nil,
     thread_id: nil,
@@ -59,16 +62,16 @@ def samples_from_pprof(pprof_data)
     pretty_sample_types = decoded_profile.sample_type.map { |it| string_table[it.type].to_sym }
 
     decoded_profile.sample.map do |sample|
-      {
-        locations: sample.location_id.map { |location_id| decode_frame_from_pprof(decoded_profile, location_id) },
-        values: pretty_sample_types.zip(sample.value).to_h,
-        labels: sample.label.map do |it|
+      Sample.new(
+        sample.location_id.map { |location_id| decode_frame_from_pprof(decoded_profile, location_id) },
+        pretty_sample_types.zip(sample.value).to_h,
+        sample.label.map do |it|
           [
             string_table[it.key].to_sym,
             it.num == 0 ? string_table[it.str] : it.num,
           ]
         end.to_h,
-      }
+      ).freeze
     end
   end
 
@@ -80,7 +83,7 @@ def decode_frame_from_pprof(decoded_profile, location_id)
     line_entry = location.line.first
     function = decoded_profile.function.find { |func| func.id == line_entry.function_id }
 
-    { base_label: strings[function.name], path: strings[function.filename], lineno: line_entry.line }
+    Frame.new(strings[function.name], strings[function.filename], line_entry.line).freeze
   end
 
   def object_id_from(thread_id)
@@ -88,7 +91,7 @@ def object_id_from(thread_id)
   end
 
   def samples_for_thread(samples, thread)
-    samples.select { |sample| object_id_from(sample.fetch(:labels).fetch(:'thread id')) == thread.object_id }
+    samples.select { |sample| object_id_from(sample.labels.fetch(:'thread id')) == thread.object_id }
   end
 end
 
diff --git a/spec/datadog/profiling/stack_recorder_spec.rb b/spec/datadog/profiling/stack_recorder_spec.rb
index a7db92da3ba..9c5e650292a 100644
--- a/spec/datadog/profiling/stack_recorder_spec.rb
+++ b/spec/datadog/profiling/stack_recorder_spec.rb
@@ -151,11 +151,11 @@ def sample_types_from(decoded_profile)
       end
 
       it 'encodes the sample with the metrics provided' do
-        expect(samples.first).to include(values: { :'cpu-time' => 123, :'cpu-samples' => 456, :'wall-time' => 789 })
+        expect(samples.first.values).to eq(:'cpu-time' => 123, :'cpu-samples' => 456, :'wall-time' => 789)
       end
 
       it 'encodes the sample with the labels provided' do
-        expect(samples.first).to include(labels: { label_a: 'value_a', label_b: 'value_b' })
+        expect(samples.first.labels).to eq(label_a: 'value_a', label_b: 'value_b')
       end
 
       it 'encodes a single empty mapping' do