diff --git a/benchmarks/profiler_sample_loop_v2.rb b/benchmarks/profiler_sample_loop_v2.rb index 0ed11b59c68..0292892804a 100644 --- a/benchmarks/profiler_sample_loop_v2.rb +++ b/benchmarks/profiler_sample_loop_v2.rb @@ -13,6 +13,10 @@ # This benchmark measures the performance of the main stack sampling loop of the profiler class ProfilerSampleLoopBenchmark + # This is needed because we're directly invoking the CpuAndWallTime collector through a testing interface; in normal + # use a profiler thread is automatically used. + PROFILER_OVERHEAD_STACK_THREAD = Thread.new { sleep } + def create_profiler @recorder = Datadog::Profiling::StackRecorder.new @collector = Datadog::Profiling::Collectors::CpuAndWallTime.new(recorder: @recorder, max_frames: 400, tracer: nil) @@ -36,7 +40,7 @@ def run_benchmark x.config(**benchmark_time, suite: report_to_dogstatsd_if_enabled_via_environment_variable(benchmark_name: 'profiler_sample_loop_v2')) x.report("stack collector #{ENV['CONFIG']}") do - Datadog::Profiling::Collectors::CpuAndWallTime::Testing._native_sample(@collector) + Datadog::Profiling::Collectors::CpuAndWallTime::Testing._native_sample(@collector, PROFILER_OVERHEAD_STACK_THREAD) end x.save! 'profiler-sample-loop-v2-results.json' unless VALIDATE_BENCHMARK_MODE @@ -48,7 +52,7 @@ def run_benchmark def run_forever while true - 1000.times { Datadog::Profiling::Collectors::CpuAndWallTime::Testing._native_sample(@collector) } + 1000.times { Datadog::Profiling::Collectors::CpuAndWallTime::Testing._native_sample(@collector, PROFILER_OVERHEAD_STACK_THREAD) } @recorder.serialize print '.' end diff --git a/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.c b/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.c index 26debd3d2fc..77848ee375a 100644 --- a/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.c +++ b/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.c @@ -136,13 +136,22 @@ static int hash_map_per_thread_context_mark(st_data_t key_thread, st_data_t _val static int hash_map_per_thread_context_free_values(st_data_t _thread, st_data_t value_per_thread_context, st_data_t _argument); static VALUE _native_new(VALUE klass); static VALUE _native_initialize(VALUE self, VALUE collector_instance, VALUE recorder_instance, VALUE max_frames, VALUE tracer_context_key); -static VALUE _native_sample(VALUE self, VALUE collector_instance); +static VALUE _native_sample(VALUE self, VALUE collector_instance, VALUE profiler_overhead_stack_thread); static VALUE _native_on_gc_start(VALUE self, VALUE collector_instance); static VALUE _native_on_gc_finish(VALUE self, VALUE collector_instance); static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance); +void update_metrics_and_sample( + struct cpu_and_wall_time_collector_state *state, + VALUE thread_being_sampled, + VALUE profiler_overhead_stack_thread, + struct per_thread_context *thread_context, + long current_cpu_time_ns, + long current_monotonic_wall_time_ns +); static void trigger_sample_for_thread( struct cpu_and_wall_time_collector_state *state, VALUE thread, + VALUE stack_from_thread, struct per_thread_context *thread_context, ddog_Slice_I64 metric_values_slice, sample_type type @@ -185,7 +194,7 @@ void collectors_cpu_and_wall_time_init(VALUE profiling_module) { rb_define_singleton_method(collectors_cpu_and_wall_time_class, "_native_initialize", _native_initialize, 4); rb_define_singleton_method(collectors_cpu_and_wall_time_class, "_native_inspect", _native_inspect, 1); rb_define_singleton_method(collectors_cpu_and_wall_time_class, "_native_reset_after_fork", _native_reset_after_fork, 1); - rb_define_singleton_method(testing_module, "_native_sample", _native_sample, 1); + rb_define_singleton_method(testing_module, "_native_sample", _native_sample, 2); rb_define_singleton_method(testing_module, "_native_on_gc_start", _native_on_gc_start, 1); rb_define_singleton_method(testing_module, "_native_on_gc_finish", _native_on_gc_finish, 1); rb_define_singleton_method(testing_module, "_native_sample_after_gc", _native_sample_after_gc, 1); @@ -294,8 +303,10 @@ static VALUE _native_initialize(DDTRACE_UNUSED VALUE _self, VALUE collector_inst // This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTime behavior using RSpec. // It SHOULD NOT be used for other purposes. -static VALUE _native_sample(DDTRACE_UNUSED VALUE _self, VALUE collector_instance) { - cpu_and_wall_time_collector_sample(collector_instance, monotonic_wall_time_now_ns(RAISE_ON_FAILURE)); +static VALUE _native_sample(DDTRACE_UNUSED VALUE _self, VALUE collector_instance, VALUE profiler_overhead_stack_thread) { + if (!is_thread_alive(profiler_overhead_stack_thread)) rb_raise(rb_eArgError, "Unexpected: profiler_overhead_stack_thread is not alive"); + + cpu_and_wall_time_collector_sample(collector_instance, monotonic_wall_time_now_ns(RAISE_ON_FAILURE), profiler_overhead_stack_thread); return Qtrue; } @@ -327,10 +338,17 @@ static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_ // Assumption 3: This function IS NOT called from a signal handler. This function is not async-signal-safe. // Assumption 4: This function IS NOT called in a reentrant way. // Assumption 5: This function is called from the main Ractor (if Ruby has support for Ractors). -void cpu_and_wall_time_collector_sample(VALUE self_instance, long current_monotonic_wall_time_ns) { +// +// The `profiler_overhead_stack_thread` is used to attribute the profiler overhead to a stack borrowed from a different thread +// (belonging to ddtrace), so that the overhead is visible in the profile rather than blamed on user code. +void cpu_and_wall_time_collector_sample(VALUE self_instance, long current_monotonic_wall_time_ns, VALUE profiler_overhead_stack_thread) { struct cpu_and_wall_time_collector_state *state; TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_collector_state, &cpu_and_wall_time_collector_typed_data, state); + VALUE current_thread = rb_thread_current(); + struct per_thread_context *current_thread_context = get_or_create_context_for(current_thread, state); + long cpu_time_at_sample_start_for_current_thread = cpu_time_now_ns(current_thread_context); + VALUE threads = ddtrace_thread_list(); const long thread_count = RARRAY_LEN(threads); @@ -338,33 +356,17 @@ void cpu_and_wall_time_collector_sample(VALUE self_instance, long current_monoto VALUE thread = RARRAY_AREF(threads, i); struct per_thread_context *thread_context = get_or_create_context_for(thread, state); - long current_cpu_time_ns = cpu_time_now_ns(thread_context); + // We account for cpu-time for the current thread in a different way -- we use the cpu-time at sampling start, to avoid + // blaming the time the profiler took on whatever's running on the thread right now + long current_cpu_time_ns = thread != current_thread ? cpu_time_now_ns(thread_context) : cpu_time_at_sample_start_for_current_thread; - long cpu_time_elapsed_ns = update_time_since_previous_sample( - &thread_context->cpu_time_at_previous_sample_ns, - current_cpu_time_ns, - thread_context->gc_tracking.cpu_time_at_start_ns, - IS_NOT_WALL_TIME - ); - long wall_time_elapsed_ns = update_time_since_previous_sample( - &thread_context->wall_time_at_previous_sample_ns, - current_monotonic_wall_time_ns, - thread_context->gc_tracking.wall_time_at_start_ns, - IS_WALL_TIME - ); - - int64_t metric_values[ENABLED_VALUE_TYPES_COUNT] = {0}; - - metric_values[CPU_TIME_VALUE_POS] = cpu_time_elapsed_ns; - metric_values[CPU_SAMPLES_VALUE_POS] = 1; - metric_values[WALL_TIME_VALUE_POS] = wall_time_elapsed_ns; - - trigger_sample_for_thread( + update_metrics_and_sample( state, - thread, + /* thread_being_sampled: */ thread, + /* stack_from_thread: */ thread, thread_context, - (ddog_Slice_I64) {.ptr = metric_values, .len = ENABLED_VALUE_TYPES_COUNT}, - SAMPLE_REGULAR + current_cpu_time_ns, + current_monotonic_wall_time_ns ); } @@ -373,6 +375,52 @@ void cpu_and_wall_time_collector_sample(VALUE self_instance, long current_monoto // TODO: This seems somewhat overkill and inefficient to do often; right now we just do it every few samples // but there's probably a better way to do this if we actually track when threads finish if (state->sample_count % 100 == 0) remove_context_for_dead_threads(state); + + update_metrics_and_sample( + state, + /* thread_being_sampled: */ current_thread, + /* stack_from_thread: */ profiler_overhead_stack_thread, + current_thread_context, + cpu_time_now_ns(current_thread_context), + monotonic_wall_time_now_ns(RAISE_ON_FAILURE) + ); +} + +void update_metrics_and_sample( + struct cpu_and_wall_time_collector_state *state, + VALUE thread_being_sampled, + VALUE stack_from_thread, // This can be different when attributing profiler overhead using a different stack + struct per_thread_context *thread_context, + long current_cpu_time_ns, + long current_monotonic_wall_time_ns +) { + long cpu_time_elapsed_ns = update_time_since_previous_sample( + &thread_context->cpu_time_at_previous_sample_ns, + current_cpu_time_ns, + thread_context->gc_tracking.cpu_time_at_start_ns, + IS_NOT_WALL_TIME + ); + long wall_time_elapsed_ns = update_time_since_previous_sample( + &thread_context->wall_time_at_previous_sample_ns, + current_monotonic_wall_time_ns, + thread_context->gc_tracking.wall_time_at_start_ns, + IS_WALL_TIME + ); + + int64_t metric_values[ENABLED_VALUE_TYPES_COUNT] = {0}; + + metric_values[CPU_TIME_VALUE_POS] = cpu_time_elapsed_ns; + metric_values[CPU_SAMPLES_VALUE_POS] = 1; + metric_values[WALL_TIME_VALUE_POS] = wall_time_elapsed_ns; + + trigger_sample_for_thread( + state, + thread_being_sampled, + stack_from_thread, + thread_context, + (ddog_Slice_I64) {.ptr = metric_values, .len = ENABLED_VALUE_TYPES_COUNT}, + SAMPLE_REGULAR + ); } // This function gets called when Ruby is about to start running the Garbage Collector on the current thread. @@ -515,7 +563,8 @@ VALUE cpu_and_wall_time_collector_sample_after_gc(VALUE self_instance) { trigger_sample_for_thread( state, - thread, + /* thread: */ thread, + /* stack_from_thread: */ thread, thread_context, (ddog_Slice_I64) {.ptr = metric_values, .len = ENABLED_VALUE_TYPES_COUNT}, SAMPLE_IN_GC @@ -545,6 +594,7 @@ VALUE cpu_and_wall_time_collector_sample_after_gc(VALUE self_instance) { static void trigger_sample_for_thread( struct cpu_and_wall_time_collector_state *state, VALUE thread, + VALUE stack_from_thread, // This can be different when attributing profiler overhead using a different stack struct per_thread_context *thread_context, ddog_Slice_I64 metric_values_slice, sample_type type @@ -552,6 +602,7 @@ static void trigger_sample_for_thread( int max_label_count = 1 + // thread id 1 + // thread name + 1 + // profiler overhead 2; // local root span id and span id ddog_prof_Label labels[max_label_count]; int label_pos = 0; @@ -594,6 +645,13 @@ static void trigger_sample_for_thread( } } + if (thread != stack_from_thread) { + labels[label_pos++] = (ddog_prof_Label) { + .key = DDOG_CHARSLICE_C("profiler overhead"), + .num = 1 + }; + } + // The number of times `label_pos++` shows up in this function needs to match `max_label_count`. To avoid "oops I // forgot to update max_label_count" in the future, we've also added this validation. // @ivoanjo: I wonder if C compilers are smart enough to statically prove when this check never triggers happens and @@ -603,7 +661,7 @@ static void trigger_sample_for_thread( } sample_thread( - thread, + stack_from_thread, state->sampling_buffer, state->recorder_instance, metric_values_slice, diff --git a/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.h b/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.h index 609a3d4757f..688d8389987 100644 --- a/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.h +++ b/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.h @@ -2,7 +2,11 @@ #include -void cpu_and_wall_time_collector_sample(VALUE self_instance, long current_monotonic_wall_time_ns); +void cpu_and_wall_time_collector_sample( + VALUE self_instance, + long current_monotonic_wall_time_ns, + VALUE profiler_overhead_stack_thread +); VALUE cpu_and_wall_time_collector_sample_after_gc(VALUE self_instance); void cpu_and_wall_time_collector_on_gc_start(VALUE self_instance); void cpu_and_wall_time_collector_on_gc_finish(VALUE self_instance); diff --git a/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c b/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c index 422bd376486..a5d56a67e02 100644 --- a/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c +++ b/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c @@ -481,7 +481,8 @@ static VALUE rescued_sample_from_postponed_job(VALUE self_instance) { state->stats.sampled++; - cpu_and_wall_time_collector_sample(state->cpu_and_wall_time_collector_instance, wall_time_ns_before_sample); + VALUE profiler_overhead_stack_thread = state->owner_thread; // Used to attribute profiler overhead to a different stack + cpu_and_wall_time_collector_sample(state->cpu_and_wall_time_collector_instance, wall_time_ns_before_sample, profiler_overhead_stack_thread); long wall_time_ns_after_sample = monotonic_wall_time_now_ns(RAISE_ON_FAILURE); long delta_ns = wall_time_ns_after_sample - wall_time_ns_before_sample; diff --git a/spec/datadog/profiling/collectors/cpu_and_wall_time_spec.rb b/spec/datadog/profiling/collectors/cpu_and_wall_time_spec.rb index 2211fb0c59b..d38f361df84 100644 --- a/spec/datadog/profiling/collectors/cpu_and_wall_time_spec.rb +++ b/spec/datadog/profiling/collectors/cpu_and_wall_time_spec.rb @@ -15,8 +15,10 @@ let(:ready_queue) { Queue.new } let(:t1) do Thread.new(ready_queue) do |ready_queue| - ready_queue << true - sleep + inside_t1 do + ready_queue << true + sleep + end end end let(:t2) do @@ -47,8 +49,8 @@ end end - def sample - described_class::Testing._native_sample(cpu_and_wall_time_collector) + def sample(profiler_overhead_stack_thread: Thread.current) + described_class::Testing._native_sample(cpu_and_wall_time_collector, profiler_overhead_stack_thread) end def on_gc_start @@ -75,6 +77,15 @@ def stats described_class::Testing._native_stats(cpu_and_wall_time_collector) end + def inside_t1 + yield + end + + # This method exists only so we can look for its name in the stack trace in a few tests + def another_way_of_calling_sample(profiler_overhead_stack_thread: Thread.current) + sample(profiler_overhead_stack_thread: profiler_overhead_stack_thread) + end + describe '#sample' do it 'samples all threads' do all_threads = Thread.list @@ -88,7 +99,7 @@ def stats it 'tags the samples with the object ids of the Threads they belong to' do sample - expect(samples.map { |it| object_id_from(it.fetch(:labels).fetch(:'thread id')) }) + expect(samples.map { |it| object_id_from(it.labels.fetch(:'thread id')) }) .to include(*[Thread.main, t1, t2, t3].map(&:object_id)) end @@ -103,9 +114,9 @@ def stats t2_sample = samples_for_thread(samples, t2).first t3_sample = samples_for_thread(samples, t3).first - expect(t1_sample).to include(labels: include(:'thread name' => 'thread t1')) - expect(t2_sample.fetch(:labels).keys).to_not include(:'thread name') - expect(t3_sample).to include(labels: include(:'thread name' => 'thread t3')) + expect(t1_sample.labels).to include(:'thread name' => 'thread t1') + expect(t2_sample.labels.keys).to_not include(:'thread name') + expect(t3_sample.labels).to include(:'thread name' => 'thread t3') end it 'includes the wall-time elapsed between samples' do @@ -118,7 +129,7 @@ def stats per_thread_context.fetch(t1).fetch(:wall_time_at_previous_sample_ns) t1_samples = samples_for_thread(samples, t1) - wall_time = t1_samples.first.fetch(:values).fetch(:'wall-time') + wall_time = t1_samples.first.values.fetch(:'wall-time') expect(t1_samples.size) .to be(1), "Expected thread t1 to always have same stack trace (because it's sleeping), got #{t1_samples.inspect}" @@ -131,7 +142,7 @@ def stats t1_sample = samples_for_thread(samples, t1).first - expect(t1_sample).to include(values: include(:'cpu-samples' => 5)) + expect(t1_sample.values).to include(:'cpu-samples' => 5) end [:before, :after].each do |on_gc_finish_order| @@ -148,11 +159,13 @@ def stats wall_time_at_gc_start = per_thread_context.fetch(Thread.current).fetch(:'gc_tracking.wall_time_at_start_ns') - 5.times { sample } # Even though we keep sampling, the result only includes the time until we called on_gc_start + # Even though we keep calling sample, the result only includes the time until we called on_gc_start + 5.times { another_way_of_calling_sample } total_wall_for_rspec_thread = samples_for_thread(samples, Thread.current) - .map { |it| it.fetch(:values).fetch(:'wall-time') } + .select { |it| it.locations.find { |frame| frame.base_label == 'another_way_of_calling_sample' } } + .map { |it| it.values.fetch(:'wall-time') } .reduce(:+) expect(total_wall_for_rspec_thread).to be(wall_time_at_gc_start - wall_time_at_first_sample) @@ -207,7 +220,7 @@ def stats # some data for it total_cpu_for_rspec_thread = samples_for_thread(samples, Thread.current) - .map { |it| it.fetch(:values).fetch(:'cpu-time') } + .map { |it| it.values.fetch(:'cpu-time') } .reduce(:+) # The **wall-clock time** spent by the rspec thread is going to be an upper bound for the cpu time spent, @@ -227,12 +240,13 @@ def stats cpu_time_at_gc_start = per_thread_context.fetch(Thread.current).fetch(:'gc_tracking.cpu_time_at_start_ns') - # Even though we keep sampling, the result only includes the time until we called on_gc_start - 5.times { sample } + # Even though we keep calling sample, the result only includes the time until we called on_gc_start + 5.times { another_way_of_calling_sample } total_cpu_for_rspec_thread = samples_for_thread(samples, Thread.current) - .map { |it| it.fetch(:values).fetch(:'cpu-time') } + .select { |it| it.locations.find { |frame| frame.base_label == 'another_way_of_calling_sample' } } + .map { |it| it.values.fetch(:'cpu-time') } .reduce(:+) expect(total_cpu_for_rspec_thread).to be(cpu_time_at_gc_start - cpu_time_at_first_sample) @@ -274,7 +288,7 @@ def stats it 'does not include "local root span id" nor "span id" labels in the samples' do sample - found_labels = t1_sample.fetch(:labels).keys + found_labels = t1_sample.labels.keys expect(found_labels).to_not include(:'local root span id') expect(found_labels).to_not include(:'span id') @@ -347,7 +361,7 @@ def stats it 'includes "local root span id" and "span id" labels in the samples' do sample - expect(t1_sample.fetch(:labels)).to include( + expect(t1_sample.labels).to include( :'local root span id' => @t1_local_root_span_id.to_i, :'span id' => @t1_span_id.to_i, ) @@ -356,7 +370,7 @@ def stats it 'does not include the "trace endpoint" label' do sample - expect(t1_sample.fetch(:labels).keys).to_not include(:'trace endpoint') + expect(t1_sample.labels.keys).to_not include(:'trace endpoint') end context 'when local root span type is web' do @@ -365,7 +379,7 @@ def stats it 'includes the "trace endpoint" label in the samples' do sample - expect(t1_sample.fetch(:labels)).to include(:'trace endpoint' => 'profiler.test') + expect(t1_sample.labels).to include(:'trace endpoint' => 'profiler.test') end describe 'trace vs root span resource mutation' do @@ -392,7 +406,7 @@ def stats it 'includes the "trace endpoint" label in the samples with the root span resource' do sample - expect(t1_sample.fetch(:labels)).to include(:'trace endpoint' => 'root_span_resource') + expect(t1_sample.labels).to include(:'trace endpoint' => 'root_span_resource') end end @@ -403,7 +417,7 @@ def stats it 'includes the "trace endpoint" label in the samples with the trace resource' do sample - expect(t1_sample.fetch(:labels)).to include(:'trace endpoint' => 'trace_resource') + expect(t1_sample.labels).to include(:'trace endpoint' => 'trace_resource') end end @@ -414,7 +428,7 @@ def stats it 'does not include the "trace endpoint" label' do sample - expect(t1_sample.fetch(:labels).keys).to_not include(:'trace endpoint') + expect(t1_sample.labels.keys).to_not include(:'trace endpoint') end end end @@ -431,8 +445,8 @@ def stats t1_samples = samples_for_thread(samples, t1) expect(t1_samples).to have(1).item - expect(t1_samples.first.fetch(:labels)).to include(:'trace endpoint' => 'changed_after_first_sample') - expect(t1_samples.first.fetch(:values)).to include(:'cpu-samples' => 2) + expect(t1_samples.first.labels).to include(:'trace endpoint' => 'changed_after_first_sample') + expect(t1_samples.first.values).to include(:'cpu-samples' => 2) end context 'when the resource is changed multiple times' do @@ -446,8 +460,8 @@ def stats t1_samples = samples_for_thread(samples, t1) expect(t1_samples).to have(1).item - expect(t1_samples.first.fetch(:labels)).to include(:'trace endpoint' => 'changed_after_second_sample') - expect(t1_samples.first.fetch(:values)).to include(:'cpu-samples' => 3) + expect(t1_samples.first.labels).to include(:'trace endpoint' => 'changed_after_second_sample') + expect(t1_samples.first.values).to include(:'cpu-samples' => 3) end end end @@ -455,6 +469,48 @@ def stats end end end + + # This is a bit weird, but what we're doing here is using the stack from a different thread to represent the + # profiler overhead. In practice, the "different thread" will be the Collectors::CpuAndWallTimeWorker thread. + # + # Thus, what happens is, when we sample _once_, two samples will show up for the thread **that calls sample**: + # * The regular stack + # * The stack from the other thread + # + # E.g. if 1s elapsed since the last sample, and sampling takes 500ms: + # * The regular stack will have 1s attributed to it + # * The stack from the other thread will have 500ms attributed to it. + # + # This way it's clear what overhead comes from profiling. Without this feature (aka if profiler_overhead_stack_thread + # is set to Thread.current), then all 1.5s get attributed to the current stack, and the profiler overhead would be + # invisible. + it 'attributes the time sampling to the stack of the worker_thread_to_blame' do + sample + wall_time_at_first_sample = per_thread_context.fetch(Thread.current).fetch(:wall_time_at_previous_sample_ns) + + another_way_of_calling_sample(profiler_overhead_stack_thread: t1) + wall_time_at_second_sample = per_thread_context.fetch(Thread.current).fetch(:wall_time_at_previous_sample_ns) + + second_sample_stack = + samples_for_thread(samples, Thread.current) + .select { |it| it.locations.find { |frame| frame.base_label == 'another_way_of_calling_sample' } } + + # The stack from the profiler_overhead_stack_thread (t1) above has showed up attributed to Thread.current, as we + # are using it to represent the profiler overhead. + profiler_overhead_stack = + samples_for_thread(samples, Thread.current) + .select { |it| it.locations.find { |frame| frame.base_label == 'inside_t1' } } + + expect(second_sample_stack.size).to be 1 + expect(profiler_overhead_stack.size).to be 1 + + expect( + second_sample_stack.first.values.fetch(:'wall-time') + profiler_overhead_stack.first.values.fetch(:'wall-time') + ).to be wall_time_at_second_sample - wall_time_at_first_sample + + expect(second_sample_stack.first.labels).to_not include(:'profiler overhead' => anything) + expect(profiler_overhead_stack.first.labels).to include(:'profiler overhead' => 1) + end end describe '#on_gc_start' do @@ -626,7 +682,7 @@ def stats end describe '#sample_after_gc' do - let(:gc_samples) { samples.select { |it| it.fetch(:locations).first.fetch(:path) == 'Garbage Collection' } } + let(:gc_samples) { samples.select { |it| it.locations.first.path == 'Garbage Collection' } } before { sample } @@ -674,7 +730,7 @@ def stats it 'samples the thread with recorded gc start and finish time, marking it as being in Garbage Collection' do sample_after_gc - expect(object_id_from(gc_sample.fetch(:labels).fetch(:'thread id'))).to eq Thread.current.object_id + expect(object_id_from(gc_sample.labels.fetch(:'thread id'))).to eq Thread.current.object_id end it 'samples the thread with recorded gc start and finish time, recording the times between gc start and finish' do @@ -685,7 +741,7 @@ def stats sample_after_gc - expect(gc_sample.fetch(:values)).to include( + expect(gc_sample.values).to include( :"cpu-samples" => 1, :'cpu-time' => cpu_time_at_finish_ns - cpu_time_at_start_ns, :"wall-time" => wall_time_at_finish_ns - wall_time_at_start_ns, @@ -719,7 +775,7 @@ def stats sample_after_gc - wall_time_spent_in_gc = gc_sample.fetch(:values).fetch(:'wall-time') + wall_time_spent_in_gc = gc_sample.values.fetch(:'wall-time') expect(per_thread_context.fetch(Thread.current)).to include( wall_time_at_previous_sample_ns: wall_time_at_previous_sample_ns_before + wall_time_spent_in_gc @@ -750,7 +806,7 @@ def stats sample_after_gc - cpu_time_spent_in_gc = gc_sample.fetch(:values).fetch(:'cpu-time') + cpu_time_spent_in_gc = gc_sample.values.fetch(:'cpu-time') expect(per_thread_context.fetch(Thread.current)).to include( cpu_time_at_previous_sample_ns: cpu_time_at_previous_sample_ns_before + cpu_time_spent_in_gc diff --git a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb index 1b85376412f..f4763a41f4a 100644 --- a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb +++ b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb @@ -129,7 +129,7 @@ start all_samples = try_wait_until do - samples = samples_from_pprof_without_gc(recorder.serialize!) + samples = samples_from_pprof_without_gc_and_overhead(recorder.serialize!) samples if samples.any? end @@ -143,14 +143,16 @@ start all_samples = try_wait_until do - samples = samples_from_pprof_without_gc(recorder.serialize!) + samples = samples_from_pprof_without_gc_and_overhead(recorder.serialize!) samples if samples.any? end cpu_and_wall_time_worker.stop sample_count = - samples_for_thread(all_samples, Thread.current).map { |it| it.fetch(:values).fetch(:'cpu-samples') }.reduce(:+) + samples_for_thread(all_samples, Thread.current) + .map { |it| it.values.fetch(:'cpu-samples') } + .reduce(:+) stats = cpu_and_wall_time_worker.stats @@ -163,7 +165,7 @@ start try_wait_until do - samples = samples_from_pprof_without_gc(recorder.serialize!) + samples = samples_from_pprof_without_gc_and_overhead(recorder.serialize!) samples if samples.any? end @@ -210,14 +212,14 @@ current_thread_gc_samples = samples_for_thread(all_samples, Thread.current) - .select { |it| it.fetch(:locations).first.fetch(:path) == 'Garbage Collection' } + .select { |it| it.locations.first.path == 'Garbage Collection' } # NOTE: In some cases, Ruby may actually call two GC's back-to-back without us having the possibility to take # a sample. I don't expect this to happen for this test (that's what the `Thread.pass` above is trying to avoid) # but if this spec turns out to be flaky, that is probably the issue, and that would mean we'd need to relax the # check. expect( - current_thread_gc_samples.inject(0) { |sum, sample| sum + sample.fetch(:values).fetch(:'cpu-samples') } + current_thread_gc_samples.inject(0) { |sum, sample| sum + sample.values.fetch(:'cpu-samples') } ).to be >= invoke_gc_times end @@ -294,8 +296,8 @@ cpu_and_wall_time_worker.stop background_thread.kill - result = samples_for_thread(samples_from_pprof_without_gc(recorder.serialize!), Thread.current) - sample_count = result.map { |it| it.fetch(:values).fetch(:'cpu-samples') }.reduce(:+) + result = samples_for_thread(samples_from_pprof_without_gc_and_overhead(recorder.serialize!), Thread.current) + sample_count = result.map { |it| it.values.fetch(:'cpu-samples') }.reduce(:+) stats = cpu_and_wall_time_worker.stats @@ -323,8 +325,8 @@ cpu_and_wall_time_worker.stop - result = samples_for_thread(samples_from_pprof_without_gc(recorder.serialize!), Thread.current) - sample_count = result.map { |it| it.fetch(:values).fetch(:'cpu-samples') }.reduce(:+) + result = samples_for_thread(samples_from_pprof_without_gc_and_overhead(recorder.serialize!), Thread.current) + sample_count = result.map { |it| it.values.fetch(:'cpu-samples') }.reduce(:+) stats = cpu_and_wall_time_worker.stats debug_failures = { thread_list: Thread.list, result: result } @@ -364,7 +366,7 @@ samples_from_ractor = samples_from_pprof(recorder.serialize!) - .select { |it| it.fetch(:labels)[:'thread name'] == 'background ractor' } + .select { |it| it.labels[:'thread name'] == 'background ractor' } expect(samples_from_ractor).to be_empty end @@ -537,11 +539,13 @@ def wait_until_running try_wait_until(backoff: 0.01) { described_class::Testing._native_is_running?(cpu_and_wall_time_worker) } end - # This is useful because in a bunch of tests above we want to assert on properties of the period sampling, and having - # a random GC in the middle of the spec contribute a sample can throw off the expected values and counts. + # This is useful because in a bunch of tests above we want to assert on properties of the samples, and having GC + # and profiler overhead samples is a source of randomness which causes flakiness in the assertions. # - # We have separate specs that assert on the GC behaviors. - def samples_from_pprof_without_gc(pprof_data) - samples_from_pprof(pprof_data).reject { |it| it.fetch(:locations).first.fetch(:path) == 'Garbage Collection' } + # We have separate specs that assert on these behaviors. + def samples_from_pprof_without_gc_and_overhead(pprof_data) + samples_from_pprof(pprof_data) + .reject { |it| it.locations.first.path == 'Garbage Collection' } + .reject { |it| it.labels.include?(:'profiler overhead') } end end diff --git a/spec/datadog/profiling/collectors/stack_spec.rb b/spec/datadog/profiling/collectors/stack_spec.rb index 024ae7128ab..ecd5b6389d4 100644 --- a/spec/datadog/profiling/collectors/stack_spec.rb +++ b/spec/datadog/profiling/collectors/stack_spec.rb @@ -34,7 +34,7 @@ def sample(thread, recorder_instance, metric_values_hash, labels_array, max_fram let(:reference_stack) do # To make the stacks comparable we slice off the actual Ruby `Thread#backtrace_locations` frame since that part # will necessarily be different - expect(super().first).to match(hash_including(base_label: 'backtrace_locations')) + expect(super().first.base_label).to eq 'backtrace_locations' super()[1..-1] end @@ -43,9 +43,9 @@ def sample(thread, recorder_instance, metric_values_hash, labels_array, max_fram # also necessarily be different expect(super()[0..2]).to match( [ - hash_including(base_label: '_native_sample'), - hash_including(base_label: 'sample'), - hash_including(base_label: 'sample_and_decode'), + have_attributes(base_label: '_native_sample'), + have_attributes(base_label: 'sample'), + have_attributes(base_label: 'sample_and_decode'), ] ) super()[3..-1] @@ -63,7 +63,7 @@ def sample(thread, recorder_instance, metric_values_hash, labels_array, max_fram let(:in_gc) { true } it 'includes a placeholder frame for garbage collection' do - expect(stacks.fetch(:gathered)[0]).to eq({ base_label: '', path: 'Garbage Collection', lineno: 0 }) + expect(stacks.fetch(:gathered)[0]).to have_attributes(base_label: '', path: 'Garbage Collection', lineno: 0) end it 'matches the Ruby backtrace API' do @@ -105,7 +105,7 @@ def sample(thread, recorder_instance, metric_values_hash, labels_array, max_fram end it 'has a sleeping frame at the top of the stack' do - expect(reference_stack.first).to match(hash_including(base_label: 'sleep')) + expect(reference_stack.first.base_label).to eq 'sleep' end end @@ -128,9 +128,9 @@ def sample(thread, recorder_instance, metric_values_hash, labels_array, max_fram it 'has eval frames on the stack' do expect(reference_stack[0..2]).to contain_exactly( - hash_including(base_label: 'sleep', path: '(eval)'), - hash_including(base_label: '', path: '(eval)'), - hash_including(base_label: 'eval', path: end_with('stack_spec.rb')), + have_attributes(base_label: 'sleep', path: '(eval)'), + have_attributes(base_label: '', path: '(eval)'), + have_attributes(base_label: 'eval', path: end_with('stack_spec.rb')), ) end end @@ -172,8 +172,8 @@ def call_sleep # These two frames are the frames that get created with the evaluation of the string, e.g. if instead of # `eval("foo")` we did `eval { foo }` then it is the block containing foo; eval with a string works similarly, # although you don't see a block there. - hash_including(base_label: 'call_eval', path: '(eval)', lineno: 1), - hash_including(base_label: 'call_instance_eval', path: '(eval)', lineno: 1), + have_attributes(base_label: 'call_eval', path: '(eval)', lineno: 1), + have_attributes(base_label: 'call_instance_eval', path: '(eval)', lineno: 1), ) end end @@ -191,7 +191,7 @@ def call_sleep it 'has a frame with the custom file and line provided on the stack' do expect(reference_stack).to include( - hash_including(path: '/this/is/a/fake_file_.rb', lineno: -123456789), + have_attributes(path: '/this/is/a/fake_file_.rb', lineno: -123456789), ) end end @@ -214,7 +214,7 @@ def call_sleep # I opted to join these two expects to avoid running the `load` above more than once it 'matches the Ruby backtrace API AND has a sleeping frame at the top of the stack' do expect(gathered_stack).to eq reference_stack - expect(reference_stack.first).to match(hash_including(base_label: 'sleep')) + expect(reference_stack.first.base_label).to eq 'sleep' end end end @@ -245,8 +245,7 @@ def call_sleep omitted_frames = target_stack_depth - max_frames + placeholder expect(omitted_frames).to be 96 - expect(gathered_stack.last) - .to match(hash_including({ base_label: '', path: '96 frames omitted', lineno: 0 })) + expect(gathered_stack.last).to have_attributes(base_label: '', path: '96 frames omitted', lineno: 0) end context 'when stack is exactly 1 item deeper than the configured max_frames' do @@ -255,8 +254,7 @@ def call_sleep it 'includes a placeholder frame stating that 2 frames were omitted' do # Why 2 frames omitted and not 1? That's because the placeholder takes over 1 space in the buffer, so # if there were 6 frames on the stack and the limit is 5, then 4 of those frames will be present in the output - expect(gathered_stack.last) - .to match(hash_including({ base_label: '', path: '2 frames omitted', lineno: 0 })) + expect(gathered_stack.last).to have_attributes(base_label: '', path: '2 frames omitted', lineno: 0) end end @@ -286,9 +284,8 @@ def call_sleep omitted_frames = target_stack_depth - max_frames + placeholder + garbage_collection expect(omitted_frames).to be 97 - expect(gathered_stack.last) - .to match(hash_including({ base_label: '', path: '97 frames omitted', lineno: 0 })) - expect(gathered_stack.first).to match(hash_including(base_label: '', path: 'Garbage Collection', lineno: 0)) + expect(gathered_stack.last).to have_attributes(base_label: '', path: '97 frames omitted', lineno: 0) + expect(gathered_stack.first).to have_attributes(base_label: '', path: 'Garbage Collection', lineno: 0) end context 'when stack is exactly one item less as deep as the configured max_frames' do @@ -319,7 +316,7 @@ def call_sleep # @ivoanjo: I... don't think this can happen in practice. It's debatable if we should still have the placeholder # frame or not, but for ease of implementation I chose this path, and I added this spec just to get coverage on # this corner case. - expect(gathered_stack).to contain_exactly({ base_label: '', path: 'Garbage Collection', lineno: 0 }) + expect(gathered_stack).to contain_exactly(have_attributes(base_label: '', path: 'Garbage Collection', lineno: 0)) end end end @@ -371,7 +368,7 @@ def call_sleep end it 'gathers a one-element stack with a "In native code" placeholder' do - expect(gathered_stack).to contain_exactly({ base_label: '', path: 'In native code', lineno: 0 }) + expect(gathered_stack).to contain_exactly(have_attributes(base_label: '', path: 'In native code', lineno: 0)) end context 'when marking sample as being in garbage collection' do @@ -379,8 +376,8 @@ def call_sleep it 'gathers a two-element stack with a placeholder for "In native code" and another for garbage collection' do expect(gathered_stack).to contain_exactly( - { base_label: '', path: 'Garbage Collection', lineno: 0 }, - { base_label: '', path: 'In native code', lineno: 0 } + have_attributes(base_label: '', path: 'Garbage Collection', lineno: 0), + have_attributes(base_label: '', path: 'In native code', lineno: 0), ) end end @@ -412,7 +409,7 @@ def call_sleep def convert_reference_stack(raw_reference_stack) raw_reference_stack.map do |location| - { base_label: location.base_label, path: location.path, lineno: location.lineno } + ProfileHelpers::Frame.new(location.base_label, location.path, location.lineno).freeze end end @@ -422,7 +419,7 @@ def sample_and_decode(thread, max_frames: 400, recorder: Datadog::Profiling::Sta samples = samples_from_pprof(recorder.serialize!) expect(samples.size).to be 1 - samples.first.fetch(:locations) + samples.first.locations end end diff --git a/spec/datadog/profiling/spec_helper.rb b/spec/datadog/profiling/spec_helper.rb index d2bfea17863..2bae601930a 100644 --- a/spec/datadog/profiling/spec_helper.rb +++ b/spec/datadog/profiling/spec_helper.rb @@ -5,6 +5,9 @@ module ProfileHelpers include Kernel + Sample = Struct.new(:locations, :values, :labels) # rubocop:disable Lint/StructNewOverride + Frame = Struct.new(:base_label, :path, :lineno) + def build_stack_sample( locations: nil, thread_id: nil, @@ -59,16 +62,16 @@ def samples_from_pprof(pprof_data) pretty_sample_types = decoded_profile.sample_type.map { |it| string_table[it.type].to_sym } decoded_profile.sample.map do |sample| - { - locations: sample.location_id.map { |location_id| decode_frame_from_pprof(decoded_profile, location_id) }, - values: pretty_sample_types.zip(sample.value).to_h, - labels: sample.label.map do |it| + Sample.new( + sample.location_id.map { |location_id| decode_frame_from_pprof(decoded_profile, location_id) }, + pretty_sample_types.zip(sample.value).to_h, + sample.label.map do |it| [ string_table[it.key].to_sym, it.num == 0 ? string_table[it.str] : it.num, ] end.to_h, - } + ).freeze end end @@ -80,7 +83,7 @@ def decode_frame_from_pprof(decoded_profile, location_id) line_entry = location.line.first function = decoded_profile.function.find { |func| func.id == line_entry.function_id } - { base_label: strings[function.name], path: strings[function.filename], lineno: line_entry.line } + Frame.new(strings[function.name], strings[function.filename], line_entry.line).freeze end def object_id_from(thread_id) @@ -88,7 +91,7 @@ def object_id_from(thread_id) end def samples_for_thread(samples, thread) - samples.select { |sample| object_id_from(sample.fetch(:labels).fetch(:'thread id')) == thread.object_id } + samples.select { |sample| object_id_from(sample.labels.fetch(:'thread id')) == thread.object_id } end end diff --git a/spec/datadog/profiling/stack_recorder_spec.rb b/spec/datadog/profiling/stack_recorder_spec.rb index a7db92da3ba..9c5e650292a 100644 --- a/spec/datadog/profiling/stack_recorder_spec.rb +++ b/spec/datadog/profiling/stack_recorder_spec.rb @@ -151,11 +151,11 @@ def sample_types_from(decoded_profile) end it 'encodes the sample with the metrics provided' do - expect(samples.first).to include(values: { :'cpu-time' => 123, :'cpu-samples' => 456, :'wall-time' => 789 }) + expect(samples.first.values).to eq(:'cpu-time' => 123, :'cpu-samples' => 456, :'wall-time' => 789) end it 'encodes the sample with the labels provided' do - expect(samples.first).to include(labels: { label_a: 'value_a', label_b: 'value_b' }) + expect(samples.first.labels).to eq(label_a: 'value_a', label_b: 'value_b') end it 'encodes a single empty mapping' do