From 45b0da4285f13b47f48b518ce8c9b6e7ea2e479e Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Wed, 8 Feb 2023 12:26:25 +0000 Subject: [PATCH 1/8] Introduce `Sample` and `Frame` structs in `ProfileHelpers` I got tired of the extra boilerplate from using hashes and introduced two small structs to contain the parsed profile data. --- .../collectors/cpu_and_wall_time_spec.rb | 50 +++++++++---------- .../cpu_and_wall_time_worker_spec.rb | 12 ++--- .../profiling/collectors/stack_spec.rb | 49 +++++++++--------- spec/datadog/profiling/spec_helper.rb | 17 ++++--- spec/datadog/profiling/stack_recorder_spec.rb | 4 +- 5 files changed, 66 insertions(+), 66 deletions(-) diff --git a/spec/datadog/profiling/collectors/cpu_and_wall_time_spec.rb b/spec/datadog/profiling/collectors/cpu_and_wall_time_spec.rb index 4c489647dab..0d7bc081818 100644 --- a/spec/datadog/profiling/collectors/cpu_and_wall_time_spec.rb +++ b/spec/datadog/profiling/collectors/cpu_and_wall_time_spec.rb @@ -88,7 +88,7 @@ def stats it 'tags the samples with the object ids of the Threads they belong to' do sample - expect(samples.map { |it| object_id_from(it.fetch(:labels).fetch(:'thread id')) }) + expect(samples.map { |it| object_id_from(it.labels.fetch(:'thread id')) }) .to include(*[Thread.main, t1, t2, t3].map(&:object_id)) end @@ -103,9 +103,9 @@ def stats t2_sample = samples_for_thread(samples, t2).first t3_sample = samples_for_thread(samples, t3).first - expect(t1_sample).to include(labels: include(:'thread name' => 'thread t1')) - expect(t2_sample.fetch(:labels).keys).to_not include(:'thread name') - expect(t3_sample).to include(labels: include(:'thread name' => 'thread t3')) + expect(t1_sample.labels).to include(:'thread name' => 'thread t1') + expect(t2_sample.labels.keys).to_not include(:'thread name') + expect(t3_sample.labels).to include(:'thread name' => 'thread t3') end it 'includes the wall-time elapsed between samples' do @@ -118,7 +118,7 @@ def stats 
per_thread_context.fetch(t1).fetch(:wall_time_at_previous_sample_ns) t1_samples = samples_for_thread(samples, t1) - wall_time = t1_samples.first.fetch(:values).fetch(:'wall-time') + wall_time = t1_samples.first.values.fetch(:'wall-time') expect(t1_samples.size) .to be(1), "Expected thread t1 to always have same stack trace (because it's sleeping), got #{t1_samples.inspect}" @@ -131,7 +131,7 @@ def stats t1_sample = samples_for_thread(samples, t1).first - expect(t1_sample).to include(values: include(:'cpu-samples' => 5)) + expect(t1_sample.values).to include(:'cpu-samples' => 5) end [:before, :after].each do |on_gc_finish_order| @@ -152,7 +152,7 @@ def stats total_wall_for_rspec_thread = samples_for_thread(samples, Thread.current) - .map { |it| it.fetch(:values).fetch(:'wall-time') } + .map { |it| it.values.fetch(:'wall-time') } .reduce(:+) expect(total_wall_for_rspec_thread).to be(wall_time_at_gc_start - wall_time_at_first_sample) @@ -207,7 +207,7 @@ def stats # some data for it total_cpu_for_rspec_thread = samples_for_thread(samples, Thread.current) - .map { |it| it.fetch(:values).fetch(:'cpu-time') } + .map { |it| it.values.fetch(:'cpu-time') } .reduce(:+) # The **wall-clock time** spent by the rspec thread is going to be an upper bound for the cpu time spent, @@ -232,7 +232,7 @@ def stats total_cpu_for_rspec_thread = samples_for_thread(samples, Thread.current) - .map { |it| it.fetch(:values).fetch(:'cpu-time') } + .map { |it| it.values.fetch(:'cpu-time') } .reduce(:+) expect(total_cpu_for_rspec_thread).to be(cpu_time_at_gc_start - cpu_time_at_first_sample) @@ -274,7 +274,7 @@ def stats it 'does not include "local root span id" nor "span id" labels in the samples' do sample - found_labels = t1_sample.fetch(:labels).keys + found_labels = t1_sample.labels.keys expect(found_labels).to_not include(:'local root span id') expect(found_labels).to_not include(:'span id') @@ -347,7 +347,7 @@ def stats it 'includes "local root span id" and "span id" labels in the samples' 
do sample - expect(t1_sample.fetch(:labels)).to include( + expect(t1_sample.labels).to include( :'local root span id' => @t1_local_root_span_id.to_s, :'span id' => @t1_span_id.to_i, ) @@ -356,7 +356,7 @@ def stats it 'does not include the "trace endpoint" label' do sample - expect(t1_sample.fetch(:labels).keys).to_not include(:'trace endpoint') + expect(t1_sample.labels.keys).to_not include(:'trace endpoint') end context 'when local root span type is web' do @@ -365,7 +365,7 @@ def stats it 'includes the "trace endpoint" label in the samples' do sample - expect(t1_sample.fetch(:labels)).to include(:'trace endpoint' => 'profiler.test') + expect(t1_sample.labels).to include(:'trace endpoint' => 'profiler.test') end describe 'trace vs root span resource mutation' do @@ -392,7 +392,7 @@ def stats it 'includes the "trace endpoint" label in the samples with the root span resource' do sample - expect(t1_sample.fetch(:labels)).to include(:'trace endpoint' => 'root_span_resource') + expect(t1_sample.labels).to include(:'trace endpoint' => 'root_span_resource') end end @@ -403,7 +403,7 @@ def stats it 'includes the "trace endpoint" label in the samples with the trace resource' do sample - expect(t1_sample.fetch(:labels)).to include(:'trace endpoint' => 'trace_resource') + expect(t1_sample.labels).to include(:'trace endpoint' => 'trace_resource') end end @@ -414,7 +414,7 @@ def stats it 'does not include the "trace endpoint" label' do sample - expect(t1_sample.fetch(:labels).keys).to_not include(:'trace endpoint') + expect(t1_sample.labels.keys).to_not include(:'trace endpoint') end end end @@ -431,8 +431,8 @@ def stats t1_samples = samples_for_thread(samples, t1) expect(t1_samples).to have(1).item - expect(t1_samples.first.fetch(:labels)).to include(:'trace endpoint' => 'changed_after_first_sample') - expect(t1_samples.first.fetch(:values)).to include(:'cpu-samples' => 2) + expect(t1_samples.first.labels).to include(:'trace endpoint' => 'changed_after_first_sample') + 
expect(t1_samples.first.values).to include(:'cpu-samples' => 2) end context 'when the resource is changed multiple times' do @@ -446,8 +446,8 @@ def stats t1_samples = samples_for_thread(samples, t1) expect(t1_samples).to have(1).item - expect(t1_samples.first.fetch(:labels)).to include(:'trace endpoint' => 'changed_after_second_sample') - expect(t1_samples.first.fetch(:values)).to include(:'cpu-samples' => 3) + expect(t1_samples.first.labels).to include(:'trace endpoint' => 'changed_after_second_sample') + expect(t1_samples.first.values).to include(:'cpu-samples' => 3) end end end @@ -626,7 +626,7 @@ def stats end describe '#sample_after_gc' do - let(:gc_samples) { samples.select { |it| it.fetch(:locations).first.fetch(:path) == 'Garbage Collection' } } + let(:gc_samples) { samples.select { |it| it.locations.first.path == 'Garbage Collection' } } before { sample } @@ -674,7 +674,7 @@ def stats it 'samples the thread with recorded gc start and finish time, marking it as being in Garbage Collection' do sample_after_gc - expect(object_id_from(gc_sample.fetch(:labels).fetch(:'thread id'))).to eq Thread.current.object_id + expect(object_id_from(gc_sample.labels.fetch(:'thread id'))).to eq Thread.current.object_id end it 'samples the thread with recorded gc start and finish time, recording the times between gc start and finish' do @@ -685,7 +685,7 @@ def stats sample_after_gc - expect(gc_sample.fetch(:values)).to include( + expect(gc_sample.values).to include( :"cpu-samples" => 1, :'cpu-time' => cpu_time_at_finish_ns - cpu_time_at_start_ns, :"wall-time" => wall_time_at_finish_ns - wall_time_at_start_ns, @@ -719,7 +719,7 @@ def stats sample_after_gc - wall_time_spent_in_gc = gc_sample.fetch(:values).fetch(:'wall-time') + wall_time_spent_in_gc = gc_sample.values.fetch(:'wall-time') expect(per_thread_context.fetch(Thread.current)).to include( wall_time_at_previous_sample_ns: wall_time_at_previous_sample_ns_before + wall_time_spent_in_gc @@ -750,7 +750,7 @@ def stats 
sample_after_gc - cpu_time_spent_in_gc = gc_sample.fetch(:values).fetch(:'cpu-time') + cpu_time_spent_in_gc = gc_sample.values.fetch(:'cpu-time') expect(per_thread_context.fetch(Thread.current)).to include( cpu_time_at_previous_sample_ns: cpu_time_at_previous_sample_ns_before + cpu_time_spent_in_gc diff --git a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb index 1b85376412f..dd2961eed3f 100644 --- a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb +++ b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb @@ -150,7 +150,7 @@ cpu_and_wall_time_worker.stop sample_count = - samples_for_thread(all_samples, Thread.current).map { |it| it.fetch(:values).fetch(:'cpu-samples') }.reduce(:+) + samples_for_thread(all_samples, Thread.current).map { |it| it.values.fetch(:'cpu-samples') }.reduce(:+) stats = cpu_and_wall_time_worker.stats @@ -210,14 +210,14 @@ current_thread_gc_samples = samples_for_thread(all_samples, Thread.current) - .select { |it| it.fetch(:locations).first.fetch(:path) == 'Garbage Collection' } + .select { |it| it.locations.first.path == 'Garbage Collection' } # NOTE: In some cases, Ruby may actually call two GC's back-to-back without us having the possibility to take # a sample. I don't expect this to happen for this test (that's what the `Thread.pass` above is trying to avoid) # but if this spec turns out to be flaky, that is probably the issue, and that would mean we'd need to relax the # check. 
expect( - current_thread_gc_samples.inject(0) { |sum, sample| sum + sample.fetch(:values).fetch(:'cpu-samples') } + current_thread_gc_samples.inject(0) { |sum, sample| sum + sample.values.fetch(:'cpu-samples') } ).to be >= invoke_gc_times end @@ -295,7 +295,7 @@ background_thread.kill result = samples_for_thread(samples_from_pprof_without_gc(recorder.serialize!), Thread.current) - sample_count = result.map { |it| it.fetch(:values).fetch(:'cpu-samples') }.reduce(:+) + sample_count = result.map { |it| it.values.fetch(:'cpu-samples') }.reduce(:+) stats = cpu_and_wall_time_worker.stats @@ -324,7 +324,7 @@ cpu_and_wall_time_worker.stop result = samples_for_thread(samples_from_pprof_without_gc(recorder.serialize!), Thread.current) - sample_count = result.map { |it| it.fetch(:values).fetch(:'cpu-samples') }.reduce(:+) + sample_count = result.map { |it| it.values.fetch(:'cpu-samples') }.reduce(:+) stats = cpu_and_wall_time_worker.stats debug_failures = { thread_list: Thread.list, result: result } @@ -542,6 +542,6 @@ def wait_until_running # # We have separate specs that assert on the GC behaviors. 
def samples_from_pprof_without_gc(pprof_data) - samples_from_pprof(pprof_data).reject { |it| it.fetch(:locations).first.fetch(:path) == 'Garbage Collection' } + samples_from_pprof(pprof_data).reject { |it| it.locations.first.path == 'Garbage Collection' } end end diff --git a/spec/datadog/profiling/collectors/stack_spec.rb b/spec/datadog/profiling/collectors/stack_spec.rb index aed64a1b0cc..65b0aa0cab5 100644 --- a/spec/datadog/profiling/collectors/stack_spec.rb +++ b/spec/datadog/profiling/collectors/stack_spec.rb @@ -33,7 +33,7 @@ def sample(thread, recorder_instance, metric_values_hash, labels_array, max_fram let(:reference_stack) do # To make the stacks comparable we slice off the actual Ruby `Thread#backtrace_locations` frame since that part # will necessarily be different - expect(super().first).to match(hash_including(base_label: 'backtrace_locations')) + expect(super().first.base_label).to eq 'backtrace_locations' super()[1..-1] end @@ -42,9 +42,9 @@ def sample(thread, recorder_instance, metric_values_hash, labels_array, max_fram # also necessarily be different expect(super()[0..2]).to match( [ - hash_including(base_label: '_native_sample'), - hash_including(base_label: 'sample'), - hash_including(base_label: 'sample_and_decode'), + have_attributes(base_label: '_native_sample'), + have_attributes(base_label: 'sample'), + have_attributes(base_label: 'sample_and_decode'), ] ) super()[3..-1] @@ -62,7 +62,7 @@ def sample(thread, recorder_instance, metric_values_hash, labels_array, max_fram let(:in_gc) { true } it 'includes a placeholder frame for garbage collection' do - expect(stacks.fetch(:gathered)[0]).to eq({ base_label: '', path: 'Garbage Collection', lineno: 0 }) + expect(stacks.fetch(:gathered)[0]).to have_attributes(base_label: '', path: 'Garbage Collection', lineno: 0) end it 'matches the Ruby backtrace API' do @@ -104,7 +104,7 @@ def sample(thread, recorder_instance, metric_values_hash, labels_array, max_fram end it 'has a sleeping frame at the top of 
the stack' do - expect(reference_stack.first).to match(hash_including(base_label: 'sleep')) + expect(reference_stack.first.base_label).to eq 'sleep' end end @@ -127,9 +127,9 @@ def sample(thread, recorder_instance, metric_values_hash, labels_array, max_fram it 'has eval frames on the stack' do expect(reference_stack[0..2]).to contain_exactly( - hash_including(base_label: 'sleep', path: '(eval)'), - hash_including(base_label: '', path: '(eval)'), - hash_including(base_label: 'eval', path: end_with('stack_spec.rb')), + have_attributes(base_label: 'sleep', path: '(eval)'), + have_attributes(base_label: '', path: '(eval)'), + have_attributes(base_label: 'eval', path: end_with('stack_spec.rb')), ) end end @@ -171,8 +171,8 @@ def call_sleep # These two frames are the frames that get created with the evaluation of the string, e.g. if instead of # `eval("foo")` we did `eval { foo }` then it is the block containing foo; eval with a string works similarly, # although you don't see a block there. - hash_including(base_label: 'call_eval', path: '(eval)', lineno: 1), - hash_including(base_label: 'call_instance_eval', path: '(eval)', lineno: 1), + have_attributes(base_label: 'call_eval', path: '(eval)', lineno: 1), + have_attributes(base_label: 'call_instance_eval', path: '(eval)', lineno: 1), ) end end @@ -190,7 +190,7 @@ def call_sleep it 'has a frame with the custom file and line provided on the stack' do expect(reference_stack).to include( - hash_including(path: '/this/is/a/fake_file_.rb', lineno: -123456789), + have_attributes(path: '/this/is/a/fake_file_.rb', lineno: -123456789), ) end end @@ -213,7 +213,7 @@ def call_sleep # I opted to join these two expects to avoid running the `load` above more than once it 'matches the Ruby backtrace API AND has a sleeping frame at the top of the stack' do expect(gathered_stack).to eq reference_stack - expect(reference_stack.first).to match(hash_including(base_label: 'sleep')) + expect(reference_stack.first.base_label).to eq 'sleep' 
end end end @@ -244,8 +244,7 @@ def call_sleep omitted_frames = target_stack_depth - max_frames + placeholder expect(omitted_frames).to be 96 - expect(gathered_stack.last) - .to match(hash_including({ base_label: '', path: '96 frames omitted', lineno: 0 })) + expect(gathered_stack.last).to have_attributes(base_label: '', path: '96 frames omitted', lineno: 0) end context 'when stack is exactly 1 item deeper than the configured max_frames' do @@ -254,8 +253,7 @@ def call_sleep it 'includes a placeholder frame stating that 2 frames were omitted' do # Why 2 frames omitted and not 1? That's because the placeholder takes over 1 space in the buffer, so # if there were 6 frames on the stack and the limit is 5, then 4 of those frames will be present in the output - expect(gathered_stack.last) - .to match(hash_including({ base_label: '', path: '2 frames omitted', lineno: 0 })) + expect(gathered_stack.last).to have_attributes(base_label: '', path: '2 frames omitted', lineno: 0) end end @@ -285,9 +283,8 @@ def call_sleep omitted_frames = target_stack_depth - max_frames + placeholder + garbage_collection expect(omitted_frames).to be 97 - expect(gathered_stack.last) - .to match(hash_including({ base_label: '', path: '97 frames omitted', lineno: 0 })) - expect(gathered_stack.first).to match(hash_including(base_label: '', path: 'Garbage Collection', lineno: 0)) + expect(gathered_stack.last).to have_attributes(base_label: '', path: '97 frames omitted', lineno: 0) + expect(gathered_stack.first).to have_attributes(base_label: '', path: 'Garbage Collection', lineno: 0) end context 'when stack is exactly one item less as deep as the configured max_frames' do @@ -318,7 +315,7 @@ def call_sleep # @ivoanjo: I... don't think this can happen in practice. It's debatable if we should still have the placeholder # frame or not, but for ease of implementation I chose this path, and I added this spec just to get coverage on # this corner case. 
- expect(gathered_stack).to contain_exactly({ base_label: '', path: 'Garbage Collection', lineno: 0 }) + expect(gathered_stack).to contain_exactly(have_attributes(base_label: '', path: 'Garbage Collection', lineno: 0)) end end end @@ -370,7 +367,7 @@ def call_sleep end it 'gathers a one-element stack with a "In native code" placeholder' do - expect(gathered_stack).to contain_exactly({ base_label: '', path: 'In native code', lineno: 0 }) + expect(gathered_stack).to contain_exactly(have_attributes(base_label: '', path: 'In native code', lineno: 0)) end context 'when marking sample as being in garbage collection' do @@ -378,8 +375,8 @@ def call_sleep it 'gathers a two-element stack with a placeholder for "In native code" and another for garbage collection' do expect(gathered_stack).to contain_exactly( - { base_label: '', path: 'Garbage Collection', lineno: 0 }, - { base_label: '', path: 'In native code', lineno: 0 } + have_attributes(base_label: '', path: 'Garbage Collection', lineno: 0), + have_attributes(base_label: '', path: 'In native code', lineno: 0), ) end end @@ -411,7 +408,7 @@ def call_sleep def convert_reference_stack(raw_reference_stack) raw_reference_stack.map do |location| - { base_label: location.base_label, path: location.path, lineno: location.lineno } + ProfileHelpers::Frame.new(location.base_label, location.path, location.lineno).freeze end end @@ -421,7 +418,7 @@ def sample_and_decode(thread, max_frames: 400, recorder: Datadog::Profiling::Sta samples = samples_from_pprof(recorder.serialize!) 
expect(samples.size).to be 1 - samples.first.fetch(:locations) + samples.first.locations end end diff --git a/spec/datadog/profiling/spec_helper.rb b/spec/datadog/profiling/spec_helper.rb index d2bfea17863..faf561260e4 100644 --- a/spec/datadog/profiling/spec_helper.rb +++ b/spec/datadog/profiling/spec_helper.rb @@ -5,6 +5,9 @@ module ProfileHelpers include Kernel + Sample = Struct.new(:locations, :values, :labels) + Frame = Struct.new(:base_label, :path, :lineno) + def build_stack_sample( locations: nil, thread_id: nil, @@ -59,16 +62,16 @@ def samples_from_pprof(pprof_data) pretty_sample_types = decoded_profile.sample_type.map { |it| string_table[it.type].to_sym } decoded_profile.sample.map do |sample| - { - locations: sample.location_id.map { |location_id| decode_frame_from_pprof(decoded_profile, location_id) }, - values: pretty_sample_types.zip(sample.value).to_h, - labels: sample.label.map do |it| + Sample.new( + sample.location_id.map { |location_id| decode_frame_from_pprof(decoded_profile, location_id) }, + pretty_sample_types.zip(sample.value).to_h, + sample.label.map do |it| [ string_table[it.key].to_sym, it.num == 0 ? 
string_table[it.str] : it.num, ] end.to_h, - } + ).freeze end end @@ -80,7 +83,7 @@ def decode_frame_from_pprof(decoded_profile, location_id) line_entry = location.line.first function = decoded_profile.function.find { |func| func.id == line_entry.function_id } - { base_label: strings[function.name], path: strings[function.filename], lineno: line_entry.line } + Frame.new(strings[function.name], strings[function.filename], line_entry.line).freeze end def object_id_from(thread_id) @@ -88,7 +91,7 @@ def object_id_from(thread_id) end def samples_for_thread(samples, thread) - samples.select { |sample| object_id_from(sample.fetch(:labels).fetch(:'thread id')) == thread.object_id } + samples.select { |sample| object_id_from(sample.labels.fetch(:'thread id')) == thread.object_id } end end diff --git a/spec/datadog/profiling/stack_recorder_spec.rb b/spec/datadog/profiling/stack_recorder_spec.rb index 131840f5c4d..053729649d9 100644 --- a/spec/datadog/profiling/stack_recorder_spec.rb +++ b/spec/datadog/profiling/stack_recorder_spec.rb @@ -149,11 +149,11 @@ def sample_types_from(decoded_profile) end it 'encodes the sample with the metrics provided' do - expect(samples.first).to include(values: { :'cpu-time' => 123, :'cpu-samples' => 456, :'wall-time' => 789 }) + expect(samples.first.values).to eq(:'cpu-time' => 123, :'cpu-samples' => 456, :'wall-time' => 789) end it 'encodes the sample with the labels provided' do - expect(samples.first).to include(labels: { label_a: 'value_a', label_b: 'value_b' }) + expect(samples.first.labels).to eq(label_a: 'value_a', label_b: 'value_b') end it 'encodes a single empty mapping' do From 9f76677be7064a442f53673bffb25275fe6895f3 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Wed, 8 Feb 2023 13:43:51 +0000 Subject: [PATCH 2/8] Extract metrics update during sample into a separate function This will enable it to be reused when we want to add the profiler overhead as a stack. 
--- .../collectors_cpu_and_wall_time.c | 69 ++++++++++++------- 1 file changed, 43 insertions(+), 26 deletions(-) diff --git a/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.c b/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.c index 2f95f1fecf6..0e89f29f180 100644 --- a/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.c +++ b/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.c @@ -143,6 +143,13 @@ static VALUE _native_sample(VALUE self, VALUE collector_instance); static VALUE _native_on_gc_start(VALUE self, VALUE collector_instance); static VALUE _native_on_gc_finish(VALUE self, VALUE collector_instance); static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance); +void update_metrics_and_sample( + struct cpu_and_wall_time_collector_state *state, + VALUE thread_being_sampled, + struct per_thread_context *thread_context, + long current_cpu_time_ns, + long current_monotonic_wall_time_ns +); static void trigger_sample_for_thread( struct cpu_and_wall_time_collector_state *state, VALUE thread, @@ -343,32 +350,7 @@ void cpu_and_wall_time_collector_sample(VALUE self_instance, long current_monoto long current_cpu_time_ns = cpu_time_now_ns(thread_context); - long cpu_time_elapsed_ns = update_time_since_previous_sample( - &thread_context->cpu_time_at_previous_sample_ns, - current_cpu_time_ns, - thread_context->gc_tracking.cpu_time_at_start_ns, - IS_NOT_WALL_TIME - ); - long wall_time_elapsed_ns = update_time_since_previous_sample( - &thread_context->wall_time_at_previous_sample_ns, - current_monotonic_wall_time_ns, - thread_context->gc_tracking.wall_time_at_start_ns, - IS_WALL_TIME - ); - - int64_t metric_values[ENABLED_VALUE_TYPES_COUNT] = {0}; - - metric_values[CPU_TIME_VALUE_POS] = cpu_time_elapsed_ns; - metric_values[CPU_SAMPLES_VALUE_POS] = 1; - metric_values[WALL_TIME_VALUE_POS] = wall_time_elapsed_ns; - - trigger_sample_for_thread( - state, - thread, - 
thread_context, - (ddog_Slice_I64) {.ptr = metric_values, .len = ENABLED_VALUE_TYPES_COUNT}, - SAMPLE_REGULAR - ); + update_metrics_and_sample(state, thread, thread_context, current_cpu_time_ns, current_monotonic_wall_time_ns); } state->sample_count++; @@ -378,6 +360,41 @@ void cpu_and_wall_time_collector_sample(VALUE self_instance, long current_monoto if (state->sample_count % 100 == 0) remove_context_for_dead_threads(state); } +void update_metrics_and_sample( + struct cpu_and_wall_time_collector_state *state, + VALUE thread_being_sampled, + struct per_thread_context *thread_context, + long current_cpu_time_ns, + long current_monotonic_wall_time_ns +) { + long cpu_time_elapsed_ns = update_time_since_previous_sample( + &thread_context->cpu_time_at_previous_sample_ns, + current_cpu_time_ns, + thread_context->gc_tracking.cpu_time_at_start_ns, + IS_NOT_WALL_TIME + ); + long wall_time_elapsed_ns = update_time_since_previous_sample( + &thread_context->wall_time_at_previous_sample_ns, + current_monotonic_wall_time_ns, + thread_context->gc_tracking.wall_time_at_start_ns, + IS_WALL_TIME + ); + + int64_t metric_values[ENABLED_VALUE_TYPES_COUNT] = {0}; + + metric_values[CPU_TIME_VALUE_POS] = cpu_time_elapsed_ns; + metric_values[CPU_SAMPLES_VALUE_POS] = 1; + metric_values[WALL_TIME_VALUE_POS] = wall_time_elapsed_ns; + + trigger_sample_for_thread( + state, + thread_being_sampled, + thread_context, + (ddog_Slice_I64) {.ptr = metric_values, .len = ENABLED_VALUE_TYPES_COUNT}, + SAMPLE_REGULAR + ); +} + // This function gets called when Ruby is about to start running the Garbage Collector on the current thread. // It updates the per_thread_context of the current thread to include the current cpu/wall times, to be used to later // create a stack sample that blames the cpu/wall time spent from now until the end of the garbage collector work. 
From 416592ba1ab9786c9379e803b811992c5f05a4e1 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Wed, 8 Feb 2023 15:02:25 +0000 Subject: [PATCH 3/8] [PROF-6558] Show profiler overhead in flamegraph for CPU Profiling 2.0 **What does this PR do?**: This PR changes the cpu-time/wall-time sampling in the CPU Profiling 2.0 profiler to expose the profiler overhead as part of the captured data. Specifically, during sampling, the profiler now measures how much cpu-time/wall-time is spent by the profiler itself, and adds that to the profiler as a separate stack (that shows up as belonging to dd-trace-rb). **Motivation**: In the old Ruby profiler, the profiler always worked from a background thread. Because we also sampled that background thread, the flamegraph included the overhead of the profiler -- it was the overhead of the background thread. With CPU Profiling 2.0, sampling is done in whatever thread is holding the Global VM Lock, which means that this part of the profiler overhead was not being exposed. With this change, this impact on code is now accounted for. **Additional Notes**: I was torn between a few options, and decided to use the stack of the `CpuAndWallTimeWorker` thread to represent the overhead of the profiler. Thus, when this stack shows up in another thread, it represents the overhead of the profiler on that thread. I could have gone with a placeholder (e.g. just a frame that says "Profiler Sampling" or something similar), but such placeholders break the "Only My Code" feature. Using a stack from a different thread than the one we're supposedly "sampling" adds a bit of complexity; but we can always undo this and go with the placeholder if this becomes problematic. **How to test the change?**: Change includes code coverage.
--- benchmarks/profiler_sample_loop_v2.rb | 8 ++- .../collectors_cpu_and_wall_time.c | 51 +++++++++++--- .../collectors_cpu_and_wall_time.h | 6 +- .../collectors_cpu_and_wall_time_worker.c | 3 +- .../collectors/cpu_and_wall_time_spec.rb | 67 +++++++++++++++++-- 5 files changed, 115 insertions(+), 20 deletions(-) diff --git a/benchmarks/profiler_sample_loop_v2.rb b/benchmarks/profiler_sample_loop_v2.rb index 0ed11b59c68..0292892804a 100644 --- a/benchmarks/profiler_sample_loop_v2.rb +++ b/benchmarks/profiler_sample_loop_v2.rb @@ -13,6 +13,10 @@ # This benchmark measures the performance of the main stack sampling loop of the profiler class ProfilerSampleLoopBenchmark + # This is needed because we're directly invoking the CpuAndWallTime collector through a testing interface; in normal + # use a profiler thread is automatically used. + PROFILER_OVERHEAD_STACK_THREAD = Thread.new { sleep } + def create_profiler @recorder = Datadog::Profiling::StackRecorder.new @collector = Datadog::Profiling::Collectors::CpuAndWallTime.new(recorder: @recorder, max_frames: 400, tracer: nil) @@ -36,7 +40,7 @@ def run_benchmark x.config(**benchmark_time, suite: report_to_dogstatsd_if_enabled_via_environment_variable(benchmark_name: 'profiler_sample_loop_v2')) x.report("stack collector #{ENV['CONFIG']}") do - Datadog::Profiling::Collectors::CpuAndWallTime::Testing._native_sample(@collector) + Datadog::Profiling::Collectors::CpuAndWallTime::Testing._native_sample(@collector, PROFILER_OVERHEAD_STACK_THREAD) end x.save! 'profiler-sample-loop-v2-results.json' unless VALIDATE_BENCHMARK_MODE @@ -48,7 +52,7 @@ def run_benchmark def run_forever while true - 1000.times { Datadog::Profiling::Collectors::CpuAndWallTime::Testing._native_sample(@collector) } + 1000.times { Datadog::Profiling::Collectors::CpuAndWallTime::Testing._native_sample(@collector, PROFILER_OVERHEAD_STACK_THREAD) } @recorder.serialize print '.' 
end diff --git a/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.c b/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.c index 0e89f29f180..f427291cfe6 100644 --- a/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.c +++ b/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.c @@ -139,13 +139,14 @@ static int hash_map_per_thread_context_mark(st_data_t key_thread, st_data_t _val static int hash_map_per_thread_context_free_values(st_data_t _thread, st_data_t value_per_thread_context, st_data_t _argument); static VALUE _native_new(VALUE klass); static VALUE _native_initialize(VALUE self, VALUE collector_instance, VALUE recorder_instance, VALUE max_frames, VALUE tracer_context_key); -static VALUE _native_sample(VALUE self, VALUE collector_instance); +static VALUE _native_sample(VALUE self, VALUE collector_instance, VALUE profiler_overhead_stack_thread); static VALUE _native_on_gc_start(VALUE self, VALUE collector_instance); static VALUE _native_on_gc_finish(VALUE self, VALUE collector_instance); static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance); void update_metrics_and_sample( struct cpu_and_wall_time_collector_state *state, VALUE thread_being_sampled, + VALUE profiler_overhead_stack_thread, struct per_thread_context *thread_context, long current_cpu_time_ns, long current_monotonic_wall_time_ns @@ -153,6 +154,7 @@ void update_metrics_and_sample( static void trigger_sample_for_thread( struct cpu_and_wall_time_collector_state *state, VALUE thread, + VALUE stack_from_thread, struct per_thread_context *thread_context, ddog_Slice_I64 metric_values_slice, sample_type type @@ -195,7 +197,7 @@ void collectors_cpu_and_wall_time_init(VALUE profiling_module) { rb_define_singleton_method(collectors_cpu_and_wall_time_class, "_native_initialize", _native_initialize, 4); rb_define_singleton_method(collectors_cpu_and_wall_time_class, "_native_inspect", _native_inspect, 1); 
rb_define_singleton_method(collectors_cpu_and_wall_time_class, "_native_reset_after_fork", _native_reset_after_fork, 1); - rb_define_singleton_method(testing_module, "_native_sample", _native_sample, 1); + rb_define_singleton_method(testing_module, "_native_sample", _native_sample, 2); rb_define_singleton_method(testing_module, "_native_on_gc_start", _native_on_gc_start, 1); rb_define_singleton_method(testing_module, "_native_on_gc_finish", _native_on_gc_finish, 1); rb_define_singleton_method(testing_module, "_native_sample_after_gc", _native_sample_after_gc, 1); @@ -304,8 +306,10 @@ static VALUE _native_initialize(DDTRACE_UNUSED VALUE _self, VALUE collector_inst // This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTime behavior using RSpec. // It SHOULD NOT be used for other purposes. -static VALUE _native_sample(DDTRACE_UNUSED VALUE _self, VALUE collector_instance) { - cpu_and_wall_time_collector_sample(collector_instance, monotonic_wall_time_now_ns(RAISE_ON_FAILURE)); +static VALUE _native_sample(DDTRACE_UNUSED VALUE _self, VALUE collector_instance, VALUE profiler_overhead_stack_thread) { + if (!is_thread_alive(profiler_overhead_stack_thread)) rb_raise(rb_eArgError, "Unexpected: profiler_overhead_stack_thread is not alive"); + + cpu_and_wall_time_collector_sample(collector_instance, monotonic_wall_time_now_ns(RAISE_ON_FAILURE), profiler_overhead_stack_thread); return Qtrue; } @@ -337,10 +341,17 @@ static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_ // Assumption 3: This function IS NOT called from a signal handler. This function is not async-signal-safe. // Assumption 4: This function IS NOT called in a reentrant way. // Assumption 5: This function is called from the main Ractor (if Ruby has support for Ractors). 
-void cpu_and_wall_time_collector_sample(VALUE self_instance, long current_monotonic_wall_time_ns) { +// +// The `profiler_overhead_stack_thread` is used to attribute the profiler overhead to a stack borrowed from a different thread +// (belonging to ddtrace), so that the overhead is visible in the profile rather than blamed on user code. +void cpu_and_wall_time_collector_sample(VALUE self_instance, long current_monotonic_wall_time_ns, VALUE profiler_overhead_stack_thread) { struct cpu_and_wall_time_collector_state *state; TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_collector_state, &cpu_and_wall_time_collector_typed_data, state); + VALUE current_thread = rb_thread_current(); + struct per_thread_context *current_thread_context = get_or_create_context_for(current_thread, state); + long cpu_time_at_sample_start_for_current_thread = cpu_time_now_ns(current_thread_context); + VALUE threads = ddtrace_thread_list(); const long thread_count = RARRAY_LEN(threads); @@ -348,9 +359,18 @@ void cpu_and_wall_time_collector_sample(VALUE self_instance, long current_monoto VALUE thread = RARRAY_AREF(threads, i); struct per_thread_context *thread_context = get_or_create_context_for(thread, state); - long current_cpu_time_ns = cpu_time_now_ns(thread_context); + // We account for cpu-time for the current thread in a different way -- we use the cpu-time at sampling start, to avoid + // blaming the time the profiler took on whatever's running on the thread right now + long current_cpu_time_ns = thread != current_thread ? 
cpu_time_now_ns(thread_context) : cpu_time_at_sample_start_for_current_thread; - update_metrics_and_sample(state, thread, thread_context, current_cpu_time_ns, current_monotonic_wall_time_ns); + update_metrics_and_sample( + state, + /* thread_being_sampled: */ thread, + /* stack_from_thread: */ thread, + thread_context, + current_cpu_time_ns, + current_monotonic_wall_time_ns + ); } state->sample_count++; @@ -358,11 +378,21 @@ void cpu_and_wall_time_collector_sample(VALUE self_instance, long current_monoto // TODO: This seems somewhat overkill and inefficient to do often; right now we just do it every few samples // but there's probably a better way to do this if we actually track when threads finish if (state->sample_count % 100 == 0) remove_context_for_dead_threads(state); + + update_metrics_and_sample( + state, + /* thread_being_sampled: */ current_thread, + /* stack_from_thread: */ profiler_overhead_stack_thread, + current_thread_context, + cpu_time_now_ns(current_thread_context), + monotonic_wall_time_now_ns(RAISE_ON_FAILURE) + ); } void update_metrics_and_sample( struct cpu_and_wall_time_collector_state *state, VALUE thread_being_sampled, + VALUE stack_from_thread, // This can be different when attributing profiler overhead using a different stack struct per_thread_context *thread_context, long current_cpu_time_ns, long current_monotonic_wall_time_ns @@ -389,6 +419,7 @@ void update_metrics_and_sample( trigger_sample_for_thread( state, thread_being_sampled, + stack_from_thread, thread_context, (ddog_Slice_I64) {.ptr = metric_values, .len = ENABLED_VALUE_TYPES_COUNT}, SAMPLE_REGULAR @@ -535,7 +566,8 @@ VALUE cpu_and_wall_time_collector_sample_after_gc(VALUE self_instance) { trigger_sample_for_thread( state, - thread, + /* thread: */ thread, + /* stack_from_thread: */ thread, thread_context, (ddog_Slice_I64) {.ptr = metric_values, .len = ENABLED_VALUE_TYPES_COUNT}, SAMPLE_IN_GC @@ -565,6 +597,7 @@ VALUE cpu_and_wall_time_collector_sample_after_gc(VALUE 
self_instance) { static void trigger_sample_for_thread( struct cpu_and_wall_time_collector_state *state, VALUE thread, + VALUE stack_from_thread, // This can be different when attributing profiler overhead using a different stack struct per_thread_context *thread_context, ddog_Slice_I64 metric_values_slice, sample_type type @@ -623,7 +656,7 @@ static void trigger_sample_for_thread( } sample_thread( - thread, + stack_from_thread, state->sampling_buffer, state->recorder_instance, metric_values_slice, diff --git a/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.h b/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.h index 609a3d4757f..688d8389987 100644 --- a/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.h +++ b/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.h @@ -2,7 +2,11 @@ #include -void cpu_and_wall_time_collector_sample(VALUE self_instance, long current_monotonic_wall_time_ns); +void cpu_and_wall_time_collector_sample( + VALUE self_instance, + long current_monotonic_wall_time_ns, + VALUE profiler_overhead_stack_thread +); VALUE cpu_and_wall_time_collector_sample_after_gc(VALUE self_instance); void cpu_and_wall_time_collector_on_gc_start(VALUE self_instance); void cpu_and_wall_time_collector_on_gc_finish(VALUE self_instance); diff --git a/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c b/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c index 422bd376486..a5d56a67e02 100644 --- a/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c +++ b/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c @@ -481,7 +481,8 @@ static VALUE rescued_sample_from_postponed_job(VALUE self_instance) { state->stats.sampled++; - cpu_and_wall_time_collector_sample(state->cpu_and_wall_time_collector_instance, wall_time_ns_before_sample); + VALUE profiler_overhead_stack_thread = state->owner_thread; // Used to 
attribute profiler overhead to a different stack + cpu_and_wall_time_collector_sample(state->cpu_and_wall_time_collector_instance, wall_time_ns_before_sample, profiler_overhead_stack_thread); long wall_time_ns_after_sample = monotonic_wall_time_now_ns(RAISE_ON_FAILURE); long delta_ns = wall_time_ns_after_sample - wall_time_ns_before_sample; diff --git a/spec/datadog/profiling/collectors/cpu_and_wall_time_spec.rb b/spec/datadog/profiling/collectors/cpu_and_wall_time_spec.rb index 0d7bc081818..14c5f5859c9 100644 --- a/spec/datadog/profiling/collectors/cpu_and_wall_time_spec.rb +++ b/spec/datadog/profiling/collectors/cpu_and_wall_time_spec.rb @@ -15,8 +15,10 @@ let(:ready_queue) { Queue.new } let(:t1) do Thread.new(ready_queue) do |ready_queue| - ready_queue << true - sleep + inside_t1 do + ready_queue << true + sleep + end end end let(:t2) do @@ -47,8 +49,8 @@ end end - def sample - described_class::Testing._native_sample(cpu_and_wall_time_collector) + def sample(profiler_overhead_stack_thread: Thread.current) + described_class::Testing._native_sample(cpu_and_wall_time_collector, profiler_overhead_stack_thread) end def on_gc_start @@ -75,6 +77,15 @@ def stats described_class::Testing._native_stats(cpu_and_wall_time_collector) end + def inside_t1 + yield + end + + # This method exists only so we can look for its name in the stack trace in a few tests + def another_way_of_calling_sample(profiler_overhead_stack_thread: Thread.current) + sample(profiler_overhead_stack_thread: profiler_overhead_stack_thread) + end + describe '#sample' do it 'samples all threads' do all_threads = Thread.list @@ -148,10 +159,12 @@ def stats wall_time_at_gc_start = per_thread_context.fetch(Thread.current).fetch(:'gc_tracking.wall_time_at_start_ns') - 5.times { sample } # Even though we keep sampling, the result only includes the time until we called on_gc_start + # Even though we keep calling sample, the result only includes the time until we called on_gc_start + 5.times { 
another_way_of_calling_sample } total_wall_for_rspec_thread = samples_for_thread(samples, Thread.current) + .select { |it| it.locations.find { |frame| frame.base_label == 'another_way_of_calling_sample' } } .map { |it| it.values.fetch(:'wall-time') } .reduce(:+) @@ -227,11 +240,12 @@ def stats cpu_time_at_gc_start = per_thread_context.fetch(Thread.current).fetch(:'gc_tracking.cpu_time_at_start_ns') - # Even though we keep sampling, the result only includes the time until we called on_gc_start - 5.times { sample } + # Even though we keep calling sample, the result only includes the time until we called on_gc_start + 5.times { another_way_of_calling_sample } total_cpu_for_rspec_thread = samples_for_thread(samples, Thread.current) + .select { |it| it.locations.find { |frame| frame.base_label == 'another_way_of_calling_sample' } } .map { |it| it.values.fetch(:'cpu-time') } .reduce(:+) @@ -455,6 +469,45 @@ def stats end end end + + # This is a bit weird, but what we're doing here is using the stack from a different thread to represent the + # profiler overhead. In practice, the "different thread" will be the Collectors::CpuAndWallTimeWorker thread. + # + # Thus, what happens is, when we sample _once_, two samples will show up for the thread **that calls sample**: + # * The regular stack + # * The stack from the other thread + # + # E.g. if 1s elapsed since the last sample, and sampling takes 500ms: + # * The regular stack will have 1s attributed to it + # * The stack from the other thread will have 500ms attributed to it. + # + # This way it's clear what overhead comes from profiling. Without this feature (aka if profiler_overhead_stack_thread + # is set to Thread.current), then all 1.5s get attributed to the current stack, and the profiler overhead would be + # invisible. 
+ it 'attributes the time sampling to the stack of the worker_thread_to_blame' do + sample + wall_time_at_first_sample = per_thread_context.fetch(Thread.current).fetch(:wall_time_at_previous_sample_ns) + + another_way_of_calling_sample(profiler_overhead_stack_thread: t1) + wall_time_at_second_sample = per_thread_context.fetch(Thread.current).fetch(:wall_time_at_previous_sample_ns) + + second_sample_stack = + samples_for_thread(samples, Thread.current) + .select { |it| it.locations.find { |frame| frame.base_label == 'another_way_of_calling_sample' } } + + # The stack from the profiler_overhead_stack_thread (t1) above has showed up attributed to Thread.current, as we + # are using it to represent the profiler overhead. + profiler_overhead_stack = + samples_for_thread(samples, Thread.current) + .select { |it| it.locations.find { |frame| frame.base_label == 'inside_t1' } } + + expect(second_sample_stack.size).to be 1 + expect(profiler_overhead_stack.size).to be 1 + + expect( + second_sample_stack.first.values.fetch(:'wall-time') + profiler_overhead_stack.first.values.fetch(:'wall-time') + ).to be wall_time_at_second_sample - wall_time_at_first_sample + end end describe '#on_gc_start' do From 2880f59e6d3fde9a58501f8cac92e6f275e41680 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Wed, 8 Feb 2023 15:14:19 +0000 Subject: [PATCH 4/8] Silence Rubocop warning --- spec/datadog/profiling/spec_helper.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec/datadog/profiling/spec_helper.rb b/spec/datadog/profiling/spec_helper.rb index faf561260e4..2bae601930a 100644 --- a/spec/datadog/profiling/spec_helper.rb +++ b/spec/datadog/profiling/spec_helper.rb @@ -5,7 +5,7 @@ module ProfileHelpers include Kernel - Sample = Struct.new(:locations, :values, :labels) + Sample = Struct.new(:locations, :values, :labels) # rubocop:disable Lint/StructNewOverride Frame = Struct.new(:base_label, :path, :lineno) def build_stack_sample( From 0816723c6f60ccc0255282ecbb7ec157fb8c3df6 
Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Wed, 8 Feb 2023 15:45:55 +0000 Subject: [PATCH 5/8] Fix broken spec after previous refactoring Ooops! I missed updating this spec since it doesn't run on all Rubies. --- .../profiling/collectors/cpu_and_wall_time_worker_spec.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb index dd2961eed3f..45fcb2c135c 100644 --- a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb +++ b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb @@ -364,7 +364,7 @@ samples_from_ractor = samples_from_pprof(recorder.serialize!) - .select { |it| it.fetch(:labels)[:'thread name'] == 'background ractor' } + .select { |it| it.labels[:'thread name'] == 'background ractor' } expect(samples_from_ractor).to be_empty end From 89e72e15dbef28eb5256060a02e112c5ce790a9d Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Wed, 8 Feb 2023 15:57:37 +0000 Subject: [PATCH 6/8] Fix flaky spec caused by the new "show profiler overhead" feature Because the new feature relies on adding extra samples to the flamegraph, the ```ruby expect(stats.fetch(:signal_handler_enqueued_sample)).to be >= sample_count ``` assertion was failing. 
--- .../profiling/collectors/cpu_and_wall_time_worker_spec.rb | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb index 45fcb2c135c..ddbdb2a8fc4 100644 --- a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb +++ b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb @@ -150,7 +150,10 @@ cpu_and_wall_time_worker.stop sample_count = - samples_for_thread(all_samples, Thread.current).map { |it| it.values.fetch(:'cpu-samples') }.reduce(:+) + samples_for_thread(all_samples, Thread.current) + .reject { |it| it.locations.find { |frame| frame.base_label == '_native_sampling_loop' } } # Filter out profiler + .map { |it| it.values.fetch(:'cpu-samples') } + .reduce(:+) stats = cpu_and_wall_time_worker.stats From 061304b21577ababbc9512c597ad8e2a6c4d028d Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Wed, 8 Feb 2023 16:37:38 +0000 Subject: [PATCH 7/8] Add label to samples representing the profiler overhead This will be useful when we want to process the profile and either focus on or disregard these samples. 
--- .../collectors_cpu_and_wall_time.c | 8 ++++++++ .../profiling/collectors/cpu_and_wall_time_spec.rb | 3 +++ .../profiling/collectors/cpu_and_wall_time_worker_spec.rb | 2 +- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.c b/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.c index f427291cfe6..33112e1dd39 100644 --- a/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.c +++ b/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.c @@ -605,6 +605,7 @@ static void trigger_sample_for_thread( int max_label_count = 1 + // thread id 1 + // thread name + 1 + // profiler overhead 2; // local root span id and span id ddog_prof_Label labels[max_label_count]; int label_pos = 0; @@ -647,6 +648,13 @@ static void trigger_sample_for_thread( } } + if (thread != stack_from_thread) { + labels[label_pos++] = (ddog_prof_Label) { + .key = DDOG_CHARSLICE_C("profiler overhead"), + .num = 1 + }; + } + // The number of times `label_pos++` shows up in this function needs to match `max_label_count`. To avoid "oops I // forgot to update max_label_count" in the future, we've also added this validation. 
// @ivoanjo: I wonder if C compilers are smart enough to statically prove when this check never triggers happens and diff --git a/spec/datadog/profiling/collectors/cpu_and_wall_time_spec.rb b/spec/datadog/profiling/collectors/cpu_and_wall_time_spec.rb index 14c5f5859c9..03862792d9f 100644 --- a/spec/datadog/profiling/collectors/cpu_and_wall_time_spec.rb +++ b/spec/datadog/profiling/collectors/cpu_and_wall_time_spec.rb @@ -507,6 +507,9 @@ def another_way_of_calling_sample(profiler_overhead_stack_thread: Thread.current expect( second_sample_stack.first.values.fetch(:'wall-time') + profiler_overhead_stack.first.values.fetch(:'wall-time') ).to be wall_time_at_second_sample - wall_time_at_first_sample + + expect(second_sample_stack.first.labels).to_not include(:'profiler overhead' => anything) + expect(profiler_overhead_stack.first.labels).to include(:'profiler overhead' => 1) end end diff --git a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb index ddbdb2a8fc4..a2ea7f700c5 100644 --- a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb +++ b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb @@ -540,7 +540,7 @@ def wait_until_running try_wait_until(backoff: 0.01) { described_class::Testing._native_is_running?(cpu_and_wall_time_worker) } end - # This is useful because in a bunch of tests above we want to assert on properties of the period sampling, and having + # This is useful because in a bunch of tests above we want to assert on properties of the samples, and having # a random GC in the middle of the spec contribute a sample can throw off the expected values and counts. # # We have separate specs that assert on the GC behaviors. 
From 59115a6ea4de00166d521f66f0046274ee06becd Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Wed, 8 Feb 2023 16:48:19 +0000 Subject: [PATCH 8/8] Filter out profiler overhead frames from CpuAndWallTimeWorker assertions The profiler overhead frames show up non-deterministically (because they show up in whatever thread has the Global VM Lock), so they can cause a lot of the specs to become flaky. Excluding them should be enough to avoid the issue. --- .../cpu_and_wall_time_worker_spec.rb | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb index a2ea7f700c5..f4763a41f4a 100644 --- a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb +++ b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb @@ -129,7 +129,7 @@ start all_samples = try_wait_until do - samples = samples_from_pprof_without_gc(recorder.serialize!) + samples = samples_from_pprof_without_gc_and_overhead(recorder.serialize!) samples if samples.any? end @@ -143,7 +143,7 @@ start all_samples = try_wait_until do - samples = samples_from_pprof_without_gc(recorder.serialize!) + samples = samples_from_pprof_without_gc_and_overhead(recorder.serialize!) samples if samples.any? end @@ -151,7 +151,6 @@ sample_count = samples_for_thread(all_samples, Thread.current) - .reject { |it| it.locations.find { |frame| frame.base_label == '_native_sampling_loop' } } # Filter out profiler .map { |it| it.values.fetch(:'cpu-samples') } .reduce(:+) @@ -166,7 +165,7 @@ start try_wait_until do - samples = samples_from_pprof_without_gc(recorder.serialize!) + samples = samples_from_pprof_without_gc_and_overhead(recorder.serialize!) samples if samples.any? 
end @@ -297,7 +296,7 @@ cpu_and_wall_time_worker.stop background_thread.kill - result = samples_for_thread(samples_from_pprof_without_gc(recorder.serialize!), Thread.current) + result = samples_for_thread(samples_from_pprof_without_gc_and_overhead(recorder.serialize!), Thread.current) sample_count = result.map { |it| it.values.fetch(:'cpu-samples') }.reduce(:+) stats = cpu_and_wall_time_worker.stats @@ -326,7 +325,7 @@ cpu_and_wall_time_worker.stop - result = samples_for_thread(samples_from_pprof_without_gc(recorder.serialize!), Thread.current) + result = samples_for_thread(samples_from_pprof_without_gc_and_overhead(recorder.serialize!), Thread.current) sample_count = result.map { |it| it.values.fetch(:'cpu-samples') }.reduce(:+) stats = cpu_and_wall_time_worker.stats @@ -540,11 +539,13 @@ def wait_until_running try_wait_until(backoff: 0.01) { described_class::Testing._native_is_running?(cpu_and_wall_time_worker) } end - # This is useful because in a bunch of tests above we want to assert on properties of the samples, and having - # a random GC in the middle of the spec contribute a sample can throw off the expected values and counts. + # This is useful because in a bunch of tests above we want to assert on properties of the samples, and having GC + # and profiler overhead samples is a source of randomness which causes flakiness in the assertions. # - # We have separate specs that assert on the GC behaviors. - def samples_from_pprof_without_gc(pprof_data) - samples_from_pprof(pprof_data).reject { |it| it.locations.first.path == 'Garbage Collection' } + # We have separate specs that assert on these behaviors. + def samples_from_pprof_without_gc_and_overhead(pprof_data) + samples_from_pprof(pprof_data) + .reject { |it| it.locations.first.path == 'Garbage Collection' } + .reject { |it| it.labels.include?(:'profiler overhead') } end end