Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[PROF-6558] Show profiler overhead in flamegraph for CPU Profiling 2.0 #2607

Merged
merged 9 commits into from
Feb 14, 2023
8 changes: 6 additions & 2 deletions benchmarks/profiler_sample_loop_v2.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
# This benchmark measures the performance of the main stack sampling loop of the profiler

class ProfilerSampleLoopBenchmark
# This is needed because we're directly invoking the CpuAndWallTime collector through a testing interface; in normal
# use a profiler thread is automatically used.
PROFILER_OVERHEAD_STACK_THREAD = Thread.new { sleep }

def create_profiler
@recorder = Datadog::Profiling::StackRecorder.new
@collector = Datadog::Profiling::Collectors::CpuAndWallTime.new(recorder: @recorder, max_frames: 400, tracer: nil)
Expand All @@ -36,7 +40,7 @@ def run_benchmark
x.config(**benchmark_time, suite: report_to_dogstatsd_if_enabled_via_environment_variable(benchmark_name: 'profiler_sample_loop_v2'))

x.report("stack collector #{ENV['CONFIG']}") do
Datadog::Profiling::Collectors::CpuAndWallTime::Testing._native_sample(@collector)
Datadog::Profiling::Collectors::CpuAndWallTime::Testing._native_sample(@collector, PROFILER_OVERHEAD_STACK_THREAD)
end

x.save! 'profiler-sample-loop-v2-results.json' unless VALIDATE_BENCHMARK_MODE
Expand All @@ -48,7 +52,7 @@ def run_benchmark

def run_forever
while true
1000.times { Datadog::Profiling::Collectors::CpuAndWallTime::Testing._native_sample(@collector) }
1000.times { Datadog::Profiling::Collectors::CpuAndWallTime::Testing._native_sample(@collector, PROFILER_OVERHEAD_STACK_THREAD) }
@recorder.serialize
print '.'
end
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -139,13 +139,22 @@ static int hash_map_per_thread_context_mark(st_data_t key_thread, st_data_t _val
static int hash_map_per_thread_context_free_values(st_data_t _thread, st_data_t value_per_thread_context, st_data_t _argument);
static VALUE _native_new(VALUE klass);
static VALUE _native_initialize(VALUE self, VALUE collector_instance, VALUE recorder_instance, VALUE max_frames, VALUE tracer_context_key);
static VALUE _native_sample(VALUE self, VALUE collector_instance);
static VALUE _native_sample(VALUE self, VALUE collector_instance, VALUE profiler_overhead_stack_thread);
static VALUE _native_on_gc_start(VALUE self, VALUE collector_instance);
static VALUE _native_on_gc_finish(VALUE self, VALUE collector_instance);
static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance);
void update_metrics_and_sample(
struct cpu_and_wall_time_collector_state *state,
VALUE thread_being_sampled,
VALUE profiler_overhead_stack_thread,
struct per_thread_context *thread_context,
long current_cpu_time_ns,
long current_monotonic_wall_time_ns
);
static void trigger_sample_for_thread(
struct cpu_and_wall_time_collector_state *state,
VALUE thread,
VALUE stack_from_thread,
struct per_thread_context *thread_context,
ddog_Slice_I64 metric_values_slice,
sample_type type
Expand Down Expand Up @@ -188,7 +197,7 @@ void collectors_cpu_and_wall_time_init(VALUE profiling_module) {
rb_define_singleton_method(collectors_cpu_and_wall_time_class, "_native_initialize", _native_initialize, 4);
rb_define_singleton_method(collectors_cpu_and_wall_time_class, "_native_inspect", _native_inspect, 1);
rb_define_singleton_method(collectors_cpu_and_wall_time_class, "_native_reset_after_fork", _native_reset_after_fork, 1);
rb_define_singleton_method(testing_module, "_native_sample", _native_sample, 1);
rb_define_singleton_method(testing_module, "_native_sample", _native_sample, 2);
rb_define_singleton_method(testing_module, "_native_on_gc_start", _native_on_gc_start, 1);
rb_define_singleton_method(testing_module, "_native_on_gc_finish", _native_on_gc_finish, 1);
rb_define_singleton_method(testing_module, "_native_sample_after_gc", _native_sample_after_gc, 1);
Expand Down Expand Up @@ -297,8 +306,10 @@ static VALUE _native_initialize(DDTRACE_UNUSED VALUE _self, VALUE collector_inst

// This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTime behavior using RSpec.
// It SHOULD NOT be used for other purposes.
static VALUE _native_sample(DDTRACE_UNUSED VALUE _self, VALUE collector_instance) {
cpu_and_wall_time_collector_sample(collector_instance, monotonic_wall_time_now_ns(RAISE_ON_FAILURE));
static VALUE _native_sample(DDTRACE_UNUSED VALUE _self, VALUE collector_instance, VALUE profiler_overhead_stack_thread) {
if (!is_thread_alive(profiler_overhead_stack_thread)) rb_raise(rb_eArgError, "Unexpected: profiler_overhead_stack_thread is not alive");

cpu_and_wall_time_collector_sample(collector_instance, monotonic_wall_time_now_ns(RAISE_ON_FAILURE), profiler_overhead_stack_thread);
return Qtrue;
}

Expand Down Expand Up @@ -330,44 +341,35 @@ static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_
// Assumption 3: This function IS NOT called from a signal handler. This function is not async-signal-safe.
// Assumption 4: This function IS NOT called in a reentrant way.
// Assumption 5: This function is called from the main Ractor (if Ruby has support for Ractors).
void cpu_and_wall_time_collector_sample(VALUE self_instance, long current_monotonic_wall_time_ns) {
//
// The `profiler_overhead_stack_thread` is used to attribute the profiler overhead to a stack borrowed from a different thread
// (belonging to ddtrace), so that the overhead is visible in the profile rather than blamed on user code.
void cpu_and_wall_time_collector_sample(VALUE self_instance, long current_monotonic_wall_time_ns, VALUE profiler_overhead_stack_thread) {
struct cpu_and_wall_time_collector_state *state;
TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_collector_state, &cpu_and_wall_time_collector_typed_data, state);

VALUE current_thread = rb_thread_current();
struct per_thread_context *current_thread_context = get_or_create_context_for(current_thread, state);
long cpu_time_at_sample_start_for_current_thread = cpu_time_now_ns(current_thread_context);

VALUE threads = ddtrace_thread_list();

const long thread_count = RARRAY_LEN(threads);
for (long i = 0; i < thread_count; i++) {
VALUE thread = RARRAY_AREF(threads, i);
struct per_thread_context *thread_context = get_or_create_context_for(thread, state);

long current_cpu_time_ns = cpu_time_now_ns(thread_context);
// We account for cpu-time for the current thread in a different way -- we use the cpu-time at sampling start, to avoid
// blaming the time the profiler took on whatever's running on the thread right now
long current_cpu_time_ns = thread != current_thread ? cpu_time_now_ns(thread_context) : cpu_time_at_sample_start_for_current_thread;

long cpu_time_elapsed_ns = update_time_since_previous_sample(
&thread_context->cpu_time_at_previous_sample_ns,
current_cpu_time_ns,
thread_context->gc_tracking.cpu_time_at_start_ns,
IS_NOT_WALL_TIME
);
long wall_time_elapsed_ns = update_time_since_previous_sample(
&thread_context->wall_time_at_previous_sample_ns,
current_monotonic_wall_time_ns,
thread_context->gc_tracking.wall_time_at_start_ns,
IS_WALL_TIME
);

int64_t metric_values[ENABLED_VALUE_TYPES_COUNT] = {0};

metric_values[CPU_TIME_VALUE_POS] = cpu_time_elapsed_ns;
metric_values[CPU_SAMPLES_VALUE_POS] = 1;
metric_values[WALL_TIME_VALUE_POS] = wall_time_elapsed_ns;

trigger_sample_for_thread(
update_metrics_and_sample(
state,
thread,
/* thread_being_sampled: */ thread,
/* stack_from_thread: */ thread,
thread_context,
(ddog_Slice_I64) {.ptr = metric_values, .len = ENABLED_VALUE_TYPES_COUNT},
SAMPLE_REGULAR
current_cpu_time_ns,
current_monotonic_wall_time_ns
);
}

Expand All @@ -376,6 +378,52 @@ void cpu_and_wall_time_collector_sample(VALUE self_instance, long current_monoto
// TODO: This seems somewhat overkill and inefficient to do often; right now we just do it every few samples
// but there's probably a better way to do this if we actually track when threads finish
if (state->sample_count % 100 == 0) remove_context_for_dead_threads(state);

update_metrics_and_sample(
state,
/* thread_being_sampled: */ current_thread,
/* stack_from_thread: */ profiler_overhead_stack_thread,
current_thread_context,
cpu_time_now_ns(current_thread_context),
monotonic_wall_time_now_ns(RAISE_ON_FAILURE)
);
}

void update_metrics_and_sample(
struct cpu_and_wall_time_collector_state *state,
VALUE thread_being_sampled,
VALUE stack_from_thread, // This can be different when attributing profiler overhead using a different stack
struct per_thread_context *thread_context,
long current_cpu_time_ns,
long current_monotonic_wall_time_ns
) {
long cpu_time_elapsed_ns = update_time_since_previous_sample(
&thread_context->cpu_time_at_previous_sample_ns,
current_cpu_time_ns,
thread_context->gc_tracking.cpu_time_at_start_ns,
IS_NOT_WALL_TIME
);
long wall_time_elapsed_ns = update_time_since_previous_sample(
&thread_context->wall_time_at_previous_sample_ns,
current_monotonic_wall_time_ns,
thread_context->gc_tracking.wall_time_at_start_ns,
IS_WALL_TIME
);

int64_t metric_values[ENABLED_VALUE_TYPES_COUNT] = {0};

metric_values[CPU_TIME_VALUE_POS] = cpu_time_elapsed_ns;
metric_values[CPU_SAMPLES_VALUE_POS] = 1;
metric_values[WALL_TIME_VALUE_POS] = wall_time_elapsed_ns;

trigger_sample_for_thread(
state,
thread_being_sampled,
stack_from_thread,
thread_context,
(ddog_Slice_I64) {.ptr = metric_values, .len = ENABLED_VALUE_TYPES_COUNT},
SAMPLE_REGULAR
);
}

// This function gets called when Ruby is about to start running the Garbage Collector on the current thread.
Expand Down Expand Up @@ -518,7 +566,8 @@ VALUE cpu_and_wall_time_collector_sample_after_gc(VALUE self_instance) {

trigger_sample_for_thread(
state,
thread,
/* thread: */ thread,
/* stack_from_thread: */ thread,
thread_context,
(ddog_Slice_I64) {.ptr = metric_values, .len = ENABLED_VALUE_TYPES_COUNT},
SAMPLE_IN_GC
Expand Down Expand Up @@ -548,13 +597,15 @@ VALUE cpu_and_wall_time_collector_sample_after_gc(VALUE self_instance) {
static void trigger_sample_for_thread(
struct cpu_and_wall_time_collector_state *state,
VALUE thread,
VALUE stack_from_thread, // This can be different when attributing profiler overhead using a different stack
struct per_thread_context *thread_context,
ddog_Slice_I64 metric_values_slice,
sample_type type
) {
int max_label_count =
1 + // thread id
1 + // thread name
1 + // profiler overhead
2; // local root span id and span id
ddog_prof_Label labels[max_label_count];
int label_pos = 0;
Expand Down Expand Up @@ -597,6 +648,13 @@ static void trigger_sample_for_thread(
}
}

if (thread != stack_from_thread) {
labels[label_pos++] = (ddog_prof_Label) {
.key = DDOG_CHARSLICE_C("profiler overhead"),
.num = 1
};
}

// The number of times `label_pos++` shows up in this function needs to match `max_label_count`. To avoid "oops I
// forgot to update max_label_count" in the future, we've also added this validation.
// @ivoanjo: I wonder if C compilers are smart enough to statically prove when this check never triggers happens and
Expand All @@ -606,7 +664,7 @@ static void trigger_sample_for_thread(
}

sample_thread(
thread,
stack_from_thread,
state->sampling_buffer,
state->recorder_instance,
metric_values_slice,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@

#include <ruby.h>

void cpu_and_wall_time_collector_sample(VALUE self_instance, long current_monotonic_wall_time_ns);
void cpu_and_wall_time_collector_sample(
VALUE self_instance,
long current_monotonic_wall_time_ns,
VALUE profiler_overhead_stack_thread
);
VALUE cpu_and_wall_time_collector_sample_after_gc(VALUE self_instance);
void cpu_and_wall_time_collector_on_gc_start(VALUE self_instance);
void cpu_and_wall_time_collector_on_gc_finish(VALUE self_instance);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -481,7 +481,8 @@ static VALUE rescued_sample_from_postponed_job(VALUE self_instance) {

state->stats.sampled++;

cpu_and_wall_time_collector_sample(state->cpu_and_wall_time_collector_instance, wall_time_ns_before_sample);
VALUE profiler_overhead_stack_thread = state->owner_thread; // Used to attribute profiler overhead to a different stack
cpu_and_wall_time_collector_sample(state->cpu_and_wall_time_collector_instance, wall_time_ns_before_sample, profiler_overhead_stack_thread);

long wall_time_ns_after_sample = monotonic_wall_time_now_ns(RAISE_ON_FAILURE);
long delta_ns = wall_time_ns_after_sample - wall_time_ns_before_sample;
Expand Down
Loading