diff --git a/benchmarks/profiler_sample_loop_v2.rb b/benchmarks/profiler_sample_loop_v2.rb index 4722a6c050f..43a9e85e148 100644 --- a/benchmarks/profiler_sample_loop_v2.rb +++ b/benchmarks/profiler_sample_loop_v2.rb @@ -19,7 +19,8 @@ def create_profiler @recorder = Datadog::Profiling::StackRecorder.new( cpu_time_enabled: true, alloc_samples_enabled: false, - heap_samples_enabled: false + heap_samples_enabled: false, + timeline_enabled: false, ) @collector = Datadog::Profiling::Collectors::ThreadContext.new( recorder: @recorder, max_frames: 400, tracer: nil, endpoint_collection_enabled: false, timeline_enabled: false diff --git a/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c b/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c index af9b0127e18..876dbd4afee 100644 --- a/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c +++ b/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c @@ -718,28 +718,11 @@ static void on_gc_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused) { if (event == RUBY_INTERNAL_EVENT_GC_ENTER) { thread_context_collector_on_gc_start(state->thread_context_collector_instance); } else if (event == RUBY_INTERNAL_EVENT_GC_EXIT) { - // Design: In an earlier iteration of this feature (see https://github.com/DataDog/dd-trace-rb/pull/2308) we - // actually had a single method to implement the behavior of both thread_context_collector_on_gc_finish - // and thread_context_collector_sample_after_gc (the latter is called via after_gc_from_postponed_job). - // - // Unfortunately, then we discovered the safety issue around no allocations, and thus decided to separate them -- so that - // the sampling could run outside the tight safety constraints of the garbage collection process. - // - // There is a downside: The sample is now taken very very shortly afterwards the GC finishes, and not immediately - // as the GC finishes, which means the stack captured may by affected by "skid", e.g. point slightly after where - // it should be pointing at. - // Alternatives to solve this would be to capture no stack for garbage collection (as we do for Java and .net); - // making the sampling process allocation-safe (very hard); or separate stack sampling from sample recording, - // e.g. enabling us to capture the stack in thread_context_collector_on_gc_finish and do the rest later - // (medium hard). - - thread_context_collector_on_gc_finish(state->thread_context_collector_instance); - // We use rb_postponed_job_register_one to ask Ruby to run thread_context_collector_sample_after_gc after if - // fully finishes the garbage collection, so that one is allowed to do allocations and throw exceptions as usual. - // - // Note: If we ever want to get rid of rb_postponed_job_register_one, remember not to clobber Ruby exceptions, as - // this function does this helpful job for us now -- https://github.com/ruby/ruby/commit/a98e343d39c4d7bf1e2190b076720f32d9f298b3. - rb_postponed_job_register_one(0, after_gc_from_postponed_job, NULL); + bool should_flush = thread_context_collector_on_gc_finish(state->thread_context_collector_instance); + + // We use rb_postponed_job_register_one to ask Ruby to run thread_context_collector_sample_after_gc when the + // thread collector flags it's time to flush. 
+ if (should_flush) rb_postponed_job_register_one(0, after_gc_from_postponed_job, NULL); } } diff --git a/ext/ddtrace_profiling_native_extension/collectors_gc_profiling_helper.c b/ext/ddtrace_profiling_native_extension/collectors_gc_profiling_helper.c new file mode 100644 index 00000000000..e9dd6c727d9 --- /dev/null +++ b/ext/ddtrace_profiling_native_extension/collectors_gc_profiling_helper.c @@ -0,0 +1,156 @@ +#include <ruby.h> +#include <datadog/profiling.h> + +#include "collectors_gc_profiling_helper.h" + +// This helper is used by the Datadog::Profiling::Collectors::ThreadContext to profile garbage collection. +// It's tested through that class' interfaces. +// --- + +// Used when retrieving GC information from the VM. +// All these are symbols, but we don't need to mark them since we ask for them to be interned (and thus live forever) +static VALUE state_sym; +static VALUE marking_sym; +static VALUE sweeping_sym; +static VALUE none_sym; +static VALUE gc_by_sym; +static VALUE newobj_sym; +static VALUE malloc_sym; +static VALUE method_sym; +static VALUE capi_sym; +static VALUE stress_sym; +static VALUE major_by_sym; +static VALUE nofree_sym; +static VALUE oldgen_sym; +static VALUE shady_sym; +static VALUE force_sym; +static VALUE oldmalloc_sym; + +static ddog_CharSlice major_gc_reason_pretty(VALUE major_gc_reason); +static ddog_CharSlice gc_cause_pretty(VALUE gc_cause); +static ddog_CharSlice gc_type_pretty(VALUE major_gc_reason, VALUE gc_state); + +void gc_profiling_init(void) { + // This function lazy-interns a few constants, which may trigger allocations. Since we want to call it during GC as + // well, when allocations are not allowed, we call it once here so that the constants get defined ahead of time. + rb_gc_latest_gc_info(rb_hash_new()); + + // Used to query and look up the results of GC information + state_sym = ID2SYM(rb_intern_const("state")); + marking_sym = ID2SYM(rb_intern_const("marking")); + sweeping_sym = ID2SYM(rb_intern_const("sweeping")); + none_sym = ID2SYM(rb_intern_const("none")); + gc_by_sym = ID2SYM(rb_intern_const("gc_by")); + newobj_sym = ID2SYM(rb_intern_const("newobj")); + malloc_sym = ID2SYM(rb_intern_const("malloc")); + method_sym = ID2SYM(rb_intern_const("method")); + capi_sym = ID2SYM(rb_intern_const("capi")); + stress_sym = ID2SYM(rb_intern_const("stress")); + major_by_sym = ID2SYM(rb_intern_const("major_by")); + nofree_sym = ID2SYM(rb_intern_const("nofree")); + oldgen_sym = ID2SYM(rb_intern_const("oldgen")); + shady_sym = ID2SYM(rb_intern_const("shady")); + force_sym = ID2SYM(rb_intern_const("force")); + oldmalloc_sym = ID2SYM(rb_intern_const("oldmalloc")); + state_sym = ID2SYM(rb_intern_const("state")); + none_sym = ID2SYM(rb_intern_const("none")); +} + +bool gc_profiling_has_major_gc_finished(void) { + return rb_gc_latest_gc_info(state_sym) == none_sym && rb_gc_latest_gc_info(major_by_sym) != Qnil; +} + +uint8_t gc_profiling_set_metadata(ddog_prof_Label *labels, int labels_length) { + uint8_t max_label_count = + 1 + // thread id + 1 + // thread name + 1 + // state + 1 + // event + 1 + // gc reason + 1 + // gc cause + 1; // gc type + + if (max_label_count > labels_length) { + rb_raise(rb_eArgError, "BUG: gc_profiling_set_metadata invalid labels_length (%d) < max_label_count (%d)", labels_length, max_label_count); + } + + uint8_t label_pos = 0; + + labels[label_pos++] = (ddog_prof_Label) { + .key = DDOG_CHARSLICE_C("thread id"), + .str = DDOG_CHARSLICE_C("GC"), + .num = 0, // This shouldn't be needed but the tracer-2.7 docker image ships a buggy gcc that complains about this + }; + 
+ labels[label_pos++] = (ddog_prof_Label) { + .key = DDOG_CHARSLICE_C("thread name"), + .str = DDOG_CHARSLICE_C("Garbage Collection"), + .num = 0, // Workaround, same as above + }; + + labels[label_pos++] = (ddog_prof_Label) { + .key = DDOG_CHARSLICE_C("state"), + .str = DDOG_CHARSLICE_C("had cpu"), + .num = 0, // Workaround, same as above + }; + + labels[label_pos++] = (ddog_prof_Label) { + .key = DDOG_CHARSLICE_C("event"), + .str = DDOG_CHARSLICE_C("gc"), + .num = 0, // Workaround, same as above + }; + + VALUE major_by = rb_gc_latest_gc_info(major_by_sym); + if (major_by != Qnil) { + labels[label_pos++] = (ddog_prof_Label) { + .key = DDOG_CHARSLICE_C("gc reason"), + .str = major_gc_reason_pretty(major_by), + }; + } + + labels[label_pos++] = (ddog_prof_Label) { + .key = DDOG_CHARSLICE_C("gc cause"), + .str = gc_cause_pretty(rb_gc_latest_gc_info(gc_by_sym)), + }; + + labels[label_pos++] = (ddog_prof_Label) { + .key = DDOG_CHARSLICE_C("gc type"), + .str = gc_type_pretty(major_by, rb_gc_latest_gc_info(state_sym)), + }; + + if (label_pos > max_label_count) { + rb_raise(rb_eRuntimeError, "BUG: gc_profiling_set_metadata unexpected label_pos (%d) > max_label_count (%d)", label_pos, max_label_count); + } + + return label_pos; +} + +static ddog_CharSlice major_gc_reason_pretty(VALUE major_gc_reason) { + if (major_gc_reason == nofree_sym ) return DDOG_CHARSLICE_C("not enough free slots (NOFREE)"); + if (major_gc_reason == oldgen_sym ) return DDOG_CHARSLICE_C("old generation full (OLDGEN)"); + if (major_gc_reason == shady_sym ) return DDOG_CHARSLICE_C("too many objects without write barriers (SHADY)"); + if (major_gc_reason == force_sym ) return DDOG_CHARSLICE_C("requested (FORCE)"); + if (major_gc_reason == oldmalloc_sym) return DDOG_CHARSLICE_C("heap bytes allocated threshold (OLDMALLOC)"); + return DDOG_CHARSLICE_C("unknown"); +} + +static ddog_CharSlice gc_cause_pretty(VALUE gc_cause) { + if (gc_cause == newobj_sym) return DDOG_CHARSLICE_C("object allocation"); + if (gc_cause == malloc_sym) return DDOG_CHARSLICE_C("malloc()"); + if (gc_cause == method_sym) return DDOG_CHARSLICE_C("GC.start()"); + if (gc_cause == capi_sym ) return DDOG_CHARSLICE_C("rb_gc()"); + if (gc_cause == stress_sym) return DDOG_CHARSLICE_C("stress"); + return DDOG_CHARSLICE_C("unknown"); +} + +static ddog_CharSlice gc_type_pretty(VALUE major_gc_reason, VALUE gc_state) { + if (major_gc_reason != Qnil) { + if (gc_state == marking_sym ) return DDOG_CHARSLICE_C("major (ongoing, marking)"); + if (gc_state == sweeping_sym) return DDOG_CHARSLICE_C("major (ongoing, sweeping)"); + return DDOG_CHARSLICE_C("major"); + } else { + // As we delay flushing events when a minor GC finishes, it's not relevant to include the observed state of the + // minor GC, as we often won't record a marking -> sweeping -> done cycle, as it happens too quickly. 
+ return DDOG_CHARSLICE_C("minor"); + } +} diff --git a/ext/ddtrace_profiling_native_extension/collectors_gc_profiling_helper.h b/ext/ddtrace_profiling_native_extension/collectors_gc_profiling_helper.h new file mode 100644 index 00000000000..7a5096624c6 --- /dev/null +++ b/ext/ddtrace_profiling_native_extension/collectors_gc_profiling_helper.h @@ -0,0 +1,5 @@ +#pragma once + +void gc_profiling_init(void); +bool gc_profiling_has_major_gc_finished(void); +uint8_t gc_profiling_set_metadata(ddog_prof_Label *labels, int labels_length); diff --git a/ext/ddtrace_profiling_native_extension/collectors_stack.c b/ext/ddtrace_profiling_native_extension/collectors_stack.c index 7611f13e41e..9bc0d11e0cb 100644 --- a/ext/ddtrace_profiling_native_extension/collectors_stack.c +++ b/ext/ddtrace_profiling_native_extension/collectors_stack.c @@ -36,22 +36,13 @@ static VALUE _native_sample( VALUE in_gc ); static void maybe_add_placeholder_frames_omitted(VALUE thread, sampling_buffer* buffer, char *frames_omitted_message, int frames_omitted_message_size); -static void record_placeholder_stack_in_native_code( - sampling_buffer* buffer, - VALUE recorder_instance, - sample_values values, - sample_labels labels, - sampling_buffer *record_buffer, - int extra_frames_in_record_buffer -); +static void record_placeholder_stack_in_native_code(sampling_buffer* buffer, VALUE recorder_instance, sample_values values, sample_labels labels); static void sample_thread_internal( VALUE thread, sampling_buffer* buffer, VALUE recorder_instance, sample_values values, - sample_labels labels, - sampling_buffer *record_buffer, - int extra_frames_in_record_buffer + sample_labels labels ); void collectors_stack_init(VALUE profiling_module) { @@ -88,6 +79,7 @@ static VALUE _native_sample( .cpu_or_wall_samples = NUM2UINT(rb_hash_lookup2(metric_values_hash, rb_str_new_cstr("cpu-samples"), zero)), .wall_time_ns = NUM2UINT(rb_hash_lookup2(metric_values_hash, rb_str_new_cstr("wall-time"), zero)), .alloc_samples = NUM2UINT(rb_hash_lookup2(metric_values_hash, rb_str_new_cstr("alloc-samples"), zero)), + .timeline_wall_time_ns = NUM2UINT(rb_hash_lookup2(metric_values_hash, rb_str_new_cstr("timeline"), zero)), }; long labels_count = RARRAY_LEN(labels_array) + RARRAY_LEN(numeric_labels_array); @@ -122,62 +114,29 @@ static VALUE _native_sample( ddog_prof_Slice_Label slice_labels = {.ptr = labels, .len = labels_count}; - sample_thread( - thread, - buffer, - recorder_instance, - values, - (sample_labels) {.labels = slice_labels, .state_label = state_label}, - RTEST(in_gc) ? 
SAMPLE_IN_GC : SAMPLE_REGULAR - ); + if (in_gc == Qtrue) { + record_placeholder_stack( + buffer, + recorder_instance, + values, + (sample_labels) {.labels = slice_labels, .state_label = state_label}, + DDOG_CHARSLICE_C("Garbage Collection") + ); + } else { + sample_thread( + thread, + buffer, + recorder_instance, + values, + (sample_labels) {.labels = slice_labels, .state_label = state_label} + ); + } sampling_buffer_free(buffer); return Qtrue; } -void sample_thread( - VALUE thread, - sampling_buffer* buffer, - VALUE recorder_instance, - sample_values values, - sample_labels labels, - sample_type type -) { - // Samples thread into recorder - if (type == SAMPLE_REGULAR) { - sampling_buffer *record_buffer = buffer; - int extra_frames_in_record_buffer = 0; - sample_thread_internal(thread, buffer, recorder_instance, values, labels, record_buffer, extra_frames_in_record_buffer); - return; - } - - // Samples thread into recorder, including as a top frame in the stack a frame named "Garbage Collection" - if (type == SAMPLE_IN_GC) { - ddog_CharSlice function_name = DDOG_CHARSLICE_C(""); - ddog_CharSlice function_filename = DDOG_CHARSLICE_C("Garbage Collection"); - buffer->locations[0] = (ddog_prof_Location) { - .function = (ddog_prof_Function) {.name = function_name, .filename = function_filename}, - .line = 0 - }; - // To avoid changing sample_thread_internal, we just prepare a new buffer struct that uses the same underlying storage as the - // original buffer, but has capacity one less, so that we can keep the above Garbage Collection frame untouched. - sampling_buffer thread_in_gc_buffer = (struct sampling_buffer) { - .max_frames = buffer->max_frames - 1, - .stack_buffer = buffer->stack_buffer + 1, - .lines_buffer = buffer->lines_buffer + 1, - .is_ruby_frame = buffer->is_ruby_frame + 1, - .locations = buffer->locations + 1, - }; - sampling_buffer *record_buffer = buffer; // We pass in the original buffer as the record_buffer, but not as the regular buffer - int extra_frames_in_record_buffer = 1; - sample_thread_internal(thread, &thread_in_gc_buffer, recorder_instance, values, labels, record_buffer, extra_frames_in_record_buffer); - return; - } - - rb_raise(rb_eArgError, "Unexpected value for sample_type: %d", type); -} - #define CHARSLICE_EQUALS(must_be_a_literal, charslice) (strlen("" must_be_a_literal) == charslice.len && strncmp(must_be_a_literal, charslice.ptr, charslice.len) == 0) // Idea: Should we release the global vm lock (GVL) after we get the data from `rb_profile_frames`? That way other Ruby threads @@ -189,24 +148,12 @@ void sample_thread( // * Should we move this into a different thread entirely? // * If we don't move it into a different thread, does releasing the GVL on a Ruby thread mean that we're introducing // a new thread switch point where there previously was none? -// -// --- -// -// Why the weird extra record_buffer and extra_frames_in_record_buffer? -// The answer is: to support both sample_thread() and sample_thread_in_gc(). -// -// For sample_thread(), buffer == record_buffer and extra_frames_in_record_buffer == 0, so it's a no-op. -// For sample_thread_in_gc(), the buffer is a special buffer that is the same as the record_buffer, but with every -// pointer shifted forward extra_frames_in_record_buffer elements, so that the caller can actually inject those extra -// frames, and this function doesn't have to care about it. 
-static void sample_thread_internal( +void sample_thread( VALUE thread, sampling_buffer* buffer, VALUE recorder_instance, sample_values values, - sample_labels labels, - sampling_buffer *record_buffer, - int extra_frames_in_record_buffer + sample_labels labels ) { int captured_frames = ddtrace_rb_profile_frames( thread, @@ -218,14 +165,7 @@ static void sample_thread_internal( ); if (captured_frames == PLACEHOLDER_STACK_IN_NATIVE_CODE) { - record_placeholder_stack_in_native_code( - buffer, - recorder_instance, - values, - labels, - record_buffer, - extra_frames_in_record_buffer - ); + record_placeholder_stack_in_native_code(buffer, recorder_instance, values, labels); return; } @@ -332,7 +272,7 @@ static void sample_thread_internal( record_sample( recorder_instance, - (ddog_prof_Slice_Location) {.ptr = record_buffer->locations, .len = captured_frames + extra_frames_in_record_buffer}, + (ddog_prof_Slice_Location) {.ptr = buffer->locations, .len = captured_frames}, values, labels ); @@ -379,24 +319,35 @@ static void maybe_add_placeholder_frames_omitted(VALUE thread, sampling_buffer* // // To give customers visibility into these threads, rather than reporting an empty stack, we replace the empty stack // with one containing a placeholder frame, so that these threads are properly represented in the UX. + static void record_placeholder_stack_in_native_code( + sampling_buffer* buffer, + VALUE recorder_instance, + sample_values values, + sample_labels labels +) { + record_placeholder_stack( + buffer, + recorder_instance, + values, + labels, + DDOG_CHARSLICE_C("In native code") + ); +} + +void record_placeholder_stack( sampling_buffer* buffer, VALUE recorder_instance, sample_values values, sample_labels labels, - sampling_buffer *record_buffer, - int extra_frames_in_record_buffer + ddog_CharSlice placeholder_stack ) { - ddog_CharSlice function_name = DDOG_CHARSLICE_C(""); - ddog_CharSlice function_filename = DDOG_CHARSLICE_C("In native code"); - buffer->locations[0] = (ddog_prof_Location) { - .function = (ddog_prof_Function) {.name = function_name, .filename = function_filename}, - .line = 0 - }; + ddog_prof_Function placeholder = {.name = DDOG_CHARSLICE_C(""), .filename = placeholder_stack}; + buffer->locations[0] = (ddog_prof_Location) {.function = placeholder, .line = 0}; record_sample( recorder_instance, - (ddog_prof_Slice_Location) {.ptr = record_buffer->locations, .len = 1 + extra_frames_in_record_buffer}, + (ddog_prof_Slice_Location) {.ptr = buffer->locations, .len = 1}, values, labels ); diff --git a/ext/ddtrace_profiling_native_extension/collectors_stack.h b/ext/ddtrace_profiling_native_extension/collectors_stack.h index e4175ae35e8..b026bbbdeb8 100644 --- a/ext/ddtrace_profiling_native_extension/collectors_stack.h +++ b/ext/ddtrace_profiling_native_extension/collectors_stack.h @@ -6,15 +6,19 @@ typedef struct sampling_buffer sampling_buffer; -typedef enum { SAMPLE_REGULAR, SAMPLE_IN_GC } sample_type; - void sample_thread( VALUE thread, + sampling_buffer* buffer, + VALUE recorder_instance, + sample_values values, + sample_labels labels +); +void record_placeholder_stack( sampling_buffer* buffer, VALUE recorder_instance, sample_values values, sample_labels labels, - sample_type type + ddog_CharSlice placeholder_stack ); sampling_buffer *sampling_buffer_new(unsigned int max_frames); void sampling_buffer_free(sampling_buffer *buffer); diff --git a/ext/ddtrace_profiling_native_extension/collectors_thread_context.c b/ext/ddtrace_profiling_native_extension/collectors_thread_context.c index 
618cdb23f5d..a86ce1dba9d 100644 --- a/ext/ddtrace_profiling_native_extension/collectors_thread_context.c +++ b/ext/ddtrace_profiling_native_extension/collectors_thread_context.c @@ -3,6 +3,7 @@ #include "collectors_thread_context.h" #include "clock_id.h" #include "collectors_stack.h" +#include "collectors_gc_profiling_helper.h" #include "helpers.h" #include "libdatadog_helpers.h" #include "private_vm_api_access.h" @@ -37,24 +38,29 @@ // When `thread_context_collector_on_gc_start` gets called, the current cpu and wall-time get recorded to the thread // context: `cpu_time_at_gc_start_ns` and `wall_time_at_gc_start_ns`. // -// While these fields are set, regular samples (if any) do not account for any time that passes after these two -// timestamps. +// While `cpu_time_at_gc_start_ns` is set, regular samples (if any) do not account for any cpu-time that passes +// after this timestamp. The idea is that this cpu-time will be blamed separately on GC, and not on the user thread. +// Wall-time accounting is not affected by this (e.g. we still record 60 seconds every 60 seconds). // -// (Regular samples can still account for the time between the previous sample and the start of GC.) +// (Regular samples can still account for the cpu-time between the previous sample and the start of GC.) // -// When `thread_context_collector_on_gc_finish` gets called, the current cpu and wall-time again get recorded to the -// thread context: `cpu_time_at_gc_finish_ns` and `wall_time_at_gc_finish_ns`. +// When `thread_context_collector_on_gc_finish` gets called, the cpu-time and wall-time spent during GC gets recorded +// into the global gc_tracking structure, and further samples are not affected. (The `cpu_time_at_previous_sample_ns` +// of the thread that did GC also gets adjusted to avoid double-accounting.) // -// Finally, when `thread_context_collector_sample_after_gc` gets called, the following happens: +// Finally, when `thread_context_collector_sample_after_gc` gets called, a sample gets recorded with a stack having +// a single placeholder `Garbage Collection` frame. This sample gets +// assigned the cpu-time and wall-time that was recorded between calls to `on_gc_start` and `on_gc_finish`, as well +// as metadata for the last GC. // -// 1. A sample gets taken, using the special `SAMPLE_IN_GC` sample type, which produces a stack with a placeholder -// `Garbage Collection` frame as the latest frame. This sample gets assigned the cpu-time and wall-time period that was -// recorded between calls to `on_gc_start` and `on_gc_finish`. -// -// 2. The thread is no longer marked as being in gc (all gc tracking fields get reset back to `INVALID_TIME`). -// -// 3. The `cpu_time_at_previous_sample_ns` and `wall_time_at_previous_sample_ns` get updated with the elapsed time in -// GC, so that all time is accounted for -- e.g. the next sample will not get "blamed" by time spent in GC. +// Note that the Ruby GC does not usually do all of the GC work in one go. Instead, it breaks it up into smaller steps +// so that the application can keep doing user work in between GC steps. +// The `on_gc_start` / `on_gc_finish` hooks will trigger each time the VM executes one of these smaller steps, and on a benchmark +// that executes `Object.new` in a loop, I measured more than 50k of these steps per second (!!). +// Creating this many events for every GC step is a lot of overhead, so instead `on_gc_finish` coalesces time +// spent in GC and only flushes it at most every 10 ms or on every complete GC cycle. 
This reduces the amount of +// individual GC events we need to record. We use the latest GC metadata for this event, reflecting the last GC that +// happened in the coalesced period. // // In an earlier attempt at implementing this functionality (https://github.com/DataDog/dd-trace-rb/pull/2308), we // discovered that we needed to factor the sampling work away from `thread_context_collector_on_gc_finish` and into a @@ -68,6 +74,7 @@ #define IS_WALL_TIME true #define IS_NOT_WALL_TIME false #define MISSING_TRACER_CONTEXT_KEY 0 +#define TIME_BETWEEN_GC_EVENTS_NS MILLIS_AS_NS(10) static ID at_active_span_id; // id of :@active_span in Ruby static ID at_active_trace_id; // id of :@active_trace in Ruby @@ -114,6 +121,14 @@ struct thread_context_collector_state { // See thread_context_collector_on_gc_start for details unsigned int gc_samples_missed_due_to_missing_context; } stats; + + struct { + unsigned long accumulated_cpu_time_ns; + unsigned long accumulated_wall_time_ns; + + long wall_time_at_previous_gc_ns; // Will be INVALID_TIME unless there's accumulated time above + long wall_time_at_last_flushed_gc_event_ns; // Starts at 0 and then will always be valid + } gc_tracking; }; // Tracks per-thread state @@ -127,15 +142,10 @@ struct per_thread_context { long wall_time_at_previous_sample_ns; // Can be INVALID_TIME until initialized struct { - // Both of these fields are set by on_gc_start and kept until sample_after_gc is called. + // Both of these fields are set by on_gc_start and kept until on_gc_finish is called. // Outside of this window, they will be INVALID_TIME. long cpu_time_at_start_ns; long wall_time_at_start_ns; - - // Both of these fields are set by on_gc_finish and kept until sample_after_gc is called. - // Outside of this window, they will be INVALID_TIME. 
- long cpu_time_at_finish_ns; - long wall_time_at_finish_ns; } gc_tracking; }; @@ -180,7 +190,6 @@ static void trigger_sample_for_thread( VALUE stack_from_thread, struct per_thread_context *thread_context, sample_values values, - sample_type type, long current_monotonic_wall_time_ns, ddog_CharSlice *ruby_vm_type, ddog_CharSlice *class_name @@ -193,6 +202,7 @@ static VALUE _native_inspect(VALUE self, VALUE collector_instance); static VALUE per_thread_context_st_table_as_ruby_hash(struct thread_context_collector_state *state); static int per_thread_context_as_ruby_hash(st_data_t key_thread, st_data_t value_context, st_data_t result_hash); static VALUE stats_as_ruby_hash(struct thread_context_collector_state *state); +static VALUE gc_tracking_as_ruby_hash(struct thread_context_collector_state *state); static void remove_context_for_dead_threads(struct thread_context_collector_state *state); static int remove_if_dead_thread(st_data_t key_thread, st_data_t value_context, st_data_t _argument); static VALUE _native_per_thread_context(VALUE self, VALUE collector_instance); @@ -200,13 +210,14 @@ static long update_time_since_previous_sample(long *time_at_previous_sample_ns, static long cpu_time_now_ns(struct per_thread_context *thread_context); static long thread_id_for(VALUE thread); static VALUE _native_stats(VALUE self, VALUE collector_instance); +static VALUE _native_gc_tracking(VALUE self, VALUE collector_instance); static void trace_identifiers_for(struct thread_context_collector_state *state, VALUE thread, struct trace_identifiers *trace_identifiers_result); static bool should_collect_resource(VALUE root_span_type); static VALUE _native_reset_after_fork(DDTRACE_UNUSED VALUE self, VALUE collector_instance); static VALUE thread_list(struct thread_context_collector_state *state); static VALUE _native_sample_allocation(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE sample_weight, VALUE new_object); static VALUE _native_new_empty_thread(VALUE self); -ddog_CharSlice ruby_value_type_to_class_name(enum ruby_value_type type); +static ddog_CharSlice ruby_value_type_to_class_name(enum ruby_value_type type); void collectors_thread_context_init(VALUE profiling_module) { VALUE collectors_module = rb_define_module_under(profiling_module, "Collectors"); @@ -235,6 +246,7 @@ void collectors_thread_context_init(VALUE profiling_module) { rb_define_singleton_method(testing_module, "_native_thread_list", _native_thread_list, 0); rb_define_singleton_method(testing_module, "_native_per_thread_context", _native_per_thread_context, 1); rb_define_singleton_method(testing_module, "_native_stats", _native_stats, 1); + rb_define_singleton_method(testing_module, "_native_gc_tracking", _native_gc_tracking, 1); rb_define_singleton_method(testing_module, "_native_new_empty_thread", _native_new_empty_thread, 0); at_active_span_id = rb_intern_const("@active_span"); @@ -243,6 +255,8 @@ void collectors_thread_context_init(VALUE profiling_module) { at_resource_id = rb_intern_const("@resource"); at_root_span_id = rb_intern_const("@root_span"); at_type_id = rb_intern_const("@type"); + + gc_profiling_init(); } // This structure is used to define a Ruby object that stores a pointer to a struct thread_context_collector_state @@ -320,6 +334,8 @@ static VALUE _native_new(VALUE klass) { state->allocation_type_enabled = true; state->time_converter_state = (monotonic_to_system_epoch_state) MONOTONIC_TO_SYSTEM_EPOCH_INITIALIZER; state->main_thread = rb_thread_main(); + state->gc_tracking.wall_time_at_previous_gc_ns = 
INVALID_TIME; + state->gc_tracking.wall_time_at_last_flushed_gc_event_ns = 0; return TypedData_Wrap_Struct(klass, &thread_context_collector_typed_data, state); } @@ -465,7 +481,11 @@ void update_metrics_and_sample( long wall_time_elapsed_ns = update_time_since_previous_sample( &thread_context->wall_time_at_previous_sample_ns, current_monotonic_wall_time_ns, - thread_context->gc_tracking.wall_time_at_start_ns, + // We explicitly pass in `INVALID_TIME` as an argument for `gc_start_time_ns` here because we don't want wall-time + // accounting to change during GC. + // E.g. if 60 seconds pass in the real world, 60 seconds of wall-time are recorded, regardless of the thread doing + // GC or not. + INVALID_TIME, IS_WALL_TIME ); @@ -475,7 +495,6 @@ void update_metrics_and_sample( stack_from_thread, thread_context, (sample_values) {.cpu_time_ns = cpu_time_elapsed_ns, .cpu_or_wall_samples = 1, .wall_time_ns = wall_time_elapsed_ns}, - SAMPLE_REGULAR, current_monotonic_wall_time_ns, NULL, NULL @@ -484,7 +503,7 @@ void update_metrics_and_sample( // This function gets called when Ruby is about to start running the Garbage Collector on the current thread. // It updates the per_thread_context of the current thread to include the current cpu/wall times, to be used to later -// create a stack sample that blames the cpu/wall time spent from now until the end of the garbage collector work. +// create an event including the cpu/wall time spent in garbage collector work. // // Safety: This function gets called while Ruby is doing garbage collection. While Ruby is doing garbage collection, // *NO ALLOCATION* is allowed. This function, and any it calls must never trigger memory or object allocation. @@ -509,27 +528,14 @@ void thread_context_collector_on_gc_start(VALUE self_instance) { return; } - // If these fields are set, there's an existing GC sample that still needs to be written out by `sample_after_gc`. - // - // When can this happen? Because we don't have precise control over when `sample_after_gc` gets called (it will be - // called sometime after GC finishes), there is no way to guarantee that Ruby will not trigger more than one GC cycle - // before we can actually run that method. - // - // We handle this by collapsing multiple GC cycles into one. That is, if the following happens: - // `on_gc_start` (time=0) -> `on_gc_finish` (time=1) -> `on_gc_start` (time=2) -> `on_gc_finish` (time=3) -> `sample_after_gc` - // then we just use time=0 from the first on_gc_start and time=3 from the last on_gc_finish, e.g. we behave as if - // there was a single, longer GC period. - if (thread_context->gc_tracking.cpu_time_at_finish_ns != INVALID_TIME && - thread_context->gc_tracking.wall_time_at_finish_ns != INVALID_TIME) return; - - // Here we record the wall-time first and in on_gc_finish we record it second to avoid having wall-time be slightly < cpu-time + // Here we record the wall-time first and in on_gc_finish we record it second to try to avoid having wall-time be slightly < cpu-time thread_context->gc_tracking.wall_time_at_start_ns = monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE); thread_context->gc_tracking.cpu_time_at_start_ns = cpu_time_now_ns(thread_context); } // This function gets called when Ruby has finished running the Garbage Collector on the current thread. -// It updates the per_thread_context of the current thread to include the current cpu/wall times, to be used to later -// create a stack sample that blames the cpu/wall time spent from the start of garbage collector work until now. 
+// It records the cpu/wall-time observed during GC, which will be used to later +// create an event including the cpu/wall time spent from the start of garbage collector work until now. // // Safety: This function gets called while Ruby is doing garbage collection. While Ruby is doing garbage collection, // *NO ALLOCATION* is allowed. This function, and any it calls must never trigger memory or object allocation. @@ -537,9 +543,9 @@ void thread_context_collector_on_gc_start(VALUE self_instance) { // // Assumption 1: This function is called in a thread that is holding the Global VM Lock. Caller is responsible for enforcing this. // Assumption 2: This function is called from the main Ractor (if Ruby has support for Ractors). -void thread_context_collector_on_gc_finish(VALUE self_instance) { +bool thread_context_collector_on_gc_finish(VALUE self_instance) { struct thread_context_collector_state *state; - if (!rb_typeddata_is_kind_of(self_instance, &thread_context_collector_typed_data)) return; + if (!rb_typeddata_is_kind_of(self_instance, &thread_context_collector_typed_data)) return false; // This should never fail if the above check passes TypedData_Get_Struct(self_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state); @@ -547,29 +553,67 @@ // If there was no previously-existing context for this thread, we won't allocate one (see safety). We keep a metric for // how often this happens -- see on_gc_start. - if (thread_context == NULL) return; + if (thread_context == NULL) return false; - if (thread_context->gc_tracking.cpu_time_at_start_ns == INVALID_TIME && - thread_context->gc_tracking.wall_time_at_start_ns == INVALID_TIME) { + long cpu_time_at_start_ns = thread_context->gc_tracking.cpu_time_at_start_ns; + long wall_time_at_start_ns = thread_context->gc_tracking.wall_time_at_start_ns; + + if (cpu_time_at_start_ns == INVALID_TIME && wall_time_at_start_ns == INVALID_TIME) { // If this happened, it means that on_gc_start was either never called for the thread OR it was called but no thread // context existed at the time. The former can be the result of a bug, but since we can't distinguish them, we just // do nothing. - return; + return false; + } + + // Mark thread as no longer in GC + thread_context->gc_tracking.cpu_time_at_start_ns = INVALID_TIME; + thread_context->gc_tracking.wall_time_at_start_ns = INVALID_TIME; + + // Here we record the wall-time second and in on_gc_start we record it first to try to avoid having wall-time be slightly < cpu-time + long cpu_time_at_finish_ns = cpu_time_now_ns(thread_context); + long wall_time_at_finish_ns = monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE); + + // If our end timestamp is not OK, we bail out + if (wall_time_at_finish_ns == 0) return false; + + long gc_cpu_time_elapsed_ns = cpu_time_at_finish_ns - cpu_time_at_start_ns; + long gc_wall_time_elapsed_ns = wall_time_at_finish_ns - wall_time_at_start_ns; + + // Wall-time can go backwards if the system clock gets changed (and we observed spurious jumps back on macOS as well) + // so let's ensure we don't get negative values for time deltas. 
+ gc_cpu_time_elapsed_ns = long_max_of(gc_cpu_time_elapsed_ns, 0); + gc_wall_time_elapsed_ns = long_max_of(gc_wall_time_elapsed_ns, 0); + + if (state->gc_tracking.wall_time_at_previous_gc_ns == INVALID_TIME) { + state->gc_tracking.accumulated_cpu_time_ns = 0; + state->gc_tracking.accumulated_wall_time_ns = 0; } - // Here we record the wall-time second and in on_gc_start we record it first to avoid having wall-time be slightly < cpu-time - thread_context->gc_tracking.cpu_time_at_finish_ns = cpu_time_now_ns(thread_context); - thread_context->gc_tracking.wall_time_at_finish_ns = monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE); + state->gc_tracking.accumulated_cpu_time_ns += gc_cpu_time_elapsed_ns; + state->gc_tracking.accumulated_wall_time_ns += gc_wall_time_elapsed_ns; + state->gc_tracking.wall_time_at_previous_gc_ns = wall_time_at_finish_ns; + + // Update cpu-time accounting so it doesn't include the cpu-time spent in GC during the next sample + // We don't update the wall-time because we don't subtract the wall-time spent in GC (see call to + // `update_time_since_previous_sample` for wall-time in `update_metrics_and_sample`). + if (thread_context->cpu_time_at_previous_sample_ns != INVALID_TIME) { + thread_context->cpu_time_at_previous_sample_ns += gc_cpu_time_elapsed_ns; + } + + // Let the caller know if it should schedule a flush or not. Returning true every time would cause a lot of overhead + // on the application (see GC tracking introduction at the top of the file), so instead we try to accumulate a few + // samples first. + bool finished_major_gc = gc_profiling_has_major_gc_finished(); + bool over_flush_time_threshold = + (wall_time_at_finish_ns - state->gc_tracking.wall_time_at_last_flushed_gc_event_ns) >= TIME_BETWEEN_GC_EVENTS_NS; + + return finished_major_gc || over_flush_time_threshold; } -// This function gets called shortly after Ruby has finished running the Garbage Collector. +// This function gets called after one or more GC work steps (calls to on_gc_start/on_gc_finish). // It creates a new sample including the cpu and wall-time spent on garbage collector work, and resets any // GC-related tracking. // -// Specifically, it will search for thread(s) which have gone through a cycle of on_gc_start/on_gc_finish -// and thus have cpu_time_at_start_ns, cpu_time_at_finish_ns, wall_time_at_start_ns, wall_time_at_finish_ns -// set on their context. -// // Assumption 1: This function is called in a thread that is holding the Global VM Lock. Caller is responsible for enforcing this. // Assumption 2: This function is allowed to raise exceptions. Caller is responsible for handling them, if needed. // Assumption 3: Unlike `on_gc_start` and `on_gc_finish`, this method is allowed to allocate memory as needed. 
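To make the coalescing above concrete, here is a minimal plain-Ruby sketch of the flush decision that `thread_context_collector_on_gc_finish` implements. It is illustrative only, not part of the patch: the `GcCoalescer` name is hypothetical, but the 10 ms threshold mirrors `TIME_BETWEEN_GC_EVENTS_NS`, and the major-GC check mirrors `gc_profiling_has_major_gc_finished`, which reads the same GC information this sketch queries via `GC.latest_gc_info`.

```ruby
# Illustrative-only Ruby model of the coalescing done in C; GcCoalescer is a
# hypothetical name, not something this patch adds.
class GcCoalescer
  TIME_BETWEEN_GC_EVENTS_NS = 10 * 1_000_000 # 10 ms, as in the C code

  def initialize
    @accumulated_cpu_time_ns = 0
    @accumulated_wall_time_ns = 0
    @wall_time_at_last_flushed_gc_event_ns = 0
  end

  # Called at the end of every GC step; returns true when the caller should
  # schedule a flush (the equivalent of registering sample_after_gc).
  def on_gc_finish(gc_cpu_time_elapsed_ns, gc_wall_time_elapsed_ns, wall_time_at_finish_ns)
    # Clamp deltas so clock adjustments never produce negative durations
    @accumulated_cpu_time_ns += [gc_cpu_time_elapsed_ns, 0].max
    @accumulated_wall_time_ns += [gc_wall_time_elapsed_ns, 0].max

    # A major GC has fully finished when the VM reports no GC in progress and
    # the latest GC had a major_by reason
    finished_major_gc =
      GC.latest_gc_info(:state) == :none && !GC.latest_gc_info(:major_by).nil?

    over_flush_time_threshold =
      (wall_time_at_finish_ns - @wall_time_at_last_flushed_gc_event_ns) >= TIME_BETWEEN_GC_EVENTS_NS

    finished_major_gc || over_flush_time_threshold
  end
end
```

When the flush finally happens, `thread_context_collector_sample_after_gc` attaches the accumulated cpu/wall-time to a single placeholder `Garbage Collection` sample, as seen in the hunk below.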
@@ -578,70 +622,45 @@ VALUE thread_context_collector_sample_after_gc(VALUE self_instance) { struct thread_context_collector_state *state; TypedData_Get_Struct(self_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state); - VALUE threads = thread_list(state); - bool sampled_any_thread = false; - - const long thread_count = RARRAY_LEN(threads); - for (long i = 0; i < thread_count; i++) { - VALUE thread = RARRAY_AREF(threads, i); - struct per_thread_context *thread_context = get_or_create_context_for(thread, state); + if (state->gc_tracking.wall_time_at_previous_gc_ns == INVALID_TIME) { + rb_raise(rb_eRuntimeError, "BUG: Unexpected call to sample_after_gc without valid GC information available"); + } - if ( - thread_context->gc_tracking.cpu_time_at_start_ns == INVALID_TIME || - thread_context->gc_tracking.cpu_time_at_finish_ns == INVALID_TIME || - thread_context->gc_tracking.wall_time_at_start_ns == INVALID_TIME || - thread_context->gc_tracking.wall_time_at_finish_ns == INVALID_TIME - ) continue; // Ignore threads with no/incomplete garbage collection data - - sampled_any_thread = true; - - long gc_cpu_time_elapsed_ns = - thread_context->gc_tracking.cpu_time_at_finish_ns - thread_context->gc_tracking.cpu_time_at_start_ns; - long gc_wall_time_elapsed_ns = - thread_context->gc_tracking.wall_time_at_finish_ns - thread_context->gc_tracking.wall_time_at_start_ns; - - // We don't expect non-wall time to go backwards, so let's flag this as a bug - if (gc_cpu_time_elapsed_ns < 0) rb_raise(rb_eRuntimeError, "BUG: Unexpected negative gc_cpu_time_elapsed_ns between samples"); - // Wall-time can actually go backwards (e.g. when the system clock gets set) so we can't assume time going backwards - // was a bug. - // @ivoanjo: I've also observed time going backwards spuriously on macOS, see discussion on - // https://github.com/DataDog/dd-trace-rb/pull/2336. 
- if (gc_wall_time_elapsed_ns < 0) gc_wall_time_elapsed_ns = 0; - - if (thread_context->gc_tracking.wall_time_at_start_ns == 0 && thread_context->gc_tracking.wall_time_at_finish_ns != 0) { - // Avoid using wall-clock if we got 0 for a start (meaning there was an error) but not 0 for finish so we don't - // come up with a crazy value for the frame - rb_raise(rb_eRuntimeError, "BUG: Unexpected zero value for gc_tracking.wall_time_at_start_ns"); - } + int max_labels_needed_for_gc = 7; // Magic number gets validated inside gc_profiling_set_metadata + ddog_prof_Label labels[max_labels_needed_for_gc]; + uint8_t label_pos = gc_profiling_set_metadata(labels, max_labels_needed_for_gc); - trigger_sample_for_thread( - state, - /* thread: */ thread, - /* stack_from_thread: */ thread, - thread_context, - (sample_values) {.cpu_time_ns = gc_cpu_time_elapsed_ns, .cpu_or_wall_samples = 1, .wall_time_ns = gc_wall_time_elapsed_ns}, - SAMPLE_IN_GC, - INVALID_TIME, // For now we're not collecting timestamps for these events - NULL, - NULL - ); + ddog_prof_Slice_Label slice_labels = {.ptr = labels, .len = label_pos}; - // Mark thread as no longer in GC - thread_context->gc_tracking.cpu_time_at_start_ns = INVALID_TIME; - thread_context->gc_tracking.cpu_time_at_finish_ns = INVALID_TIME; - thread_context->gc_tracking.wall_time_at_start_ns = INVALID_TIME; - thread_context->gc_tracking.wall_time_at_finish_ns = INVALID_TIME; + // The end_timestamp_ns is treated specially by libdatadog and that's why it's not added as a ddog_prof_Label + int64_t end_timestamp_ns = 0; - // Update counters so that they won't include the time in GC during the next sample - if (thread_context->cpu_time_at_previous_sample_ns != INVALID_TIME) { - thread_context->cpu_time_at_previous_sample_ns += gc_cpu_time_elapsed_ns; - } - if (thread_context->wall_time_at_previous_sample_ns != INVALID_TIME) { - thread_context->wall_time_at_previous_sample_ns += gc_wall_time_elapsed_ns; - } + if (state->timeline_enabled) { + end_timestamp_ns = monotonic_to_system_epoch_ns(&state->time_converter_state, state->gc_tracking.wall_time_at_previous_gc_ns); } - if (sampled_any_thread) state->stats.gc_samples++; + record_placeholder_stack( + state->sampling_buffer, + state->recorder_instance, + (sample_values) { + // This event gets both a regular cpu/wall-time duration, as a normal cpu/wall-time sample would, as well as a + // timeline duration. 
+ // This is done to enable two use-cases: + // * regular cpu/wall-time makes this event show up as a regular stack in the flamegraph + // * the timeline duration is used when the event shows up in the timeline + .cpu_time_ns = state->gc_tracking.accumulated_cpu_time_ns, + .cpu_or_wall_samples = 1, + .wall_time_ns = state->gc_tracking.accumulated_wall_time_ns, + .timeline_wall_time_ns = state->gc_tracking.accumulated_wall_time_ns, + }, + (sample_labels) {.labels = slice_labels, .state_label = NULL, .end_timestamp_ns = end_timestamp_ns}, + DDOG_CHARSLICE_C("Garbage Collection") + ); + + state->gc_tracking.wall_time_at_last_flushed_gc_event_ns = state->gc_tracking.wall_time_at_previous_gc_ns; + state->gc_tracking.wall_time_at_previous_gc_ns = INVALID_TIME; + + state->stats.gc_samples++; // Return a VALUE to make it easier to call this function from Ruby APIs that expect a return value (such as rb_rescue2) return Qnil; @@ -653,7 +672,6 @@ static void trigger_sample_for_thread( VALUE stack_from_thread, // This can be different when attributing profiler overhead using a different stack struct per_thread_context *thread_context, sample_values values, - sample_type type, long current_monotonic_wall_time_ns, // These two labels are only used for allocation profiling; @ivoanjo: may want to refactor this at some point? ddog_CharSlice *ruby_vm_type, @@ -776,8 +794,7 @@ static void trigger_sample_for_thread( state->sampling_buffer, state->recorder_instance, values, - (sample_labels) {.labels = slice_labels, .state_label = state_label, .end_timestamp_ns = end_timestamp_ns}, - type + (sample_labels) {.labels = slice_labels, .state_label = state_label, .end_timestamp_ns = end_timestamp_ns} ); } @@ -874,9 +891,7 @@ static void initialize_context(VALUE thread, struct per_thread_context *thread_c // These will only be used during a GC operation thread_context->gc_tracking.cpu_time_at_start_ns = INVALID_TIME; - thread_context->gc_tracking.cpu_time_at_finish_ns = INVALID_TIME; thread_context->gc_tracking.wall_time_at_start_ns = INVALID_TIME; - thread_context->gc_tracking.wall_time_at_finish_ns = INVALID_TIME; } static VALUE _native_inspect(DDTRACE_UNUSED VALUE _self, VALUE collector_instance) { @@ -901,6 +916,7 @@ static VALUE _native_inspect(DDTRACE_UNUSED VALUE _self, VALUE collector_instanc state->time_converter_state.delta_to_epoch_ns )); rb_str_concat(result, rb_sprintf(" main_thread=%"PRIsVALUE, state->main_thread)); + rb_str_concat(result, rb_sprintf(" gc_tracking=%"PRIsVALUE, gc_tracking_as_ruby_hash(state))); return result; } @@ -927,9 +943,7 @@ static int per_thread_context_as_ruby_hash(st_data_t key_thread, st_data_t value ID2SYM(rb_intern("wall_time_at_previous_sample_ns")), /* => */ LONG2NUM(thread_context->wall_time_at_previous_sample_ns), ID2SYM(rb_intern("gc_tracking.cpu_time_at_start_ns")), /* => */ LONG2NUM(thread_context->gc_tracking.cpu_time_at_start_ns), - ID2SYM(rb_intern("gc_tracking.cpu_time_at_finish_ns")), /* => */ LONG2NUM(thread_context->gc_tracking.cpu_time_at_finish_ns), ID2SYM(rb_intern("gc_tracking.wall_time_at_start_ns")), /* => */ LONG2NUM(thread_context->gc_tracking.wall_time_at_start_ns), - ID2SYM(rb_intern("gc_tracking.wall_time_at_finish_ns")), /* => */ LONG2NUM(thread_context->gc_tracking.wall_time_at_finish_ns) }; for (long unsigned int i = 0; i < VALUE_COUNT(arguments); i += 2) rb_hash_aset(context_as_hash, arguments[i], arguments[i+1]); @@ -947,6 +961,19 @@ static VALUE stats_as_ruby_hash(struct thread_context_collector_state *state) { return stats_as_hash; } +static 
VALUE gc_tracking_as_ruby_hash(struct thread_context_collector_state *state) { + // Update this when modifying state struct (gc_tracking inner struct) + VALUE result = rb_hash_new(); + VALUE arguments[] = { + ID2SYM(rb_intern("accumulated_cpu_time_ns")), /* => */ ULONG2NUM(state->gc_tracking.accumulated_cpu_time_ns), + ID2SYM(rb_intern("accumulated_wall_time_ns")), /* => */ ULONG2NUM(state->gc_tracking.accumulated_wall_time_ns), + ID2SYM(rb_intern("wall_time_at_previous_gc_ns")), /* => */ LONG2NUM(state->gc_tracking.wall_time_at_previous_gc_ns), + ID2SYM(rb_intern("wall_time_at_last_flushed_gc_event_ns")), /* => */ LONG2NUM(state->gc_tracking.wall_time_at_last_flushed_gc_event_ns), + }; + for (long unsigned int i = 0; i < VALUE_COUNT(arguments); i += 2) rb_hash_aset(result, arguments[i], arguments[i+1]); + return result; +} + static void remove_context_for_dead_threads(struct thread_context_collector_state *state) { st_foreach(state->hash_map_per_thread_context, remove_if_dead_thread, 0 /* unused */); } @@ -1049,8 +1076,6 @@ VALUE enforce_thread_context_collector_instance(VALUE object) { // This method exists only to enable testing Datadog::Profiling::Collectors::ThreadContext behavior using RSpec. // It SHOULD NOT be used for other purposes. -// -// Returns the whole contents of the per_thread_context structs being tracked. static VALUE _native_stats(DDTRACE_UNUSED VALUE _self, VALUE collector_instance) { struct thread_context_collector_state *state; TypedData_Get_Struct(collector_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state); @@ -1058,6 +1083,15 @@ static VALUE _native_stats(DDTRACE_UNUSED VALUE _self, VALUE collector_instance) return stats_as_ruby_hash(state); } +// This method exists only to enable testing Datadog::Profiling::Collectors::ThreadContext behavior using RSpec. +// It SHOULD NOT be used for other purposes. +static VALUE _native_gc_tracking(DDTRACE_UNUSED VALUE _self, VALUE collector_instance) { + struct thread_context_collector_state *state; + TypedData_Get_Struct(collector_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state); + + return gc_tracking_as_ruby_hash(state); +} + // Assumption 1: This function is called in a thread that is holding the Global VM Lock. Caller is responsible for enforcing this. 
static void trace_identifiers_for(struct thread_context_collector_state *state, VALUE thread, struct trace_identifiers *trace_identifiers_result) { if (state->tracer_context_key == MISSING_TRACER_CONTEXT_KEY) return; @@ -1218,7 +1252,6 @@ void thread_context_collector_sample_allocation(VALUE self_instance, unsigned in /* stack_from_thread: */ current_thread, get_or_create_context_for(current_thread, state), (sample_values) {.alloc_samples = sample_weight}, - SAMPLE_REGULAR, INVALID_TIME, // For now we're not collecting timestamps for allocation events, as per profiling team internal discussions &ruby_vm_type, optional_class_name @@ -1241,7 +1274,7 @@ static VALUE _native_new_empty_thread(DDTRACE_UNUSED VALUE self) { return rb_thread_create(new_empty_thread_inner, NULL); } -ddog_CharSlice ruby_value_type_to_class_name(enum ruby_value_type type) { +static ddog_CharSlice ruby_value_type_to_class_name(enum ruby_value_type type) { switch (type) { case(RUBY_T_OBJECT ): return DDOG_CHARSLICE_C("Object"); case(RUBY_T_CLASS ): return DDOG_CHARSLICE_C("Class"); diff --git a/ext/ddtrace_profiling_native_extension/collectors_thread_context.h b/ext/ddtrace_profiling_native_extension/collectors_thread_context.h index 88dfabe9615..6299d96b43e 100644 --- a/ext/ddtrace_profiling_native_extension/collectors_thread_context.h +++ b/ext/ddtrace_profiling_native_extension/collectors_thread_context.h @@ -1,6 +1,7 @@ #pragma once #include <ruby.h> +#include <stdbool.h> void thread_context_collector_sample( VALUE self_instance, @@ -10,5 +11,5 @@ void thread_context_collector_sample_allocation(VALUE self_instance, unsigned int sample_weight, VALUE new_object); VALUE thread_context_collector_sample_after_gc(VALUE self_instance); void thread_context_collector_on_gc_start(VALUE self_instance); -void thread_context_collector_on_gc_finish(VALUE self_instance); +bool thread_context_collector_on_gc_finish(VALUE self_instance); VALUE enforce_thread_context_collector_instance(VALUE object); diff --git a/ext/ddtrace_profiling_native_extension/helpers.h b/ext/ddtrace_profiling_native_extension/helpers.h index f06c96710ec..d71a513de9f 100644 --- a/ext/ddtrace_profiling_native_extension/helpers.h +++ b/ext/ddtrace_profiling_native_extension/helpers.h @@ -15,3 +15,5 @@ // don't like C and I just implemented this as a function. inline static uint64_t uint64_max_of(uint64_t a, uint64_t b) { return a > b ? a : b; } inline static uint64_t uint64_min_of(uint64_t a, uint64_t b) { return a > b ? b : a; } +inline static long long_max_of(long a, long b) { return a > b ? a : b; } +inline static long long_min_of(long a, long b) { return a > b ? 
b : a; } diff --git a/ext/ddtrace_profiling_native_extension/stack_recorder.c b/ext/ddtrace_profiling_native_extension/stack_recorder.c index 4918f8caea8..55ac0b25a1a 100644 --- a/ext/ddtrace_profiling_native_extension/stack_recorder.c +++ b/ext/ddtrace_profiling_native_extension/stack_recorder.c @@ -151,15 +151,19 @@ static VALUE error_symbol = Qnil; // :error in Ruby #define WALL_TIME_VALUE_ID 2 #define ALLOC_SAMPLES_VALUE {.type_ = VALUE_STRING("alloc-samples"), .unit = VALUE_STRING("count")} #define ALLOC_SAMPLES_VALUE_ID 3 -#define HEAP_SAMPLES_VALUE {.type_ = VALUE_STRING("heap-live-samples"), .unit = VALUE_STRING("count")} +#define HEAP_SAMPLES_VALUE {.type_ = VALUE_STRING("heap-live-samples"), .unit = VALUE_STRING("count")} #define HEAP_SAMPLES_VALUE_ID 4 +#define TIMELINE_VALUE {.type_ = VALUE_STRING("timeline"), .unit = VALUE_STRING("nanoseconds")} +#define TIMELINE_VALUE_ID 5 -static const ddog_prof_ValueType all_value_types[] = {CPU_TIME_VALUE, CPU_SAMPLES_VALUE, WALL_TIME_VALUE, ALLOC_SAMPLES_VALUE, HEAP_SAMPLES_VALUE}; +static const ddog_prof_ValueType all_value_types[] = + {CPU_TIME_VALUE, CPU_SAMPLES_VALUE, WALL_TIME_VALUE, ALLOC_SAMPLES_VALUE, HEAP_SAMPLES_VALUE, TIMELINE_VALUE}; // This array MUST be kept in sync with all_value_types above and is intended to act as a "hashmap" between VALUE_ID and the position it // occupies on the all_value_types array. // E.g. all_value_types_positions[CPU_TIME_VALUE_ID] => 0, means that CPU_TIME_VALUE was declared at position 0 of all_value_types. -static const uint8_t all_value_types_positions[] = {CPU_TIME_VALUE_ID, CPU_SAMPLES_VALUE_ID, WALL_TIME_VALUE_ID, ALLOC_SAMPLES_VALUE_ID, HEAP_SAMPLES_VALUE_ID}; +static const uint8_t all_value_types_positions[] = + {CPU_TIME_VALUE_ID, CPU_SAMPLES_VALUE_ID, WALL_TIME_VALUE_ID, ALLOC_SAMPLES_VALUE_ID, HEAP_SAMPLES_VALUE_ID, TIMELINE_VALUE_ID}; #define ALL_VALUE_TYPES_COUNT (sizeof(all_value_types) / sizeof(ddog_prof_ValueType)) @@ -203,7 +207,14 @@ static VALUE _native_new(VALUE klass); static void initialize_slot_concurrency_control(struct stack_recorder_state *state); static void initialize_profiles(struct stack_recorder_state *state, ddog_prof_Slice_ValueType sample_types); static void stack_recorder_typed_data_free(void *data); -static VALUE _native_initialize(DDTRACE_UNUSED VALUE _self, VALUE recorder_instance, VALUE cpu_time_enabled, VALUE alloc_samples_enabled, VALUE heap_samples_enabled); +static VALUE _native_initialize( + DDTRACE_UNUSED VALUE _self, + VALUE recorder_instance, + VALUE cpu_time_enabled, + VALUE alloc_samples_enabled, + VALUE heap_samples_enabled, + VALUE timeline_enabled +); static VALUE _native_serialize(VALUE self, VALUE recorder_instance); static VALUE ruby_time_from(ddog_Timespec ddprof_time); static void *call_serialize_without_gvl(void *call_args); @@ -237,7 +248,7 @@ void stack_recorder_init(VALUE profiling_module) { // https://bugs.ruby-lang.org/issues/18007 for a discussion around this. 
rb_define_alloc_func(stack_recorder_class, _native_new); - rb_define_singleton_method(stack_recorder_class, "_native_initialize", _native_initialize, 4); + rb_define_singleton_method(stack_recorder_class, "_native_initialize", _native_initialize, 5); rb_define_singleton_method(stack_recorder_class, "_native_serialize", _native_serialize, 1); rb_define_singleton_method(stack_recorder_class, "_native_reset_after_fork", _native_reset_after_fork, 1); rb_define_singleton_method(testing_module, "_native_active_slot", _native_active_slot, 1); @@ -338,28 +349,39 @@ static void stack_recorder_typed_data_free(void *state_ptr) { ruby_xfree(state); } -static VALUE _native_initialize(DDTRACE_UNUSED VALUE _self, VALUE recorder_instance, VALUE cpu_time_enabled, VALUE alloc_samples_enabled, VALUE heap_samples_enabled) { +static VALUE _native_initialize( + DDTRACE_UNUSED VALUE _self, + VALUE recorder_instance, + VALUE cpu_time_enabled, + VALUE alloc_samples_enabled, + VALUE heap_samples_enabled, + VALUE timeline_enabled +) { ENFORCE_BOOLEAN(cpu_time_enabled); ENFORCE_BOOLEAN(alloc_samples_enabled); ENFORCE_BOOLEAN(heap_samples_enabled); + ENFORCE_BOOLEAN(timeline_enabled); struct stack_recorder_state *state; TypedData_Get_Struct(recorder_instance, struct stack_recorder_state, &stack_recorder_typed_data, state); - if (cpu_time_enabled == Qtrue && alloc_samples_enabled == Qtrue && heap_samples_enabled == Qtrue) return Qtrue; // Nothing to do, this is the default + uint8_t requested_values_count = ALL_VALUE_TYPES_COUNT - + (cpu_time_enabled == Qtrue ? 0 : 1) - + (alloc_samples_enabled == Qtrue? 0 : 1) - + (heap_samples_enabled == Qtrue ? 0 : 1) - + (timeline_enabled == Qtrue ? 0 : 1); + + if (requested_values_count == ALL_VALUE_TYPES_COUNT) return Qtrue; // Nothing to do, this is the default // When some sample types are disabled, we need to reconfigure libdatadog to record less types, // as well as reconfigure the position_for array to push the disabled types to the end so they don't get recorded. // See record_sample for details on the use of position_for. - state->enabled_values_count = ALL_VALUE_TYPES_COUNT - - (cpu_time_enabled == Qtrue ? 0 : 1) - - (alloc_samples_enabled == Qtrue? 0 : 1) - - (heap_samples_enabled == Qtrue ? 
0 : 1); + state->enabled_values_count = requested_values_count; ddog_prof_ValueType enabled_value_types[ALL_VALUE_TYPES_COUNT]; uint8_t next_enabled_pos = 0; - uint8_t next_disabled_pos = state->enabled_values_count; + uint8_t next_disabled_pos = requested_values_count; // CPU_SAMPLES_VALUE is always enabled enabled_value_types[next_enabled_pos] = (ddog_prof_ValueType) CPU_SAMPLES_VALUE; @@ -395,6 +417,13 @@ static VALUE _native_initialize(DDTRACE_UNUSED VALUE _self, VALUE recorder_insta state->heap_recorder = NULL; } + if (timeline_enabled == Qtrue) { + enabled_value_types[next_enabled_pos] = (ddog_prof_ValueType) TIMELINE_VALUE; + state->position_for[TIMELINE_VALUE_ID] = next_enabled_pos++; + } else { + state->position_for[TIMELINE_VALUE_ID] = next_disabled_pos++; + } + ddog_prof_Profile_drop(&state->slot_one_profile); ddog_prof_Profile_drop(&state->slot_two_profile); @@ -476,6 +505,7 @@ void record_sample(VALUE recorder_instance, ddog_prof_Slice_Location locations, metric_values[position_for[CPU_SAMPLES_VALUE_ID]] = values.cpu_or_wall_samples; metric_values[position_for[WALL_TIME_VALUE_ID]] = values.wall_time_ns; metric_values[position_for[ALLOC_SAMPLES_VALUE_ID]] = values.alloc_samples; + metric_values[position_for[TIMELINE_VALUE_ID]] = values.timeline_wall_time_ns; if (values.alloc_samples != 0) { // If we got an allocation sample end the heap allocation recording to commit the heap sample. diff --git a/ext/ddtrace_profiling_native_extension/stack_recorder.h b/ext/ddtrace_profiling_native_extension/stack_recorder.h index 9190cc89a25..4a9f2ead7b2 100644 --- a/ext/ddtrace_profiling_native_extension/stack_recorder.h +++ b/ext/ddtrace_profiling_native_extension/stack_recorder.h @@ -8,6 +8,7 @@ typedef struct { int64_t wall_time_ns; uint32_t cpu_or_wall_samples; uint32_t alloc_samples; + int64_t timeline_wall_time_ns; } sample_values; typedef struct sample_labels { diff --git a/lib/datadog/profiling/component.rb b/lib/datadog/profiling/component.rb index 228fd01f7a9..d0083a583f4 100644 --- a/lib/datadog/profiling/component.rb +++ b/lib/datadog/profiling/component.rb @@ -48,7 +48,11 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:) overhead_target_percentage = valid_overhead_target(settings.profiling.advanced.overhead_target_percentage) upload_period_seconds = [60, settings.profiling.advanced.upload_period_seconds].max - recorder = build_recorder(allocation_profiling_enabled, heap_profiling_enabled) + recorder = build_recorder( + allocation_profiling_enabled: allocation_profiling_enabled, + heap_profiling_enabled: heap_profiling_enabled, + timeline_enabled: timeline_enabled, + ) thread_context_collector = build_thread_context_collector(settings, recorder, optional_tracer, timeline_enabled) worker = Datadog::Profiling::Collectors::CpuAndWallTimeWorker.new( gc_profiling_enabled: enable_gc_profiling?(settings), @@ -72,11 +76,16 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:) Profiling::Profiler.new(worker: worker, scheduler: scheduler) end - private_class_method def self.build_recorder(allocation_profiling_enabled, heap_profiling_enabled) + private_class_method def self.build_recorder( + allocation_profiling_enabled:, + heap_profiling_enabled:, + timeline_enabled: + ) Datadog::Profiling::StackRecorder.new( cpu_time_enabled: RUBY_PLATFORM.include?('linux'), # Only supported on Linux currently alloc_samples_enabled: allocation_profiling_enabled, heap_samples_enabled: heap_profiling_enabled, + timeline_enabled: timeline_enabled, 
) end diff --git a/lib/datadog/profiling/stack_recorder.rb b/lib/datadog/profiling/stack_recorder.rb index 3153a4982c5..6337d3ec588 100644 --- a/lib/datadog/profiling/stack_recorder.rb +++ b/lib/datadog/profiling/stack_recorder.rb @@ -4,7 +4,7 @@ module Profiling # Note that `record_sample` is only accessible from native code. # Methods prefixed with _native_ are implemented in `stack_recorder.c` class StackRecorder - def initialize(cpu_time_enabled:, alloc_samples_enabled:, heap_samples_enabled:) + def initialize(cpu_time_enabled:, alloc_samples_enabled:, heap_samples_enabled:, timeline_enabled:) # This mutex works in addition to the fancy C-level mutexes we have in the native side (see the docs there). # It prevents multiple Ruby threads calling serialize at the same time -- something like # `10.times { Thread.new { stack_recorder.serialize } }`. @@ -13,7 +13,13 @@ def initialize(cpu_time_enabled:, alloc_samples_enabled:, heap_samples_enabled:) # accidentally happening. @no_concurrent_synchronize_mutex = Mutex.new - self.class._native_initialize(self, cpu_time_enabled, alloc_samples_enabled, heap_samples_enabled) + self.class._native_initialize( + self, + cpu_time_enabled, + alloc_samples_enabled, + heap_samples_enabled, + timeline_enabled, + ) end def serialize diff --git a/sig/datadog/profiling/component.rbs b/sig/datadog/profiling/component.rbs index 2b32dbe88fe..5de8b61eb6c 100644 --- a/sig/datadog/profiling/component.rbs +++ b/sig/datadog/profiling/component.rbs @@ -8,8 +8,9 @@ module Datadog ) -> Datadog::Profiling::Profiler? def self.build_recorder: ( - bool allocation_profiling_enabled, - bool heap_profiling_enabled, + allocation_profiling_enabled: bool, + heap_profiling_enabled: bool, + timeline_enabled: bool, ) -> Datadog::Profiling::StackRecorder def self.build_thread_context_collector: ( diff --git a/sig/datadog/profiling/stack_recorder.rbs b/sig/datadog/profiling/stack_recorder.rbs index 0f61f9c8ab9..f0681354e95 100644 --- a/sig/datadog/profiling/stack_recorder.rbs +++ b/sig/datadog/profiling/stack_recorder.rbs @@ -3,13 +3,14 @@ module Datadog class StackRecorder @no_concurrent_synchronize_mutex: ::Thread::Mutex - def initialize: (cpu_time_enabled: bool, alloc_samples_enabled: bool, heap_samples_enabled: bool) -> void + def initialize: (cpu_time_enabled: bool, alloc_samples_enabled: bool, heap_samples_enabled: bool, timeline_enabled: bool) -> void def self._native_initialize: ( Datadog::Profiling::StackRecorder recorder_instance, bool cpu_time_enabled, bool alloc_samples_enabled, bool heap_samples_enabled, + bool timeline_enabled, ) -> true def serialize: () -> untyped diff --git a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb index 21888900e7c..11a2bdee3fb 100644 --- a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb +++ b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb @@ -256,28 +256,28 @@ described_class::Testing._native_trigger_sample - invoke_gc_times = 5 - - invoke_gc_times.times do + 5.times do Thread.pass GC.start + Thread.pass end cpu_and_wall_time_worker.stop all_samples = samples_from_pprof(recorder.serialize!) - current_thread_gc_samples = - samples_for_thread(all_samples, Thread.current) - .select { |it| it.locations.first.path == 'Garbage Collection' } - - # NOTE: In some cases, Ruby may actually call two GC's back-to-back without us having the possibility to take - # a sample. 
I don't expect this to happen for this test (that's what the `Thread.pass` above is trying to avoid) - # but if this spec turns out to be flaky, that is probably the issue, and that would mean we'd need to relax the - # check. - expect( - current_thread_gc_samples.inject(0) { |sum, sample| sum + sample.values.fetch(:'cpu-samples') } - ).to be >= invoke_gc_times + gc_sample = all_samples.find { |sample| sample.labels[:'gc cause'] == 'GC.start()' } + + expect(gc_sample.labels).to match a_hash_including( + :state => 'had cpu', + :'thread id' => 'GC', + :'thread name' => 'Garbage Collection', + :event => 'gc', + :'gc reason' => an_instance_of(String), + :'gc cause' => 'GC.start()', + :'gc type' => 'major', + ) + expect(gc_sample.locations.first.path).to eq 'Garbage Collection' end context 'when the background thread dies without cleaning up (after Ruby forks)' do diff --git a/spec/datadog/profiling/collectors/stack_spec.rb b/spec/datadog/profiling/collectors/stack_spec.rb index aee41a3f3f5..38c851cbe0d 100644 --- a/spec/datadog/profiling/collectors/stack_spec.rb +++ b/spec/datadog/profiling/collectors/stack_spec.rb @@ -60,14 +60,8 @@ def sample(thread, recorder_instance, metric_values_hash, labels_array, max_fram context 'when marking sample as being in garbage collection' do let(:in_gc) { true } - it 'includes a placeholder frame for garbage collection' do - expect(stacks.fetch(:gathered)[0]).to have_attributes(base_label: '', path: 'Garbage Collection', lineno: 0) - end - - it 'matches the Ruby backtrace API' do - # We skip 4 frames here -- the garbage collection placeholder, as well as the 3 top stacks that differ from the - # reference stack (see the `let(:gathered_stack)` above for details) - expect(stacks.fetch(:gathered)[4..-1]).to eq reference_stack + it 'gathers a one-element stack with a "Garbage Collection" placeholder' do + expect(stacks.fetch(:gathered)).to contain_exactly(have_attributes(base_label: '', path: 'Garbage Collection', lineno: 0)) end end end @@ -491,38 +485,6 @@ def call_sleep expect(gathered_stack).to eq reference_stack end end - - context 'when marking sample as being in garbage collection' do - let(:in_gc) { true } - - it 'gathers exactly max_frames frames' do - expect(gathered_stack.size).to be max_frames - end - - it 'matches the Ruby backtrace API, up to max_frames - 2' do - garbage_collection = 1 - expect(gathered_stack[(0 + garbage_collection)...(max_frames - 1)]).to eq reference_stack[0...(max_frames - 1 - garbage_collection)] - end - - it 'includes two placeholder frames: one for garbage collection and another for including the number of skipped frames' do - garbage_collection = 1 - placeholder = 1 - omitted_frames = target_stack_depth - max_frames + placeholder + garbage_collection - - expect(omitted_frames).to be 97 - expect(gathered_stack.last).to have_attributes(base_label: '', path: '97 frames omitted', lineno: 0) - expect(gathered_stack.first).to have_attributes(base_label: '', path: 'Garbage Collection', lineno: 0) - end - - context 'when stack is exactly one item less as deep as the configured max_frames' do - let(:target_stack_depth) { 4 } - - it 'includes a placeholder frame for garbage collection and matches the Ruby backtrace API' do - garbage_collection = 1 - expect(gathered_stack[(0 + garbage_collection)..-1]).to eq reference_stack - end - end - end end context 'when sampling a dead thread' do @@ -600,11 +562,8 @@ def call_sleep context 'when marking sample as being in garbage collection' do let(:in_gc) { true } - it 'gathers a two-element 
stack with a placeholder for "In native code" and another for garbage collection' do - expect(gathered_stack).to contain_exactly( - have_attributes(base_label: '', path: 'Garbage Collection', lineno: 0), - have_attributes(base_label: '', path: 'In native code', lineno: 0), - ) + it 'gathers a one-element stack with a "Garbage Collection" placeholder' do + expect(stacks.fetch(:gathered)).to contain_exactly(have_attributes(base_label: '', path: 'Garbage Collection', lineno: 0)) end end end diff --git a/spec/datadog/profiling/collectors/thread_context_spec.rb b/spec/datadog/profiling/collectors/thread_context_spec.rb index 88e32c6a089..6ebfd62cea1 100644 --- a/spec/datadog/profiling/collectors/thread_context_spec.rb +++ b/spec/datadog/profiling/collectors/thread_context_spec.rb @@ -9,7 +9,7 @@ expect(Thread.list).to include(Thread.main, t1, t2, t3) end - let(:recorder) { build_stack_recorder } + let(:recorder) { build_stack_recorder(timeline_enabled: timeline_enabled) } let(:ready_queue) { Queue.new } let(:t1) do Thread.new(ready_queue) do |ready_queue| @@ -91,6 +91,10 @@ def stats described_class::Testing._native_stats(cpu_and_wall_time_collector) end + def gc_tracking + described_class::Testing._native_gc_tracking(cpu_and_wall_time_collector) + end + # This method exists only so we can look for its name in the stack trace in a few tests def inside_t1 yield @@ -183,50 +187,25 @@ def another_way_of_calling_sample(profiler_overhead_stack_thread: Thread.current expect(t1_samples.map(&:values).map { |it| it.fetch(:'cpu-samples') }.reduce(:+)).to eq 5 end - [:before, :after].each do |on_gc_finish_order| - context "when a thread is marked as being in garbage collection, #{on_gc_finish_order} on_gc_finish" do - # Until sample_after_gc gets called, the state left over by both on_gc_start and on_gc_finish "blocks" time - # from being assigned to further samples. Note this is expected to be very rare in practice, otherwise we would - # probably want to look into skipping these samples entirely. - it 'records the wall-time between a previous sample and the start of garbage collection, and no further time' do - sample - wall_time_at_first_sample = per_thread_context.fetch(Thread.current).fetch(:wall_time_at_previous_sample_ns) - - on_gc_start - on_gc_finish if on_gc_finish_order == :after - - wall_time_at_gc_start = per_thread_context.fetch(Thread.current).fetch(:'gc_tracking.wall_time_at_start_ns') - - # Even though we keep calling sample, the result only includes the time until we called on_gc_start - 5.times { another_way_of_calling_sample } - - total_wall_for_rspec_thread = - samples_for_thread(samples, Thread.current) - .select { |it| it.locations.find { |frame| frame.base_label == 'another_way_of_calling_sample' } } - .map { |it| it.values.fetch(:'wall-time') } - .reduce(:+) - - expect(total_wall_for_rspec_thread).to be(wall_time_at_gc_start - wall_time_at_first_sample) - end + context 'when a thread is marked as being in garbage collection by on_gc_start' do + # @ivoanjo: This spec exists because for cpu-time the behavior is different (we stop recording + cpu-time once GC starts), and I wanted to validate that this different behavior does not get applied to wall-time. + it 'keeps recording the wall-time after every sample' do + sample + wall_time_at_first_sample = per_thread_context.fetch(Thread.current).fetch(:wall_time_at_previous_sample_ns) - # The whole point of wall_time_at_previous_sample_ns is to track the past point in time that we use as start of
- # BUT, we can't let it advance during GC as it should only get accounted for when GC finishes. - it 'does not advance wall_time_at_previous_sample_ns for the thread beyond gc_tracking.wall_time_at_start_ns' do - sample + on_gc_start - on_gc_start - on_gc_finish if on_gc_finish_order == :after + 5.times { sample } - wall_time_at_gc_start = per_thread_context.fetch(Thread.current).fetch(:'gc_tracking.wall_time_at_start_ns') + time_after = Datadog::Core::Utils::Time.get_time(:nanosecond) - 5.times { sample } + sample - wall_time_at_previous_sample_ns = - per_thread_context.fetch(Thread.current).fetch(:wall_time_at_previous_sample_ns) + wall_time_at_last_sample = per_thread_context.fetch(Thread.current).fetch(:wall_time_at_previous_sample_ns) - expect(wall_time_at_previous_sample_ns).to be wall_time_at_gc_start - end + expect(wall_time_at_last_sample).to be >= wall_time_at_first_sample + expect(wall_time_at_last_sample).to be >= time_after end end @@ -267,47 +246,61 @@ def another_way_of_calling_sample(profiler_overhead_stack_thread: Thread.current expect(total_cpu_for_rspec_thread).to be_between(1, rspec_thread_spent_time) end - [:before, :after].each do |on_gc_finish_order| - context "when a thread is marked as being in garbage collection, #{on_gc_finish_order} on_gc_finish" do - it 'records the cpu-time between a previous sample and the start of garbage collection, and no further time' do - sample - cpu_time_at_first_sample = per_thread_context.fetch(Thread.current).fetch(:cpu_time_at_previous_sample_ns) + context 'when a thread is marked as being in garbage collection by on_gc_start' do + it 'records the cpu-time between a previous sample and the start of garbage collection, and no further time' do + sample + cpu_time_at_first_sample = per_thread_context.fetch(Thread.current).fetch(:cpu_time_at_previous_sample_ns) - on_gc_start - on_gc_finish if on_gc_finish_order == :after + on_gc_start - cpu_time_at_gc_start = per_thread_context.fetch(Thread.current).fetch(:'gc_tracking.cpu_time_at_start_ns') + cpu_time_at_gc_start = per_thread_context.fetch(Thread.current).fetch(:'gc_tracking.cpu_time_at_start_ns') - # Even though we keep calling sample, the result only includes the time until we called on_gc_start - 5.times { another_way_of_calling_sample } + # Even though we keep calling sample, the result only includes the time until we called on_gc_start + 5.times { another_way_of_calling_sample } - total_cpu_for_rspec_thread = - samples_for_thread(samples, Thread.current) - .select { |it| it.locations.find { |frame| frame.base_label == 'another_way_of_calling_sample' } } - .map { |it| it.values.fetch(:'cpu-time') } - .reduce(:+) + total_cpu_for_rspec_thread = + samples_for_thread(samples, Thread.current) + .select { |it| it.locations.find { |frame| frame.base_label == 'another_way_of_calling_sample' } } + .map { |it| it.values.fetch(:'cpu-time') } + .reduce(:+) - expect(total_cpu_for_rspec_thread).to be(cpu_time_at_gc_start - cpu_time_at_first_sample) - end + expect(total_cpu_for_rspec_thread).to be(cpu_time_at_gc_start - cpu_time_at_first_sample) + end - # The whole point of cpu_time_at_previous_sample_ns is to track the past point in time that we use as start of - # the time range for a sample. - # BUT, we can't let it advance during GC as it should only get accounted for when GC finishes. 
- it 'does not advance cpu_time_at_previous_sample_ns for the thread beyond gc_tracking.cpu_time_at_start_ns' do - sample + # When a thread is marked as being in GC, the cpu_time_at_previous_sample_ns is not allowed to advance until + the GC finishes. + it 'does not advance cpu_time_at_previous_sample_ns for the thread beyond gc_tracking.cpu_time_at_start_ns' do + sample + + on_gc_start - on_gc_start - on_gc_finish if on_gc_finish_order == :after + cpu_time_at_gc_start = per_thread_context.fetch(Thread.current).fetch(:'gc_tracking.cpu_time_at_start_ns') - cpu_time_at_gc_start = per_thread_context.fetch(Thread.current).fetch(:'gc_tracking.cpu_time_at_start_ns') + 5.times { sample } - 5.times { sample } + cpu_time_at_previous_sample_ns = + per_thread_context.fetch(Thread.current).fetch(:cpu_time_at_previous_sample_ns) - cpu_time_at_previous_sample_ns = - per_thread_context.fetch(Thread.current).fetch(:cpu_time_at_previous_sample_ns) + expect(cpu_time_at_previous_sample_ns).to be cpu_time_at_gc_start + end + end - expect(cpu_time_at_previous_sample_ns).to be cpu_time_at_gc_start - end + context 'when a thread is unmarked as being in garbage collection by on_gc_finish' do + it 'lets cpu_time_at_previous_sample_ns advance again' do + sample + + on_gc_start + + cpu_time_at_gc_start = per_thread_context.fetch(Thread.current).fetch(:'gc_tracking.cpu_time_at_start_ns') + + on_gc_finish + + 5.times { sample } + + cpu_time_at_previous_sample_ns = + per_thread_context.fetch(Thread.current).fetch(:cpu_time_at_previous_sample_ns) + + expect(cpu_time_at_previous_sample_ns).to be > cpu_time_at_gc_start end end end @@ -653,26 +646,11 @@ def another_way_of_calling_sample(profiler_overhead_stack_thread: Thread.current it "records the cpu-time when garbage collection started in the caller thread's context" do on_gc_start - expect(per_thread_context.fetch(Thread.current)).to include(:'gc_tracking.cpu_time_at_start_ns' => be > 0) - end - end - end - - context 'when called again after on_gc_finish but before sample_after_gc' do - before do - on_gc_start - on_gc_finish - end + cpu_time_at_previous_sample_ns = per_thread_context.fetch(Thread.current).fetch(:cpu_time_at_previous_sample_ns) - it 'does not change the gc start times' do - start_times = proc do - cpu_time = per_thread_context.fetch(Thread.current).fetch(:'gc_tracking.cpu_time_at_start_ns') - wall_time = per_thread_context.fetch(Thread.current).fetch(:'gc_tracking.wall_time_at_start_ns') - - [cpu_time, wall_time] + expect(per_thread_context.fetch(Thread.current)) + .to include(:'gc_tracking.cpu_time_at_start_ns' => (be > cpu_time_at_previous_sample_ns)) end - - expect { on_gc_start }.to_not change(&start_times) end end end @@ -693,38 +671,54 @@ def another_way_of_calling_sample(profiler_overhead_stack_thread: Thread.current context 'when on_gc_start was not called before' do # See comment in the actual implementation on when/why this can happen - it 'does not change the gc finish times' do + it 'does not change the wall_time_at_previous_gc_ns' do on_gc_finish - expect(per_thread_context.fetch(Thread.current).fetch(:'gc_tracking.cpu_time_at_finish_ns')).to be invalid_time - expect(per_thread_context.fetch(Thread.current).fetch(:'gc_tracking.wall_time_at_finish_ns')).to be invalid_time + expect(gc_tracking.fetch(:wall_time_at_previous_gc_ns)).to be invalid_time end end context 'when on_gc_start was previously called' do before { on_gc_start } - it "records the wall-time when garbage collection finished in the caller thread's context" do + it 'records the
wall-time when garbage collection finished in the gc_tracking' do wall_time_before_on_gc_finish_ns = Datadog::Core::Utils::Time.get_time(:nanosecond) on_gc_finish wall_time_after_on_gc_finish_ns = Datadog::Core::Utils::Time.get_time(:nanosecond) + expect(gc_tracking.fetch(:wall_time_at_previous_gc_ns)) + .to be_between(wall_time_before_on_gc_finish_ns, wall_time_after_on_gc_finish_ns) + end + + it 'resets the gc tracking fields back to invalid_time' do + on_gc_finish + expect(per_thread_context.fetch(Thread.current)).to include( - :'gc_tracking.wall_time_at_finish_ns' => - be_between(wall_time_before_on_gc_finish_ns, wall_time_after_on_gc_finish_ns) + :'gc_tracking.cpu_time_at_start_ns' => invalid_time, + :'gc_tracking.wall_time_at_start_ns' => invalid_time, ) end + it 'records the wall-time spent between calls to on_gc_start and on_gc_finish' do + wall_time_at_start_ns = per_thread_context.fetch(Thread.current).fetch(:'gc_tracking.wall_time_at_start_ns') + + wall_time_before_on_gc_finish_ns = Datadog::Core::Utils::Time.get_time(:nanosecond) + on_gc_finish + + expect(gc_tracking.fetch(:accumulated_wall_time_ns)) + .to be >= (wall_time_before_on_gc_finish_ns - wall_time_at_start_ns) + end + context 'cpu-time behavior' do context 'when not on Linux' do before do skip 'The fallback behavior only applies when not on Linux' if PlatformHelpers.linux? end - it "records the cpu-time when garbage collection finished in the caller thread's context as zero" do + it 'records the accumulated_cpu_time_ns as zero' do on_gc_finish - expect(per_thread_context.fetch(Thread.current)).to include(:'gc_tracking.cpu_time_at_finish_ns' => 0) + expect(gc_tracking.fetch(:accumulated_cpu_time_ns)).to be 0 end end @@ -733,187 +727,152 @@ def another_way_of_calling_sample(profiler_overhead_stack_thread: Thread.current skip 'Test only runs on Linux' unless PlatformHelpers.linux?
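Piecing together the fields asserted across these specs, the gc_tracking testing helper surfaces the collector-level (not per-thread) GC state. Its rough shape is sketched below, with placeholder values for illustration only -- the real values come from the native collector:

    {
      accumulated_cpu_time_ns: 120_000,           # cpu-time summed across GC cycles since the last flush
      accumulated_wall_time_ns: 450_000,          # wall-time summed across GC cycles since the last flush
      wall_time_at_previous_gc_ns: 1_000_000_500, # set by on_gc_finish; reset to invalid_time by sample_after_gc
      wall_time_at_last_flushed_gc_event_ns: 999_999_000, # updated by sample_after_gc from the field above
    }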
end - it "records the cpu-time when garbage collection finished in the caller thread's context" do + it 'records the cpu-time spent between calls to on_gc_start and on_gc_finish' do on_gc_finish - expect(per_thread_context.fetch(Thread.current)).to include(:'gc_tracking.cpu_time_at_finish_ns' => be > 0) + expect(gc_tracking.fetch(:accumulated_cpu_time_ns)).to be > 0 + end + + it 'advances the cpu_time_at_previous_sample_ns for the sampled thread by the time spent in GC' do + cpu_time_at_previous_sample_ns_before = + per_thread_context.fetch(Thread.current).fetch(:cpu_time_at_previous_sample_ns) + + on_gc_finish + + expect(per_thread_context.fetch(Thread.current)).to include( + cpu_time_at_previous_sample_ns: be > cpu_time_at_previous_sample_ns_before + ) end end end end context 'when going through multiple cycles of on_gc_start/on_gc_finish without sample_after_gc getting called' do - it 'keeps the cpu-time and wall-time at finish from the LAST on_gc_finish' do - context_tracking = [] + let(:context_tracking) { [] } + before do 5.times do on_gc_start on_gc_finish - context_tracking << per_thread_context.fetch(Thread.current) + context_tracking << gc_tracking end - - cpu_time_from_last_on_gc_finish = context_tracking.last.fetch(:'gc_tracking.cpu_time_at_finish_ns') - wall_time_from_last_on_gc_finish = context_tracking.last.fetch(:'gc_tracking.wall_time_at_finish_ns') - - expect(context_tracking.first) - .to include(:'gc_tracking.wall_time_at_finish_ns' => be < wall_time_from_last_on_gc_finish) - - # This always advances: all_but_last <= the last one - # (Needs the <= because unfortunately we may not get enough precision, otherwise it would be <) - all_but_last = context_tracking[0..-2] - expect( - all_but_last.map { |it| it.fetch(:'gc_tracking.cpu_time_at_finish_ns') } - ).to all be <= cpu_time_from_last_on_gc_finish - expect( - all_but_last.map { |it| it.fetch(:'gc_tracking.wall_time_at_finish_ns') } - ).to all be <= wall_time_from_last_on_gc_finish end - end - end - end - describe '#sample_after_gc' do - let(:gc_samples) { samples.select { |it| it.locations.first.path == 'Garbage Collection' } } + it 'accumulates the cpu-time and wall-time from the multiple GCs' do + all_accumulated_wall_time = context_tracking.map { |it| it.fetch(:accumulated_wall_time_ns) } - before { sample } + expect(all_accumulated_wall_time).to eq all_accumulated_wall_time.sort + expect(all_accumulated_wall_time.first).to be <= all_accumulated_wall_time.last - context 'when there is no thread with gc time to be sampled' do - it 'does not sample any threads' do - sample_after_gc + all_accumulated_cpu_time = context_tracking.map { |it| it.fetch(:accumulated_cpu_time_ns) } + expect(all_accumulated_cpu_time).to eq all_accumulated_cpu_time.sort - expect(gc_samples).to be_empty - end + expect(all_accumulated_cpu_time.first).to be < all_accumulated_cpu_time.last if all_accumulated_cpu_time.first > 0 + end - it 'does not increment the gc_samples stat' do - sample_after_gc + it 'updates the wall_time_at_previous_gc_ns with the latest one' do + all_wall_time_at_previous_gc_ns = context_tracking.map { |it| it.fetch(:wall_time_at_previous_gc_ns) } - expect(stats.fetch(:gc_samples)).to be 0 + expect(all_wall_time_at_previous_gc_ns.last).to be all_wall_time_at_previous_gc_ns.max + end end end + end - context 'when there is a thread with gc start time but no finish time' do - before { on_gc_start } - - it 'does not sample any threads' do - sample_after_gc - - expect(gc_samples).to be_empty - end - - it 'does not increment the 
gc_samples stat' do - sample_after_gc + describe '#sample_after_gc' do + before { sample } - expect(stats.fetch(:gc_samples)).to be 0 + context 'when called before on_gc_start/on_gc_finish' do + it do + expect { sample_after_gc }.to raise_error(RuntimeError, /Unexpected call to sample_after_gc/) end end - context 'when there is a thread with a recorded gc start and finish time' do - let(:gc_sample) do - expect(gc_samples.size).to be 1 - gc_samples.first - end + context 'when there is gc information to record' do + let(:gc_sample) { samples.find { |it| it.labels.fetch(:'thread name') == 'Garbage Collection' } } before do on_gc_start + @time_before = Datadog::Core::Utils::Time.as_utc_epoch_ns(Time.now) on_gc_finish + @time_after = Datadog::Core::Utils::Time.as_utc_epoch_ns(Time.now) end - it 'samples the thread with recorded gc start and finish time, marking it as being in Garbage Collection' do - sample_after_gc + context 'when called more than once in a row' do + it do + sample_after_gc + + expect { sample_after_gc }.to raise_error(RuntimeError, /Unexpected call to sample_after_gc/) + end + end - expect(object_id_from(gc_sample.labels.fetch(:'thread id'))).to eq Thread.current.object_id + it 'increments the gc_samples counter' do + expect { sample_after_gc }.to change { stats.fetch(:gc_samples) }.from(0).to(1) end - it 'samples the thread with recorded gc start and finish time, recording the times between gc start and finish' do - cpu_time_at_start_ns = per_thread_context.fetch(Thread.current).fetch(:'gc_tracking.cpu_time_at_start_ns') - cpu_time_at_finish_ns = per_thread_context.fetch(Thread.current).fetch(:'gc_tracking.cpu_time_at_finish_ns') - wall_time_at_start_ns = per_thread_context.fetch(Thread.current).fetch(:'gc_tracking.wall_time_at_start_ns') - wall_time_at_finish_ns = per_thread_context.fetch(Thread.current).fetch(:'gc_tracking.wall_time_at_finish_ns') + it 'sets the wall_time_at_last_flushed_gc_event_ns from the wall_time_at_previous_gc_ns' do + wall_time_at_previous_gc_ns = gc_tracking.fetch(:wall_time_at_previous_gc_ns) sample_after_gc - expect(gc_sample.values).to include( - :"cpu-samples" => 1, - :'cpu-time' => cpu_time_at_finish_ns - cpu_time_at_start_ns, - :"wall-time" => wall_time_at_finish_ns - wall_time_at_start_ns, - ) + expect(gc_tracking.fetch(:wall_time_at_last_flushed_gc_event_ns)).to be wall_time_at_previous_gc_ns end - it 'resets the gc tracking fields back to invalid_time' do + it 'resets the wall_time_at_previous_gc_ns to invalid_time' do sample_after_gc - expect(per_thread_context.fetch(Thread.current)).to include( - :'gc_tracking.cpu_time_at_start_ns' => invalid_time, - :'gc_tracking.cpu_time_at_finish_ns' => invalid_time, - :'gc_tracking.wall_time_at_start_ns' => invalid_time, - :'gc_tracking.wall_time_at_finish_ns' => invalid_time, - ) - end - - it 'increments the gc_samples stat' do - expect { sample_after_gc }.to change { stats.fetch(:gc_samples) }.from(0).to(1) + expect(gc_tracking.fetch(:wall_time_at_previous_gc_ns)).to be invalid_time end - it 'does not sample any other threads' do + it 'creates a Garbage Collection sample' do sample_after_gc - expect(gc_samples.size).to be 1 + expect(gc_sample.values.fetch(:'cpu-samples')).to be 1 + expect(gc_sample.labels).to match a_hash_including( + :state => 'had cpu', + :'thread id' => 'GC', + :'thread name' => 'Garbage Collection', + :event => 'gc', + :'gc cause' => an_instance_of(String), + :'gc type' => an_instance_of(String), + ) + expect(gc_sample.locations.first.path).to eq 'Garbage Collection' end - it 
'advances the wall_time_at_previous_sample_ns for the sampled thread by the time spent in garbage collection' do - wall_time_at_previous_sample_ns_before = - per_thread_context.fetch(Thread.current).fetch(:wall_time_at_previous_sample_ns) + it 'creates a Garbage Collection sample using the accumulated_cpu_time_ns and accumulated_wall_time_ns' do + accumulated_cpu_time_ns = gc_tracking.fetch(:accumulated_cpu_time_ns) + accumulated_wall_time_ns = gc_tracking.fetch(:accumulated_wall_time_ns) sample_after_gc - wall_time_spent_in_gc = gc_sample.values.fetch(:'wall-time') - - expect(per_thread_context.fetch(Thread.current)).to include( - wall_time_at_previous_sample_ns: wall_time_at_previous_sample_ns_before + wall_time_spent_in_gc + expect(gc_sample.values).to match a_hash_including( + :'cpu-time' => accumulated_cpu_time_ns, + :'wall-time' => accumulated_wall_time_ns, ) end - context 'cpu-time behavior' do - context 'when not on Linux' do - before do - skip 'The fallback behavior only applies when not on Linux' if PlatformHelpers.linux? - end - - it 'keeps the cpu_time_at_previous_sample_ns as invalid_time' do - sample_after_gc - - expect(per_thread_context.fetch(Thread.current)).to include(cpu_time_at_previous_sample_ns: invalid_time) - end - end + it 'does not include the timeline timestamp' do + sample_after_gc - context 'on Linux' do - before do - skip 'Test only runs on Linux' unless PlatformHelpers.linux? - end + expect(gc_sample.labels.keys).to_not include(:end_timestamp_ns) + end - it 'advances the cpu_time_at_previous_sample_ns for the sampled thread by the time spent in garbage collection' do - cpu_time_at_previous_sample_ns_before = - per_thread_context.fetch(Thread.current).fetch(:cpu_time_at_previous_sample_ns) + context 'when timeline is enabled' do + let(:timeline_enabled) { true } - sample_after_gc + it 'creates a Garbage Collection sample using the accumulated_wall_time_ns as the timeline duration' do + accumulated_wall_time_ns = gc_tracking.fetch(:accumulated_wall_time_ns) - cpu_time_spent_in_gc = gc_sample.values.fetch(:'cpu-time') + sample_after_gc - expect(per_thread_context.fetch(Thread.current)).to include( - cpu_time_at_previous_sample_ns: cpu_time_at_previous_sample_ns_before + cpu_time_spent_in_gc - ) - end + expect(gc_sample.values.fetch(:timeline)).to be accumulated_wall_time_ns end - end - - context 'when timeline is enabled' do - let(:timeline_enabled) { true } - it 'does not include end_timestamp_ns labels in GC samples' do + it 'creates a Garbage Collection sample using the timestamp set by on_gc_finish, converted to epoch ns' do sample_after_gc - expect(gc_samples.first.labels.keys).to_not include(:end_timestamp_ns) + expect(gc_sample.labels.fetch(:end_timestamp_ns)).to be_between(@time_before, @time_after) end end end diff --git a/spec/datadog/profiling/component_spec.rb b/spec/datadog/profiling/component_spec.rb index 77ac09732b4..efd37d42efd 100644 --- a/spec/datadog/profiling/component_spec.rb +++ b/spec/datadog/profiling/component_spec.rb @@ -53,6 +53,8 @@ context 'when using the new CPU Profiling 2.0 profiler' do it 'initializes a ThreadContext collector' do allow(Datadog::Profiling::Collectors::CpuAndWallTimeWorker).to receive(:new) + dummy_stack_recorder = instance_double(Datadog::Profiling::StackRecorder, 'dummy_stack_recorder') + allow(Datadog::Profiling::StackRecorder).to receive(:new).and_return(dummy_stack_recorder) expect(settings.profiling.advanced).to receive(:max_frames).and_return(:max_frames_config) expect(settings.profiling.advanced) @@ -61,7 
+63,7 @@ .to receive(:enabled).and_return(:endpoint_collection_enabled_config) expect(Datadog::Profiling::Collectors::ThreadContext).to receive(:new).with( - recorder: instance_of(Datadog::Profiling::StackRecorder), + recorder: dummy_stack_recorder, max_frames: :max_frames_config, tracer: tracer, endpoint_collection_enabled: :endpoint_collection_enabled_config, @@ -276,6 +278,28 @@ end end + context 'when timeline is enabled' do + before { settings.profiling.advanced.experimental_timeline_enabled = true } + + it 'sets up the StackRecorder with timeline_enabled: true' do + expect(Datadog::Profiling::StackRecorder) + .to receive(:new).with(hash_including(timeline_enabled: true)).and_call_original + + build_profiler_component + end + end + + context 'when timeline is disabled' do + before { settings.profiling.advanced.experimental_timeline_enabled = false } + + it 'sets up the StackRecorder with timeline_enabled: false' do + expect(Datadog::Profiling::StackRecorder) + .to receive(:new).with(hash_including(timeline_enabled: false)).and_call_original + + build_profiler_component + end + end + it 'sets up the Profiler with the CpuAndWallTimeWorker collector' do expect(Datadog::Profiling::Profiler).to receive(:new).with( worker: instance_of(Datadog::Profiling::Collectors::CpuAndWallTimeWorker), @@ -295,11 +319,11 @@ it 'sets up the Exporter internal_metadata with relevant settings' do allow(Datadog::Profiling::Collectors::ThreadContext).to receive(:new) allow(Datadog::Profiling::Collectors::CpuAndWallTimeWorker).to receive(:new) + allow(Datadog::Profiling::StackRecorder).to receive(:new) expect(described_class).to receive(:no_signals_workaround_enabled?).and_return(:no_signals_result) expect(settings.profiling.advanced).to receive(:experimental_timeline_enabled).and_return(:timeline_result) - expect(settings.profiling.advanced).to receive(:experimental_allocation_sample_rate) - .and_return(123) + expect(settings.profiling.advanced).to receive(:experimental_allocation_sample_rate).and_return(123) expect(Datadog::Profiling::Exporter).to receive(:new).with( hash_including( internal_metadata: { diff --git a/spec/datadog/profiling/spec_helper.rb b/spec/datadog/profiling/spec_helper.rb index e02c1af213b..fcb084106b9 100644 --- a/spec/datadog/profiling/spec_helper.rb +++ b/spec/datadog/profiling/spec_helper.rb @@ -65,7 +65,11 @@ def decode_frame_from_pprof(decoded_profile, location_id) end def object_id_from(thread_id) - Integer(thread_id.match(/\d+ \((?\d+)\)/)[:object_id]) + if thread_id != 'GC' + Integer(thread_id.match(/\d+ \((?\d+)\)/)[:object_id]) + else + -1 + end end def samples_for_thread(samples, thread) @@ -74,11 +78,12 @@ def samples_for_thread(samples, thread) # We disable heap_sample collection by default in tests since it requires some extra mocking/ # setup for it to properly work. 
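As the component specs just above exercise, the recorder's new flag is driven end-to-end by the existing experimental timeline setting. From user code, enabling it would look roughly like this (sketch, assuming profiling is otherwise set up):

    Datadog.configure do |c|
      c.profiling.enabled = true
      # Opt into the experimental timeline profiling
      c.profiling.advanced.experimental_timeline_enabled = true
    end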
- def build_stack_recorder(heap_samples_enabled: false) + def build_stack_recorder(heap_samples_enabled: false, timeline_enabled: false) Datadog::Profiling::StackRecorder.new( cpu_time_enabled: true, alloc_samples_enabled: true, - heap_samples_enabled: heap_samples_enabled + heap_samples_enabled: heap_samples_enabled, + timeline_enabled: timeline_enabled, ) end end diff --git a/spec/datadog/profiling/stack_recorder_spec.rb b/spec/datadog/profiling/stack_recorder_spec.rb index 8652ceb5e05..73f73ae4874 100644 --- a/spec/datadog/profiling/stack_recorder_spec.rb +++ b/spec/datadog/profiling/stack_recorder_spec.rb @@ -10,12 +10,14 @@ # Disabling these by default since they require some extra setup and produce separate samples. # Enabling this is tested in a particular context below. let(:heap_samples_enabled) { false } + let(:timeline_enabled) { true } subject(:stack_recorder) do described_class.new( cpu_time_enabled: cpu_time_enabled, alloc_samples_enabled: alloc_samples_enabled, - heap_samples_enabled: heap_samples_enabled + heap_samples_enabled: heap_samples_enabled, + timeline_enabled: timeline_enabled, ) end @@ -118,77 +120,76 @@ def slot_two_mutex_locked? expect(start).to be <= finish end - context 'when all profile types are enabled' do + describe 'profile types configuration' do let(:cpu_time_enabled) { true } let(:alloc_samples_enabled) { true } let(:heap_samples_enabled) { true } - - it 'returns a pprof with the configured sample types' do - expect(sample_types_from(decoded_profile)).to eq( + let(:timeline_enabled) { true } + let(:all_profile_types) do + { 'cpu-time' => 'nanoseconds', 'cpu-samples' => 'count', 'wall-time' => 'nanoseconds', 'alloc-samples' => 'count', 'heap-live-samples' => 'count', - ) + 'timeline' => 'nanoseconds', + } end - end - context 'when cpu-time is disabled' do - let(:cpu_time_enabled) { false } - let(:alloc_samples_enabled) { true } - let(:heap_samples_enabled) { true } + def profile_types_without(type) + all_profile_types.dup.tap { |it| it.delete(type) { raise 'Missing key' } } + end - it 'returns a pprof without the cpu-type type' do - expect(sample_types_from(decoded_profile)).to eq( - 'cpu-samples' => 'count', - 'wall-time' => 'nanoseconds', - 'alloc-samples' => 'count', - 'heap-live-samples' => 'count', - ) + context 'when all profile types are enabled' do + it 'returns a pprof with the configured sample types' do + expect(sample_types_from(decoded_profile)).to eq(all_profile_types) + end end - end - context 'when alloc-samples is disabled' do - let(:cpu_time_enabled) { true } - let(:alloc_samples_enabled) { false } - let(:heap_samples_enabled) { true } + context 'when cpu-time is disabled' do + let(:cpu_time_enabled) { false } - it 'returns a pprof without the alloc-samples type' do - expect(sample_types_from(decoded_profile)).to eq( - 'cpu-time' => 'nanoseconds', - 'cpu-samples' => 'count', - 'wall-time' => 'nanoseconds', - 'heap-live-samples' => 'count', - ) + it 'returns a pprof without the cpu-type type' do + expect(sample_types_from(decoded_profile)).to eq(profile_types_without('cpu-time')) + end end - end - context 'when heap-live-samples is disabled' do - let(:cpu_time_enabled) { true } - let(:alloc_samples_enabled) { true } - let(:heap_samples_enabled) { false } + context 'when alloc-samples is disabled' do + let(:alloc_samples_enabled) { false } - it 'returns a pprof without the heap-live-samples type' do - expect(sample_types_from(decoded_profile)).to eq( - 'cpu-time' => 'nanoseconds', - 'cpu-samples' => 'count', - 'wall-time' => 
'nanoseconds', - 'alloc-samples' => 'count', - ) + it 'returns a pprof without the alloc-samples type' do + expect(sample_types_from(decoded_profile)).to eq(profile_types_without('alloc-samples')) + end end - end - context 'when all optional types are disabled' do - let(:cpu_time_enabled) { false } - let(:alloc_samples_enabled) { false } - let(:heap_samples_enabled) { false } + context 'when heap-live-samples is disabled' do + let(:heap_samples_enabled) { false } - it 'returns a pprof without the optional types' do - expect(sample_types_from(decoded_profile)).to eq( - 'cpu-samples' => 'count', - 'wall-time' => 'nanoseconds', - ) + it 'returns a pprof without the heap-live-samples type' do + expect(sample_types_from(decoded_profile)).to eq(profile_types_without('heap-live-samples')) + end + end + + context 'when timeline is disabled' do + let(:timeline_enabled) { false } + + it 'returns a pprof without the timeline type' do + expect(sample_types_from(decoded_profile)).to eq(profile_types_without('timeline')) + end + end + + context 'when all optional types are disabled' do + let(:cpu_time_enabled) { false } + let(:alloc_samples_enabled) { false } + let(:heap_samples_enabled) { false } + let(:timeline_enabled) { false } + + it 'returns a pprof without the optional types' do + expect(sample_types_from(decoded_profile)).to eq( + 'cpu-samples' => 'count', + 'wall-time' => 'nanoseconds', + ) + end end end @@ -214,7 +215,9 @@ def sample_types_from(decoded_profile) end context 'when profile has a sample' do - let(:metric_values) { { 'cpu-time' => 123, 'cpu-samples' => 456, 'wall-time' => 789, 'alloc-samples' => 4242 } } + let(:metric_values) do + { 'cpu-time' => 123, 'cpu-samples' => 456, 'wall-time' => 789, 'alloc-samples' => 4242, 'timeline' => 1111 } + end let(:labels) { { 'label_a' => 'value_a', 'label_b' => 'value_b', 'state' => 'unknown' }.to_a } let(:samples) { samples_from_pprof(encoded_pprof) } @@ -232,6 +235,7 @@ def sample_types_from(decoded_profile) :'cpu-samples' => 456, :'wall-time' => 789, :'alloc-samples' => 4242, + :timeline => 1111, ) end @@ -240,7 +244,7 @@ def sample_types_from(decoded_profile) it 'encodes the sample with the metrics provided, ignoring the disabled ones' do expect(samples.first.values) - .to eq(:'cpu-samples' => 456, :'wall-time' => 789, :'alloc-samples' => 4242) + .to eq(:'cpu-samples' => 456, :'wall-time' => 789, :'alloc-samples' => 4242, :timeline => 1111) end end @@ -338,7 +342,9 @@ def sample_types_from(decoded_profile) describe 'heap samples' do let(:sample_rate) { 50 } - let(:metric_values) { { 'cpu-time' => 101, 'cpu-samples' => 1, 'wall-time' => 789, 'alloc-samples' => sample_rate } } + let(:metric_values) do + { 'cpu-time' => 101, 'cpu-samples' => 1, 'wall-time' => 789, 'alloc-samples' => sample_rate, 'timeline' => 42 } + end let(:labels) { { 'label_a' => 'value_a', 'label_b' => 'value_b', 'state' => 'unknown' }.to_a } let(:a_string) { 'a beautiful string' }
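Stepping back to the stack_recorder.c hunk earlier in this diff: record_sample writes every value type unconditionally, and position_for routes disabled types into slots at or past enabled_values_count, which are simply never reported. A rough Ruby rendering of that branch-free routing (illustrative only; the real implementation is the C code above):

    VALUE_IDS = [:'cpu-time', :'cpu-samples', :'wall-time', :'alloc-samples', :'heap-live-samples', :timeline].freeze

    def build_position_for(enabled)
      position_for = {}
      next_enabled_pos = 0
      next_disabled_pos = enabled.size # disabled slots start right after the enabled ones
      VALUE_IDS.each do |id|
        if enabled.include?(id)
          position_for[id] = next_enabled_pos
          next_enabled_pos += 1
        else
          position_for[id] = next_disabled_pos
          next_disabled_pos += 1
        end
      end
      position_for
    end

    def record_sample(position_for, enabled_count, values)
      metric_values = Array.new(VALUE_IDS.size, 0)
      # Unconditional writes: disabled types land in scratch slots past enabled_count
      VALUE_IDS.each { |id| metric_values[position_for[id]] = values.fetch(id, 0) }
      metric_values.first(enabled_count) # only the enabled slots are reported
    end

With timeline disabled, the timeline value is still written, but into a slot the serialized profile never sees -- which is why the 'when timeline is disabled' context above expects the 'timeline' type to simply vanish from the pprof.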