Skip to content

Commit

Permalink
Merge pull request #2384 from DataDog/ivoanjo/prof-6463-endpoint-prof…
Browse files Browse the repository at this point in the history
…iling

[PROF-6463] Implement support for endpoint profiling in CpuAndWallTime collector
  • Loading branch information
ivoanjo authored Nov 22, 2022
2 parents a1d9d96 + e2a7934 commit 3eb734e
Show file tree
Hide file tree
Showing 4 changed files with 160 additions and 13 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,12 @@
#define IS_NOT_WALL_TIME false
#define MISSING_TRACER_CONTEXT_KEY 0

static ID at_active_trace_id; // id of :@active_trace in Ruby
static ID at_root_span_id; // id of :@root_span in Ruby
static ID at_active_span_id; // id of :@active_span in Ruby
static ID at_active_trace_id; // id of :@active_trace in Ruby
static ID at_id_id; // id of :@id in Ruby
static ID at_resource_id; // id of :@resource in Ruby
static ID at_root_span_id; // id of :@root_span in Ruby
static ID at_type_id; // id of :@type in Ruby

// Contains state for a single CpuAndWallTime instance
struct cpu_and_wall_time_collector_state {
Expand Down Expand Up @@ -129,6 +131,7 @@ struct trace_identifiers {
ddog_CharSlice span_id;
char local_root_span_id_buffer[MAXIMUM_LENGTH_64_BIT_IDENTIFIER];
char span_id_buffer[MAXIMUM_LENGTH_64_BIT_IDENTIFIER];
VALUE trace_endpoint;
};

static void cpu_and_wall_time_collector_typed_data_mark(void *state_ptr);
Expand Down Expand Up @@ -165,6 +168,7 @@ static long wall_time_now_ns(bool raise_on_failure);
static long thread_id_for(VALUE thread);
static VALUE _native_stats(VALUE self, VALUE collector_instance);
static void trace_identifiers_for(struct cpu_and_wall_time_collector_state *state, VALUE thread, struct trace_identifiers *trace_identifiers_result);
static bool is_type_web(VALUE root_span_type);
static VALUE _native_reset_after_fork(DDTRACE_UNUSED VALUE self, VALUE collector_instance);

void collectors_cpu_and_wall_time_init(VALUE profiling_module) {
Expand Down Expand Up @@ -194,10 +198,12 @@ void collectors_cpu_and_wall_time_init(VALUE profiling_module) {
rb_define_singleton_method(testing_module, "_native_per_thread_context", _native_per_thread_context, 1);
rb_define_singleton_method(testing_module, "_native_stats", _native_stats, 1);

at_active_trace_id = rb_intern_const("@active_trace");
at_root_span_id = rb_intern_const("@root_span");
at_active_span_id = rb_intern_const("@active_span");
at_active_trace_id = rb_intern_const("@active_trace");
at_id_id = rb_intern_const("@id");
at_resource_id = rb_intern_const("@resource");
at_root_span_id = rb_intern_const("@root_span");
at_type_id = rb_intern_const("@type");
}

// This structure is used to define a Ruby object that stores a pointer to a struct cpu_and_wall_time_collector_state
Expand Down Expand Up @@ -572,12 +578,29 @@ static void trigger_sample_for_thread(
};
}

struct trace_identifiers trace_identifiers_result = {.valid = false};
struct trace_identifiers trace_identifiers_result = {.valid = false, .trace_endpoint = Qnil};
trace_identifiers_for(state, thread, &trace_identifiers_result);

if (trace_identifiers_result.valid) {
labels[label_pos++] = (ddog_Label) {.key = DDOG_CHARSLICE_C("local root span id"), .str = trace_identifiers_result.local_root_span_id};
labels[label_pos++] = (ddog_Label) {.key = DDOG_CHARSLICE_C("span id"), .str = trace_identifiers_result.span_id};

if (trace_identifiers_result.trace_endpoint != Qnil) {
// The endpoint gets recorded in a different way because it is mutable in the tracer and can change during a
// trace.
//
// Instead of each sample for the same local_root_span_id getting a potentially-different endpoint,
// `record_endpoint` (via libdatadog) keeps a list of local_root_span_id values and their most-recently-seen
// endpoint values, and at serialization time the most-recently-seen endpoint is applied to all relevant samples.
//
// This is why the endpoint is not directly added in this function to the labels array, although it will later
// show up in the array in the output pprof.
record_endpoint(
state->recorder_instance,
trace_identifiers_result.local_root_span_id,
char_slice_from_ruby_string(trace_identifiers_result.trace_endpoint)
);
}
}

// The number of times `label_pos++` shows up in this function needs to match `max_label_count`. To avoid "oops I
Expand Down Expand Up @@ -864,6 +887,24 @@ static void trace_identifiers_for(struct cpu_and_wall_time_collector_state *stat
};

trace_identifiers_result->valid = true;

VALUE root_span_type = rb_ivar_get(root_span, at_type_id /* @type */);
if (root_span_type == Qnil || !is_type_web(root_span_type)) return;

VALUE trace_resource = rb_ivar_get(active_trace, at_resource_id /* @resource */);
if (RB_TYPE_P(trace_resource, T_STRING)) {
trace_identifiers_result->trace_endpoint = trace_resource;
} else if (trace_resource == Qnil) {
// Fall back to resource from span, if any
trace_identifiers_result->trace_endpoint = rb_ivar_get(root_span, at_resource_id /* @resource */);
}
}

static bool is_type_web(VALUE root_span_type) {
ENFORCE_TYPE(root_span_type, T_STRING);

return RSTRING_LEN(root_span_type) == strlen("web") &&
(memcmp("web", StringValuePtr(root_span_type), strlen("web")) == 0);
}

// After the Ruby VM forks, this method gets called in the child process to clean up any leftover state from the parent.
Expand Down
10 changes: 6 additions & 4 deletions lib/datadog/profiling/collectors/cpu_and_wall_time.rb
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,12 @@ def reset_after_fork
private

def safely_extract_context_key_from(tracer)
tracer &&
tracer.respond_to?(:provider) &&
# NOTE: instance_variable_get always works, even on nil -- it just returns nil if the variable doesn't exist
tracer.provider.instance_variable_get(:@context).instance_variable_get(:@key)
provider = tracer && tracer.respond_to?(:provider) && tracer.provider

return unless provider

context = provider.instance_variable_get(:@context)
context && context.instance_variable_get(:@key)
end
end
end
Expand Down
110 changes: 107 additions & 3 deletions spec/datadog/profiling/collectors/cpu_and_wall_time_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -327,11 +327,15 @@ def stats
end

context 'when thread has a tracer context, and a trace is in progress' do
let(:root_span_type) { 'not-web' }

let(:t1) do
Thread.new(ready_queue) do |ready_queue|
Datadog::Tracing.trace('profiler.test') do
Datadog::Tracing.trace('profiler.test.inner') do |span, trace|
@t1_span_id = span.id
Datadog::Tracing.trace('profiler.test', type: root_span_type) do |_span, trace|
@t1_trace = trace

Datadog::Tracing.trace('profiler.test.inner') do |inner_span|
@t1_span_id = inner_span.id
@t1_local_root_span_id = trace.send(:root_span).id
ready_queue << true
sleep
Expand Down Expand Up @@ -361,6 +365,106 @@ def stats
:'span id' => @t1_span_id.to_s,
)
end

it 'does not include the "trace endpoint" label' do
sample

expect(t1_sample.fetch(:labels).keys).to_not include(:'trace endpoint')
end

context 'when local root span type is web' do
let(:root_span_type) { 'web' }

it 'includes the "trace endpoint" label in the samples' do
sample

expect(t1_sample.fetch(:labels)).to include(:'trace endpoint' => 'profiler.test')
end

describe 'trace vs root span resource mutation' do
let(:t1) do
Thread.new(ready_queue) do |ready_queue|
Datadog::Tracing.trace('profiler.test', type: root_span_type) do |span, trace|
trace.resource = trace_resource
span.resource = root_span_resource

Datadog::Tracing.trace('profiler.test.inner') do |inner_span|
@t1_span_id = inner_span.id
@t1_local_root_span_id = trace.send(:root_span).id
ready_queue << true
sleep
end
end
end
end

context 'when the trace resource is nil but the root span resource is not nil' do
let(:trace_resource) { nil }
let(:root_span_resource) { 'root_span_resource' }

it 'includes the "trace endpoint" label in the samples with the root span resource' do
sample

expect(t1_sample.fetch(:labels)).to include(:'trace endpoint' => 'root_span_resource')
end
end

context 'when both the trace resource and the root span resource are specified' do
let(:trace_resource) { 'trace_resource' }
let(:root_span_resource) { 'root_span_resource' }

it 'includes the "trace endpoint" label in the samples with the trace resource' do
sample

expect(t1_sample.fetch(:labels)).to include(:'trace endpoint' => 'trace_resource')
end
end

context 'when both the trace resource and the root span resource are nil' do
let(:trace_resource) { nil }
let(:root_span_resource) { nil }

it 'does not include the "trace endpoint" label' do
sample

expect(t1_sample.fetch(:labels).keys).to_not include(:'trace endpoint')
end
end
end

context 'when resource is changed after a sample was taken' do
before do
sample
@t1_trace.resource = 'changed_after_first_sample'
end

it 'changes the "trace endpoint" label in all samples' do
sample

t1_samples = samples_for_thread(samples, t1)

expect(t1_samples).to have(1).item
expect(t1_samples.first.fetch(:labels)).to include(:'trace endpoint' => 'changed_after_first_sample')
expect(t1_samples.first.fetch(:values)).to include(:'cpu-samples' => 2)
end

context 'when the resource is changed multiple times' do
it 'changes the "trace endpoint" label in all samples' do
sample

@t1_trace.resource = 'changed_after_second_sample'

sample

t1_samples = samples_for_thread(samples, t1)

expect(t1_samples).to have(1).item
expect(t1_samples.first.fetch(:labels)).to include(:'trace endpoint' => 'changed_after_second_sample')
expect(t1_samples.first.fetch(:values)).to include(:'cpu-samples' => 3)
end
end
end
end
end
end
end
Expand Down
2 changes: 1 addition & 1 deletion spec/datadog/profiling/spec_helper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def samples_from_pprof(pprof_data)
labels: sample.label.map do |it|
[
string_table[it.key].to_sym,
it.str != 0 ? string_table[it.str] : raise('Unexpected: label encoded as number instead of string'),
it.num == 0 ? string_table[it.str] : raise('Unexpected: label encoded as number instead of string'),
]
end.to_h,
}
Expand Down

0 comments on commit 3eb734e

Please sign in to comment.