Skip to content

Commit

Permalink
Merge pull request #2657 from DataDog/ivoanjo/prof-7307-allocation-sa…
Browse files Browse the repository at this point in the history
…mples-support

[PROF-7307] Add support for allocation samples to `ThreadContext`
  • Loading branch information
ivoanjo authored Mar 3, 2023
2 parents 94721a7 + 7d22e54 commit a644ca3
Show file tree
Hide file tree
Showing 5 changed files with 116 additions and 5 deletions.
25 changes: 25 additions & 0 deletions ext/ddtrace_profiling_native_extension/collectors_thread_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,7 @@ static void trace_identifiers_for(struct thread_context_collector_state *state,
static bool is_type_web(VALUE root_span_type);
static VALUE _native_reset_after_fork(DDTRACE_UNUSED VALUE self, VALUE collector_instance);
static VALUE thread_list(struct thread_context_collector_state *state);
static VALUE _native_sample_allocation(VALUE self, VALUE collector_instance, VALUE sample_weight);

void collectors_thread_context_init(VALUE profiling_module) {
VALUE collectors_module = rb_define_module_under(profiling_module, "Collectors");
Expand All @@ -201,6 +202,7 @@ void collectors_thread_context_init(VALUE profiling_module) {
rb_define_singleton_method(collectors_thread_context_class, "_native_inspect", _native_inspect, 1);
rb_define_singleton_method(collectors_thread_context_class, "_native_reset_after_fork", _native_reset_after_fork, 1);
rb_define_singleton_method(testing_module, "_native_sample", _native_sample, 2);
rb_define_singleton_method(testing_module, "_native_sample_allocation", _native_sample_allocation, 2);
rb_define_singleton_method(testing_module, "_native_on_gc_start", _native_on_gc_start, 1);
rb_define_singleton_method(testing_module, "_native_on_gc_finish", _native_on_gc_finish, 1);
rb_define_singleton_method(testing_module, "_native_sample_after_gc", _native_sample_after_gc, 1);
Expand Down Expand Up @@ -951,3 +953,26 @@ static VALUE thread_list(struct thread_context_collector_state *state) {
ddtrace_thread_list(result);
return result;
}

void thread_context_collector_sample_allocation(VALUE self_instance, unsigned int sample_weight) {
struct thread_context_collector_state *state;
TypedData_Get_Struct(self_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state);

VALUE current_thread = rb_thread_current();

trigger_sample_for_thread(
state,
/* thread: */ current_thread,
/* stack_from_thread: */ current_thread,
get_or_create_context_for(current_thread, state),
(sample_values) {.alloc_samples = sample_weight},
SAMPLE_REGULAR
);
}

// This method exists only to enable testing Datadog::Profiling::Collectors::ThreadContext behavior using RSpec.
// It SHOULD NOT be used for other purposes.
static VALUE _native_sample_allocation(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE sample_weight) {
thread_context_collector_sample_allocation(collector_instance, NUM2UINT(sample_weight));
return Qtrue;
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ void thread_context_collector_sample(
long current_monotonic_wall_time_ns,
VALUE profiler_overhead_stack_thread
);
void thread_context_collector_sample_allocation(VALUE self_instance, unsigned int sample_weight);
VALUE thread_context_collector_sample_after_gc(VALUE self_instance);
void thread_context_collector_on_gc_start(VALUE self_instance);
void thread_context_collector_on_gc_finish(VALUE self_instance);
Expand Down
18 changes: 13 additions & 5 deletions spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -310,11 +310,12 @@

cpu_and_wall_time_worker.stop

result = samples_for_thread(samples_from_pprof_without_gc_and_overhead(recorder.serialize!), Thread.current)
all_samples = samples_from_pprof_without_gc_and_overhead(recorder.serialize!)
result = samples_for_thread(all_samples, Thread.current)
sample_count = result.map { |it| it.values.fetch(:'cpu-samples') }.reduce(:+)

stats = cpu_and_wall_time_worker.stats
debug_failures = { thread_list: Thread.list, result: result }
debug_failures = { thread_list: Thread.list, all_samples: all_samples }

trigger_sample_attempts = stats.fetch(:trigger_sample_attempts)
simulated_signal_delivery = stats.fetch(:simulated_signal_delivery)
Expand All @@ -324,9 +325,16 @@

# Sanity checking

# We're currently targeting 100 samples per second, so this is a conservative approximation that hopefully
# will not cause flakiness
expect(sample_count).to be >= 8, "sample_count: #{sample_count}, stats: #{stats}, debug_failures: #{debug_failures}"
# We're currently targeting 100 samples per second (aka 20 in the 0.2 period above), so expecting 5 samples is a
# conservative choice that hopefully will not cause flakiness.
#
# @ivoanjo: One source of flakiness here in the past has been the dynamic sampling rate component -- if for some
# reason the profiler takes longer to sample (maybe the CI machine is busy today...), it will by design slow down
# and take less samples. For now, I tried to tune down this value, and am hesitating on adding a way to disable
# the dynamic sampling rate while we're running this test (because this is something we'd never do in production).
# But if spec proves to be flaky again, then I think we'll need to go with that solution (as I think this test
# is still quite valuable).
expect(sample_count).to be >= 5, "sample_count: #{sample_count}, stats: #{stats}, debug_failures: #{debug_failures}"
expect(trigger_sample_attempts).to be >= sample_count
end
end
Expand Down
74 changes: 74 additions & 0 deletions spec/datadog/profiling/collectors/thread_context_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,10 @@ def sample_after_gc
described_class::Testing._native_sample_after_gc(cpu_and_wall_time_collector)
end

def sample_allocation(weight:)
described_class::Testing._native_sample_allocation(cpu_and_wall_time_collector, weight)
end

def thread_list
described_class::Testing._native_thread_list
end
Expand All @@ -75,6 +79,7 @@ def stats
described_class::Testing._native_stats(cpu_and_wall_time_collector)
end

# This method exists only so we can look for its name in the stack trace in a few tests
def inside_t1
yield
end
Expand Down Expand Up @@ -815,6 +820,75 @@ def another_way_of_calling_sample(profiler_overhead_stack_thread: Thread.current
end
end

describe '#sample_allocation' do
let(:single_sample) do
expect(samples.size).to be 1
samples.first
end

it 'samples the caller thread' do
sample_allocation(weight: 123)

expect(object_id_from(single_sample.labels.fetch(:'thread id'))).to be Thread.current.object_id
end

it 'tags the sample with the provided weight' do
sample_allocation(weight: 123)

expect(single_sample.values).to include(:'alloc-samples' => 123)
end

it 'includes the thread names, if available' do
thread_with_name = Thread.new do
Thread.current.name = 'thread_with_name'
sample_allocation(weight: 123)
end.join
thread_without_name = Thread.new { sample_allocation(weight: 123) }.join

sample_with_name = samples_for_thread(samples, thread_with_name).first
sample_without_name = samples_for_thread(samples, thread_without_name).first

expect(sample_with_name.labels).to include(:'thread name' => 'thread_with_name')
expect(sample_without_name.labels).to_not include(:'thread name')
end

describe 'code hotspots' do
# NOTE: To avoid duplicating all of the similar-but-slightly different tests from `#sample` (due to how
# `#sample` includes every thread, but `#sample_allocation` includes only the caller thread), here is a simpler
# test to make sure this works in the common case
context 'when there is an active trace on the sampled thread' do
let(:tracer) { Datadog::Tracing.send(:tracer) }
let(:t1) do
Thread.new(ready_queue) do |ready_queue|
inside_t1 do
Datadog::Tracing.trace('profiler.test', type: 'web') do |_span, trace|
trace.resource = 'trace_resource'

Datadog::Tracing.trace('profiler.test.inner') do |inner_span|
@t1_span_id = inner_span.id
@t1_local_root_span_id = trace.send(:root_span).id
sample_allocation(weight: 456)
ready_queue << true
sleep
end
end
end
end
end

after { Datadog::Tracing.shutdown! }

it 'gathers the "local root span id", "span id" and "trace endpoint"' do
expect(single_sample.labels).to include(
:'local root span id' => @t1_local_root_span_id.to_i,
:'span id' => @t1_span_id.to_i,
:'trace endpoint' => 'trace_resource',
)
end
end
end
end

describe '#thread_list' do
it "returns the same as Ruby's Thread.list" do
expect(thread_list).to eq Thread.list
Expand Down
3 changes: 3 additions & 0 deletions spec/spec_helper.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
$LOAD_PATH.unshift File.expand_path('..', __dir__)
$LOAD_PATH.unshift File.expand_path('../lib', __dir__)

Thread.main.name = 'Thread.main' unless Gem::Version.new(RUBY_VERSION) < Gem::Version.new('2.3')

require 'pry'
require 'rspec/collection_matchers'
require 'rspec/wait'
Expand Down

0 comments on commit a644ca3

Please sign in to comment.