[PROF-10201] Reduce allocation profiling overhead by using coarse timestamps #3797

Merged (5 commits, Jul 24, 2024)

66 changes: 66 additions & 0 deletions benchmarks/profiler_allocation.rb
@@ -0,0 +1,66 @@
# Used to quickly run benchmark under RSpec as part of the usual test suite, to validate it didn't bitrot
VALIDATE_BENCHMARK_MODE = ENV['VALIDATE_BENCHMARK'] == 'true'

return unless __FILE__ == $PROGRAM_NAME || VALIDATE_BENCHMARK_MODE

require 'benchmark/ips'
require 'datadog'
require 'pry'
require_relative 'dogstatsd_reporter'

# This benchmark measures the performance of allocation profiling

class ExportToFile
PPROF_PREFIX = ENV.fetch('DD_PROFILING_PPROF_PREFIX', 'profiler-allocation')

def export(flush)
File.write("#{PPROF_PREFIX}#{flush.start.strftime('%Y%m%dT%H%M%SZ')}.pprof", flush.pprof_data)
true
end
end

class ProfilerAllocationBenchmark
def run_benchmark
Benchmark.ips do |x|
benchmark_time = VALIDATE_BENCHMARK_MODE ? { time: 0.01, warmup: 0 } : { time: 10, warmup: 2 }
x.config(
**benchmark_time,
suite: report_to_dogstatsd_if_enabled_via_environment_variable(benchmark_name: 'profiler_allocation')
)

x.report('Allocations (baseline)', 'BasicObject.new')

x.save! 'profiler-allocation-results.json' unless VALIDATE_BENCHMARK_MODE
x.compare!
end

Datadog.configure do |c|
c.profiling.enabled = true
c.profiling.allocation_enabled = true
c.profiling.advanced.gc_enabled = false
c.profiling.exporter.transport = ExportToFile.new unless VALIDATE_BENCHMARK_MODE
end
Datadog::Profiling.wait_until_running

3.times { GC.start }

Benchmark.ips do |x|
benchmark_time = VALIDATE_BENCHMARK_MODE ? { time: 0.01, warmup: 0 } : { time: 10, warmup: 2 }
x.config(
**benchmark_time,
suite: report_to_dogstatsd_if_enabled_via_environment_variable(benchmark_name: 'profiler_allocation')
)

x.report("Allocations (#{ENV['CONFIG']})", 'BasicObject.new')

x.save! 'profiler-allocation-results.json' unless VALIDATE_BENCHMARK_MODE
x.compare!
end
end
end

puts "Current pid is #{Process.pid}"

ProfilerAllocationBenchmark.new.instance_exec do
run_benchmark
end
@@ -1045,8 +1045,27 @@ static VALUE _native_allocation_count(DDTRACE_UNUSED VALUE self) {
return are_allocations_being_tracked ? ULL2NUM(allocation_count) : Qnil;
}

// Note: this is a gcc/clang "statement expression": on clock failure (a zero timestamp) it records a delayed
// error on `state` and returns from the enclosing function; otherwise it evaluates to the timestamp read.
#define HANDLE_CLOCK_FAILURE(call) ({ \
long _result = (call); \
if (_result == 0) { \
delayed_error(state, ERR_CLOCK_FAIL); \
return; \
} \
_result; \
})

// Implements memory-related profiling events. This function is called by Ruby via the `object_allocation_tracepoint`
// when the RUBY_INTERNAL_EVENT_NEWOBJ event is triggered.
//
// When allocation sampling is enabled, this function gets called for almost all* objects allocated by the Ruby VM.
// (*In some weird cases the VM may skip this tracepoint.)
//
// At a high level, there are two paths through this function:
// 1. should_sample == false -> return
// 2. should_sample == true -> sample
//
// On big applications, path 1. is the hottest, since we don't sample every object. So it's quite important for it to
// be as fast as possible.
static void on_newobj_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused) {
// Update thread-local allocation count
if (RB_UNLIKELY(allocation_count == UINT64_MAX)) {
@@ -1061,25 +1080,39 @@ static void on_newobj_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused)
// and disabled before it is cleared, but just in case...
if (state == NULL) return;

// In a few cases, we may actually be allocating an object as part of profiler sampling. We don't want to recursively
// In rare cases, we may actually be allocating an object as part of profiler sampling. We don't want to recursively
// sample, so we just return early
if (state->during_sample) {
state->stats.allocations_during_sample++;
return;
}

if (state->dynamic_sampling_rate_enabled) {
long now = monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE);
if (now == 0) {
delayed_error(state, ERR_CLOCK_FAIL);
return;
}
if (!discrete_dynamic_sampler_should_sample(&state->allocation_sampler, now)) {
state->stats.allocation_skipped++;
return;
// Hot path: Dynamic sampling rate is usually enabled and the sampling decision is usually false
if (RB_LIKELY(state->dynamic_sampling_rate_enabled && !discrete_dynamic_sampler_should_sample(&state->allocation_sampler))) {
state->stats.allocation_skipped++;

coarse_instant now = monotonic_coarse_wall_time_now_ns();
HANDLE_CLOCK_FAILURE(now.timestamp_ns);

bool needs_readjust = discrete_dynamic_sampler_skipped_sample(&state->allocation_sampler, now);
if (RB_UNLIKELY(needs_readjust)) {
// We rarely readjust, so this is a cold path
// Also, while above we used the cheaper monotonic_coarse, for this call we want the regular monotonic call,
// which is why we end up getting time "again".
discrete_dynamic_sampler_readjust(
&state->allocation_sampler, HANDLE_CLOCK_FAILURE(monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE))
);
}

return;
}

// From here on, we've decided to go ahead with the sample, which is way less common than skipping it

discrete_dynamic_sampler_before_sample(
&state->allocation_sampler, HANDLE_CLOCK_FAILURE(monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE))
);

// @ivoanjo: Strictly speaking, this is not needed because Ruby should not call the same tracepoint while a previous
// invocation is still pending, (e.g. it wouldn't call `on_newobj_event` while it's already running), but I decided
// to keep this here for consistency -- every call to the thread context (other than the special gc calls which are
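For context on why the hot path above prefers the coarse clock: on Linux, CLOCK_MONOTONIC_COARSE returns a cached timestamp that is only updated on the timer tick (typically every 1-4 ms), so reading it is considerably cheaper than a regular CLOCK_MONOTONIC read, at the cost of resolution. The following standalone snippet (not part of this PR, just an illustration; assumes a Linux machine) compares the two:

// Standalone illustration (not part of this PR): compares the cost and resolution of the regular
// and coarse monotonic clocks on Linux. Build with e.g. `gcc -O2 clock_compare.c -o clock_compare`.
#include <stdio.h>
#include <time.h>

static long long total_ns(clockid_t clock_id, long iterations) {
  struct timespec start, end, scratch;
  clock_gettime(CLOCK_MONOTONIC, &start);
  for (long i = 0; i < iterations; i++) clock_gettime(clock_id, &scratch);
  clock_gettime(CLOCK_MONOTONIC, &end);
  return (end.tv_sec - start.tv_sec) * 1000000000LL + (end.tv_nsec - start.tv_nsec);
}

int main(void) {
  long iterations = 10 * 1000 * 1000;
  struct timespec res;

  printf("CLOCK_MONOTONIC:        %lld ns for %ld reads\n", total_ns(CLOCK_MONOTONIC, iterations), iterations);
  clock_getres(CLOCK_MONOTONIC, &res);
  printf("  resolution: %ld ns\n", res.tv_nsec);

  #ifdef CLOCK_MONOTONIC_COARSE
    printf("CLOCK_MONOTONIC_COARSE: %lld ns for %ld reads\n", total_ns(CLOCK_MONOTONIC_COARSE, iterations), iterations);
    clock_getres(CLOCK_MONOTONIC_COARSE, &res);
    printf("  resolution: %ld ns\n", res.tv_nsec);
  #endif

  return 0;
}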
@@ -20,6 +20,9 @@

#define EMA_SMOOTHING_FACTOR 0.6

static void maybe_readjust(discrete_dynamic_sampler *sampler, long now_ns);
inline bool should_readjust(discrete_dynamic_sampler *sampler, coarse_instant now);

void discrete_dynamic_sampler_init(discrete_dynamic_sampler *sampler, const char *debug_name, long now_ns) {
sampler->debug_name = debug_name;
discrete_dynamic_sampler_set_overhead_target_percentage(sampler, BASE_OVERHEAD_PCT, now_ns);
@@ -54,27 +57,25 @@ void discrete_dynamic_sampler_set_overhead_target_percentage(discrete_dynamic_sa
return discrete_dynamic_sampler_reset(sampler, now_ns);
}

static void maybe_readjust(discrete_dynamic_sampler *sampler, long now);

bool discrete_dynamic_sampler_should_sample(discrete_dynamic_sampler *sampler, long now_ns) {
// NOTE: See header for an explanation of when this should get used
__attribute__((warn_unused_result))
bool discrete_dynamic_sampler_should_sample(discrete_dynamic_sampler *sampler) {
// For efficiency reasons we don't do true random sampling but rather systematic
// sampling following a sample interval/skip. This can be biased and hide patterns
// but the dynamic interval and rather indeterministic pattern of allocations in
// but the dynamic interval and rather nondeterministic pattern of allocations in
// most real applications should help reduce the bias impact.
sampler->events_since_last_sample++;
sampler->events_since_last_readjustment++;
bool should_sample = sampler->sampling_interval > 0 && sampler->events_since_last_sample >= sampler->sampling_interval;

if (should_sample) {
sampler->sample_start_time_ns = now_ns;
} else {
// check if we should readjust our sampler after this event, even if we didn't sample it
maybe_readjust(sampler, now_ns);
}
return sampler->sampling_interval > 0 && sampler->events_since_last_sample >= sampler->sampling_interval;
}

return should_sample;
// NOTE: See header for an explanation of when this should get used
void discrete_dynamic_sampler_before_sample(discrete_dynamic_sampler *sampler, long now_ns) {
sampler->sample_start_time_ns = now_ns;
}

// NOTE: See header for an explanation of when this should get used
long discrete_dynamic_sampler_after_sample(discrete_dynamic_sampler *sampler, long now_ns) {
long last_sampling_time_ns = sampler->sample_start_time_ns == 0 ? 0 : long_max_of(0, now_ns - sampler->sample_start_time_ns);
sampler->samples_since_last_readjustment++;
@@ -95,6 +96,11 @@ size_t discrete_dynamic_sampler_events_since_last_sample(discrete_dynamic_sample
return sampler->events_since_last_sample;
}

// NOTE: See header for an explanation of when this should get used
bool discrete_dynamic_sampler_skipped_sample(discrete_dynamic_sampler *sampler, coarse_instant now) {
return should_readjust(sampler, now);
}

static double ewma_adj_window(double latest_value, double avg, long current_window_time_ns, bool is_first) {
if (is_first) {
return latest_value;
@@ -109,19 +115,27 @@ static double ewma_adj_window(double latest_value, double avg, long current_wind
return (1-alpha) * avg + alpha * latest_value;
}

static void maybe_readjust(discrete_dynamic_sampler *sampler, long now) {
long this_window_time_ns = sampler->last_readjust_time_ns == 0 ? ADJUSTMENT_WINDOW_NS : now - sampler->last_readjust_time_ns;
static void maybe_readjust(discrete_dynamic_sampler *sampler, long now_ns) {
if (should_readjust(sampler, to_coarse_instant(now_ns))) discrete_dynamic_sampler_readjust(sampler, now_ns);
}

inline bool should_readjust(discrete_dynamic_sampler *sampler, coarse_instant now) {
long this_window_time_ns =
sampler->last_readjust_time_ns == 0 ? ADJUSTMENT_WINDOW_NS : now.timestamp_ns - sampler->last_readjust_time_ns;

bool should_readjust_based_on_time = this_window_time_ns >= ADJUSTMENT_WINDOW_NS;
bool should_readjust_based_on_samples = sampler->samples_since_last_readjustment >= ADJUSTMENT_WINDOW_SAMPLES;

if (!should_readjust_based_on_time && !should_readjust_based_on_samples) {
// not enough time or samples have passed to perform a readjustment
return;
}
return should_readjust_based_on_time || should_readjust_based_on_samples;
}

if (this_window_time_ns == 0) {
// should not be possible given previous condition but lets protect against div by 0 below.
// NOTE: This method ASSUMES should_readjust == true
// NOTE: See header for an explanation of when this should get used
void discrete_dynamic_sampler_readjust(discrete_dynamic_sampler *sampler, long now_ns) {
long this_window_time_ns = sampler->last_readjust_time_ns == 0 ? ADJUSTMENT_WINDOW_NS : now_ns - sampler->last_readjust_time_ns;

if (this_window_time_ns <= 0) {
// should not be possible given should_readjust but let's protect against div by 0 below.
return;
}

@@ -143,7 +157,7 @@ static void maybe_readjust(discrete_dynamic_sampler *sampler, long now) {
// Lets update our average sampling time per event
long avg_sampling_time_in_window_ns = sampler->samples_since_last_readjustment == 0 ? 0 : sampler->sampling_time_since_last_readjustment_ns / sampler->samples_since_last_readjustment;
if (avg_sampling_time_in_window_ns > sampler->max_sampling_time_ns) {
// If the average sampling time in the previous window was deemed unnacceptable, clamp it to the
// If the average sampling time in the previous window was deemed unacceptable, clamp it to the
// maximum acceptable value and register this operation in our counter.
// NOTE: This is important so that events like suspensions or system overloads do not lead us to
// learn arbitrarily big sampling times which may then result in us not sampling anything
@@ -286,7 +300,7 @@ static void maybe_readjust(discrete_dynamic_sampler *sampler, long now) {
sampler->events_since_last_readjustment = 0;
sampler->samples_since_last_readjustment = 0;
sampler->sampling_time_since_last_readjustment_ns = 0;
sampler->last_readjust_time_ns = now;
sampler->last_readjust_time_ns = now_ns;
sampler->has_completed_full_adjustment_window = true;
}

@@ -406,7 +420,14 @@ VALUE _native_should_sample(VALUE self, VALUE now_ns) {
sampler_state *state;
TypedData_Get_Struct(self, sampler_state, &sampler_typed_data, state);

return discrete_dynamic_sampler_should_sample(&state->sampler, NUM2LONG(now_ns)) ? Qtrue : Qfalse;
if (discrete_dynamic_sampler_should_sample(&state->sampler)) {
discrete_dynamic_sampler_before_sample(&state->sampler, NUM2LONG(now_ns));
return Qtrue;
} else {
bool needs_readjust = discrete_dynamic_sampler_skipped_sample(&state->sampler, to_coarse_instant(NUM2LONG(now_ns)));
if (needs_readjust) discrete_dynamic_sampler_readjust(&state->sampler, NUM2LONG(now_ns));
return Qfalse;
}
}

VALUE _native_after_sample(VALUE self, VALUE now_ns) {
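As a side note on the sampling strategy: the comment in discrete_dynamic_sampler_should_sample above explains that the sampler does systematic rather than true random sampling, so the per-event cost on the skip path boils down to an increment and a compare. A simplified toy version of that idea (an illustration only, not the real sampler, which also readjusts the interval dynamically) looks like this:

// Toy illustration of systematic sampling (not the real sampler): sample every Nth event.
// The common "skip" path is just a counter increment plus a comparison: no clock reads,
// no random number generation.
#include <stdbool.h>
#include <stddef.h>

typedef struct {
  size_t sampling_interval;          // sample 1 in every `sampling_interval` events
  size_t events_since_last_sample;
} toy_sampler;

static bool toy_should_sample(toy_sampler *sampler) {
  sampler->events_since_last_sample++;
  if (sampler->sampling_interval > 0 && sampler->events_since_last_sample >= sampler->sampling_interval) {
    sampler->events_since_last_sample = 0;
    return true;
  }
  return false;
}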
@@ -5,6 +5,8 @@

#include <ruby.h>

#include "time_helpers.h"

// A sampler that will sample discrete events based on the overhead of their
// sampling.
//
@@ -62,7 +64,6 @@ typedef struct discrete_dynamic_sampler {
unsigned long sampling_time_clamps;
} discrete_dynamic_sampler;


// Init a new sampler with sane defaults.
void discrete_dynamic_sampler_init(discrete_dynamic_sampler *sampler, const char *debug_name, long now_ns);

Expand All @@ -80,9 +81,38 @@ void discrete_dynamic_sampler_set_overhead_target_percentage(discrete_dynamic_sa
// @return True if the event associated with this decision should be sampled, false
// otherwise.
//
// NOTE: If true is returned we implicitly assume the start of a sampling operation
// and it is expected that a follow-up after_sample call is issued.
bool discrete_dynamic_sampler_should_sample(discrete_dynamic_sampler *sampler, long now_ns);
// IMPORTANT: A call to this method MUST be followed by a call to either
// `discrete_dynamic_sampler_before_sample` if return is `true` or
// `discrete_dynamic_sampler_skipped_sample` if return is `false`.
//
// In the past, this method took care of before_sample/skipped_sample/readjust as well.
// We've split it up because we want to use either the coarse or the regular monotonic clock depending on the
// situation, and we also want the time to come in as an argument from the outside.
//
// TL;DR here's how they should be used as Ruby code:
// if discrete_dynamic_sampler_should_sample
// discrete_dynamic_sampler_before_sample(monotonic_wall_time_now_ns())
// else
// needs_readjust = discrete_dynamic_sampler_skipped_sample(monotonic_coarse_wall_time_now_ns())
// discrete_dynamic_sampler_readjust(monotonic_wall_time_now_ns()) if needs_readjust
// end
__attribute__((warn_unused_result))
bool discrete_dynamic_sampler_should_sample(discrete_dynamic_sampler *sampler);

// Signal the start of a sampling operation.
// MUST be called after `discrete_dynamic_sampler_should_sample` returns `true`.
void discrete_dynamic_sampler_before_sample(discrete_dynamic_sampler *sampler, long now_ns);

// Signals that sampling did not happen.
// MUST be called after `discrete_dynamic_sampler_should_sample` returns `false`.
//
// @return True if the sampler needs to be readjusted.
//
// IMPORTANT: A call to this method MUST be followed by a call to `discrete_dynamic_sampler_readjust` if return is `true`.
__attribute__((warn_unused_result))
bool discrete_dynamic_sampler_skipped_sample(discrete_dynamic_sampler *sampler, coarse_instant now);

// Readjusts the sampler's internal state.
// MUST be called after `discrete_dynamic_sampler_skipped_sample` returns `true`.
void discrete_dynamic_sampler_readjust(discrete_dynamic_sampler *sampler, long now_ns);

// Signal the end of a sampling operation.
//
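Putting the pieces of the header above together, a caller of the split API ends up with roughly the shape below. This is a sketch following the contract described in the comments; it assumes the sampler header and the time_helpers functions used elsewhere in this PR are included, and it elides the clock-failure handling that the real caller does via HANDLE_CLOCK_FAILURE:

// Sketch of the calling convention described above (not literal dd-trace-rb code).
static void on_event(discrete_dynamic_sampler *sampler) {
  if (discrete_dynamic_sampler_should_sample(sampler)) {
    // Taking a sample (rare): use the precise monotonic clock around the expensive sampling work.
    discrete_dynamic_sampler_before_sample(sampler, monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE));
    // ... perform the actual sampling here ...
    long sampling_time_ns =
      discrete_dynamic_sampler_after_sample(sampler, monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE));
    (void) sampling_time_ns; // e.g. could be recorded in stats
  } else {
    // Skipping (the common case): the cheap coarse clock is enough to decide whether a readjustment
    // is due; only the readjustment itself uses the precise clock.
    bool needs_readjust = discrete_dynamic_sampler_skipped_sample(sampler, monotonic_coarse_wall_time_now_ns());
    if (needs_readjust) {
      discrete_dynamic_sampler_readjust(sampler, monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE));
    }
  }
}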
3 changes: 3 additions & 0 deletions ext/datadog_profiling_native_extension/extconf.rb
@@ -129,6 +129,9 @@ def add_compiler_flag(flag)
# but it's slower to build
# so instead we just assume that we have the function we need on Linux, and nowhere else
$defs << '-DHAVE_PTHREAD_GETCPUCLOCKID'

# Not available on macOS
$defs << '-DHAVE_CLOCK_MONOTONIC_COARSE'
end

have_func 'malloc_stats'
15 changes: 0 additions & 15 deletions ext/datadog_profiling_native_extension/time_helpers.c
@@ -4,21 +4,6 @@
#include "ruby_helpers.h"
#include "time_helpers.h"

// Safety: This function is assumed never to raise exceptions by callers when raise_on_failure == false
long retrieve_clock_as_ns(clockid_t clock_id, bool raise_on_failure) {
struct timespec clock_value;

if (clock_gettime(clock_id, &clock_value) != 0) {
if (raise_on_failure) ENFORCE_SUCCESS_GVL(errno);
return 0;
}

return clock_value.tv_nsec + SECONDS_AS_NS(clock_value.tv_sec);
}

long monotonic_wall_time_now_ns(bool raise_on_failure) { return retrieve_clock_as_ns(CLOCK_MONOTONIC, raise_on_failure); }
long system_epoch_time_now_ns(bool raise_on_failure) { return retrieve_clock_as_ns(CLOCK_REALTIME, raise_on_failure); }

// Design: The monotonic_to_system_epoch_state struct is kept somewhere by the caller, and MUST be initialized to
// MONOTONIC_TO_SYSTEM_EPOCH_INITIALIZER.
//
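The coarse_instant type, to_coarse_instant, and monotonic_coarse_wall_time_now_ns used throughout this PR are not part of the visible diff (they presumably live in time_helpers.h). Below is a rough sketch of the idea, using the HAVE_CLOCK_MONOTONIC_COARSE define added in extconf.rb above; this is an assumption about their shape, not the actual implementation:

// Hypothetical sketch of the coarse-time helpers (the real ones are expected to live in time_helpers.h
// and may differ): the timestamp is wrapped in a struct so callers can't accidentally mix coarse and
// precise readings, and we fall back to the regular monotonic clock where CLOCK_MONOTONIC_COARSE is
// not available (e.g. macOS).
#include <time.h>

typedef struct { long timestamp_ns; } coarse_instant;

static inline coarse_instant to_coarse_instant(long timestamp_ns) {
  return (coarse_instant) {.timestamp_ns = timestamp_ns};
}

static inline coarse_instant monotonic_coarse_wall_time_now_ns(void) {
  struct timespec value;
  #ifdef HAVE_CLOCK_MONOTONIC_COARSE
    clockid_t clock_id = CLOCK_MONOTONIC_COARSE;
  #else
    clockid_t clock_id = CLOCK_MONOTONIC;
  #endif

  // Mirror the convention used by HANDLE_CLOCK_FAILURE: a timestamp of 0 signals failure.
  if (clock_gettime(clock_id, &value) != 0) return to_coarse_instant(0);

  return to_coarse_instant(value.tv_nsec + value.tv_sec * 1000000000L); // the existing helpers use SECONDS_AS_NS for this
}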