Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[PROF-8667] Heap Profiling - Part 5 - Size #3333

Merged
merged 5 commits into from
Jan 5, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions benchmarks/profiler_sample_loop_v2.rb
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def create_profiler
cpu_time_enabled: true,
alloc_samples_enabled: false,
heap_samples_enabled: false,
heap_size_enabled: false,
timeline_enabled: false,
)
@collector = Datadog::Profiling::Collectors::ThreadContext.new(
Expand Down
39 changes: 30 additions & 9 deletions ext/ddtrace_profiling_native_extension/heap_recorder.c
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ static heap_record* get_or_create_heap_record(heap_recorder*, ddog_prof_Slice_Lo
static void cleanup_heap_record_if_unused(heap_recorder*, heap_record*);
static int st_heap_record_entry_free(st_data_t, st_data_t, st_data_t);
static int st_object_record_entry_free(st_data_t, st_data_t, st_data_t);
static int st_object_record_entry_free_if_invalid(st_data_t, st_data_t, st_data_t);
static int st_object_record_update(st_data_t, st_data_t, st_data_t);
static int st_object_records_iterate(st_data_t, st_data_t, st_data_t);
static int st_object_records_debug(st_data_t key, st_data_t value, st_data_t extra);
static int update_object_record_entry(st_data_t*, st_data_t*, st_data_t, int);
Expand Down Expand Up @@ -250,7 +250,15 @@ void end_heap_allocation_recording(struct heap_recorder *heap_recorder, ddog_pro
commit_allocation(heap_recorder, heap_record, partial_object_record);
}

void heap_recorder_prepare_iteration(heap_recorder *heap_recorder) {
typedef struct {
// A reference to the heap recorder so we can access extra stuff to cleanup unused records.
heap_recorder *heap_recorder;

// Whether we should update object sizes as part of the update iteration or not.
bool update_sizes;
} prepare_iteration_context;

void heap_recorder_prepare_iteration(heap_recorder *heap_recorder, bool update_sizes) {
if (heap_recorder == NULL) {
return;
}
Expand All @@ -260,7 +268,11 @@ void heap_recorder_prepare_iteration(heap_recorder *heap_recorder) {
rb_raise(rb_eRuntimeError, "New heap recorder iteration prepared without the previous one having been finished.");
}

st_foreach(heap_recorder->object_records, st_object_record_entry_free_if_invalid, (st_data_t) heap_recorder);
prepare_iteration_context context = (prepare_iteration_context) {
.heap_recorder = heap_recorder,
.update_sizes = update_sizes,
};
st_foreach(heap_recorder->object_records, st_object_record_update, (st_data_t) &context);

heap_recorder->object_records_snapshot = st_copy(heap_recorder->object_records);
if (heap_recorder->object_records_snapshot == NULL) {
Expand Down Expand Up @@ -361,25 +373,34 @@ static int st_object_record_entry_free(DDTRACE_UNUSED st_data_t key, st_data_t v
return ST_DELETE;
}

static int st_object_record_entry_free_if_invalid(st_data_t key, st_data_t value, st_data_t extra_arg) {
static int st_object_record_update(st_data_t key, st_data_t value, st_data_t extra_arg) {
long obj_id = (long) key;
object_record *record = (object_record*) value;
heap_recorder *recorder = (heap_recorder*) extra_arg;
prepare_iteration_context *context = (prepare_iteration_context*) extra_arg;

if (!ruby_ref_from_id(LONG2NUM(obj_id), NULL)) {
// Id no longer associated with a valid ref. Need to clean things up!
VALUE ref;

if (!ruby_ref_from_id(LONG2NUM(obj_id), &ref)) {
// Id no longer associated with a valid ref. Need to delete this object record!

// Starting with the associated heap record. There will now be one less tracked object pointing to it
heap_record *heap_record = record->heap_record;
heap_record->num_tracked_objects--;

// One less object using this heap record, it may have become unused...
cleanup_heap_record_if_unused(recorder, heap_record);
cleanup_heap_record_if_unused(context->heap_recorder, heap_record);

object_record_free(record);
return ST_DELETE;
}

// If we got this far, entry is still valid

if (context->update_sizes) {
// if we were asked to update sizes, do so...
record->object_data.size = ruby_obj_memsize_of(ref);
}
AlexJF marked this conversation as resolved.
Show resolved Hide resolved

return ST_CONTINUE;
}

Expand Down Expand Up @@ -418,7 +439,7 @@ static int st_object_records_debug(DDTRACE_UNUSED st_data_t key, st_data_t value
object_record *record = (object_record*) value;

heap_frame top_frame = record->heap_record->stack->frames[0];
rb_str_catf(debug_str, "obj_id=%ld weight=%d location=%s:%d alloc_gen=%zu ", record->obj_id, record->object_data.weight, top_frame.filename, (int) top_frame.line, record->object_data.alloc_gen);
rb_str_catf(debug_str, "obj_id=%ld weight=%d size=%zu location=%s:%d alloc_gen=%zu ", record->obj_id, record->object_data.weight, record->object_data.size, top_frame.filename, (int) top_frame.line, record->object_data.alloc_gen);

const char *class = record->object_data.class;
if (class != NULL) {
Expand Down
9 changes: 8 additions & 1 deletion ext/ddtrace_profiling_native_extension/heap_recorder.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ typedef struct live_object_data {
// could be seen as being representative of 50 objects.
unsigned int weight;

// Size of this object on last flush/update.
size_t size;

// The class of the object that we're tracking.
// NOTE: This is optional and will be set to NULL if not set.
char* class;
Expand Down Expand Up @@ -79,8 +82,12 @@ void end_heap_allocation_recording(heap_recorder *heap_recorder, ddog_prof_Slice
// Update the heap recorder to reflect the latest state of the VM and prepare internal structures
// for efficient iteration.
//
// @param update_sizes
// True if we should re-calculate live object sizes ahead of the next iteration. If false,
// the previous sizes will be used (or 0 if we never updated them before).
//
// WARN: This must be called strictly before iteration. Failing to do so will result in exceptions.
void heap_recorder_prepare_iteration(heap_recorder *heap_recorder);
void heap_recorder_prepare_iteration(heap_recorder *heap_recorder, bool update_sizes);

// Optimize the heap recorder by cleaning up any data that might have been prepared specifically
// for the purpose of iterating over the heap recorder data.
Expand Down
36 changes: 36 additions & 0 deletions ext/ddtrace_profiling_native_extension/ruby_helpers.c
Original file line number Diff line number Diff line change
Expand Up @@ -166,3 +166,39 @@ bool ruby_ref_from_id(VALUE obj_id, VALUE *value) {

return true;
}

// Not part of public headers but is externed from Ruby
size_t rb_obj_memsize_of(VALUE obj);

// Wrapper around rb_obj_memsize_of to avoid hitting crashing paths.
size_t ruby_obj_memsize_of(VALUE obj) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be fair, the crashing path is only when an object is invalid -- all of the Ruby types are accounted for in rb_obj_memsize_of(...) so I'm not sure it's worth having this wrapper around it -- rb_bug(...) is a pretty big thing that the VM only uses when it truly believes there's something wrong at the VM/native level.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's an extra "hidden" rb_bug call for the T_NODE case which was the one that worried me a bit more. The fact that they use rb_bug in that place may be a hint that it should be impossible for us to accidentally track one. But I'm also not 100% sure if that's the case or if they just assumed whoever called obj_memsize_of would be doing that check first.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're right -- I had missed that one. I don't know enough about RNode; intuitively if they're calling rb_bug it would be weird if we ever got handed one of those objects but I definitely see your concern.

I guess it may worth leaving a bit more context as a comment on why we're doing this, but I'm convinced :)

switch (rb_type(obj)) {
case T_OBJECT:
case T_MODULE:
case T_CLASS:
case T_ICLASS:
case T_STRING:
case T_ARRAY:
case T_HASH:
case T_REGEXP:
case T_DATA:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I soft-wonder if we'll run into trouble with T_DATA (or data/typeddata) objects, since the size for these is up to native extension authors. Since these methods don't get used very often, they may be buggy or inefficient. I guess... let's maybe keep an eye out on these ones?

case T_MATCH:
case T_FILE:
case T_RATIONAL:
case T_COMPLEX:
case T_IMEMO:
case T_FLOAT:
case T_SYMBOL:
case T_BIGNUM:
// case T_NODE: -> Throws exception in rb_obj_memsize_of
AlexJF marked this conversation as resolved.
Show resolved Hide resolved
case T_STRUCT:
case T_ZOMBIE:
#ifndef NO_T_MOVED
case T_MOVED:
#endif
return rb_obj_memsize_of(obj);
default:
// Unsupported, return 0 instead of erroring like rb_obj_memsize_of likes doing
return 0;
}
}
5 changes: 5 additions & 0 deletions ext/ddtrace_profiling_native_extension/ruby_helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,3 +106,8 @@ char* ruby_strndup(const char *str, size_t size);
// writes the ref to the value pointer parameter if !NULL. False if id doesn't
// reference a valid object (in which case value is not changed).
bool ruby_ref_from_id(size_t id, VALUE *value);

// Native wrapper to get the approximate/estimated current size of the passed
// object.
size_t ruby_obj_memsize_of(VALUE obj);

32 changes: 26 additions & 6 deletions ext/ddtrace_profiling_native_extension/stack_recorder.c
Original file line number Diff line number Diff line change
Expand Up @@ -153,17 +153,19 @@ static VALUE error_symbol = Qnil; // :error in Ruby
#define ALLOC_SAMPLES_VALUE_ID 3
#define HEAP_SAMPLES_VALUE {.type_ = VALUE_STRING("heap-live-samples"), .unit = VALUE_STRING("count")}
#define HEAP_SAMPLES_VALUE_ID 4
#define HEAP_SIZE_VALUE {.type_ = VALUE_STRING("heap-live-size"), .unit = VALUE_STRING("bytes")}
#define HEAP_SIZE_VALUE_ID 5
#define TIMELINE_VALUE {.type_ = VALUE_STRING("timeline"), .unit = VALUE_STRING("nanoseconds")}
#define TIMELINE_VALUE_ID 5
#define TIMELINE_VALUE_ID 6

static const ddog_prof_ValueType all_value_types[] =
{CPU_TIME_VALUE, CPU_SAMPLES_VALUE, WALL_TIME_VALUE, ALLOC_SAMPLES_VALUE, HEAP_SAMPLES_VALUE, TIMELINE_VALUE};
{CPU_TIME_VALUE, CPU_SAMPLES_VALUE, WALL_TIME_VALUE, ALLOC_SAMPLES_VALUE, HEAP_SAMPLES_VALUE, HEAP_SIZE_VALUE, TIMELINE_VALUE};

// This array MUST be kept in sync with all_value_types above and is intended to act as a "hashmap" between VALUE_ID and the position it
// occupies on the all_value_types array.
// E.g. all_value_types_positions[CPU_TIME_VALUE_ID] => 0, means that CPU_TIME_VALUE was declared at position 0 of all_value_types.
static const uint8_t all_value_types_positions[] =
{CPU_TIME_VALUE_ID, CPU_SAMPLES_VALUE_ID, WALL_TIME_VALUE_ID, ALLOC_SAMPLES_VALUE_ID, HEAP_SAMPLES_VALUE_ID, TIMELINE_VALUE_ID};
{CPU_TIME_VALUE_ID, CPU_SAMPLES_VALUE_ID, WALL_TIME_VALUE_ID, ALLOC_SAMPLES_VALUE_ID, HEAP_SAMPLES_VALUE_ID, HEAP_SIZE_VALUE_ID, TIMELINE_VALUE_ID};

#define ALL_VALUE_TYPES_COUNT (sizeof(all_value_types) / sizeof(ddog_prof_ValueType))

Expand All @@ -182,6 +184,7 @@ struct stack_recorder_state {

uint8_t position_for[ALL_VALUE_TYPES_COUNT];
uint8_t enabled_values_count;
bool heap_size_enabled;
};

// Used to return a pair of values from sampler_lock_active_profile()
Expand Down Expand Up @@ -214,6 +217,7 @@ static VALUE _native_initialize(
VALUE cpu_time_enabled,
VALUE alloc_samples_enabled,
VALUE heap_samples_enabled,
VALUE heap_sizes_enabled,
AlexJF marked this conversation as resolved.
Show resolved Hide resolved
VALUE timeline_enabled
);
static VALUE _native_serialize(VALUE self, VALUE recorder_instance);
Expand Down Expand Up @@ -253,7 +257,7 @@ void stack_recorder_init(VALUE profiling_module) {
// https://bugs.ruby-lang.org/issues/18007 for a discussion around this.
rb_define_alloc_func(stack_recorder_class, _native_new);

rb_define_singleton_method(stack_recorder_class, "_native_initialize", _native_initialize, 5);
rb_define_singleton_method(stack_recorder_class, "_native_initialize", _native_initialize, 6);
rb_define_singleton_method(stack_recorder_class, "_native_serialize", _native_serialize, 1);
rb_define_singleton_method(stack_recorder_class, "_native_reset_after_fork", _native_reset_after_fork, 1);
rb_define_singleton_method(testing_module, "_native_active_slot", _native_active_slot, 1);
Expand Down Expand Up @@ -307,6 +311,7 @@ static VALUE _native_new(VALUE klass) {
// heap samples, we will free and reset heap_recorder to NULL, effectively disabling all behaviour specific
// to heap profiling (all calls to heap_recorder_* with a NULL heap recorder are noops).
state->heap_recorder = heap_recorder_new();
state->heap_size_enabled = true;

// Note: Don't raise exceptions after this point, since it'll lead to libdatadog memory leaking!

Expand Down Expand Up @@ -367,11 +372,13 @@ static VALUE _native_initialize(
VALUE cpu_time_enabled,
VALUE alloc_samples_enabled,
VALUE heap_samples_enabled,
VALUE heap_size_enabled,
VALUE timeline_enabled
) {
ENFORCE_BOOLEAN(cpu_time_enabled);
ENFORCE_BOOLEAN(alloc_samples_enabled);
ENFORCE_BOOLEAN(heap_samples_enabled);
ENFORCE_BOOLEAN(heap_size_enabled);
ENFORCE_BOOLEAN(timeline_enabled);

struct stack_recorder_state *state;
Expand All @@ -381,6 +388,7 @@ static VALUE _native_initialize(
(cpu_time_enabled == Qtrue ? 0 : 1) -
(alloc_samples_enabled == Qtrue? 0 : 1) -
(heap_samples_enabled == Qtrue ? 0 : 1) -
(heap_size_enabled == Qtrue ? 0 : 1) -
(timeline_enabled == Qtrue ? 0 : 1);

if (requested_values_count == ALL_VALUE_TYPES_COUNT) return Qtrue; // Nothing to do, this is the default
Expand Down Expand Up @@ -422,7 +430,18 @@ static VALUE _native_initialize(
state->position_for[HEAP_SAMPLES_VALUE_ID] = next_enabled_pos++;
} else {
state->position_for[HEAP_SAMPLES_VALUE_ID] = next_disabled_pos++;
}

if (heap_size_enabled == Qtrue) {
enabled_value_types[next_enabled_pos] = (ddog_prof_ValueType) HEAP_SIZE_VALUE;
state->position_for[HEAP_SIZE_VALUE_ID] = next_enabled_pos++;
state->heap_size_enabled = true;
AlexJF marked this conversation as resolved.
Show resolved Hide resolved
} else {
state->position_for[HEAP_SIZE_VALUE_ID] = next_disabled_pos++;
state->heap_size_enabled = false;
}

if (heap_samples_enabled == Qfalse && heap_size_enabled == Qfalse) {
// Turns out heap sampling is disabled but we initialized everything in _native_new
// assuming all samples were enabled. We need to deinitialize the heap recorder.
heap_recorder_free(state->heap_recorder);
Expand Down Expand Up @@ -455,7 +474,7 @@ static VALUE _native_serialize(DDTRACE_UNUSED VALUE _self, VALUE recorder_instan

// Prepare the iteration on heap recorder we'll be doing outside the GVL. The preparation needs to
// happen while holding on to the GVL.
heap_recorder_prepare_iteration(state->heap_recorder);
heap_recorder_prepare_iteration(state->heap_recorder, state->heap_size_enabled);

// We'll release the Global VM Lock while we're calling serialize, so that the Ruby VM can continue to work while this
// is pending
Expand Down Expand Up @@ -599,6 +618,7 @@ static bool add_heap_sample_to_active_profile_without_gvl(heap_recorder_iteratio
uint8_t *position_for = context->state->position_for;

metric_values[position_for[HEAP_SAMPLES_VALUE_ID]] = object_data->weight;
metric_values[position_for[HEAP_SIZE_VALUE_ID]] = object_data->size * object_data->weight;

ddog_prof_Label labels[2];
size_t label_offset = 0;
Expand Down Expand Up @@ -862,7 +882,7 @@ static VALUE _native_start_fake_slow_heap_serialization(DDTRACE_UNUSED VALUE _se
struct stack_recorder_state *state;
TypedData_Get_Struct(recorder_instance, struct stack_recorder_state, &stack_recorder_typed_data, state);

heap_recorder_prepare_iteration(state->heap_recorder);
heap_recorder_prepare_iteration(state->heap_recorder, false);

return Qnil;
}
Expand Down
1 change: 1 addition & 0 deletions lib/datadog/profiling/component.rb
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:)
cpu_time_enabled: RUBY_PLATFORM.include?('linux'), # Only supported on Linux currently
alloc_samples_enabled: allocation_profiling_enabled,
heap_samples_enabled: heap_profiling_enabled,
heap_size_enabled: heap_profiling_enabled,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note to our future selves, a separate config for this gets introduced in #3360 .

timeline_enabled: timeline_enabled,
)
end
Expand Down
6 changes: 5 additions & 1 deletion lib/datadog/profiling/stack_recorder.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@ module Profiling
# Note that `record_sample` is only accessible from native code.
# Methods prefixed with _native_ are implemented in `stack_recorder.c`
class StackRecorder
def initialize(cpu_time_enabled:, alloc_samples_enabled:, heap_samples_enabled:, timeline_enabled:)
def initialize(
cpu_time_enabled:, alloc_samples_enabled:, heap_samples_enabled:, heap_size_enabled:,
timeline_enabled:
)
# This mutex works in addition to the fancy C-level mutexes we have in the native side (see the docs there).
# It prevents multiple Ruby threads calling serialize at the same time -- something like
# `10.times { Thread.new { stack_recorder.serialize } }`.
Expand All @@ -18,6 +21,7 @@ def initialize(cpu_time_enabled:, alloc_samples_enabled:, heap_samples_enabled:,
cpu_time_enabled,
alloc_samples_enabled,
heap_samples_enabled,
heap_size_enabled,
timeline_enabled,
)
end
Expand Down
9 changes: 8 additions & 1 deletion sig/datadog/profiling/stack_recorder.rbs
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,20 @@ module Datadog
class StackRecorder
@no_concurrent_synchronize_mutex: ::Thread::Mutex

def initialize: (cpu_time_enabled: bool, alloc_samples_enabled: bool, heap_samples_enabled: bool, timeline_enabled: bool) -> void
def initialize: (
cpu_time_enabled: bool,
alloc_samples_enabled: bool,
heap_samples_enabled: bool,
heap_size_enabled: bool,
timeline_enabled: bool,
) -> void

def self._native_initialize: (
Datadog::Profiling::StackRecorder recorder_instance,
bool cpu_time_enabled,
bool alloc_samples_enabled,
bool heap_samples_enabled,
bool heap_size_enabled,
bool timeline_enabled,
) -> true

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
let(:allocation_sample_every) { 50 }
let(:allocation_profiling_enabled) { false }
let(:heap_profiling_enabled) { false }
let(:recorder) { build_stack_recorder(heap_samples_enabled: heap_profiling_enabled) }
let(:recorder) do
build_stack_recorder(heap_samples_enabled: heap_profiling_enabled, heap_size_enabled: heap_profiling_enabled)
end
let(:no_signals_workaround_enabled) { false }
let(:timeline_enabled) { false }
let(:options) { {} }
Expand Down Expand Up @@ -594,6 +596,8 @@
.find(&test_struct_heap_sample)

expect(relevant_sample.values[:'heap-live-samples']).to eq test_num_allocated_object
# 40 is the size of a basic object and we have test_num_allocated_object of them
expect(relevant_sample.values[:'heap-live-size']).to eq test_num_allocated_object * 40
end
end

Expand Down
3 changes: 2 additions & 1 deletion spec/datadog/profiling/spec_helper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -81,11 +81,12 @@ def samples_for_thread(samples, thread)

# We disable heap_sample collection by default in tests since it requires some extra mocking/
# setup for it to properly work.
def build_stack_recorder(heap_samples_enabled: false, timeline_enabled: false)
def build_stack_recorder(heap_samples_enabled: false, heap_size_enabled: false, timeline_enabled: false)
Datadog::Profiling::StackRecorder.new(
cpu_time_enabled: true,
alloc_samples_enabled: true,
heap_samples_enabled: heap_samples_enabled,
heap_size_enabled: heap_size_enabled,
timeline_enabled: timeline_enabled,
)
end
Expand Down
Loading
Loading