From 6acbe0497a4ca37894447bc3970aee9b3433c005 Mon Sep 17 00:00:00 2001
From: Ivo Anjo
Date: Thu, 23 Feb 2023 09:45:42 +0000
Subject: [PATCH 1/2] Reuse array for listing threads instead of creating one every time

**What does this PR do?**:

This PR changes the `ddtrace_thread_list` method to receive a pre-existing array to fill in, instead of creating a new Ruby array for its result every time. It then changes the `CpuAndWallTime` collector to always reuse the same array for this list.

**Motivation**:

The thread list array was the single Ruby object allocation that the profiler performed on every sample. Now that we're exposing object counts and experimenting with allocation profiling, this recurring allocation was really annoying to deal with when looking at data and writing tests.

By reusing the same object, this problem goes away, and some specs can even be simplified.

**Additional Notes**:

N/A

**How to test the change?**:

This change is covered by the existing unit tests.
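As a minimal Ruby-level sketch of the reuse pattern (the real change lives in the C extension below; `ThreadLister` is a hypothetical name used only for illustration):

```ruby
class ThreadLister
  def initialize
    @thread_list_buffer = [] # allocated once, like state->thread_list_buffer below
  end

  def thread_list
    @thread_list_buffer.clear # drop entries from the previous sample
    Thread.list.each { |thread| @thread_list_buffer << thread }
    @thread_list_buffer # the same Array object gets reused on every call
  end
end
```

Note that `Thread.list` itself allocates a new array, so this sketch only illustrates the buffer-reuse shape; the native code below avoids even that allocation by walking the VM's internal thread list directly.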
---
 .../collectors_cpu_and_wall_time.c     | 20 ++++++++++++++++---
 .../private_vm_api_access.c            |  7 ++-----
 .../private_vm_api_access.h            |  2 +-
 .../cpu_and_wall_time_worker_spec.rb   | 10 ++++------
 4 files changed, 24 insertions(+), 15 deletions(-)

diff --git a/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.c b/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.c
index 600d0c1a0f1..c30c0c5964d 100644
--- a/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.c
+++ b/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.c
@@ -92,6 +92,8 @@ struct cpu_and_wall_time_collector_state {
   // Currently **outside** of stats struct because we also use it to decide when to clean the contexts, and thus this
   // is not (just) a stat.
   unsigned int sample_count;
+  // Reusable array to get list of threads
+  VALUE thread_list_buffer;

   struct stats {
     // Track how many garbage collection samples we've taken.
@@ -174,6 +176,7 @@ static VALUE _native_stats(VALUE self, VALUE collector_instance);
 static void trace_identifiers_for(struct cpu_and_wall_time_collector_state *state, VALUE thread, struct trace_identifiers *trace_identifiers_result);
 static bool is_type_web(VALUE root_span_type);
 static VALUE _native_reset_after_fork(DDTRACE_UNUSED VALUE self, VALUE collector_instance);
+static VALUE thread_list(struct cpu_and_wall_time_collector_state *state);

 void collectors_cpu_and_wall_time_init(VALUE profiling_module) {
   VALUE collectors_module = rb_define_module_under(profiling_module, "Collectors");
@@ -231,6 +234,7 @@ static void cpu_and_wall_time_collector_typed_data_mark(void *state_ptr) {
   // Update this when modifying state struct
   rb_gc_mark(state->recorder_instance);
   st_foreach(state->hash_map_per_thread_context, hash_map_per_thread_context_mark, 0 /* unused */);
+  rb_gc_mark(state->thread_list_buffer);
 }

 static void cpu_and_wall_time_collector_typed_data_free(void *state_ptr) {
@@ -274,6 +278,7 @@ static VALUE _native_new(VALUE klass) {
     st_init_numtable();
   state->recorder_instance = Qnil;
   state->tracer_context_key = MISSING_TRACER_CONTEXT_KEY;
+  state->thread_list_buffer = rb_ary_new();

   return TypedData_Wrap_Struct(klass, &cpu_and_wall_time_collector_typed_data, state);
 }
@@ -349,7 +354,7 @@ void cpu_and_wall_time_collector_sample(VALUE self_instance, long current_monoto
   struct per_thread_context *current_thread_context = get_or_create_context_for(current_thread, state);
   long cpu_time_at_sample_start_for_current_thread = cpu_time_now_ns(current_thread_context);

-  VALUE threads = ddtrace_thread_list();
+  VALUE threads = thread_list(state);

   const long thread_count = RARRAY_LEN(threads);
   for (long i = 0; i < thread_count; i++) {
@@ -513,7 +518,7 @@ VALUE cpu_and_wall_time_collector_sample_after_gc(VALUE self_instance) {
   struct cpu_and_wall_time_collector_state *state;
   TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_collector_state, &cpu_and_wall_time_collector_typed_data, state);

-  VALUE threads = ddtrace_thread_list();
+  VALUE threads = thread_list(state);
   bool sampled_any_thread = false;

   const long thread_count = RARRAY_LEN(threads);
@@ -661,7 +666,9 @@ static void trigger_sample_for_thread(

 // This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTime behavior using RSpec.
 // It SHOULD NOT be used for other purposes.
 static VALUE _native_thread_list(DDTRACE_UNUSED VALUE _self) {
-  return ddtrace_thread_list();
+  VALUE result = rb_ary_new();
+  ddtrace_thread_list(result);
+  return result;
 }

 static struct per_thread_context *get_or_create_context_for(VALUE thread, struct cpu_and_wall_time_collector_state *state) {
@@ -934,3 +941,10 @@ static VALUE _native_reset_after_fork(DDTRACE_UNUSED VALUE self, VALUE collector

   return Qtrue;
 }
+
+static VALUE thread_list(struct cpu_and_wall_time_collector_state *state) {
+  VALUE result = state->thread_list_buffer;
+  rb_ary_clear(result);
+  ddtrace_thread_list(result);
+  return result;
+}
diff --git a/ext/ddtrace_profiling_native_extension/private_vm_api_access.c b/ext/ddtrace_profiling_native_extension/private_vm_api_access.c
index 9e54d67e40f..238b69efcb7 100644
--- a/ext/ddtrace_profiling_native_extension/private_vm_api_access.c
+++ b/ext/ddtrace_profiling_native_extension/private_vm_api_access.c
@@ -204,8 +204,7 @@ ptrdiff_t stack_depth_for(VALUE thread) {
 #endif

 // Tries to match rb_thread_list() but that method isn't accessible to extensions
-VALUE ddtrace_thread_list(void) {
-  VALUE result = rb_ary_new();
+void ddtrace_thread_list(VALUE result_array) {
   rb_thread_t *thread = NULL;

   // Ruby 3 Safety: Our implementation is inspired by `rb_ractor_thread_list` BUT that method wraps the operations below
@@ -234,13 +233,11 @@ VALUE ddtrace_thread_list(void) {
       case THREAD_RUNNABLE:
       case THREAD_STOPPED:
       case THREAD_STOPPED_FOREVER:
-        rb_ary_push(result, thread->self);
+        rb_ary_push(result_array, thread->self);
       default:
         break;
     }
   }
-
-  return result;
 }

 bool is_thread_alive(VALUE thread) {
diff --git a/ext/ddtrace_profiling_native_extension/private_vm_api_access.h b/ext/ddtrace_profiling_native_extension/private_vm_api_access.h
index f7985795d12..62f39c52f30 100644
--- a/ext/ddtrace_profiling_native_extension/private_vm_api_access.h
+++ b/ext/ddtrace_profiling_native_extension/private_vm_api_access.h
@@ -23,7 +23,7 @@ bool is_current_thread_holding_the_gvl(void);
 current_gvl_owner gvl_owner(void);
 uint64_t native_thread_id_for(VALUE thread);
 ptrdiff_t stack_depth_for(VALUE thread);
-VALUE ddtrace_thread_list(void);
+void ddtrace_thread_list(VALUE result_array);
 bool is_thread_alive(VALUE thread);
 VALUE thread_name_for(VALUE thread);

diff --git a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb
index 5c52aaed0b6..f2453fee53e 100644
--- a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb
+++ b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb
@@ -546,9 +546,7 @@
       100.times { Object.new }
       after_allocations = described_class._native_allocation_count

-      # The profiler can (currently) cause extra allocations, which is why this is not exactly 100
-      expect(after_allocations - before_allocations).to be >= 100
-      expect(after_allocations - before_allocations).to be < 110
+      expect(after_allocations - before_allocations).to be 100
     end

     it 'returns different numbers of allocations for different threads' do
@@ -573,10 +571,10 @@

       background_t1.join

-      # This test checks that even though we observed >= 100 allocations in a background thread t1, the counters for
-      # the current thread were not affected by this change (even though they may be slightly affected by the profiler)
+      # This test checks that even though we observed 100 allocations in a background thread t1, the counters for
+      # the current thread were not affected by this change

-      expect(after_t1 - before_t1).to be >= 100
+      expect(after_t1 - before_t1).to be 100
       expect(after_allocations - before_allocations).to be < 10
     end
   end
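For readers unfamiliar with the counting approach in the spec above: `_native_allocation_count` is the profiler's own counter. As a rough standalone analogy (not what the spec uses), plain Ruby's `GC.stat` exposes a similar cumulative count:

```ruby
before = GC.stat(:total_allocated_objects)
100.times { Object.new }
after = GC.stat(:total_allocated_objects)
# Roughly 100, plus any incidental allocations the VM makes in between
puts after - before
```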
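To make the warm-up trick from this commit concrete, here is a standalone sketch using plain Ruby's `GC.stat` in place of the profiler's `_native_allocation_count` (an analogy, not the spec's actual mechanism):

```ruby
new_object = proc { Object.new }
1.times(&new_object) # warm-up: on Ruby 3, the first call may allocate cache entries

before = GC.stat(:total_allocated_objects)
100.times(&new_object) # each call allocates exactly one Object
after = GC.stat(:total_allocated_objects)
puts after - before # 100, barring incidental allocations by other threads
```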