Merge pull request #3216 from DataDog/ivoanjo/backport-improve-profil…

…er-error-handling Backport #3197 to 1.x-stable: Shut down profiler if any components failed
DataDog · Oct 27, 2023 · f897407 · f897407
2 parents 98f7382 + 8065726
commit f897407
Show file tree

Hide file tree

Showing 9 changed files with 196 additions and 27 deletions.
diff --git a/lib/datadog/profiling/collectors/cpu_and_wall_time_worker.rb b/lib/datadog/profiling/collectors/cpu_and_wall_time_worker.rb
@@ -53,7 +53,7 @@ def initialize(
           @idle_sampling_helper = idle_sampling_helper
         end
 
-        def start
+        def start(on_failure_proc: nil)
           @start_stop_mutex.synchronize do
             return if @worker_thread && @worker_thread.alive?
 
@@ -74,6 +74,7 @@ def start
                   'CpuAndWallTimeWorker thread error. ' \
                   "Cause: #{e.class.name} #{e.message} Location: #{Array(e.backtrace).first}"
                 )
+                on_failure_proc&.call
               end
             end
             @worker_thread.name = self.class.name # Repeated from above to make sure thread gets named asap

diff --git a/lib/datadog/profiling/profiler.rb b/lib/datadog/profiling/profiler.rb
@@ -21,18 +21,42 @@ def start
           scheduler.reset_after_fork
         end
 
-        worker.start
-        scheduler.start
+        worker.start(on_failure_proc: proc { component_failed(:worker) })
+        scheduler.start(on_failure_proc: proc { component_failed(:scheduler) })
       end
 
       def shutdown!
         Datadog.logger.debug('Shutting down profiler')
 
+        stop_worker
+        stop_scheduler
+      end
+
+      private
+
+      def stop_worker
         worker.stop
+      end
 
+      def stop_scheduler
         scheduler.enabled = false
         scheduler.stop(true)
       end
+
+      def component_failed(failed_component)
+        Datadog.logger.warn(
+          "Detected issue with profiler (#{failed_component} component), stopping profiling. " \
+          'See previous log messages for details.'
+        )
+
+        if failed_component == :worker
+          stop_scheduler
+        elsif failed_component == :scheduler
+          stop_worker
+        else
+          raise ArgumentError, "Unexpected failed_component: #{failed_component.inspect}"
+        end
+      end
     end
   end
 end
diff --git a/lib/datadog/profiling/scheduler.rb b/lib/datadog/profiling/scheduler.rb
@@ -45,20 +45,27 @@ def initialize(
         self.enabled = enabled
       end
 
-      def start
-        perform
+      def start(on_failure_proc: nil)
+        perform(on_failure_proc)
       end
 
-      def perform
-        # A profiling flush may be called while the VM is shutting down, to report the last profile. When we do so,
-        # we impose a strict timeout. This means this last profile may or may not be sent, depending on if the flush can
-        # successfully finish in the strict timeout.
-        # This can be somewhat confusing (why did it not get reported?), so let's at least log what happened.
-        interrupted = true
-
+      def perform(on_failure_proc)
         begin
+          # A profiling flush may be called while the VM is shutting down, to report the last profile. When we do so,
+          # we impose a strict timeout. This means this last profile may or may not be sent, depending on if the flush can
+          # successfully finish in the strict timeout.
+          # This can be somewhat confusing (why did it not get reported?), so let's at least log what happened.
+          interrupted = true
+
           flush_and_wait
           interrupted = false
+        rescue Exception => e # rubocop:disable Lint/RescueException
+          Datadog.logger.warn(
+            'Profiling::Scheduler thread error. ' \
+            "Cause: #{e.class.name} #{e.message} Location: #{Array(e.backtrace).first}"
+          )
+          on_failure_proc&.call
+          raise
         ensure
           Datadog.logger.debug('#flush was interrupted or failed before it could complete') if interrupted
         end

diff --git a/sig/datadog/profiling/collectors/cpu_and_wall_time_worker.rbs b/sig/datadog/profiling/collectors/cpu_and_wall_time_worker.rbs
@@ -28,7 +28,7 @@ module Datadog
           ::Integer allocation_sample_every,
         ) -> true
 
-        def start: () -> bool?
+        def start: (?on_failure_proc: ::Proc?) -> bool?
 
         def stop: () -> void
         def self._native_stop: (CpuAndWallTimeWorker self_instance, ::Thread worker_thread) -> true

diff --git a/sig/datadog/profiling/profiler.rbs b/sig/datadog/profiling/profiler.rbs
@@ -18,6 +18,12 @@ module Datadog
       def start: () -> void
 
       def shutdown!: () -> void
+
+      private
+
+      def stop_worker: () -> void
+      def stop_scheduler: () -> void
+      def component_failed: (:worker | :scheduler failed_component) -> void
     end
   end
 end
diff --git a/sig/datadog/profiling/scheduler.rbs b/sig/datadog/profiling/scheduler.rbs
@@ -11,7 +11,7 @@ module Datadog
         ?enabled: bool,
       ) -> void
 
-      def start: () -> void
+      def start: (?on_failure_proc: ::Proc?) -> void
 
       def reset_after_fork: () -> void
     end

diff --git a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb
@@ -1,9 +1,12 @@
 require 'datadog/profiling/spec_helper'
-require 'datadog/profiling/collectors/cpu_and_wall_time_worker'
 
-RSpec.describe Datadog::Profiling::Collectors::CpuAndWallTimeWorker do
+RSpec.describe 'Datadog::Profiling::Collectors::CpuAndWallTimeWorker' do
   before { skip_if_profiling_not_supported(self) }
 
+  # This is needed because this class uses syntax that doesn't work on Ruby 2.1/2.2; we can undo this once dd-trace-rb
+  # drops support for these Rubies globally
+  let(:described_class) { Datadog::Profiling::Collectors::CpuAndWallTimeWorker }
+
   let(:recorder) { build_stack_recorder }
   let(:endpoint_collection_enabled) { true }
   let(:gc_profiling_enabled) { true }
@@ -526,6 +529,19 @@
         process_waiter_thread.join
       end
     end
+
+    context 'when the _native_sampling_loop terminates with an exception' do
+      it 'calls the on_failure_proc' do
+        expect(described_class).to receive(:_native_sampling_loop).and_raise(StandardError.new('Simulated error'))
+        expect(Datadog.logger).to receive(:warn)
+
+        proc_called = Queue.new
+
+        cpu_and_wall_time_worker.start(on_failure_proc: proc { proc_called << true })
+
+        proc_called.pop
+      end
+    end
   end
 
   describe 'Ractor safety' do

diff --git a/spec/datadog/profiling/profiler_spec.rb b/spec/datadog/profiling/profiler_spec.rb
@@ -14,9 +14,9 @@
   describe '#start' do
     subject(:start) { profiler.start }
 
-    it 'signals collectors and scheduler to start' do
-      expect(worker).to receive(:start)
-      expect(scheduler).to receive(:start)
+    it 'signals the worker and scheduler to start' do
+      expect(worker).to receive(:start).with(on_failure_proc: an_instance_of(Proc))
+      expect(scheduler).to receive(:start).with(on_failure_proc: an_instance_of(Proc))
 
       start
     end
@@ -52,4 +52,69 @@
       shutdown!
     end
   end
+
+  describe 'Component failure handling' do
+    let(:worker) { instance_double(Datadog::Profiling::Collectors::CpuAndWallTimeWorker, start: nil) }
+    let(:scheduler) { instance_double(Datadog::Profiling::Scheduler, start: nil) }
+
+    before { allow(Datadog.logger).to receive(:warn) }
+
+    context 'when the worker failed' do
+      let(:worker_on_failure) do
+        on_failure = nil
+        expect(worker).to receive(:start) { |on_failure_proc:| on_failure = on_failure_proc }
+
+        profiler.start
+
+        on_failure.call
+      end
+
+      it 'logs the issue' do
+        allow(scheduler).to receive(:enabled=)
+        allow(scheduler).to receive(:stop)
+
+        expect(Datadog.logger).to receive(:warn).with(/worker component/)
+
+        worker_on_failure
+      end
+
+      it 'stops the scheduler' do
+        expect(scheduler).to receive(:enabled=).with(false)
+        expect(scheduler).to receive(:stop).with(true)
+
+        worker_on_failure
+      end
+    end
+
+    context 'when the scheduler failed' do
+      let(:scheduler_on_failure) do
+        on_failure = nil
+        expect(scheduler).to receive(:start) { |on_failure_proc:| on_failure = on_failure_proc }
+
+        profiler.start
+
+        on_failure.call
+      end
+
+      it 'logs the issue' do
+        allow(worker).to receive(:stop)
+
+        expect(Datadog.logger).to receive(:warn).with(/scheduler component/)
+
+        scheduler_on_failure
+      end
+
+      it 'stops the worker' do
+        expect(worker).to receive(:stop)
+
+        scheduler_on_failure
+      end
+    end
+
+    context 'when unknown component failed' do
+      it 'raises an ArgumentError' do
+        expect { profiler.send(:component_failed, 'test') }.to raise_error(ArgumentError, /failed_component: "test"/)
+      end
+    end
+  end
 end
diff --git a/spec/datadog/profiling/scheduler_spec.rb b/spec/datadog/profiling/scheduler_spec.rb
@@ -1,16 +1,17 @@
-require 'spec_helper'
+require 'datadog/profiling/spec_helper'
 
-require 'datadog/profiling/http_transport'
-require 'datadog/profiling/exporter'
-require 'datadog/profiling/scheduler'
-
-RSpec.describe Datadog::Profiling::Scheduler do
-  subject(:scheduler) { described_class.new(exporter: exporter, transport: transport, **options) }
+RSpec.describe 'Datadog::Profiling::Scheduler' do
+  before { skip_if_profiling_not_supported(self) }
 
+  # This is needed because this class uses syntax that doesn't work on Ruby 2.1/2.2; we can undo this once dd-trace-rb
+  # drops support for these Rubies globally
+  let(:described_class) { Datadog::Profiling::Scheduler }
   let(:exporter) { instance_double(Datadog::Profiling::Exporter) }
   let(:transport) { instance_double(Datadog::Profiling::HttpTransport) }
   let(:options) { {} }
 
+  subject(:scheduler) { described_class.new(exporter: exporter, transport: transport, **options) }
+
   describe '.new' do
     describe 'default settings' do
       it do
@@ -81,6 +82,55 @@
           result: nil
         )
       end
+
+      context 'when perform fails' do
+        before { Thread.report_on_exception = false if Thread.respond_to?(:report_on_exception=) }
+        after  { Thread.report_on_exception = true  if Thread.respond_to?(:report_on_exception=) }
+
+        it 'calls the on_failure_proc and logs the error' do
+          expect(scheduler).to receive(:flush_and_wait).and_raise(StandardError.new('Simulated error'))
+
+          # This is a bit ugly, but we want the logic in the background thread to be called immediately, and by
+          # default we don't do that
+          expect(scheduler).to receive(:loop_wait_before_first_iteration?).and_return(false)
+          expect(scheduler).to receive(:work_pending?).and_return(true)
+
+          allow(Datadog.logger).to receive(:debug)
+
+          expect(Datadog.logger).to receive(:warn).with(/Profiling::Scheduler thread error/)
+
+          proc_called = Queue.new
+
+          scheduler.start(on_failure_proc: proc { proc_called << true })
+
+          proc_called.pop
+        end
+      end
+
+      context 'when perform is interrupted' do
+        it 'logs the interruption' do
+          inside_flush = Queue.new
+
+          # This is a bit ugly, but we want the logic in the background thread to be called immediately, and by
+          # default we don't do that
+          expect(scheduler).to receive(:loop_wait_before_first_iteration?).and_return(false)
+          expect(scheduler).to receive(:work_pending?).and_return(true)
+
+          allow(Datadog.logger).to receive(:debug)
+          expect(Datadog.logger).to receive(:debug).with(/#flush was interrupted or failed/)
+
+          expect(scheduler).to receive(:flush_and_wait) do
+            inside_flush << true
+            sleep
+          end
+
+          scheduler.start
+          inside_flush.pop
+
+          scheduler.stop(true, 0)
+          scheduler.join
+        end
+      end
     end
   end
 
@@ -108,7 +158,7 @@
       let(:options) { { **super(), interval: 0.01 } }
 
       # Assert that the interval isn't set below the min interval
-      it "floors the wait interval to #{described_class.const_get(:MINIMUM_INTERVAL_SECONDS)}" do
+      it 'floors the wait interval to MINIMUM_INTERVAL_SECONDS' do
         expect(scheduler).to receive(:loop_wait_time=)
           .with(described_class.const_get(:MINIMUM_INTERVAL_SECONDS))