Skip to content

Commit

Permalink
Merge pull request #3216 from DataDog/ivoanjo/backport-improve-profil…
Browse files Browse the repository at this point in the history
…er-error-handling

Backport #3197 to 1.x-stable: Shut down profiler if any components failed
  • Loading branch information
ivoanjo authored Oct 27, 2023
2 parents 98f7382 + 8065726 commit f897407
Show file tree
Hide file tree
Showing 9 changed files with 196 additions and 27 deletions.
3 changes: 2 additions & 1 deletion lib/datadog/profiling/collectors/cpu_and_wall_time_worker.rb
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def initialize(
@idle_sampling_helper = idle_sampling_helper
end

def start
def start(on_failure_proc: nil)
@start_stop_mutex.synchronize do
return if @worker_thread && @worker_thread.alive?

Expand All @@ -74,6 +74,7 @@ def start
'CpuAndWallTimeWorker thread error. ' \
"Cause: #{e.class.name} #{e.message} Location: #{Array(e.backtrace).first}"
)
on_failure_proc&.call
end
end
@worker_thread.name = self.class.name # Repeated from above to make sure thread gets named asap
Expand Down
28 changes: 26 additions & 2 deletions lib/datadog/profiling/profiler.rb
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,42 @@ def start
scheduler.reset_after_fork
end

worker.start
scheduler.start
worker.start(on_failure_proc: proc { component_failed(:worker) })
scheduler.start(on_failure_proc: proc { component_failed(:scheduler) })
end

def shutdown!
Datadog.logger.debug('Shutting down profiler')

stop_worker
stop_scheduler
end

private

def stop_worker
worker.stop
end

def stop_scheduler
scheduler.enabled = false
scheduler.stop(true)
end

def component_failed(failed_component)
Datadog.logger.warn(
"Detected issue with profiler (#{failed_component} component), stopping profiling. " \
'See previous log messages for details.'
)

if failed_component == :worker
stop_scheduler
elsif failed_component == :scheduler
stop_worker
else
raise ArgumentError, "Unexpected failed_component: #{failed_component.inspect}"
end
end
end
end
end
25 changes: 16 additions & 9 deletions lib/datadog/profiling/scheduler.rb
Original file line number Diff line number Diff line change
Expand Up @@ -45,20 +45,27 @@ def initialize(
self.enabled = enabled
end

def start
perform
def start(on_failure_proc: nil)
perform(on_failure_proc)
end

def perform
# A profiling flush may be called while the VM is shutting down, to report the last profile. When we do so,
# we impose a strict timeout. This means this last profile may or may not be sent, depending on if the flush can
# successfully finish in the strict timeout.
# This can be somewhat confusing (why did it not get reported?), so let's at least log what happened.
interrupted = true

def perform(on_failure_proc)
begin
# A profiling flush may be called while the VM is shutting down, to report the last profile. When we do so,
# we impose a strict timeout. This means this last profile may or may not be sent, depending on if the flush can
# successfully finish in the strict timeout.
# This can be somewhat confusing (why did it not get reported?), so let's at least log what happened.
interrupted = true

flush_and_wait
interrupted = false
rescue Exception => e # rubocop:disable Lint/RescueException
Datadog.logger.warn(
'Profiling::Scheduler thread error. ' \
"Cause: #{e.class.name} #{e.message} Location: #{Array(e.backtrace).first}"
)
on_failure_proc&.call
raise
ensure
Datadog.logger.debug('#flush was interrupted or failed before it could complete') if interrupted
end
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ module Datadog
::Integer allocation_sample_every,
) -> true

def start: () -> bool?
def start: (?on_failure_proc: ::Proc?) -> bool?

def stop: () -> void
def self._native_stop: (CpuAndWallTimeWorker self_instance, ::Thread worker_thread) -> true
Expand Down
6 changes: 6 additions & 0 deletions sig/datadog/profiling/profiler.rbs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@ module Datadog
def start: () -> void

def shutdown!: () -> void

private

def stop_worker: () -> void
def stop_scheduler: () -> void
def component_failed: (:worker | :scheduler failed_component) -> void
end
end
end
2 changes: 1 addition & 1 deletion sig/datadog/profiling/scheduler.rbs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ module Datadog
?enabled: bool,
) -> void

def start: () -> void
def start: (?on_failure_proc: ::Proc?) -> void

def reset_after_fork: () -> void
end
Expand Down
20 changes: 18 additions & 2 deletions spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
require 'datadog/profiling/spec_helper'
require 'datadog/profiling/collectors/cpu_and_wall_time_worker'

RSpec.describe Datadog::Profiling::Collectors::CpuAndWallTimeWorker do
RSpec.describe 'Datadog::Profiling::Collectors::CpuAndWallTimeWorker' do
before { skip_if_profiling_not_supported(self) }

# This is needed because this class uses syntax that doesn't work on Ruby 2.1/2.2; we can undo this once dd-trace-rb
# drops support for these Rubies globally
let(:described_class) { Datadog::Profiling::Collectors::CpuAndWallTimeWorker }

let(:recorder) { build_stack_recorder }
let(:endpoint_collection_enabled) { true }
let(:gc_profiling_enabled) { true }
Expand Down Expand Up @@ -526,6 +529,19 @@
process_waiter_thread.join
end
end

context 'when the _native_sampling_loop terminates with an exception' do
it 'calls the on_failure_proc' do
expect(described_class).to receive(:_native_sampling_loop).and_raise(StandardError.new('Simulated error'))
expect(Datadog.logger).to receive(:warn)

proc_called = Queue.new

cpu_and_wall_time_worker.start(on_failure_proc: proc { proc_called << true })

proc_called.pop
end
end
end

describe 'Ractor safety' do
Expand Down
71 changes: 68 additions & 3 deletions spec/datadog/profiling/profiler_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@
describe '#start' do
subject(:start) { profiler.start }

it 'signals collectors and scheduler to start' do
expect(worker).to receive(:start)
expect(scheduler).to receive(:start)
it 'signals the worker and scheduler to start' do
expect(worker).to receive(:start).with(on_failure_proc: an_instance_of(Proc))
expect(scheduler).to receive(:start).with(on_failure_proc: an_instance_of(Proc))

start
end
Expand Down Expand Up @@ -52,4 +52,69 @@
shutdown!
end
end

describe 'Component failure handling' do
let(:worker) { instance_double(Datadog::Profiling::Collectors::CpuAndWallTimeWorker, start: nil) }
let(:scheduler) { instance_double(Datadog::Profiling::Scheduler, start: nil) }

before { allow(Datadog.logger).to receive(:warn) }

context 'when the worker failed' do
let(:worker_on_failure) do
on_failure = nil
expect(worker).to receive(:start) { |on_failure_proc:| on_failure = on_failure_proc }

profiler.start

on_failure.call
end

it 'logs the issue' do
allow(scheduler).to receive(:enabled=)
allow(scheduler).to receive(:stop)

expect(Datadog.logger).to receive(:warn).with(/worker component/)

worker_on_failure
end

it 'stops the scheduler' do
expect(scheduler).to receive(:enabled=).with(false)
expect(scheduler).to receive(:stop).with(true)

worker_on_failure
end
end

context 'when the scheduler failed' do
let(:scheduler_on_failure) do
on_failure = nil
expect(scheduler).to receive(:start) { |on_failure_proc:| on_failure = on_failure_proc }

profiler.start

on_failure.call
end

it 'logs the issue' do
allow(worker).to receive(:stop)

expect(Datadog.logger).to receive(:warn).with(/scheduler component/)

scheduler_on_failure
end

it 'stops the worker' do
expect(worker).to receive(:stop)

scheduler_on_failure
end
end

context 'when unknown component failed' do
it 'raises an ArgumentError' do
expect { profiler.send(:component_failed, 'test') }.to raise_error(ArgumentError, /failed_component: "test"/)
end
end
end
end
66 changes: 58 additions & 8 deletions spec/datadog/profiling/scheduler_spec.rb
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
require 'spec_helper'
require 'datadog/profiling/spec_helper'

require 'datadog/profiling/http_transport'
require 'datadog/profiling/exporter'
require 'datadog/profiling/scheduler'

RSpec.describe Datadog::Profiling::Scheduler do
subject(:scheduler) { described_class.new(exporter: exporter, transport: transport, **options) }
RSpec.describe 'Datadog::Profiling::Scheduler' do
before { skip_if_profiling_not_supported(self) }

# This is needed because this class uses syntax that doesn't work on Ruby 2.1/2.2; we can undo this once dd-trace-rb
# drops support for these Rubies globally
let(:described_class) { Datadog::Profiling::Scheduler }
let(:exporter) { instance_double(Datadog::Profiling::Exporter) }
let(:transport) { instance_double(Datadog::Profiling::HttpTransport) }
let(:options) { {} }

subject(:scheduler) { described_class.new(exporter: exporter, transport: transport, **options) }

describe '.new' do
describe 'default settings' do
it do
Expand Down Expand Up @@ -81,6 +82,55 @@
result: nil
)
end

context 'when perform fails' do
before { Thread.report_on_exception = false if Thread.respond_to?(:report_on_exception=) }
after { Thread.report_on_exception = true if Thread.respond_to?(:report_on_exception=) }

it 'calls the on_failure_proc and logs the error' do
expect(scheduler).to receive(:flush_and_wait).and_raise(StandardError.new('Simulated error'))

# This is a bit ugly, but we want the logic in the background thread to be called immediately, and by
# default we don't do that
expect(scheduler).to receive(:loop_wait_before_first_iteration?).and_return(false)
expect(scheduler).to receive(:work_pending?).and_return(true)

allow(Datadog.logger).to receive(:debug)

expect(Datadog.logger).to receive(:warn).with(/Profiling::Scheduler thread error/)

proc_called = Queue.new

scheduler.start(on_failure_proc: proc { proc_called << true })

proc_called.pop
end
end

context 'when perform is interrupted' do
it 'logs the interruption' do
inside_flush = Queue.new

# This is a bit ugly, but we want the logic in the background thread to be called immediately, and by
# default we don't do that
expect(scheduler).to receive(:loop_wait_before_first_iteration?).and_return(false)
expect(scheduler).to receive(:work_pending?).and_return(true)

allow(Datadog.logger).to receive(:debug)
expect(Datadog.logger).to receive(:debug).with(/#flush was interrupted or failed/)

expect(scheduler).to receive(:flush_and_wait) do
inside_flush << true
sleep
end

scheduler.start
inside_flush.pop

scheduler.stop(true, 0)
scheduler.join
end
end
end
end

Expand Down Expand Up @@ -108,7 +158,7 @@
let(:options) { { **super(), interval: 0.01 } }

# Assert that the interval isn't set below the min interval
it "floors the wait interval to #{described_class.const_get(:MINIMUM_INTERVAL_SECONDS)}" do
it 'floors the wait interval to MINIMUM_INTERVAL_SECONDS' do
expect(scheduler).to receive(:loop_wait_time=)
.with(described_class.const_get(:MINIMUM_INTERVAL_SECONDS))

Expand Down

0 comments on commit f897407

Please sign in to comment.