Skip to content

Commit

Permalink
Refactor timeout handling
Browse files Browse the repository at this point in the history
Now all timeout actions use the `timeout_signal` callback
which allows to customize the signal sent on timeout.

e.g. if you wish to get debug info you may send SIGBUS to get
crash reports etc.
  • Loading branch information
byroot committed Dec 12, 2023
1 parent 53a6c45 commit 9ca1541
Show file tree
Hide file tree
Showing 5 changed files with 86 additions and 74 deletions.
53 changes: 0 additions & 53 deletions lib/pitchfork.rb
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ module Pitchfork

# :stopdoc:

FORK_TIMEOUT = 5
FORK_LOCK = Monitor.new
@socket_type = :SOCK_SEQPACKET

Expand Down Expand Up @@ -198,58 +197,6 @@ def clean_fork(setpgid: true, &block)
end
end

def fork_sibling(role, &block)
if REFORKING_AVAILABLE
r, w = Pitchfork::Info.keep_ios(IO.pipe)
# We double fork so that the new worker is re-attached back
# to the master.
# This requires either PR_SET_CHILD_SUBREAPER which is exclusive to Linux 3.4
# or the master to be PID 1.
if middle_pid = FORK_LOCK.synchronize { Process.fork } # parent
w.close
# We need to wait(2) so that the middle process doesn't end up a zombie.
# The process only call fork again an exit so it should be pretty fast.
# However it might need to execute some `Process._fork` or `at_exit` callbacks,
# so it case it takes more than 5 seconds to exit, we kill it with SIGBUS
# to produce a crash report, as this is indicative of a nasty bug.
process_wait_with_timeout(middle_pid, FORK_TIMEOUT, :BUS)
pid_str = r.gets
r.close
if pid_str
Integer(pid_str)
else
raise ForkFailure, "fork_sibling didn't succeed in #{FORK_TIMEOUT} seconds"
end
else # first child
r.close
Process.setproctitle("<pitchfork fork_sibling(#{role})>")
pid = clean_fork do
# detach into a grand child
w.close
yield
end
w.puts(pid)
w.close
exit
end
else
clean_fork(&block)
end
end

def process_wait_with_timeout(pid, timeout, timeout_signal = :KILL)
(timeout * 200).times do
_, status = Process.waitpid2(pid, Process::WNOHANG)
return status if status
sleep 0.005 # 200 * 5ms => 1s
end

# The process didn't exit in the allotted time, so we kill it.
Process.kill(timeout_signal, pid)
_, status = Process.waitpid2(pid)
status
end

def time_now(int = false)
Process.clock_gettime(Process::CLOCK_MONOTONIC, int ? :second : :float_second)
end
Expand Down
8 changes: 0 additions & 8 deletions lib/pitchfork/children.rb
Original file line number Diff line number Diff line change
Expand Up @@ -138,14 +138,6 @@ def hard_kill_all(sig)
end
end

def hard_timeout(child)
child.hard_timeout!
rescue Errno::ESRCH
reap(child.pid)
child.close
true
end

def workers
@workers.values
end
Expand Down
14 changes: 14 additions & 0 deletions lib/pitchfork/configurator.rb
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ class Configurator
:soft_timeout => 20,
:cleanup_timeout => 2,
:spawn_timeout => 10,
:timeout_signal => -> (_pid) { :KILL },
:timeout => 22,
:logger => default_logger,
:worker_processes => 1,
Expand Down Expand Up @@ -180,6 +181,19 @@ def timeout(seconds, cleanup: 2)
set_int(:timeout, soft_timeout + cleanup_timeout, 5)
end

def timeout_signal(*args, &block)
if block_given?
set_hook(:timeout_signal, block, 1)
elsif args.first.respond_to?(:call)
set_hook(:timeout_signal, args.first, 1)
elsif args.first.is_a?(Symbol)
signal = args.first
set_hook(:timeout_signal, ->(_pid) { signal }, 1)
else
raise ArgumentError, "timeout_signal must be a symbol or a proc"
end
end

def spawn_timeout(seconds)
set_int(:spawn_timeout, seconds, 1)
end
Expand Down
81 changes: 72 additions & 9 deletions lib/pitchfork/http_server.rb
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def extend_deadline(extra_time)
end

# :stopdoc:
attr_accessor :app, :timeout, :soft_timeout, :cleanup_timeout, :spawn_timeout, :worker_processes,
attr_accessor :app, :timeout, :timeout_signal, :soft_timeout, :cleanup_timeout, :spawn_timeout, :worker_processes,
:before_fork, :after_worker_fork, :after_mold_fork,
:listener_opts, :children,
:orig_app, :config, :ready_pipe,
Expand Down Expand Up @@ -404,7 +404,12 @@ def stop(graceful = true)
return StopIteration
end
end
@children.hard_kill_all(:KILL)

@children.each do |child|
if child.pid
@children.hard_kill(@timeout_signal.call(child.pid), child)
end
end
@promotion_lock.unlink
end

Expand Down Expand Up @@ -534,7 +539,7 @@ def hard_timeout(child)
else
logger.error "worker=#{child.nr} pid=#{child.pid} gen=#{child.generation} timed out, killing"
end
@children.hard_timeout(child) # take no prisoners for hard timeout violations
@children.hard_kill(@timeout_signal.call(child.pid), child) # take no prisoners for hard timeout violations
end

def trigger_refork
Expand Down Expand Up @@ -571,7 +576,7 @@ def spawn_worker(worker, detach:)
# the monitor process will kill it.
worker.update_deadline(@spawn_timeout)
@before_fork&.call(self)
Pitchfork.fork_sibling("spawn_worker") do
fork_sibling("spawn_worker") do
worker.pid = Process.pid

after_fork_internal
Expand Down Expand Up @@ -844,7 +849,7 @@ def worker_loop(worker)
case client
when Message::PromoteWorker
if Info.fork_safe?
spawn_mold(worker.generation)
spawn_mold(worker)
else
logger.error("worker=#{worker.nr} gen=#{worker.generation} is no longer fork safe, can't refork")
end
Expand All @@ -865,7 +870,7 @@ def worker_loop(worker)
if @refork_condition && Info.fork_safe? && !worker.outdated?
if @refork_condition.met?(worker, logger)
proc_name status: "requests: #{worker.requests_count}, spawning mold"
if spawn_mold(worker.generation)
if spawn_mold(worker)
logger.info("worker=#{worker.nr} gen=#{worker.generation} Refork condition met, promoting ourselves")
end
@refork_condition.backoff!
Expand All @@ -880,14 +885,16 @@ def worker_loop(worker)
end
end

def spawn_mold(current_generation)
def spawn_mold(worker)
return false unless @promotion_lock.try_lock

worker.update_deadline(@spawn_timeout)

@before_fork&.call(self)

begin
Pitchfork.fork_sibling("spawn_mold") do
mold = Worker.new(nil, pid: Process.pid, generation: current_generation)
fork_sibling("spawn_mold") do
mold = Worker.new(nil, pid: Process.pid, generation: worker.generation)
mold.promote!(@spawn_timeout)
mold.start_promotion(@control_socket[1])
mold_loop(mold)
Expand Down Expand Up @@ -1002,5 +1009,61 @@ def prepare_timeout(worker)
handler.timeout_request = SoftTimeout.request(@soft_timeout, handler)
handler
end

FORK_TIMEOUT = 5

def fork_sibling(role, &block)
if REFORKING_AVAILABLE
r, w = Pitchfork::Info.keep_ios(IO.pipe)
# We double fork so that the new worker is re-attached back
# to the master.
# This requires either PR_SET_CHILD_SUBREAPER which is exclusive to Linux 3.4
# or the master to be PID 1.
if middle_pid = FORK_LOCK.synchronize { Process.fork } # parent
w.close
# We need to wait(2) so that the middle process doesn't end up a zombie.
# The process only call fork again an exit so it should be pretty fast.
# However it might need to execute some `Process._fork` or `at_exit` callbacks,
# as well as Ruby's cleanup procedure to run finalizers etc, and there is a risk
# of deadlock.
# So in case it takes more than 5 seconds to exit, we kill it.
# TODO: rather than to busy loop here, we handle it in the worker/mold loop
process_wait_with_timeout(middle_pid, FORK_TIMEOUT)
pid_str = r.gets
r.close
if pid_str
Integer(pid_str)
else
raise ForkFailure, "fork_sibling didn't succeed in #{FORK_TIMEOUT} seconds"
end
else # first child
r.close
Process.setproctitle("<pitchfork fork_sibling(#{role})>")
pid = clean_fork do
# detach into a grand child
w.close
yield
end
w.puts(pid)
w.close
exit
end
else
clean_fork(&block)
end
end

def process_wait_with_timeout(pid, timeout)
(timeout * 50).times do
_, status = Process.waitpid2(pid, Process::WNOHANG)
return status if status
sleep 0.02 # 50 * 20ms => 1s
end

# The process didn't exit in the allotted time, so we kill it.
Process.kill(@timeout_signal.call(pid), pid)
_, status = Process.waitpid2(pid)
status
end
end
end
4 changes: 0 additions & 4 deletions lib/pitchfork/worker.rb
Original file line number Diff line number Diff line change
Expand Up @@ -151,10 +151,6 @@ def hard_kill(sig)
Process.kill(sig, pid)
end

def hard_timeout!
hard_kill(:KILL)
end

# this only runs when the Rack app.call is not running
# act like a listener
def accept_nonblock(exception: nil) # :nodoc:
Expand Down

0 comments on commit 9ca1541

Please sign in to comment.