From 9ca15413b8d3c600abd6a573c2dfc200503d6d08 Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Tue, 12 Dec 2023 12:16:23 +0100 Subject: [PATCH] Refactor timeout handling Now all timeout actions use the `timeout_signal` callback which allows to customize the signal sent on timeout. e.g. if you wish to get debug info you may send SIGBUS to get crash reports etc. --- lib/pitchfork.rb | 53 ----------------------- lib/pitchfork/children.rb | 8 ---- lib/pitchfork/configurator.rb | 14 ++++++ lib/pitchfork/http_server.rb | 81 +++++++++++++++++++++++++++++++---- lib/pitchfork/worker.rb | 4 -- 5 files changed, 86 insertions(+), 74 deletions(-) diff --git a/lib/pitchfork.rb b/lib/pitchfork.rb index beab7df6..80700741 100644 --- a/lib/pitchfork.rb +++ b/lib/pitchfork.rb @@ -37,7 +37,6 @@ module Pitchfork # :stopdoc: - FORK_TIMEOUT = 5 FORK_LOCK = Monitor.new @socket_type = :SOCK_SEQPACKET @@ -198,58 +197,6 @@ def clean_fork(setpgid: true, &block) end end - def fork_sibling(role, &block) - if REFORKING_AVAILABLE - r, w = Pitchfork::Info.keep_ios(IO.pipe) - # We double fork so that the new worker is re-attached back - # to the master. - # This requires either PR_SET_CHILD_SUBREAPER which is exclusive to Linux 3.4 - # or the master to be PID 1. - if middle_pid = FORK_LOCK.synchronize { Process.fork } # parent - w.close - # We need to wait(2) so that the middle process doesn't end up a zombie. - # The process only call fork again an exit so it should be pretty fast. - # However it might need to execute some `Process._fork` or `at_exit` callbacks, - # so it case it takes more than 5 seconds to exit, we kill it with SIGBUS - # to produce a crash report, as this is indicative of a nasty bug. - process_wait_with_timeout(middle_pid, FORK_TIMEOUT, :BUS) - pid_str = r.gets - r.close - if pid_str - Integer(pid_str) - else - raise ForkFailure, "fork_sibling didn't succeed in #{FORK_TIMEOUT} seconds" - end - else # first child - r.close - Process.setproctitle("") - pid = clean_fork do - # detach into a grand child - w.close - yield - end - w.puts(pid) - w.close - exit - end - else - clean_fork(&block) - end - end - - def process_wait_with_timeout(pid, timeout, timeout_signal = :KILL) - (timeout * 200).times do - _, status = Process.waitpid2(pid, Process::WNOHANG) - return status if status - sleep 0.005 # 200 * 5ms => 1s - end - - # The process didn't exit in the allotted time, so we kill it. - Process.kill(timeout_signal, pid) - _, status = Process.waitpid2(pid) - status - end - def time_now(int = false) Process.clock_gettime(Process::CLOCK_MONOTONIC, int ? :second : :float_second) end diff --git a/lib/pitchfork/children.rb b/lib/pitchfork/children.rb index 416de4f0..cca292a6 100644 --- a/lib/pitchfork/children.rb +++ b/lib/pitchfork/children.rb @@ -138,14 +138,6 @@ def hard_kill_all(sig) end end - def hard_timeout(child) - child.hard_timeout! - rescue Errno::ESRCH - reap(child.pid) - child.close - true - end - def workers @workers.values end diff --git a/lib/pitchfork/configurator.rb b/lib/pitchfork/configurator.rb index cd2290d7..08727dc0 100644 --- a/lib/pitchfork/configurator.rb +++ b/lib/pitchfork/configurator.rb @@ -34,6 +34,7 @@ class Configurator :soft_timeout => 20, :cleanup_timeout => 2, :spawn_timeout => 10, + :timeout_signal => -> (_pid) { :KILL }, :timeout => 22, :logger => default_logger, :worker_processes => 1, @@ -180,6 +181,19 @@ def timeout(seconds, cleanup: 2) set_int(:timeout, soft_timeout + cleanup_timeout, 5) end + def timeout_signal(*args, &block) + if block_given? + set_hook(:timeout_signal, block, 1) + elsif args.first.respond_to?(:call) + set_hook(:timeout_signal, args.first, 1) + elsif args.first.is_a?(Symbol) + signal = args.first + set_hook(:timeout_signal, ->(_pid) { signal }, 1) + else + raise ArgumentError, "timeout_signal must be a symbol or a proc" + end + end + def spawn_timeout(seconds) set_int(:spawn_timeout, seconds, 1) end diff --git a/lib/pitchfork/http_server.rb b/lib/pitchfork/http_server.rb index d1ba8af4..55eb31e4 100644 --- a/lib/pitchfork/http_server.rb +++ b/lib/pitchfork/http_server.rb @@ -74,7 +74,7 @@ def extend_deadline(extra_time) end # :stopdoc: - attr_accessor :app, :timeout, :soft_timeout, :cleanup_timeout, :spawn_timeout, :worker_processes, + attr_accessor :app, :timeout, :timeout_signal, :soft_timeout, :cleanup_timeout, :spawn_timeout, :worker_processes, :before_fork, :after_worker_fork, :after_mold_fork, :listener_opts, :children, :orig_app, :config, :ready_pipe, @@ -404,7 +404,12 @@ def stop(graceful = true) return StopIteration end end - @children.hard_kill_all(:KILL) + + @children.each do |child| + if child.pid + @children.hard_kill(@timeout_signal.call(child.pid), child) + end + end @promotion_lock.unlink end @@ -534,7 +539,7 @@ def hard_timeout(child) else logger.error "worker=#{child.nr} pid=#{child.pid} gen=#{child.generation} timed out, killing" end - @children.hard_timeout(child) # take no prisoners for hard timeout violations + @children.hard_kill(@timeout_signal.call(child.pid), child) # take no prisoners for hard timeout violations end def trigger_refork @@ -571,7 +576,7 @@ def spawn_worker(worker, detach:) # the monitor process will kill it. worker.update_deadline(@spawn_timeout) @before_fork&.call(self) - Pitchfork.fork_sibling("spawn_worker") do + fork_sibling("spawn_worker") do worker.pid = Process.pid after_fork_internal @@ -844,7 +849,7 @@ def worker_loop(worker) case client when Message::PromoteWorker if Info.fork_safe? - spawn_mold(worker.generation) + spawn_mold(worker) else logger.error("worker=#{worker.nr} gen=#{worker.generation} is no longer fork safe, can't refork") end @@ -865,7 +870,7 @@ def worker_loop(worker) if @refork_condition && Info.fork_safe? && !worker.outdated? if @refork_condition.met?(worker, logger) proc_name status: "requests: #{worker.requests_count}, spawning mold" - if spawn_mold(worker.generation) + if spawn_mold(worker) logger.info("worker=#{worker.nr} gen=#{worker.generation} Refork condition met, promoting ourselves") end @refork_condition.backoff! @@ -880,14 +885,16 @@ def worker_loop(worker) end end - def spawn_mold(current_generation) + def spawn_mold(worker) return false unless @promotion_lock.try_lock + worker.update_deadline(@spawn_timeout) + @before_fork&.call(self) begin - Pitchfork.fork_sibling("spawn_mold") do - mold = Worker.new(nil, pid: Process.pid, generation: current_generation) + fork_sibling("spawn_mold") do + mold = Worker.new(nil, pid: Process.pid, generation: worker.generation) mold.promote!(@spawn_timeout) mold.start_promotion(@control_socket[1]) mold_loop(mold) @@ -1002,5 +1009,61 @@ def prepare_timeout(worker) handler.timeout_request = SoftTimeout.request(@soft_timeout, handler) handler end + + FORK_TIMEOUT = 5 + + def fork_sibling(role, &block) + if REFORKING_AVAILABLE + r, w = Pitchfork::Info.keep_ios(IO.pipe) + # We double fork so that the new worker is re-attached back + # to the master. + # This requires either PR_SET_CHILD_SUBREAPER which is exclusive to Linux 3.4 + # or the master to be PID 1. + if middle_pid = FORK_LOCK.synchronize { Process.fork } # parent + w.close + # We need to wait(2) so that the middle process doesn't end up a zombie. + # The process only call fork again an exit so it should be pretty fast. + # However it might need to execute some `Process._fork` or `at_exit` callbacks, + # as well as Ruby's cleanup procedure to run finalizers etc, and there is a risk + # of deadlock. + # So in case it takes more than 5 seconds to exit, we kill it. + # TODO: rather than to busy loop here, we handle it in the worker/mold loop + process_wait_with_timeout(middle_pid, FORK_TIMEOUT) + pid_str = r.gets + r.close + if pid_str + Integer(pid_str) + else + raise ForkFailure, "fork_sibling didn't succeed in #{FORK_TIMEOUT} seconds" + end + else # first child + r.close + Process.setproctitle("") + pid = clean_fork do + # detach into a grand child + w.close + yield + end + w.puts(pid) + w.close + exit + end + else + clean_fork(&block) + end + end + + def process_wait_with_timeout(pid, timeout) + (timeout * 50).times do + _, status = Process.waitpid2(pid, Process::WNOHANG) + return status if status + sleep 0.02 # 50 * 20ms => 1s + end + + # The process didn't exit in the allotted time, so we kill it. + Process.kill(@timeout_signal.call(pid), pid) + _, status = Process.waitpid2(pid) + status + end end end diff --git a/lib/pitchfork/worker.rb b/lib/pitchfork/worker.rb index c986a862..693e1906 100644 --- a/lib/pitchfork/worker.rb +++ b/lib/pitchfork/worker.rb @@ -151,10 +151,6 @@ def hard_kill(sig) Process.kill(sig, pid) end - def hard_timeout! - hard_kill(:KILL) - end - # this only runs when the Rack app.call is not running # act like a listener def accept_nonblock(exception: nil) # :nodoc: