diff --git a/CHANGELOG.md b/CHANGELOG.md index 5b8aa945..bf1fe687 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ # Unreleased +- Enforce `spawn_timeout` for molds too. - Gracefully shutdown the server if the mold appear to be corrupted (#79). - Add more information in proctitle when forking a new sibbling. - Add a `before_fork` callback called before forking new molds and new workers. diff --git a/docs/CONFIGURATION.md b/docs/CONFIGURATION.md index 315e7009..358cfaa5 100644 --- a/docs/CONFIGURATION.md +++ b/docs/CONFIGURATION.md @@ -241,10 +241,10 @@ for more details on nginx upstream configuration. ### `spawn_timeout` ```ruby -timeout 5 +spawn_timeout 5 ``` -Sets the timeout for a newly spawned worker to be ready after being spawned. +Sets the timeout for a newly spawned worker or mold to be ready after being spawned. This timeout is a safeguard against various low-level fork safety bugs that could cause a process to dead-lock. diff --git a/lib/pitchfork/children.rb b/lib/pitchfork/children.rb index 4fa65abb..416de4f0 100644 --- a/lib/pitchfork/children.rb +++ b/lib/pitchfork/children.rb @@ -107,6 +107,10 @@ def molds @molds.values end + def empty? + @children.empty? + end + def each(&block) @children.each_value(&block) end diff --git a/lib/pitchfork/http_server.rb b/lib/pitchfork/http_server.rb index 54f00452..d1ba8af4 100644 --- a/lib/pitchfork/http_server.rb +++ b/lib/pitchfork/http_server.rb @@ -201,7 +201,7 @@ def start(sync = true) else build_app! bind_listeners! - after_mold_fork.call(self, Worker.new(nil, pid: $$).promoted!) + after_mold_fork.call(self, Worker.new(nil, pid: $$).promoted!(@spawn_timeout)) end if sync @@ -394,7 +394,7 @@ def stop(graceful = true) wait_for_pending_workers self.listeners = [] limit = Pitchfork.time_now + timeout - until @children.workers.empty? || Pitchfork.time_now > limit + until @children.empty? || Pitchfork.time_now > limit if graceful @children.soft_kill_all(:TERM) else @@ -495,8 +495,8 @@ def murder_lazy_workers now = Pitchfork.time_now(true) next_sleep = @timeout - 1 - @children.workers.each do |worker| - deadline = worker.deadline + @children.each do |child| + deadline = child.deadline if 0 == deadline # worker is idle next elsif deadline > now # worker still has time @@ -507,34 +507,34 @@ def murder_lazy_workers next else # worker is out of time next_sleep = 0 - hard_timeout(worker) + hard_timeout(child) end end next_sleep <= 0 ? 1 : next_sleep end - def hard_timeout(worker) - if worker.pid.nil? # Not yet registered, likely never spawned - logger.error "worker=#{worker.nr} timed out during spawn, abandoning" + def hard_timeout(child) + if child.pid.nil? # Not yet registered, likely never spawned + logger.error "worker=#{child.nr} timed out during spawn, abandoning" @children.abandon(worker) return end - if @after_worker_hard_timeout + if @after_worker_hard_timeout && !child.mold? begin - @after_worker_hard_timeout.call(self, worker) + @after_worker_hard_timeout.call(self, child) rescue => error Pitchfork.log_error(@logger, "after_worker_hard_timeout callback", error) end end - if worker.mold? - logger.error "mold pid=#{worker.pid} gen=#{worker.generation} timed out, killing" + if child.mold? + logger.error "mold pid=#{child.pid} gen=#{child.generation} timed out, killing" else - logger.error "worker=#{worker.nr} pid=#{worker.pid} gen=#{worker.generation} timed out, killing" + logger.error "worker=#{child.nr} pid=#{child.pid} gen=#{child.generation} timed out, killing" end - @children.hard_timeout(worker) # take no prisoners for hard timeout violations + @children.hard_timeout(child) # take no prisoners for hard timeout violations end def trigger_refork @@ -794,15 +794,18 @@ def init_worker_process(worker) readers << worker trap(:QUIT) { nuke_listeners!(readers) } trap(:TERM) { nuke_listeners!(readers) } + trap(:INT) { nuke_listeners!(readers); exit!(0) } readers end def init_mold_process(mold) - proc_name role: "(gen:#{mold.generation}) mold", status: "ready" + proc_name role: "(gen:#{mold.generation}) mold", status: "init" after_mold_fork.call(self, mold) readers = [mold] trap(:QUIT) { nuke_listeners!(readers) } trap(:TERM) { nuke_listeners!(readers) } + trap(:INT) { nuke_listeners!(readers); exit!(0) } + proc_name role: "(gen:#{mold.generation}) mold", status: "ready" readers end @@ -885,7 +888,7 @@ def spawn_mold(current_generation) begin Pitchfork.fork_sibling("spawn_mold") do mold = Worker.new(nil, pid: Process.pid, generation: current_generation) - mold.promote! + mold.promote!(@spawn_timeout) mold.start_promotion(@control_socket[1]) mold_loop(mold) end diff --git a/lib/pitchfork/shared_memory.rb b/lib/pitchfork/shared_memory.rb index 56d0009e..547eec7a 100644 --- a/lib/pitchfork/shared_memory.rb +++ b/lib/pitchfork/shared_memory.rb @@ -10,7 +10,8 @@ module SharedMemory CURRENT_GENERATION_OFFSET = 0 SHUTDOWN_OFFSET = 1 MOLD_TICK_OFFSET = 2 - WORKER_TICK_OFFSET = 3 + MOLD_PROMOTION_TICK_OFFSET = 3 + WORKER_TICK_OFFSET = 4 DROPS = [Raindrops.new(PER_DROP)] @@ -49,6 +50,10 @@ def mold_deadline self[MOLD_TICK_OFFSET] end + def mold_promotion_deadline + self[MOLD_PROMOTION_TICK_OFFSET] + end + def worker_deadline(worker_nr) self[WORKER_TICK_OFFSET + worker_nr] end diff --git a/lib/pitchfork/worker.rb b/lib/pitchfork/worker.rb index 810144f5..c986a862 100644 --- a/lib/pitchfork/worker.rb +++ b/lib/pitchfork/worker.rb @@ -27,7 +27,7 @@ def initialize(nr, pid: nil, generation: 0) @deadline_drop = SharedMemory.worker_deadline(nr) self.deadline = 0 else - promoted! + promoted!(nil) end end @@ -55,6 +55,13 @@ def update(message) message.class.members.each do |member| send("#{member}=", message.public_send(member)) end + + case message + when Message::MoldSpawned + @deadline_drop = SharedMemory.mold_promotion_deadline + when Message::MoldReady + @deadline_drop = SharedMemory.mold_deadline + end end def register_to_master(control_socket) @@ -75,6 +82,7 @@ def finish_promotion(control_socket) message = Message::MoldReady.new(@nr, @pid, generation) control_socket.sendmsg(message) SharedMemory.current_generation = @generation + @deadline_drop = SharedMemory.mold_deadline end def promote(generation) @@ -85,16 +93,16 @@ def spawn_worker(new_worker) send_message_nonblock(Message::SpawnWorker.new(new_worker.nr)) end - def promote! + def promote!(timeout) @generation += 1 - promoted! + promoted!(timeout) end - def promoted! + def promoted!(timeout) @mold = true @nr = nil - @deadline_drop = SharedMemory.mold_deadline - self.deadline = 0 + @deadline_drop = SharedMemory.mold_promotion_deadline + update_deadline(timeout) if timeout self end diff --git a/test/integration/test_reforking.rb b/test/integration/test_reforking.rb index 51e073f2..325e6bab 100644 --- a/test/integration/test_reforking.rb +++ b/test/integration/test_reforking.rb @@ -97,7 +97,6 @@ def Process.fork assert_stderr "Failed to spawn a worker. Retrying." assert_stderr "Failed to spawn a worker twice in a row. Corrupted mold process?" assert_stderr "No mold alive, shutting down" - assert_stderr "timed out during spawn, abandoning", timeout: 5 assert_exited(pid, 1, timeout: 5) end @@ -147,6 +146,51 @@ def test_exiting_mold assert_clean_shutdown(pid) end + def test_stuck_mold + addr, port = unused_port + + pid = spawn_server(app: File.join(ROOT, "test/integration/env.ru"), config: <<~CONFIG) + Pitchfork::ReforkCondition.backoff_delay = 1.0 + + listen "#{addr}:#{port}" + worker_processes 2 + spawn_timeout 1 + refork_after [5, 5] + after_mold_fork do |_server, mold| + if mold.generation > 0 + if File.exist?("stuck-once.txt") + $stderr.puts "[mold success]" + else + File.write("stuck-once.txt", "1") + $stderr.puts "[mold locking-up]" + sleep 5 + end + end + end + CONFIG + + assert_healthy("http://#{addr}:#{port}") + assert_stderr "worker=0 gen=0 ready" + assert_stderr "worker=1 gen=0 ready", timeout: 5 + + 7.times do + assert_equal true, healthy?("http://#{addr}:#{port}") + end + + assert_stderr(/mold pid=\d+ gen=1 spawned/) + assert_stderr("[mold locking-up]") + assert_stderr(/mold pid=\d+ gen=1 reaped/, timeout: 10) + + 10.times do + assert_equal true, healthy?("http://#{addr}:#{port}") + end + + assert_stderr "worker=0 gen=1 ready", timeout: 5 + assert_stderr "worker=1 gen=1 ready" + + assert_clean_shutdown(pid) + end + def test_fork_unsafe addr, port = unused_port diff --git a/test/unit/test_refork_condition.rb b/test/unit/test_refork_condition.rb index 8d40d765..67b6b712 100644 --- a/test/unit/test_refork_condition.rb +++ b/test/unit/test_refork_condition.rb @@ -14,7 +14,7 @@ def test_requests_count_repeat @worker.increment_requests_count(11) assert @condition.met?(@worker, @logger) - @worker.promote! + @worker.promote!(10) @worker.reset refute @condition.met?(@worker, @logger) @@ -23,7 +23,7 @@ def test_requests_count_repeat @worker.increment_requests_count(40) assert @condition.met?(@worker, @logger) - @worker.promote! + @worker.promote!(10) @worker.reset @worker.increment_requests_count(49) @@ -39,7 +39,7 @@ def test_requests_count_stop @worker.increment_requests_count(11) assert @condition.met?(@worker, @logger) - @worker.promote! + @worker.promote!(10) @worker.reset refute @condition.met?(@worker, @logger)