Skip to content

Commit

Permalink
Add a spawn timeout for new molds
Browse files Browse the repository at this point in the history
Ensure a mold doesn't take longer than `spawn_timeout` to be ready.

If it get stuck for some reason we kill it and another one will be
spawned later.
  • Loading branch information
byroot committed Dec 11, 2023
1 parent ac59329 commit 09c1924
Show file tree
Hide file tree
Showing 8 changed files with 93 additions and 30 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Unreleased

- Enforce `spawn_timeout` for molds too.
- Gracefully shutdown the server if the mold appear to be corrupted (#79).
- Add more information in proctitle when forking a new sibbling.
- Add a `before_fork` callback called before forking new molds and new workers.
Expand Down
4 changes: 2 additions & 2 deletions docs/CONFIGURATION.md
Original file line number Diff line number Diff line change
Expand Up @@ -241,10 +241,10 @@ for more details on nginx upstream configuration.
### `spawn_timeout`

```ruby
timeout 5
spawn_timeout 5
```

Sets the timeout for a newly spawned worker to be ready after being spawned.
Sets the timeout for a newly spawned worker or mold to be ready after being spawned.

This timeout is a safeguard against various low-level fork safety bugs that could cause
a process to dead-lock.
Expand Down
4 changes: 4 additions & 0 deletions lib/pitchfork/children.rb
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,10 @@ def molds
@molds.values
end

def empty?
@children.empty?
end

def each(&block)
@children.each_value(&block)
end
Expand Down
35 changes: 18 additions & 17 deletions lib/pitchfork/http_server.rb
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ def start(sync = true)
else
build_app!
bind_listeners!
after_mold_fork.call(self, Worker.new(nil, pid: $$).promoted!)
after_mold_fork.call(self, Worker.new(nil, pid: $$).promoted!(@spawn_timeout))
end

if sync
Expand Down Expand Up @@ -394,11 +394,11 @@ def stop(graceful = true)
wait_for_pending_workers
self.listeners = []
limit = Pitchfork.time_now + timeout
until @children.workers.empty? || Pitchfork.time_now > limit
until @children.empty? || Pitchfork.time_now > limit
if graceful
@children.soft_kill_all(:TERM)
else
@children.hard_kill_all(:INT)
@children.hard_kill_all(:TERM)
end
if monitor_loop(false) == StopIteration
return StopIteration
Expand Down Expand Up @@ -495,8 +495,8 @@ def murder_lazy_workers
now = Pitchfork.time_now(true)
next_sleep = @timeout - 1

@children.workers.each do |worker|
deadline = worker.deadline
@children.each do |child|
deadline = child.deadline
if 0 == deadline # worker is idle
next
elsif deadline > now # worker still has time
Expand All @@ -507,34 +507,34 @@ def murder_lazy_workers
next
else # worker is out of time
next_sleep = 0
hard_timeout(worker)
hard_timeout(child)
end
end

next_sleep <= 0 ? 1 : next_sleep
end

def hard_timeout(worker)
if worker.pid.nil? # Not yet registered, likely never spawned
logger.error "worker=#{worker.nr} timed out during spawn, abandoning"
def hard_timeout(child)
if child.pid.nil? # Not yet registered, likely never spawned
logger.error "worker=#{child.nr} timed out during spawn, abandoning"
@children.abandon(worker)
return
end

if @after_worker_hard_timeout
if @after_worker_hard_timeout && !child.mold?
begin
@after_worker_hard_timeout.call(self, worker)
@after_worker_hard_timeout.call(self, child)
rescue => error
Pitchfork.log_error(@logger, "after_worker_hard_timeout callback", error)
end
end

if worker.mold?
logger.error "mold pid=#{worker.pid} gen=#{worker.generation} timed out, killing"
if child.mold?
logger.error "mold pid=#{child.pid} gen=#{child.generation} timed out, killing"
else
logger.error "worker=#{worker.nr} pid=#{worker.pid} gen=#{worker.generation} timed out, killing"
logger.error "worker=#{child.nr} pid=#{child.pid} gen=#{child.generation} timed out, killing"
end
@children.hard_timeout(worker) # take no prisoners for hard timeout violations
@children.hard_timeout(child) # take no prisoners for hard timeout violations
end

def trigger_refork
Expand Down Expand Up @@ -798,11 +798,12 @@ def init_worker_process(worker)
end

def init_mold_process(mold)
proc_name role: "(gen:#{mold.generation}) mold", status: "ready"
proc_name role: "(gen:#{mold.generation}) mold", status: "init"
after_mold_fork.call(self, mold)
readers = [mold]
trap(:QUIT) { nuke_listeners!(readers) }
trap(:TERM) { nuke_listeners!(readers) }
proc_name role: "(gen:#{mold.generation}) mold", status: "ready"
readers
end

Expand Down Expand Up @@ -885,7 +886,7 @@ def spawn_mold(current_generation)
begin
Pitchfork.fork_sibling("spawn_mold") do
mold = Worker.new(nil, pid: Process.pid, generation: current_generation)
mold.promote!
mold.promote!(@spawn_timeout)
mold.start_promotion(@control_socket[1])
mold_loop(mold)
end
Expand Down
7 changes: 6 additions & 1 deletion lib/pitchfork/shared_memory.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ module SharedMemory
CURRENT_GENERATION_OFFSET = 0
SHUTDOWN_OFFSET = 1
MOLD_TICK_OFFSET = 2
WORKER_TICK_OFFSET = 3
MOLD_PROMOTION_TICK_OFFSET = 3
WORKER_TICK_OFFSET = 4

DROPS = [Raindrops.new(PER_DROP)]

Expand Down Expand Up @@ -49,6 +50,10 @@ def mold_deadline
self[MOLD_TICK_OFFSET]
end

def mold_promotion_deadline
self[MOLD_PROMOTION_TICK_OFFSET]
end

def worker_deadline(worker_nr)
self[WORKER_TICK_OFFSET + worker_nr]
end
Expand Down
20 changes: 14 additions & 6 deletions lib/pitchfork/worker.rb
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def initialize(nr, pid: nil, generation: 0)
@deadline_drop = SharedMemory.worker_deadline(nr)
self.deadline = 0
else
promoted!
promoted!(nil)
end
end

Expand Down Expand Up @@ -55,6 +55,13 @@ def update(message)
message.class.members.each do |member|
send("#{member}=", message.public_send(member))
end

case message
when Message::MoldSpawned
@deadline_drop = SharedMemory.mold_promotion_deadline
when Message::MoldReady
@deadline_drop = SharedMemory.mold_deadline
end
end

def register_to_master(control_socket)
Expand All @@ -75,6 +82,7 @@ def finish_promotion(control_socket)
message = Message::MoldReady.new(@nr, @pid, generation)
control_socket.sendmsg(message)
SharedMemory.current_generation = @generation
@deadline_drop = SharedMemory.mold_deadline
end

def promote(generation)
Expand All @@ -85,16 +93,16 @@ def spawn_worker(new_worker)
send_message_nonblock(Message::SpawnWorker.new(new_worker.nr))
end

def promote!
def promote!(timeout)
@generation += 1
promoted!
promoted!(timeout)
end

def promoted!
def promoted!(timeout)
@mold = true
@nr = nil
@deadline_drop = SharedMemory.mold_deadline
self.deadline = 0
@deadline_drop = SharedMemory.mold_promotion_deadline
update_deadline(timeout) if timeout
self
end

Expand Down
46 changes: 45 additions & 1 deletion test/integration/test_reforking.rb
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,6 @@ def Process.fork
assert_stderr "Failed to spawn a worker. Retrying."
assert_stderr "Failed to spawn a worker twice in a row. Corrupted mold process?"
assert_stderr "No mold alive, shutting down"
assert_stderr "timed out during spawn, abandoning", timeout: 5

assert_exited(pid, 1, timeout: 5)
end
Expand Down Expand Up @@ -147,6 +146,51 @@ def test_exiting_mold
assert_clean_shutdown(pid)
end

def test_stuck_mold
addr, port = unused_port

pid = spawn_server(app: File.join(ROOT, "test/integration/env.ru"), config: <<~CONFIG)
Pitchfork::ReforkCondition.backoff_delay = 1.0
listen "#{addr}:#{port}"
worker_processes 2
spawn_timeout 1
refork_after [5, 5]
after_mold_fork do |_server, mold|
if mold.generation > 0
if File.exist?("stuck-once.txt")
$stderr.puts "[mold success]"
else
File.write("stuck-once.txt", "1")
$stderr.puts "[mold locking-up]"
sleep 5
end
end
end
CONFIG

assert_healthy("http://#{addr}:#{port}")
assert_stderr "worker=0 gen=0 ready"
assert_stderr "worker=1 gen=0 ready", timeout: 5

7.times do
assert_equal true, healthy?("http://#{addr}:#{port}")
end

assert_stderr(/mold pid=\d+ gen=1 spawned/)
assert_stderr("[mold locking-up]")
assert_stderr(/mold pid=\d+ gen=1 reaped/, timeout: 10)

10.times do
assert_equal true, healthy?("http://#{addr}:#{port}")
end

assert_stderr "worker=0 gen=1 ready", timeout: 5
assert_stderr "worker=1 gen=1 ready"

assert_clean_shutdown(pid)
end

def test_fork_unsafe
addr, port = unused_port

Expand Down
6 changes: 3 additions & 3 deletions test/unit/test_refork_condition.rb
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def test_requests_count_repeat
@worker.increment_requests_count(11)
assert @condition.met?(@worker, @logger)

@worker.promote!
@worker.promote!(10)
@worker.reset

refute @condition.met?(@worker, @logger)
Expand All @@ -23,7 +23,7 @@ def test_requests_count_repeat
@worker.increment_requests_count(40)
assert @condition.met?(@worker, @logger)

@worker.promote!
@worker.promote!(10)
@worker.reset

@worker.increment_requests_count(49)
Expand All @@ -39,7 +39,7 @@ def test_requests_count_stop
@worker.increment_requests_count(11)
assert @condition.met?(@worker, @logger)

@worker.promote!
@worker.promote!(10)
@worker.reset

refute @condition.met?(@worker, @logger)
Expand Down

0 comments on commit 09c1924

Please sign in to comment.