From 48eedde5ab7d46277e183d2cfb90fff0e6fbb169 Mon Sep 17 00:00:00 2001 From: irfan sharif Date: Mon, 19 Apr 2021 15:04:35 -0400 Subject: [PATCH] stop: break deadlock between Stopper.mu and Replica.mu Fixes #63761. As of #61279, it was possible for us to deadlock due to inconsistent lock orderings between Stopper.mu and Replica.mu. We were previously holding onto Stopper.mu while executing all closers, including those that may acquire other locks. Because closers can be defined anywhere (and may consequently grab any in-scope lock), we should order Stopper.mu to come after all other locks in the system. The closer added in #61279 iterated over all non-destroyed replicas, locking Replica.mu to check for the replica's destroy status (in Store.VisitReplicas). This deadlocked with the lease acquisition code path that first grabs an exclusive lock over Replica.mu (see InitOrJoinRequest), and uses the stopper to kick off an async task acquiring the lease. The stopper internally locks Stopper.mu to check whether or not it was already stopped. Release note: None --- pkg/kv/kvserver/store.go | 3 ++- pkg/util/stop/stopper.go | 12 +++++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/pkg/kv/kvserver/store.go b/pkg/kv/kvserver/store.go index 84584870bfc0..df050f5280fd 100644 --- a/pkg/kv/kvserver/store.go +++ b/pkg/kv/kvserver/store.go @@ -470,7 +470,8 @@ type Store struct { // Locking notes: To avoid deadlocks, the following lock order must be // obeyed: baseQueue.mu < Replica.raftMu < Replica.readOnlyCmdMu < Store.mu - // < Replica.mu < Replica.unreachablesMu < Store.coalescedMu < Store.scheduler.mu. + // < Replica.mu < Replica.unreachablesMu < Store.coalescedMu + // < Store.scheduler.mu < Stopper.mu. // (It is not required to acquire every lock in sequence, but when multiple // locks are held at the same time, it is incorrect to acquire a lock with // "lesser" value in this sequence after one with "greater" value). diff --git a/pkg/util/stop/stopper.go b/pkg/util/stop/stopper.go index 5f0816d703a0..6feac0c85fdb 100644 --- a/pkg/util/stop/stopper.go +++ b/pkg/util/stop/stopper.go @@ -473,8 +473,18 @@ func (s *Stopper) Stop(ctx context.Context) { s.Quiesce(ctx) s.mu.Lock() + closers := s.mu.closers defer s.mu.Unlock() - for _, c := range s.mu.closers { + + // We can't hold on to Stopper.mu while executing closers. Closers are + // allowed to be defined anywhere, and are allowed to grab any lock + // available. As far as lock ordering is concerned, Stopper.mu comes after + // everything else to avoid any deadlocks (see comment on Store.mu). + // + // There's no concern around new closers being added; we've marked this + // stopper as `stopping` above, so any attempts to do so will be refused + // (see refuseRLocked). + for _, c := range closers { c.Close() } }