From 1cf917a07638d5bc452efdf14fdd74da51b0822c Mon Sep 17 00:00:00 2001
From: Pavel Kalinnikov
Date: Mon, 16 Jan 2023 15:57:05 +0000
Subject: [PATCH] kvserver: do IO before in-memory Replica creation

This commit moves the storage IO before the newUnloadedReplica invocation.
This allows returning early if storage invariants do not hold, and
simplifies error handling and mutex locking when instantiating a Replica.

Release note: None
---
 pkg/kv/kvserver/store_create_replica.go | 152 +++++++++++-------------
 1 file changed, 69 insertions(+), 83 deletions(-)

diff --git a/pkg/kv/kvserver/store_create_replica.go b/pkg/kv/kvserver/store_create_replica.go
index a5cede75588f..c8131c2e5f10 100644
--- a/pkg/kv/kvserver/store_create_replica.go
+++ b/pkg/kv/kvserver/store_create_replica.go
@@ -15,6 +15,7 @@ import (
 	"time"
 
 	"github.com/cockroachdb/cockroach/pkg/keys"
+	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/stateloader"
 	"github.com/cockroachdb/cockroach/pkg/roachpb"
 	"github.com/cockroachdb/cockroach/pkg/storage"
 	"github.com/cockroachdb/cockroach/pkg/util/hlc"
@@ -221,94 +222,79 @@ func (s *Store) tryGetOrCreateReplica(
 		return nil, false, &roachpb.RaftGroupDeletedError{}
 	}
 
+	// An uninitialized replica must have an empty HardState.Commit at all times.
+	// Failure to maintain this invariant indicates corruption. And yet, we have
+	// observed this in the wild. See #40213.
+	sl := stateloader.Make(rangeID)
+	if hs, err := sl.LoadHardState(ctx, s.Engine()); err != nil {
+		return nil, false, err
+	} else if hs.Commit != 0 {
+		log.Fatalf(ctx, "found non-zero HardState.Commit on uninitialized replica r%d/%d. HS=%+v",
+			rangeID, replicaID, hs)
+	}
+	// Write the RaftReplicaID for this replica. This is the only place in the
+	// CockroachDB code that we are creating a new *uninitialized* replica.
+	// Note that it is possible that we have already created the HardState for
+	// an uninitialized replica, then crashed, and on recovery are receiving a
+	// raft message for the same or later replica.
+	// - Same replica: we are overwriting the RaftReplicaID with the same
+	//   value, which is harmless.
+	// - Later replica: there may be an existing HardState for the older
+	//   uninitialized replica with Commit=0 and non-zero Term and Vote. Using
+	//   the Term and Vote values for that older replica in the context of
+	//   this newer replica is harmless since it just limits the votes for
+	//   this replica.
+	//
+	// Compatibility:
+	// - v21.2 and v22.1: v22.1 unilaterally introduces RaftReplicaID (an
+	//   unreplicated range-id local key). If a v22.1 binary is rolled back at
+	//   a node, the fact that RaftReplicaID was written is harmless to a
+	//   v21.2 node since it does not read it. When a v21.2 drops an
+	//   initialized range, the RaftReplicaID will also be deleted because the
+	//   whole range-ID local key space is deleted.
+	//
+	// - v22.2: we will start relying on the presence of RaftReplicaID, and
+	//   remove any uninitialized replicas that have a HardState but no
+	//   RaftReplicaID. This removal will happen in ReplicasStorage.Init and
+	//   allow us to tighten invariants. Additionally, knowing the ReplicaID
+	//   for an uninitialized range could allow a node to somehow contact the
+	//   raft group (say by broadcasting to all nodes in the cluster), and if
+	//   the ReplicaID is stale, would allow the node to remove the HardState
+	//   and RaftReplicaID. See
+	//   https://github.com/cockroachdb/cockroach/issues/75740.
+	//
+	// There is a concern that there could be some replica that survived
+	// from v21.2 to v22.1 to v22.2 in uninitialized state and will be
+	// incorrectly removed in ReplicasStorage.Init, causing the loss of the
+	// HardState.{Term,Vote} and leading to a "split-brain" wrt leader
+	// election.
+	//
+	// Even though this seems theoretically possible, it is considered
+	// practically impossible, and not just because a replica's vote is
+	// unlikely to stay relevant across 2 upgrades. For one, we're always
+	// going through learners and don't promote until caught up, so
+	// uninitialized replicas generally never get to vote. Second, even if
+	// their vote somehow mattered (perhaps we sent a learner a snap which
+	// was not durably persisted - which we also know is impossible, but
+	// let's assume it - and then promoted the node and it immediately
+	// power-cycled, losing the snapshot) the fire-and-forget way in which
+	// raft votes are requested (in the same raft cycle) makes it extremely
+	// unlikely that the restarted node would then receive it.
+	if err := sl.SetRaftReplicaID(ctx, s.Engine(), replicaID); err != nil {
+		return nil, false, err
+	}
+
 	// Create a new uninitialized replica and lock it for raft processing.
 	repl := newUnloadedReplica(ctx, rangeID, s, replicaID)
 	repl.raftMu.Lock() // not unlocked
-	// Take out read-only lock. Not strictly necessary here, but follows the
-	// normal lock protocol for destroyStatus.Set().
-	repl.readOnlyCmdMu.Lock()
-	// Grab the internal Replica state lock to ensure nobody mucks with our
-	// replica even outside of raft processing.
 	repl.mu.Lock()
-	// Initialize the Replica with the replicaID.
-	if err := func() error {
-		// An uninitialized replica should have an empty HardState.Commit at
-		// all times. Failure to maintain this invariant indicates corruption.
-		// And yet, we have observed this in the wild. See #40213.
-		if hs, err := repl.mu.stateLoader.LoadHardState(ctx, s.Engine()); err != nil {
-			return err
-		} else if hs.Commit != 0 {
-			log.Fatalf(ctx, "found non-zero HardState.Commit on uninitialized replica %s. HS=%+v", repl, hs)
-		}
-
-		// Write the RaftReplicaID for this replica. This is the only place in the
-		// CockroachDB code that we are creating a new *uninitialized* replica.
-		// Note that it is possible that we have already created the HardState for
-		// an uninitialized replica, then crashed, and on recovery are receiving a
-		// raft message for the same or later replica.
-		// - Same replica: we are overwriting the RaftReplicaID with the same
-		//   value, which is harmless.
-		// - Later replica: there may be an existing HardState for the older
-		//   uninitialized replica with Commit=0 and non-zero Term and Vote. Using
-		//   the Term and Vote values for that older replica in the context of
-		//   this newer replica is harmless since it just limits the votes for
-		//   this replica.
-		//
-		//
-		// Compatibility:
-		// - v21.2 and v22.1: v22.1 unilaterally introduces RaftReplicaID (an
-		//   unreplicated range-id local key). If a v22.1 binary is rolled back at
-		//   a node, the fact that RaftReplicaID was written is harmless to a
-		//   v21.2 node since it does not read it. When a v21.2 drops an
-		//   initialized range, the RaftReplicaID will also be deleted because the
-		//   whole range-ID local key space is deleted.
-		//
-		// - v22.2: we will start relying on the presence of RaftReplicaID, and
-		//   remove any unitialized replicas that have a HardState but no
-		//   RaftReplicaID. This removal will happen in ReplicasStorage.Init and
-		//   allow us to tighten invariants. Additionally, knowing the ReplicaID
-		//   for an unitialized range could allow a node to somehow contact the
-		//   raft group (say by broadcasting to all nodes in the cluster), and if
-		//   the ReplicaID is stale, would allow the node to remove the HardState
-		//   and RaftReplicaID. See
-		//   https://github.com/cockroachdb/cockroach/issues/75740.
-		//
-		// There is a concern that there could be some replica that survived
-		// from v21.2 to v22.1 to v22.2 in unitialized state and will be
-		// incorrectly removed in ReplicasStorage.Init causing the loss of the
-		// HardState.{Term,Vote} and lead to a "split-brain" wrt leader
-		// election.
-		//
-		// Even though this seems theoretically possible, it is considered
-		// practically impossible, and not just because a replica's vote is
-		// unlikely to stay relevant across 2 upgrades. For one, we're always
-		// going through learners and don't promote until caught up, so
-		// uninitialized replicas generally never get to vote. Second, even if
-		// their vote somehow mattered (perhaps we sent a learner a snap which
-		// was not durably persisted - which we also know is impossible, but
-		// let's assume it - and then promoted the node and it immediately
-		// power-cycled, losing the snapshot) the fire-and-forget way in which
-		// raft votes are requested (in the same raft cycle) makes it extremely
-		// unlikely that the restarted node would then receive it.
-		if err := repl.mu.stateLoader.SetRaftReplicaID(ctx, s.Engine(), replicaID); err != nil {
-			return err
-		}
-
-		repl.assertStateRaftMuLockedReplicaMuRLocked(ctx, s.Engine())
-		return nil
-	}(); err != nil {
-		// Mark the replica as destroyed and remove it from the replicas maps to
-		// ensure nobody tries to use it.
-		repl.mu.destroyStatus.Set(errors.Wrapf(err, "%s: failed to initialize", repl), destroyReasonRemoved)
-		repl.mu.Unlock()
-		repl.readOnlyCmdMu.Unlock()
-		repl.raftMu.Unlock()
-		return nil, false, err
-	}
-
+	// TODO(pavelkalinnikov): there is little benefit in this check, since loading
+	// ReplicaID is a no-op after the above write, and the ReplicaState load is
+	// only for making sure it's empty. Distill the useful IO and make its result
+	// the direct input into Replica creation, then this check won't be needed.
+	repl.assertStateRaftMuLockedReplicaMuRLocked(ctx, s.Engine())
 	repl.mu.Unlock()
-	repl.readOnlyCmdMu.Unlock()
-
 	// NB: only repl.raftMu is now locked.
 
 	// Install the replica in the store's replica map.
 	s.mu.Lock()
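
To make the control-flow change concrete, here is a minimal, self-contained Go
sketch of the pattern this patch adopts: do all storage reads and writes first,
and construct the in-memory replica only once they have succeeded. Every name
in the sketch (engine, hardState, replica, createUninitializedReplica) is an
illustrative stand-in, not a CockroachDB API; the real code uses
stateloader.Make, fatals on the Commit != 0 invariant violation instead of
returning an error, and holds raftMu/mu around the new Replica, all of which
the sketch omits.

package main

import "fmt"

// hardState mirrors the raft HardState fields relevant here. In CockroachDB
// this would be a raftpb.HardState loaded via the state loader.
type hardState struct {
	Term, Vote, Commit uint64
}

// engine is a toy stand-in for the storage engine, keyed by range ID.
type engine struct {
	hardStates map[int64]hardState
	replicaIDs map[int64]int32
}

// replica is a toy stand-in for the in-memory Replica. It is constructed
// only after all storage IO has succeeded.
type replica struct {
	rangeID   int64
	replicaID int32
}

// createUninitializedReplica follows the shape of the patched
// tryGetOrCreateReplica:
//  1. load the HardState and check the Commit == 0 invariant,
//  2. persist the RaftReplicaID,
//  3. only then create the in-memory replica.
// Errors return early, before any in-memory state exists, so there is no
// partially initialized Replica to destroy and no unlock bookkeeping.
func createUninitializedReplica(eng *engine, rangeID int64, replicaID int32) (*replica, error) {
	// An uninitialized replica must have an empty HardState.Commit. The real
	// code treats a violation as fatal corruption; the sketch returns an error.
	if hs, ok := eng.hardStates[rangeID]; ok && hs.Commit != 0 {
		return nil, fmt.Errorf(
			"non-zero HardState.Commit on uninitialized replica r%d/%d: %+v",
			rangeID, replicaID, hs)
	}
	// Persist the replica ID. Rewriting the same value after a crash-restart
	// is harmless, per the comment in the patch.
	eng.replicaIDs[rangeID] = replicaID
	// All IO succeeded; it is now safe to instantiate the in-memory replica.
	return &replica{rangeID: rangeID, replicaID: replicaID}, nil
}

func main() {
	eng := &engine{
		hardStates: map[int64]hardState{7: {Term: 5, Vote: 2, Commit: 0}},
		replicaIDs: map[int64]int32{},
	}
	if r, err := createUninitializedReplica(eng, 7, 3); err == nil {
		fmt.Printf("created r%d/%d\n", r.rangeID, r.replicaID)
	}
	// A corrupted HardState (Commit != 0) is rejected before any in-memory
	// replica exists.
	eng.hardStates[9] = hardState{Term: 5, Vote: 2, Commit: 4}
	if _, err := createUninitializedReplica(eng, 9, 1); err != nil {
		fmt.Println(err)
	}
}

The payoff of the reordering shows in the error paths: the pre-patch code had
to set destroyStatus and unwind three mutexes when initialization failed,
whereas here a failed read or invariant check simply returns before
newUnloadedReplica ever runs.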