diff --git a/pkg/storage/client_raft_test.go b/pkg/storage/client_raft_test.go index 278089ce84b4..9bb967a76ac0 100644 --- a/pkg/storage/client_raft_test.go +++ b/pkg/storage/client_raft_test.go @@ -1906,19 +1906,19 @@ func TestRaftAfterRemoveRange(t *testing.T) { mtc.expireLeases() } -// TestRaftRemoveRace adds and removes a replica repeatedly in an -// attempt to reproduce a race -// (https://github.com/cockroachdb/cockroach/issues/1911). Note that -// 10 repetitions is not enough to reliably reproduce the problem, but -// it's better than any other tests we have for this (increasing the -// number of repetitions adds an unacceptable amount of test runtime). +// TestRaftRemoveRace adds and removes a replica repeatedly in an attempt to +// reproduce a race (see #1911 and #9037). func TestRaftRemoveRace(t *testing.T) { defer leaktest.AfterTest(t)() - mtc := startMultiTestContext(t, 3) + mtc := startMultiTestContext(t, 10) defer mtc.Stop() - rangeID := roachpb.RangeID(1) - mtc.replicateRange(rangeID, 1, 2) + const rangeID = roachpb.RangeID(1) + // Up-replicate to a bunch of nodes which stresses a condition where a + // replica created via a preemptive snapshot receives a message for a + // previous incarnation of the replica (i.e. has a smaller replica ID) that + // existed on the same node. + mtc.replicateRange(rangeID, 1, 2, 3, 4, 5, 6, 7, 8, 9) for i := 0; i < 10; i++ { mtc.unreplicateRange(rangeID, 2) diff --git a/pkg/storage/replica.go b/pkg/storage/replica.go index 5020ee9b3b4e..13a8a5a25d0d 100644 --- a/pkg/storage/replica.go +++ b/pkg/storage/replica.go @@ -337,6 +337,9 @@ type Replica struct { // Raft group). The replica ID will be non-zero whenever the replica is // part of a Raft group. replicaID roachpb.ReplicaID + // The minimum allowed ID for this replica. Initialized from + // RaftTombstone.NextReplicaID. + minReplicaID roachpb.ReplicaID // The ID of the leader replica within the Raft group. Used to determine // when the leadership changes. leaderID roachpb.ReplicaID @@ -702,18 +705,24 @@ func (r *Replica) destroyDataRaftMuLocked() error { // Save a tombstone. The range cannot be re-replicated onto this // node without having a replica ID of at least desc.NextReplicaID. - tombstoneKey := keys.RaftTombstoneKey(desc.RangeID) - tombstone := &roachpb.RaftTombstone{ - NextReplicaID: desc.NextReplicaID, - } ctx := r.AnnotateCtx(context.TODO()) - if err := engine.MVCCPutProto(ctx, batch, nil, tombstoneKey, hlc.ZeroTimestamp, nil, tombstone); err != nil { + if err := r.setTombstoneKey(ctx, batch, desc); err != nil { return err } - return batch.Commit() } +func (r *Replica) setTombstoneKey( + ctx context.Context, eng engine.ReadWriter, desc *roachpb.RangeDescriptor, +) error { + tombstoneKey := keys.RaftTombstoneKey(desc.RangeID) + tombstone := &roachpb.RaftTombstone{ + NextReplicaID: desc.NextReplicaID, + } + return engine.MVCCPutProto(ctx, eng, nil, tombstoneKey, + hlc.ZeroTimestamp, nil, tombstone) +} + func (r *Replica) setReplicaID(replicaID roachpb.ReplicaID) error { r.mu.Lock() defer r.mu.Unlock() @@ -723,11 +732,24 @@ func (r *Replica) setReplicaID(replicaID roachpb.ReplicaID) error { // setReplicaIDLocked requires that the replica lock is held. func (r *Replica) setReplicaIDLocked(replicaID roachpb.ReplicaID) error { if replicaID == 0 { - // If the incoming message didn't give us a new replica ID, - // there's nothing to do (this is only expected for preemptive snapshots). - return nil - } - if r.mu.replicaID == replicaID { + // If the incoming message does not have a new replica ID it is a + // preemptive snapshot. Clear the existing replica ID and note the minimum + // replica ID we can accept for future messages. + r.mu.replicaID = 0 + if r.mu.minReplicaID == r.mu.state.Desc.NextReplicaID { + return nil + } + r.mu.minReplicaID = r.mu.state.Desc.NextReplicaID + ctx := r.AnnotateCtx(context.TODO()) + // TODO(peter): Is writing the tombstone necessary? If this replica is + // destroyed we'll write the tombstone then. If we crash before applying + // the preemptive snapshot we'll be in the same state as before the + // preemptive snapshot (old replica present). + return r.setTombstoneKey(ctx, r.store.Engine(), r.mu.state.Desc) + } + if replicaID < r.mu.minReplicaID { + return &roachpb.RaftGroupDeletedError{} + } else if r.mu.replicaID == replicaID { return nil } else if r.mu.replicaID > replicaID { return errors.Errorf("replicaID cannot move backwards from %d to %d", r.mu.replicaID, replicaID) diff --git a/pkg/storage/replica_test.go b/pkg/storage/replica_test.go index 462deb50fa39..93e0b2d38860 100644 --- a/pkg/storage/replica_test.go +++ b/pkg/storage/replica_test.go @@ -6015,6 +6015,47 @@ func TestReplicaIDChangePending(t *testing.T) { <-commandProposed } +func TestSetReplicaID(t *testing.T) { + defer leaktest.AfterTest(t)() + + tsc := TestStoreConfig(nil) + tc := testContext{} + tc.StartWithStoreConfig(t, tsc) + defer tc.Stop() + + repl := tc.repl + + testCases := []struct { + replicaID roachpb.ReplicaID + minReplicaID roachpb.ReplicaID + newReplicaID roachpb.ReplicaID + expected string + }{ + {0, 0, 1, ""}, + {0, 1, 1, ""}, + {0, 2, 1, "raft group deleted"}, + {1, 2, 1, "raft group deleted"}, + {2, 0, 1, "replicaID cannot move backwards"}, + } + for i, c := range testCases { + t.Run(fmt.Sprintf("%d", i), func(t *testing.T) { + repl.mu.Lock() + repl.mu.replicaID = c.replicaID + repl.mu.minReplicaID = c.minReplicaID + repl.mu.Unlock() + + err := repl.setReplicaID(c.newReplicaID) + if c.expected == "" { + if err != nil { + t.Fatalf("expected success, but found %v", err) + } + } else if !testutils.IsError(err, c.expected) { + t.Fatalf("expected %s, but found %v", c.expected, err) + } + }) + } +} + func TestReplicaRetryRaftProposal(t *testing.T) { defer leaktest.AfterTest(t)() diff --git a/pkg/storage/store.go b/pkg/storage/store.go index 9907c489c279..1613a8660cdb 100644 --- a/pkg/storage/store.go +++ b/pkg/storage/store.go @@ -3488,6 +3488,7 @@ func (s *Store) tryGetOrCreateReplica( // replica even outside of raft processing. Have to do this after grabbing // Store.mu to maintain lock ordering invariant. repl.mu.Lock() + repl.mu.minReplicaID = tombstone.NextReplicaID // Add the range to range map, but not replicasByKey since the range's start // key is unknown. The range will be added to replicasByKey later when a // snapshot is applied. After unlocking Store.mu above, another goroutine