diff --git a/pkg/storage/apply/task.go b/pkg/storage/apply/task.go index 8bfa62acd0a1..26e9ef9511dd 100644 --- a/pkg/storage/apply/task.go +++ b/pkg/storage/apply/task.go @@ -12,6 +12,7 @@ package apply import ( "context" + "errors" "go.etcd.io/etcd/raft/raftpb" ) @@ -54,6 +55,10 @@ type StateMachine interface { ApplySideEffects(CheckedCommand) (AppliedCommand, error) } +// ErrRemoved can be returned from ApplySideEffects which will stop the +// task from processing more commands and return immediately. +var ErrRemoved = errors.New("replica removed") + // Batch accumulates a series of updates from Commands and performs them // all at once to its StateMachine when applied. Groups of Commands will be // staged in the Batch such that one or more trivial Commands are staged or diff --git a/pkg/storage/client_merge_test.go b/pkg/storage/client_merge_test.go index 9917ada54c26..ae0068469f2b 100644 --- a/pkg/storage/client_merge_test.go +++ b/pkg/storage/client_merge_test.go @@ -57,7 +57,6 @@ import ( "github.com/pkg/errors" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "go.etcd.io/etcd/raft" "go.etcd.io/etcd/raft/raftpb" ) @@ -1641,6 +1640,7 @@ func TestStoreReplicaGCAfterMerge(t *testing.T) { storeCfg.TestingKnobs.DisableReplicateQueue = true storeCfg.TestingKnobs.DisableReplicaGCQueue = true storeCfg.TestingKnobs.DisableMergeQueue = true + storeCfg.TestingKnobs.DisableEagerReplicaRemoval = true mtc := &multiTestContext{storeConfig: &storeCfg} mtc.Start(t, 2) defer mtc.Stop() @@ -2043,8 +2043,20 @@ func TestStoreRangeMergeSlowAbandonedFollower(t *testing.T) { lhsRepl2.RaftUnlock() // Ensure that the unblocked merge eventually applies and subsumes the RHS. + // In general this will happen due to receiving a ReplicaTooOldError but + // it may require the replica GC queue. In rare cases the LHS will never + // hear about the merge and may need to be GC'd on its own. testutils.SucceedsSoon(t, func() error { - if _, err := store2.GetReplica(rhsDesc.RangeID); err == nil { + // Make the the LHS gets destroyed. + if lhsRepl, err := store2.GetReplica(lhsDesc.RangeID); err == nil { + if err := store2.ManualReplicaGC(lhsRepl); err != nil { + t.Fatal(err) + } + } + if rhsRepl, err := store2.GetReplica(rhsDesc.RangeID); err == nil { + if err := store2.ManualReplicaGC(rhsRepl); err != nil { + t.Fatal(err) + } return errors.New("rhs not yet destroyed") } return nil @@ -2060,6 +2072,7 @@ func TestStoreRangeMergeAbandonedFollowers(t *testing.T) { storeCfg.TestingKnobs.DisableReplicaGCQueue = true storeCfg.TestingKnobs.DisableSplitQueue = true storeCfg.TestingKnobs.DisableMergeQueue = true + storeCfg.TestingKnobs.DisableEagerReplicaRemoval = true mtc := &multiTestContext{storeConfig: &storeCfg} mtc.Start(t, 3) defer mtc.Stop() @@ -2827,74 +2840,6 @@ func TestStoreRangeMergeSlowWatcher(t *testing.T) { } } -// unreliableRaftHandler drops all Raft messages that are addressed to the -// specified rangeID, but lets all other messages through. 
-type unreliableRaftHandler struct { - rangeID roachpb.RangeID - storage.RaftMessageHandler - // If non-nil, can return false to avoid dropping a msg to rangeID - dropReq func(*storage.RaftMessageRequest) bool - dropHB func(*storage.RaftHeartbeat) bool - dropResp func(*storage.RaftMessageResponse) bool -} - -func (h *unreliableRaftHandler) HandleRaftRequest( - ctx context.Context, - req *storage.RaftMessageRequest, - respStream storage.RaftMessageResponseStream, -) *roachpb.Error { - if len(req.Heartbeats)+len(req.HeartbeatResps) > 0 { - reqCpy := *req - req = &reqCpy - req.Heartbeats = h.filterHeartbeats(req.Heartbeats) - req.HeartbeatResps = h.filterHeartbeats(req.HeartbeatResps) - if len(req.Heartbeats)+len(req.HeartbeatResps) == 0 { - // Entirely filtered. - return nil - } - } else if req.RangeID == h.rangeID { - if h.dropReq == nil || h.dropReq(req) { - log.Infof( - ctx, - "dropping Raft message %s", - raft.DescribeMessage(req.Message, func([]byte) string { - return "" - }), - ) - - return nil - } - } - return h.RaftMessageHandler.HandleRaftRequest(ctx, req, respStream) -} - -func (h *unreliableRaftHandler) filterHeartbeats( - hbs []storage.RaftHeartbeat, -) []storage.RaftHeartbeat { - if len(hbs) == 0 { - return hbs - } - var cpy []storage.RaftHeartbeat - for i := range hbs { - hb := &hbs[i] - if hb.RangeID != h.rangeID || (h.dropHB != nil && !h.dropHB(hb)) { - cpy = append(cpy, *hb) - } - } - return cpy -} - -func (h *unreliableRaftHandler) HandleRaftResponse( - ctx context.Context, resp *storage.RaftMessageResponse, -) error { - if resp.RangeID == h.rangeID { - if h.dropResp == nil || h.dropResp(resp) { - return nil - } - } - return h.RaftMessageHandler.HandleRaftResponse(ctx, resp) -} - func TestStoreRangeMergeRaftSnapshot(t *testing.T) { defer leaktest.AfterTest(t)() @@ -3082,20 +3027,22 @@ func TestStoreRangeMergeRaftSnapshot(t *testing.T) { mtc.transport.Listen(store2.Ident.StoreID, &unreliableRaftHandler{ rangeID: aRepl0.RangeID, RaftMessageHandler: store2, - dropReq: func(req *storage.RaftMessageRequest) bool { - // Make sure that even going forward no MsgApp for what we just - // truncated can make it through. The Raft transport is asynchronous - // so this is necessary to make the test pass reliably - otherwise - // the follower on store2 may catch up without needing a snapshot, - // tripping up the test. - // - // NB: the Index on the message is the log index that _precedes_ any of the - // entries in the MsgApp, so filter where msg.Index < index, not <= index. - return req.Message.Type == raftpb.MsgApp && req.Message.Index < index + dropFuncs: dropFuncs{ + dropReq: func(req *storage.RaftMessageRequest) bool { + // Make sure that even going forward no MsgApp for what we just + // truncated can make it through. The Raft transport is asynchronous + // so this is necessary to make the test pass reliably - otherwise + // the follower on store2 may catch up without needing a snapshot, + // tripping up the test. + // + // NB: the Index on the message is the log index that _precedes_ any of the + // entries in the MsgApp, so filter where msg.Index < index, not <= index. + return req.Message.Type == raftpb.MsgApp && req.Message.Index < index + }, + // Don't drop heartbeats or responses. + dropHB: func(*storage.RaftHeartbeat) bool { return false }, + dropResp: func(*storage.RaftMessageResponse) bool { return false }, }, - // Don't drop heartbeats or responses. 
- dropHB: func(*storage.RaftHeartbeat) bool { return false }, - dropResp: func(*storage.RaftMessageResponse) bool { return false }, }) // Wait for all replicas to catch up to the same point. Because we truncated @@ -3372,9 +3319,12 @@ func TestMergeQueue(t *testing.T) { t.Run("non-collocated", func(t *testing.T) { reset(t) verifyUnmerged(t) - mtc.replicateRange(rhs().RangeID, 1) - mtc.transferLease(ctx, rhs().RangeID, 0, 1) - mtc.unreplicateRange(rhs().RangeID, 0) + rhsRangeID := rhs().RangeID + mtc.replicateRange(rhsRangeID, 1) + mtc.transferLease(ctx, rhsRangeID, 0, 1) + mtc.unreplicateRange(rhsRangeID, 0) + require.NoError(t, mtc.waitForUnreplicated(rhsRangeID, 0)) + clearRange(t, lhsStartKey, rhsEndKey) store.MustForceMergeScanAndProcess() verifyMerged(t) diff --git a/pkg/storage/client_metrics_test.go b/pkg/storage/client_metrics_test.go index 4fd43fcbcc21..c91d386c9e81 100644 --- a/pkg/storage/client_metrics_test.go +++ b/pkg/storage/client_metrics_test.go @@ -26,6 +26,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/cockroachdb/cockroach/pkg/util/metric" "github.com/pkg/errors" + "github.com/stretchr/testify/require" ) func checkGauge(t *testing.T, id string, g *metric.Gauge, e int64) { @@ -313,8 +314,9 @@ func TestStoreMetrics(t *testing.T) { return mtc.unreplicateRangeNonFatal(replica.RangeID, 0) }) - // Force GC Scan on store 0 in order to fully remove range. - mtc.stores[1].MustForceReplicaGCScanAndProcess() + // Wait until we're sure that store 0 has successfully processed its removal. + require.NoError(t, mtc.waitForUnreplicated(replica.RangeID, 0)) + mtc.waitForValues(roachpb.Key("z"), []int64{0, 5, 5}) // Verify range count is as expected. diff --git a/pkg/storage/client_raft_helpers_test.go b/pkg/storage/client_raft_helpers_test.go new file mode 100644 index 000000000000..0d9bf9cd4945 --- /dev/null +++ b/pkg/storage/client_raft_helpers_test.go @@ -0,0 +1,291 @@ +// Copyright 2019 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package storage_test + +import ( + "context" + "errors" + + "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/storage" + "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/cockroachdb/cockroach/pkg/util/syncutil" + "go.etcd.io/etcd/raft" +) + +type dropFuncs struct { + // If non-nil, can return false to avoid dropping a msg to rangeID + dropReq func(*storage.RaftMessageRequest) bool + dropHB func(*storage.RaftHeartbeat) bool + dropResp func(*storage.RaftMessageResponse) bool + dropSnap func(*storage.SnapshotRequest_Header) error +} + +// unreliableRaftHandler drops all Raft messages that are addressed to the +// specified rangeID, but lets all other messages through. 
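For orientation, the call sites updated elsewhere in this diff consume the handler roughly as follows; a condensed sketch in which store2, desc, and index stand in for the surrounding test's variables:

mtc.transport.Listen(store2.Ident.StoreID, &unreliableRaftHandler{
	rangeID:            desc.RangeID,
	RaftMessageHandler: store2,
	dropFuncs: dropFuncs{
		// Drop MsgApps that precede the truncation point; let heartbeats
		// and responses through.
		dropReq: func(req *storage.RaftMessageRequest) bool {
			return req.Message.Type == raftpb.MsgApp && req.Message.Index < index
		},
		dropHB:   func(*storage.RaftHeartbeat) bool { return false },
		dropResp: func(*storage.RaftMessageResponse) bool { return false },
	},
})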
+type unreliableRaftHandler struct { + rangeID roachpb.RangeID + storage.RaftMessageHandler + dropFuncs +} + +func (h *unreliableRaftHandler) HandleRaftRequest( + ctx context.Context, + req *storage.RaftMessageRequest, + respStream storage.RaftMessageResponseStream, +) *roachpb.Error { + if len(req.Heartbeats)+len(req.HeartbeatResps) > 0 { + reqCpy := *req + req = &reqCpy + req.Heartbeats = h.filterHeartbeats(req.Heartbeats) + req.HeartbeatResps = h.filterHeartbeats(req.HeartbeatResps) + if len(req.Heartbeats)+len(req.HeartbeatResps) == 0 { + // Entirely filtered. + return nil + } + } else if req.RangeID == h.rangeID { + if h.dropReq == nil || h.dropReq(req) { + log.Infof( + ctx, + "dropping r%d Raft message %s", + req.RangeID, + raft.DescribeMessage(req.Message, func([]byte) string { + return "" + }), + ) + + return nil + } + } + return h.RaftMessageHandler.HandleRaftRequest(ctx, req, respStream) +} + +func (h *unreliableRaftHandler) filterHeartbeats( + hbs []storage.RaftHeartbeat, +) []storage.RaftHeartbeat { + if len(hbs) == 0 { + return hbs + } + var cpy []storage.RaftHeartbeat + for i := range hbs { + hb := &hbs[i] + if hb.RangeID != h.rangeID || (h.dropHB != nil && !h.dropHB(hb)) { + cpy = append(cpy, *hb) + } + } + return cpy +} + +func (h *unreliableRaftHandler) HandleRaftResponse( + ctx context.Context, resp *storage.RaftMessageResponse, +) error { + if resp.RangeID == h.rangeID { + if h.dropResp == nil || h.dropResp(resp) { + return nil + } + } + return h.RaftMessageHandler.HandleRaftResponse(ctx, resp) +} + +func (h *unreliableRaftHandler) HandleSnapshot( + header *storage.SnapshotRequest_Header, respStream storage.SnapshotResponseStream, +) error { + if header.RaftMessageRequest.RangeID == h.rangeID && h.dropSnap != nil { + if err := h.dropSnap(header); err != nil { + return err + } + } + return h.RaftMessageHandler.HandleSnapshot(header, respStream) +} + +// mtcStoreRaftMessageHandler exists to allows a store to be stopped and +// restarted while maintaining a partition using an unreliableRaftHandler. +type mtcStoreRaftMessageHandler struct { + mtc *multiTestContext + storeIdx int +} + +func (h *mtcStoreRaftMessageHandler) HandleRaftRequest( + ctx context.Context, + req *storage.RaftMessageRequest, + respStream storage.RaftMessageResponseStream, +) *roachpb.Error { + return h.mtc.Store(h.storeIdx).HandleRaftRequest(ctx, req, respStream) +} + +func (h *mtcStoreRaftMessageHandler) HandleRaftResponse( + ctx context.Context, resp *storage.RaftMessageResponse, +) error { + return h.mtc.Store(h.storeIdx).HandleRaftResponse(ctx, resp) +} + +func (h *mtcStoreRaftMessageHandler) HandleSnapshot( + header *storage.SnapshotRequest_Header, respStream storage.SnapshotResponseStream, +) error { + return h.mtc.Store(h.storeIdx).HandleSnapshot(header, respStream) +} + +// mtcPartitionedRange is a convenient abstraction to create a range on a node +// in a multiTestContext which can be partitioned and unpartitioned. +type mtcPartitionedRange struct { + rangeID roachpb.RangeID + mu struct { + syncutil.RWMutex + partitionedNode int + partitioned bool + partitionedReplicas map[roachpb.ReplicaID]bool + } + handlers []storage.RaftMessageHandler +} + +// setupPartitionedRange sets up an mtcPartitionedRange for the provided mtc, +// rangeID, and node index in the mtc. The range is initially not partitioned. +// +// We're going to set up the cluster with partitioning so that we can +// partition node p from the others. 
We do this by installing +// unreliableRaftHandler listeners on all three Stores which we can enable +// and disable with an atomic. The handler on the partitioned store filters +// out all messages while the handler on the other two stores only filters +// out messages from the partitioned store. When activated the configuration +// looks like: +// +// [p] +// x x +// / \ +// x x +// [*]<---->[*] +// +// The activated argument controls whether the partition is activated when this +// function returns. +// +// If replicaID is zero then it is resolved by looking up the replica for the +// partitionedNode of from the current range descriptor of rangeID. +func setupPartitionedRange( + mtc *multiTestContext, + rangeID roachpb.RangeID, + replicaID roachpb.ReplicaID, + partitionedNode int, + activated bool, + funcs dropFuncs, +) (*mtcPartitionedRange, error) { + handlers := make([]storage.RaftMessageHandler, 0, len(mtc.stores)) + for i := range mtc.stores { + handlers = append(handlers, &mtcStoreRaftMessageHandler{ + mtc: mtc, + storeIdx: i, + }) + } + return setupPartitionedRangeWithHandlers(mtc, rangeID, replicaID, partitionedNode, activated, handlers, funcs) +} + +func setupPartitionedRangeWithHandlers( + mtc *multiTestContext, + rangeID roachpb.RangeID, + replicaID roachpb.ReplicaID, + partitionedNode int, + activated bool, + handlers []storage.RaftMessageHandler, + funcs dropFuncs, +) (*mtcPartitionedRange, error) { + pr := &mtcPartitionedRange{ + rangeID: rangeID, + handlers: make([]storage.RaftMessageHandler, 0, len(handlers)), + } + pr.mu.partitioned = activated + pr.mu.partitionedNode = partitionedNode + if replicaID == 0 { + partRepl, err := mtc.Store(partitionedNode).GetReplica(rangeID) + if err != nil { + return nil, err + } + partReplDesc, err := partRepl.GetReplicaDescriptor() + if err != nil { + return nil, err + } + replicaID = partReplDesc.ReplicaID + } + pr.mu.partitionedReplicas = map[roachpb.ReplicaID]bool{ + replicaID: true, + } + for i := range mtc.stores { + s := i + h := &unreliableRaftHandler{ + rangeID: rangeID, + RaftMessageHandler: handlers[s], + dropFuncs: funcs, + } + // Only filter messages from the partitioned store on the other + // two stores. 
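		// Note that these defaults are only installed when the corresponding
		// hook in funcs is nil; callers can override any of them. For example,
		// the ReplicaTooOldError test added in this diff lets responses and
		// heartbeats through and drops only requests addressed to the
		// partitioned store, roughly:
		//
		//	setupPartitionedRange(mtc, rangeID, 0, 2, true, dropFuncs{
		//		dropReq:  func(req *storage.RaftMessageRequest) bool { return req.ToReplica.StoreID == 3 },
		//		dropHB:   func(*storage.RaftHeartbeat) bool { return false },
		//		dropResp: func(*storage.RaftMessageResponse) bool { return false },
		//	})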
+ if h.dropReq == nil { + h.dropReq = func(req *storage.RaftMessageRequest) bool { + pr.mu.RLock() + defer pr.mu.RUnlock() + return pr.mu.partitioned && + (s == pr.mu.partitionedNode || + req.FromReplica.StoreID == roachpb.StoreID(pr.mu.partitionedNode)+1) + } + } + if h.dropHB == nil { + h.dropHB = func(hb *storage.RaftHeartbeat) bool { + pr.mu.RLock() + defer pr.mu.RUnlock() + if !pr.mu.partitioned { + return false + } + if s == partitionedNode { + return true + } + return pr.mu.partitionedReplicas[hb.FromReplicaID] + } + } + if h.dropSnap == nil { + h.dropSnap = func(header *storage.SnapshotRequest_Header) error { + pr.mu.RLock() + defer pr.mu.RUnlock() + if !pr.mu.partitioned { + return nil + } + if pr.mu.partitionedReplicas[header.RaftMessageRequest.ToReplica.ReplicaID] { + return errors.New("partitioned") + } + return nil + } + } + pr.handlers = append(pr.handlers, h) + mtc.transport.Listen(mtc.stores[s].Ident.StoreID, h) + } + return pr, nil +} + +func (pr *mtcPartitionedRange) deactivate() { pr.set(false) } +func (pr *mtcPartitionedRange) activate() { pr.set(true) } +func (pr *mtcPartitionedRange) set(active bool) { + pr.mu.Lock() + defer pr.mu.Unlock() + pr.mu.partitioned = active +} + +func (pr *mtcPartitionedRange) addReplica(replicaID roachpb.ReplicaID) { + pr.mu.Lock() + defer pr.mu.Unlock() + pr.mu.partitionedReplicas[replicaID] = true +} + +func (pr *mtcPartitionedRange) extend( + mtc *multiTestContext, + rangeID roachpb.RangeID, + replicaID roachpb.ReplicaID, + partitionedNode int, + activated bool, + funcs dropFuncs, +) (*mtcPartitionedRange, error) { + return setupPartitionedRangeWithHandlers(mtc, rangeID, replicaID, partitionedNode, activated, pr.handlers, funcs) +} diff --git a/pkg/storage/client_raft_test.go b/pkg/storage/client_raft_test.go index 12335fcc96ea..0c510970ae52 100644 --- a/pkg/storage/client_raft_test.go +++ b/pkg/storage/client_raft_test.go @@ -33,6 +33,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/storage" "github.com/cockroachdb/cockroach/pkg/storage/engine" + "github.com/cockroachdb/cockroach/pkg/storage/stateloader" "github.com/cockroachdb/cockroach/pkg/storage/storagebase" "github.com/cockroachdb/cockroach/pkg/storage/storagepb" "github.com/cockroachdb/cockroach/pkg/testutils" @@ -973,16 +974,18 @@ func TestSnapshotAfterTruncationWithUncommittedTail(t *testing.T) { mtc.transport.Listen(mtc.stores[s].Ident.StoreID, &unreliableRaftHandler{ rangeID: 1, RaftMessageHandler: mtc.stores[s], - dropReq: func(req *storage.RaftMessageRequest) bool { - // Make sure that even going forward no MsgApp for what we just truncated can - // make it through. The Raft transport is asynchronous so this is necessary - // to make the test pass reliably. - // NB: the Index on the message is the log index that _precedes_ any of the - // entries in the MsgApp, so filter where msg.Index < index, not <= index. - return req.Message.Type == raftpb.MsgApp && req.Message.Index < index + dropFuncs: dropFuncs{ + dropReq: func(req *storage.RaftMessageRequest) bool { + // Make sure that even going forward no MsgApp for what we just truncated can + // make it through. The Raft transport is asynchronous so this is necessary + // to make the test pass reliably. + // NB: the Index on the message is the log index that _precedes_ any of the + // entries in the MsgApp, so filter where msg.Index < index, not <= index. 
+ return req.Message.Type == raftpb.MsgApp && req.Message.Index < index + }, + dropHB: func(*storage.RaftHeartbeat) bool { return false }, + dropResp: func(*storage.RaftMessageResponse) bool { return false }, }, - dropHB: func(*storage.RaftHeartbeat) bool { return false }, - dropResp: func(*storage.RaftMessageResponse) bool { return false }, }) } @@ -1192,15 +1195,12 @@ func TestReplicateAfterRemoveAndSplit(t *testing.T) { return err } - if err := replicateRHS(); !testutils.IsError(err, storage.IntersectingSnapshotMsg) { - t.Fatalf("unexpected error %v", err) - } - - // Enable the replica GC queue so that the next attempt to replicate the RHS - // to store 2 will cause the obsolete replica to be GC'd allowing a - // subsequent replication to succeed. - mtc.stores[2].SetReplicaGCQueueActive(true) - + // This used to fail with IntersectingSnapshotMsg because we relied on replica + // GC to remove the LHS and that queue is disabled. Now we will detect that + // the LHS is not part of the range because of a ReplicaTooOldError and then + // we'll replicaGC the LHS in response. + // TODO(ajwerner): filter the reponses to node 2 or disable this eager + // replicaGC. testutils.SucceedsSoon(t, replicateRHS) } @@ -1883,6 +1883,7 @@ func testReplicaAddRemove(t *testing.T, addFirst bool) { // replica GC queue does its work, so we disable the replica gc queue here // and run it manually when we're ready. sc.TestingKnobs.DisableReplicaGCQueue = true + sc.TestingKnobs.DisableEagerReplicaRemoval = true mtc := &multiTestContext{ storeConfig: &sc, // This test was written before the multiTestContext started creating many @@ -3000,6 +3001,10 @@ func TestReplicateRogueRemovedNode(t *testing.T) { defer mtc.Stop() mtc.Start(t, 3) + // We're going to set up the cluster with partitioning so that we can + // partition node 0 from the others. The partition is not initially active. + partRange, err := setupPartitionedRange(mtc, 1, 0, 0, false /* activated */, dropFuncs{}) + require.NoError(t, err) // First put the range on all three nodes. raftID := roachpb.RangeID(1) mtc.replicateRange(raftID, 1, 2) @@ -3044,7 +3049,9 @@ func TestReplicateRogueRemovedNode(t *testing.T) { } return nil }) - + // Partition nodes 1 and 2 from node 0. Otherwise they'd get a + // ReplicaTooOldError from node 0 and proceed to remove themselves. + partRange.activate() // Bring node 2 back up. mtc.restartStore(2) @@ -3547,6 +3554,7 @@ func TestRemoveRangeWithoutGC(t *testing.T) { sc := storage.TestStoreConfig(nil) sc.TestingKnobs.DisableReplicaGCQueue = true + sc.TestingKnobs.DisableEagerReplicaRemoval = true mtc := &multiTestContext{storeConfig: &sc} defer mtc.Stop() mtc.Start(t, 2) @@ -3597,12 +3605,21 @@ func TestRemoveRangeWithoutGC(t *testing.T) { mtc.advanceClock(context.TODO()) mtc.manualClock.Increment(int64(storage.ReplicaGCQueueInactivityThreshold + 1)) mtc.stores[0].SetReplicaGCQueueActive(true) - mtc.stores[0].MustForceReplicaGCScanAndProcess() + // There's a fun flake where between when the queue detects that this replica + // needs to be removed and when it actually gets processed whereby an older + // replica will send this replica a raft message which will give it an ID. + // When our replica ID changes the queue will ignore the previous addition and + // we won't be removed. + testutils.SucceedsSoon(t, func() error { + mtc.stores[0].MustForceReplicaGCScanAndProcess() - // The Replica object should be removed. 
- if _, err := mtc.stores[0].GetReplica(rangeID); !testutils.IsError(err, "r[0-9]+ was not found") { - t.Fatalf("expected replica to be missing; got %v", err) - } + // The Replica object should be removed. + const msg = "r[0-9]+ was not found" + if _, err := mtc.stores[0].GetReplica(rangeID); !testutils.IsError(err, msg) { + return errors.Errorf("expected %s, got %v", msg, err) + } + return nil + }) // And the data should no longer be on disk. if ok, err := engine.MVCCGetProto(context.Background(), mtc.stores[0].Engine(), descKey, @@ -4327,8 +4344,10 @@ func TestTracingDoesNotRaceWithCancelation(t *testing.T) { mtc.transport.Listen(mtc.stores[i].Ident.StoreID, &unreliableRaftHandler{ rangeID: ri.Desc.RangeID, RaftMessageHandler: mtc.stores[i], - dropReq: func(req *storage.RaftMessageRequest) bool { - return rand.Intn(2) == 0 + dropFuncs: dropFuncs{ + dropReq: func(req *storage.RaftMessageRequest) bool { + return rand.Intn(2) == 0 + }, }, }) } @@ -4595,3 +4614,465 @@ func TestAckWriteBeforeApplication(t *testing.T) { }) } } + +// TestProcessSplitAfterRightHandSideHasBeenRemoved tests cases where we have +// a follower replica which has received information about the RHS of a split +// before it has processed that split. The replica can't both have an +// initialized RHS and process the split but it can have (1) an uninitialized +// RHS with a higher replica ID than lives in the split and (2) a RHS with +// an unknown replica ID and a tombstone at exactly the replica ID of the RHS. +// It may learn about a newer replica ID than the split without ever hearing +// about the split replica. If it does not crash (3) it will know that the +// split replica is too old and will not initialize it. If the node does +// crash (4) it will forget it had learned about the higher replica ID and +// will initialize the RHS as the split replica. +// +// Starting in 19.2 if a replica discovers from a raft message that it is an +// old replica then it knows that it has been removed and re-added to the range. +// In this case the Replica eagerly destroys itself and its data. +// +// Given this behavior there are 4 troubling cases with regards to splits. +// +// * In all cases we begin with s1 processing a presplit snapshot for +// r20. After the split the store should have r21/3. +// +// In the first two cases the following occurs: +// +// * s1 receives a message for r21/3 prior to acquiring the split lock +// in r21. This will create an uninitialized r21/3 which may write +// HardState. +// +// * Before the r20 processes the split r21 is removed and re-added to +// s1 as r21/4. s1 receives a raft message destined for r21/4 and proceeds +// to destroy its uninitialized r21/3, laying down a tombstone at 4 in the +// process. +// +// (1) s1 processes the split and finds the RHS to be an uninitialized replica +// with a higher replica ID. +// +// (2) s1 crashes before processing the split, forgetting the replica ID of the +// RHS but retaining its tombstone. +// +// In both cases we know that the RHS could not have committed anything because +// it cannot have gotten a snapshot but we want to be sure to not synthesize a +// HardState for the RHS that contains a non-zero commit index if we know that +// the RHS will need another snapshot later. +// +// In the third case: +// +// * s1 never receives a message for r21/3. +// +// * Before the r20 processes the split r21 is removed and re-added to +// s1 as r21/4. s1 receives a raft message destined for r21/4 and has never +// heard about r21/3. 
+// +// (3) s1 processes the split and finds the RHS to be an uninitialized replica +// with a higher replica ID (but without a tombstone). This case is very +// similar to (1) +// +// (4) s1 crashes still before processing the split, forgetting that it had +// known about r21/4. When it reboots r21/4 is totally partitioned and +// r20 becomes unpartitioned. +// +// * r20 processes the split successfully and initialized r21/3. +// +// In the 3rd case we find that until we unpartition r21/4 =, there will be +// a CommitIndex at 10 for initialized replica r21/3, the initial value. After +// r21/4 becomes unpartitioned it will learn it is removed by catching up on +// its log or receiving a ReplicaTooOldError and will write a tombstone. +// +func TestProcessSplitAfterRightHandSideHasBeenRemoved(t *testing.T) { + defer leaktest.AfterTest(t)() + sc := storage.TestStoreConfig(nil) + // Newly-started stores (including the "rogue" one) should not GC + // their replicas. We'll turn this back on when needed. + sc.TestingKnobs.DisableReplicaGCQueue = true + sc.RaftDelaySplitToSuppressSnapshotTicks = 0 + // Make the tick interval short so we don't need to wait too long for the + // partitioned leader to time out. Also make the + // RangeLeaseRaftElectionTimeout multiplier high so that system ranges + // like node liveness can actually get leases. + sc.RaftTickInterval = 10 * time.Millisecond + sc.RangeLeaseRaftElectionTimeoutMultiplier = 1000 + noopProposalFilter := storagebase.ReplicaProposalFilter(func(args storagebase.ProposalFilterArgs) *roachpb.Error { + return nil + }) + var proposalFilter atomic.Value + proposalFilter.Store(noopProposalFilter) + sc.TestingKnobs.TestingProposalFilter = func(args storagebase.ProposalFilterArgs) *roachpb.Error { + return proposalFilter.Load().(storagebase.ReplicaProposalFilter)(args) + } + + ctx := context.Background() + increment := func(t *testing.T, db *client.DB, key roachpb.Key, by int64) { + b := &client.Batch{} + b.AddRawRequest(incrementArgs(key, by)) + require.NoError(t, db.Run(ctx, b)) + } + changeReplicas := func( + t *testing.T, db *client.DB, typ roachpb.ReplicaChangeType, key roachpb.Key, idx int, + ) error { + ri, err := getRangeInfo(ctx, db, key) + require.NoError(t, err) + _, err = db.AdminChangeReplicas(ctx, ri.Desc.StartKey.AsRawKey(), ri.Desc, + roachpb.MakeReplicationChanges(typ, makeReplicationTargets(idx+1)...)) + return err + } + split := func(t *testing.T, db *client.DB, key roachpb.Key) { + b := &client.Batch{} + b.AddRawRequest(adminSplitArgs(key)) + require.NoError(t, db.Run(ctx, b)) + } + ensureNoTombstone := func(t *testing.T, store *storage.Store, rangeID roachpb.RangeID) { + var tombstone roachpb.RaftTombstone + tombstoneKey := keys.RaftTombstoneKey(rangeID) + ok, err := engine.MVCCGetProto( + ctx, store.Engine(), tombstoneKey, hlc.Timestamp{}, &tombstone, engine.MVCCGetOptions{}, + ) + require.NoError(t, err) + require.False(t, ok) + } + getHardState := func( + t *testing.T, store *storage.Store, rangeID roachpb.RangeID, + ) raftpb.HardState { + hs, err := stateloader.Make(rangeID).LoadHardState(ctx, store.Engine()) + require.NoError(t, err) + return hs + } + partitionReplicaOnSplit := func(t *testing.T, mtc *multiTestContext, key roachpb.Key, basePartition *mtcPartitionedRange, partRange **mtcPartitionedRange) { + // Set up a hook to partition the RHS range at its initial range ID + // before proposing the split trigger. 
+ var setupOnce sync.Once + f := storagebase.ReplicaProposalFilter(func(args storagebase.ProposalFilterArgs) *roachpb.Error { + req, ok := args.Req.GetArg(roachpb.EndTransaction) + if !ok { + return nil + } + endTxn := req.(*roachpb.EndTransactionRequest) + if endTxn.InternalCommitTrigger == nil || endTxn.InternalCommitTrigger.SplitTrigger == nil { + return nil + } + split := endTxn.InternalCommitTrigger.SplitTrigger + + if !split.RightDesc.StartKey.Equal(key) { + return nil + } + setupOnce.Do(func() { + replDesc, ok := split.RightDesc.GetReplicaDescriptor(1) + require.True(t, ok) + var err error + *partRange, err = basePartition.extend(mtc, split.RightDesc.RangeID, replDesc.ReplicaID, + 0 /* partitionedNode */, true /* activated */, dropFuncs{}) + require.NoError(t, err) + proposalFilter.Store(noopProposalFilter) + }) + return nil + }) + proposalFilter.Store(f) + } + + // The basic setup for all of these tests are that we have a LHS range on 3 + // nodes and we've partitioned store 0 for the LHS range. The tests will now + // perform a split, remove the RHS, add it back and validate assumptions. + // + // Different outcomes will occur depending on whether and how the RHS is + // partitioned and whether the server is killed. In all cases we want the + // split to succeed and the RHS to eventually also be on all 3 nodes. + setup := func(t *testing.T) ( + mtc *multiTestContext, + db *client.DB, + keyA, keyB roachpb.Key, + lhsID roachpb.RangeID, + lhsPartition *mtcPartitionedRange, + ) { + mtc = &multiTestContext{ + storeConfig: &sc, + } + mtc.Start(t, 3) + + db = mtc.Store(1).DB() + + // Split off a non-system range so we don't have to account for node liveness + // traffic. + scratchTableKey := keys.MakeTablePrefix(math.MaxUint32) + // Put some data in the range so we'll have something to test for. + keyA = append(append(roachpb.Key{}, scratchTableKey...), 'a') + keyB = append(append(roachpb.Key{}, scratchTableKey...), 'b') + + split(t, db, scratchTableKey) + ri, err := getRangeInfo(ctx, db, scratchTableKey) + require.Nil(t, err) + lhsID = ri.Desc.RangeID + // First put the range on all three nodes. + mtc.replicateRange(lhsID, 1, 2) + + // Set up a partition for the LHS range only. Initially it is not active. + lhsPartition, err = setupPartitionedRange(mtc, lhsID, + 0 /* replicaID */, 0 /* partitionedNode */, false /* activated */, dropFuncs{}) + require.NoError(t, err) + // Wait for all nodes to catch up. + increment(t, db, keyA, 5) + mtc.waitForValues(keyA, []int64{5, 5, 5}) + + // Transfer the lease off of node 0. + mtc.transferLease(ctx, lhsID, 0, 2) + + // Make sure everybody knows about that transfer. + increment(t, db, keyA, 1) + mtc.waitForValues(keyA, []int64{6, 6, 6}) + lhsPartition.activate() + + increment(t, db, keyA, 1) + mtc.waitForValues(keyA, []int64{6, 7, 7}) + return mtc, db, keyA, keyB, lhsID, lhsPartition + } + + // In this case we only have the LHS partitioned. The RHS will learn about its + // identity as the replica in the split and after being re-added will learn + // about the new replica ID and will lay down a tombstone. At this point we'll + // partition the RHS and ensure that the split does not clobber the RHS's hard + // state. + t.Run("no RHS partition", func(t *testing.T) { + mtc, db, keyA, keyB, _, lhsPartition := setup(t) + defer mtc.Stop() + + split(t, db, keyB) + + // Write a value which we can observe to know when the split has been + // applied by the LHS. 
+ increment(t, db, keyA, 1) + mtc.waitForValues(keyA, []int64{6, 8, 8}) + + increment(t, db, keyB, 6) + // Wait for all non-partitioned nodes to catch up. + mtc.waitForValues(keyB, []int64{0, 6, 6}) + + rhsInfo, err := getRangeInfo(ctx, db, keyB) + require.NoError(t, err) + rhsID := rhsInfo.Desc.RangeID + _, store0Exists := rhsInfo.Desc.GetReplicaDescriptor(1) + require.True(t, store0Exists) + + // Remove and re-add the RHS to create a new uninitialized replica at + // a higher replica ID. This will lead to a tombstone being written. + require.NoError(t, changeReplicas(t, db, roachpb.REMOVE_REPLICA, keyB, 0)) + // Unsuccessfuly because the RHS will not accept the learner snapshot + // and will be rolled back. Nevertheless it will have learned that it + // has been removed at the old replica ID. + err = changeReplicas(t, db, roachpb.ADD_REPLICA, keyB, 0) + require.True(t, + testutils.IsError(err, "snapshot failed.*cannot apply snapshot: snapshot intersects"), err) + + // Without a partitioned RHS we'll end up always writing a tombstone here because + // the RHS will be created at the initial replica ID because it will get + // raft message when the other nodes split and then after the above call + // it will find out about its new replica ID and write a tombstone for the + // old one. + waitForTombstone(t, mtc.Store(0).Engine(), rhsID) + lhsPartition.deactivate() + mtc.waitForValues(keyA, []int64{8, 8, 8}) + hs := getHardState(t, mtc.Store(0), rhsID) + require.Equal(t, uint64(0), hs.Commit) + testutils.SucceedsSoon(t, func() error { + return changeReplicas(t, db, roachpb.ADD_REPLICA, keyB, 0) + }) + mtc.waitForValues(keyB, []int64{6, 6, 6}) + }) + + // This case is like the previous case except the store crashes after + // laying down a tombstone. + t.Run("no RHS partition, with restart", func(t *testing.T) { + mtc, db, keyA, keyB, _, lhsPartition := setup(t) + defer mtc.Stop() + + split(t, db, keyB) + + // Write a value which we can observe to know when the split has been + // applied by the LHS. + increment(t, db, keyA, 1) + mtc.waitForValues(keyA, []int64{6, 8, 8}) + + increment(t, db, keyB, 6) + // Wait for all non-partitioned nodes to catch up. + mtc.waitForValues(keyB, []int64{0, 6, 6}) + + rhsInfo, err := getRangeInfo(ctx, db, keyB) + require.NoError(t, err) + rhsID := rhsInfo.Desc.RangeID + _, store0Exists := rhsInfo.Desc.GetReplicaDescriptor(1) + require.True(t, store0Exists) + + // Remove and re-add the RHS to create a new uninitialized replica at + // a higher replica ID. This will lead to a tombstone being written. + require.NoError(t, changeReplicas(t, db, roachpb.REMOVE_REPLICA, keyB, 0)) + // Unsuccessfuly because the RHS will not accept the learner snapshot + // and will be rolled back. Nevertheless it will have learned that it + // has been removed at the old replica ID. + err = changeReplicas(t, db, roachpb.ADD_REPLICA, keyB, 0) + require.True(t, + testutils.IsError(err, "snapshot failed.*cannot apply snapshot: snapshot intersects"), err) + + // Without a partitioned RHS we'll end up always writing a tombstone here because + // the RHS will be created at the initial replica ID because it will get + // raft message when the other nodes split and then after the above call + // it will find out about its new replica ID and write a tombstone for the + // old one. + waitForTombstone(t, mtc.Store(0).Engine(), rhsID) + + // We do all of this incrementing to ensure that nobody will ever + // succeed in sending a message the new RHS replica after we restart + // the store. 
Previously there were races which could happen if we + // stopped the store immediately. Sleeps worked but this feels somehow + // more principled. + curB := int64(6) + for curB < 100 { + curB++ + increment(t, db, keyB, 1) + mtc.waitForValues(keyB, []int64{0, curB, curB}) + } + + // Restart store 0 so that it forgets about the newer replicaID. + mtc.stopStore(0) + mtc.restartStore(0) + + lhsPartition.deactivate() + mtc.waitForValues(keyA, []int64{8, 8, 8}) + hs := getHardState(t, mtc.Store(0), rhsID) + require.Equal(t, uint64(0), hs.Commit) + testutils.SucceedsSoon(t, func() error { + return changeReplicas(t, db, roachpb.ADD_REPLICA, keyB, 0) + }) + mtc.waitForValues(keyB, []int64{curB, curB, curB}) + }) + + // In this case the RHS will be partitioned from hearing anything about + // the initial replica ID of the RHS after the split. It will learn about + // the higher replica ID and have that higher replica ID in memory when + // the split is processed. We partition the RHS's new replica ID before + // processing the split to ensure that the RHS doesn't get initialized. + t.Run("initial replica RHS partition, no restart", func(t *testing.T) { + mtc, db, keyA, keyB, _, lhsPartition := setup(t) + defer mtc.Stop() + var rhsPartition *mtcPartitionedRange + partitionReplicaOnSplit(t, mtc, keyB, lhsPartition, &rhsPartition) + split(t, db, keyB) + + // Write a value which we can observe to know when the split has been + // applied by the LHS. + increment(t, db, keyA, 1) + mtc.waitForValues(keyA, []int64{6, 8, 8}) + + increment(t, db, keyB, 6) + // Wait for all non-partitioned nodes to catch up. + mtc.waitForValues(keyB, []int64{0, 6, 6}) + + rhsInfo, err := getRangeInfo(ctx, db, keyB) + require.NoError(t, err) + rhsID := rhsInfo.Desc.RangeID + _, store0Exists := rhsInfo.Desc.GetReplicaDescriptor(1) + require.True(t, store0Exists) + + // Remove and re-add the RHS to create a new uninitialized replica at + // a higher replica ID. This will lead to a tombstone being written. + require.NoError(t, changeReplicas(t, db, roachpb.REMOVE_REPLICA, keyB, 0)) + // Unsuccessfuly because the RHS will not accept the learner snapshot + // and will be rolled back. Nevertheless it will have learned that it + // has been removed at the old replica ID. + err = changeReplicas(t, db, roachpb.ADD_REPLICA, keyB, 0) + require.True(t, + testutils.IsError(err, "snapshot failed.*cannot apply snapshot: snapshot intersects"), err) + rhsPartition.addReplica(rhsInfo.Desc.NextReplicaID) + + // Ensure that there's no tombstone. + // The RHS on store 0 never should have heard about its original ID. + ensureNoTombstone(t, mtc.Store(0), rhsID) + lhsPartition.deactivate() + mtc.waitForValues(keyA, []int64{8, 8, 8}) + hs := getHardState(t, mtc.Store(0), rhsID) + require.Equal(t, uint64(0), hs.Commit) + // Now succeed in adding the RHS. + testutils.SucceedsSoon(t, func() error { + return changeReplicas(t, db, roachpb.ADD_REPLICA, keyB, 0) + }) + mtc.waitForValues(keyB, []int64{6, 6, 6}) + }) + + // This case is set up like the previous one except after the RHS learns about + // its higher replica ID the store crahes and forgets. The RHS replica gets + // initialized by the split. 
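The check that distinguishes this last case reads the RHS's raft HardState directly from the engine, mirroring the getHardState helper defined earlier; a minimal sketch, where store, rangeID, and ctx stand in for the test's variables:

hs, err := stateloader.Make(rangeID).LoadHardState(ctx, store.Engine())
require.NoError(t, err)
// Here the split did initialize the RHS, so Commit is expected to be the
// initial commit index (10) rather than zero as in the earlier subtests.
require.Equal(t, uint64(10), hs.Commit)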
+ t.Run("initial replica RHS partition, with restart", func(t *testing.T) { + mtc, db, keyA, keyB, _, lhsPartition := setup(t) + defer mtc.Stop() + var rhsPartition *mtcPartitionedRange + + partitionReplicaOnSplit(t, mtc, keyB, lhsPartition, &rhsPartition) + split(t, db, keyB) + + // Write a value which we can observe to know when the split has been + // applied by the LHS. + increment(t, db, keyA, 1) + mtc.waitForValues(keyA, []int64{6, 8, 8}) + + increment(t, db, keyB, 6) + // Wait for all non-partitioned nodes to catch up. + mtc.waitForValues(keyB, []int64{0, 6, 6}) + + rhsInfo, err := getRangeInfo(ctx, db, keyB) + require.NoError(t, err) + rhsID := rhsInfo.Desc.RangeID + _, store0Exists := rhsInfo.Desc.GetReplicaDescriptor(1) + require.True(t, store0Exists) + + // Remove and re-add the RHS to create a new uninitialized replica at + // a higher replica ID. This will lead to a tombstone being written. + require.NoError(t, changeReplicas(t, db, roachpb.REMOVE_REPLICA, keyB, 0)) + // Unsuccessfuly because the RHS will not accept the learner snapshot + // and will be rolled back. Nevertheless it will have learned that it + // has been removed at the old replica ID. + err = changeReplicas(t, db, roachpb.ADD_REPLICA, keyB, 0) + require.True(t, + testutils.IsError(err, "snapshot failed.*cannot apply snapshot: snapshot intersects"), err) + // Ensure that there's no tombstone. + // The RHS on store 0 never should have heard about its original ID. + ensureNoTombstone(t, mtc.Store(0), rhsID) + + // Now, before we deactivate the LHS partition, partition the newer replica + // on the RHS too. + rhsPartition.addReplica(rhsInfo.Desc.NextReplicaID) + + // We do all of this incrementing to ensure that nobody will ever + // succeed in sending a message the new RHS replica after we restart + // the store. Previously there were races which could happen if we + // stopped the store immediately. Sleeps worked but this feels somehow + // more principled. + curB := int64(6) + for curB < 100 { + curB++ + increment(t, db, keyB, 1) + mtc.waitForValues(keyB, []int64{0, curB, curB}) + } + + // Restart store 0 so that it forgets about the newer replicaID. + mtc.stopStore(0) + mtc.restartStore(0) + + lhsPartition.deactivate() + mtc.waitForValues(keyA, []int64{8, 8, 8}) + // In this case the store has forgotten that it knew the RHS of the split + // could not exist. We ensure that it has been initialized to the initial + // commit value, which is 10. + testutils.SucceedsSoon(t, func() error { + hs := getHardState(t, mtc.Store(0), rhsID) + if hs.Commit != uint64(10) { + return errors.Errorf("hard state not yet initialized: got %v, expected %v", + hs.Commit, uint64(10)) + } + return nil + }) + rhsPartition.deactivate() + testutils.SucceedsSoon(t, func() error { + return changeReplicas(t, db, roachpb.ADD_REPLICA, keyB, 0) + }) + mtc.waitForValues(keyB, []int64{curB, curB, curB}) + }) +} diff --git a/pkg/storage/client_replica_gc_test.go b/pkg/storage/client_replica_gc_test.go index 7b8e41cf1b36..4162ffca8d85 100644 --- a/pkg/storage/client_replica_gc_test.go +++ b/pkg/storage/client_replica_gc_test.go @@ -148,8 +148,11 @@ func TestReplicaGCQueueDropReplicaDirect(t *testing.T) { // removes a range from a store that no longer should have a replica. 
func TestReplicaGCQueueDropReplicaGCOnScan(t *testing.T) { defer leaktest.AfterTest(t)() - mtc := &multiTestContext{} + cfg := storage.TestStoreConfig(nil) + cfg.TestingKnobs.DisableEagerReplicaRemoval = true + mtc.storeConfig = &cfg + defer mtc.Stop() mtc.Start(t, 3) // Disable the replica gc queue to prevent direct removal of replica. diff --git a/pkg/storage/client_replica_test.go b/pkg/storage/client_replica_test.go index 48d3203557c6..d2ed5d8af2ea 100644 --- a/pkg/storage/client_replica_test.go +++ b/pkg/storage/client_replica_test.go @@ -2134,6 +2134,226 @@ func TestRandomConcurrentAdminChangeReplicasRequests(t *testing.T) { } } +// TestReplicaTombstone ensures that tombstones are written when we expect +// them to be. +// +// Tombstones are laid down when replicas are removed. +// Replicas are removed for several reasons: +// +// 1) In response to a ChangeReplicasTrigger which removes it. +// 2) In response to a ReplicaTooOldError from a sent raft message. +// 3) Due to the replicaGCQueue. +// 3.1) When this detects a merge. +// 4) Due to a raft message addressed to a newer replica ID. +// 4.1) When the older replica is not initialized. +// 5) Due to a merge. +// 6) Due to snapshot which subsumes a range. +// +// This test creates all of these scenarios and ensures that tombstones are +// written. +func TestReplicaTombstone(t *testing.T) { + defer leaktest.AfterTest(t)() + split := func(t *testing.T, db *client.DB, key roachpb.Key) { + b := &client.Batch{} + b.AddRawRequest(adminSplitArgs(key)) + require.NoError(t, db.Run(context.TODO(), b)) + } + t.Run("ChangeReplicasTrigger", func(t *testing.T) { + defer leaktest.AfterTest(t)() + + sc := storage.TestStoreConfig(nil) + // We'll control replication by hand. + sc.TestingKnobs.DisableReplicateQueue = true + // Avoid fighting with the merge queue while trying to reproduce this race. + sc.TestingKnobs.DisableMergeQueue = true + mtc := &multiTestContext{storeConfig: &sc} + defer mtc.Stop() + mtc.Start(t, 2) + + keyA := roachpb.Key("a") + + repl0 := mtc.stores[0].LookupReplica(roachpb.RKey(keyA)) + rangeID := repl0.RangeID + // Partition node 2 from sending traffic but not from receiving it. + // Replicate the range onto nodes 1 and 2. + mtc.replicateRange(rangeID, 1) + // Partition the range such that it doesn't hear responses but it hears + // everything else. + part, err := setupPartitionedRange(mtc, rangeID, 0, 1, true, dropFuncs{ + dropResp: func(*storage.RaftMessageResponse) bool { + return true + }, + dropReq: func(*storage.RaftMessageRequest) bool { + return false + }, + dropHB: func(*storage.RaftHeartbeat) bool { + return false + }, + }) + defer part.deactivate() + require.NoError(t, err) + mtc.unreplicateRange(rangeID, 1) + tombstone := waitForTombstone(t, mtc.Store(1).Engine(), rangeID) + require.Equal(t, roachpb.ReplicaID(3), tombstone.NextReplicaID) + }) + t.Run("ReplicaTooOldError", func(t *testing.T) { + defer leaktest.AfterTest(t)() + + sc := storage.TestStoreConfig(nil) + // We'll control replication by hand. + sc.TestingKnobs.DisableReplicateQueue = true + // Avoid fighting with the merge queue while trying to reproduce this race. + sc.TestingKnobs.DisableMergeQueue = true + // Send lots of heartbeats. 
+		sc.RaftTickInterval = 10 * time.Millisecond
+		sc.RangeLeaseRaftElectionTimeoutMultiplier = 1000
+
+		mtc := &multiTestContext{storeConfig: &sc}
+		defer mtc.Stop()
+		mtc.Start(t, 3)
+
+		keyA := roachpb.Key("a")
+
+		repl0 := mtc.stores[0].LookupReplica(roachpb.RKey(keyA))
+		rangeID := repl0.RangeID
+		// Partition node 2 from sending traffic but not from receiving it.
+		// Replicate the range onto nodes 1 and 2.
+		mtc.replicateRange(rangeID, 1, 2)
+		// Partition the range such that it hears responses but does not hear
+		// requests. It should destroy the local replica due to a
+		// ReplicaTooOldError.
+		sawTooOld := make(chan struct{}, 1)
+		part, err := setupPartitionedRange(mtc, rangeID, 0, 2, true, dropFuncs{
+			dropResp: func(resp *storage.RaftMessageResponse) bool {
+				if resp.ToReplica.StoreID == 3 {
+					switch val := resp.Union.GetValue().(type) {
+					case *roachpb.Error:
+						switch val.GetDetail().(type) {
+						case *roachpb.ReplicaTooOldError:
+							select {
+							case sawTooOld <- struct{}{}:
+							default:
+							}
+						}
+					}
+				}
+				return false
+			},
+			dropReq: func(req *storage.RaftMessageRequest) bool {
+				return req.ToReplica.StoreID == 3
+			},
+			dropHB: func(*storage.RaftHeartbeat) bool {
+				return false
+			},
+		})
+		defer part.deactivate()
+		require.NoError(t, err)
+		mtc.unreplicateRange(rangeID, 2)
+		// Write a value to ensure that the remaining replicas have processed the
+		// removal.
+		args := incrementArgs(keyA, 1)
+		_, pErr := client.SendWrappedWith(context.Background(), mtc.Store(0).TestSender(),
+			roachpb.Header{RangeID: rangeID}, args)
+		require.Nil(t, pErr)
+		mtc.waitForValues(keyA, []int64{1, 1, 0})
+
+		// Advance the clock to expire all leases.
+		mtc.advanceClock(context.TODO())
+
+		// Try to send a request to the partitioned range so that it sends raft
+		// messages when it attempts to campaign.
+		_, pErr = client.SendWrappedWith(context.Background(), mtc.Store(2).TestSender(),
+			roachpb.Header{RangeID: rangeID}, args)
+		// The replica may return NotLeaseHolderError or may have already discovered
+		// that it's been removed.
+		require.True(t, testutils.IsPError(pErr, "NotLeaseHolderError|was not found on s3"), pErr)
+		// Wait until we're sure that the replica has seen ReplicaTooOld.
+		<-sawTooOld
+		// We want to make sure that we actually see the tombstone being written.
+		tombstone := waitForTombstone(t, mtc.Store(2).Engine(), rangeID)
+		require.Equal(t, roachpb.ReplicaID(4), tombstone.NextReplicaID)
+	})
+	t.Run("ReplicaGCQueue (moved)", func(t *testing.T) {
+		defer leaktest.AfterTest(t)()
+
+		sc := storage.TestStoreConfig(nil)
+		// We'll control replication by hand.
+		sc.TestingKnobs.DisableReplicateQueue = true
+		// Avoid fighting with the merge queue while trying to reproduce this race.
+		sc.TestingKnobs.DisableMergeQueue = true
+
+		mtc := &multiTestContext{storeConfig: &sc}
+		defer mtc.Stop()
+		mtc.Start(t, 3)
+
+		keyA := roachpb.Key("a")
+
+		repl0 := mtc.stores[0].LookupReplica(roachpb.RKey(keyA))
+		rangeID := repl0.RangeID
+		// Partition node 2 from sending traffic but not from receiving it.
+		// Replicate the range onto nodes 1 and 2.
+		mtc.replicateRange(rangeID, 1, 2)
+		// Partition the range such that it doesn't hear anything.
+		// Use the replica GC queue to remove it.
+ part, err := setupPartitionedRange(mtc, rangeID, 0, 2, true, dropFuncs{}) + require.NoError(t, err) + defer part.deactivate() + mtc.unreplicateRange(rangeID, 2) + repl, err := mtc.Store(2).GetReplica(rangeID) + require.NoError(t, err) + require.NoError(t, mtc.Store(2).ManualReplicaGC(repl)) + tombstone := waitForTombstone(t, mtc.Store(2).Engine(), rangeID) + require.Equal(t, roachpb.ReplicaID(4), tombstone.NextReplicaID) + }) + // This case also detects the tombstone for nodes which processed the merge. + t.Run("ReplicaGCQueue (merged)", func(t *testing.T) { + defer leaktest.AfterTest(t)() + + sc := storage.TestStoreConfig(nil) + // We'll control replication by hand. + sc.TestingKnobs.DisableReplicateQueue = true + // Avoid fighting with the merge queue while trying to reproduce this race. + sc.TestingKnobs.DisableMergeQueue = true + + mtc := &multiTestContext{storeConfig: &sc} + defer mtc.Stop() + mtc.Start(t, 4) + + scratchTableKey := keys.MakeTablePrefix(math.MaxUint32) + keyA := append(append(roachpb.Key{}, scratchTableKey...), 'a') + keyB := append(append(roachpb.Key{}, scratchTableKey...), 'b') + split(t, mtc.Store(0).DB(), keyA) + split(t, mtc.Store(0).DB(), keyB) + + lhsRepl0 := mtc.stores[0].LookupReplica(roachpb.RKey(keyA)) + lhsID := lhsRepl0.RangeID + mtc.replicateRange(lhsID, 1, 3) + + rhsRepl0 := mtc.stores[0].LookupReplica(roachpb.RKey(keyB)) + rhsID := rhsRepl0.RangeID + mtc.replicateRange(rhsID, 1, 2) + + // Partition the range such that it doesn't hear anything. + // Use the replica GC queue to remove it. + part, err := setupPartitionedRange(mtc, rhsID, 0, 2, true, dropFuncs{}) + require.NoError(t, err) + defer part.deactivate() + mtc.unreplicateRange(rhsID, 2) + mtc.replicateRange(rhsID, 3) + require.NoError(t, mtc.Store(0).DB().AdminMerge(context.TODO(), keyA)) + repl, err := mtc.Store(2).GetReplica(rhsID) + require.NoError(t, err) + require.NoError(t, mtc.Store(2).ManualReplicaGC(repl)) + tombstone := waitForTombstone(t, mtc.Store(2).Engine(), rhsID) + require.Equal(t, roachpb.ReplicaID(math.MaxInt32), tombstone.NextReplicaID) + tombstone = waitForTombstone(t, mtc.Store(3).Engine(), rhsID) + require.Equal(t, roachpb.ReplicaID(math.MaxInt32), tombstone.NextReplicaID) + }) + // TODO(ajwerner): test the subsumption case by blocking all messages after + // the merge has entered its critical phase and then wait the the LHS to + // get a snapshot. +} + // TestAdminRelocateRangeSafety exercises a situation where calls to // AdminRelocateRange can race with calls to ChangeReplicas and verifies // that such races do not leave the range in an under-replicated state. diff --git a/pkg/storage/client_split_test.go b/pkg/storage/client_split_test.go index 6db3d819b5f9..f3cb67ba9fc8 100644 --- a/pkg/storage/client_split_test.go +++ b/pkg/storage/client_split_test.go @@ -3314,8 +3314,12 @@ func TestSplitTriggerMeetsUnexpectedReplicaID(t *testing.T) { // different replicaID than the split trigger expects. add := func() { _, err := tc.AddReplicas(kRHS, tc.Target(1)) - if !testutils.IsError(err, `snapshot intersects existing range`) { - t.Fatalf(`expected snapshot intersects existing range" error got: %+v`, err) + // The "snapshot intersects existing range" error is expected if the store + // has not heard a raft message addressed to a later replica ID while the + // "was not found on" error is expected if the store has heard that it has + // a newer replica ID before receiving the snapshot. 
+ if !testutils.IsError(err, `snapshot intersects existing range|r[0-9]+ was not found on s[0-9]+`) { + t.Fatalf(`expected snapshot intersects existing range|r[0-9]+ was not found on s[0-9]+" error got: %+v`, err) } } for i := 0; i < 5; i++ { @@ -3361,7 +3365,8 @@ func TestSplitTriggerMeetsUnexpectedReplicaID(t *testing.T) { if err != nil { return err } - if desc := repl.Desc(); !descLHS.Equal(desc) { + if desc := repl.Desc(); desc.IsInitialized() && !descLHS.Equal(desc) { + require.NoError(t, store.ManualReplicaGC(repl)) return errors.Errorf("expected %s got %s", &descLHS, desc) } return nil diff --git a/pkg/storage/client_test.go b/pkg/storage/client_test.go index 11c57695ff08..1f1a0803ca95 100644 --- a/pkg/storage/client_test.go +++ b/pkg/storage/client_test.go @@ -1061,8 +1061,8 @@ func (m *multiTestContext) restartStore(i int) { } func (m *multiTestContext) Store(i int) *storage.Store { - m.mu.Lock() - defer m.mu.Unlock() + m.mu.RLock() + defer m.mu.RUnlock() return m.stores[i] } @@ -1243,6 +1243,23 @@ func (m *multiTestContext) unreplicateRangeNonFatal(rangeID roachpb.RangeID, des return err } +// waitForUnreplicated waits until no replica exists for the specified range +// on the dest store. +func (m *multiTestContext) waitForUnreplicated(rangeID roachpb.RangeID, dest int) error { + // Wait for the unreplications to complete on destination node. + return retry.ForDuration(testutils.DefaultSucceedsSoonDuration, func() error { + _, err := m.stores[dest].GetReplica(rangeID) + switch err.(type) { + case nil: + return fmt.Errorf("replica still exists on dest %d", dest) + case *roachpb.RangeNotFoundError: + return nil + default: + return err + } + }) +} + // readIntFromEngines reads the current integer value at the given key // from all configured engines, filling in zeros when the value is not // found. Returns a slice of the same length as mtc.engines. @@ -1551,3 +1568,22 @@ func verifyRecomputedStats( } return nil } + +func waitForTombstone( + t *testing.T, eng engine.Reader, rangeID roachpb.RangeID, +) (tombstone roachpb.RaftTombstone) { + testutils.SucceedsSoon(t, func() error { + tombstoneKey := keys.RaftTombstoneKey(rangeID) + ok, err := engine.MVCCGetProto( + context.TODO(), eng, tombstoneKey, hlc.Timestamp{}, &tombstone, engine.MVCCGetOptions{}, + ) + if err != nil { + t.Fatalf("failed to read tombstone: %v", err) + } + if !ok { + return fmt.Errorf("tombstone not found for range %d", rangeID) + } + return nil + }) + return tombstone +} diff --git a/pkg/storage/replica.go b/pkg/storage/replica.go index 095a00256bd9..b87f928b5aee 100644 --- a/pkg/storage/replica.go +++ b/pkg/storage/replica.go @@ -242,6 +242,7 @@ type Replica struct { syncutil.RWMutex // The destroyed status of a replica indicating if it's alive, corrupt, // scheduled for destruction or has been GCed. + // destroyStatus should only be set while also holding the raftMu. destroyStatus // Is the range quiescent? Quiescent ranges are not Tick()'d and unquiesce // whenever a Raft operation is performed. @@ -345,7 +346,8 @@ type Replica struct { replicaID roachpb.ReplicaID // The minimum allowed ID for this replica. Initialized from // RaftTombstone.NextReplicaID. - minReplicaID roachpb.ReplicaID + tombstoneReplicaID roachpb.ReplicaID + // The ID of the leader replica within the Raft group. Used to determine // when the leadership changes. 
 	leaderID roachpb.ReplicaID
@@ -610,13 +612,26 @@ func (r *Replica) String() string {
 	return fmt.Sprintf("[n%d,s%d,r%s]", r.store.Ident.NodeID, r.store.Ident.StoreID, &r.rangeStr)
 }
 
-// ReplicaID returns the ID for the Replica.
+// ReplicaID returns the ID for the Replica. It may be zero if the replica does
+// not know its ID. Once a Replica has a non-zero ReplicaID it will never change.
 func (r *Replica) ReplicaID() roachpb.ReplicaID {
 	r.mu.RLock()
 	defer r.mu.RUnlock()
 	return r.mu.replicaID
 }
 
+// minReplicaID returns the minimum replica ID this replica could ever possibly
+// have. If this replica currently knows its replica ID (i.e. ReplicaID() is
+// non-zero) then it returns it. Otherwise it returns r.mu.tombstoneReplicaID.
+func (r *Replica) minReplicaID() roachpb.ReplicaID {
+	r.mu.RLock()
+	defer r.mu.RUnlock()
+	if r.mu.replicaID != 0 {
+		return r.mu.replicaID
+	}
+	return r.mu.tombstoneReplicaID
+}
+
 // cleanupFailedProposal cleans up after a proposal that has failed. It
 // clears any references to the proposal and releases associated quota.
 // It requires that both Replica.mu and Replica.raftMu are exclusively held.
@@ -1043,6 +1058,62 @@ func (r *Replica) requestCanProceed(rspan roachpb.RSpan, ts hlc.Timestamp) error
 	return mismatchErr
 }
 
+// isNewerThanSplit is a helper used in split(Pre|Post)Apply to
+// determine whether the Replica on the right hand side of the split must
+// have been removed from this store after the split.
+//
+// TODO(ajwerner): Ideally, if this store had ever learned that the replica
+// created by the split was removed, it would not forget that fact.
+// There exists one edge case where the store may learn that it should house
+// a replica of the same range with a higher replica ID and then forget.
+// If the first raft message this store ever receives for this range
+// contains a replica ID higher than the replica ID in the split trigger
+// then an in-memory replica at that higher replica ID will be created and
+// no tombstone at a lower replica ID will be written. If the server then
+// crashes it will forget that it had ever been the higher replica ID. The
+// server may then proceed to process the split and initialize a replica at
+// the replica ID implied by the split. This is potentially problematic as
+// the replica may have voted as this higher replica ID and, when it rediscovers
+// the higher replica ID, it will delete all of the state corresponding to the
+// older replica ID, including its hard state, which may have been synthesized
+// with votes as the newer replica ID. This case tends to be handled safely
+// in practice because the replica should only be receiving messages as the
+// newer replica ID after it has been added to the range. Prior to learner
+// replicas we would only add a store to a range after we've successfully
+// applied a preemptive snapshot. If the store were to split between the
+// preemptive snapshot and the addition then the addition would fail due to
+// the conditional put logic. If the store were to then enable learners then
+// we're still okay because we won't promote a learner unless we succeed in
+// sending a learner snapshot. If we fail to send the snapshot, the replica
+// never becomes a voter, so its votes don't matter and are safe to discard.
+//
+// Despite the safety due to the change replicas protocol explained above
+// it'd be good to know for sure that a replica ID for a range on a store
+// is always monotonically increasing, even across restarts.
+// +// See TestProcessSplitAfterRightHandSideHasBeenRemoved. +func (r *Replica) isNewerThanSplit(split *roachpb.SplitTrigger) bool { + r.mu.RLock() + defer r.mu.RUnlock() + return r.isNewerThanSplitRLocked(split) +} + +func (r *Replica) isNewerThanSplitRLocked(split *roachpb.SplitTrigger) bool { + rightDesc, hasRightDesc := split.RightDesc.GetReplicaDescriptor(r.StoreID()) + // If we have written a tombstone for this range then we know that the RHS + // must have already been removed at the split replica ID. + return r.mu.tombstoneReplicaID != 0 || + // If the first raft message we received for the RHS range was for a replica + // ID which is above the replica ID of the split then we would not have + // written a tombstone but we will have a replica ID that will exceed the + // split replica ID. + (r.mu.replicaID > rightDesc.ReplicaID && + // If we're catching up from a preemptive snapshot we won't be in the split. + // and we won't know whether our current replica ID indicates we've been + // removed. + hasRightDesc) +} + // checkBatchRequest verifies BatchRequest validity requirements. In particular, // the batch must have an assigned timestamp, and either all requests must be // read-only, or none. diff --git a/pkg/storage/replica_application_result.go b/pkg/storage/replica_application_result.go index aa3a0116a841..706a45057305 100644 --- a/pkg/storage/replica_application_result.go +++ b/pkg/storage/replica_application_result.go @@ -293,22 +293,43 @@ func (r *Replica) handleComputeChecksumResult(ctx context.Context, cc *storagepb r.computeChecksumPostApply(ctx, *cc) } -func (r *Replica) handleChangeReplicasResult(ctx context.Context, chng *storagepb.ChangeReplicas) { - storeID := r.store.StoreID() - var found bool - for _, rDesc := range chng.Replicas() { - if rDesc.StoreID == storeID { - found = true - break - } +func (r *Replica) handleChangeReplicasResult( + ctx context.Context, chng *storagepb.ChangeReplicas, +) (changeRemovedReplica bool) { + // If this command removes us then we would have set the destroy status + // to destroyReasonRemovalPending which we detect here. + // Note that a replica's destroy status is only ever updated under the + // raftMu and we validated that the replica was not RemovingOrRemoved + // before processing this raft ready. + if ds, _ := r.IsDestroyed(); ds != destroyReasonRemovalPending { + return false // changeRemovedReplica + } + + // If this command removes us then we need to go through the process of + // removing our replica from the store. After this method returns, the code + // should roughly return all the way up to whoever called handleRaftReady + // and this Replica should never be heard from again. We can detect if this + // change removed us by inspecting the replica's destroyStatus. We check the + // destroy status before processing a raft ready so if we find ourselves with + // removal pending at this point then we know that this command must be + // responsible. + if log.V(1) { + log.Infof(ctx, "removing replica due to ChangeReplicasTrigger: %v", chng) } - if !found { - // This wants to run as late as possible, maximizing the chances - // that the other nodes have finished this command as well (since - // processing the removal from the queue looks up the Range at the - // lease holder, being too early here turns this into a no-op). - r.store.replicaGCQueue.AddAsync(ctx, r, replicaGCPriorityRemoved) + + // NB: postDestroyRaftMuLocked requires that the batch which removed the data + // be durably synced to disk, which we have. 
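To make the two-clause predicate above easier to follow, here is a minimal stand-alone sketch of the same decision with plain ints and bools. isNewerThanSplit here is a simplified model, not the real method, which takes the Replica lock and the SplitTrigger proto.

package main

import "fmt"

// isNewerThanSplit is a simplified model of Replica.isNewerThanSplitRLocked:
// it reports whether the RHS replica on this store must already have been
// removed at the replica ID the split would assign it.
//   - tombstoneReplicaID != 0: a tombstone was written, so the RHS was removed.
//   - replicaID > splitReplicaID && inRightDesc: the first message we saw for
//     the RHS carried a higher ID than the split assigns, so the split's
//     incarnation is stale.
func isNewerThanSplit(tombstoneReplicaID, replicaID, splitReplicaID int, inRightDesc bool) bool {
	return tombstoneReplicaID != 0 ||
		(replicaID > splitReplicaID && inRightDesc)
}

func main() {
	// Common case: no tombstone, IDs match the split, process the split normally.
	fmt.Println(isNewerThanSplit(0, 3, 3, true)) // false
	// A tombstone exists: the RHS was removed after the split was decided.
	fmt.Println(isNewerThanSplit(4, 0, 3, true)) // true
	// We already heard from the RHS at a higher replica ID than the split assigns.
	fmt.Println(isNewerThanSplit(0, 5, 3, true)) // true
	// Catching up from a preemptive snapshot: not in the RHS descriptor, so a
	// higher current ID tells us nothing about the split's incarnation.
	fmt.Println(isNewerThanSplit(0, 5, 3, false)) // false
}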
+ // See replicaAppBatch.ApplyToStateMachine(). + if err := r.postDestroyRaftMuLocked(ctx, r.GetMVCCStats()); err != nil { + log.Fatalf(ctx, "failed to run Replica postDestroy: %v", err) + } + + if err := r.store.removeInitializedReplicaRaftMuLocked(ctx, r, chng.Desc.NextReplicaID, RemoveOptions{ + DestroyData: false, // We already destroyed the data when the batch committed. + }); err != nil { + log.Fatalf(ctx, "failed to remove replica: %v", err) } + return true } func (r *Replica) handleRaftLogDeltaResult(ctx context.Context, delta int64) { diff --git a/pkg/storage/replica_application_state_machine.go b/pkg/storage/replica_application_state_machine.go index 83f3cfca75fc..b3f8ca578219 100644 --- a/pkg/storage/replica_application_state_machine.go +++ b/pkg/storage/replica_application_state_machine.go @@ -13,7 +13,6 @@ package storage import ( "context" "fmt" - "math" "time" "github.com/cockroachdb/cockroach/pkg/roachpb" @@ -376,6 +375,9 @@ type replicaAppBatch struct { // triggered a migration to the replica applied state key. If so, this // migration will be performed when the application batch is committed. migrateToAppliedStateKey bool + // changeRemovesReplica tracks whether the command in the batch (there must + // be only one) removes this replica from the range. + changeRemovesReplica bool // Statistics. entries int @@ -515,6 +517,32 @@ func (b *replicaAppBatch) stageWriteBatch(ctx context.Context, cmd *replicatedCm return nil } +// changeRemovesStore returns true if any of the removals in this change have storeID. +func changeRemovesStore( + desc *roachpb.RangeDescriptor, change *storagepb.ChangeReplicas, storeID roachpb.StoreID, +) (removesStore bool) { + curReplica, existsInDesc := desc.GetReplicaDescriptor(storeID) + // NB: if we're catching up from a preemptive snapshot then we won't + // exist in the current descriptor and we can't be removed. + if !existsInDesc { + return false + } + + // NB: We don't use change.Removed() because it will include replicas being + // transitioned to VOTER_OUTGOING. + + // In 19.1 and before we used DeprecatedUpdatedReplicas instead of providing + // a new range descriptor. Check first if this is 19.1 or earlier command which + // uses DeprecatedChangeType and DeprecatedReplica + if change.Desc == nil { + return change.DeprecatedChangeType == roachpb.REMOVE_REPLICA && change.DeprecatedReplica.ReplicaID == curReplica.ReplicaID + } + // In 19.2 and beyond we supply the new range descriptor in the change. + // We know we're removed if we do not appear in the new descriptor. + _, existsInChange := change.Desc.GetReplicaDescriptor(storeID) + return !existsInChange +} + // runPreApplyTriggers runs any triggers that must fire before a command is // applied. It may modify the command's ReplicatedEvalResult. func (b *replicaAppBatch) runPreApplyTriggers(ctx context.Context, cmd *replicatedCmd) error { @@ -554,27 +582,40 @@ func (b *replicaAppBatch) runPreApplyTriggers(ctx context.Context, cmd *replicat // cannot be constructed at evaluation time because it differs // on each replica (votes may have already been cast on the // uninitialized replica). Write this new hardstate to the batch too. - // See https://github.com/cockroachdb/cockroach/issues/20629 - splitPreApply(ctx, b.batch, res.Split.SplitTrigger) + // See https://github.com/cockroachdb/cockroach/issues/20629. + // + // Alternatively if we discover that the RHS has already been removed + // from this store, clean up its data. 
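A compact sketch of the changeRemovesStore decision described above, with toy stand-ins for the descriptor and trigger. rangeDesc and changeReplicas below are simplified models, not the real protos; the 19.1 fields are reduced to a bool and an int.

package main

import "fmt"

// Simplified stand-ins for the range descriptor and ChangeReplicas trigger.
type rangeDesc struct {
	replicasByStore map[int]int // storeID -> replicaID
}

type changeReplicas struct {
	newDesc           *rangeDesc // nil for 19.1-style triggers
	deprecatedRemoval bool       // 19.1: ChangeType == REMOVE_REPLICA
	deprecatedReplica int        // 19.1: replica ID being removed
}

// changeRemovesStore mirrors the shape of the helper added above: a store is
// removed either if the 19.1 fields name its current replica, or, for 19.2+
// triggers, if it no longer appears in the new descriptor.
func changeRemovesStore(cur *rangeDesc, chg *changeReplicas, storeID int) bool {
	curReplicaID, existsInCur := cur.replicasByStore[storeID]
	if !existsInCur {
		// Catching up from a preemptive snapshot: we are not in the current
		// descriptor, so this change cannot remove us.
		return false
	}
	if chg.newDesc == nil {
		return chg.deprecatedRemoval && chg.deprecatedReplica == curReplicaID
	}
	_, existsInNew := chg.newDesc.replicasByStore[storeID]
	return !existsInNew
}

func main() {
	cur := &rangeDesc{replicasByStore: map[int]int{1: 1, 2: 2, 3: 3}}
	// 19.2+ style: store 2 is absent from the new descriptor.
	newDesc := &rangeDesc{replicasByStore: map[int]int{1: 1, 3: 3}}
	fmt.Println(changeRemovesStore(cur, &changeReplicas{newDesc: newDesc}, 2)) // true
	// 19.1 style: the deprecated fields name store 2's current replica.
	fmt.Println(changeRemovesStore(cur, &changeReplicas{deprecatedRemoval: true, deprecatedReplica: 2}, 2)) // true
	fmt.Println(changeRemovesStore(cur, &changeReplicas{newDesc: newDesc}, 1)) // false
}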
+ splitPreApply(ctx, b.batch, res.Split.SplitTrigger, b.r) } if merge := res.Merge; merge != nil { // Merges require the subsumed range to be atomically deleted when the // merge transaction commits. + + // If our range currently has a non-zero replica ID then we know we're + // safe to commit this merge because of the invariants provided to us + // by the merge protocol. Namely if this committed we know that if the + // command committed then all of the replicas in the range descriptor + // are collocated when this command commits. If we do not have a non-zero + // replica ID then the logic in Stage should detect that and destroy our + // preemptive snapshot so we shouldn't ever get here. rhsRepl, err := b.r.store.GetReplica(merge.RightDesc.RangeID) if err != nil { return wrapWithNonDeterministicFailure(err, "unable to get replica for merge") } - // Use math.MaxInt32 as the nextReplicaID as an extra safeguard against creating - // new replicas of the RHS. This isn't required for correctness, since the merge - // protocol should guarantee that no new replicas of the RHS can ever be - // created, but it doesn't hurt to be careful. - const rangeIDLocalOnly = true + + // Use math.MaxInt32 (mergedTombstoneReplicaID) as the nextReplicaID as an + // extra safeguard against creating new replicas of the RHS. This isn't + // required for correctness, since the merge protocol should guarantee that + // no new replicas of the RHS can ever be created, but it doesn't hurt to + // be careful. + const clearRangeIDLocalOnly = true const mustClearRange = false if err := rhsRepl.preDestroyRaftMuLocked( - ctx, b.batch, b.batch, math.MaxInt32, rangeIDLocalOnly, mustClearRange, + ctx, b.batch, b.batch, mergedTombstoneReplicaID, clearRangeIDLocalOnly, mustClearRange, ); err != nil { - return wrapWithNonDeterministicFailure(err, "unable to destroy range before merge") + return wrapWithNonDeterministicFailure(err, "unable to destroy replica before merge") } } @@ -602,6 +643,44 @@ func (b *replicaAppBatch) runPreApplyTriggers(ctx context.Context, cmd *replicat } } + // Detect if this command will remove us from the range. + // If so we stage the removal of all of our range data into this batch. + // We'll complete the removal when it commits. Later logic detects the + // removal by inspecting the destroy status. + // + // NB: This is the last step in the preApply which durably writes to the + // replica state so that if it removes the replica it removes everything. + if change := res.ChangeReplicas; change != nil && + changeRemovesStore(b.state.Desc, change, b.r.store.StoreID()) { + // Delete all of the local data. We're going to delete this hard state too. + // In order for this to be safe we need code above this to promise that we're + // never going to write hard state in response to a message for a later + // replica (with a different replica ID) to this range state. + // Furthermore we mark the replica as destroyed so that new commands are not + // accepted. The replica will be destroyed in handleChangeReplicas. + // Note that we must be holding the raftMu here because we're in the + // midst of application. 
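The ordering that the new pre-apply removal path relies on (mark removal pending, stage the deletion in the same batch as the command, commit that batch with sync) can be summarized in a toy sketch. Everything below is illustrative; batch and applyRemovalCommand are stand-ins, not the engine or replica APIs.

package main

import "fmt"

// A toy write batch standing in for the engine batch used during application.
type batch struct {
	ops    []string
	synced bool
}

func (b *batch) stage(op string)  { b.ops = append(b.ops, op) }
func (b *batch) commit(sync bool) { b.synced = sync }

// applyRemovalCommand sketches the ordering used when a command removes this
// replica: the destroy status is set first (so no new commands are accepted),
// the data deletion is staged into the same batch as the command, and the
// batch is committed with sync=true so post-destroy cleanup can rely on the
// deletion being durable.
func applyRemovalCommand(b *batch, setRemovalPending func()) {
	setRemovalPending() // destroyStatus -> destroyReasonRemovalPending
	b.stage("delete replicated and unreplicated range state")
	b.stage("write tombstone at NextReplicaID")
	removesReplica := true
	b.commit(removesReplica /* sync */)
}

func main() {
	pending := false
	b := &batch{}
	applyRemovalCommand(b, func() { pending = true })
	fmt.Println(pending, b.synced, b.ops)
}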
+ + if !b.r.store.TestingKnobs().DisableEagerReplicaRemoval { + b.r.mu.Lock() + b.r.mu.destroyStatus.Set( + roachpb.NewRangeNotFoundError(b.r.RangeID, b.r.store.StoreID()), + destroyReasonRemovalPending) + b.r.mu.Unlock() + b.changeRemovesReplica = true + if err := b.r.preDestroyRaftMuLocked( + ctx, + b.batch, + b.batch, + change.Desc.NextReplicaID, + false, /* clearRangeIDLocalOnly */ + false, /* mustUseClearRange */ + ); err != nil { + return wrapWithNonDeterministicFailure(err, "unable to destroy replica before removal") + } + } + } + // Provide the command's corresponding logical operations to the Replica's // rangefeed. Only do so if the WriteBatch is non-nil, in which case the // rangefeed requires there to be a corresponding logical operation log or @@ -614,6 +693,7 @@ func (b *replicaAppBatch) runPreApplyTriggers(ctx context.Context, cmd *replicat } else if cmd.raftCmd.LogicalOpLog != nil { log.Fatalf(ctx, "non-nil logical op log with nil write batch: %v", cmd.raftCmd) } + return nil } @@ -671,17 +751,23 @@ func (b *replicaAppBatch) ApplyToStateMachine(ctx context.Context) error { r := b.r r.store.Clock().Update(b.maxTS) - // Add the replica applied state key to the write batch. - if err := b.addAppliedStateKeyToBatch(ctx); err != nil { - return err + // Add the replica applied state key to the write batch if this change + // doesn't remove us. + if !b.changeRemovesReplica { + if err := b.addAppliedStateKeyToBatch(ctx); err != nil { + return err + } } // Apply the write batch to RockDB. Entry application is done without // syncing to disk. The atomicity guarantees of the batch and the fact that // the applied state is stored in this batch, ensure that if the batch ends // up not being durably committed then the entries in this batch will be - // applied again upon startup. - const sync = false + // applied again upon startup. However, if we're removing the replica's data + // then we sync this batch as it is not safe to call postDestroyRaftMuLocked + // before ensuring that the replica's data has been synchronously removed. + // See handleChangeReplicasResult(). + sync := b.changeRemovesReplica if err := b.batch.Commit(sync); err != nil { return wrapWithNonDeterministicFailure(err, "unable to commit Raft entry batch") } @@ -862,7 +948,11 @@ func (sm *replicaStateMachine) ApplySideEffects( // before notifying a potentially waiting client. clearTrivialReplicatedEvalResultFields(cmd.replicatedResult()) if !cmd.IsTrivial() { - shouldAssert := sm.handleNonTrivialReplicatedEvalResult(ctx, *cmd.replicatedResult()) + shouldAssert, isRemoved := sm.handleNonTrivialReplicatedEvalResult(ctx, *cmd.replicatedResult()) + + if isRemoved { + return nil, apply.ErrRemoved + } // NB: Perform state assertion before acknowledging the client. // Some tests (TestRangeStatsInit) assumes that once the store has started // and the first range has a lease that there will not be a later hard-state. @@ -928,7 +1018,7 @@ func (sm *replicaStateMachine) ApplySideEffects( // to pass a replicatedResult that does not imply any side-effects. func (sm *replicaStateMachine) handleNonTrivialReplicatedEvalResult( ctx context.Context, rResult storagepb.ReplicatedEvalResult, -) (shouldAssert bool) { +) (shouldAssert, isRemoved bool) { // Assert that this replicatedResult implies at least one side-effect. 
if rResult.Equal(storagepb.ReplicatedEvalResult{}) { log.Fatalf(ctx, "zero-value ReplicatedEvalResult passed to handleNonTrivialReplicatedEvalResult") @@ -960,7 +1050,7 @@ func (sm *replicaStateMachine) handleNonTrivialReplicatedEvalResult( // we want to assert that these two states do not diverge. shouldAssert = !rResult.Equal(storagepb.ReplicatedEvalResult{}) if !shouldAssert { - return false + return false, false } if rResult.Split != nil { @@ -1000,7 +1090,7 @@ func (sm *replicaStateMachine) handleNonTrivialReplicatedEvalResult( } if rResult.ChangeReplicas != nil { - sm.r.handleChangeReplicasResult(ctx, rResult.ChangeReplicas) + isRemoved = sm.r.handleChangeReplicasResult(ctx, rResult.ChangeReplicas) rResult.ChangeReplicas = nil } @@ -1012,7 +1102,7 @@ func (sm *replicaStateMachine) handleNonTrivialReplicatedEvalResult( if !rResult.Equal(storagepb.ReplicatedEvalResult{}) { log.Fatalf(ctx, "unhandled field in ReplicatedEvalResult: %s", pretty.Diff(rResult, storagepb.ReplicatedEvalResult{})) } - return true + return true, isRemoved } func (sm *replicaStateMachine) maybeApplyConfChange(ctx context.Context, cmd *replicatedCmd) error { diff --git a/pkg/storage/replica_destroy.go b/pkg/storage/replica_destroy.go index 0fb60c307472..cd3515a5be60 100644 --- a/pkg/storage/replica_destroy.go +++ b/pkg/storage/replica_destroy.go @@ -12,6 +12,8 @@ package storage import ( "context" + "fmt" + "math" "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/roachpb" @@ -29,7 +31,10 @@ type DestroyReason int const ( // The replica is alive. destroyReasonAlive DestroyReason = iota - // The replica has been marked for GC, but hasn't been GCed yet. + // The replica is in the process of being removed but has not been removed + // yet. It exists to avoid races between two threads which may decide to + // destroy a replica (e.g. processing a ChangeReplicasTrigger removing the + // range and receiving a raft message with a higher replica ID). destroyReasonRemovalPending // The replica has been GCed. destroyReasonRemoved @@ -43,15 +48,15 @@ type destroyStatus struct { err error } +func (s destroyStatus) String() string { + return fmt.Sprintf("{%v %d}", s.err, s.reason) +} + func (s *destroyStatus) Set(err error, reason DestroyReason) { s.err = err s.reason = reason } -func (s *destroyStatus) Reset() { - s.Set(nil, destroyReasonAlive) -} - // IsAlive returns true when a replica is alive. func (s destroyStatus) IsAlive() bool { return s.reason == destroyReasonAlive @@ -62,16 +67,30 @@ func (s destroyStatus) Removed() bool { return s.reason == destroyReasonRemoved } +// RemovingOrRemoved returns whether the replica is removed or in the process of +// being removed. +func (s destroyStatus) RemovingOrRemoved() bool { + return s.reason == destroyReasonRemovalPending || s.reason == destroyReasonRemoved +} + +// mergedTombstoneReplicaID is the replica ID written into the tombstone +// for replicas which are part of a range which is known to have been merged. +// This value should prevent any messages from stale replicas of that range from +// ever resurrecting merged replicas. Whenever merging or subsuming a replica we +// know new replicas can never be created so this value is used even if we +// don't know the current replica ID. 
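With Reset() gone and destroyReasonRemovalPending added, the destroy status effectively only moves forward. The sketch below models that lifecycle with a simplified enum; the monotonicity check in set is an illustration of the intended invariant, not something the real Set method enforces.

package main

import "fmt"

// Simplified model of the destroy status lifecycle after this change: a
// replica only ever moves forward (alive -> removal pending -> removed);
// there is no Reset back to alive.
type destroyReason int

const (
	destroyReasonAlive destroyReason = iota
	destroyReasonRemovalPending
	destroyReasonRemoved
)

type destroyStatus struct{ reason destroyReason }

func (s *destroyStatus) set(r destroyReason) error {
	if r < s.reason {
		return fmt.Errorf("destroy status cannot move backwards: %d -> %d", s.reason, r)
	}
	s.reason = r
	return nil
}

func (s destroyStatus) removingOrRemoved() bool {
	return s.reason >= destroyReasonRemovalPending
}

func main() {
	var s destroyStatus
	fmt.Println(s.removingOrRemoved()) // false
	_ = s.set(destroyReasonRemovalPending)
	fmt.Println(s.removingOrRemoved())     // true: raft processing stops here
	fmt.Println(s.set(destroyReasonAlive)) // error: no resurrection
}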
+const mergedTombstoneReplicaID roachpb.ReplicaID = math.MaxInt32 + func (r *Replica) preDestroyRaftMuLocked( ctx context.Context, reader engine.Reader, writer engine.Writer, nextReplicaID roachpb.ReplicaID, - rangeIDLocalOnly bool, - mustClearRange bool, + clearRangeIDLocalOnly bool, + mustUseClearRange bool, ) error { desc := r.Desc() - err := clearRangeData(desc, reader, writer, rangeIDLocalOnly, mustClearRange) + err := clearRangeData(desc, reader, writer, clearRangeIDLocalOnly, mustUseClearRange) if err != nil { return err } @@ -89,15 +108,17 @@ func (r *Replica) postDestroyRaftMuLocked(ctx context.Context, ms enginepb.MVCCS // // TODO(benesch): we would ideally atomically suggest the compaction with // the deletion of the data itself. - desc := r.Desc() - r.store.compactor.Suggest(ctx, storagepb.SuggestedCompaction{ - StartKey: roachpb.Key(desc.StartKey), - EndKey: roachpb.Key(desc.EndKey), - Compaction: storagepb.Compaction{ - Bytes: ms.Total(), - SuggestedAtNanos: timeutil.Now().UnixNano(), - }, - }) + if ms != (enginepb.MVCCStats{}) { + desc := r.Desc() + r.store.compactor.Suggest(ctx, storagepb.SuggestedCompaction{ + StartKey: roachpb.Key(desc.StartKey), + EndKey: roachpb.Key(desc.EndKey), + Compaction: storagepb.Compaction{ + Bytes: ms.Total(), + SuggestedAtNanos: timeutil.Now().UnixNano(), + }, + }) + } // NB: we need the nil check below because it's possible that we're GC'ing a // Replica without a replicaID, in which case it does not have a sideloaded @@ -115,21 +136,22 @@ func (r *Replica) postDestroyRaftMuLocked(ctx context.Context, ms enginepb.MVCCS } // destroyRaftMuLocked deletes data associated with a replica, leaving a -// tombstone. +// tombstone. The Replica may not be initialized in which case only the +// range ID local data is removed. 
func (r *Replica) destroyRaftMuLocked(ctx context.Context, nextReplicaID roachpb.ReplicaID) error { startTime := timeutil.Now() ms := r.GetMVCCStats() - batch := r.Engine().NewWriteOnlyBatch() defer batch.Close() + clearRangeIDLocalOnly := !r.IsInitialized() if err := r.preDestroyRaftMuLocked( ctx, r.Engine(), batch, nextReplicaID, - false, /* rangeIDLocalOnly */ - false, /* mustClearRange */ + clearRangeIDLocalOnly, + false, /* mustUseClearRange */ ); err != nil { return err } @@ -148,12 +170,18 @@ func (r *Replica) destroyRaftMuLocked(ctx context.Context, nextReplicaID roachpb if err := r.postDestroyRaftMuLocked(ctx, ms); err != nil { return err } - - log.Infof(ctx, "removed %d (%d+%d) keys in %0.0fms [clear=%0.0fms commit=%0.0fms]", - ms.KeyCount+ms.SysCount, ms.KeyCount, ms.SysCount, - commitTime.Sub(startTime).Seconds()*1000, - preTime.Sub(startTime).Seconds()*1000, - commitTime.Sub(preTime).Seconds()*1000) + if r.IsInitialized() { + log.Infof(ctx, "removed %d (%d+%d) keys in %0.0fms [clear=%0.0fms commit=%0.0fms]", + ms.KeyCount+ms.SysCount, ms.KeyCount, ms.SysCount, + commitTime.Sub(startTime).Seconds()*1000, + preTime.Sub(startTime).Seconds()*1000, + commitTime.Sub(preTime).Seconds()*1000) + } else { + log.Infof(ctx, "removed uninitialized range in %0.0fms [clear=%0.0fms commit=%0.0fms]", + commitTime.Sub(startTime).Seconds()*1000, + preTime.Sub(startTime).Seconds()*1000, + commitTime.Sub(preTime).Seconds()*1000) + } return nil } @@ -188,8 +216,8 @@ func (r *Replica) setTombstoneKey( if nextReplicaID < externalNextReplicaID { nextReplicaID = externalNextReplicaID } - if nextReplicaID > r.mu.minReplicaID { - r.mu.minReplicaID = nextReplicaID + if nextReplicaID > r.mu.tombstoneReplicaID { + r.mu.tombstoneReplicaID = nextReplicaID } r.mu.Unlock() diff --git a/pkg/storage/replica_gc_queue.go b/pkg/storage/replica_gc_queue.go index 649e87d60eaa..81d5da274a77 100644 --- a/pkg/storage/replica_gc_queue.go +++ b/pkg/storage/replica_gc_queue.go @@ -12,7 +12,6 @@ package storage import ( "context" - "math" "time" "github.com/cockroachdb/cockroach/pkg/config" @@ -114,13 +113,13 @@ func newReplicaGCQueue(store *Store, db *client.DB, gossip *gossip.Gossip) *repl // in the past. func (rgcq *replicaGCQueue) shouldQueue( ctx context.Context, now hlc.Timestamp, repl *Replica, _ *config.SystemConfig, -) (bool, float64) { +) (shouldQ bool, prio float64) { + lastCheck, err := repl.GetLastReplicaGCTimestamp(ctx) if err != nil { log.Errorf(ctx, "could not read last replica GC timestamp: %+v", err) return false, 0 } - if _, currentMember := repl.Desc().GetReplicaDescriptor(repl.store.StoreID()); !currentMember { return true, replicaGCPriorityRemoved } @@ -215,10 +214,16 @@ func (rgcq *replicaGCQueue) process( } replyDesc := rs[0] + repl.mu.RLock() + replicaID := repl.mu.replicaID + ticks := repl.mu.ticks + repl.mu.RUnlock() + // Now check whether the replica is meant to still exist. // Maybe it was deleted "under us" by being moved. currentDesc, currentMember := replyDesc.GetReplicaDescriptor(repl.store.StoreID()) - if desc.RangeID == replyDesc.RangeID && currentMember { + sameRange := desc.RangeID == replyDesc.RangeID + if sameRange && currentMember { // This replica is a current member of the raft group. Set the last replica // GC check time to avoid re-processing for another check interval. 
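The GC queue's process step, after its meta2 lookup, boils down to a three-way decision. A simplified sketch follows; decideGC and the action strings are hypothetical, and details such as preemptive replicas and tick counts are omitted.

package main

import "fmt"

// gcAction is what the queue decides to do with a local replica after looking
// up the authoritative descriptor in meta2. Simplified from the logic above.
type gcAction string

const (
	actionSetLastCheckedAndKeep gcAction = "still a member: record GC check time"
	actionRemoveWithNextID      gcAction = "stale member: remove, tombstone at NextReplicaID"
	actionRemoveAsMerged        gcAction = "range gone (merged away): remove, tombstone at mergedTombstoneReplicaID"
)

func decideGC(sameRange, currentMember bool) gcAction {
	switch {
	case sameRange && currentMember:
		return actionSetLastCheckedAndKeep
	case sameRange:
		return actionRemoveWithNextID
	default:
		return actionRemoveAsMerged
	}
}

func main() {
	fmt.Println(decideGC(true, true))
	fmt.Println(decideGC(true, false))
	fmt.Println(decideGC(false, false))
}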
// @@ -230,15 +235,10 @@ func (rgcq *replicaGCQueue) process( if err := repl.setLastReplicaGCTimestamp(ctx, repl.store.Clock().Now()); err != nil { return err } - } else if desc.RangeID == replyDesc.RangeID { + } else if sameRange { // We are no longer a member of this range, but the range still exists. // Clean up our local data. - repl.mu.RLock() - replicaID := repl.mu.replicaID - ticks := repl.mu.ticks - repl.mu.RUnlock() - if replicaID == 0 { // This is a preemptive replica. GC'ing a preemptive replica is a // good idea if and only if the up-replication that it was a part of @@ -284,13 +284,18 @@ func (rgcq *replicaGCQueue) process( rgcq.metrics.RemoveReplicaCount.Inc(1) log.VEventf(ctx, 1, "destroying local data") + + nextReplicaID := replyDesc.NextReplicaID // Note that this seems racy - we didn't hold any locks between reading // the range descriptor above and deciding to remove the replica - but // we pass in the NextReplicaID to detect situations in which the // replica became "non-gc'able" in the meantime by checking (with raftMu // held throughout) whether the replicaID is still smaller than the - // NextReplicaID. - if err := repl.store.RemoveReplica(ctx, repl, replyDesc.NextReplicaID, RemoveOptions{ + // NextReplicaID. Given non-zero replica IDs don't change, this is only + // possible if we currently think we're processing a pre-emptive snapshot + // but discover in RemoveReplica that this range has since been added and + // knows that. + if err := repl.store.RemoveReplica(ctx, repl, nextReplicaID, RemoveOptions{ DestroyData: true, }); err != nil { return err @@ -328,10 +333,10 @@ func (rgcq *replicaGCQueue) process( } } - // A replica ID of MaxInt32 is written when we know a range to have been - // merged. See the Merge case of runPreApplyTriggers() for details. - const nextReplicaID = math.MaxInt32 - if err := repl.store.RemoveReplica(ctx, repl, nextReplicaID, RemoveOptions{ + // A tombstone is written with a value of mergedTombstoneReplicaID because + // we know the range to have been merged. See the Merge case of + // runPreApplyTriggers() for details. + if err := repl.store.RemoveReplica(ctx, repl, mergedTombstoneReplicaID, RemoveOptions{ DestroyData: true, }); err != nil { return err diff --git a/pkg/storage/replica_init.go b/pkg/storage/replica_init.go index b28a3a33591b..4007bbf11bf9 100644 --- a/pkg/storage/replica_init.go +++ b/pkg/storage/replica_init.go @@ -150,26 +150,24 @@ func (r *Replica) initRaftMuLockedReplicaMuLocked( } r.rangeStr.store(replicaID, r.mu.state.Desc) r.connectionClass.set(rpc.ConnectionClassForKey(desc.StartKey)) - if err := r.setReplicaIDRaftMuLockedMuLocked(replicaID); err != nil { - return err + if r.mu.replicaID == 0 { + if err := r.setReplicaIDRaftMuLockedMuLocked(ctx, replicaID); err != nil { + return err + } + } else if r.mu.replicaID != replicaID { + log.Fatalf(ctx, "attempting to initialize a replica which has ID %d with ID %d", + r.mu.replicaID, replicaID) } - r.assertStateLocked(ctx, r.store.Engine()) return nil } -func (r *Replica) setReplicaID(replicaID roachpb.ReplicaID) error { - r.raftMu.Lock() - defer r.raftMu.Unlock() - r.mu.Lock() - defer r.mu.Unlock() - return r.setReplicaIDRaftMuLockedMuLocked(replicaID) -} - -func (r *Replica) setReplicaIDRaftMuLockedMuLocked(replicaID roachpb.ReplicaID) error { - if r.mu.replicaID == replicaID { - // The common case: the replica ID is unchanged. 
- return nil +func (r *Replica) setReplicaIDRaftMuLockedMuLocked( + ctx context.Context, replicaID roachpb.ReplicaID, +) error { + if r.mu.replicaID != 0 { + log.Fatalf(ctx, "cannot set replica ID from anything other than 0, currently %d", + r.mu.replicaID) } if replicaID == 0 { // If the incoming message does not have a new replica ID it is a @@ -177,23 +175,17 @@ func (r *Replica) setReplicaIDRaftMuLockedMuLocked(replicaID roachpb.ReplicaID) // accepted. return nil } - if replicaID < r.mu.minReplicaID { + if replicaID < r.mu.tombstoneReplicaID { return &roachpb.RaftGroupDeletedError{} } if r.mu.replicaID > replicaID { return errors.Errorf("replicaID cannot move backwards from %d to %d", r.mu.replicaID, replicaID) } - - if r.mu.destroyStatus.reason == destroyReasonRemovalPending { - // An earlier incarnation of this replica was removed, but apparently it has been re-added - // now, so reset the status. - r.mu.destroyStatus.Reset() + if r.mu.destroyStatus.RemovingOrRemoved() { + // This replica has been marked for removal and we're trying to resurrect it. + log.Fatalf(ctx, "cannot resurect replica %d", r.mu.replicaID) } - // if r.mu.replicaID != 0 { - // // TODO(bdarnell): clean up previous raftGroup (update peers) - // } - // Initialize or update the sideloaded storage. If the sideloaded storage // already exists (which is iff the previous replicaID was non-zero), then // we have to move the contained files over (this corresponds to the case in @@ -220,24 +212,12 @@ func (r *Replica) setReplicaIDRaftMuLockedMuLocked(replicaID roachpb.ReplicaID) return errors.Wrap(err, "while initializing sideloaded storage") } - previousReplicaID := r.mu.replicaID r.mu.replicaID = replicaID - if replicaID >= r.mu.minReplicaID { - r.mu.minReplicaID = replicaID + 1 - } - // Reset the raft group to force its recreation on next usage. - r.mu.internalRaftGroup = nil - - // If there was a previous replica, repropose its pending commands under - // this new incarnation. - if previousReplicaID != 0 { - if log.V(1) { - log.Infof(r.AnnotateCtx(context.TODO()), "changed replica ID from %d to %d", - previousReplicaID, replicaID) - } - // repropose all pending commands under new replicaID. - r.refreshProposalsLocked(0, reasonReplicaIDChanged) + // Sanity check that we do not already have a raft group as we did not + // know our replica ID before this call. + if r.mu.internalRaftGroup != nil { + log.Fatalf(ctx, "somehow had an initialized raft group on a zero valued replica") } return nil @@ -270,8 +250,12 @@ func (r *Replica) maybeInitializeRaftGroup(ctx context.Context) { // If this replica hasn't initialized the Raft group, create it and // unquiesce and wake the leader to ensure the replica comes up to date. initialized := r.mu.internalRaftGroup != nil + // If this replica has been removed or is in the process of being removed + // then it'll never handle any raft events so there's no reason to initialize + // it now. + removed := !r.mu.destroyStatus.IsAlive() r.mu.RUnlock() - if initialized { + if initialized || removed { return } @@ -281,9 +265,11 @@ func (r *Replica) maybeInitializeRaftGroup(ctx context.Context) { r.mu.Lock() defer r.mu.Unlock() + // If we raced on checking the destroyStatus above that's fine as + // the below withRaftGroupLocked will no-op. 
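The combined effect of the rewritten setReplicaIDRaftMuLockedMuLocked and the new check in maybeInitializeRaftGroup is roughly: no raft group until the replica ID is known, and never for a replica that is being removed. A toy model of that rule follows; replica and maybeInitRaftGroup below are stand-ins, not the real types.

package main

import (
	"errors"
	"fmt"
)

// replica is a toy model of the invariant: a raft group is only created once
// the replica ID is known, and never for a replica that is being removed.
type replica struct {
	replicaID int
	removing  bool
	raftGroup *struct{} // nil until initialized
}

var errRemoved = errors.New("replica removed")

func (r *replica) maybeInitRaftGroup() error {
	if r.raftGroup != nil {
		return nil // already initialized
	}
	if r.removing {
		// Removed or removal-pending replicas never handle raft events, so
		// initializing a group for them would only resurrect stale state.
		return errRemoved
	}
	if r.replicaID == 0 {
		// Without an ID we cannot participate in raft; wait for the first
		// message (or the split/ChangeReplicas) that tells us who we are.
		return nil
	}
	r.raftGroup = &struct{}{}
	return nil
}

func main() {
	r := &replica{}
	fmt.Println(r.maybeInitRaftGroup(), r.raftGroup != nil) // <nil> false: no ID yet
	r.replicaID = 3
	fmt.Println(r.maybeInitRaftGroup(), r.raftGroup != nil) // <nil> true
	fmt.Println((&replica{replicaID: 3, removing: true}).maybeInitRaftGroup()) // replica removed
}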
if err := r.withRaftGroupLocked(true, func(raftGroup *raft.RawNode) (bool, error) { return true, nil - }); err != nil { + }); err != nil && err != errRemoved { log.VErrEventf(ctx, 1, "unable to initialize raft group: %s", err) } } diff --git a/pkg/storage/replica_learner_test.go b/pkg/storage/replica_learner_test.go index 6f2a5ff6825a..9e9072af9e6f 100644 --- a/pkg/storage/replica_learner_test.go +++ b/pkg/storage/replica_learner_test.go @@ -429,13 +429,13 @@ func TestReplicaGCQueueSeesLearnerOrJointConfig(t *testing.T) { require.Contains(t, tracing.FormatRecordedSpans(trace), msg) return tc.LookupRangeOrFatal(t, scratchStartKey) } - desc := checkNoGC() // Make sure it didn't collect the learner. require.NotEmpty(t, desc.Replicas().Learners()) // Now get the range into a joint config. tc.RemoveReplicasOrFatal(t, scratchStartKey, tc.Target(1)) // remove learner + ltk.withStopAfterJointConfig(func() { desc = tc.AddReplicasOrFatal(t, scratchStartKey, tc.Target(1)) require.Len(t, desc.Replicas().Filter(predIncoming), 1, desc) @@ -547,10 +547,14 @@ func TestLearnerAdminChangeReplicasRace(t *testing.T) { // Unblock the snapshot, and surprise AddReplicas. It should retry and error // that the descriptor has changed since the AdminChangeReplicas command - // started. + // started. Alternatively it may fail in sending the snapshot because of a + // "raft group deleted" error if the newly added learner attempts to send + // a raft message to another node after it has been removed and then destroys + // itself in response to a ReplicaTooOldError. close(blockSnapshotsCh) - if err := g.Wait(); !testutils.IsError(err, `descriptor changed`) { - t.Fatalf(`expected "descriptor changed" error got: %+v`, err) + const msgRE = `descriptor changed|raft group deleted` + if err := g.Wait(); !testutils.IsError(err, msgRE) { + t.Fatalf(`expected %q error got: %+v`, msgRE, err) } desc = tc.LookupRangeOrFatal(t, scratchStartKey) require.Len(t, desc.Replicas().Voters(), 1) diff --git a/pkg/storage/replica_raft.go b/pkg/storage/replica_raft.go index 52b577da569c..e5744c064f9a 100644 --- a/pkg/storage/replica_raft.go +++ b/pkg/storage/replica_raft.go @@ -379,6 +379,8 @@ func (r *Replica) hasPendingProposalsRLocked() bool { return r.numPendingProposalsRLocked() > 0 } +var errRemoved = errors.New("replica removed") + // stepRaftGroup calls Step on the replica's RawNode with the provided request's // message. Before doing so, it assures that the replica is unquiesced and ready // to handle the request. @@ -445,13 +447,11 @@ func (r *Replica) handleRaftReadyRaftMuLocked( var hasReady bool var rd raft.Ready r.mu.Lock() - lastIndex := r.mu.lastIndex // used for append below lastTerm := r.mu.lastTerm raftLogSize := r.mu.raftLogSize leaderID := r.mu.leaderID lastLeaderID := leaderID - err := r.withRaftGroupLocked(true, func(raftGroup *raft.RawNode) (bool, error) { if err := r.mu.proposalBuf.FlushLockedWithRaftGroup(raftGroup); err != nil { return false, err @@ -462,11 +462,13 @@ func (r *Replica) handleRaftReadyRaftMuLocked( return hasReady /* unquiesceAndWakeLeader */, nil }) r.mu.Unlock() - if err != nil { + if err == errRemoved { + // If we've been removed then just return. + return stats, "", nil + } else if err != nil { const expl = "while checking raft group for Ready" return stats, expl, errors.Wrap(err, expl) } - if !hasReady { // We must update the proposal quota even if we don't have a ready. 
// Consider the case when our quota is of size 1 and two out of three @@ -723,7 +725,6 @@ func (r *Replica) handleRaftReadyRaftMuLocked( // Might have gone negative if node was recently restarted. raftLogSize = 0 } - } // Update protected state - last index, last term, raft log size, and raft @@ -756,10 +757,18 @@ func (r *Replica) handleRaftReadyRaftMuLocked( applicationStart := timeutil.Now() if len(rd.CommittedEntries) > 0 { - if err := appTask.ApplyCommittedEntries(ctx); err != nil { + err := appTask.ApplyCommittedEntries(ctx) + stats.applyCommittedEntriesStats = sm.moveStats() + switch err { + case nil: + case apply.ErrRemoved: + // We know that our replica has been removed. We also know that we've + // stepped the state machine up to the point where it's been removed. + // TODO(ajwerner): decide if there's more to be done here. + return stats, "", err + default: return stats, err.(*nonDeterministicFailure).safeExpl, err } - stats.applyCommittedEntriesStats = sm.moveStats() // etcd raft occasionally adds a nil entry (our own commands are never // empty). This happens in two situations: When a new leader is elected, and @@ -789,11 +798,12 @@ func (r *Replica) handleRaftReadyRaftMuLocked( r.mu.Unlock() } - // TODO(bdarnell): need to check replica id and not Advance if it - // has changed. Or do we need more locking to guarantee that replica - // ID cannot change during handleRaftReady? + // NB: if we just processed a command which removed this replica from the + // raft group we will early return before this point. This, combined with + // the fact that we'll refuse to process messages intended for a higher + // replica ID ensures that our replica ID could not have changed. const expl = "during advance" - if err := r.withRaftGroup(true, func(raftGroup *raft.RawNode) (bool, error) { + err = r.withRaftGroup(true, func(raftGroup *raft.RawNode) (bool, error) { raftGroup.Advance(rd) // If the Raft group still has more to process then we immediately @@ -804,7 +814,8 @@ func (r *Replica) handleRaftReadyRaftMuLocked( r.store.enqueueRaftUpdateCheck(r.RangeID) } return true, nil - }); err != nil { + }) + if err != nil { return stats, expl, errors.Wrap(err, expl) } @@ -836,9 +847,14 @@ func splitMsgApps(msgs []raftpb.Message) (msgApps, otherMsgs []raftpb.Message) { return msgs[:splitIdx], msgs[splitIdx:] } -func fatalOnRaftReadyErr(ctx context.Context, expl string, err error) { - // Mimic the behavior in processRaft. - log.Fatalf(ctx, "%s: %+v", log.Safe(expl), err) // TODO(bdarnell) +// maybeFatalOnRaftReadyErr will fatal if err is neither nil nor +// apply.ErrRemoved. 
+func maybeFatalOnRaftReadyErr(ctx context.Context, expl string, err error) { + switch err { + case nil, apply.ErrRemoved: + default: + log.FatalfDepth(ctx, 1, "%s: %+v", log.Safe(expl), err) // TODO(bdarnell) + } } // tick the Raft group, returning true if the raft group exists and is @@ -1157,7 +1173,7 @@ func (r *Replica) sendRaftMessage(ctx context.Context, msg raftpb.Message) { r.mu.droppedMessages++ raftGroup.ReportUnreachable(msg.To) return true, nil - }); err != nil { + }); err != nil && err != errRemoved { log.Fatal(ctx, err) } } @@ -1200,7 +1216,7 @@ func (r *Replica) reportSnapshotStatus(ctx context.Context, to roachpb.ReplicaID if err := r.withRaftGroup(true, func(raftGroup *raft.RawNode) (bool, error) { raftGroup.ReportSnapshot(uint64(to), snapStatus) return true, nil - }); err != nil { + }); err != nil && err != errRemoved { log.Fatal(ctx, err) } } @@ -1322,13 +1338,15 @@ func (s pendingCmdSlice) Less(i, j int) bool { // varies. // // Requires that Replica.mu is held. +// +// If this Replica is in the process of being removed this method will return +// errRemoved. func (r *Replica) withRaftGroupLocked( mayCampaignOnWake bool, f func(r *raft.RawNode) (unquiesceAndWakeLeader bool, _ error), ) error { - if r.mu.destroyStatus.Removed() { - // Silently ignore all operations on destroyed replicas. We can't return an - // error here as all errors returned from this method are considered fatal. - return nil + if r.mu.destroyStatus.RemovingOrRemoved() { + // Callers know to detect errRemoved as non-fatal. + return errRemoved } if r.mu.replicaID == 0 { @@ -1378,6 +1396,9 @@ func (r *Replica) withRaftGroupLocked( // should not initiate an election while handling incoming raft // messages (which may include MsgVotes from an election in progress, // and this election would be disrupted if we started our own). +// +// If this Replica is in the process of being removed this method will return +// errRemoved. func (r *Replica) withRaftGroup( mayCampaignOnWake bool, f func(r *raft.RawNode) (unquiesceAndWakeLeader bool, _ error), ) error { @@ -1536,6 +1557,9 @@ func (r *Replica) maybeAcquireSnapshotMergeLock( // not be applied yet) and acquires the split or merge lock if // necessary (in addition to other preparation). It returns a function // which will release any lock acquired (or nil). +// +// After this method returns successfully the RHS of the split or merge +// is guaranteed to exist in the Store using GetReplica(). func (r *Replica) maybeAcquireSplitMergeLock( ctx context.Context, raftCmd storagepb.RaftCommand, ) (func(), error) { diff --git a/pkg/storage/replica_raftstorage.go b/pkg/storage/replica_raftstorage.go index 7132eae707e0..a3ab8c4650bb 100644 --- a/pkg/storage/replica_raftstorage.go +++ b/pkg/storage/replica_raftstorage.go @@ -698,7 +698,6 @@ func clearRangeData( } else { keyRanges = rditer.MakeAllKeyRanges(desc) } - var clearRangeFn func(engine.Reader, engine.Writer, engine.MVCCKey, engine.MVCCKey) error if mustClearRange { clearRangeFn = func(eng engine.Reader, writer engine.Writer, start, end engine.MVCCKey) error { @@ -894,8 +893,7 @@ func (r *Replica) applySnapshot( // problematic, as it would prevent this store from ever having a new replica // of the removed range. In this case, however, it's copacetic, as subsumed // ranges _can't_ have new replicas. 
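Both sentinels introduced here (the storage-internal errRemoved and the exported apply.ErrRemoved) are meant to be treated as benign by callers on the raft processing path. A minimal sketch of that caller-side pattern, with stand-in error values:

package main

import (
	"errors"
	"fmt"
)

// Two sentinel errors mirroring the pair used above: one internal to the
// storage package and one exported by the apply package. Stand-ins only.
var (
	errReplicaRemoved = errors.New("replica removed")
	errApplyRemoved   = errors.New("replica removed (apply)")
)

// checkRaftErr mirrors the shape of maybeFatalOnRaftReadyErr: removal
// sentinels are expected and swallowed, anything else is fatal.
func checkRaftErr(err error) (fatal bool) {
	switch err {
	case nil, errReplicaRemoved, errApplyRemoved:
		return false
	default:
		return true
	}
}

func main() {
	fmt.Println(checkRaftErr(nil))                      // false
	fmt.Println(checkRaftErr(errReplicaRemoved))        // false: replica was removed mid-ready
	fmt.Println(checkRaftErr(errors.New("disk error"))) // true: would log.Fatal in the real code
}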
- const subsumedNextReplicaID = math.MaxInt32 - if err := r.clearSubsumedReplicaDiskData(ctx, inSnap.SSSS, s.Desc, subsumedRepls, subsumedNextReplicaID); err != nil { + if err := r.clearSubsumedReplicaDiskData(ctx, inSnap.SSSS, s.Desc, subsumedRepls, mergedTombstoneReplicaID); err != nil { return err } stats.subsumedReplicas = timeutil.Now() @@ -915,7 +913,7 @@ func (r *Replica) applySnapshot( // has not yet been updated. Any errors past this point must therefore be // treated as fatal. - if err := r.clearSubsumedReplicaInMemoryData(ctx, subsumedRepls, subsumedNextReplicaID); err != nil { + if err := r.clearSubsumedReplicaInMemoryData(ctx, subsumedRepls, mergedTombstoneReplicaID); err != nil { log.Fatalf(ctx, "failed to clear in-memory data of subsumed replicas while applying snapshot: %+v", err) } @@ -1013,7 +1011,7 @@ func (r *Replica) clearSubsumedReplicaDiskData( r.store.Engine(), &subsumedReplSST, subsumedNextReplicaID, - true, /* rangeIDLocalOnly */ + true, /* clearRangeIDLocalOnly */ true, /* mustClearRange */ ); err != nil { subsumedReplSST.Close() @@ -1101,7 +1099,7 @@ func (r *Replica) clearSubsumedReplicaInMemoryData( // acquisition leaves the store in a consistent state, and access to the // replicas themselves is protected by their raftMus, which are held from // start to finish. - if err := r.store.removeReplicaImpl(ctx, sr, subsumedNextReplicaID, RemoveOptions{ + if err := r.store.removeInitializedReplicaRaftMuLocked(ctx, sr, subsumedNextReplicaID, RemoveOptions{ DestroyData: false, // data is already destroyed }); err != nil { return err diff --git a/pkg/storage/replica_rangefeed_test.go b/pkg/storage/replica_rangefeed_test.go index e5c0bcb7362e..7634874de2a2 100644 --- a/pkg/storage/replica_rangefeed_test.go +++ b/pkg/storage/replica_rangefeed_test.go @@ -600,16 +600,18 @@ func TestReplicaRangefeedRetryErrors(t *testing.T) { mtc.transport.Listen(partitionStore.Ident.StoreID, &unreliableRaftHandler{ rangeID: rangeID, RaftMessageHandler: partitionStore, - dropReq: func(req *storage.RaftMessageRequest) bool { - // Make sure that even going forward no MsgApp for what we just truncated can - // make it through. The Raft transport is asynchronous so this is necessary - // to make the test pass reliably. - // NB: the Index on the message is the log index that _precedes_ any of the - // entries in the MsgApp, so filter where msg.Index < index, not <= index. - return req.Message.Type == raftpb.MsgApp && req.Message.Index < index + dropFuncs: dropFuncs{ + dropReq: func(req *storage.RaftMessageRequest) bool { + // Make sure that even going forward no MsgApp for what we just truncated can + // make it through. The Raft transport is asynchronous so this is necessary + // to make the test pass reliably. + // NB: the Index on the message is the log index that _precedes_ any of the + // entries in the MsgApp, so filter where msg.Index < index, not <= index. + return req.Message.Type == raftpb.MsgApp && req.Message.Index < index + }, + dropHB: func(*storage.RaftHeartbeat) bool { return false }, + dropResp: func(*storage.RaftMessageResponse) bool { return false }, }, - dropHB: func(*storage.RaftHeartbeat) bool { return false }, - dropResp: func(*storage.RaftMessageResponse) bool { return false }, }) // Check the error. 
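With the drop callbacks now grouped into a dropFuncs struct, tests configure all three decisions in one literal, as in the hunks above. The sketch below mirrors that usage with toy message types; raftMessageRequest and friends are stand-ins for the storage types, while the field names dropReq/dropHB/dropResp match the diff.

package main

import "fmt"

// Toy stand-ins for the message types filtered by the test handler.
type raftMessageRequest struct {
	index    uint64
	isMsgApp bool
}
type raftHeartbeat struct{}
type raftMessageResponse struct{}

// dropFuncs mirrors the grouping introduced above: one struct carrying the
// three drop decisions so handlers can be configured in a single literal.
type dropFuncs struct {
	dropReq  func(*raftMessageRequest) bool
	dropHB   func(*raftHeartbeat) bool
	dropResp func(*raftMessageResponse) bool
}

func main() {
	truncatedIndex := uint64(42)
	d := dropFuncs{
		// Drop MsgApps that precede the truncated index so the follower is
		// forced to catch up via snapshot, as in the tests above.
		dropReq: func(req *raftMessageRequest) bool { return req.isMsgApp && req.index < truncatedIndex },
		// Let heartbeats and responses through.
		dropHB:   func(*raftHeartbeat) bool { return false },
		dropResp: func(*raftMessageResponse) bool { return false },
	}
	fmt.Println(d.dropReq(&raftMessageRequest{index: 10, isMsgApp: true}))  // true
	fmt.Println(d.dropReq(&raftMessageRequest{index: 100, isMsgApp: true})) // false
	fmt.Println(d.dropHB(&raftHeartbeat{}))                                 // false
}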
diff --git a/pkg/storage/replica_test.go b/pkg/storage/replica_test.go index 05490d143468..a187aa8bc467 100644 --- a/pkg/storage/replica_test.go +++ b/pkg/storage/replica_test.go @@ -6568,7 +6568,7 @@ func TestReplicaDestroy(t *testing.T) { func() { tc.repl.raftMu.Lock() defer tc.repl.raftMu.Unlock() - if err := tc.store.removeReplicaImpl(ctx, tc.repl, origDesc.NextReplicaID, RemoveOptions{ + if err := tc.store.removeInitializedReplicaRaftMuLocked(ctx, tc.repl, origDesc.NextReplicaID, RemoveOptions{ DestroyData: true, }); !testutils.IsError(err, expectedErr) { t.Fatalf("expected error %q but got %v", expectedErr, err) @@ -6579,7 +6579,7 @@ func TestReplicaDestroy(t *testing.T) { func() { tc.repl.raftMu.Lock() defer tc.repl.raftMu.Unlock() - if err := tc.store.removeReplicaImpl(ctx, tc.repl, repl.Desc().NextReplicaID, RemoveOptions{ + if err := tc.store.removeInitializedReplicaRaftMuLocked(ctx, tc.repl, repl.Desc().NextReplicaID, RemoveOptions{ DestroyData: true, }); err != nil { t.Fatal(err) @@ -6671,7 +6671,7 @@ func TestQuotaPoolAccessOnDestroyedReplica(t *testing.T) { func() { tc.repl.raftMu.Lock() defer tc.repl.raftMu.Unlock() - if err := tc.store.removeReplicaImpl(ctx, repl, repl.Desc().NextReplicaID, RemoveOptions{ + if err := tc.store.removeInitializedReplicaRaftMuLocked(ctx, repl, repl.Desc().NextReplicaID, RemoveOptions{ DestroyData: true, }); err != nil { t.Fatal(err) @@ -7278,117 +7278,6 @@ func TestSyncSnapshot(t *testing.T) { } } -// TestReplicaIDChangePending verifies that on a replica ID change, pending -// commands are re-proposed on the new raft group. -func TestReplicaIDChangePending(t *testing.T) { - defer leaktest.AfterTest(t)() - - tc := testContext{} - cfg := TestStoreConfig(nil) - // Disable ticks to avoid automatic reproposals after a timeout, which - // would pass this test. - cfg.RaftTickInterval = math.MaxInt32 - stopper := stop.NewStopper() - defer stopper.Stop(context.TODO()) - tc.StartWithStoreConfig(t, stopper, cfg) - repl := tc.repl - - // Stop the command from being proposed to the raft group and being removed. - proposedOnOld := make(chan struct{}, 1) - repl.mu.Lock() - repl.mu.proposalBuf.testing.submitProposalFilter = func(*ProposalData) (drop bool, _ error) { - select { - case proposedOnOld <- struct{}{}: - default: - } - return true, nil - } - lease := *repl.mu.state.Lease - repl.mu.Unlock() - - // Add a command to the pending list and wait for it to be proposed. - magicTS := tc.Clock().Now() - ba := roachpb.BatchRequest{} - ba.Timestamp = magicTS - ba.Add(&roachpb.PutRequest{ - RequestHeader: roachpb.RequestHeader{ - Key: roachpb.Key("a"), - }, - Value: roachpb.MakeValueFromBytes([]byte("val")), - }) - _, _, _, err := repl.evalAndPropose(context.Background(), lease, &ba, &allSpans, endCmds{}) - if err != nil { - t.Fatal(err) - } - <-proposedOnOld - - // Set the raft command handler so we can tell if the command has been - // re-proposed. - proposedOnNew := make(chan struct{}, 1) - repl.mu.Lock() - repl.mu.proposalBuf.testing.submitProposalFilter = func(p *ProposalData) (drop bool, _ error) { - if p.Request.Timestamp == magicTS { - select { - case proposedOnNew <- struct{}{}: - default: - } - } - return false, nil - } - repl.mu.Unlock() - - // Set the ReplicaID on the replica. 
- if err := repl.setReplicaID(2); err != nil { - t.Fatal(err) - } - - <-proposedOnNew -} - -func TestSetReplicaID(t *testing.T) { - defer leaktest.AfterTest(t)() - - tsc := TestStoreConfig(nil) - tc := testContext{} - stopper := stop.NewStopper() - defer stopper.Stop(context.TODO()) - tc.StartWithStoreConfig(t, stopper, tsc) - - repl := tc.repl - - testCases := []struct { - replicaID roachpb.ReplicaID - minReplicaID roachpb.ReplicaID - newReplicaID roachpb.ReplicaID - expectedMinReplicaID roachpb.ReplicaID - expectedErr string - }{ - {0, 0, 1, 2, ""}, - {0, 1, 1, 2, ""}, - {0, 2, 1, 2, "raft group deleted"}, - {1, 2, 1, 2, ""}, // not an error; replicaID == newReplicaID is checked first - {2, 0, 1, 0, "replicaID cannot move backwards"}, - } - for _, c := range testCases { - t.Run("", func(t *testing.T) { - repl.mu.Lock() - repl.mu.replicaID = c.replicaID - repl.mu.minReplicaID = c.minReplicaID - repl.mu.Unlock() - - err := repl.setReplicaID(c.newReplicaID) - repl.mu.Lock() - if repl.mu.minReplicaID != c.expectedMinReplicaID { - t.Errorf("expected minReplicaID=%d, but found %d", c.expectedMinReplicaID, repl.mu.minReplicaID) - } - repl.mu.Unlock() - if !testutils.IsError(err, c.expectedErr) { - t.Fatalf("expected %q, but found %v", c.expectedErr, err) - } - }) - } -} - func TestReplicaRetryRaftProposal(t *testing.T) { defer leaktest.AfterTest(t)() diff --git a/pkg/storage/store.go b/pkg/storage/store.go index 99fa19230a5a..e63879c23cc1 100644 --- a/pkg/storage/store.go +++ b/pkg/storage/store.go @@ -1142,7 +1142,7 @@ func (s *Store) IsStarted() bool { return atomic.LoadInt32(&s.started) == 1 } -// IterateIDPrefixKeys helps visit system keys that use RangeID prefixing ( such as +// IterateIDPrefixKeys helps visit system keys that use RangeID prefixing (such as // RaftHardStateKey, RaftTombstoneKey, and many others). Such keys could in principle exist at any // RangeID, and this helper efficiently discovers all the keys of the desired type (as specified by // the supplied `keyFn`) and, for each key-value pair discovered, unmarshals it into `msg` and then @@ -2088,10 +2088,72 @@ func (s *Store) AllocateRangeID(ctx context.Context) (roachpb.RangeID, error) { // splitPreApply is called when the raft command is applied. Any // changes to the given ReadWriter will be written atomically with the // split commit. -func splitPreApply(ctx context.Context, eng engine.ReadWriter, split roachpb.SplitTrigger) { +func splitPreApply( + ctx context.Context, eng engine.ReadWriter, split roachpb.SplitTrigger, r *Replica, +) { + // Check on the RHS, we need to ensure that it exists and has a minReplicaID + // less than or equal to the replica we're about to initialize. + // + // The right hand side of the split was already created (and its raftMu + // acquired) in Replica.acquireSplitLock. It must be present here. + rightRepl, err := r.store.GetReplica(split.RightDesc.RangeID) + if err != nil { + log.Fatalf(ctx, "unable to find RHS replica: %+v", err) + } + + // If the RHS is not in the split, sanity check that the LHS is currently + // catching up from a preemptive snapshot. + _, hasRightDesc := split.RightDesc.GetReplicaDescriptor(r.StoreID()) + if !hasRightDesc { + _, lhsExists := r.Desc().GetReplicaDescriptor(r.StoreID()) + if lhsExists { + log.Fatalf(ctx, "cannot process split on s%s which exists in LHS and not in RHS: %+v", + r.StoreID(), split) + } + } + + // Check to see if we know that the RHS has already been removed from this + // store at the replica ID implied by the split. 
+ if rightRepl.isNewerThanSplit(&split) { + // We're in the rare case where we know that the RHS has been removed + // and re-added with a higher replica ID. We know we've never processed a + // snapshot for the right range because up to this point it would overlap + // with the left and ranges cannot move rightwards. + // + // It is important to preserve the HardState because we might however have + // already voted at a higher term. In general this shouldn't happen because + // we add learners and then promote them only after we snapshot but we're + // going to be extra careful in case future versions of cockroach somehow + // promote replicas without ensuring that a snapshot has been received. + // + // Clear the user data the RHS would have inherited from the LHS due to the + // split and additionally clear all of the range ID local state that the + // split trigger writes into the RHS. + // + // Rather than specifically deleting around the data we want to preserve + // we read the HardState to preserve it, clear everything and write back + // the HardState and tombstone. + hs, err := rightRepl.raftMu.stateLoader.LoadHardState(ctx, eng) + if err != nil { + log.Fatalf(ctx, "failed to load hard state for removed rhs: %v", err) + } + const rangeIDLocalOnly = false + const mustUseClearRange = false + if err := clearRangeData(&split.RightDesc, eng, eng, rangeIDLocalOnly, mustUseClearRange); err != nil { + log.Fatalf(ctx, "failed to clear range data for removed rhs: %v", err) + } + if err := rightRepl.raftMu.stateLoader.SetHardState(ctx, eng, hs); err != nil { + log.Fatalf(ctx, "failed to set hard state with 0 commit index for removed rhs: %v", err) + } + if err := r.setTombstoneKey(ctx, eng, r.minReplicaID()); err != nil { + log.Fatalf(ctx, "failed to set tombstone for removed rhs: %v", err) + } + return + } + // Update the raft HardState with the new Commit value now that the // replica is initialized (combining it with existing or default - // Term and Vote). + // Term and Vote). This is the common case. rsl := stateloader.Make(split.RightDesc.RangeID) if err := rsl.SynthesizeRaftState(ctx, eng); err != nil { log.Fatal(ctx, err) @@ -2112,146 +2174,70 @@ func splitPostApply( if err != nil { log.Fatalf(ctx, "unable to find RHS replica: %+v", err) } - { - // Already holding raftMu, see above. - rightRng.mu.Lock() - // The right hand side of the split may have been removed and re-added - // in the meantime, and the replicaID in RightDesc may be stale. - // Consequently the call below may fail with a RaftGroupDeletedError. In - // general, this protects earlier incarnations of the replica that were - // since replicaGC'ed from reneging on promises made earlier (for - // example, once the HardState is removed, a replica could cast a - // different vote for the same term). - // - // It is safe to circumvent that in this case because the RHS must have - // remained uninitialized (the LHS blocks all user data, and since our - // Raft logs start at a nonzero index a snapshot must go through before - // any log entries are appended). This means the data in that range is - // just a HardState which we "logically" snapshot by assigning it data - // formerly located within the LHS. - // - // Note that if we ever have a way to replicaGC uninitialized replicas, - // the RHS may have been gc'ed and so the HardState would be gone. In - // that case, the requirement that the HardState remains would have been - // violated if the bypass below were used, which is why we place an - // assertion. 
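The "RHS is newer than the split" branch above follows a read, clear, rewrite order: load the HardState, clear everything the split would have seeded, then write the HardState back together with a tombstone. A toy sketch of that ordering over a plain map follows; kv and discardStaleRHS are illustrative stand-ins, not the engine API.

package main

import "fmt"

// kv is a toy key/value batch standing in for the engine batch that the split
// trigger writes into.
type kv map[string]string

// discardStaleRHS sketches the ordering used for a right-hand side that was
// already removed at a newer replica ID: read the HardState first, clear
// everything the split would have seeded, then write the HardState back along
// with a tombstone so the stale incarnation cannot vote or be recreated.
func discardStaleRHS(b kv, minReplicaID int) {
	hardState := b["rhs/hardstate"] // preserve: it may contain votes at a higher term
	for k := range b {
		delete(b, k) // clearRangeData: user data inherited from the LHS and range-ID local keys
	}
	b["rhs/hardstate"] = hardState
	b["rhs/tombstone"] = fmt.Sprintf("nextReplicaID=%d", minReplicaID)
}

func main() {
	b := kv{
		"rhs/hardstate": "term=7 vote=n3",
		"rhs/userdata":  "inherited from LHS",
		"rhs/raftlog":   "seeded by split trigger",
	}
	discardStaleRHS(b, 5)
	fmt.Println(b)
}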
- // - // See: - // https://github.com/cockroachdb/cockroach/issues/21146#issuecomment-365757329 - // - // TODO(tbg): this argument is flawed - it's possible for a tombstone - // to exist on the RHS: - // https://github.com/cockroachdb/cockroach/issues/40470 - // Morally speaking, this means that we should throw away the data we - // moved from the LHS to the RHS (depending on the tombstone). - // Realistically speaking it will probably be easier to create the RHS - // anyway, even though there's a tombstone and it may just get gc'ed - // again. Note that for extra flavor, we may not even know whether the - // RHS is currently supposed to exist or not, lending more weight to the - // second approach. - tombstoneKey := keys.RaftTombstoneKey(rightRng.RangeID) - var tombstone roachpb.RaftTombstone - if ok, err := engine.MVCCGetProto( - ctx, r.store.Engine(), tombstoneKey, hlc.Timestamp{}, &tombstone, engine.MVCCGetOptions{}, - ); err != nil { - log.Fatalf(ctx, "unable to load tombstone for RHS: %+v", err) - } else if ok { - log.Fatalf(ctx, "split trigger found right-hand side with tombstone %+v: %v", tombstone, rightRng) - } - rightDesc, ok := split.RightDesc.GetReplicaDescriptor(r.StoreID()) - if !ok { - // This is yet another special quirky case. The local store is not - // necessarily a member of the split; this can occur if this store - // wasn't a member at the time of the split, but is nevertheless - // catching up across the split. For example, add a learner, and - // while it is being caught up via a snapshot, remove the learner - // again, then execute a split, and re-add it. Upon being re-added - // the learner will likely catch up from where the snapshot left it, - // and it will see itself get removed, then we hit this branch when - // the split trigger is applied, and eventually there's a - // ChangeReplicas that re-adds the local store under a new - // replicaID. - // - // So our trigger will have a smaller replicaID for our RHS, which - // will blow up in initRaftMuLockedReplicaMuLocked. We still want - // to force the RHS to accept the descriptor, even though that - // rewinds the replicaID. To do that we want to change the existing - // replicaID, but we didn't find one -- zero is then the only reasonable - // choice. Note that this is also the replicaID a replica that is - // not reflected in its own descriptor will have, i.e. we're cooking - // up less of a special case here than you'd expect at first glance. - // - // Note that futzing with the replicaID is a high-risk operation as - // it is what the raft peer will believe itself to be identified by. - // Under no circumstances must we use a replicaID that belongs to - // someone else, or a byzantine situation will result. Zero is - // special-cased and will never init a raft group until the real - // ID is known from inbound raft traffic. - rightDesc.ReplicaID = 0 // for clarity only; it's already zero - } - if rightRng.mu.replicaID > rightDesc.ReplicaID { - rightRng.mu.replicaID = rightDesc.ReplicaID - } - // NB: the safety argument above implies that we don't have to worry - // about restoring the existing minReplicaID if it's nonzero. No - // promises have been made, so none need to be kept. So we clear this - // unconditionally, making sure that it doesn't block us from init'ing - // the RHS. - rightRng.mu.minReplicaID = 0 + // Already holding raftMu, see above. 
+ rightRng.mu.Lock() + + // If we know that the RHS has already been removed at this replica ID + // then we also know that its data has already been removed by the preApply + // so we continue to update the descriptor for the left hand side and + // return. + if rightRng.isNewerThanSplitRLocked(split) { + rightRng.mu.Unlock() + rightRng = nil + } else { + // Finish initialization of the RHS. err := rightRng.initRaftMuLockedReplicaMuLocked(&split.RightDesc, r.store.Clock(), 0) rightRng.mu.Unlock() if err != nil { log.Fatal(ctx, err) } - } - // Finish initialization of the RHS. - - // This initialMaxClosedValue is created here to ensure that follower reads - // do not regress following the split. After the split occurs there will be no - // information in the closedts subsystem about the newly minted RHS range from - // its leaseholder's store. Furthermore, the RHS will have a lease start time - // equal to that of the LHS which might be quite old. This means that - // timestamps which follow the least StartTime for the LHS part are below the - // current closed timestamp for the LHS would no longer be readable on the RHS - // after the split. It is critical that this call to maxClosed happen during - // the splitPostApply so that it refers to a LAI that is equal to the index at - // which this lease was applied. If it were to refer to a LAI after the split - // then the value of initialMaxClosed might be unsafe. - initialMaxClosed := r.maxClosed(ctx) - r.mu.Lock() - rightRng.mu.Lock() - // Copy the minLeaseProposedTS from the LHS. - rightRng.mu.minLeaseProposedTS = r.mu.minLeaseProposedTS - rightRng.mu.initialMaxClosed = initialMaxClosed - rightLease := *rightRng.mu.state.Lease - rightRng.mu.Unlock() - r.mu.Unlock() - - // We need to explicitly wake up the Raft group on the right-hand range or - // else the range could be underreplicated for an indefinite period of time. - // - // Specifically, suppose one of the replicas of the left-hand range never - // applies this split trigger, e.g., because it catches up via a snapshot that - // advances it past this split. That store won't create the right-hand replica - // until it receives a Raft message addressed to the right-hand range. But - // since new replicas start out quiesced, unless we explicitly awaken the - // Raft group, there might not be any Raft traffic for quite a while. - if err := rightRng.withRaftGroup(true, func(r *raft.RawNode) (unquiesceAndWakeLeader bool, _ error) { - return true, nil - }); err != nil { - log.Fatalf(ctx, "unable to create raft group for right-hand range in split: %+v", err) - } + // This initialMaxClosedValue is created here to ensure that follower reads + // do not regress following the split. After the split occurs there will be no + // information in the closedts subsystem about the newly minted RHS range from + // its leaseholder's store. Furthermore, the RHS will have a lease start time + // equal to that of the LHS which might be quite old. This means that + // timestamps which follow the least StartTime for the LHS part are below the + // current closed timestamp for the LHS would no longer be readable on the RHS + // after the split. It is critical that this call to maxClosed happen during + // the splitPostApply so that it refers to a LAI that is equal to the index at + // which this lease was applied. If it were to refer to a LAI after the split + // then the value of initialMaxClosed might be unsafe. 
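For orientation, the restructured splitPostApply now has two paths, and a nil rightRng must be tolerated by everything downstream (SplitRange and the queue additions). A schematic, non-functional outline of that control flow; all step strings are descriptive only.

package main

import "fmt"

// splitPostApplyOutline is a toy outline of the two splitPostApply paths after
// this change: a right-hand side that is newer than the split is dropped
// (rightRng = nil) and every later step tolerates that.
func splitPostApplyOutline(rhsNewerThanSplit bool) []string {
	var steps []string
	rhsPresent := !rhsNewerThanSplit
	if rhsPresent {
		steps = append(steps,
			"init RHS replica at the split's replica ID",
			"copy minLeaseProposedTS and initialMaxClosed from LHS",
			"wake RHS raft group and run leasePostApply",
		)
	}
	// SplitRange always runs: it shrinks the LHS descriptor and, only if the
	// RHS replica still exists, adds it to the store's replica map.
	steps = append(steps, "SplitRange(LHS, maybe-nil RHS, split)")
	if rhsPresent {
		steps = append(steps, "enqueue RHS in split/replicate queues")
	}
	return steps
}

func main() {
	fmt.Println(splitPostApplyOutline(false))
	fmt.Println(splitPostApplyOutline(true))
}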
+ initialMaxClosed := r.maxClosed(ctx) + r.mu.Lock() + rightRng.mu.Lock() + // Copy the minLeaseProposedTS from the LHS. + rightRng.mu.minLeaseProposedTS = r.mu.minLeaseProposedTS + rightRng.mu.initialMaxClosed = initialMaxClosed + rightLease := *rightRng.mu.state.Lease + rightRng.mu.Unlock() + r.mu.Unlock() - // Invoke the leasePostApply method to ensure we properly initialize - // the replica according to whether it holds the lease. This enables - // the txnWaitQueue. - rightRng.leasePostApply(ctx, rightLease, false /* permitJump */) + // We need to explicitly wake up the Raft group on the right-hand range or + // else the range could be underreplicated for an indefinite period of time. + // + // Specifically, suppose one of the replicas of the left-hand range never + // applies this split trigger, e.g., because it catches up via a snapshot that + // advances it past this split. That store won't create the right-hand replica + // until it receives a Raft message addressed to the right-hand range. But + // since new replicas start out quiesced, unless we explicitly awaken the + // Raft group, there might not be any Raft traffic for quite a while. + err = rightRng.withRaftGroup(true, func(r *raft.RawNode) (unquiesceAndWakeLeader bool, _ error) { + return true, nil + }) + if err != nil { + log.Fatalf(ctx, "unable to create raft group for right-hand range in split: %+v", err) + } + // Invoke the leasePostApply method to ensure we properly initialize + // the replica according to whether it holds the lease. This enables + // the txnWaitQueue. + rightRng.leasePostApply(ctx, rightLease, false /* permitJump */) + } // Add the RHS replica to the store. This step atomically updates // the EndKey of the LHS replica and also adds the RHS replica // to the store's replica map. - if err := r.store.SplitRange(ctx, r, rightRng, split.LeftDesc); err != nil { + if err := r.store.SplitRange(ctx, r, rightRng, split); err != nil { // Our in-memory state has diverged from the on-disk state. log.Fatalf(ctx, "%s: failed to update Store after split: %+v", r, err) } @@ -2265,22 +2251,24 @@ func splitPostApply( // might require the range to be split again. Enqueue both the left and right // ranges to speed up such splits. See #10160. r.store.splitQueue.MaybeAddAsync(ctx, r, now) - r.store.splitQueue.MaybeAddAsync(ctx, rightRng, now) - // If the range was not properly replicated before the split, the replicate // queue may not have picked it up (due to the need for a split). Enqueue // both the left and right ranges to speed up a potentially necessary // replication. See #7022 and #7800. r.store.replicateQueue.MaybeAddAsync(ctx, r, now) - r.store.replicateQueue.MaybeAddAsync(ctx, rightRng, now) - if len(split.RightDesc.Replicas().All()) == 1 { - // TODO(peter): In single-node clusters, we enqueue the right-hand side of - // the split (the new range) for Raft processing so that the corresponding - // Raft group is created. This shouldn't be necessary for correctness, but - // some tests rely on this (e.g. server.TestNodeStatusWritten). - r.store.enqueueRaftUpdateCheck(rightRng.RangeID) + if rightRng != nil { + r.store.splitQueue.MaybeAddAsync(ctx, rightRng, now) + r.store.replicateQueue.MaybeAddAsync(ctx, rightRng, now) + if len(split.RightDesc.Replicas().All()) == 1 { + // TODO(peter): In single-node clusters, we enqueue the right-hand side of + // the split (the new range) for Raft processing so that the corresponding + // Raft group is created. 
This shouldn't be necessary for correctness, but + // some tests rely on this (e.g. server.TestNodeStatusWritten). + r.store.enqueueRaftUpdateCheck(rightRng.RangeID) + } } + } // SplitRange shortens the original range to accommodate the new range. The new @@ -2288,13 +2276,14 @@ func splitPostApply( // and newRng.raftMu must be held. // // This is only called from the split trigger in the context of the execution -// of a Raft command. +// of a Raft command. Note that rightRepl will be nil if the replica described +// by rightDesc is known to have been removed. func (s *Store) SplitRange( - ctx context.Context, leftRepl, rightRepl *Replica, newLeftDesc roachpb.RangeDescriptor, + ctx context.Context, leftRepl, rightRepl *Replica, split *roachpb.SplitTrigger, ) error { + rightDesc := &split.RightDesc + newLeftDesc := &split.LeftDesc oldLeftDesc := leftRepl.Desc() - rightDesc := rightRepl.Desc() - if !bytes.Equal(oldLeftDesc.EndKey, rightDesc.EndKey) || bytes.Compare(oldLeftDesc.StartKey, rightDesc.StartKey) >= 0 { return errors.Errorf("left range is not splittable by right range: %+v, %+v", oldLeftDesc, rightDesc) @@ -2302,7 +2291,7 @@ func (s *Store) SplitRange( s.mu.Lock() defer s.mu.Unlock() - if exRng, ok := s.mu.uninitReplicas[rightDesc.RangeID]; ok { + if exRng, ok := s.mu.uninitReplicas[rightDesc.RangeID]; rightRepl != nil && ok { // If we have an uninitialized replica of the new range we require pointer // equivalence with rightRepl. See Store.splitTriggerPostApply(). if exRng != rightRepl { @@ -2315,7 +2304,7 @@ func (s *Store) SplitRange( s.replicaQueues.Delete(int64(rightDesc.RangeID)) } - leftRepl.setDesc(ctx, &newLeftDesc) + leftRepl.setDesc(ctx, newLeftDesc) // Clear the LHS txn wait queue, to redirect to the RHS if // appropriate. We do this after setDescWithoutProcessUpdate @@ -2336,22 +2325,27 @@ func (s *Store) SplitRange( // Clear the original range's request stats, since they include requests for // spans that are now owned by the new range. leftRepl.leaseholderStats.resetRequestCounts() - leftRepl.writeStats.splitRequestCounts(rightRepl.writeStats) - if err := s.addReplicaInternalLocked(rightRepl); err != nil { - return errors.Errorf("unable to add replica %v: %s", rightRepl, err) - } + if rightRepl == nil { + throwawayRightWriteStats := new(replicaStats) + leftRepl.writeStats.splitRequestCounts(throwawayRightWriteStats) + } else { + leftRepl.writeStats.splitRequestCounts(rightRepl.writeStats) + if err := s.addReplicaInternalLocked(rightRepl); err != nil { + return errors.Errorf("unable to add replica %v: %s", rightRepl, err) + } - // Update the replica's cached byte thresholds. This is a no-op if the system - // config is not available, in which case we rely on the next gossip update - // to perform the update. - if err := rightRepl.updateRangeInfo(rightRepl.Desc()); err != nil { - return err + // Update the replica's cached byte thresholds. This is a no-op if the system + // config is not available, in which case we rely on the next gossip update + // to perform the update. + if err := rightRepl.updateRangeInfo(rightRepl.Desc()); err != nil { + return err + } + // Add the range to metrics and maybe gossip on capacity change. + s.metrics.ReplicaCount.Inc(1) + s.maybeGossipOnCapacityChange(ctx, rangeAddEvent) } - // Add the range to metrics and maybe gossip on capacity change. 
- s.metrics.ReplicaCount.Inc(1) - s.maybeGossipOnCapacityChange(ctx, rangeAddEvent) return nil } @@ -2406,7 +2400,7 @@ func (s *Store) MergeRange( // Note that we were called (indirectly) from raft processing so we must // call removeReplicaImpl directly to avoid deadlocking on the right-hand // replica's raftMu. - if err := s.removeReplicaImpl(ctx, rightRepl, rightDesc.NextReplicaID, RemoveOptions{ + if err := s.removeInitializedReplicaRaftMuLocked(ctx, rightRepl, rightDesc.NextReplicaID, RemoveOptions{ DestroyData: false, // the replica was destroyed when the merge commit applied }); err != nil { return errors.Errorf("cannot remove range: %s", err) @@ -2562,25 +2556,48 @@ type RemoveOptions struct { // advanced to or beyond the NextReplicaID since the removal decision was made. // // If opts.DestroyReplica is false, replica.destroyRaftMuLocked is not called. +// +// The passed replica must be initialized. func (s *Store) RemoveReplica( ctx context.Context, rep *Replica, nextReplicaID roachpb.ReplicaID, opts RemoveOptions, ) error { rep.raftMu.Lock() defer rep.raftMu.Unlock() - return s.removeReplicaImpl(ctx, rep, nextReplicaID, opts) + return s.removeInitializedReplicaRaftMuLocked(ctx, rep, nextReplicaID, opts) +} + +// removeReplicaRaftMuLocked removes the passed replica. If the replica is +// initialized the RemoveOptions will be consulted. +func (s *Store) removeReplicaRaftMuLocked( + ctx context.Context, rep *Replica, nextReplicaID roachpb.ReplicaID, opts RemoveOptions, +) (err error) { + rep.raftMu.AssertHeld() + if !rep.IsInitialized() { + err = errors.Wrap(s.removeUninitializedReplicaRaftMuLocked(ctx, rep, nextReplicaID), + "failed to remove uninitialized replica") + } else { + err = errors.Wrap(s.removeInitializedReplicaRaftMuLocked(ctx, rep, nextReplicaID, opts), + "failed to remove replica") + } + return err } // removeReplicaImpl is the implementation of RemoveReplica, which is sometimes // called directly when the necessary lock is already held. It requires that // Replica.raftMu is held and that s.mu is not held. -func (s *Store) removeReplicaImpl( +func (s *Store) removeInitializedReplicaRaftMuLocked( ctx context.Context, rep *Replica, nextReplicaID roachpb.ReplicaID, opts RemoveOptions, ) error { rep.raftMu.AssertHeld() // We check both rep.mu.ReplicaID and rep.mu.state.Desc's replica ID because // they can differ in cases when a replica's ID is increased due to an // incoming raft message (see #14231 for background). + // TODO(ajwerner): reconsider some of this sanity checking. rep.mu.Lock() + if rep.mu.destroyStatus.Removed() { + rep.mu.Unlock() + return nil + } replicaID := rep.mu.replicaID if rep.mu.replicaID >= nextReplicaID { rep.mu.Unlock() @@ -2600,12 +2617,7 @@ func (s *Store) removeReplicaImpl( } if !rep.IsInitialized() { - // The split trigger relies on the fact that it can bypass the tombstone - // check for the RHS, but this is only true as long as we never delete - // its HardState. - // - // See the comment in splitPostApply for details. - log.Fatalf(ctx, "can not replicaGC uninitialized replicas") + log.Fatalf(ctx, "can not replicaGC uninitialized replicas in this method") } // During merges, the context might have the subsuming range, so we explicitly @@ -2666,7 +2678,73 @@ func (s *Store) removeReplicaImpl( // TODO(peter): Could release s.mu.Lock() here. 
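The new removeReplicaRaftMuLocked wrapper means raft-level callers no longer need to know whether the replica they hold is initialized before removing it; HandleRaftResponse uses it this way later in this patch. A minimal caller-side sketch, assuming the caller holds raftMu and has decided the replica is stale (the call site itself is hypothetical):

	// Hypothetical call site. The wrapper consults IsInitialized() and
	// dispatches to the initialized or uninitialized removal path.
	repl.raftMu.Lock()
	defer repl.raftMu.Unlock()
	if err := s.removeReplicaRaftMuLocked(ctx, repl, nextReplicaID, RemoveOptions{
		DestroyData: true, // clear on-disk state so a newer replica can be created
	}); err != nil {
		log.Fatalf(ctx, "failed to remove stale replica %v: %v", repl, err)
	}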
s.maybeGossipOnCapacityChange(ctx, rangeRemoveEvent) s.scanner.RemoveReplica(rep) + return nil +} + +func (s *Store) removeUninitializedReplicaRaftMuLocked( + ctx context.Context, rep *Replica, nextReplicaID roachpb.ReplicaID, +) error { + rep.raftMu.AssertHeld() + // Sanity check this removal. + rep.mu.RLock() + ds := rep.mu.destroyStatus + isInitialized := rep.isInitializedRLocked() + rep.mu.RUnlock() + // Somebody already removed this Replica. + if ds.Removed() { + return nil + } + if ds.reason != destroyReasonRemovalPending { + log.Fatalf(ctx, "cannot remove uninitialized replica which is not removal pending: %v", ds) + } + + // When we're in this state we should have already had our destroy status set + // so it shouldn't have been possible to process any raft messages or apply + // any snapshots. + if isInitialized { + log.Fatalf(ctx, "previously uninitialized replica became initialized before removal") + } + + // Proceed with the removal. + rep.readOnlyCmdMu.Lock() + rep.mu.Lock() + rep.cancelPendingCommandsLocked() + rep.mu.internalRaftGroup = nil + rep.mu.destroyStatus.Set(roachpb.NewRangeNotFoundError(rep.RangeID, rep.store.StoreID()), destroyReasonRemoved) + rep.mu.Unlock() + rep.readOnlyCmdMu.Unlock() + + if err := rep.destroyRaftMuLocked(ctx, nextReplicaID); err != nil { + log.Fatalf(ctx, "failed to remove uninitialized replica %v: %v", rep, err) + } + + s.mu.Lock() + defer s.mu.Unlock() + + // Sanity check, could be removed. + value, stillExists := s.mu.replicas.Load(int64(rep.RangeID)) + if !stillExists { + log.Fatalf(ctx, "uninitialized replica was removed in the meantime") + } + existing := (*Replica)(value) + if existing == rep { + log.Infof(ctx, "removing uninitialized replica %v", rep) + } else { + log.Fatalf(ctx, "uninitialized replica %v was unexpectedly replaced", existing) + } + + s.metrics.ReplicaCount.Dec(1) + + // Only an uninitialized replica can have a placeholder since, by + // definition, an initialized replica will be present in the + // replicasByKey map. While the replica will usually consume the + // placeholder itself, that isn't guaranteed and so this invocation + // here is crucial (i.e. don't remove it). + if s.removePlaceholderLocked(ctx, rep.RangeID) { + atomic.AddInt32(&s.counts.droppedPlaceholders, 1) + } + s.unlinkReplicaByRangeIDLocked(rep.RangeID) return nil } @@ -3504,10 +3582,8 @@ func (s *Store) processRaftSnapshotRequest( if err := r.stepRaftGroup(&snapHeader.RaftMessageRequest); err != nil { return roachpb.NewError(err) } - - if _, expl, err := r.handleRaftReadyRaftMuLocked(ctx, inSnap); err != nil { - fatalOnRaftReadyErr(ctx, expl, err) - } + _, expl, err := r.handleRaftReadyRaftMuLocked(ctx, inSnap) + maybeFatalOnRaftReadyErr(ctx, expl, err) removePlaceholder = false return nil }) @@ -3543,33 +3619,41 @@ func (s *Store) HandleRaftResponse(ctx context.Context, resp *RaftMessageRespons repl.raftMu.Lock() defer repl.raftMu.Unlock() repl.mu.Lock() - defer repl.mu.Unlock() // If the replica ID in the error does not match then we know // that the replica has been removed and re-added quickly. In // that case, we don't want to add it to the replicaGCQueue. - if tErr.ReplicaID != repl.mu.replicaID { - log.Infof(ctx, "replica too old response with old replica ID: %s", tErr.ReplicaID) + // If the replica is not alive then we also should ignore this error. + if tErr.ReplicaID != repl.mu.replicaID || + !repl.mu.destroyStatus.IsAlive() || + // Ignore if we want to test the replicaGC queue. 
+ s.TestingKnobs().DisableEagerReplicaRemoval { + repl.mu.Unlock() return nil } - // If the replica ID in the error does match, we know the replica - // will be removed and we can cancel any pending commands. This is - // sometimes necessary to unblock PushTxn operations that are - // necessary for the replica GC to succeed. - repl.cancelPendingCommandsLocked() - // The replica will be garbage collected soon (we are sure // since our replicaID is definitely too old), but in the meantime we // already want to bounce all traffic from it. Note that the replica - // could be re-added with a higher replicaID, in which this error is - // cleared in setReplicaIDRaftMuLockedMuLocked. - if repl.mu.destroyStatus.IsAlive() { - storeID := repl.store.StoreID() - repl.mu.destroyStatus.Set(roachpb.NewRangeNotFoundError(repl.RangeID, storeID), destroyReasonRemovalPending) + // could be re-added with a higher replicaID, but we want to clear the + // replica's data before that happens. + if log.V(1) { + log.Infof(ctx, "setting local replica to destroyed due to ReplicaTooOld error") } - s.replicaGCQueue.AddAsync(ctx, repl, replicaGCPriorityRemoved) + storeID := repl.store.StoreID() + // NB: This response may imply that this range has been added back to + // this store at a higher replica ID. We set RangeNotFound error despite + // the fact that this newer replica may have already been created + // because we don't have a more specific error. In general nobody should + // see it. + repl.mu.destroyStatus.Set(roachpb.NewRangeNotFoundError(repl.RangeID, storeID), + destroyReasonRemovalPending) + repl.mu.Unlock() + nextReplicaID := tErr.ReplicaID + 1 + return s.removeReplicaRaftMuLocked(ctx, repl, nextReplicaID, RemoveOptions{ + DestroyData: true, + }) case *roachpb.RaftGroupDeletedError: if replErr != nil { // RangeNotFoundErrors are expected here; nothing else is. @@ -3632,9 +3716,8 @@ func (s *Store) processRequestQueue(ctx context.Context, rangeID roachpb.RangeID // giving up the lock. Set lastRepl to nil, so we don't handle it // down below as well. lastRepl = nil - if _, expl, err := r.handleRaftReadyRaftMuLocked(ctx, noSnap); err != nil { - fatalOnRaftReadyErr(ctx, expl, err) - } + _, expl, err := r.handleRaftReadyRaftMuLocked(ctx, noSnap) + maybeFatalOnRaftReadyErr(ctx, expl, err) } return pErr }) @@ -3677,9 +3760,8 @@ func (s *Store) processRequestQueue(ctx context.Context, rangeID roachpb.RangeID // handleRaftReadyRaftMuLocked) since racing to handle Raft Ready won't // have any undesirable results. ctx = lastRepl.AnnotateCtx(ctx) - if _, expl, err := lastRepl.handleRaftReady(ctx, noSnap); err != nil { - fatalOnRaftReadyErr(ctx, expl, err) - } + _, expl, err := lastRepl.handleRaftReady(ctx, noSnap) + maybeFatalOnRaftReadyErr(ctx, expl, err) } } @@ -3693,9 +3775,7 @@ func (s *Store) processReady(ctx context.Context, rangeID roachpb.RangeID) { ctx = r.AnnotateCtx(ctx) start := timeutil.Now() stats, expl, err := r.handleRaftReady(ctx, noSnap) - if err != nil { - log.Fatalf(ctx, "%s: %+v", log.Safe(expl), err) // TODO(bdarnell) - } + maybeFatalOnRaftReadyErr(ctx, expl, err) elapsed := timeutil.Since(start) s.metrics.RaftWorkingDurationNanos.Inc(elapsed.Nanoseconds()) // Warn if Raft processing took too long. 
We use the same duration as we @@ -3937,7 +4017,16 @@ func (s *Store) getOrCreateReplica( replicaID roachpb.ReplicaID, creatingReplica *roachpb.ReplicaDescriptor, ) (_ *Replica, created bool, _ error) { + // We need a retry loop as the replica we find in the map may be in the + // process of being removed or may need to be removed. + r := retry.Start(retry.Options{ + InitialBackoff: time.Microsecond, + // Set the backoff up to only a small amount to wait for data that + // might need to be cleared. + MaxBackoff: 10 * time.Millisecond, + }) for { + r.Next() r, created, err := s.tryGetOrCreateReplica( ctx, rangeID, @@ -3966,32 +4055,42 @@ func (s *Store) tryGetOrCreateReplica( replicaID roachpb.ReplicaID, creatingReplica *roachpb.ReplicaDescriptor, ) (_ *Replica, created bool, _ error) { + // NB: All of the below closures assume that both the raftMu and mu are held + // for the passed Replica. + // The common case: look up an existing (initialized) replica. if value, ok := s.mu.replicas.Load(int64(rangeID)); ok { repl := (*Replica)(value) - repl.raftMu.Lock() // not unlocked + repl.raftMu.Lock() // not unlocked on success repl.mu.Lock() defer repl.mu.Unlock() - - var replTooOldErr error - if creatingReplica != nil { - // Drop messages that come from a node that we believe was once a member of - // the group but has been removed. - desc := repl.mu.state.Desc - _, found := desc.GetReplicaDescriptorByID(creatingReplica.ReplicaID) - // It's not a current member of the group. Is it from the past? - if !found && creatingReplica.ReplicaID < desc.NextReplicaID { - replTooOldErr = roachpb.NewReplicaTooOldError(creatingReplica.ReplicaID) - } + if err := tryGetOrCreateHandleFromReplicaTooOld(ctx, s, repl, creatingReplica); err != nil { + repl.raftMu.Unlock() + return nil, false, err + } + if repl.mu.destroyStatus.RemovingOrRemoved() { + repl.raftMu.Unlock() + return nil, false, errRetry + } + if err := tryGetOrCreateHandleToReplicaTooOld(ctx, s, repl, replicaID); err != nil { + repl.raftMu.Unlock() + return nil, false, err } var err error - if replTooOldErr != nil { - err = replTooOldErr - } else if ds := repl.mu.destroyStatus; ds.reason == destroyReasonRemoved { - err = errRetry - } else { - err = repl.setReplicaIDRaftMuLockedMuLocked(replicaID) + if repl.mu.replicaID == 0 { + // This message is telling us about our replica ID. + // This is a common case when dealing with preemptive snapshots. + err = repl.setReplicaIDRaftMuLockedMuLocked(repl.AnnotateCtx(ctx), replicaID) + } else if replicaID != 0 && repl.mu.replicaID > replicaID { + // The sender is behind and is sending to an old replica. + // We could silently drop this message but this way we'll inform the + // sender that they may no longer exist. + err = roachpb.NewRangeNotFoundError(rangeID, s.StoreID()) + } else if replicaID != 0 && repl.mu.replicaID != replicaID { + // This case should have been caught by handleToReplicaTooOld. + log.Fatalf(ctx, "intended replica id %d unexpectedly does not match the current replica %v", + replicaID, repl) } if err != nil { repl.raftMu.Unlock() @@ -4028,7 +4127,7 @@ func (s *Store) tryGetOrCreateReplica( // replica even outside of raft processing. Have to do this after grabbing // Store.mu to maintain lock ordering invariant. repl.mu.Lock() - repl.mu.minReplicaID = tombstone.NextReplicaID + repl.mu.tombstoneReplicaID = tombstone.NextReplicaID // Add the range to range map, but not replicasByKey since the range's start // key is unknown. 
The range will be added to replicasByKey later when a // snapshot is applied. After unlocking Store.mu above, another goroutine @@ -4073,6 +4172,49 @@ func (s *Store) tryGetOrCreateReplica( return repl, true, nil } +// TODO(ajwerner): The below helper functions are not closures inside of +// tryGetOrCreate because the escape analysis in go1.12 is not advanced +// enough to prevent them from escaping and moving to the heap. + +// Drop messages that come from a node that we believe was once a member of +// the group but has been removed. Assumes that repl.mu and repl.raftMu are both +// held. +func tryGetOrCreateHandleFromReplicaTooOld( + ctx context.Context, s *Store, repl *Replica, creatingReplica *roachpb.ReplicaDescriptor, +) error { + if creatingReplica == nil { + return nil + } + desc := repl.mu.state.Desc + _, found := desc.GetReplicaDescriptorByID(creatingReplica.ReplicaID) + // It's not a current member of the group. Is it from the past? + if !found && creatingReplica.ReplicaID < desc.NextReplicaID { + return roachpb.NewReplicaTooOldError(creatingReplica.ReplicaID) + } + return nil +} + +func tryGetOrCreateHandleToReplicaTooOld( + ctx context.Context, s *Store, repl *Replica, replicaID roachpb.ReplicaID, +) error { + if replicaID == 0 || repl.mu.replicaID == 0 || repl.mu.replicaID >= replicaID { + return nil + } + if log.V(1) { + log.Infof(ctx, "found message for replica ID %d which is newer than %v", replicaID, repl) + } + repl.mu.destroyStatus.Set(roachpb.NewRangeNotFoundError(repl.RangeID, repl.StoreID()), + destroyReasonRemovalPending) + repl.mu.Unlock() + defer repl.mu.Lock() + if err := s.removeReplicaRaftMuLocked(ctx, repl, replicaID, RemoveOptions{ + DestroyData: true, + }); err != nil { + log.Fatal(ctx, err) + } + return errRetry +} + func (s *Store) updateCapacityGauges() error { desc, err := s.Descriptor(false /* useCached */) if err != nil { diff --git a/pkg/storage/store_snapshot.go b/pkg/storage/store_snapshot.go index 3fb7ff8b74d5..7c8062fb5250 100644 --- a/pkg/storage/store_snapshot.go +++ b/pkg/storage/store_snapshot.go @@ -628,7 +628,9 @@ func (s *Store) canApplySnapshotLocked( existingRepl.raftMu.AssertHeld() existingRepl.mu.RLock() - existingIsInitialized := existingRepl.isInitializedRLocked() + existingDesc := existingRepl.mu.state.Desc + existingIsInitialized := existingDesc.IsInitialized() + existingDestroyStatus := existingRepl.mu.destroyStatus existingRepl.mu.RUnlock() if existingIsInitialized { @@ -637,15 +639,19 @@ func (s *Store) canApplySnapshotLocked( // in Replica.maybeAcquireSnapshotMergeLock for how this is // made safe. // - // NB: we expect the replica to know its replicaID at this point - // (i.e. !existingIsPreemptive), though perhaps it's possible - // that this isn't true if the leader initiates a Raft snapshot - // (that would provide a range descriptor with this replica in - // it) but this node reboots (temporarily forgetting its - // replicaID) before the snapshot arrives. + // NB: The snapshot must be intended for this replica as + // withReplicaForRequest ensures that requests with a non-zero replica + // id are passed to a replica with a matching id. Given this is not a + // preemptive snapshot we know that its id must be non-zero. return nil, nil } + // If we are not alive then we should not apply a snapshot as our removal + // is imminent. + if existingDestroyStatus.RemovingOrRemoved() { + return nil, existingDestroyStatus.err + } + // We have a key range [desc.StartKey,desc.EndKey) which we want to apply a // snapshot for. 
Is there a conflicting existing placeholder or an // overlapping range? @@ -670,7 +676,7 @@ func (s *Store) checkSnapshotOverlapLocked( // NB: this check seems redundant since placeholders are also represented in // replicasByKey (and thus returned in getOverlappingKeyRangeLocked). if exRng, ok := s.mu.replicaPlaceholders[desc.RangeID]; ok { - return errors.Errorf("%s: canApplySnapshotLocked: cannot add placeholder, have an existing placeholder %s", s, exRng) + return errors.Errorf("%s: canApplySnapshotLocked: cannot add placeholder, have an existing placeholder %s %v", s, exRng, snapHeader.RaftMessageRequest.FromReplica) } // TODO(benesch): consider discovering and GC'ing *all* overlapping ranges, @@ -736,43 +742,16 @@ func (s *Store) shouldAcceptSnapshotData( if snapHeader.IsPreemptive() { return crdberrors.AssertionFailedf(`expected a raft or learner snapshot`) } - - s.mu.Lock() - defer s.mu.Unlock() - - // TODO(tbg): see the comment on desc.Generation for what seems to be a much - // saner way to handle overlap via generational semantics. - desc := *snapHeader.State.Desc - - // First, check for an existing Replica. - if v, ok := s.mu.replicas.Load( - int64(desc.RangeID), - ); ok { - existingRepl := (*Replica)(v) - existingRepl.mu.RLock() - existingIsInitialized := existingRepl.isInitializedRLocked() - existingRepl.mu.RUnlock() - - if existingIsInitialized { - // Regular Raft snapshots can't be refused at this point, - // even if they widen the existing replica. See the comments - // in Replica.maybeAcquireSnapshotMergeLock for how this is - // made safe. - // - // NB: we expect the replica to know its replicaID at this point - // (i.e. !existingIsPreemptive), though perhaps it's possible - // that this isn't true if the leader initiates a Raft snapshot - // (that would provide a range descriptor with this replica in - // it) but this node reboots (temporarily forgetting its - // replicaID) before the snapshot arrives. + pErr := s.withReplicaForRequest(ctx, &snapHeader.RaftMessageRequest, + func(ctx context.Context, r *Replica) *roachpb.Error { + if !r.IsInitialized() { + s.mu.Lock() + defer s.mu.Unlock() + return roachpb.NewError(s.checkSnapshotOverlapLocked(ctx, snapHeader)) + } return nil - } - } - - // We have a key range [desc.StartKey,desc.EndKey) which we want to apply a - // snapshot for. Is there a conflicting existing placeholder or an - // overlapping range? - return s.checkSnapshotOverlapLocked(ctx, snapHeader) + }) + return pErr.GoError() } // receiveSnapshot receives an incoming snapshot via a pre-opened GRPC stream. diff --git a/pkg/storage/store_snapshot_preemptive.go b/pkg/storage/store_snapshot_preemptive.go index 9c8893a93980..67d351719eea 100644 --- a/pkg/storage/store_snapshot_preemptive.go +++ b/pkg/storage/store_snapshot_preemptive.go @@ -330,8 +330,8 @@ func (s *Store) processPreemptiveSnapshotRequest( // Raft has decided the snapshot shouldn't be applied we would be // writing the tombstone key incorrectly. r.mu.Lock() - if r.mu.state.Desc.NextReplicaID > r.mu.minReplicaID { - r.mu.minReplicaID = r.mu.state.Desc.NextReplicaID + if r.mu.state.Desc.NextReplicaID > r.mu.tombstoneReplicaID { + r.mu.tombstoneReplicaID = r.mu.state.Desc.NextReplicaID } r.mu.Unlock() } diff --git a/pkg/storage/store_test.go b/pkg/storage/store_test.go index 8829fd7a15cb..ba431889d1c3 100644 --- a/pkg/storage/store_test.go +++ b/pkg/storage/store_test.go @@ -569,8 +569,8 @@ func TestStoreAddRemoveRanges(t *testing.T) { // Try to remove range 1 again. 
if err := store.RemoveReplica(context.Background(), repl1, repl1.Desc().NextReplicaID, RemoveOptions{ DestroyData: true, - }); err == nil { - t.Fatal("expected error re-removing same range") + }); err != nil { + t.Fatalf("didn't expect error re-removing same range: %v", err) } // Try to add a range with previously-used (but now removed) ID. repl2Dup := createReplica(store, 1, roachpb.RKey("a"), roachpb.RKey("b")) @@ -712,11 +712,10 @@ func TestStoreRemoveReplicaDestroy(t *testing.T) { // Verify that removal of a replica marks it as destroyed so that future raft // commands on the Replica will silently be dropped. - if err := repl1.withRaftGroup(true, func(r *raft.RawNode) (bool, error) { + err = repl1.withRaftGroup(true, func(r *raft.RawNode) (bool, error) { return true, errors.Errorf("unexpectedly created a raft group") - }); err != nil { - t.Fatal(err) - } + }) + require.Equal(t, errRemoved, err) repl1.mu.Lock() expErr := roachpb.NewError(repl1.mu.destroyStatus.err) @@ -1340,21 +1339,24 @@ func splitTestRange(store *Store, key, splitKey roachpb.RKey, t *testing.T) *Rep require.NotNil(t, repl) rangeID, err := store.AllocateRangeID(ctx) require.NoError(t, err) - desc := roachpb.NewRangeDescriptor( + rhsDesc := roachpb.NewRangeDescriptor( rangeID, splitKey, repl.Desc().EndKey, repl.Desc().Replicas()) // Minimal amount of work to keep this deprecated machinery working: Write // some required Raft keys. cv := store.ClusterSettings().Version.Version().Version _, err = stateloader.WriteInitialState( - context.Background(), store.engine, enginepb.MVCCStats{}, *desc, roachpb.Lease{}, + context.Background(), store.engine, enginepb.MVCCStats{}, *rhsDesc, roachpb.Lease{}, hlc.Timestamp{}, cv, stateloader.TruncatedStateUnreplicated, ) require.NoError(t, err) - newRng, err := NewReplica(desc, store, 0) + newRng, err := NewReplica(rhsDesc, store, 0) require.NoError(t, err) newLeftDesc := *repl.Desc() newLeftDesc.EndKey = splitKey - err = store.SplitRange(repl.AnnotateCtx(context.TODO()), repl, newRng, newLeftDesc) + err = store.SplitRange(repl.AnnotateCtx(context.TODO()), repl, newRng, &roachpb.SplitTrigger{ + RightDesc: *rhsDesc, + LeftDesc: newLeftDesc, + }) require.NoError(t, err) return newRng } @@ -2953,104 +2955,6 @@ func TestStoreRemovePlaceholderOnRaftIgnored(t *testing.T) { }) } -// Test that we set proper tombstones for removed replicas and use the -// tombstone to reject attempts to create a replica with a lesser ID. -func TestRemovedReplicaTombstone(t *testing.T) { - defer leaktest.AfterTest(t)() - - const rangeID = 1 - creatingReplica := roachpb.ReplicaDescriptor{ - NodeID: 2, - StoreID: 2, - ReplicaID: 2, - } - - // All test cases assume that the starting replica ID is 1. This assumption - // is enforced by a check within the test logic. 
- testCases := []struct { - setReplicaID roachpb.ReplicaID // set the existing replica to this before removing it - descNextReplicaID roachpb.ReplicaID // the descriptor's NextReplicaID during replica removal - createReplicaID roachpb.ReplicaID // try creating a replica at this ID - expectCreated bool - }{ - {1, 2, 2, true}, - {1, 2, 1, false}, - {1, 2, 1, false}, - {1, 3, 1, false}, - {1, 3, 2, false}, - {1, 3, 3, true}, - {1, 99, 98, false}, - {1, 99, 99, true}, - {2, 2, 2, false}, - {2, 2, 3, true}, - {2, 2, 99, true}, - {98, 2, 98, false}, - {98, 2, 99, true}, - } - for _, c := range testCases { - t.Run("", func(t *testing.T) { - tc := testContext{} - stopper := stop.NewStopper() - ctx := context.TODO() - defer stopper.Stop(ctx) - tc.Start(t, stopper) - s := tc.store - - repl1, err := s.GetReplica(rangeID) - if err != nil { - t.Fatal(err) - } - repl1.mu.Lock() - if repl1.mu.replicaID != 1 { - repl1.mu.Unlock() - t.Fatalf("test precondition not met; expected ReplicaID=1, got %d", repl1.mu.replicaID) - } - repl1.mu.Unlock() - - // Try to trigger a race where the replica ID gets increased during the GC - // process by taking the store lock and inserting a short sleep to cause - // the goroutine to start running the setReplicaID call. - errChan := make(chan error) - - func() { - repl1.raftMu.Lock() - defer repl1.raftMu.Unlock() - s.mu.Lock() - defer s.mu.Unlock() - repl1.mu.Lock() - defer repl1.mu.Unlock() - - go func() { - errChan <- s.RemoveReplica(ctx, repl1, c.descNextReplicaID, RemoveOptions{DestroyData: true}) - }() - - time.Sleep(1 * time.Millisecond) - - if err := repl1.setReplicaIDRaftMuLockedMuLocked(c.setReplicaID); err != nil { - t.Fatal(err) - } - }() - - if err := <-errChan; testutils.IsError(err, "replica ID has changed") { - // We didn't trigger the race, so just return success. - return - } else if err != nil { - t.Fatal(err) - } - - _, created, err := s.getOrCreateReplica(ctx, rangeID, c.createReplicaID, &creatingReplica) - if created != c.expectCreated { - t.Errorf("expected s.getOrCreateReplica(%d, %d, %v).created=%v, got %v", - rangeID, c.createReplicaID, creatingReplica, c.expectCreated, created) - } - if !c.expectCreated && !testutils.IsError(err, "raft group deleted") { - t.Errorf("expected s.getOrCreateReplica(%d, %d, %v).err='raft group deleted', got %v", - rangeID, c.createReplicaID, creatingReplica, err) - } - }) - } -} - type fakeSnapshotStream struct { nextResp *SnapshotResponse nextErr error diff --git a/pkg/storage/testing_knobs.go b/pkg/storage/testing_knobs.go index 7ee8dccf6953..14d1f31a24db 100644 --- a/pkg/storage/testing_knobs.go +++ b/pkg/storage/testing_knobs.go @@ -139,6 +139,13 @@ type StoreTestingKnobs struct { // DisableRefreshReasonTicks disables refreshing pending commands // periodically. DisableRefreshReasonTicks bool + // DisableEagerReplicaRemoval prevents the Replica from destroying itself + // when it encounters a ChangeReplicasTrigger which would remove it or when + // a ReplicaTooOldError in a RaftMessageResponse would lead to removal. + // This option can lead to nasty cases during shutdown where a replica will + // spin attempting to acquire a split or merge lock on a RHS which will + // always fail and is generally not safe but is useful for testing. + DisableEagerReplicaRemoval bool // RefreshReasonTicksPeriod overrides the default period over which // pending commands are refreshed. The period is specified as a multiple // of Raft group ticks.
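For context on what the removed TestRemovedReplicaTombstone covered: the tombstone written at removal time still gates recreation of the range at an older replica ID, now via the tombstoneReplicaID field rather than minReplicaID. A hedged sketch of that gate (the real check lives in the replica creation path and may be shaped differently):

	// Sketch only: a message addressed to a replica ID below the recorded
	// tombstone must be rejected, which is what the removed test asserted via
	// the "raft group deleted" error. checkTombstone is a hypothetical helper.
	func checkTombstone(tombstoneNextReplicaID, msgReplicaID roachpb.ReplicaID) error {
		if msgReplicaID != 0 && msgReplicaID < tombstoneNextReplicaID {
			return &roachpb.RaftGroupDeletedError{}
		}
		return nil
	}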