From 264c71da19ff00b8f012b0ed7088bf8d03a89000 Mon Sep 17 00:00:00 2001
From: Andrew Werner
Date: Mon, 16 Sep 2019 18:01:23 -0400
Subject: [PATCH] storage: write a tombstone with math.MaxInt32 when
 processing a merge

Before this PR we would write such a tombstone when detecting that a
range had been merged via a snapshot or via the replica GC queue, but
curiously not when applying the merge itself.

Release Justification: Came across this oddity while working on
updating tests for #40751.

Release note: None
---
 pkg/storage/client_merge_test.go                 | 67 ++++++++++++-------
 .../replica_application_state_machine.go         |  7 +-
 pkg/storage/replica_gc_queue.go                  |  9 +--
 3 files changed, 51 insertions(+), 32 deletions(-)

diff --git a/pkg/storage/client_merge_test.go b/pkg/storage/client_merge_test.go
index f6073a23ec68..9917ada54c26 100644
--- a/pkg/storage/client_merge_test.go
+++ b/pkg/storage/client_merge_test.go
@@ -1628,6 +1628,9 @@ func TestStoreRangeMergeConcurrentRequests(t *testing.T) {
 // replica tombstone, if S4 received a slow Raft message for the now-GC'd
 // replica, it would incorrectly construct an uninitialized replica and panic.
 //
+// This test also ensures that the nodes which process the merge write a
+// tombstone which prevents the range from being resurrected by a raft message.
+//
 // This test's approach to simulating this sequence of events is based on
 // TestReplicaGCRace.
 func TestStoreReplicaGCAfterMerge(t *testing.T) {
@@ -1681,7 +1684,7 @@ func TestStoreReplicaGCAfterMerge(t *testing.T) {
 		t.Fatalf("expected %s to have a replica on %s", rhsDesc, store1)
 	}
 
-	transport0 := storage.NewRaftTransport(
+	transport := storage.NewRaftTransport(
 		log.AmbientContext{Tracer: mtc.storeConfig.Settings.Tracer},
 		cluster.MakeTestingClusterSettings(),
 		nodedialer.New(mtc.rpcContext, gossip.AddressResolver(mtc.gossips[0])),
@@ -1689,18 +1692,22 @@ func TestStoreReplicaGCAfterMerge(t *testing.T) {
 		mtc.transportStopper,
 	)
 	errChan := errorChannelTestHandler(make(chan *roachpb.Error, 1))
-	transport0.Listen(store0.StoreID(), errChan)
+	transport.Listen(store0.StoreID(), errChan)
+	transport.Listen(store1.StoreID(), errChan)
 
-	sendHeartbeat := func(toReplDesc roachpb.ReplicaDescriptor) {
+	sendHeartbeat := func(
+		rangeID roachpb.RangeID,
+		fromReplDesc, toReplDesc roachpb.ReplicaDescriptor,
+	) {
 		// Try several times, as the message may be dropped (see #18355).
 		for i := 0; i < 5; i++ {
-			if sent := transport0.SendAsync(&storage.RaftMessageRequest{
-				FromReplica: rhsReplDesc0,
+			if sent := transport.SendAsync(&storage.RaftMessageRequest{
+				FromReplica: fromReplDesc,
 				ToReplica:   toReplDesc,
 				Heartbeats: []storage.RaftHeartbeat{
 					{
-						RangeID:       rhsDesc.RangeID,
-						FromReplicaID: rhsReplDesc0.ReplicaID,
+						RangeID:       rangeID,
+						FromReplicaID: fromReplDesc.ReplicaID,
 						ToReplicaID:   toReplDesc.ReplicaID,
 						Commit:        42,
 					},
@@ -1722,34 +1729,46 @@ func TestStoreReplicaGCAfterMerge(t *testing.T) {
 		t.Fatal("did not get expected RaftGroupDeleted error")
 	}
 
-	// Send a heartbeat to the now-GC'd replica on store1. If the replica
+	// Send heartbeats to the now-GC'd replicas on the stores. If the replica
 	// tombstone was not written correctly when the replica was GC'd, this will
 	// cause a panic.
-	sendHeartbeat(rhsReplDesc1)
+	sendHeartbeat(rhsDesc.RangeID, rhsReplDesc0, rhsReplDesc1)
+	sendHeartbeat(rhsDesc.RangeID, rhsReplDesc1, rhsReplDesc0)
 
-	// Send a heartbeat to a fictional replica on store1 with a large replica ID.
+	// Send heartbeats to fictional replicas with a large replica ID.
 	// This tests an implementation detail: the replica tombstone that gets
-	// written in this case will use the maximum possible replica ID, so store1
-	// should ignore heartbeats for replicas of the range with _any_ replica ID.
-	sendHeartbeat(roachpb.ReplicaDescriptor{
+	// written in this case will use the maximum possible replica ID, so the
+	// stores should ignore heartbeats for replicas of the range with _any_
+	// replica ID.
+	sendHeartbeat(rhsDesc.RangeID, rhsReplDesc0, roachpb.ReplicaDescriptor{
 		NodeID:    store1.Ident.NodeID,
 		StoreID:   store1.Ident.StoreID,
 		ReplicaID: 123456,
 	})
+	sendHeartbeat(rhsDesc.RangeID, rhsReplDesc1, roachpb.ReplicaDescriptor{
+		NodeID:    store0.Ident.NodeID,
+		StoreID:   store0.Ident.StoreID,
+		ReplicaID: 123456,
+	})
+
 	// Be extra paranoid and verify the exact value of the replica tombstone.
-	var rhsTombstone1 roachpb.RaftTombstone
-	rhsTombstoneKey := keys.RaftTombstoneKey(rhsDesc.RangeID)
-	ok, err = engine.MVCCGetProto(ctx, store1.Engine(), rhsTombstoneKey, hlc.Timestamp{},
-		&rhsTombstone1, engine.MVCCGetOptions{})
-	if err != nil {
-		t.Fatal(err)
-	} else if !ok {
-		t.Fatalf("missing raft tombstone at key %s", rhsTombstoneKey)
-	}
-	if e, a := roachpb.ReplicaID(math.MaxInt32), rhsTombstone1.NextReplicaID; e != a {
-		t.Fatalf("expected next replica ID to be %d, but got %d", e, a)
+	checkTombstone := func(eng engine.Engine) {
+		var rhsTombstone roachpb.RaftTombstone
+		rhsTombstoneKey := keys.RaftTombstoneKey(rhsDesc.RangeID)
+		ok, err = engine.MVCCGetProto(ctx, eng, rhsTombstoneKey, hlc.Timestamp{},
+			&rhsTombstone, engine.MVCCGetOptions{})
+		if err != nil {
+			t.Fatal(err)
+		} else if !ok {
+			t.Fatalf("missing raft tombstone at key %s", rhsTombstoneKey)
+		}
+		if e, a := roachpb.ReplicaID(math.MaxInt32), rhsTombstone.NextReplicaID; e != a {
+			t.Fatalf("expected next replica ID to be %d, but got %d", e, a)
+		}
 	}
+	checkTombstone(store0.Engine())
+	checkTombstone(store1.Engine())
 }
 
 // TestStoreRangeMergeAddReplicaRace verifies that when an add replica request
diff --git a/pkg/storage/replica_application_state_machine.go b/pkg/storage/replica_application_state_machine.go
index ac8432388c09..83f3cfca75fc 100644
--- a/pkg/storage/replica_application_state_machine.go
+++ b/pkg/storage/replica_application_state_machine.go
@@ -13,6 +13,7 @@ package storage
 import (
 	"context"
 	"fmt"
+	"math"
 	"time"
 
 	"github.com/cockroachdb/cockroach/pkg/roachpb"
@@ -564,10 +565,14 @@ func (b *replicaAppBatch) runPreApplyTriggers(ctx context.Context, cmd *replicat
 		if err != nil {
 			return wrapWithNonDeterministicFailure(err, "unable to get replica for merge")
 		}
+		// Use math.MaxInt32 as the nextReplicaID as an extra safeguard against creating
+		// new replicas of the RHS. This isn't required for correctness, since the merge
+		// protocol should guarantee that no new replicas of the RHS can ever be
+		// created, but it doesn't hurt to be careful.
 		const rangeIDLocalOnly = true
 		const mustClearRange = false
 		if err := rhsRepl.preDestroyRaftMuLocked(
-			ctx, b.batch, b.batch, merge.RightDesc.NextReplicaID, rangeIDLocalOnly, mustClearRange,
+			ctx, b.batch, b.batch, math.MaxInt32, rangeIDLocalOnly, mustClearRange,
 		); err != nil {
 			return wrapWithNonDeterministicFailure(err, "unable to destroy range before merge")
 		}
diff --git a/pkg/storage/replica_gc_queue.go b/pkg/storage/replica_gc_queue.go
index 33d64df45dca..649e87d60eaa 100644
--- a/pkg/storage/replica_gc_queue.go
+++ b/pkg/storage/replica_gc_queue.go
@@ -328,13 +328,8 @@ func (rgcq *replicaGCQueue) process(
 			}
 		}
 
-		// We don't have the last NextReplicaID for the subsumed range, nor can we
-		// obtain it, but that's OK: we can just be conservative and use the maximum
-		// possible replica ID. store.RemoveReplica will write a tombstone using
-		// this maximum possible replica ID, which would normally be problematic, as
-		// it would prevent this store from ever having a new replica of the removed
-		// range. In this case, however, it's copacetic, as subsumed ranges _can't_
-		// have new replicas.
+		// A replica ID of MaxInt32 is written when we know a range to have been
+		// merged. See the Merge case of runPreApplyTriggers() for details.
 		const nextReplicaID = math.MaxInt32
 		if err := repl.store.RemoveReplica(ctx, repl, nextReplicaID, RemoveOptions{
 			DestroyData: true,