From c7f8c59c93c345ea9f93db088ee8e361094e0272 Mon Sep 17 00:00:00 2001
From: Tobias Schottdorf
Date: Fri, 30 Aug 2019 17:35:35 +0200
Subject: [PATCH] storage: make leaving joint config idempotent

ChangeReplicas (and AdminSplit, and AdminMerge) take a RangeDescriptor
that they use as the basis for a CPut to make sure the operations
mutating the range serialize. This is great for correctness but
generally unfortunate for usability, since on a mismatch the caller
usually wanted to do the thing they were trying to do anyway, using the
new descriptor. The fact that every split (replication change, merge)
basically needs a retry loop is a constant trickle of test flakes and
UX papercuts.

It has become more pressing to do something about this now that we
routinely use joint configs when atomic replication changes are
enabled. A joint configuration is transitioned out of opportunistically
whenever it is encountered, but this frequently causes a race in which
actor A finds a joint config and begins a transaction out of it, only
to be raced by actor B getting there first. The end result is that what
actor A wanted to achieve has been achieved, just by someone else, and
yet actor A receives a spurious error.

This commit fixes that behavior in the targeted case of wanting to
leave a joint configuration: actor A will now get a successful result.

Before this change,
`make stress PKG=./pkg/sql TESTS=TestShowRangesWithLocal` would fail
immediately when `kv.atomic_replication_changes.enabled` was true,
because the splits this test carries out would run into the joint
configuration changes of the initial upreplication and would race the
replicate queue to transition out of them, a race at least one split
would typically lose. This still happens, but it is no longer an error.

I do think it makes sense to use a similar strategy in general (fail
replication changes only if the *replicas* changed, allow all splits
except when the split key moves out of the current descriptor, etc.),
but in the process of coding this up I was reminded of all of the
problems relating to range merges, and I also found what I think is a
long-standing, fairly fatal bug, #40367, so I don't want to do anything
more until the release is out the door. But I'm basically convinced
that if we did it, it wouldn't introduce a new "bug", because any
replication change carried out that way is one a user could have
triggered just the same under the old checks.

Release note: None
---
 pkg/storage/replica_command.go | 39 +++++++++++++++++++++++++++++-----
 1 file changed, 34 insertions(+), 5 deletions(-)

diff --git a/pkg/storage/replica_command.go b/pkg/storage/replica_command.go
index c6b14b22a674..0e4d8a211798 100644
--- a/pkg/storage/replica_command.go
+++ b/pkg/storage/replica_command.go
@@ -1468,21 +1468,47 @@ func prepareChangeReplicasTrigger(
 func execChangeReplicasTxn(
 	ctx context.Context,
 	store *Store,
-	desc *roachpb.RangeDescriptor,
+	referenceDesc *roachpb.RangeDescriptor,
 	reason storagepb.RangeLogEventReason,
 	details string,
 	chgs []internalReplicationChange,
 ) (*roachpb.RangeDescriptor, error) {
 	var returnDesc *roachpb.RangeDescriptor
 
-	descKey := keys.RangeDescriptorKey(desc.StartKey)
+	descKey := keys.RangeDescriptorKey(referenceDesc.StartKey)
+
+	check := func(kvDesc *roachpb.RangeDescriptor) bool {
+		if len(chgs) == 0 {
+			// If there are no changes, we're trying to leave a joint config,
+			// so that's all we care about. But since leaving a joint config
+			// is done opportunistically whenever one is encountered, this is
+			// more likely to race than other operations. So we verify literally
+			// nothing about the descriptor, but once we get the descriptor out
+			// from conditionalGetDescValueFromDB, we'll check if it's in a
+			// joint config and if not, noop.
+			return true
+		}
+		// Otherwise, check that the descriptors are equal.
+		//
+		// TODO(tbg): check that the replica sets are equal only. I was going to
+		// do that but then discovered #40367. Try again in the 20.1 cycle.
+		return checkDescsEqual(referenceDesc)(kvDesc)
+	}
+
 	if err := store.DB().Txn(ctx, func(ctx context.Context, txn *client.Txn) error {
 		log.Event(ctx, "attempting txn")
 		txn.SetDebugName(replicaChangeTxnName)
-		_, dbDescValue, err := conditionalGetDescValueFromDB(ctx, txn, desc.StartKey, checkDescsEqual(desc))
+		desc, dbDescValue, err := conditionalGetDescValueFromDB(ctx, txn, referenceDesc.StartKey, check)
 		if err != nil {
 			return err
 		}
+		if len(chgs) == 0 && !desc.Replicas().InAtomicReplicationChange() {
+			// Nothing to do. See comment in 'check' above for details.
+			returnDesc = desc
+			return nil
+		}
+		// Note that we are now using the descriptor from KV, not the one passed
+		// into this method.
 		crt, err := prepareChangeReplicasTrigger(store, desc, chgs)
 		if err != nil {
 			return err
@@ -1545,10 +1571,13 @@ func execChangeReplicasTxn(
 		return nil
 	}); err != nil {
 		log.Event(ctx, err.Error())
-		if msg, ok := maybeDescriptorChangedError(desc, err); ok {
+		// NB: desc may not be the descriptor we actually compared against, but
+		// either way this gives a good idea of what happened which is all it's
+		// supposed to do.
+		if msg, ok := maybeDescriptorChangedError(referenceDesc, err); ok {
 			err = &benignError{errors.New(msg)}
 		}
-		return nil, errors.Wrapf(err, "change replicas of r%d failed", desc.RangeID)
+		return nil, errors.Wrapf(err, "change replicas of r%d failed", referenceDesc.RangeID)
 	}
 	log.Event(ctx, "txn complete")
 	return returnDesc, nil
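
For readers who want the gist of the change without the surrounding
CockroachDB machinery, the following is a minimal, self-contained Go
sketch of the idempotency pattern the patch introduces. Everything in
it (descriptor, change, execChange) is a hypothetical stand-in rather
than the real roachpb/client API, and it omits the transaction and the
CPut on the range descriptor key that the real execChangeReplicasTxn
performs.

// Illustrative sketch only: a toy model of the "make leaving a joint
// config idempotent" behavior. All types and helpers below are
// hypothetical stand-ins; they are not CockroachDB's roachpb/client APIs.
package main

import (
	"errors"
	"fmt"
)

// descriptor stands in for roachpb.RangeDescriptor.
type descriptor struct {
	generation  int
	jointConfig bool // stands in for Replicas().InAtomicReplicationChange()
}

// change stands in for internalReplicationChange.
type change struct{ addNode int }

// execChange mimics the control flow of execChangeReplicasTxn after the
// patch: with no changes requested ("leave the joint config"), the
// expectation check accepts any descriptor, and if the config is no longer
// joint by the time it is read, the call succeeds as a no-op.
func execChange(kvDesc, referenceDesc descriptor, chgs []change) (descriptor, error) {
	check := func(cur descriptor) bool {
		if len(chgs) == 0 {
			// Leaving a joint config races often, so verify nothing here;
			// the no-op decision happens after the read below.
			return true
		}
		// Otherwise require the descriptor the caller based its decision on.
		return cur == referenceDesc
	}

	// Stand-in for conditionalGetDescValueFromDB: read the descriptor and
	// fail if it does not pass the expectation check.
	if !check(kvDesc) {
		return descriptor{}, errors.New("descriptor changed")
	}
	if len(chgs) == 0 && !kvDesc.jointConfig {
		// Another actor already transitioned out of the joint config.
		// Before the patch this surfaced as an error; now it is a no-op.
		return kvDesc, nil
	}
	// In the real code this is where the change trigger is prepared and the
	// descriptor is CPut; the toy just records that a transition happened.
	kvDesc.generation++
	kvDesc.jointConfig = false
	return kvDesc, nil
}

func main() {
	stale := descriptor{generation: 1, jointConfig: true}
	// Another actor already left the joint config and bumped the generation.
	current := descriptor{generation: 2, jointConfig: false}

	// Actor A wanted to leave the joint config but was raced: success, no-op.
	fmt.Println(execChange(current, stale, nil))

	// A real replication change against a stale expectation still fails.
	fmt.Println(execChange(current, stale, []change{{addNode: 4}}))
}

The sketch makes the same point the diff makes: the no-op decision is
taken against the descriptor actually read from KV, not against the
possibly stale referenceDesc the caller passed in, which is why a raced
"leave joint config" now succeeds instead of returning a
descriptor-changed error.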