From 4d8123a498406d40cb65b4bd0e031ae1626a4b9d Mon Sep 17 00:00:00 2001
From: Peter Mattis
Date: Mon, 26 Sep 2016 14:54:32 -0400
Subject: [PATCH] storage: don't campaign the RHS after splits

Remove the hack to campaign the RHS on one and only one of the nodes.
The hack is no longer necessary now that we're campaigning idle ranges
when the first proposal is received. By removing the hack, we're
effectively leaving the RHS of a split as a lazily loaded Raft group.

Tweaked Store.Bootstrap so that it allows eager campaigning of idle
replicas immediately upon startup.

Discovered while investigating performance hiccups in #9465.
---
 storage/client_raft_test.go | 17 ++++++++
 storage/replica_command.go  |  3 +-
 storage/store.go            | 77 +++++++------------------------------
 storage/store_test.go       |  4 ++
 4 files changed, 36 insertions(+), 65 deletions(-)

diff --git a/storage/client_raft_test.go b/storage/client_raft_test.go
index c46708a733b2..80a36d1562fc 100644
--- a/storage/client_raft_test.go
+++ b/storage/client_raft_test.go
@@ -1557,6 +1557,23 @@ func TestReplicaRemovalCampaign(t *testing.T) {
 	}
 	replica2 := store0.LookupReplica(roachpb.RKey(key2), nil)
+
+	rg2 := func(s *storage.Store) client.Sender {
+		return client.Wrap(s, func(ba roachpb.BatchRequest) roachpb.BatchRequest {
+			if ba.RangeID == 0 {
+				ba.RangeID = replica2.RangeID
+			}
+			return ba
+		})
+	}
+
+	// Raft processing is initialized lazily; issue a no-op write request to
+	// ensure that the Raft group has been started.
+	incArgs := incrementArgs(key2, 0)
+	if _, err := client.SendWrapped(rg2(store0), nil, &incArgs); err != nil {
+		t.Fatal(err)
+	}
+
 	if td.remove {
 		// Simulate second replica being transferred by removing it.
 		if err := store0.RemoveReplica(replica2, *replica2.Desc(), true); err != nil {
diff --git a/storage/replica_command.go b/storage/replica_command.go
index 45b5418c7ad9..2db3111bb511 100644
--- a/storage/replica_command.go
+++ b/storage/replica_command.go
@@ -2308,7 +2308,8 @@ func (r *Replica) AdminSplit(
 	leftDesc := *desc
 	leftDesc.EndKey = splitKey
 
-	log.Infof(ctx, "initiating a split of this range at key %s", splitKey)
+	log.Infof(ctx, "initiating a split of this range at key %s [r%d]",
+		splitKey, rightDesc.RangeID)
 
 	if err := r.store.DB().Txn(ctx, func(txn *client.Txn) error {
 		log.Event(ctx, "split closure begins")
diff --git a/storage/store.go b/storage/store.go
index 492f4aedb0c8..7bb6ee6bc0d6 100644
--- a/storage/store.go
+++ b/storage/store.go
@@ -1224,8 +1224,18 @@ func (s *Store) Bootstrap(ident roachpb.StoreIdent, stopper *stop.Stopper) error
 		}
 		return errors.Errorf("store %s is non-empty but does not contain store metadata (first %d key/values: %s)", s.engine, len(keyVals), keyVals)
 	}
-	err = engine.MVCCPutProto(context.Background(), s.engine, nil, keys.StoreIdentKey(), hlc.ZeroTimestamp, nil, &s.Ident)
-	return err
+	err = engine.MVCCPutProto(context.Background(), s.engine, nil,
+		keys.StoreIdentKey(), hlc.ZeroTimestamp, nil, &s.Ident)
+	if err != nil {
+		return err
+	}
+
+	// If we're bootstrapping the store we can campaign idle replicas
+	// immediately. This primarily affects tests.
+	s.idleReplicaElectionTime.Lock()
+	s.idleReplicaElectionTime.at = s.Clock().PhysicalTime()
+	s.idleReplicaElectionTime.Unlock()
+	return nil
 }
 
 // GetReplica fetches a replica by Range ID. Returns an error if no replica is found.
@@ -1504,68 +1514,7 @@ func splitTriggerPostCommit(
 	r.store.replicateQueue.MaybeAdd(r, now)
 	r.store.replicateQueue.MaybeAdd(rightRng, now)
 
-	// To avoid leaving the RHS range unavailable as it waits to elect
-	// its leader, one (and only one) of the nodes should start an
-	// election as soon as the split is processed.
-	//
-	// If there is only one replica, raft instantly makes it the leader.
-	// Otherwise, we must trigger a campaign here.
-	//
-	// TODO(bdarnell): The asynchronous campaign can cause a problem
-	// with merges, by recreating a replica that should have been
-	// destroyed. This will probably be addressed as a part of the
-	// general work to be done for merges
-	// (https://github.com/cockroachdb/cockroach/issues/2433), but for
-	// now we're safe because we only perform the asynchronous
-	// campaign when there are multiple replicas, and we only perform
-	// merges in testing scenarios with a single replica.
-	// Note that in multi-node scenarios the async campaign is safe
-	// because it has exactly the same behavior as an incoming message
-	// from another node; the problem here is only with merges.
-	rightRng.mu.Lock()
-	shouldCampaign := rightRng.mu.state.Lease.OwnedBy(r.store.StoreID())
-	rightRng.mu.Unlock()
-	if len(split.RightDesc.Replicas) > 1 && shouldCampaign {
-		// Schedule the campaign a short time in the future. As
-		// followers process the split, they destroy and recreate their
-		// raft groups, which can cause messages to be dropped. In
-		// general a shorter delay (perhaps all the way down to zero) is
-		// better in production, because the race is rare and the worst
-		// case scenario is that we simply wait for an election timeout.
-		// However, the test for this feature disables election timeouts
-		// and relies solely on this campaign trigger, so it is unacceptably
-		// flaky without a bit of a delay.
-		//
-		// Note: you must not use the context inside of this task since it may
-		// contain a finished trace by the time it runs.
-		if err := r.store.stopper.RunAsyncTask(ctx, func(ctx context.Context) {
-			time.Sleep(10 * time.Millisecond)
-			// Make sure that rightRng hasn't been removed.
-			replica, err := r.store.GetReplica(rightRng.RangeID)
-			if err != nil {
-				if _, ok := err.(*roachpb.RangeNotFoundError); ok {
-					log.Infof(ctx, "%s: RHS replica %d removed before campaigning",
-						r, r.mu.replicaID)
-				} else {
-					log.Infof(ctx, "%s: RHS replica %d unable to campaign: %s",
-						r, r.mu.replicaID, err)
-				}
-				return
-			}
-
-			if err := replica.withRaftGroup(func(raftGroup *raft.RawNode) (bool, error) {
-				if err := raftGroup.Campaign(); err != nil {
-					log.Warningf(ctx, "%s: error %v", r, err)
-				}
-				return true, nil
-			}); err != nil {
-				panic(err)
-			}
-		}); err != nil {
-			log.Warningf(ctx, "%s: error %v", r, err)
-			return
-		}
-	} else if len(split.RightDesc.Replicas) == 1 {
+	if len(split.RightDesc.Replicas) == 1 {
 		// TODO(peter): In single-node clusters, we enqueue the right-hand side of
 		// the split (the new range) for Raft processing so that the corresponding
 		// Raft group is created. This shouldn't be necessary for correctness, but
diff --git a/storage/store_test.go b/storage/store_test.go
index d1b20f9aa2f0..e342052ff4d0 100644
--- a/storage/store_test.go
+++ b/storage/store_test.go
@@ -2470,6 +2470,10 @@ func TestCanCampaignIdleReplica(t *testing.T) {
 	s := tc.store
 	ctx := context.Background()
+	s.idleReplicaElectionTime.Lock()
+	s.idleReplicaElectionTime.at = time.Time{}
+	s.idleReplicaElectionTime.Unlock()
+
 	// Bump the clock by the election timeout. Idle replicas can't campaign
 	// eagerly yet because we haven't gossiped the store descriptor.
 	electionTimeout := int64(s.ctx.RaftTickInterval * time.Duration(s.ctx.RaftElectionTimeoutTicks))
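
For context on the mechanism the commit message relies on ("campaigning idle ranges when the first proposal is received"), here is a minimal, self-contained Go sketch of the store-level gate this patch touches. It is an illustration under assumptions, not the cockroachdb/cockroach implementation: the store type, the idleReplicaElectionTimeAt field, and the enableIdleCampaigning/canCampaignIdleReplica helpers are hypothetical names chosen only to mirror the idleReplicaElectionTime field and TestCanCampaignIdleReplica test visible in the diff above.

// Illustrative sketch only -- not the cockroachdb/cockroach code. All names
// below (store, idleReplicaElectionTimeAt, enableIdleCampaigning,
// canCampaignIdleReplica) are hypothetical.
package main

import (
	"fmt"
	"sync"
	"time"
)

type store struct {
	mu struct {
		sync.Mutex
		// idleReplicaElectionTimeAt is the earliest time at which idle
		// replicas may campaign; the zero value means "not yet allowed".
		idleReplicaElectionTimeAt time.Time
	}
}

// enableIdleCampaigning opens the gate at the given time. A freshly
// bootstrapped store can open it immediately; otherwise it stays closed
// until some later point (e.g. after the store descriptor is gossiped).
func (s *store) enableIdleCampaigning(at time.Time) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.mu.idleReplicaElectionTimeAt = at
}

// canCampaignIdleReplica reports whether an idle replica that just received
// its first proposal is allowed to campaign for Raft leadership now.
func (s *store) canCampaignIdleReplica(now time.Time) bool {
	s.mu.Lock()
	defer s.mu.Unlock()
	at := s.mu.idleReplicaElectionTimeAt
	return !at.IsZero() && !now.Before(at)
}

func main() {
	var s store
	fmt.Println(s.canCampaignIdleReplica(time.Now())) // false: gate still closed
	s.enableIdleCampaigning(time.Now())               // e.g. right after bootstrap
	fmt.Println(s.canCampaignIdleReplica(time.Now())) // true: idle replicas may campaign
}

In the patch itself, Store.Bootstrap sets idleReplicaElectionTime.at to the current physical time so a freshly bootstrapped store may campaign idle replicas right away, while TestCanCampaignIdleReplica resets it to the zero time so the test still exercises the path where idle replicas cannot campaign before the store descriptor has been gossiped.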