storage: don't campaign the RHS after splits
Remove the hack to campaign the RHS on one and only one of the
nodes. The hack is no longer necessary now that we're campaigning idle
ranges when the first proposal is received. By removing the hack, we're
effectively leaving the RHS of a split as a lazily loaded Raft
group. Tweaked multiTestContext so that it allows eager campaigning of
idle replicas immediately upon startup.

Discovered while investigating performance hiccups in cockroachdb#9465.
petermattis committed Sep 26, 2016
1 parent 3e55a5d commit 0f5ae8a
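
To make the "lazily loaded Raft group" behavior concrete, here is a minimal Go sketch of the pattern the commit message relies on: nothing campaigns when the split commits; the RHS's Raft group is started, and a campaign triggered, only when the first proposal arrives. The types and names below are hypothetical illustrations, not CockroachDB's implementation.

package main

import "fmt"

// lazyReplica is a hypothetical stand-in for the right-hand side of a split:
// its Raft group is neither created nor campaigned when the split commits.
type lazyReplica struct {
	groupStarted bool
	campaigned   bool
}

// propose starts the Raft group lazily and campaigns on the first proposal,
// which is the behavior this commit relies on instead of the removed
// post-split campaign hack.
func (r *lazyReplica) propose(cmd string) {
	if !r.groupStarted {
		r.groupStarted = true
		fmt.Println("lazily starting Raft group")
	}
	if !r.campaigned {
		r.campaigned = true
		fmt.Println("campaigning on first proposal")
	}
	fmt.Println("proposing:", cmd)
}

func main() {
	// After a split, the RHS sits idle; nothing happens until a request arrives.
	var rhs lazyReplica
	rhs.propose("first write to the new range")
}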
Showing 4 changed files with 25 additions and 62 deletions.
17 changes: 17 additions & 0 deletions storage/client_raft_test.go
@@ -1557,6 +1557,23 @@ func TestReplicaRemovalCampaign(t *testing.T) {
 	}
 
 	replica2 := store0.LookupReplica(roachpb.RKey(key2), nil)
+
+	rg2 := func(s *storage.Store) client.Sender {
+		return client.Wrap(s, func(ba roachpb.BatchRequest) roachpb.BatchRequest {
+			if ba.RangeID == 0 {
+				ba.RangeID = replica2.RangeID
+			}
+			return ba
+		})
+	}
+
+	// Raft processing is initialized lazily; issue a no-op write request to
+	// ensure that the Raft group has been started.
+	incArgs := incrementArgs(key2, 0)
+	if _, err := client.SendWrapped(rg2(store0), nil, &incArgs); err != nil {
+		t.Fatal(err)
+	}
 
 	if td.remove {
 		// Simulate second replica being transferred by removing it.
 		if err := store0.RemoveReplica(replica2, *replica2.Desc(), true); err != nil {
1 change: 1 addition & 0 deletions storage/client_test.go
@@ -635,6 +635,7 @@ func (m *multiTestContext) addStore(idx int) {
 			}
 		}
 	}
+	store.AllowIdleReplicaCampaign()
 
 	ln, err := netutil.ListenAndServeGRPC(m.transportStopper, grpcServer, util.TestAddr)
 	if err != nil {
6 changes: 6 additions & 0 deletions storage/helpers_test.go
@@ -158,6 +158,12 @@ func (s *Store) ManualReplicaGC(repl *Replica) error {
 	return s.gcQueue.process(s.Ctx(), s.Clock().Now(), repl, cfg)
 }
 
+func (s *Store) AllowIdleReplicaCampaign() {
+	s.idleReplicaElectionTime.Lock()
+	s.idleReplicaElectionTime.at = s.Clock().PhysicalTime()
+	s.idleReplicaElectionTime.Unlock()
+}
+
 func (r *Replica) RaftLock() {
 	r.raftMu.Lock()
 }
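
AllowIdleReplicaCampaign above moves the store's idle-replica election threshold to the current time, which is what lets multiTestContext campaign idle replicas eagerly at startup. Below is a rough, self-contained sketch of that gating idea, using hypothetical names rather than the real Store fields:

package main

import (
	"fmt"
	"sync"
	"time"
)

// store is a hypothetical stand-in: idle replicas are only allowed to
// campaign once the election threshold has been reached, which avoids
// spurious elections while a cluster is still starting up.
type store struct {
	mu          sync.Mutex
	electionsAt time.Time // zero value means "not yet decided"
}

// canCampaignIdleReplica reports whether an idle replica may campaign now.
func (s *store) canCampaignIdleReplica(now time.Time) bool {
	s.mu.Lock()
	defer s.mu.Unlock()
	return !s.electionsAt.IsZero() && !now.Before(s.electionsAt)
}

// allowIdleReplicaCampaign mirrors the test helper: pull the threshold
// back to "now" so idle replicas may campaign immediately.
func (s *store) allowIdleReplicaCampaign(now time.Time) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.electionsAt = now
}

func main() {
	s := &store{electionsAt: time.Now().Add(10 * time.Second)}
	fmt.Println(s.canCampaignIdleReplica(time.Now())) // false: still inside the startup window
	s.allowIdleReplicaCampaign(time.Now())
	fmt.Println(s.canCampaignIdleReplica(time.Now())) // true: campaigning allowed immediately
}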
63 changes: 1 addition & 62 deletions storage/store.go
@@ -1504,68 +1504,7 @@ func splitTriggerPostCommit(
 	r.store.replicateQueue.MaybeAdd(r, now)
 	r.store.replicateQueue.MaybeAdd(rightRng, now)
 
-	// To avoid leaving the RHS range unavailable as it waits to elect
-	// its leader, one (and only one) of the nodes should start an
-	// election as soon as the split is processed.
-	//
-	// If there is only one replica, raft instantly makes it the leader.
-	// Otherwise, we must trigger a campaign here.
-	//
-	// TODO(bdarnell): The asynchronous campaign can cause a problem
-	// with merges, by recreating a replica that should have been
-	// destroyed. This will probably be addressed as a part of the
-	// general work to be done for merges
-	// (https://github.com/cockroachdb/cockroach/issues/2433), but for
-	// now we're safe because we only perform the asynchronous
-	// campaign when there are multiple replicas, and we only perform
-	// merges in testing scenarios with a single replica.
-	// Note that in multi-node scenarios the async campaign is safe
-	// because it has exactly the same behavior as an incoming message
-	// from another node; the problem here is only with merges.
-	rightRng.mu.Lock()
-	shouldCampaign := rightRng.mu.state.Lease.OwnedBy(r.store.StoreID())
-	rightRng.mu.Unlock()
-	if len(split.RightDesc.Replicas) > 1 && shouldCampaign {
-		// Schedule the campaign a short time in the future. As
-		// followers process the split, they destroy and recreate their
-		// raft groups, which can cause messages to be dropped. In
-		// general a shorter delay (perhaps all the way down to zero) is
-		// better in production, because the race is rare and the worst
-		// case scenario is that we simply wait for an election timeout.
-		// However, the test for this feature disables election timeouts
-		// and relies solely on this campaign trigger, so it is unacceptably
-		// flaky without a bit of a delay.
-		//
-		// Note: you must not use the context inside of this task since it may
-		// contain a finished trace by the time it runs.
-		if err := r.store.stopper.RunAsyncTask(ctx, func(ctx context.Context) {
-			time.Sleep(10 * time.Millisecond)
-			// Make sure that rightRng hasn't been removed.
-			replica, err := r.store.GetReplica(rightRng.RangeID)
-			if err != nil {
-				if _, ok := err.(*roachpb.RangeNotFoundError); ok {
-					log.Infof(ctx, "%s: RHS replica %d removed before campaigning",
-						r, r.mu.replicaID)
-				} else {
-					log.Infof(ctx, "%s: RHS replica %d unable to campaign: %s",
-						r, r.mu.replicaID, err)
-				}
-				return
-			}
-
-			if err := replica.withRaftGroup(func(raftGroup *raft.RawNode) (bool, error) {
-				if err := raftGroup.Campaign(); err != nil {
-					log.Warningf(ctx, "%s: error %v", r, err)
-				}
-				return true, nil
-			}); err != nil {
-				panic(err)
-			}
-		}); err != nil {
-			log.Warningf(ctx, "%s: error %v", r, err)
-			return
-		}
-	} else if len(split.RightDesc.Replicas) == 1 {
+	if len(split.RightDesc.Replicas) == 1 {
 		// TODO(peter): In single-node clusters, we enqueue the right-hand side of
 		// the split (the new range) for Raft processing so that the corresponding
 		// Raft group is created. This shouldn't be necessary for correctness, but
