storage: don't campaign the RHS after splits
Remove the hack to campaign the RHS on one and only one of the
nodes. The hack is no longer necessary now that we're campaigning idle
ranges when the first proposal is received. By removing the hack, we're
effectively leaving the RHS of a split as a lazily loaded Raft
group. Tweaked multiTestContext so that it allows eager campaigning of
idle replicas immediately upon startup.

Discovered while investigating performance hiccups in cockroachdb#9465.
petermattis committed Sep 26, 2016
1 parent 3e55a5d commit 0f5ae8a
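
To make the "lazily loaded Raft group" behavior concrete, here is a minimal Go sketch of the pattern the commit message relies on: nothing campaigns when the split commits; the RHS's Raft group is started, and a campaign triggered, only when the first proposal arrives. The types and names below are hypothetical illustrations, not CockroachDB's implementation.

package main

import "fmt"

// lazyReplica is a hypothetical stand-in for the right-hand side of a split:
// its Raft group is neither created nor campaigned when the split commits.
type lazyReplica struct {
	groupStarted bool
	campaigned   bool
}

// propose starts the Raft group lazily and campaigns on the first proposal,
// which is the behavior this commit relies on instead of the removed
// post-split campaign hack.
func (r *lazyReplica) propose(cmd string) {
	if !r.groupStarted {
		r.groupStarted = true
		fmt.Println("lazily starting Raft group")
	}
	if !r.campaigned {
		r.campaigned = true
		fmt.Println("campaigning on first proposal")
	}
	fmt.Println("proposing:", cmd)
}

func main() {
	// After a split, the RHS sits idle; nothing happens until a request arrives.
	var rhs lazyReplica
	rhs.propose("first write to the new range")
}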
Showing 4 changed files with 25 additions and 62 deletions.
17 changes: 17 additions & 0 deletions storage/client_raft_test.go
@@ -1557,6 +1557,23 @@ func TestReplicaRemovalCampaign(t *testing.T) {
 	}
 
 	replica2 := store0.LookupReplica(roachpb.RKey(key2), nil)
+
+	rg2 := func(s *storage.Store) client.Sender {
+		return client.Wrap(s, func(ba roachpb.BatchRequest) roachpb.BatchRequest {
+			if ba.RangeID == 0 {
+				ba.RangeID = replica2.RangeID
+			}
+			return ba
+		})
+	}
+
+	// Raft processing is initialized lazily; issue a no-op write request to
+	// ensure that the Raft group has been started.
+	incArgs := incrementArgs(key2, 0)
+	if _, err := client.SendWrapped(rg2(store0), nil, &incArgs); err != nil {
+		t.Fatal(err)
+	}
 
 	if td.remove {
 		// Simulate second replica being transferred by removing it.
 		if err := store0.RemoveReplica(replica2, *replica2.Desc(), true); err != nil {
1 change: 1 addition & 0 deletions storage/client_test.go
@@ -635,6 +635,7 @@ func (m *multiTestContext) addStore(idx int) {
 			}
 		}
 	}
+	store.AllowIdleReplicaCampaign()
 
 	ln, err := netutil.ListenAndServeGRPC(m.transportStopper, grpcServer, util.TestAddr)
 	if err != nil {
6 changes: 6 additions & 0 deletions storage/helpers_test.go
@@ -158,6 +158,12 @@ func (s *Store) ManualReplicaGC(repl *Replica) error {
 	return s.gcQueue.process(s.Ctx(), s.Clock().Now(), repl, cfg)
 }
 
+func (s *Store) AllowIdleReplicaCampaign() {
+	s.idleReplicaElectionTime.Lock()
+	s.idleReplicaElectionTime.at = s.Clock().PhysicalTime()
+	s.idleReplicaElectionTime.Unlock()
+}
+
 func (r *Replica) RaftLock() {
 	r.raftMu.Lock()
 }
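
AllowIdleReplicaCampaign above moves the store's idle-replica election threshold to the current time, which is what lets multiTestContext campaign idle replicas eagerly at startup. Below is a rough, self-contained sketch of that gating idea, using hypothetical names rather than the real Store fields:

package main

import (
	"fmt"
	"sync"
	"time"
)

// store is a hypothetical stand-in: idle replicas are only allowed to
// campaign once the election threshold has been reached, which avoids
// spurious elections while a cluster is still starting up.
type store struct {
	mu          sync.Mutex
	electionsAt time.Time // zero value means "not yet decided"
}

// canCampaignIdleReplica reports whether an idle replica may campaign now.
func (s *store) canCampaignIdleReplica(now time.Time) bool {
	s.mu.Lock()
	defer s.mu.Unlock()
	return !s.electionsAt.IsZero() && !now.Before(s.electionsAt)
}

// allowIdleReplicaCampaign mirrors the test helper: pull the threshold
// back to "now" so idle replicas may campaign immediately.
func (s *store) allowIdleReplicaCampaign(now time.Time) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.electionsAt = now
}

func main() {
	s := &store{electionsAt: time.Now().Add(10 * time.Second)}
	fmt.Println(s.canCampaignIdleReplica(time.Now())) // false: still inside the startup window
	s.allowIdleReplicaCampaign(time.Now())
	fmt.Println(s.canCampaignIdleReplica(time.Now())) // true: campaigning allowed immediately
}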
63 changes: 1 addition & 62 deletions storage/store.go
@@ -1504,68 +1504,7 @@ func splitTriggerPostCommit(
 	r.store.replicateQueue.MaybeAdd(r, now)
 	r.store.replicateQueue.MaybeAdd(rightRng, now)
 
-	// To avoid leaving the RHS range unavailable as it waits to elect
-	// its leader, one (and only one) of the nodes should start an
-	// election as soon as the split is processed.
-	//
-	// If there is only one replica, raft instantly makes it the leader.
-	// Otherwise, we must trigger a campaign here.
-	//
-	// TODO(bdarnell): The asynchronous campaign can cause a problem
-	// with merges, by recreating a replica that should have been
-	// destroyed. This will probably be addressed as a part of the
-	// general work to be done for merges
-	// (https://github.com/cockroachdb/cockroach/issues/2433), but for
-	// now we're safe because we only perform the asynchronous
-	// campaign when there are multiple replicas, and we only perform
-	// merges in testing scenarios with a single replica.
-	// Note that in multi-node scenarios the async campaign is safe
-	// because it has exactly the same behavior as an incoming message
-	// from another node; the problem here is only with merges.
-	rightRng.mu.Lock()
-	shouldCampaign := rightRng.mu.state.Lease.OwnedBy(r.store.StoreID())
-	rightRng.mu.Unlock()
-	if len(split.RightDesc.Replicas) > 1 && shouldCampaign {
-		// Schedule the campaign a short time in the future. As
-		// followers process the split, they destroy and recreate their
-		// raft groups, which can cause messages to be dropped. In
-		// general a shorter delay (perhaps all the way down to zero) is
-		// better in production, because the race is rare and the worst
-		// case scenario is that we simply wait for an election timeout.
-		// However, the test for this feature disables election timeouts
-		// and relies solely on this campaign trigger, so it is unacceptably
-		// flaky without a bit of a delay.
-		//
-		// Note: you must not use the context inside of this task since it may
-		// contain a finished trace by the time it runs.
-		if err := r.store.stopper.RunAsyncTask(ctx, func(ctx context.Context) {
-			time.Sleep(10 * time.Millisecond)
-			// Make sure that rightRng hasn't been removed.
-			replica, err := r.store.GetReplica(rightRng.RangeID)
-			if err != nil {
-				if _, ok := err.(*roachpb.RangeNotFoundError); ok {
-					log.Infof(ctx, "%s: RHS replica %d removed before campaigning",
-						r, r.mu.replicaID)
-				} else {
-					log.Infof(ctx, "%s: RHS replica %d unable to campaign: %s",
-						r, r.mu.replicaID, err)
-				}
-				return
-			}
-
-			if err := replica.withRaftGroup(func(raftGroup *raft.RawNode) (bool, error) {
-				if err := raftGroup.Campaign(); err != nil {
-					log.Warningf(ctx, "%s: error %v", r, err)
-				}
-				return true, nil
-			}); err != nil {
-				panic(err)
-			}
-		}); err != nil {
-			log.Warningf(ctx, "%s: error %v", r, err)
-			return
-		}
-	} else if len(split.RightDesc.Replicas) == 1 {
+	if len(split.RightDesc.Replicas) == 1 {
 		// TODO(peter): In single-node clusters, we enqueue the right-hand side of
 		// the split (the new range) for Raft processing so that the corresponding
 		// Raft group is created. This shouldn't be necessary for correctness, but
