Skip to content

Commit

Permalink
kv: don't unquiesce uninitialized replicas
Browse files Browse the repository at this point in the history
In a [support issue](cockroachlabs/support#1340), we
saw that 10s of thousands of uninitialized replicas were being ticked regularly
and creating a large amount of background work on a node, driving up CPU. This
commit updates the Raft quiescence logic to disallow uninitialized replicas from
being unquiesced and Tick()'ing themselves.

Keeping uninitialized replicas quiesced even in the presence of Raft traffic
avoids wasted work. We could Tick() these replicas, but doing so is unnecessary
because uninitialized replicas can never win elections, so there is no reason
for them to ever call an election. In fact, uninitialized replicas do not even
know who their peers are, so there would be no way for them to call an election
or for them to send any other non-reactive message. As a result, all work
performed by an uninitialized replica is reactive and in response to incoming
messages (see processRequestQueue).

There are multiple ways for an uninitialized replica to be created and
then abandoned, and we don't do a good job garbage collecting them at a
later point (see cockroachdb#73424),
so it is important that they are cheap. Keeping them quiesced instead of
letting them unquiesce and tick every 200ms indefinitely avoids a
meaningful amount of periodic work for each uninitialized replica.

Release notes (bug fix): uninitialized replicas that are abandoned after an
unsuccessful snapshot no longer perform periodic background work, so they no
longer have a non-negligible cost.
  • Loading branch information
nvanbenschoten committed Dec 6, 2021
1 parent c3b86cd commit 6b66392
Show file tree
Hide file tree
Showing 4 changed files with 114 additions and 2 deletions.
66 changes: 66 additions & 0 deletions pkg/kv/kvserver/client_raft_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4090,6 +4090,72 @@ func TestRangeQuiescence(t *testing.T) {
}
}

// TestUninitializedReplicaRemainsQuiesced verifies that an uninitialized
// replica remains quiesced until it receives the snapshot that initializes it.
func TestUninitializedReplicaRemainsQuiesced(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)

ctx := context.Background()
tc := testcluster.StartTestCluster(t, 2, base.TestClusterArgs{
ReplicationMode: base.ReplicationManual,
})
defer tc.Stopper().Stop(ctx)

_, desc, err := tc.Servers[0].ScratchRangeEx()
key := desc.StartKey.AsRawKey()
require.NoError(t, err)
require.NoError(t, tc.WaitForSplitAndInitialization(key))

// Block a snapshot.
blockSnapshot := make(chan struct{})
handlerFuncs := noopRaftHandlerFuncs()
handlerFuncs.snapErr = func(header *kvserver.SnapshotRequest_Header) error {
<-blockSnapshot
return nil
}
s1, err := tc.Server(1).GetStores().(*kvserver.Stores).GetStore(tc.Server(1).GetFirstStoreID())
require.NoError(t, err)
tc.Servers[1].RaftTransport().Listen(s1.StoreID(), &unreliableRaftHandler{
rangeID: desc.RangeID,
RaftMessageHandler: s1,
unreliableRaftHandlerFuncs: handlerFuncs,
})

// Try to up-replicate to s1. Should block on a learner snapshot after the new
// replica on s1 has been created, but before it has been initialized. While
// the replica is uninitialized, it should remain quiesced, even while it is
// receiving Raft traffic from the leader.
replicateErrChan := make(chan error)
go func() {
_, err := tc.AddVoters(key, tc.Target(1))
replicateErrChan <- err
}()
testutils.SucceedsSoon(t, func() error {
repl, err := s1.GetReplica(desc.RangeID)
if err == nil {
// IMPORTANT: the replica should always be quiescent while uninitialized.
require.False(t, repl.IsInitialized())
require.True(t, repl.IsQuiescent())
}
return err
})

// Let the snapshot through. The up-replication attempt should succeed, the
// replica should now be initialized, and the replica should quiesce again.
close(blockSnapshot)
require.NoError(t, <-replicateErrChan)
repl, err := s1.GetReplica(desc.RangeID)
require.NoError(t, err)
require.True(t, repl.IsInitialized())
testutils.SucceedsSoon(t, func() error {
if !repl.IsQuiescent() {
return errors.Errorf("%s not quiescent", repl)
}
return nil
})
}

// TestInitRaftGroupOnRequest verifies that an uninitialized Raft group
// is initialized if a request is received, even if the current range
// lease points to a different replica.
Expand Down
9 changes: 9 additions & 0 deletions pkg/kv/kvserver/replica.go
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,15 @@ type Replica struct {
destroyStatus
// Is the range quiescent? Quiescent ranges are not Tick()'d and unquiesce
// whenever a Raft operation is performed.
//
// Replica objects always begin life in a quiescent state, as the field is
// set to true in the Replica constructor newUnloadedReplica. They unquiesce
// and set the field to false in either unquiesceAndWakeLeaderLocked or
// unquiesceWithOptionsLocked, which are called in response to Raft traffic.
//
// Only initialized replicas that have a non-nil internalRaftGroup are
// allowed to unquiesce and be Tick()'d. See canUnquiesceRLocked for an
// explanation of these conditions.
quiescent bool
// laggingFollowersOnQuiesce is the set of dead replicas that are not
// up-to-date with the rest of the quiescent Raft group. Nil if !quiescent.
Expand Down
33 changes: 31 additions & 2 deletions pkg/kv/kvserver/replica_raft_quiesce.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ func (r *Replica) unquiesceLocked() {
}

func (r *Replica) unquiesceWithOptionsLocked(campaignOnWake bool) {
if r.mu.quiescent && r.mu.internalRaftGroup != nil {
if r.canUnquiesceRLocked() {
ctx := r.AnnotateCtx(context.TODO())
if log.V(3) {
log.Infof(ctx, "unquiescing %d", r.RangeID)
Expand All @@ -71,7 +71,7 @@ func (r *Replica) unquiesceWithOptionsLocked(campaignOnWake bool) {
}

func (r *Replica) unquiesceAndWakeLeaderLocked() {
if r.mu.quiescent && r.mu.internalRaftGroup != nil {
if r.canUnquiesceRLocked() {
ctx := r.AnnotateCtx(context.TODO())
if log.V(3) {
log.Infof(ctx, "unquiescing %d: waking leader", r.RangeID)
Expand All @@ -88,6 +88,35 @@ func (r *Replica) unquiesceAndWakeLeaderLocked() {
}
}

func (r *Replica) canUnquiesceRLocked() bool {
return r.mu.quiescent &&
// If the replica is uninitialized (i.e. it contains no replicated state),
// it is not allowed to unquiesce and begin Tick()'ing itself.
//
// Keeping uninitialized replicas quiesced even in the presence of Raft
// traffic avoids wasted work. We could Tick() these replicas, but doing so
// is unnecessary because uninitialized replicas can never win elections, so
// there is no reason for them to ever call an election. In fact,
// uninitialized replicas do not even know who their peers are, so there
// would be no way for them to call an election or for them to send any
// other non-reactive message. As a result, all work performed by an
// uninitialized replica is reactive and in response to incoming messages
// (see processRequestQueue).
//
// There are multiple ways for an uninitialized replica to be created and
// then abandoned, and we don't do a good job garbage collecting them at a
// later point (see https://github.com/cockroachdb/cockroach/issues/73424),
// so it is important that they are cheap. Keeping them quiesced instead of
// letting them unquiesce and tick every 200ms indefinitely avoids a
// meaningful amount of periodic work for each uninitialized replica.
r.isInitializedRLocked() &&
// A replica's Raft group begins in a dormant state and is initialized
// lazily in response to any Raft traffic (see stepRaftGroup) or KV request
// traffic (see maybeInitializeRaftGroup). If it has yet to be initialized,
// let it remain quiesced. The Raft group will be initialized soon enough.
r.mu.internalRaftGroup != nil
}

// maybeQuiesceLocked checks to see if the replica is quiescable and initiates
// quiescence if it is. Returns true if the replica has been quiesced and false
// otherwise.
Expand Down
8 changes: 8 additions & 0 deletions pkg/kv/kvserver/store_create_replica.go
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,14 @@ func (s *Store) maybeMarkReplicaInitializedLockedReplLocked(
it.item.key(), it)
}

// Unquiesce the replica. We don't allow uninitialized replicas to unquiesce,
// but now that the replica has been initialized, we unquiesce it as soon as
// possible. This replica was initialized in response to the reception of a
// snapshot from another replica. This means that the other replica is not
// quiesced, so we don't need to campaign or wake the leader. We just want
// to start ticking.
lockedRepl.unquiesceWithOptionsLocked(false /* campaignOnWake */)

// Add the range to metrics and maybe gossip on capacity change.
s.metrics.ReplicaCount.Inc(1)
s.maybeGossipOnCapacityChange(ctx, rangeAddEvent)
Expand Down

0 comments on commit 6b66392

Please sign in to comment.