storage: delay manual splits that would result in more snapshots
When a Range has followers that aren't replicating properly, splitting
that range results in a right-hand side with followers in a similar
state. Certain workloads (restore/import/presplit) can run large numbers
of splits against a given range, and this can result in a large number
of Raft snapshots that back up the Raft snapshot queue.

Ideally we'd never have any ranges that require a snapshot, but over
the last few weeks it has become clear that this is very difficult to
achieve, since the knowledge required to decide whether a snapshot
can efficiently be prevented is distributed across multiple nodes
that don't share the necessary information.

This commit is a bit of a nuclear option to address what is likely the
last big culprit behind the large numbers of Raft snapshots in
cockroachdb#31409.

With this change, we should expect to see Raft snapshots regularly when
a split/scatter phase of an import/restore is active, but never large
volumes at once (except perhaps for an initial spike).

Splits are delayed only for manual splits. In particular, the split
queue is not affected and could in theory cause Raft snapshots. However,
at the present juncture, adding delays in the split queue could cause
problems as well, so we retain the previous behavior there, which isn't
known to have caused problems.

More follow-up work on Raft snapshots will be necessary to bring some
more sanity to this area of the code.

Release note (bug fix): resolve a cluster degradation scenario that
could occur during IMPORT/RESTORE operations, manifested through a
high number of pending Raft snapshots.
tbg committed Nov 29, 2018
1 parent 07edde2 commit 96f7f0b
Showing 6 changed files with 364 additions and 3 deletions.
25 changes: 25 additions & 0 deletions pkg/base/config.go
@@ -490,7 +490,20 @@ type RaftConfig struct {
// single raft.Ready operation.
RaftMaxInflightMsgs int

// When a Replica has an empty log (i.e. last index zero), drop rejecting
// MsgAppResp for the first few ticks to allow the split trigger to perform
// the split.
//
// -1 to disable.
RaftPostSplitSuppressSnapshotTicks int
// Splitting a range which has a replica needing a snapshot results in two
// ranges in that state. The delay configured here slows down splits when in
// that situation (limited to splits not run through the split queue). The
// most important targets here are the splits performed by backup/restore.
//
// -1 to disable.
RaftDelaySplitToSuppressSnapshotTicks int
}

// SetDefaults initializes unset fields.
@@ -532,6 +545,18 @@ func (cfg *RaftConfig) SetDefaults() {
if cfg.RaftPostSplitSuppressSnapshotTicks == 0 {
cfg.RaftPostSplitSuppressSnapshotTicks = defaultRaftPostSplitSuppressSnapshotTicks
}

if cfg.RaftDelaySplitToSuppressSnapshotTicks == 0 {
// The Raft tick interval defaults to 200ms, and
// RaftPostSplitSuppressSnapshotTicks to 20 ticks. A total of 120 ticks is
// ~24s, which experimentally has been shown to allow the small pile (<100)
// of Raft snapshots observed at the beginning of an import/restore to be
// resolved.
cfg.RaftDelaySplitToSuppressSnapshotTicks = 100
if cfg.RaftPostSplitSuppressSnapshotTicks > 0 {
cfg.RaftDelaySplitToSuppressSnapshotTicks += cfg.RaftPostSplitSuppressSnapshotTicks
}
}
}

// RaftElectionTimeout returns the raft election timeout, as computed from the
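To make the resulting delay budget concrete: with the defaults referenced in the comment above (a 200ms Raft tick interval and RaftPostSplitSuppressSnapshotTicks = 20), SetDefaults arrives at 120 ticks, i.e. roughly 24 seconds of maximum split delay. A minimal standalone sketch of that arithmetic follows; the constant values are the assumed defaults mentioned in the comment, not values read from the config package.

package main

import (
	"fmt"
	"time"
)

func main() {
	// Assumed defaults, mirroring the comment in SetDefaults above.
	const raftTickInterval = 200 * time.Millisecond
	const postSplitSuppressSnapshotTicks = 20

	// Mirrors SetDefaults: 100 ticks plus the post-split suppression window.
	delayTicks := 100 + postSplitSuppressSnapshotTicks
	fmt.Println(time.Duration(delayTicks) * raftTickInterval) // prints 24s
}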
3 changes: 3 additions & 0 deletions pkg/storage/client_split_test.go
@@ -1417,6 +1417,9 @@ func runSetupSplitSnapshotRace(
sc.TestingKnobs.DisableAsyncIntentResolution = true
// Avoid fighting with the merge queue while trying to reproduce this race.
sc.TestingKnobs.DisableMergeQueue = true
// Disable the split delay mechanism, or it'll spend 10s going in circles.
// (We can't set it to zero as otherwise the default overrides us).
sc.RaftDelaySplitToSuppressSnapshotTicks = -1
mtc := &multiTestContext{storeConfig: &sc}
defer mtc.Stop()
mtc.Start(t, 6)
13 changes: 10 additions & 3 deletions pkg/storage/replica_command.go
@@ -170,7 +170,7 @@ func (r *Replica) AdminSplit(
return roachpb.AdminSplitResponse{}, pErr
}

reply, lastErr = r.adminSplitWithDescriptor(ctx, args, r.Desc())
reply, lastErr = r.adminSplitWithDescriptor(ctx, args, r.Desc(), true /* delayable */)
// On seeing a ConditionFailedError or an AmbiguousResultError, retry
// the command with the updated descriptor.
if retry := causer.Visit(lastErr, func(err error) bool {
@@ -258,7 +258,10 @@ func splitSnapshotWarningStr(rangeID roachpb.RangeID, status *raft.Status) strin
//
// See the comment on splitTrigger for details on the complexities.
func (r *Replica) adminSplitWithDescriptor(
ctx context.Context, args roachpb.AdminSplitRequest, desc *roachpb.RangeDescriptor,
ctx context.Context,
args roachpb.AdminSplitRequest,
desc *roachpb.RangeDescriptor,
delayable bool,
) (roachpb.AdminSplitResponse, error) {
var reply roachpb.AdminSplitResponse

@@ -337,7 +340,11 @@ func (r *Replica) adminSplitWithDescriptor(
}
leftDesc.EndKey = splitKey

extra := splitSnapshotWarningStr(r.RangeID, r.RaftStatus())
var extra string
if delayable {
extra += maybeDelaySplitToAvoidSnapshot(ctx, (*splitDelayHelper)(r))
}
extra += splitSnapshotWarningStr(r.RangeID, r.RaftStatus())

log.Infof(ctx, "initiating a split of this range at key %s [r%d]%s",
splitKey, rightDesc.RangeID, extra)
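For completeness, a hedged sketch of how a caller that should not be delayed would use the new parameter. The commit message notes that the split queue retains the previous behavior; the corresponding call site is not among the hunks shown on this page, so the line below illustrates the intended usage rather than quoting the change.

// Hypothetical call site (e.g. on the split queue path, not shown above):
// passing delayable=false preserves the pre-existing immediate-split behavior.
reply, pErr := r.adminSplitWithDescriptor(ctx, args, r.Desc(), false /* delayable */)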
157 changes: 157 additions & 0 deletions pkg/storage/split_delay_helper.go
@@ -0,0 +1,157 @@
// Copyright 2018 The Cockroach Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.

package storage

import (
"context"
"fmt"
"time"

"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"go.etcd.io/etcd/raft"
)

type splitDelayHelperI interface {
RaftStatus(context.Context) (roachpb.RangeID, *raft.Status)
ProposeEmptyCommand(ctx context.Context)
NumAttempts() int
Sleep(context.Context) time.Duration
}

type splitDelayHelper Replica

func (sdh *splitDelayHelper) RaftStatus(ctx context.Context) (roachpb.RangeID, *raft.Status) {
r := (*Replica)(sdh)
r.mu.RLock()
raftStatus := r.raftStatusRLocked()
if raftStatus != nil {
updateRaftProgressFromActivity(
ctx, raftStatus.Progress, r.descRLocked().Replicas, r.mu.lastUpdateTimes, timeutil.Now(),
)
}
r.mu.RUnlock()
return r.RangeID, raftStatus
}

func (sdh *splitDelayHelper) ProposeEmptyCommand(ctx context.Context) {
r := (*Replica)(sdh)
r.raftMu.Lock()
_ = r.withRaftGroup(true /* campaignOnWake */, func(rawNode *raft.RawNode) (bool, error) {
// NB: intentionally ignore the error (which can be ErrProposalDropped
// when there's an SST inflight).
_ = rawNode.Propose(encodeRaftCommandV1(makeIDKey(), nil))
// NB: we need to unquiesce as the group might be quiesced.
return true /* unquiesceAndWakeLeader */, nil
})
r.raftMu.Unlock()
}

func (sdh *splitDelayHelper) NumAttempts() int {
return (*Replica)(sdh).store.cfg.RaftDelaySplitToSuppressSnapshotTicks
}

func (sdh *splitDelayHelper) Sleep(ctx context.Context) time.Duration {
tBegin := timeutil.Now()

r := (*Replica)(sdh)
select {
case <-time.After(r.store.cfg.RaftTickInterval):
case <-ctx.Done():
}

return timeutil.Since(tBegin)
}

func maybeDelaySplitToAvoidSnapshot(ctx context.Context, sdh splitDelayHelperI) string {
// We have an "optimization" to avoid Raft snapshots by dropping some
// outgoing MsgAppResp (see the _ assignment below) which takes effect for
// RaftPostSplitSuppressSnapshotTicks ticks after an uninitialized replica
// is created. This check can err, in which case the snapshot will be
// delayed for that many ticks, and so we want to delay by at least as much
// plus a bit of padding to give a snapshot a chance to catch the follower
// up. If we run out of time, we'll resume the split no matter what.
_ = (*Replica)(nil).maybeDropMsgAppResp // guru assignment
maxDelaySplitToAvoidSnapshotTicks := sdh.NumAttempts()

var slept time.Duration
var extra string
var succeeded bool
for ticks := 0; ticks < maxDelaySplitToAvoidSnapshotTicks; ticks++ {
succeeded = false
extra = ""
rangeID, raftStatus := sdh.RaftStatus(ctx)

if raftStatus == nil {
// Don't delay on followers (we don't know when to stop). This case
// is hit rarely enough to not matter.
extra += "; not Raft leader"
succeeded = true
break
}

done := true
for replicaID, pr := range raftStatus.Progress {
if replicaID == raftStatus.Lead {
// TODO(tschottdorf): remove this once we have picked up
// https://github.com/etcd-io/etcd/pull/10279
continue
}

if pr.State != raft.ProgressStateReplicate {
if !pr.RecentActive {
if ticks == 0 {
// Having set done = false, we make sure we're not exiting early.
// This is important because we sometimes need that Raft proposal
// below to make the followers active as there's no chatter on an
// idle range. (Note that there's a theoretical race in which the
// follower becomes inactive again during the sleep, but the
// inactivity interval is much larger than a tick).
//
// Don't do this more than once though: if a follower is down,
// we don't want to delay splits for it.
done = false
}
extra += fmt.Sprintf("; r%d/%d inactive", rangeID, replicaID)
continue
}
done = false
extra += fmt.Sprintf("; replica r%d/%d not caught up: %+v", rangeID, replicaID, &pr)
}
}
if done {
succeeded = true
break
}
// Propose an empty command which works around a Raft bug that can
// leave a follower in ProgressStateProbe even though it has caught
// up.
sdh.ProposeEmptyCommand(ctx)
slept += sdh.Sleep(ctx)

if ctx.Err() != nil {
return ""
}
}

if slept != 0 {
extra += fmt.Sprintf("; delayed split for %.1fs to avoid Raft snapshot", slept.Seconds())
if !succeeded {
extra += " (without success)"
}
}

return extra
}
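Because the delay loop only talks to the small splitDelayHelperI interface, it can be unit-tested without a real Replica. The commit lists six changed files, and a dedicated test is presumably among those not shown on this page; the following is only a rough sketch of how such a test could look (all names below are hypothetical), assuming the etcd raft API used elsewhere in this file. The fake helper reports one follower stuck in ProgressStateProbe for two attempts and caught up afterwards, so the loop should sleep twice and then proceed with the split.

// Hypothetical sketch; not the actual test file from this commit.
package storage

import (
	"context"
	"testing"
	"time"

	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"go.etcd.io/etcd/raft"
)

type fakeSplitDelayHelper struct {
	attempts  int // budget returned by NumAttempts
	slept     int // number of Sleep calls so far
	catchUpAt int // Sleep calls after which follower 2 starts replicating
}

func (h *fakeSplitDelayHelper) RaftStatus(context.Context) (roachpb.RangeID, *raft.Status) {
	st := &raft.Status{}
	st.Lead = 1
	followerState := raft.ProgressStateProbe
	if h.slept >= h.catchUpAt {
		followerState = raft.ProgressStateReplicate
	}
	st.Progress = map[uint64]raft.Progress{
		1: {State: raft.ProgressStateReplicate, RecentActive: true},
		2: {State: followerState, RecentActive: true},
	}
	return 1, st
}

func (h *fakeSplitDelayHelper) ProposeEmptyCommand(context.Context) {}

func (h *fakeSplitDelayHelper) NumAttempts() int { return h.attempts }

func (h *fakeSplitDelayHelper) Sleep(context.Context) time.Duration {
	h.slept++
	return time.Second // pretend each attempt slept for one second
}

func TestMaybeDelaySplitToAvoidSnapshotSketch(t *testing.T) {
	h := &fakeSplitDelayHelper{attempts: 5, catchUpAt: 2}
	msg := maybeDelaySplitToAvoidSnapshot(context.Background(), h)
	// The loop should have slept twice (until the follower caught up) and the
	// returned string should mention the simulated 2s delay.
	if h.slept != 2 || msg == "" {
		t.Fatalf("slept %d times, msg %q", h.slept, msg)
	}
}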