Skip to content

Commit

Permalink
kvserver: decline rebalance snapshots on receivers with poor LSM
Browse files Browse the repository at this point in the history
This commit introduces two new cluster settings:
```
kv.snapshot_decline.read_amp_threshold

server.declined_snapshot_timeout
```

With this commit, stores with a read amplification level higher than
`kv.snapshot_decline.read_amp_threshold` will decline all `REBALANCE`
snapshots. Upon receiving a `DECLINED` response, the senders of these snapshots
will consider these receivers `throttled` for
`server.declined_snapshot_timeout`.

This means that a store with poor LSM health is not considered a valid
candidate for replica rebalancing by any sender whose snapshot it has declined,
until that timeout expires.

Fixes cockroachdb#73714
Related to cockroachdb#62168

Release note: None
  • Loading branch information
aayushshah15 committed Dec 12, 2021
1 parent 22df0a6 commit 98116f3
Show file tree
Hide file tree
Showing 5 changed files with 168 additions and 83 deletions.
164 changes: 84 additions & 80 deletions pkg/kv/kvserver/raft.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pkg/kv/kvserver/raft.proto
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,7 @@ message SnapshotResponse {
ACCEPTED = 1;
APPLIED = 2;
ERROR = 3;
DECLINED = 5;
reserved 4;
}
Status status = 1;
Expand Down
19 changes: 19 additions & 0 deletions pkg/kv/kvserver/store_pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,16 @@ var FailedReservationsTimeout = settings.RegisterDurationSetting(
settings.NonNegativeDuration,
)

// DeclinedSnapshotTimeout is the "server.declined_snapshot_timeout" cluster
// setting. It specifies how long a store that has declined a snapshot stays
// throttled, i.e. how long the local replicate queue will not consider that
// store a viable target. It defaults to 60 seconds and is registered with the
// settings.NonNegativeDuration validator.
var DeclinedSnapshotTimeout = settings.RegisterDurationSetting(
"server.declined_snapshot_timeout",
"the amount of time to consider the store throttled for up-replication after a declined rebalance snapshot",
60*time.Second,
settings.NonNegativeDuration,
)

const timeAfterStoreSuspectSettingName = "server.time_after_store_suspect"

// TimeAfterStoreSuspect measures how long we consider a store suspect since
Expand Down Expand Up @@ -914,6 +924,7 @@ type throttleReason int
// Reasons for throttling a store. StorePool.throttle switches on the reason
// and logs a warning for any value it does not recognize.
const (
// The iota value 0 is discarded, so the first named reason below is 1;
// presumably this keeps the zero value from being mistaken for a valid
// reason.
_ throttleReason = iota
// throttleFailed is the reason used when a snapshot to the store failed.
throttleFailed
// throttleDeclined is the reason used when the store declined a snapshot;
// the store is then throttled for DeclinedSnapshotTimeout.
throttleDeclined
)

// throttle informs the store pool that the given remote store declined a
Expand All @@ -939,6 +950,14 @@ func (sp *StorePool) throttle(reason throttleReason, why string, storeID roachpb
log.Infof(ctx, "snapshot failed (%s), s%d will be throttled for %s until %s",
why, storeID, timeout, detail.throttledUntil)
}
case throttleDeclined:
timeout := DeclinedSnapshotTimeout.Get(&sp.st.SV)
detail.throttledUntil = sp.clock.PhysicalTime().Add(timeout)
if log.V(2) {
ctx := sp.AnnotateCtx(context.TODO())
log.Infof(ctx, "snapshot declined (%s), s%d will be throttled for %s until %s",
why, storeID, timeout, detail.throttledUntil)
}
default:
log.Warningf(sp.AnnotateCtx(context.TODO()), "unknown throttle reason %v", reason)
}
Expand Down
Loading

0 comments on commit 98116f3

Please sign in to comment.