Skip to content

Commit

Permalink
kvserver: make the StoreRebalancer interval a cluster setting
Browse files Browse the repository at this point in the history
Release note (ops change): the `kv.allocator.load_based_rebalancing_interval`
cluster setting now lets operators configure the interval at which each store
in the cluster will check for load-based lease or replica rebalancing opportunities.
  • Loading branch information
aayushshah15 committed Mar 29, 2022
1 parent b9ea426 commit 7228a17
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 4 deletions.
1 change: 1 addition & 0 deletions docs/generated/settings/settings.html
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
<tr><td><code>jobs.retention_time</code></td><td>duration</td><td><code>336h0m0s</code></td><td>the amount of time to retain records for completed jobs before</td></tr>
<tr><td><code>kv.allocator.load_based_lease_rebalancing.enabled</code></td><td>boolean</td><td><code>true</code></td><td>set to enable rebalancing of range leases based on load and latency</td></tr>
<tr><td><code>kv.allocator.load_based_rebalancing</code></td><td>enumeration</td><td><code>leases and replicas</code></td><td>whether to rebalance based on the distribution of QPS across stores [off = 0, leases = 1, leases and replicas = 2]</td></tr>
<tr><td><code>kv.allocator.load_based_rebalancing_interval</code></td><td>duration</td><td><code>1m0s</code></td><td>the rough interval at which each store will check for load-based lease / replica rebalancing opportunities</td></tr>
<tr><td><code>kv.allocator.qps_rebalance_threshold</code></td><td>float</td><td><code>0.1</code></td><td>minimum fraction away from the mean a store's QPS (such as queries per second) can be before it is considered overfull or underfull</td></tr>
<tr><td><code>kv.allocator.range_rebalance_threshold</code></td><td>float</td><td><code>0.05</code></td><td>minimum fraction away from the mean a store's range count can be before it is considered overfull or underfull</td></tr>
<tr><td><code>kv.bulk_io_write.max_rate</code></td><td>byte size</td><td><code>1.0 TiB</code></td><td>the rate limit (bytes/sec) to use for writes to disk on behalf of bulk io ops</td></tr>
Expand Down
25 changes: 21 additions & 4 deletions pkg/kv/kvserver/store_rebalancer.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,14 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/metric"
"github.com/cockroachdb/cockroach/pkg/util/stop"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/errors"
"go.etcd.io/etcd/raft/v3"
)

const (
// storeRebalancerTimerDuration is how frequently to check the store-level
// defaultLoadBasedRebalancingInterval is how frequently to check the store-level
// balance of the cluster.
storeRebalancerTimerDuration = time.Minute
defaultLoadBasedRebalancingInterval = time.Minute

// minQPSThresholdDifference is the minimum QPS difference from the cluster
// mean that this system should care about. In other words, we won't worry
Expand Down Expand Up @@ -101,6 +102,22 @@ var qpsRebalanceThreshold = func() *settings.FloatSetting {
return s
}()

// loadBasedRebalanceInterval is the public, system-only cluster setting
// `kv.allocator.load_based_rebalancing_interval`. It controls roughly how often
// each store checks for load-based lease / replica rebalancing opportunities
// and defaults to defaultLoadBasedRebalancingInterval.
//
// The validation function rejects any value shorter than 10 seconds; a value
// of exactly 10 seconds is the smallest accepted interval.
var loadBasedRebalanceInterval = settings.RegisterPublicDurationSettingWithExplicitUnit(
	settings.SystemOnly,
	"kv.allocator.load_based_rebalancing_interval",
	"the rough interval at which each store will check for load-based lease / replica rebalancing opportunities",
	defaultLoadBasedRebalancingInterval,
	func(d time.Duration) error {
		// Setting this interval to a very low duration is generally going to be a
		// bad idea without any real benefit, so let's disallow that.
		//
		// The bound is inclusive: the error message advertises minInterval as
		// the minimum, so minInterval itself must be accepted.
		const minInterval = 10 * time.Second
		if d < minInterval {
			return errors.Errorf("must specify a minimum of %s", minInterval)
		}
		return nil
	},
)

// minQPSDifferenceForTransfers is the minimum QPS difference that the store
// rebalancer would care to reconcile (via lease or replica rebalancing) between
// any two stores.
Expand Down Expand Up @@ -203,7 +220,7 @@ func (sr *StoreRebalancer) Start(ctx context.Context, stopper *stop.Stopper) {
_ = stopper.RunAsyncTask(ctx, "store-rebalancer", func(ctx context.Context) {
timer := timeutil.NewTimer()
defer timer.Stop()
timer.Reset(jitteredInterval(storeRebalancerTimerDuration))
timer.Reset(jitteredInterval(loadBasedRebalanceInterval.Get(&sr.st.SV)))
for {
// Wait out the first tick before doing anything since the store is still
// starting up and we might as well wait for some qps/wps stats to
Expand All @@ -213,7 +230,7 @@ func (sr *StoreRebalancer) Start(ctx context.Context, stopper *stop.Stopper) {
return
case <-timer.C:
timer.Read = true
timer.Reset(jitteredInterval(storeRebalancerTimerDuration))
timer.Reset(jitteredInterval(loadBasedRebalanceInterval.Get(&sr.st.SV)))
}

mode := LBRebalancingMode(LoadBasedRebalancingMode.Get(&sr.st.SV))
Expand Down

0 comments on commit 7228a17

Please sign in to comment.