allocator: check store health on lease transfer
Previously, the allocator would transfer leases without considering the
candidate's IO overload. When leases transferred to IO overloaded
stores, service latency tended to degrade.

This commit adds health checks prior to lease transfers. The health
checks are similar to the IO overload checks for allocating replicas in cockroachdb#97142.

The checks work by comparing a candidate store against
`kv.allocator.io_overload_threshold` and the mean of other candidates.
If the candidate store's IO overload score is greater than or equal to
both of these values, it is considered IO overloaded.
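
As a rough sketch (illustrative names only; the shipped check is
`transferToStoreIsHealthy` in allocator_scorer.go below, which scales
the mean by a small multiplier):

    // Illustrative sketch, not the shipped code: a candidate is IO
    // overloaded when its score is at or above both the threshold and
    // the (scaled) mean of the other candidates.
    func candidateIsIOOverloaded(score, threshold, scaledMean float64) bool {
        return score >= threshold && score >= scaledMean
    }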

The current leaseholder has to meet a higher bar to be considered IO
overloaded. It must have an IO overload score greater than or equal to
`kv.allocator.io_overload_threshold` +
`kv.allocator.io_overload_threshold_lease_shed_buffer`.
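
Sketched the same way (again illustrative; the buffer is the
`kv.allocator.io_overload_threshold_lease_shed_buffer` setting added
below):

    // Illustrative sketch, not the shipped code: the leaseholder only
    // counts as IO overloaded once its score also clears the shed
    // buffer, creating a gap between the two checks to avoid thrashing.
    func leaseholderIsIOOverloaded(score, threshold, shedBuffer, scaledMean float64) bool {
        return score >= threshold+shedBuffer && score >= scaledMean
    }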

The level of enforcement is controlled by
`kv.allocator.io_overload_threshold_enforcement_leases`, which
determines the action taken when a candidate store for a lease transfer
is IO overloaded.

- `block_none`: don't exclude stores.
- `block_none_log`: don't exclude stores, log an event.
- `block`: exclude stores from being considered as leaseholder targets
  for a range if they exceed both `kv.allocator.io_overload_threshold`
  and the mean IO overload of possible candidates. The current
  leaseholder store will NOT be excluded as a candidate for its current
  range leases.
- `shed`: same behavior as `block`, however the current leaseholder
  store WILL BE excluded as a candidate for its current range leases,
  i.e. the lease will always transfer to a healthy and valid store if
  one exists.

The default enforcement level is `block`, with a shed buffer value of `0.4`.
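
As a worked example with these defaults
(`kv.allocator.io_overload_threshold = 0.5`, shed buffer `0.4`, `1.1x`
mean watermark): a transfer candidate is excluded once its IO overload
score reaches both `0.5` and `1.1x` the candidate mean, while the
current leaseholder is only treated as unhealthy, and under `shed`
gives up its leases, once its score reaches `0.9` and the same
watermark.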

Resolves: cockroachdb#96508

Release note: None
kvoli committed Feb 24, 2023
1 parent 1355005 commit d964813
Showing 4 changed files with 334 additions and 14 deletions.
75 changes: 66 additions & 9 deletions pkg/kv/kvserver/allocator/allocatorimpl/allocator.go
@@ -1912,7 +1912,7 @@ func (a Allocator) RebalanceNonVoter(
// machinery to achieve range count convergence.
func (a *Allocator) ScorerOptions(ctx context.Context) *RangeCountScorerOptions {
return &RangeCountScorerOptions{
StoreHealthOptions: a.StoreHealthOptions(ctx),
StoreHealthOptions: a.StoreHealthOptions(),
deterministic: a.deterministic,
rangeRebalanceThreshold: RangeRebalanceThreshold.Get(&a.st.SV),
}
@@ -1922,7 +1922,7 @@ func (a *Allocator) ScorerOptions(ctx context.Context) *RangeCountScorerOptions
func (a *Allocator) ScorerOptionsForScatter(ctx context.Context) *ScatterScorerOptions {
return &ScatterScorerOptions{
RangeCountScorerOptions: RangeCountScorerOptions{
StoreHealthOptions: a.StoreHealthOptions(ctx),
StoreHealthOptions: a.StoreHealthOptions(),
deterministic: a.deterministic,
rangeRebalanceThreshold: 0,
},
@@ -2032,6 +2032,51 @@ func (a *Allocator) ValidLeaseTargets(
return candidates
}

// healthyLeaseTargets returns a list of healthy replica targets and whether
// the leaseholder replica should be replaced, given the existing replicas,
// store health options and IO overload of existing replica stores.
func (a *Allocator) healthyLeaseTargets(
ctx context.Context,
storePool storepool.AllocatorStorePool,
existingReplicas []roachpb.ReplicaDescriptor,
leaseStoreID roachpb.StoreID,
storeHealthOpts StoreHealthOptions,
) (healthyTargets []roachpb.ReplicaDescriptor, excludeLeaseRepl bool) {
sl, _, _ := storePool.GetStoreListFromIDs(replDescsToStoreIDs(existingReplicas), storepool.StoreFilterSuspect)
avgIOOverload := sl.CandidateIOOverloadScores.Mean

for _, replDesc := range existingReplicas {
store, ok := sl.FindStoreByID(replDesc.StoreID)
// If the replica is the current leaseholder, don't include it as a
// candidate; additionally, set excludeLeaseRepl if it is filtered out of
// the store list due to being suspect, or if the leaseholder store doesn't
// pass the leaseholder store health check.
//
// Note that the leaseholder store health check is less strict than the
// transfer target check below. We don't want to shed leases at the same
// point a candidate becomes ineligible as it could lead to thrashing.
// Instead, we create a buffer between the two to avoid leases moving back
// and forth.
if (replDesc.StoreID == leaseStoreID) &&
(!ok || !storeHealthOpts.leaseStoreIsHealthy(ctx, store, avgIOOverload)) {
excludeLeaseRepl = true
continue
}

// If the replica is not the leaseholder, don't include it as a candidate
// if it is filtered out similar to above, or the replica store doesn't
// pass the lease transfer store health check.
if replDesc.StoreID != leaseStoreID &&
(!ok || !storeHealthOpts.transferToStoreIsHealthy(ctx, store, avgIOOverload)) {
continue
}

healthyTargets = append(healthyTargets, replDesc)
}

return healthyTargets, excludeLeaseRepl
}

// leaseholderShouldMoveDueToPreferences returns true if the current leaseholder
// is in violation of lease preferences _that can otherwise be satisfied_ by
// some existing replica.
@@ -2088,13 +2133,14 @@ func (a *Allocator) leaseholderShouldMoveDueToPreferences(
// considering the threshold for io overload. This threshold is not
// considered in allocation or rebalancing decisions (excluding candidate
// stores as targets) when enforcementLevel is set to storeHealthNoAction or
// storeHealthLogOnly. By default storeHealthBlockRebalanceTo is the action taken. When
// there is a mixed version cluster, storeHealthNoAction is set instead.
func (a *Allocator) StoreHealthOptions(_ context.Context) StoreHealthOptions {
enforcementLevel := IOOverloadEnforcementLevel(IOOverloadThresholdEnforcement.Get(&a.st.SV))
// storeHealthLogOnly. By default block_rebalance_to is the enforcement level
// for replica allocation and block for lease transfers.
func (a *Allocator) StoreHealthOptions() StoreHealthOptions {
return StoreHealthOptions{
EnforcementLevel: enforcementLevel,
IOOverloadThreshold: IOOverloadThreshold.Get(&a.st.SV),
EnforcementLevel: IOOverloadEnforcementLevel(IOOverloadThresholdEnforcement.Get(&a.st.SV)),
IOOverloadThreshold: IOOverloadThreshold.Get(&a.st.SV),
LeaseEnforcementLevel: LeaseIOOverloadEnforcementLevel(LeaseIOOverloadThresholdEnforcement.Get(&a.st.SV)),
LeaseIOOverloadThresholdShedBuffer: ShedIOOverloadThresholdBuffer.Get(&a.st.SV),
}
}

@@ -2148,7 +2194,10 @@ func (a *Allocator) TransferLeaseTarget(
return roachpb.ReplicaDescriptor{}
}

var excludeLeaseReplForHealth bool
existing = a.ValidLeaseTargets(ctx, storePool, conf, existing, leaseRepl, opts)
existing, excludeLeaseReplForHealth = a.healthyLeaseTargets(ctx, storePool, existing, leaseRepl.StoreID(), a.StoreHealthOptions())
excludeLeaseRepl = excludeLeaseRepl || excludeLeaseReplForHealth
// Short-circuit if there are no valid targets out there.
if len(existing) == 0 || (len(existing) == 1 && existing[0].StoreID == leaseRepl.StoreID()) {
log.KvDistribution.VEventf(ctx, 2, "no lease transfer target found for r%d", leaseRepl.GetRangeID())
@@ -2262,7 +2311,7 @@ func (a *Allocator) TransferLeaseTarget(
candidates,
storeDescMap,
&LoadScorerOptions{
StoreHealthOptions: a.StoreHealthOptions(ctx),
StoreHealthOptions: a.StoreHealthOptions(),
Deterministic: a.deterministic,
LoadDims: opts.LoadDimensions,
LoadThreshold: LoadThresholds(&a.st.SV, opts.LoadDimensions...),
@@ -2861,3 +2910,11 @@ func maxReplicaID(replicas []roachpb.ReplicaDescriptor) roachpb.ReplicaID {
}
return max
}

func replDescsToStoreIDs(descs []roachpb.ReplicaDescriptor) []roachpb.StoreID {
ret := make([]roachpb.StoreID, len(descs))
for i, desc := range descs {
ret[i] = desc.StoreID
}
return ret
}
136 changes: 132 additions & 4 deletions pkg/kv/kvserver/allocator/allocatorimpl/allocator_scorer.go
@@ -76,6 +76,11 @@ const (
// conjunction with IOOverloadMeanThreshold below.
DefaultIOOverloadThreshold = 0.5

// DefaultIOOverloadShedBuffer is used to shed leases from stores with an IO
// overload score greater than the threshold + buffer. This is typically used
// in conjunction with IOOverloadMeanThreshold below.
DefaultIOOverloadShedBuffer = 0.4

// IOOverloadMeanThreshold is the percentage above the mean after which a
// store could be considered unhealthy if also exceeding the threshold.
IOOverloadMeanThreshold = 1.1
@@ -95,7 +100,7 @@ const (
// as targets for any action regardless of the store IO overload.
IOOverloadThresholdNoAction IOOverloadEnforcementLevel = iota
// IOOverloadThresholdLogOnly will not exclude stores from being considered
// as targets for any action regarldess of the store IO overload. When a
// as targets for any action regardless of the store IO overload. When a
// store exceeds IOOverloadThreshold, an event is logged.
IOOverloadThresholdLogOnly
// IOOverloadThresholdBlockRebalanceTo excludes stores from being
@@ -167,6 +172,69 @@ var IOOverloadThresholdEnforcement = settings.RegisterEnumSetting(
},
)

// LeaseIOOverloadEnforcementLevel represents the level of action that may be
// taken or excluded when a lease candidate disk is considered unhealthy.
type LeaseIOOverloadEnforcementLevel int64

const (
// LeaseIOOverloadThresholdNoAction will not exclude stores from being
// considered as leaseholder targets for a range, regardless of the store IO
// overload.
LeaseIOOverloadThresholdNoAction LeaseIOOverloadEnforcementLevel = iota
// LeaseIOOverloadThresholdNoActionLogOnly will not exclude stores from being
// considered as leaseholder targets for a range, regardless of the store IO
// overload. When a store exceeds IOOverloadThreshold, an event is logged.
LeaseIOOverloadThresholdNoActionLogOnly
// LeaseIOOverloadThresholdBlock excludes stores from being considered as
// leaseholder targets for a range if they exceed (a)
// kv.allocator.io_overload_threshold and (b) the mean IO overload among
// possible candidates. The current leaseholder store will NOT be excluded as
// a candidate for its current range leases.
LeaseIOOverloadThresholdBlock
// LeaseIOOverloadThresholdShed has the same behavior as block, however the
// current leaseholder store WILL BE excluded as a candidate for its current
// range leases, i.e. the lease will always transfer to a healthy and valid
// store if one exists.
LeaseIOOverloadThresholdShed
)

// ShedIOOverloadThresholdBuffer, added to IOOverloadThreshold, is the maximum
// IO overload score the current leaseholder store for a range may have before
// it is considered unhealthy. If unhealthy and LeaseIOOverloadThresholdEnforcement
// is set to 'shed', the store will shed its leases to other healthy stores.
var ShedIOOverloadThresholdBuffer = settings.RegisterFloatSetting(
settings.SystemOnly,
"kv.allocator.io_overload_threshold_lease_shed_buffer",
"a store will shed its leases when its IO overload score is above this "+
"value added to `kv.allocator.io_overload_threshold` and "+
"`kv.allocator.io_overload_threshold_enforcement_leases` is `shed`",
DefaultIOOverloadShedBuffer,
)

// LeaseIOOverloadThresholdEnforcement defines the level of enforcement for
// lease transfers when a candidate store's IO overload exceeds the threshold
// defined in IOOverloadThreshold, and additionally
// ShedIOOverloadThresholdBuffer when shed is set.
var LeaseIOOverloadThresholdEnforcement = settings.RegisterEnumSetting(
settings.SystemOnly,
"kv.allocator.io_overload_threshold_enforcement_leases",
"the level of enforcement on lease transfers when a candidate store has an"+
"io overload score exceeding `kv.allocator.io_overload_threshold` and above the "+
"average of comparable allocation candidates:`block_none` will exclude "+
"no candidate stores, `block_none_log` will exclude no candidates but log an "+
"event, `block` will exclude candidates stores from being "+
"targets of lease transfers, `shed` will exclude candidate stores "+
"from being targets of lease transfers and cause the existing "+
"leaseholder to transfer away its lease",
"block",
map[int64]string{
int64(LeaseIOOverloadThresholdNoAction): "block_none",
int64(LeaseIOOverloadThresholdNoActionLogOnly): "block_none_log",
int64(LeaseIOOverloadThresholdBlock): "block",
int64(LeaseIOOverloadThresholdShed): "shed",
},
)

// ScorerOptions defines the interface for the two heuristics that trigger
// replica rebalancing: range count convergence and QPS convergence.
type ScorerOptions interface {
@@ -2177,8 +2245,10 @@ func convergesOnMean(oldVal, newVal, mean float64) bool {
// StoreHealthOptions is the scorer options for store health. It is
// used to inform scoring based on the health of a store.
type StoreHealthOptions struct {
EnforcementLevel IOOverloadEnforcementLevel
IOOverloadThreshold float64
EnforcementLevel IOOverloadEnforcementLevel
LeaseEnforcementLevel LeaseIOOverloadEnforcementLevel
IOOverloadThreshold float64
LeaseIOOverloadThresholdShedBuffer float64
}

// storeIsHealthy returns true if the store IO overload does not exceed
@@ -2192,7 +2262,6 @@ func (o StoreHealthOptions) storeIsHealthy(
ioOverloadScore < o.IOOverloadThreshold {
return true
}

// Still log an event when the IO overload score exceeds the threshold but
// does not exceed the cluster average. This is enabled to avoid confusion
// where candidate stores are still targets, despite exceeding the
@@ -2245,6 +2314,65 @@ func (o StoreHealthOptions) rebalanceToStoreIsHealthy(
return o.EnforcementLevel < IOOverloadThresholdBlockRebalanceTo
}

func (o StoreHealthOptions) transferToStoreIsHealthy(
ctx context.Context, store roachpb.StoreDescriptor, avg float64,
) bool {
ioOverloadScore, _ := store.Capacity.IOThreshold.Score()
if o.LeaseEnforcementLevel == LeaseIOOverloadThresholdNoAction ||
ioOverloadScore < o.IOOverloadThreshold {
return true
}

if ioOverloadScore < avg*IOOverloadMeanThreshold {
log.KvDistribution.VEventf(ctx, 5,
"s%d, lease transfer check io overload %.2f exceeds threshold %.2f, but "+
"below average watermark: %.2f, action enabled %d",
store.StoreID, ioOverloadScore, o.IOOverloadThreshold,
avg*IOOverloadMeanThreshold, o.LeaseEnforcementLevel)
return true
}

log.KvDistribution.VEventf(ctx, 5,
"s%d, lease transfer check io overload %.2f exceeds threshold %.2f, above average "+
"watermark: %.2f, action enabled %d",
store.StoreID, ioOverloadScore, o.IOOverloadThreshold,
avg*IOOverloadMeanThreshold, o.LeaseEnforcementLevel)

// The store is only considered unhealthy when the enforcement level is
// LeaseIOOverloadThresholdBlock or above.
return o.LeaseEnforcementLevel < LeaseIOOverloadThresholdBlock
}

func (o StoreHealthOptions) leaseStoreIsHealthy(
ctx context.Context, store roachpb.StoreDescriptor, avg float64,
) bool {
threshold := o.IOOverloadThreshold + o.LeaseIOOverloadThresholdShedBuffer
ioOverloadScore, _ := store.Capacity.IOThreshold.Score()
if o.LeaseEnforcementLevel == LeaseIOOverloadThresholdNoAction ||
ioOverloadScore < threshold {
return true
}

if ioOverloadScore < avg*IOOverloadMeanThreshold {
log.KvDistribution.VEventf(ctx, 5,
"s%d, existing leaseholder check io overload %.2f exceeds threshold %.2f, but "+
"below average watermark: %.2f, action enabled %d",
store.StoreID, ioOverloadScore, o.IOOverloadThreshold,
avg*IOOverloadMeanThreshold, o.LeaseEnforcementLevel)
return true
}

log.KvDistribution.VEventf(ctx, 5,
"s%d, existing leaseholder check io overload %.2f exceeds threshold %.2f, above average "+
"watermark: %.2f, action enabled %d",
store.StoreID, ioOverloadScore, o.IOOverloadThreshold,
avg*IOOverloadMeanThreshold, o.LeaseEnforcementLevel)

// The existing leaseholder store is only considered unhealthy when the
// enforcement level is LeaseIOOverloadThresholdShed.
return o.LeaseEnforcementLevel < LeaseIOOverloadThresholdShed
}

// rebalanceToMaxCapacityCheck returns true if the store has enough room to
// accept a rebalance. The bar for this is stricter than for whether a store
// has enough room to accept a necessary replica (i.e. via AllocateCandidates).