kvserver: add admission.io.overload metric
Resolves #87424.

Previously, only the unnormalized LSM L0 sub-level and file counts were
exposed externally, not the store's IOThreshold.

This was inadequate because normalizing and comparing the LSM L0 sub-level
and file counts is tedious: each count must be divided by a different
threshold before the two can be compared.

To address this, this patch adds a metric `admission.io.overload`
tracking the store's IOThreshold.

Release note (ops change): Added a metric `admission.io.overload` which
tracks the store's IOThreshold.
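
For illustration, a minimal standalone sketch of the normalization this message describes (the thresholds below are made-up example values, not the defaults used by admission control): each raw count is divided by its own threshold, and the larger ratio is the 1-normalized score the new metric reports.

package main

import (
	"fmt"
	"math"
)

func main() {
	// Example values only: 6 L0 sub-levels against a threshold of 20,
	// and 800 L0 files against a threshold of 1000.
	subLevelScore := 6.0 / 20.0 // 0.3
	fileScore := 800.0 / 1000.0 // 0.8

	// The 1-normalized overload score is the larger of the two ratios.
	fmt.Println(math.Max(subLevelScore, fileScore)) // 0.8
}

A value of 1.0 therefore means the store is exactly at whichever limit it is closest to exceeding.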
KaiSun314 committed Sep 12, 2022
1 parent 7bcbc70 commit d5571a2
Showing 4 changed files with 36 additions and 1 deletion.
14 changes: 14 additions & 0 deletions pkg/kv/kvserver/metrics.go
@@ -1023,6 +1023,18 @@ The messages are dropped to help these replicas to recover from I/O overload.`,
Unit: metric.Unit_COUNT,
}

metaIOOverload = metric.Metadata{
Name: "admission.io.overload",
Help: `1-normalized float composed of the LSM L0 sub-level and file counts, indicating how
close the store is to IO overload. Replication to raft group followers is paused when this
value exceeds the admission.kv.pause_replication_io_threshold cluster setting (pausing is
disabled when that setting is 0, which is the default); see
pkg/kv/kvserver/replica_raft_overload.go for more details.`,
Measurement: "Threshold",
Unit: metric.Unit_COUNT,
}

// Replica queue metrics.
metaMVCCGCQueueSuccesses = metric.Metadata{
Name: "queue.gc.process.success",
@@ -1770,6 +1782,7 @@ type StoreMetrics struct {

RaftPausedFollowerCount *metric.Gauge
RaftPausedFollowerDroppedMsgs *metric.Counter
IOOverload *metric.GaugeFloat64

RaftCoalescedHeartbeatsPending *metric.Gauge

@@ -2293,6 +2306,7 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {

RaftPausedFollowerCount: metric.NewGauge(metaRaftFollowerPaused),
RaftPausedFollowerDroppedMsgs: metric.NewCounter(metaRaftPausedFollowerDroppedMsgs),
IOOverload: metric.NewGaugeFloat64(metaIOOverload),

// This Gauge measures the number of heartbeats queued up just before
// the queue is cleared, to avoid flapping wildly.
8 changes: 8 additions & 0 deletions pkg/kv/kvserver/store.go
@@ -3127,6 +3127,7 @@ func (s *Store) updateReplicationGauges(ctx context.Context) error {
overreplicatedRangeCount int64
behindCount int64
pausedFollowerCount int64
ioOverload float64
slowRaftProposalCount int64

locks int64
@@ -3152,6 +3153,12 @@ func (s *Store) updateReplicationGauges(ctx context.Context) error {
uninitializedCount = int64(len(s.mu.uninitReplicas))
s.mu.RUnlock()

// TODO(kaisun314,kvoli): move this to a per-store admission control metrics
// struct when available. See pkg/util/admission/granter.go.
s.ioThreshold.Lock()
ioOverload, _ = s.ioThreshold.t.Score()
s.ioThreshold.Unlock()

newStoreReplicaVisitor(s).Visit(func(rep *Replica) bool {
metrics := rep.Metrics(ctx, now, livenessMap, clusterNodes)
if metrics.Leader {
@@ -3249,6 +3256,7 @@ func (s *Store) updateReplicationGauges(ctx context.Context) error {
s.metrics.OverReplicatedRangeCount.Update(overreplicatedRangeCount)
s.metrics.RaftLogFollowerBehindCount.Update(behindCount)
s.metrics.RaftPausedFollowerCount.Update(pausedFollowerCount)
s.metrics.IOOverload.Update(ioOverload)
s.metrics.SlowRaftRequests.Update(slowRaftProposalCount)

var averageLockHoldDurationNanos int64
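
The updateReplicationGauges hunk above reads the score while holding the ioThreshold mutex and publishes it to the gauge later in the function, outside the critical section. A minimal standalone sketch of that pattern, with hypothetical names and the standard library's sync.Mutex standing in for CockroachDB's syncutil:

package main

import (
	"fmt"
	"sync"
)

// scoreHolder stands in for Store.ioThreshold: a mutex-protected holder of
// the latest IO-overload state. The fields are illustrative, not the Store's
// actual ones.
type scoreHolder struct {
	mu    sync.Mutex
	score float64
}

func updateGauges(h *scoreHolder, publish func(float64)) {
	// Copy the value while holding the lock...
	h.mu.Lock()
	ioOverload := h.score
	h.mu.Unlock()
	// ...then publish it to the metric afterwards, so the gauge update never
	// runs while the mutex is held.
	publish(ioOverload)
}

func main() {
	h := &scoreHolder{score: 0.8}
	updateGauges(h, func(v float64) { fmt.Println("admission.io.overload =", v) })
}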
6 changes: 6 additions & 0 deletions pkg/ts/catalog/chart_catalog.go
@@ -3503,6 +3503,12 @@ var charts = []sectionDescription{
"admission.granter.io_tokens_exhausted_duration.kv",
},
},
{
Title: "IO Overload - IOThreshold Score",
Metrics: []string{
"admission.io.overload",
},
},
},
},
{
9 changes: 8 additions & 1 deletion pkg/util/admission/admissionpb/io_threshold.go
@@ -32,7 +32,14 @@ import (
// max number of compactions). And we will need to incorporate overload due to
// disk bandwidth bottleneck.
func (iot *IOThreshold) Score() (float64, bool) {
if iot == nil {
// iot.L0NumFilesThreshold and iot.L0NumSubLevelsThreshold are initialized to
// 0 by default, and there is a window before they are updated to their
// appropriate values. To prevent dividing by 0 below (and Score() returning
// NaN) during that window, return a score of 0 if either threshold is still
// 0 (i.e. currently uninitialized).
if iot == nil || iot.L0NumFilesThreshold == 0 || iot.L0NumSubLevelsThreshold == 0 {
return 0, false
}
f := math.Max(
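
The hunk is truncated at the math.Max call. Based on the zero-value guard added above and the IOThreshold fields it references, the remainder of Score() plausibly takes the larger of the two 1-normalized ratios. The following is a reconstruction from that context, not the verbatim upstream code; it assumes the math import already present in the file and the L0NumFiles/L0NumSubLevels proto fields:

// Sketch of the full Score computation implied by the hunk above.
func (iot *IOThreshold) Score() (float64, bool) {
	// Return 0 while the thresholds are still at their uninitialized zero
	// values to avoid dividing by zero.
	if iot == nil || iot.L0NumFilesThreshold == 0 || iot.L0NumSubLevelsThreshold == 0 {
		return 0, false
	}
	// The score is the larger of the two normalized ratios; the boolean
	// reports whether the store is past its threshold.
	f := math.Max(
		float64(iot.L0NumFiles)/float64(iot.L0NumFilesThreshold),
		float64(iot.L0NumSubLevels)/float64(iot.L0NumSubLevelsThreshold),
	)
	return f, f > 1.0
}

Under this reading, a gauge value above 1.0 is what the boolean flags as overload, on the same 1-normalized scale the new admission.io.overload metric exposes.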
