Skip to content

Commit

Permalink
kvserver: add admission.io.overload metric
Browse files Browse the repository at this point in the history
Resolves cockroachdb#87424.

Previously, only the unnormalized values of the LSM L0 sub-level and
file counts were exposed externally, not the store's IOThreshold.

This was inadequate because it is tedious to normalize and compare the
LSM L0 sub-level and file counts (as they require dividing by different
numbers).

To address this, this patch adds a metric `admission.io.overload`
tracking the store's IOThreshold.

Release note (ops change): Added a metric `admission.io.overload` which
tracks the store's IOThreshold.
  • Loading branch information
KaiSun314 committed Sep 8, 2022
1 parent ce55e1b commit b3b9ed4
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 0 deletions.
11 changes: 11 additions & 0 deletions pkg/kv/kvserver/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -1023,6 +1023,15 @@ The messages are dropped to help these replicas to recover from I/O overload.`,
Unit: metric.Unit_COUNT,
}

// metaIOOverload is the metadata for the admission.io.overload gauge, which
// reports the score of the store's IOThreshold as a float. Per the help text,
// a value of at least 1 is the point at which replication to raft group
// followers is paused.
metaIOOverload = metric.Metadata{
Name: "admission.io.overload",
Help: `1-normalized float to pause replication to raft group followers if its value is at least 1.
Composed of LSM L0 sub-level and file counts.`,
Measurement: "Threshold",
// NOTE(review): the gauge carries a normalized float score, not a count;
// confirm Unit_COUNT is the intended unit for this metric.
Unit: metric.Unit_COUNT,
}

// Replica queue metrics.
metaMVCCGCQueueSuccesses = metric.Metadata{
Name: "queue.gc.process.success",
Expand Down Expand Up @@ -1770,6 +1779,7 @@ type StoreMetrics struct {

RaftPausedFollowerCount *metric.Gauge
RaftPausedFollowerDroppedMsgs *metric.Counter
IOOverload *metric.GaugeFloat64

RaftCoalescedHeartbeatsPending *metric.Gauge

Expand Down Expand Up @@ -2293,6 +2303,7 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {

RaftPausedFollowerCount: metric.NewGauge(metaRaftFollowerPaused),
RaftPausedFollowerDroppedMsgs: metric.NewCounter(metaRaftPausedFollowerDroppedMsgs),
IOOverload: metric.NewGaugeFloat64(metaIOOverload),

// This Gauge measures the number of heartbeats queued up just before
// the queue is cleared, to avoid flapping wildly.
Expand Down
6 changes: 6 additions & 0 deletions pkg/kv/kvserver/store.go
Original file line number Diff line number Diff line change
Expand Up @@ -3127,6 +3127,7 @@ func (s *Store) updateReplicationGauges(ctx context.Context) error {
overreplicatedRangeCount int64
behindCount int64
pausedFollowerCount int64
ioOverload float64

locks int64
totalLockHoldDurationNanos int64
Expand All @@ -3151,6 +3152,10 @@ func (s *Store) updateReplicationGauges(ctx context.Context) error {
uninitializedCount = int64(len(s.mu.uninitReplicas))
s.mu.RUnlock()

s.ioThreshold.Lock()
ioOverload, _ = s.ioThreshold.t.Score()
s.ioThreshold.Unlock()

newStoreReplicaVisitor(s).Visit(func(rep *Replica) bool {
metrics := rep.Metrics(ctx, now, livenessMap, clusterNodes)
if metrics.Leader {
Expand Down Expand Up @@ -3247,6 +3252,7 @@ func (s *Store) updateReplicationGauges(ctx context.Context) error {
s.metrics.OverReplicatedRangeCount.Update(overreplicatedRangeCount)
s.metrics.RaftLogFollowerBehindCount.Update(behindCount)
s.metrics.RaftPausedFollowerCount.Update(pausedFollowerCount)
s.metrics.IOOverload.Update(ioOverload)

var averageLockHoldDurationNanos int64
var averageLockWaitDurationNanos int64
Expand Down

0 comments on commit b3b9ed4

Please sign in to comment.