From d5571a275e44cd27727f390209c046dfb5d8518e Mon Sep 17 00:00:00 2001
From: Kai Sun
Date: Mon, 12 Sep 2022 17:33:57 -0400
Subject: [PATCH] kvserver: add admission.io.overload metric

Resolves #87424.

Previously, only the unnormalized LSM L0 sub-level and file counts were
exposed externally, not the store's IOThreshold. This was inadequate because
normalizing and comparing the sub-level and file counts is tedious (each must
be divided by a different threshold). To address this, this patch adds a
metric `admission.io.overload` that tracks the store's IOThreshold score.

Release note (ops change): Added a metric `admission.io.overload` which
tracks the store's IOThreshold score.
---
 pkg/kv/kvserver/metrics.go                     | 14 ++++++++++++++
 pkg/kv/kvserver/store.go                       |  8 ++++++++
 pkg/ts/catalog/chart_catalog.go                |  6 ++++++
 pkg/util/admission/admissionpb/io_threshold.go |  9 ++++++++-
 4 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/pkg/kv/kvserver/metrics.go b/pkg/kv/kvserver/metrics.go
index 501e8e2f3750..9b238ec09886 100644
--- a/pkg/kv/kvserver/metrics.go
+++ b/pkg/kv/kvserver/metrics.go
@@ -1023,6 +1023,18 @@ The messages are dropped to help these replicas to recover from I/O overload.`,
 		Unit:        metric.Unit_COUNT,
 	}
 
+	metaIOOverload = metric.Metadata{
+		Name: "admission.io.overload",
+		Help: `1-normalized float used to pause replication to raft group followers if it exceeds a given threshold.
+
+The threshold is the admission.kv.pause_replication_io_threshold cluster setting
+(the pause-replication feature is disabled when the setting is 0, which is the default);
+see pkg/kv/kvserver/replica_raft_overload.go for more details. The score is composed of
+the LSM L0 sub-level and file counts.`,
+		Measurement: "Threshold",
+		Unit:        metric.Unit_COUNT,
+	}
+
 	// Replica queue metrics.
 	metaMVCCGCQueueSuccesses = metric.Metadata{
 		Name:        "queue.gc.process.success",
@@ -1770,6 +1782,7 @@ type StoreMetrics struct {
 	RaftPausedFollowerCount       *metric.Gauge
 	RaftPausedFollowerDroppedMsgs *metric.Counter
+	IOOverload                    *metric.GaugeFloat64
 
 	RaftCoalescedHeartbeatsPending *metric.Gauge
@@ -2293,6 +2306,7 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
 		RaftPausedFollowerCount:       metric.NewGauge(metaRaftFollowerPaused),
 		RaftPausedFollowerDroppedMsgs: metric.NewCounter(metaRaftPausedFollowerDroppedMsgs),
+		IOOverload:                    metric.NewGaugeFloat64(metaIOOverload),
 
 		// This Gauge measures the number of heartbeats queued up just before
 		// the queue is cleared, to avoid flapping wildly.
diff --git a/pkg/kv/kvserver/store.go b/pkg/kv/kvserver/store.go
index f038393854ee..1c88c88b3ede 100644
--- a/pkg/kv/kvserver/store.go
+++ b/pkg/kv/kvserver/store.go
@@ -3127,6 +3127,7 @@ func (s *Store) updateReplicationGauges(ctx context.Context) error {
 		overreplicatedRangeCount int64
 		behindCount              int64
 		pausedFollowerCount      int64
+		ioOverload               float64
 		slowRaftProposalCount    int64
 
 		locks                    int64
@@ -3152,6 +3153,12 @@ func (s *Store) updateReplicationGauges(ctx context.Context) error {
 	uninitializedCount = int64(len(s.mu.uninitReplicas))
 	s.mu.RUnlock()
 
+	// TODO(kaisun314,kvoli): move this to a per-store admission control metrics
+	// struct when available. See pkg/util/admission/granter.go.
+	s.ioThreshold.Lock()
+	ioOverload, _ = s.ioThreshold.t.Score()
+	s.ioThreshold.Unlock()
+
 	newStoreReplicaVisitor(s).Visit(func(rep *Replica) bool {
 		metrics := rep.Metrics(ctx, now, livenessMap, clusterNodes)
 		if metrics.Leader {
@@ -3249,6 +3256,7 @@ func (s *Store) updateReplicationGauges(ctx context.Context) error {
 	s.metrics.OverReplicatedRangeCount.Update(overreplicatedRangeCount)
 	s.metrics.RaftLogFollowerBehindCount.Update(behindCount)
 	s.metrics.RaftPausedFollowerCount.Update(pausedFollowerCount)
+	s.metrics.IOOverload.Update(ioOverload)
 	s.metrics.SlowRaftRequests.Update(slowRaftProposalCount)
 
 	var averageLockHoldDurationNanos int64
diff --git a/pkg/ts/catalog/chart_catalog.go b/pkg/ts/catalog/chart_catalog.go
index fc77a12e07c1..a60e7830bcfa 100644
--- a/pkg/ts/catalog/chart_catalog.go
+++ b/pkg/ts/catalog/chart_catalog.go
@@ -3503,6 +3503,12 @@ var charts = []sectionDescription{
 				"admission.granter.io_tokens_exhausted_duration.kv",
 			},
 		},
+		{
+			Title: "IO Overload - IOThreshold Score",
+			Metrics: []string{
+				"admission.io.overload",
+			},
+		},
 	},
 },
 {
diff --git a/pkg/util/admission/admissionpb/io_threshold.go b/pkg/util/admission/admissionpb/io_threshold.go
index ab088938aaca..4024ff92c900 100644
--- a/pkg/util/admission/admissionpb/io_threshold.go
+++ b/pkg/util/admission/admissionpb/io_threshold.go
@@ -32,7 +32,14 @@ import (
 // max number of compactions). And we will need to incorporate overload due to
 // disk bandwidth bottleneck.
 func (iot *IOThreshold) Score() (float64, bool) {
-	if iot == nil {
+	// iot.L0NumFilesThreshold and iot.L0NumSubLevelsThreshold are initialized to
+	// 0 by default, and there appears to be a period of time before we update
+	// iot.L0NumFilesThreshold and iot.L0NumSubLevelsThreshold to their
+	// appropriate values. During this period, to prevent dividing by 0 below
+	// and Score() returning NaN, we check whether iot.L0NumFilesThreshold or
+	// iot.L0NumSubLevelsThreshold is 0 (i.e. currently uninitialized) and
+	// return 0 as the score if so.
+	if iot == nil || iot.L0NumFilesThreshold == 0 || iot.L0NumSubLevelsThreshold == 0 {
 		return 0, false
 	}
 	f := math.Max(
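
For reference, the normalization this gauge reports can be illustrated with a
small standalone sketch. This is not the CockroachDB implementation; it only
mirrors the shape of IOThreshold.Score() as the hunks above suggest (the score
is the larger of the two ratios, L0 sub-levels and L0 files each divided by
their thresholds, with a zero threshold treated as "not yet initialized"). The
type and field layout below are illustrative stand-ins, not the admissionpb
definitions:

    package main

    import (
        "fmt"
        "math"
    )

    // ioThreshold is an illustrative stand-in for admissionpb.IOThreshold,
    // carrying only the fields referenced in the patch above.
    type ioThreshold struct {
        L0NumSubLevels          int64
        L0NumSubLevelsThreshold int64
        L0NumFiles              int64
        L0NumFilesThreshold     int64
    }

    // score mirrors the shape of IOThreshold.Score(): the larger of the two
    // normalized ratios, plus whether the store is considered overloaded
    // (score > 1). Zero thresholds are treated as uninitialized so we never
    // divide by zero and never return NaN.
    func (iot *ioThreshold) score() (float64, bool) {
        if iot == nil || iot.L0NumFilesThreshold == 0 || iot.L0NumSubLevelsThreshold == 0 {
            return 0, false
        }
        f := math.Max(
            float64(iot.L0NumSubLevels)/float64(iot.L0NumSubLevelsThreshold),
            float64(iot.L0NumFiles)/float64(iot.L0NumFilesThreshold),
        )
        return f, f > 1
    }

    func main() {
        iot := &ioThreshold{
            L0NumSubLevels: 10, L0NumSubLevelsThreshold: 20,
            L0NumFiles: 4000, L0NumFilesThreshold: 1000,
        }
        s, overloaded := iot.score()
        // The file-count ratio (4.0) dominates the sub-level ratio (0.5),
        // so the gauge would read 4.00 and the store counts as overloaded.
        fmt.Printf("admission.io.overload=%.2f overloaded=%t\n", s, overloaded)
    }

Because the score is 1-normalized, a value above 1 means the store is at or
past its configured L0 limits; per the metric help text above, the same value
is what gets compared against the admission.kv.pause_replication_io_threshold
cluster setting when deciding whether to pause replication to followers.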