From 36ae8252d344cf8a8fd8efa8c7b45bee74bc98df Mon Sep 17 00:00:00 2001 From: Andrew Baptist Date: Tue, 28 Mar 2023 14:48:07 -0400 Subject: [PATCH] kvserver: Add a metric for in-progress snapshots Fixes: #98242 Knowing how many delegate snapshot requests are currently in-progress will be useful for detecting problems. This change adds a metric for this. It also updates the names of the previous stats to have the prefix `range.snapshots` vs `range.snapshot` to be consistent with other stats. Epic: none Release note (ops change): Adds a new stat range.snapshots.delegate.in-progress and renames two existing stats. They were never part of a release, so better to rename them before 23.1.0 is cut. range.snapshot.delegate.successes -> range.snapshots.delegate.successes range.snapshot.delegate.failures -> range.snapshots.delegate.failures --- pkg/kv/kvserver/metrics.go | 18 +++++++++++++----- pkg/kv/kvserver/replica_command.go | 9 ++++++++- pkg/ts/catalog/chart_catalog.go | 5 +++-- 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/pkg/kv/kvserver/metrics.go b/pkg/kv/kvserver/metrics.go index abaf1fc3d982..c9b8fceea495 100644 --- a/pkg/kv/kvserver/metrics.go +++ b/pkg/kv/kvserver/metrics.go @@ -825,7 +825,7 @@ evaluating the network savings of not sending cross region traffic. Unit: metric.Unit_BYTES, } metaDelegateSnapshotSuccesses = metric.Metadata{ - Name: "range.snapshot.delegate.successes", + Name: "range.snapshots.delegate.successes", Help: `Number of snapshots that were delegated to a different node and resulted in success on that delegate. This does not count self delegated snapshots. `, @@ -833,7 +833,7 @@ resulted in success on that delegate. This does not count self delegated snapsho Unit: metric.Unit_COUNT, } metaDelegateSnapshotFailures = metric.Metadata{ - Name: "range.snapshot.delegate.failures", + Name: "range.snapshots.delegate.failures", Help: `Number of snapshots that were delegated to a different node and resulted in failure on that delegate. There are numerous reasons a failure can occur on a delegate such as timeout, the delegate Raft log being too far behind @@ -842,6 +842,12 @@ or the delegate being too busy to send. Measurement: "Snapshots", Unit: metric.Unit_COUNT, } + metaDelegateSnapshotInProgress = metric.Metadata{ + Name: "range.snapshots.delegate.in-progress", + Help: `Number of delegated snapshots that are currently in-flight.`, + Measurement: "Snapshots", + Unit: metric.Unit_COUNT, + } // Quota pool metrics. metaRaftQuotaPoolPercentUsed = metric.Metadata{ @@ -1922,9 +1928,10 @@ type StoreMetrics struct { RangeSnapshotRecvTotalInProgress *metric.Gauge // Delegate snapshot metrics. These don't count self-delegated snapshots. - DelegateSnapshotSendBytes *metric.Counter - DelegateSnapshotSuccesses *metric.Counter - DelegateSnapshotFailures *metric.Counter + DelegateSnapshotSendBytes *metric.Counter + DelegateSnapshotSuccesses *metric.Counter + DelegateSnapshotFailures *metric.Counter + DelegateSnapshotInProgress *metric.Gauge // Raft processing metrics. RaftTicks *metric.Counter @@ -2461,6 +2468,7 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics { DelegateSnapshotSendBytes: metric.NewCounter(metaDelegateSnapshotSendBytes), DelegateSnapshotSuccesses: metric.NewCounter(metaDelegateSnapshotSuccesses), DelegateSnapshotFailures: metric.NewCounter(metaDelegateSnapshotFailures), + DelegateSnapshotInProgress: metric.NewGauge(metaDelegateSnapshotInProgress), // Raft processing metrics. RaftTicks: metric.NewCounter(metaRaftTicks), diff --git a/pkg/kv/kvserver/replica_command.go b/pkg/kv/kvserver/replica_command.go index f55ec1476017..12f7082c0650 100644 --- a/pkg/kv/kvserver/replica_command.go +++ b/pkg/kv/kvserver/replica_command.go @@ -2845,6 +2845,9 @@ func (r *Replica) sendSnapshotUsingDelegate( if selfDelegate { delegateRequest.QueueOnDelegateLen = -1 } + if !selfDelegate { + r.store.Metrics().DelegateSnapshotInProgress.Inc(1) + } retErr = contextutil.RunWithTimeout( ctx, "send-snapshot", sendSnapshotTimeout, func(ctx context.Context) error { @@ -2852,6 +2855,10 @@ func (r *Replica) sendSnapshotUsingDelegate( return r.store.cfg.Transport.DelegateSnapshot(ctx, delegateRequest) }, ) + if !selfDelegate { + r.store.Metrics().DelegateSnapshotInProgress.Dec(1) + } + // Return once we have success. if retErr == nil { if !selfDelegate { @@ -2862,7 +2869,7 @@ func (r *Replica) sendSnapshotUsingDelegate( if !selfDelegate { r.store.Metrics().DelegateSnapshotFailures.Inc(1) } - log.Warningf(ctx, "attempt %d: delegate snapshot %+v request failed %v", n+1, delegateRequest, retErr) + log.KvDistribution.Warningf(ctx, "attempt %d: delegate snapshot %+v request failed %v", n+1, delegateRequest, retErr) } } return diff --git a/pkg/ts/catalog/chart_catalog.go b/pkg/ts/catalog/chart_catalog.go index 54ed9ff70c4c..7c4174022d0a 100644 --- a/pkg/ts/catalog/chart_catalog.go +++ b/pkg/ts/catalog/chart_catalog.go @@ -636,8 +636,8 @@ var charts = []sectionDescription{ "range.snapshots.applied-voter", "range.snapshots.applied-initial", "range.snapshots.applied-non-voter", - "range.snapshot.delegate.successes", - "range.snapshot.delegate.failures", + "range.snapshots.delegate.successes", + "range.snapshots.delegate.failures", }, }, { @@ -649,6 +649,7 @@ var charts = []sectionDescription{ "range.snapshots.recv-in-progress", "range.snapshots.send-total-in-progress", "range.snapshots.recv-total-in-progress", + "range.snapshots.delegate.in-progress", }, }, {