From d41cce07e6649d0f34cbbb03dc25eb9156055b2c Mon Sep 17 00:00:00 2001
From: Nick Travers
Date: Thu, 22 Sep 2022 13:09:06 -0700
Subject: [PATCH] kvserver: add storage time-series metrics for level size and score

Currently, the only way to infer the compaction score and heuristics is
to use the LSM printout from the logs (emitted once every ten minutes),
or to call the `/debug/lsm` endpoint manually, and track values over
time. This makes it difficult to debug issues retroactively.

Add two new sets of per-LSM-level time-series metrics for level size and
level score. These new metrics have names of the form
`storage.$LEVEL-level-{size,score}`.

Closes #88415.

Release note (ops change): Adds two new sets of per-LSM-level
time-series metrics, one for level size and another for level score.
These metrics are of the form `storage.$LEVEL-level-{size,score}`.
---
Reviewer notes (text between the "---" line above and the first
"diff --git" line is not part of the commit message):

* Adds two arrays of seven per-level gauges to StoreMetrics
  (RdbLevelSize, RdbLevelScore) and charts them under the names listed in
  the chart catalog hunk: storage.l0-level-size .. storage.l6-level-size
  and storage.l0-level-score .. storage.l6-level-score. Presumably
  storageLevelMetricMetadata expands the "level-size" / "level-score"
  suffixes into those names -- TODO confirm, the helper is not shown here.
* Naming: metaRdbLevelScores is plural, while metaRdbLevelSize, the
  RdbLevelScore field and the rdbLevelScore local are singular. Consider
  metaRdbLevelScore for consistency.
* updateEngineMetrics indexes the [7]-element gauge arrays with the range
  index over m.Levels; this presumably relies on m.Levels having at most
  seven entries -- TODO confirm against storage.Metrics.
* stats.Size and stats.Score are passed to Update without conversion,
  unlike int64(stats.BytesIngested); presumably they are already int64
  and float64 respectively -- TODO confirm.
* The "Compaction Score Per Level" chart sets no AxisLabel, unlike
  "Bytes Used Per Level" -- confirm this is intentional.
* NOTE(review): line breaks, indentation and column alignment in this
  copy of the patch were reconstructed from a whitespace-collapsed
  source. If context lines fail to match, try
  `git apply --ignore-whitespace`.

 pkg/kv/kvserver/metrics.go      | 32 +++++++++++++++++++++++++++++++-
 pkg/ts/catalog/chart_catalog.go | 25 +++++++++++++++++++++++++
 2 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/pkg/kv/kvserver/metrics.go b/pkg/kv/kvserver/metrics.go
index 15b6ee6908f0..f8bef021b3f1 100644
--- a/pkg/kv/kvserver/metrics.go
+++ b/pkg/kv/kvserver/metrics.go
@@ -550,6 +550,20 @@ var metaRdbBytesIngested = storageLevelMetricMetadata(
 	metric.Unit_BYTES,
 )
 
+var metaRdbLevelSize = storageLevelMetricMetadata(
+	"level-size",
+	"Size of the SSTables in level %d",
+	"Bytes",
+	metric.Unit_BYTES,
+)
+
+var metaRdbLevelScores = storageLevelMetricMetadata(
+	"level-score",
+	"Compaction score of level %d",
+	"Score",
+	metric.Unit_COUNT,
+)
+
 var (
 	metaRdbWriteStalls = metric.Metadata{
 		Name:        "storage.write-stalls",
@@ -1708,7 +1722,9 @@ type StoreMetrics struct {
 	RdbL0BytesFlushed  *metric.Gauge
 	RdbL0Sublevels     *metric.Gauge
 	RdbL0NumFiles      *metric.Gauge
-	RdbBytesIngested   [7]*metric.Gauge // idx = level
+	RdbBytesIngested   [7]*metric.Gauge        // idx = level
+	RdbLevelSize       [7]*metric.Gauge        // idx = level
+	RdbLevelScore      [7]*metric.GaugeFloat64 // idx = level
 	RdbWriteStalls     *metric.Gauge
 	RdbWriteStallNanos *metric.Gauge
 
@@ -2137,6 +2153,8 @@ func newTenantsStorageMetrics() *TenantsStorageMetrics {
 func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
 	storeRegistry := metric.NewRegistry()
 	rdbBytesIngested := storageLevelGaugeSlice(metaRdbBytesIngested)
+	rdbLevelSize := storageLevelGaugeSlice(metaRdbLevelSize)
+	rdbLevelScore := storageLevelGaugeFloat64Slice(metaRdbLevelScores)
 
 	sm := &StoreMetrics{
 		registry: storeRegistry,
@@ -2219,6 +2237,8 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
 		RdbL0Sublevels:     metric.NewGauge(metaRdbL0Sublevels),
 		RdbL0NumFiles:      metric.NewGauge(metaRdbL0NumFiles),
 		RdbBytesIngested:   rdbBytesIngested,
+		RdbLevelSize:       rdbLevelSize,
+		RdbLevelScore:      rdbLevelScore,
 		RdbWriteStalls:     metric.NewGauge(metaRdbWriteStalls),
 		RdbWriteStallNanos: metric.NewGauge(metaRdbWriteStallNanos),
 
@@ -2523,6 +2543,8 @@ func (sm *StoreMetrics) updateEngineMetrics(m storage.Metrics) {
 	sm.RdbL0BytesFlushed.Update(int64(m.Levels[0].BytesFlushed))
 	for level, stats := range m.Levels {
 		sm.RdbBytesIngested[level].Update(int64(stats.BytesIngested))
+		sm.RdbLevelSize[level].Update(stats.Size)
+		sm.RdbLevelScore[level].Update(stats.Score)
 	}
 }
 
@@ -2577,3 +2599,11 @@ func storageLevelGaugeSlice(sl [7]metric.Metadata) [7]*metric.Gauge {
 	}
 	return gs
 }
+
+func storageLevelGaugeFloat64Slice(sl [7]metric.Metadata) [7]*metric.GaugeFloat64 {
+	var gs [7]*metric.GaugeFloat64
+	for i := range sl {
+		gs[i] = metric.NewGaugeFloat64(sl[i])
+	}
+	return gs
+}
diff --git a/pkg/ts/catalog/chart_catalog.go b/pkg/ts/catalog/chart_catalog.go
index 13dbb3b1ebd5..e4ddbe13a291 100644
--- a/pkg/ts/catalog/chart_catalog.go
+++ b/pkg/ts/catalog/chart_catalog.go
@@ -3004,6 +3004,31 @@ var charts = []sectionDescription{
 				Metrics:   []string{"storage.write-stall-nanos"},
 				AxisLabel: "Duration (nanos)",
 			},
+			{
+				Title: "Bytes Used Per Level",
+				Metrics: []string{
+					"storage.l0-level-size",
+					"storage.l1-level-size",
+					"storage.l2-level-size",
+					"storage.l3-level-size",
+					"storage.l4-level-size",
+					"storage.l5-level-size",
+					"storage.l6-level-size",
+				},
+				AxisLabel: "Bytes",
+			},
+			{
+				Title: "Compaction Score Per Level",
+				Metrics: []string{
+					"storage.l0-level-score",
+					"storage.l1-level-score",
+					"storage.l2-level-score",
+					"storage.l3-level-score",
+					"storage.l4-level-score",
+					"storage.l5-level-score",
+					"storage.l6-level-score",
+				},
+			},
 		},
 	},
 	{