From c748c2514ef1c7c3fd3676616f196f923a2b1318 Mon Sep 17 00:00:00 2001
From: Eric Harmeling
Date: Fri, 21 Jul 2023 14:38:06 -0400
Subject: [PATCH] metrics: refactor histogram bucket generation and testing

This commit refactors histogram bucketing for legibility and
composability. It also introduces a data-driven test for histogram
bucket generation. This refactor should make it easier to add
additional metric categories, distributions, and bucket types.

Part of #97144.

Release note: None
---
 pkg/ccl/changefeedccl/metrics.go | 86 ++--
 pkg/ccl/sqlproxyccl/connector_test.go | 24 +-
 pkg/ccl/sqlproxyccl/metrics.go | 34 +-
 pkg/ccl/streamingccl/streamingest/metrics.go | 30 +-
 pkg/kv/bulk/bulk_metrics.go | 10 +-
 pkg/kv/kvclient/kvcoord/txn_metrics.go | 18 +-
 pkg/kv/kvprober/kvprober.go | 16 +-
 .../kvserver/client_manual_proposal_test.go | 8 +-
 .../kvflowcontroller_metrics.go | 8 +-
 .../kvflowhandle/kvflowhandle_metrics.go | 8 +-
 pkg/kv/kvserver/liveness/liveness.go | 8 +-
 .../kvserver/logstore/logstore_bench_test.go | 8 +-
 pkg/kv/kvserver/metrics.go | 88 ++--
 pkg/kv/kvserver/txnwait/metrics.go | 16 +-
 pkg/rpc/clock_offset.go | 2 +-
 pkg/server/node.go | 8 +-
 pkg/sql/conn_executor.go | 76 ++--
 pkg/sql/execinfra/metrics.go | 20 +-
 pkg/sql/mem_metrics.go | 10 +-
 pkg/sql/pgwire/pre_serve.go | 8 +-
 pkg/sql/pgwire/server.go | 8 +-
 pkg/sql/sqlstats/sslocal/sql_stats_test.go | 10 +-
 pkg/sql/ttl/ttljob/ttljob_metrics.go | 24 +-
 pkg/util/admission/elastic_cpu_granter.go | 8 +-
 pkg/util/admission/work_queue.go | 8 +-
 pkg/util/metric/BUILD.bazel | 1 -
 pkg/util/metric/aggmetric/agg_metric_test.go | 18 +-
 pkg/util/metric/histogram_buckets.go | 425 +++++-------------
 pkg/util/metric/histogram_buckets_test.go | 72 +--
 pkg/util/metric/metric.go | 15 +-
 pkg/util/metric/metric_test.go | 14 +-
 pkg/util/metric/registry_test.go | 24 +-
 .../testdata/BatchProcessLatencyBuckets | 63 +++
 pkg/util/metric/testdata/Count1KBuckets | 14 +
 pkg/util/metric/testdata/DataSize16MBBuckets | 18 +
 pkg/util/metric/testdata/IOLatencyBuckets | 63 +++
 .../testdata/LongRunning60mLatencyBuckets | 63 +++
 .../metric/testdata/MemoryUsage64MBBuckets | 18 +
 pkg/util/metric/testdata/Percent100Buckets | 13 +
 .../testdata/ReplicaBatchRequestCountBuckets | 23 +
 .../metric/testdata/ReplicaCPUTimeBuckets | 23 +
 41 files changed, 736 insertions(+), 675 deletions(-)
 create mode 100644 pkg/util/metric/testdata/BatchProcessLatencyBuckets
 create mode 100644 pkg/util/metric/testdata/Count1KBuckets
 create mode 100644 pkg/util/metric/testdata/DataSize16MBBuckets
 create mode 100644 pkg/util/metric/testdata/IOLatencyBuckets
 create mode 100644 pkg/util/metric/testdata/LongRunning60mLatencyBuckets
 create mode 100644 pkg/util/metric/testdata/MemoryUsage64MBBuckets
 create mode 100644 pkg/util/metric/testdata/Percent100Buckets
 create mode 100644 pkg/util/metric/testdata/ReplicaBatchRequestCountBuckets
 create mode 100644 pkg/util/metric/testdata/ReplicaCPUTimeBuckets

diff --git a/pkg/ccl/changefeedccl/metrics.go b/pkg/ccl/changefeedccl/metrics.go
index 060ea6d9bcea..35d907f5f879 100644
--- a/pkg/ccl/changefeedccl/metrics.go
+++ b/pkg/ccl/changefeedccl/metrics.go
@@ -552,52 +552,52 @@ func newAggregateMetrics(histogramWindow time.Duration) *AggMetrics {
EmittedMessages: b.Counter(metaChangefeedEmittedMessages), FilteredMessages: b.Counter(metaChangefeedFilteredMessages), MessageSize: b.Histogram(metric.HistogramOptions{ - Metadata: metaMessageSize, - Duration: histogramWindow, - MaxVal: 10 << 20, /* 10MB max message size */ - SigFigs: 1, - Buckets:
metric.DataSize16MBBuckets, + Metadata: metaMessageSize, + Duration: histogramWindow, + MaxVal: 10 << 20, /* 10MB max message size */ + SigFigs: 1, + BucketConfig: metric.DataSize16MBBuckets, }), EmittedBytes: b.Counter(metaChangefeedEmittedBytes), FlushedBytes: b.Counter(metaChangefeedFlushedBytes), Flushes: b.Counter(metaChangefeedFlushes), SizeBasedFlushes: b.Counter(metaSizeBasedFlushes), ParallelIOQueueNanos: b.Histogram(metric.HistogramOptions{ - Metadata: metaChangefeedParallelIOQueueNanos, - Duration: histogramWindow, - MaxVal: changefeedIOQueueMaxLatency.Nanoseconds(), - SigFigs: 2, - Buckets: metric.BatchProcessLatencyBuckets, + Metadata: metaChangefeedParallelIOQueueNanos, + Duration: histogramWindow, + MaxVal: changefeedIOQueueMaxLatency.Nanoseconds(), + SigFigs: 2, + BucketConfig: metric.BatchProcessLatencyBuckets, }), SinkIOInflight: b.Gauge(metaChangefeedSinkIOInflight), BatchHistNanos: b.Histogram(metric.HistogramOptions{ - Metadata: metaChangefeedBatchHistNanos, - Duration: histogramWindow, - MaxVal: changefeedBatchHistMaxLatency.Nanoseconds(), - SigFigs: 1, - Buckets: metric.BatchProcessLatencyBuckets, + Metadata: metaChangefeedBatchHistNanos, + Duration: histogramWindow, + MaxVal: changefeedBatchHistMaxLatency.Nanoseconds(), + SigFigs: 1, + BucketConfig: metric.BatchProcessLatencyBuckets, }), FlushHistNanos: b.Histogram(metric.HistogramOptions{ - Metadata: metaChangefeedFlushHistNanos, - Duration: histogramWindow, - MaxVal: changefeedFlushHistMaxLatency.Nanoseconds(), - SigFigs: 2, - Buckets: metric.BatchProcessLatencyBuckets, + Metadata: metaChangefeedFlushHistNanos, + Duration: histogramWindow, + MaxVal: changefeedFlushHistMaxLatency.Nanoseconds(), + SigFigs: 2, + BucketConfig: metric.BatchProcessLatencyBuckets, }), CommitLatency: b.Histogram(metric.HistogramOptions{ - Metadata: metaCommitLatency, - Duration: histogramWindow, - MaxVal: commitLatencyMaxValue.Nanoseconds(), - SigFigs: 1, - Buckets: metric.BatchProcessLatencyBuckets, + Metadata: metaCommitLatency, + Duration: histogramWindow, + MaxVal: commitLatencyMaxValue.Nanoseconds(), + SigFigs: 1, + BucketConfig: metric.BatchProcessLatencyBuckets, }), AdmitLatency: b.Histogram(metric.HistogramOptions{ - Metadata: metaAdmitLatency, - Duration: histogramWindow, - MaxVal: admitLatencyMaxValue.Nanoseconds(), - SigFigs: 1, - Buckets: metric.BatchProcessLatencyBuckets, + Metadata: metaAdmitLatency, + Duration: histogramWindow, + MaxVal: admitLatencyMaxValue.Nanoseconds(), + SigFigs: 1, + BucketConfig: metric.BatchProcessLatencyBuckets, }), BackfillCount: b.Gauge(metaChangefeedBackfillCount), BackfillPendingRanges: b.Gauge(metaChangefeedBackfillPendingRanges), @@ -712,27 +712,27 @@ func MakeMetrics(histogramWindow time.Duration) metric.Struct { Failures: metric.NewCounter(metaChangefeedFailures), QueueTimeNanos: metric.NewCounter(metaEventQueueTime), CheckpointHistNanos: metric.NewHistogram(metric.HistogramOptions{ - Metadata: metaChangefeedCheckpointHistNanos, - Duration: histogramWindow, - MaxVal: changefeedCheckpointHistMaxLatency.Nanoseconds(), - SigFigs: 2, - Buckets: metric.IOLatencyBuckets, + Metadata: metaChangefeedCheckpointHistNanos, + Duration: histogramWindow, + MaxVal: changefeedCheckpointHistMaxLatency.Nanoseconds(), + SigFigs: 2, + BucketConfig: metric.IOLatencyBuckets, }), FrontierUpdates: metric.NewCounter(metaChangefeedFrontierUpdates), ThrottleMetrics: cdcutils.MakeMetrics(histogramWindow), // Below two metrics were never implemented using the hdr histogram. Set ForceUsePrometheus // to true. 
ParallelConsumerFlushNanos: metric.NewHistogram(metric.HistogramOptions{ - Metadata: metaChangefeedEventConsumerFlushNanos, - Duration: histogramWindow, - Buckets: metric.IOLatencyBuckets, - Mode: metric.HistogramModePrometheus, + Metadata: metaChangefeedEventConsumerFlushNanos, + Duration: histogramWindow, + BucketConfig: metric.IOLatencyBuckets, + Mode: metric.HistogramModePrometheus, }), ParallelConsumerConsumeNanos: metric.NewHistogram(metric.HistogramOptions{ - Metadata: metaChangefeedEventConsumerConsumeNanos, - Duration: histogramWindow, - Buckets: metric.IOLatencyBuckets, - Mode: metric.HistogramModePrometheus, + Metadata: metaChangefeedEventConsumerConsumeNanos, + Duration: histogramWindow, + BucketConfig: metric.IOLatencyBuckets, + Mode: metric.HistogramModePrometheus, }), ParallelConsumerInFlightEvents: metric.NewGauge(metaChangefeedEventConsumerInFlightEvents), } diff --git a/pkg/ccl/sqlproxyccl/connector_test.go b/pkg/ccl/sqlproxyccl/connector_test.go index fbbeb968d615..3b09bb720c4c 100644 --- a/pkg/ccl/sqlproxyccl/connector_test.go +++ b/pkg/ccl/sqlproxyccl/connector_test.go @@ -381,10 +381,10 @@ func TestConnector_dialTenantCluster(t *testing.T) { c := &connector{ TenantID: roachpb.MustMakeTenantID(42), DialTenantLatency: metric.NewHistogram(metric.HistogramOptions{ - Mode: metric.HistogramModePrometheus, - Metadata: metaDialTenantLatency, - Duration: time.Millisecond, - Buckets: metric.IOLatencyBuckets, + Mode: metric.HistogramModePrometheus, + Metadata: metaDialTenantLatency, + Duration: time.Millisecond, + BucketConfig: metric.IOLatencyBuckets, }), DialTenantRetries: metric.NewCounter(metaDialTenantRetries), } @@ -466,10 +466,10 @@ func TestConnector_dialTenantCluster(t *testing.T) { c := &connector{ DialTenantLatency: metric.NewHistogram(metric.HistogramOptions{ - Mode: metric.HistogramModePreferHdrLatency, - Metadata: metaDialTenantLatency, - Duration: time.Millisecond, - Buckets: metric.IOLatencyBuckets, + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaDialTenantLatency, + Duration: time.Millisecond, + BucketConfig: metric.IOLatencyBuckets, }), DialTenantRetries: metric.NewCounter(metaDialTenantRetries), } @@ -500,10 +500,10 @@ func TestConnector_dialTenantCluster(t *testing.T) { c := &connector{ TenantID: roachpb.MustMakeTenantID(42), DialTenantLatency: metric.NewHistogram(metric.HistogramOptions{ - Mode: metric.HistogramModePreferHdrLatency, - Metadata: metaDialTenantLatency, - Duration: time.Millisecond, - Buckets: metric.IOLatencyBuckets, + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaDialTenantLatency, + Duration: time.Millisecond, + BucketConfig: metric.IOLatencyBuckets, }), DialTenantRetries: metric.NewCounter(metaDialTenantRetries), } diff --git a/pkg/ccl/sqlproxyccl/metrics.go b/pkg/ccl/sqlproxyccl/metrics.go index 28d9ad426847..d609bb978ed0 100644 --- a/pkg/ccl/sqlproxyccl/metrics.go +++ b/pkg/ccl/sqlproxyccl/metrics.go @@ -234,19 +234,19 @@ func makeProxyMetrics() metrics { RefusedConnCount: metric.NewCounter(metaRefusedConnCount), SuccessfulConnCount: metric.NewCounter(metaSuccessfulConnCount), ConnectionLatency: metric.NewHistogram(metric.HistogramOptions{ - Mode: metric.HistogramModePreferHdrLatency, - Metadata: metaConnMigrationAttemptedCount, - Duration: base.DefaultHistogramWindowInterval(), - Buckets: metric.IOLatencyBuckets, + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaConnMigrationAttemptedCount, + Duration: base.DefaultHistogramWindowInterval(), + BucketConfig: metric.IOLatencyBuckets, }), 
AuthFailedCount: metric.NewCounter(metaAuthFailedCount), ExpiredClientConnCount: metric.NewCounter(metaExpiredClientConnCount), // Connector metrics. DialTenantLatency: metric.NewHistogram(metric.HistogramOptions{ - Mode: metric.HistogramModePreferHdrLatency, - Metadata: metaDialTenantLatency, - Duration: base.DefaultHistogramWindowInterval(), - Buckets: metric.IOLatencyBuckets}, + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaDialTenantLatency, + Duration: base.DefaultHistogramWindowInterval(), + BucketConfig: metric.IOLatencyBuckets}, ), DialTenantRetries: metric.NewCounter(metaDialTenantRetries), // Connection migration metrics. @@ -255,17 +255,17 @@ func makeProxyMetrics() metrics { ConnMigrationErrorRecoverableCount: metric.NewCounter(metaConnMigrationErrorRecoverableCount), ConnMigrationAttemptedCount: metric.NewCounter(metaConnMigrationAttemptedCount), ConnMigrationAttemptedLatency: metric.NewHistogram(metric.HistogramOptions{ - Mode: metric.HistogramModePreferHdrLatency, - Metadata: metaConnMigrationAttemptedLatency, - Duration: base.DefaultHistogramWindowInterval(), - Buckets: metric.IOLatencyBuckets, + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaConnMigrationAttemptedLatency, + Duration: base.DefaultHistogramWindowInterval(), + BucketConfig: metric.IOLatencyBuckets, }), ConnMigrationTransferResponseMessageSize: metric.NewHistogram(metric.HistogramOptions{ - Metadata: metaConnMigrationTransferResponseMessageSize, - Duration: base.DefaultHistogramWindowInterval(), - Buckets: metric.DataSize16MBBuckets, - MaxVal: maxExpectedTransferResponseMessageSize, - SigFigs: 1, + Metadata: metaConnMigrationTransferResponseMessageSize, + Duration: base.DefaultHistogramWindowInterval(), + BucketConfig: metric.DataSize16MBBuckets, + MaxVal: maxExpectedTransferResponseMessageSize, + SigFigs: 1, }), QueryCancelReceivedPGWire: metric.NewCounter(metaQueryCancelReceivedPGWire), QueryCancelReceivedHTTP: metric.NewCounter(metaQueryCancelReceivedHTTP), diff --git a/pkg/ccl/streamingccl/streamingest/metrics.go b/pkg/ccl/streamingccl/streamingest/metrics.go index 6104150333ac..83c857cd5d83 100644 --- a/pkg/ccl/streamingccl/streamingest/metrics.go +++ b/pkg/ccl/streamingccl/streamingest/metrics.go @@ -171,25 +171,25 @@ func MakeMetrics(histogramWindow time.Duration) metric.Struct { JobProgressUpdates: metric.NewCounter(metaJobProgressUpdates), ReplanCount: metric.NewCounter(metaDistSQLReplanCount), FlushHistNanos: metric.NewHistogram(metric.HistogramOptions{ - Metadata: metaReplicationFlushHistNanos, - Duration: histogramWindow, - Buckets: metric.BatchProcessLatencyBuckets, - MaxVal: streamingFlushHistMaxLatency.Nanoseconds(), - SigFigs: 1, + Metadata: metaReplicationFlushHistNanos, + Duration: histogramWindow, + BucketConfig: metric.BatchProcessLatencyBuckets, + MaxVal: streamingFlushHistMaxLatency.Nanoseconds(), + SigFigs: 1, }), CommitLatency: metric.NewHistogram(metric.HistogramOptions{ - Metadata: metaReplicationCommitLatency, - Duration: histogramWindow, - Buckets: metric.BatchProcessLatencyBuckets, - MaxVal: streamingCommitLatencyMaxValue.Nanoseconds(), - SigFigs: 1, + Metadata: metaReplicationCommitLatency, + Duration: histogramWindow, + BucketConfig: metric.BatchProcessLatencyBuckets, + MaxVal: streamingCommitLatencyMaxValue.Nanoseconds(), + SigFigs: 1, }), AdmitLatency: metric.NewHistogram(metric.HistogramOptions{ - Metadata: metaReplicationAdmitLatency, - Duration: histogramWindow, - Buckets: metric.BatchProcessLatencyBuckets, - MaxVal: 
streamingAdmitLatencyMaxValue.Nanoseconds(), - SigFigs: 1, + Metadata: metaReplicationAdmitLatency, + Duration: histogramWindow, + BucketConfig: metric.BatchProcessLatencyBuckets, + MaxVal: streamingAdmitLatencyMaxValue.Nanoseconds(), + SigFigs: 1, }), RunningCount: metric.NewGauge(metaStreamsRunning), EarliestDataCheckpointSpan: metric.NewGauge(metaEarliestDataCheckpointSpan), diff --git a/pkg/kv/bulk/bulk_metrics.go b/pkg/kv/bulk/bulk_metrics.go index 7cbbc748a20b..16ae10524268 100644 --- a/pkg/kv/bulk/bulk_metrics.go +++ b/pkg/kv/bulk/bulk_metrics.go @@ -52,11 +52,11 @@ const log10int64times1000 = 19 * 1000 func MakeBulkMetrics(histogramWindow time.Duration) Metrics { return Metrics{ MaxBytesHist: metric.NewHistogram(metric.HistogramOptions{ - Metadata: metaMemMaxBytes, - Duration: histogramWindow, - MaxVal: log10int64times1000, - SigFigs: 3, - Buckets: metric.MemoryUsage64MBBuckets, + Metadata: metaMemMaxBytes, + Duration: histogramWindow, + MaxVal: log10int64times1000, + SigFigs: 3, + BucketConfig: metric.MemoryUsage64MBBuckets, }), CurBytesCount: metric.NewGauge(metaMemCurBytes), } diff --git a/pkg/kv/kvclient/kvcoord/txn_metrics.go b/pkg/kv/kvclient/kvcoord/txn_metrics.go index e35abeadd493..51371d044273 100644 --- a/pkg/kv/kvclient/kvcoord/txn_metrics.go +++ b/pkg/kv/kvclient/kvcoord/txn_metrics.go @@ -285,20 +285,20 @@ func MakeTxnMetrics(histogramWindow time.Duration) TxnMetrics { ClientRefreshAutoRetries: metric.NewCounter(metaClientRefreshAutoRetries), ServerRefreshSuccess: metric.NewCounter(metaServerRefreshSuccess), Durations: metric.NewHistogram(metric.HistogramOptions{ - Mode: metric.HistogramModePreferHdrLatency, - Metadata: metaDurationsHistograms, - Duration: histogramWindow, - Buckets: metric.IOLatencyBuckets, + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaDurationsHistograms, + Duration: histogramWindow, + BucketConfig: metric.IOLatencyBuckets, }), TxnsWithCondensedIntents: metric.NewCounter(metaTxnsWithCondensedIntentSpans), TxnsWithCondensedIntentsGauge: metric.NewGauge(metaTxnsWithCondensedIntentSpansGauge), TxnsRejectedByLockSpanBudget: metric.NewCounter(metaTxnsRejectedByLockSpanBudget), Restarts: metric.NewHistogram(metric.HistogramOptions{ - Metadata: metaRestartsHistogram, - Duration: histogramWindow, - MaxVal: 100, - SigFigs: 3, - Buckets: metric.Count1KBuckets, + Metadata: metaRestartsHistogram, + Duration: histogramWindow, + MaxVal: 100, + SigFigs: 3, + BucketConfig: metric.Count1KBuckets, }), RestartsWriteTooOld: telemetry.NewCounterWithMetric(metaRestartsWriteTooOld), RestartsWriteTooOldMulti: telemetry.NewCounterWithMetric(metaRestartsWriteTooOldMulti), diff --git a/pkg/kv/kvprober/kvprober.go b/pkg/kv/kvprober/kvprober.go index 3346f67df803..c3ac87a32a8a 100644 --- a/pkg/kv/kvprober/kvprober.go +++ b/pkg/kv/kvprober/kvprober.go @@ -276,18 +276,18 @@ func NewProber(opts Opts) *Prober { ReadProbeAttempts: metric.NewCounter(metaReadProbeAttempts), ReadProbeFailures: metric.NewCounter(metaReadProbeFailures), ReadProbeLatency: metric.NewHistogram(metric.HistogramOptions{ - Mode: metric.HistogramModePreferHdrLatency, - Metadata: metaReadProbeLatency, - Duration: opts.HistogramWindowInterval, - Buckets: metric.IOLatencyBuckets, + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaReadProbeLatency, + Duration: opts.HistogramWindowInterval, + BucketConfig: metric.IOLatencyBuckets, }), WriteProbeAttempts: metric.NewCounter(metaWriteProbeAttempts), WriteProbeFailures: metric.NewCounter(metaWriteProbeFailures), WriteProbeLatency: 
metric.NewHistogram(metric.HistogramOptions{ - Mode: metric.HistogramModePreferHdrLatency, - Metadata: metaWriteProbeLatency, - Duration: opts.HistogramWindowInterval, - Buckets: metric.IOLatencyBuckets, + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaWriteProbeLatency, + Duration: opts.HistogramWindowInterval, + BucketConfig: metric.IOLatencyBuckets, }), WriteProbeQuarantineOldestDuration: metric.NewFunctionalGauge( metaWriteProbeQuarantineOldestDuration, diff --git a/pkg/kv/kvserver/client_manual_proposal_test.go b/pkg/kv/kvserver/client_manual_proposal_test.go index 003693d1f3fc..eeb6f3e6a0c6 100644 --- a/pkg/kv/kvserver/client_manual_proposal_test.go +++ b/pkg/kv/kvserver/client_manual_proposal_test.go @@ -232,10 +232,10 @@ LIMIT Settings: st, Metrics: logstore.Metrics{ RaftLogCommitLatency: metric.NewHistogram(metric.HistogramOptions{ - Mode: metric.HistogramModePrometheus, - Metadata: fakeMeta, - Duration: time.Millisecond, - Buckets: metric.IOLatencyBuckets, + Mode: metric.HistogramModePrometheus, + Metadata: fakeMeta, + Duration: time.Millisecond, + BucketConfig: metric.IOLatencyBuckets, }), }, } diff --git a/pkg/kv/kvserver/kvflowcontrol/kvflowcontroller/kvflowcontroller_metrics.go b/pkg/kv/kvserver/kvflowcontrol/kvflowcontroller/kvflowcontroller_metrics.go index 2882ce56fbc2..7b35f7a7341c 100644 --- a/pkg/kv/kvserver/kvflowcontrol/kvflowcontroller/kvflowcontroller_metrics.go +++ b/pkg/kv/kvserver/kvflowcontrol/kvflowcontroller/kvflowcontroller_metrics.go @@ -186,10 +186,10 @@ func newMetrics(c *Controller) *metrics { ) m.WaitDuration[wc] = metric.NewHistogram( metric.HistogramOptions{ - Metadata: annotateMetricTemplateWithWorkClass(wc, waitDuration), - Duration: base.DefaultHistogramWindowInterval(), - Buckets: metric.IOLatencyBuckets, - Mode: metric.HistogramModePrometheus, + Metadata: annotateMetricTemplateWithWorkClass(wc, waitDuration), + Duration: base.DefaultHistogramWindowInterval(), + BucketConfig: metric.IOLatencyBuckets, + Mode: metric.HistogramModePrometheus, }, ) m.TotalStreamCount[wc] = metric.NewFunctionalGauge( diff --git a/pkg/kv/kvserver/kvflowcontrol/kvflowhandle/kvflowhandle_metrics.go b/pkg/kv/kvserver/kvflowcontrol/kvflowhandle/kvflowhandle_metrics.go index 5006781b53e8..765f2ba4c86c 100644 --- a/pkg/kv/kvserver/kvflowcontrol/kvflowhandle/kvflowhandle_metrics.go +++ b/pkg/kv/kvserver/kvflowcontrol/kvflowhandle/kvflowhandle_metrics.go @@ -109,10 +109,10 @@ func NewMetrics(registry *metric.Registry) *Metrics { ) m.WaitDuration[wc] = metric.NewHistogram( metric.HistogramOptions{ - Metadata: annotateMetricTemplateWithWorkClass(wc, waitDuration), - Duration: base.DefaultHistogramWindowInterval(), - Buckets: metric.IOLatencyBuckets, - Mode: metric.HistogramModePrometheus, + Metadata: annotateMetricTemplateWithWorkClass(wc, waitDuration), + Duration: base.DefaultHistogramWindowInterval(), + BucketConfig: metric.IOLatencyBuckets, + Mode: metric.HistogramModePrometheus, }, ) } diff --git a/pkg/kv/kvserver/liveness/liveness.go b/pkg/kv/kvserver/liveness/liveness.go index 9450ee6a2add..24b0d97eabc6 100644 --- a/pkg/kv/kvserver/liveness/liveness.go +++ b/pkg/kv/kvserver/liveness/liveness.go @@ -367,10 +367,10 @@ func NewNodeLiveness(opts NodeLivenessOptions) *NodeLiveness { HeartbeatFailures: telemetry.NewCounterWithMetric(metaHeartbeatFailures), EpochIncrements: telemetry.NewCounterWithMetric(metaEpochIncrements), HeartbeatLatency: metric.NewHistogram(metric.HistogramOptions{ - Mode: metric.HistogramModePreferHdrLatency, - Metadata: metaHeartbeatLatency, - 
Duration: opts.HistogramWindowInterval, - Buckets: metric.IOLatencyBuckets, + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaHeartbeatLatency, + Duration: opts.HistogramWindowInterval, + BucketConfig: metric.IOLatencyBuckets, }), } nl.cache = newCache(opts.Gossip, opts.Clock, nl.cacheUpdated) diff --git a/pkg/kv/kvserver/logstore/logstore_bench_test.go b/pkg/kv/kvserver/logstore/logstore_bench_test.go index d318d1b7852c..fc2c33b07937 100644 --- a/pkg/kv/kvserver/logstore/logstore_bench_test.go +++ b/pkg/kv/kvserver/logstore/logstore_bench_test.go @@ -68,10 +68,10 @@ func runBenchmarkLogStore_StoreEntries(b *testing.B, bytes int64) { Settings: st, Metrics: Metrics{ RaftLogCommitLatency: metric.NewHistogram(metric.HistogramOptions{ - Mode: metric.HistogramModePrometheus, - Metadata: metric.Metadata{}, - Duration: 10 * time.Second, - Buckets: metric.IOLatencyBuckets, + Mode: metric.HistogramModePrometheus, + Metadata: metric.Metadata{}, + Duration: 10 * time.Second, + BucketConfig: metric.IOLatencyBuckets, }), }, } diff --git a/pkg/kv/kvserver/metrics.go b/pkg/kv/kvserver/metrics.go index 5730d55de188..98ca367f6ac4 100644 --- a/pkg/kv/kvserver/metrics.go +++ b/pkg/kv/kvserver/metrics.go @@ -2838,10 +2838,10 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics { LeaseRequestSuccessCount: metric.NewCounter(metaLeaseRequestSuccessCount), LeaseRequestErrorCount: metric.NewCounter(metaLeaseRequestErrorCount), LeaseRequestLatency: metric.NewHistogram(metric.HistogramOptions{ - Mode: metric.HistogramModePreferHdrLatency, - Metadata: metaLeaseRequestLatency, - Duration: histogramWindow, - Buckets: metric.IOLatencyBuckets, + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaLeaseRequestLatency, + Duration: histogramWindow, + BucketConfig: metric.IOLatencyBuckets, }), LeaseTransferSuccessCount: metric.NewCounter(metaLeaseTransferSuccessCount), LeaseTransferErrorCount: metric.NewCounter(metaLeaseTransferErrorCount), @@ -2871,12 +2871,14 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics { AverageCPUNanosPerSecond: metric.NewGaugeFloat64(metaAverageCPUNanosPerSecond), RecentReplicaCPUNanosPerSecond: metric.NewManualWindowHistogram( metaRecentReplicaCPUNanosPerSecond, - metric.ReplicaCPUTimeBuckets, + metric.ReplicaCPUTimeBuckets. + GetBucketsFromBucketConfig(), true, /* withRotate */ ), RecentReplicaQueriesPerSecond: metric.NewManualWindowHistogram( metaRecentReplicaQueriesPerSecond, - metric.ReplicaBatchRequestCountBuckets, + metric.ReplicaBatchRequestCountBuckets. 
+ GetBucketsFromBucketConfig(), true, /* withRotate */ ), @@ -3008,50 +3010,50 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics { RaftProposalsDropped: metric.NewCounter(metaRaftProposalsDropped), RaftProposalsDroppedLeader: metric.NewCounter(metaRaftProposalsDroppedLeader), RaftQuotaPoolPercentUsed: metric.NewHistogram(metric.HistogramOptions{ - Metadata: metaRaftQuotaPoolPercentUsed, - Duration: histogramWindow, - MaxVal: 100, - SigFigs: 1, - Buckets: metric.Percent100Buckets, + Metadata: metaRaftQuotaPoolPercentUsed, + Duration: histogramWindow, + MaxVal: 100, + SigFigs: 1, + BucketConfig: metric.Percent100Buckets, }), RaftWorkingDurationNanos: metric.NewCounter(metaRaftWorkingDurationNanos), RaftTickingDurationNanos: metric.NewCounter(metaRaftTickingDurationNanos), RaftCommandsApplied: metric.NewCounter(metaRaftCommandsApplied), RaftLogCommitLatency: metric.NewHistogram(metric.HistogramOptions{ - Mode: metric.HistogramModePreferHdrLatency, - Metadata: metaRaftLogCommitLatency, - Duration: histogramWindow, - Buckets: metric.IOLatencyBuckets, + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaRaftLogCommitLatency, + Duration: histogramWindow, + BucketConfig: metric.IOLatencyBuckets, }), RaftCommandCommitLatency: metric.NewHistogram(metric.HistogramOptions{ - Mode: metric.HistogramModePreferHdrLatency, - Metadata: metaRaftCommandCommitLatency, - Duration: histogramWindow, - Buckets: metric.IOLatencyBuckets, + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaRaftCommandCommitLatency, + Duration: histogramWindow, + BucketConfig: metric.IOLatencyBuckets, }), RaftHandleReadyLatency: metric.NewHistogram(metric.HistogramOptions{ - Mode: metric.HistogramModePreferHdrLatency, - Metadata: metaRaftHandleReadyLatency, - Duration: histogramWindow, - Buckets: metric.IOLatencyBuckets, + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaRaftHandleReadyLatency, + Duration: histogramWindow, + BucketConfig: metric.IOLatencyBuckets, }), RaftApplyCommittedLatency: metric.NewHistogram(metric.HistogramOptions{ - Mode: metric.HistogramModePreferHdrLatency, - Metadata: metaRaftApplyCommittedLatency, - Duration: histogramWindow, - Buckets: metric.IOLatencyBuckets, + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaRaftApplyCommittedLatency, + Duration: histogramWindow, + BucketConfig: metric.IOLatencyBuckets, }), RaftReplicationLatency: metric.NewHistogram(metric.HistogramOptions{ - Mode: metric.HistogramModePrometheus, - Metadata: metaRaftReplicationLatency, - Duration: histogramWindow, - Buckets: metric.IOLatencyBuckets, + Mode: metric.HistogramModePrometheus, + Metadata: metaRaftReplicationLatency, + Duration: histogramWindow, + BucketConfig: metric.IOLatencyBuckets, }), RaftSchedulerLatency: metric.NewHistogram(metric.HistogramOptions{ - Mode: metric.HistogramModePreferHdrLatency, - Metadata: metaRaftSchedulerLatency, - Duration: histogramWindow, - Buckets: metric.IOLatencyBuckets, + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaRaftSchedulerLatency, + Duration: histogramWindow, + BucketConfig: metric.IOLatencyBuckets, }), RaftTimeoutCampaign: metric.NewCounter(metaRaftTimeoutCampaign), RaftStorageReadBytes: metric.NewCounter(metaRaftStorageReadBytes), @@ -3201,16 +3203,16 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics { // Replica batch evaluation. 
ReplicaReadBatchEvaluationLatency: metric.NewHistogram(metric.HistogramOptions{ - Mode: metric.HistogramModePreferHdrLatency, - Metadata: metaReplicaReadBatchEvaluationLatency, - Duration: histogramWindow, - Buckets: metric.IOLatencyBuckets, + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaReplicaReadBatchEvaluationLatency, + Duration: histogramWindow, + BucketConfig: metric.IOLatencyBuckets, }), ReplicaWriteBatchEvaluationLatency: metric.NewHistogram(metric.HistogramOptions{ - Mode: metric.HistogramModePreferHdrLatency, - Metadata: metaReplicaWriteBatchEvaluationLatency, - Duration: histogramWindow, - Buckets: metric.IOLatencyBuckets, + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaReplicaWriteBatchEvaluationLatency, + Duration: histogramWindow, + BucketConfig: metric.IOLatencyBuckets, }), FlushUtilization: metric.NewGaugeFloat64(metaStorageFlushUtilization), FsyncLatency: metric.NewManualWindowHistogram( diff --git a/pkg/kv/kvserver/txnwait/metrics.go b/pkg/kv/kvserver/txnwait/metrics.go index 4610fd8e375c..49ecc21a189d 100644 --- a/pkg/kv/kvserver/txnwait/metrics.go +++ b/pkg/kv/kvserver/txnwait/metrics.go @@ -73,10 +73,10 @@ func NewMetrics(histogramWindowInterval time.Duration) *Metrics { Measurement: "Pusher wait time", Unit: metric.Unit_NANOSECONDS, }, - MaxVal: time.Hour.Nanoseconds(), - SigFigs: 1, - Duration: histogramWindowInterval, - Buckets: metric.LongRunning60mLatencyBuckets, + MaxVal: time.Hour.Nanoseconds(), + SigFigs: 1, + Duration: histogramWindowInterval, + BucketConfig: metric.LongRunning60mLatencyBuckets, }), QueryWaitTime: metric.NewHistogram(metric.HistogramOptions{ @@ -86,10 +86,10 @@ func NewMetrics(histogramWindowInterval time.Duration) *Metrics { Measurement: "Query wait time", Unit: metric.Unit_NANOSECONDS, }, - MaxVal: time.Hour.Nanoseconds(), - SigFigs: 1, - Duration: histogramWindowInterval, - Buckets: metric.LongRunning60mLatencyBuckets, + MaxVal: time.Hour.Nanoseconds(), + SigFigs: 1, + Duration: histogramWindowInterval, + BucketConfig: metric.LongRunning60mLatencyBuckets, }), DeadlocksTotal: metric.NewCounter( diff --git a/pkg/rpc/clock_offset.go b/pkg/rpc/clock_offset.go index 66f0cb4ec53b..28462d10a623 100644 --- a/pkg/rpc/clock_offset.go +++ b/pkg/rpc/clock_offset.go @@ -157,7 +157,7 @@ func newRemoteClockMonitor( // NB: the choice of IO over Network buckets is somewhat debatable, but // it's fine. Heartbeats can take >1s which the IO buckets can represent, // but the Network buckets top out at 1s. 
- Buckets: metric.IOLatencyBuckets, + BucketConfig: metric.IOLatencyBuckets, }), } return &r diff --git a/pkg/server/node.go b/pkg/server/node.go index 9a9dd08e55e0..0478c7c6e16e 100644 --- a/pkg/server/node.go +++ b/pkg/server/node.go @@ -276,10 +276,10 @@ type nodeMetrics struct { func makeNodeMetrics(reg *metric.Registry, histogramWindow time.Duration) nodeMetrics { nm := nodeMetrics{ Latency: metric.NewHistogram(metric.HistogramOptions{ - Mode: metric.HistogramModePreferHdrLatency, - Metadata: metaExecLatency, - Duration: histogramWindow, - Buckets: metric.IOLatencyBuckets, + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaExecLatency, + Duration: histogramWindow, + BucketConfig: metric.IOLatencyBuckets, }), Success: metric.NewCounter(metaExecSuccess), Err: metric.NewCounter(metaExecError), diff --git a/pkg/sql/conn_executor.go b/pkg/sql/conn_executor.go index 4cd4ef5ade77..dcc59648ea86 100644 --- a/pkg/sql/conn_executor.go +++ b/pkg/sql/conn_executor.go @@ -505,34 +505,34 @@ func makeMetrics(internal bool) Metrics { SQLOptPlanCacheMisses: metric.NewCounter(getMetricMeta(MetaSQLOptPlanCacheMisses, internal)), // TODO(mrtracy): See HistogramWindowInterval in server/config.go for the 6x factor. DistSQLExecLatency: metric.NewHistogram(metric.HistogramOptions{ - Mode: metric.HistogramModePreferHdrLatency, - Metadata: getMetricMeta(MetaDistSQLExecLatency, internal), - Duration: 6 * metricsSampleInterval, - Buckets: metric.IOLatencyBuckets, + Mode: metric.HistogramModePreferHdrLatency, + Metadata: getMetricMeta(MetaDistSQLExecLatency, internal), + Duration: 6 * metricsSampleInterval, + BucketConfig: metric.IOLatencyBuckets, }), SQLExecLatency: metric.NewHistogram(metric.HistogramOptions{ - Mode: metric.HistogramModePreferHdrLatency, - Metadata: getMetricMeta(MetaSQLExecLatency, internal), - Duration: 6 * metricsSampleInterval, - Buckets: metric.IOLatencyBuckets, + Mode: metric.HistogramModePreferHdrLatency, + Metadata: getMetricMeta(MetaSQLExecLatency, internal), + Duration: 6 * metricsSampleInterval, + BucketConfig: metric.IOLatencyBuckets, }), DistSQLServiceLatency: metric.NewHistogram(metric.HistogramOptions{ - Mode: metric.HistogramModePreferHdrLatency, - Metadata: getMetricMeta(MetaDistSQLServiceLatency, internal), - Duration: 6 * metricsSampleInterval, - Buckets: metric.IOLatencyBuckets, + Mode: metric.HistogramModePreferHdrLatency, + Metadata: getMetricMeta(MetaDistSQLServiceLatency, internal), + Duration: 6 * metricsSampleInterval, + BucketConfig: metric.IOLatencyBuckets, }), SQLServiceLatency: metric.NewHistogram(metric.HistogramOptions{ - Mode: metric.HistogramModePreferHdrLatency, - Metadata: getMetricMeta(MetaSQLServiceLatency, internal), - Duration: 6 * metricsSampleInterval, - Buckets: metric.IOLatencyBuckets, + Mode: metric.HistogramModePreferHdrLatency, + Metadata: getMetricMeta(MetaSQLServiceLatency, internal), + Duration: 6 * metricsSampleInterval, + BucketConfig: metric.IOLatencyBuckets, }), SQLTxnLatency: metric.NewHistogram(metric.HistogramOptions{ - Mode: metric.HistogramModePreferHdrLatency, - Metadata: getMetricMeta(MetaSQLTxnLatency, internal), - Duration: 6 * metricsSampleInterval, - Buckets: metric.IOLatencyBuckets, + Mode: metric.HistogramModePreferHdrLatency, + Metadata: getMetricMeta(MetaSQLTxnLatency, internal), + Duration: 6 * metricsSampleInterval, + BucketConfig: metric.IOLatencyBuckets, }), SQLTxnsOpen: metric.NewGauge(getMetricMeta(MetaSQLTxnsOpen, internal)), SQLActiveStatements: metric.NewGauge(getMetricMeta(MetaSQLActiveQueries, internal)), 
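Editor's note (illustration, not part of the patch): every call site in this commit follows the same mechanical migration. The exported bucket variables such as metric.IOLatencyBuckets change type from []float64 to staticBucketConfig, so callers swap the Buckets field of metric.HistogramOptions for the new BucketConfig field, and concrete boundaries are derived inside the metric package (for manual-window histograms, the kvserver hunks above call GetBucketsFromBucketConfig explicitly). A minimal before/after sketch, where metaFoo is a hypothetical metric.Metadata value:

	// Before this patch: callers supplied precomputed bucket boundaries.
	_ = metric.NewHistogram(metric.HistogramOptions{
		Mode:     metric.HistogramModePrometheus,
		Metadata: metaFoo, // hypothetical
		Duration: base.DefaultHistogramWindowInterval(),
		Buckets:  metric.IOLatencyBuckets, // was a []float64 slice
	})

	// After this patch: callers supply the bucket config instead.
	_ = metric.NewHistogram(metric.HistogramOptions{
		Mode:         metric.HistogramModePrometheus,
		Metadata:     metaFoo,
		Duration:     base.DefaultHistogramWindowInterval(),
		BucketConfig: metric.IOLatencyBuckets, // now a staticBucketConfig
	})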
@@ -558,36 +558,36 @@ func makeServerMetrics(cfg *ExecutorConfig) ServerMetrics { return ServerMetrics{ StatsMetrics: StatsMetrics{ SQLStatsMemoryMaxBytesHist: metric.NewHistogram(metric.HistogramOptions{ - Metadata: MetaSQLStatsMemMaxBytes, - Duration: cfg.HistogramWindowInterval, - MaxVal: log10int64times1000, - SigFigs: 3, - Buckets: metric.MemoryUsage64MBBuckets, + Metadata: MetaSQLStatsMemMaxBytes, + Duration: cfg.HistogramWindowInterval, + MaxVal: log10int64times1000, + SigFigs: 3, + BucketConfig: metric.MemoryUsage64MBBuckets, }), SQLStatsMemoryCurBytesCount: metric.NewGauge(MetaSQLStatsMemCurBytes), ReportedSQLStatsMemoryMaxBytesHist: metric.NewHistogram(metric.HistogramOptions{ - Metadata: MetaReportedSQLStatsMemMaxBytes, - Duration: cfg.HistogramWindowInterval, - MaxVal: log10int64times1000, - SigFigs: 3, - Buckets: metric.MemoryUsage64MBBuckets, + Metadata: MetaReportedSQLStatsMemMaxBytes, + Duration: cfg.HistogramWindowInterval, + MaxVal: log10int64times1000, + SigFigs: 3, + BucketConfig: metric.MemoryUsage64MBBuckets, }), ReportedSQLStatsMemoryCurBytesCount: metric.NewGauge(MetaReportedSQLStatsMemCurBytes), DiscardedStatsCount: metric.NewCounter(MetaDiscardedSQLStats), SQLStatsFlushStarted: metric.NewCounter(MetaSQLStatsFlushStarted), SQLStatsFlushFailure: metric.NewCounter(MetaSQLStatsFlushFailure), SQLStatsFlushDuration: metric.NewHistogram(metric.HistogramOptions{ - Mode: metric.HistogramModePreferHdrLatency, - Metadata: MetaSQLStatsFlushDuration, - Duration: 6 * metricsSampleInterval, - Buckets: metric.IOLatencyBuckets, + Mode: metric.HistogramModePreferHdrLatency, + Metadata: MetaSQLStatsFlushDuration, + Duration: 6 * metricsSampleInterval, + BucketConfig: metric.IOLatencyBuckets, }), SQLStatsRemovedRows: metric.NewCounter(MetaSQLStatsRemovedRows), SQLTxnStatsCollectionOverhead: metric.NewHistogram(metric.HistogramOptions{ - Mode: metric.HistogramModePreferHdrLatency, - Metadata: MetaSQLTxnStatsCollectionOverhead, - Duration: 6 * metricsSampleInterval, - Buckets: metric.IOLatencyBuckets, + Mode: metric.HistogramModePreferHdrLatency, + Metadata: MetaSQLTxnStatsCollectionOverhead, + Duration: 6 * metricsSampleInterval, + BucketConfig: metric.IOLatencyBuckets, }), }, ContentionSubsystemMetrics: txnidcache.NewMetrics(), diff --git a/pkg/sql/execinfra/metrics.go b/pkg/sql/execinfra/metrics.go index 30123ce27ee6..49305063056e 100644 --- a/pkg/sql/execinfra/metrics.go +++ b/pkg/sql/execinfra/metrics.go @@ -133,21 +133,21 @@ func MakeDistSQLMetrics(histogramWindow time.Duration) DistSQLMetrics { FlowsActive: metric.NewGauge(metaFlowsActive), FlowsTotal: metric.NewCounter(metaFlowsTotal), MaxBytesHist: metric.NewHistogram(metric.HistogramOptions{ - Metadata: metaMemMaxBytes, - Duration: histogramWindow, - MaxVal: log10int64times1000, - SigFigs: 3, - Buckets: metric.MemoryUsage64MBBuckets, + Metadata: metaMemMaxBytes, + Duration: histogramWindow, + MaxVal: log10int64times1000, + SigFigs: 3, + BucketConfig: metric.MemoryUsage64MBBuckets, }), CurBytesCount: metric.NewGauge(metaMemCurBytes), VecOpenFDs: metric.NewGauge(metaVecOpenFDs), CurDiskBytesCount: metric.NewGauge(metaDiskCurBytes), MaxDiskBytesHist: metric.NewHistogram(metric.HistogramOptions{ - Metadata: metaDiskMaxBytes, - Duration: histogramWindow, - MaxVal: log10int64times1000, - SigFigs: 3, - Buckets: metric.MemoryUsage64MBBuckets}), + Metadata: metaDiskMaxBytes, + Duration: histogramWindow, + MaxVal: log10int64times1000, + SigFigs: 3, + BucketConfig: metric.MemoryUsage64MBBuckets}), QueriesSpilled: 
metric.NewCounter(metaQueriesSpilled), SpilledBytesWritten: metric.NewCounter(metaSpilledBytesWritten), SpilledBytesRead: metric.NewCounter(metaSpilledBytesRead), diff --git a/pkg/sql/mem_metrics.go b/pkg/sql/mem_metrics.go index 995a95d1226d..16bf2923b1af 100644 --- a/pkg/sql/mem_metrics.go +++ b/pkg/sql/mem_metrics.go @@ -74,11 +74,11 @@ func makeMemMetricHistogram( metadata metric.Metadata, histogramWindow time.Duration, ) metric.IHistogram { return metric.NewHistogram(metric.HistogramOptions{ - Metadata: metadata, - Duration: histogramWindow, - MaxVal: log10int64times1000, - SigFigs: 3, - Buckets: metric.MemoryUsage64MBBuckets, + Metadata: metadata, + Duration: histogramWindow, + MaxVal: log10int64times1000, + SigFigs: 3, + BucketConfig: metric.MemoryUsage64MBBuckets, }) } diff --git a/pkg/sql/pgwire/pre_serve.go b/pkg/sql/pgwire/pre_serve.go index 37b9ae56281a..b6695f19ac61 100644 --- a/pkg/sql/pgwire/pre_serve.go +++ b/pkg/sql/pgwire/pre_serve.go @@ -192,10 +192,10 @@ func makeTenantIndependentMetrics(histogramWindow time.Duration) tenantIndepende PreServeNewConns: metric.NewCounter(MetaPreServeNewConns), PreServeConnFailures: metric.NewCounter(MetaPreServeConnFailures), PreServeMaxBytes: metric.NewHistogram(metric.HistogramOptions{ - Metadata: MetaPreServeMaxBytes, - Duration: histogramWindow, - Buckets: metric.MemoryUsage64MBBuckets, - Mode: metric.HistogramModePrometheus, + Metadata: MetaPreServeMaxBytes, + Duration: histogramWindow, + BucketConfig: metric.MemoryUsage64MBBuckets, + Mode: metric.HistogramModePrometheus, }), PreServeCurBytes: metric.NewGauge(MetaPreServeCurBytes), } diff --git a/pkg/sql/pgwire/server.go b/pkg/sql/pgwire/server.go index 391d2c08a313..96b5e6ab174b 100644 --- a/pkg/sql/pgwire/server.go +++ b/pkg/sql/pgwire/server.go @@ -279,10 +279,10 @@ func newTenantSpecificMetrics( NewConns: metric.NewCounter(MetaNewConns), ConnsWaitingToHash: metric.NewGauge(MetaConnsWaitingToHash), ConnLatency: metric.NewHistogram(metric.HistogramOptions{ - Mode: metric.HistogramModePreferHdrLatency, - Metadata: MetaConnLatency, - Duration: histogramWindow, - Buckets: metric.IOLatencyBuckets, + Mode: metric.HistogramModePreferHdrLatency, + Metadata: MetaConnLatency, + Duration: histogramWindow, + BucketConfig: metric.IOLatencyBuckets, }), ConnFailures: metric.NewCounter(MetaConnFailures), PGWireCancelTotalCount: metric.NewCounter(MetaPGWireCancelTotal), diff --git a/pkg/sql/sqlstats/sslocal/sql_stats_test.go b/pkg/sql/sqlstats/sslocal/sql_stats_test.go index cafc94cda195..e421da694170 100644 --- a/pkg/sql/sqlstats/sslocal/sql_stats_test.go +++ b/pkg/sql/sqlstats/sslocal/sql_stats_test.go @@ -529,11 +529,11 @@ func BenchmarkRecordStatement(b *testing.B) { sqlstats.MaxMemSQLStatsTxnFingerprints, metric.NewGauge(sql.MetaReportedSQLStatsMemCurBytes), /* curMemoryBytesCount */ metric.NewHistogram(metric.HistogramOptions{ - Metadata: sql.MetaReportedSQLStatsMemMaxBytes, - Duration: 10 * time.Second, - MaxVal: 19 * 1000, - SigFigs: 3, - Buckets: metric.MemoryUsage64MBBuckets, + Metadata: sql.MetaReportedSQLStatsMemMaxBytes, + Duration: 10 * time.Second, + MaxVal: 19 * 1000, + SigFigs: 3, + BucketConfig: metric.MemoryUsage64MBBuckets, }), /* maxMemoryBytesHist */ insightsProvider.Writer, monitor, diff --git a/pkg/sql/ttl/ttljob/ttljob_metrics.go b/pkg/sql/ttl/ttljob/ttljob_metrics.go index 7670f44a28f6..1746a0ed4661 100644 --- a/pkg/sql/ttl/ttljob/ttljob_metrics.go +++ b/pkg/sql/ttl/ttljob/ttljob_metrics.go @@ -107,10 +107,10 @@ func makeRowLevelTTLAggMetrics(histogramWindowInterval 
time.Duration) metric.Str Unit: metric.Unit_NANOSECONDS, MetricType: io_prometheus_client.MetricType_HISTOGRAM, }, - MaxVal: time.Hour.Nanoseconds(), - SigFigs: sigFigs, - Duration: histogramWindowInterval, - Buckets: metric.LongRunning60mLatencyBuckets, + MaxVal: time.Hour.Nanoseconds(), + SigFigs: sigFigs, + Duration: histogramWindowInterval, + BucketConfig: metric.LongRunning60mLatencyBuckets, }), SelectDuration: b.Histogram(metric.HistogramOptions{ Metadata: metric.Metadata{ @@ -120,10 +120,10 @@ func makeRowLevelTTLAggMetrics(histogramWindowInterval time.Duration) metric.Str Unit: metric.Unit_NANOSECONDS, MetricType: io_prometheus_client.MetricType_HISTOGRAM, }, - MaxVal: time.Minute.Nanoseconds(), - SigFigs: sigFigs, - Duration: histogramWindowInterval, - Buckets: metric.BatchProcessLatencyBuckets, + MaxVal: time.Minute.Nanoseconds(), + SigFigs: sigFigs, + Duration: histogramWindowInterval, + BucketConfig: metric.BatchProcessLatencyBuckets, }), DeleteDuration: b.Histogram(metric.HistogramOptions{ Metadata: metric.Metadata{ @@ -133,10 +133,10 @@ func makeRowLevelTTLAggMetrics(histogramWindowInterval time.Duration) metric.Str Unit: metric.Unit_NANOSECONDS, MetricType: io_prometheus_client.MetricType_HISTOGRAM, }, - MaxVal: time.Minute.Nanoseconds(), - SigFigs: sigFigs, - Duration: histogramWindowInterval, - Buckets: metric.BatchProcessLatencyBuckets, + MaxVal: time.Minute.Nanoseconds(), + SigFigs: sigFigs, + Duration: histogramWindowInterval, + BucketConfig: metric.BatchProcessLatencyBuckets, }), RowSelections: b.Counter( metric.Metadata{ diff --git a/pkg/util/admission/elastic_cpu_granter.go b/pkg/util/admission/elastic_cpu_granter.go index 02a16af28313..e63dba3aa127 100644 --- a/pkg/util/admission/elastic_cpu_granter.go +++ b/pkg/util/admission/elastic_cpu_granter.go @@ -369,10 +369,10 @@ func makeElasticCPUGranterMetrics() *elasticCPUGranterMetrics { AvailableNanos: metric.NewGauge(elasticCPUAvailableNanos), NanosExhaustedDuration: metric.NewGauge(elasticCPUNanosExhaustedDuration), OverLimitDuration: metric.NewHistogram(metric.HistogramOptions{ - Mode: metric.HistogramModePrometheus, - Metadata: elasticCPUOverLimitDurations, - Duration: base.DefaultHistogramWindowInterval(), - Buckets: metric.IOLatencyBuckets, + Mode: metric.HistogramModePrometheus, + Metadata: elasticCPUOverLimitDurations, + Duration: base.DefaultHistogramWindowInterval(), + BucketConfig: metric.IOLatencyBuckets, }), Utilization: metric.NewGaugeFloat64(elasticCPUGranterUtilization), UtilizationLimit: metric.NewGaugeFloat64(elasticCPUGranterUtilizationLimit), diff --git a/pkg/util/admission/work_queue.go b/pkg/util/admission/work_queue.go index 2284e5472fd1..d5526b11f08a 100644 --- a/pkg/util/admission/work_queue.go +++ b/pkg/util/admission/work_queue.go @@ -1784,10 +1784,10 @@ func makeWorkQueueMetricsSingle(name string) workQueueMetricsSingle { Admitted: metric.NewCounter(addName(name, admittedMeta)), Errored: metric.NewCounter(addName(name, erroredMeta)), WaitDurations: metric.NewHistogram(metric.HistogramOptions{ - Mode: metric.HistogramModePreferHdrLatency, - Metadata: addName(name, waitDurationsMeta), - Duration: base.DefaultHistogramWindowInterval(), - Buckets: metric.IOLatencyBuckets, + Mode: metric.HistogramModePreferHdrLatency, + Metadata: addName(name, waitDurationsMeta), + Duration: base.DefaultHistogramWindowInterval(), + BucketConfig: metric.IOLatencyBuckets, }), WaitQueueLength: metric.NewGauge(addName(name, waitQueueLengthMeta)), } diff --git a/pkg/util/metric/BUILD.bazel 
b/pkg/util/metric/BUILD.bazel index b07dd49b6b97..182865a142fd 100644 --- a/pkg/util/metric/BUILD.bazel +++ b/pkg/util/metric/BUILD.bazel @@ -59,7 +59,6 @@ go_test( "//pkg/testutils/datapathutils", "//pkg/testutils/echotest", "//pkg/util/log", - "@com_github_dustin_go_humanize//:go-humanize", "@com_github_kr_pretty//:pretty", "@com_github_prometheus_client_golang//prometheus", "@com_github_prometheus_client_model//go", diff --git a/pkg/util/metric/aggmetric/agg_metric_test.go b/pkg/util/metric/aggmetric/agg_metric_test.go index 73652961ddea..61b8e9885d9f 100644 --- a/pkg/util/metric/aggmetric/agg_metric_test.go +++ b/pkg/util/metric/aggmetric/agg_metric_test.go @@ -72,10 +72,10 @@ func TestAggMetric(t *testing.T) { Metadata: metric.Metadata{ Name: "histo_gram", }, - Duration: base.DefaultHistogramWindowInterval(), - MaxVal: 100, - SigFigs: 1, - Buckets: metric.Count1KBuckets, + Duration: base.DefaultHistogramWindowInterval(), + MaxVal: 100, + SigFigs: 1, + BucketConfig: metric.Count1KBuckets, }, "tenant_id") r.AddMetric(h) @@ -175,11 +175,11 @@ func TestAggMetricBuilder(t *testing.T) { g := b.Gauge(metric.Metadata{Name: "bar_gauge"}) f := b.GaugeFloat64(metric.Metadata{Name: "baz_gauge"}) h := b.Histogram(metric.HistogramOptions{ - Metadata: metric.Metadata{Name: "histo_gram"}, - Duration: base.DefaultHistogramWindowInterval(), - MaxVal: 100, - SigFigs: 1, - Buckets: metric.Count1KBuckets, + Metadata: metric.Metadata{Name: "histo_gram"}, + Duration: base.DefaultHistogramWindowInterval(), + MaxVal: 100, + SigFigs: 1, + BucketConfig: metric.Count1KBuckets, }) for i := 5; i < 10; i++ { diff --git a/pkg/util/metric/histogram_buckets.go b/pkg/util/metric/histogram_buckets.go index 51c47beea4d1..ee70aa5d194c 100644 --- a/pkg/util/metric/histogram_buckets.go +++ b/pkg/util/metric/histogram_buckets.go @@ -10,333 +10,132 @@ package metric -// IOLatencyBuckets are prometheus histogram buckets suitable for a histogram -// that records a quantity (nanosecond-denominated) in which most measurements -// resemble those of typical disk latencies, i.e. which are in the micro- and -// millisecond range during normal operation. -var IOLatencyBuckets = []float64{ - // Generated via TestHistogramBuckets/IOLatencyBuckets. 
- 10000.000000, // 10µs - 12638.482029, // 12.638µs - 15973.122801, // 15.973µs - 20187.602547, // 20.187µs - 25514.065200, // 25.514µs - 32245.905453, // 32.245µs - 40753.929659, // 40.753µs - 51506.780762, // 51.506µs - 65096.752305, // 65.096µs - 82272.413417, // 82.272µs - 103979.841848, // 103.979µs - 131414.736261, // 131.414µs - 166088.278263, // 166.088µs - 209910.372011, // 209.91µs - 265294.846443, // 265.294µs - 335292.414925, // 335.292µs - 423758.716060, // 423.758µs - 535566.691771, // 535.566µs - 676875.000946, // 676.875µs - 855467.253557, // 855.467µs - 1081180.751077, // 1.08118ms - 1366448.349295, // 1.366448ms - 1726983.290659, // 1.726983ms - 2182644.728397, // 2.182644ms - 2758531.617629, // 2.758531ms - 3486365.227678, // 3.486365ms - 4406236.427774, // 4.406236ms - 5568813.990945, // 5.568813ms - 7038135.554932, // 7.038135ms - 8895134.973108, // 8.895134ms - 11242100.350621, // 11.2421ms - 14208308.325339, // 14.208308ms - 17957144.943716, // 17.957144ms - 22695105.366947, // 22.695105ms - 28683168.133420, // 28.683168ms - 36251170.499885, // 36.25117ms - 45815976.690545, // 45.815976ms - 57904439.806025, // 57.904439ms - 73182422.190762, // 73.182422ms - 92491472.772173, // 92.491472ms - 116895181.649858, // 116.895181ms - 147737765.259851, // 147.737765ms - 186718109.129192, // 186.718109ms - 235983346.678219, // 235.983346ms - 298247128.621688, // 298.247128ms - 376939097.538835, // 376.939097ms - 476393801.040133, // 476.393801ms - 602089449.333611, // 602.089449ms - 760949668.545986, // 760.949668ms - 961724871.115294, // 961.724871ms - 1215474250.076283, // 1.21547425s - 1536174946.671824, // 1.536174946s - 1941491945.743876, // 1.941491945s - 2453751106.639811, // 2.453751106s - 3101168926.574770, // 3.101168926s - 3919406774.847209, // 3.919406774s - 4953535208.959157, // 4.953535208s - 6260516572.014802, // 6.260516572s - 7912342618.981298, // 7.912342618s - 9999999999.999969, // 9.999999999s +import "github.com/prometheus/client_golang/prometheus" + +// staticBucketConfig describes the buckets we want to generate for a specific +// category of metrics. +type staticBucketConfig struct { + category string + min float64 + max float64 + count int + units unitType + distribution distribution } -// BatchProcessLatencyBuckets are prometheus histogram buckets suitable for a -// histogram that records a quantity (nanosecond-denominated) in which most -// measurements are in the seconds to minutes range during normal operation. -var BatchProcessLatencyBuckets = []float64{ - // Generated via TestHistogramBuckets/BatchProcessLatencyBuckets. 
- 500000000.000000, // 500ms - 557259285.358743, // 557.259285ms - 621075822.237074, // 621.075822ms - 692200537.706851, // 692.200537ms - 771470353.934916, // 771.470353ms - 859818036.218456, // 859.818036ms - 958283168.803309, // 958.283168ms - 1068024387.637287, // 1.068024387s - 1190333014.000928, // 1.190333014s - 1326648249.442152, // 1.326648249s - 1478574110.813123, // 1.47857411s - 1647898304.683320, // 1.647898304s - 1836613263.223422, // 1.836613263s - 2046939589.088547, // 2.046939589s - 2281352185.176006, // 2.281352185s - 2542609376.725576, // 2.542609376s - 2833785368.441068, // 2.833785368s - 3158306418.555065, // 3.158306418s - 3519991155.495853, // 3.519991155s - 3923095511.561431, // 3.923095511s - 4372362802.333632, // 4.372362802s - 4873079541.115184, // 4.873079541s - 5431137645.156319, // 5.431137645s - 6053103765.649553, // 6.053103765s - 6746296557.296375, // 6.746296557s - 7518872796.674253, // 7.518872796s - 8379923362.755980, // 8.379923362s - 9339580208.980864, // 9.339580208s - 10409135585.614676, // 10.409135585s - 11601174915.283792, // 11.601174915s - 12929724885.225649, // 12.929724885s - 14410418498.852003, // 14.410418498s - 16060679028.781363, // 16.060679028s - 17899925035.909710, // 17.899925035s - 19949798866.972237, // 19.949798866s - 22234421319.319225, // 22.234421319s - 24780675469.538071, // 24.780675469s - 27618523005.723442, // 27.618523005s - 30781356785.666904, // 30.781356785s - 34306393769.506477, // 34.306393769s - 38235112950.461639, // 38.23511295s - 42613743436.770157, // 42.613743436s - 47493808428.070732, // 47.493808428s - 52932731487.183495, // 52.932731487s - 58994512241.268242, // 58.994512241s - 65750479463.313522, // 1m5.750479463s - 73280130395.441635, // 1m13.280130395s - 81672066190.318619, // 1m21.67206619s - 91025034477.977859, // 1m31.025034477s - 101449091325.905777, // 1m41.449091325s - 113066896265.136261, // 1m53.066896265s - 126015155620.881943, // 2m6.01515562s - 140446231131.326965, // 2m20.446231131s - 156529932783.144257, // 2m36.529932783s - 174455516959.974152, // 2m54.455516959s - 194433913416.010529, // 3m14.433913416s - 216700207279.419586, // 3m36.700207279s - 241516405291.241699, // 4m1.516405291s - 269174518830.019897, // 4m29.17451883s - 300000000000.000854, // 5m0s +// distribution describes the population distribution that best describes the +// metric for which we record histogram data +type distribution int + +const ( + Uniform distribution = iota + Exponential + // TODO(ericharmeling): add more distributions +) + +// unitType describes the unit type of the metric for which we record +// histogram data +type unitType int + +const ( + LATENCY unitType = iota + SIZE + COUNT +) + +var IOLatencyBuckets = staticBucketConfig{ + category: "IOLatencyBuckets", + min: 10e3, // 10µs + max: 10e9, // 10s + count: 60, + units: LATENCY, + distribution: Exponential, } -// LongRunning60mLatencyBuckets are prometheus histogram buckets suitable -// for a histogram that records a quantity (nanosecond-denominated) for -// long-running processes (multiple minutes). -var LongRunning60mLatencyBuckets = []float64{ - // Generated via TestHistogramBuckets/LongRunning60mLatencyBuckets. 
- 500000000.000000, // 500ms - 581230667.894489, // 581.230667ms - 675658178.602148, // 675.658178ms - 785426508.834601, // 785.426508ms - 913027948.623944, // 913.027948ms - 1061359688.770060, // 1.061359688s - 1233789601.560218, // 1.233789601s - 1434232708.312242, // 1.434232708s - 1667240069.936893, // 1.667240069s - 1938102118.779750, // 1.938102118s - 2252968777.892157, // 2.252968777s - 2618989095.039379, // 2.618989095s - 3044473561.836243, // 3.044473561s - 3539082803.466387, // 3.539082803s - 4114046923.185338, // 4.114046923s - 4782420481.824564, // 4.782420481s - 5559378901.606352, // 5.559378901s - 6462563024.118382, // 6.462563024s - 7512479645.637113, // 7.512479645s - 8732967123.954826, // 8.732967123s - 10151736628.313759, // 10.151736628s - 11801001321.527510, // 11.801001321s - 13718207759.870365, // 13.718207759s - 15946886117.169632, // 15.946886117s - 18537638537.439724, // 18.537638537s - 21549288056.605419, // 21.549288056s - 25050214179.583008, // 25.050214179s - 29119905436.998066, // 29.119905436s - 33850764172.341507, // 33.850764172s - 39350204537.257782, // 39.350204537s - 45743091329.950188, // 45.743091329s - 53174575050.531136, // 53.17457505s - 61813387543.251701, // 1m1.813387543s - 71855673053.170151, // 1m11.855673053s - 83529441681.404266, // 1m23.529441681s - 97099746354.672745, // 1m37.099746354s - 112874700852.223846, // 1m52.874700852s - 131212475529.457443, // 2m11.212475529s - 152529429576.151703, // 2m32.529429576s - 177309564452.224213, // 2m57.309564452s - 206115513141.294464, // 3m26.115513141s - 239601314733.059875, // 3m59.601314733s - 278527264381.388123, // 4m38.527264381s - 323777175806.438293, // 5m23.777175806s - 376378448285.935181, // 6m16.378448285s - 437525393756.650940, // 7m17.525393756s - 508606353667.955078, // 8m28.606353667s - 591235221275.612671, // 9m51.235221275s - 687288085089.540771, // 11m27.288085089s - 798945825465.036499, // 13m18.945825465s - 928743631493.114136, // 15m28.743631493s - 1079628562470.991943, // 17m59.62856247s - 1255026460885.963623, // 20m55.026460885s - 1458919736172.010742, // 24m18.919736172s - 1695937785319.419434, // 28m15.937785319s - 1971462103337.413574, // 32m51.462103337s - 2291748470102.958496, // 38m11.748470102s - 2664068987848.231934, // 44m24.068987848s - 3096877194248.046875, // 51m36.877194248s - 3600000000000.007812, // 1h0m0s +var BatchProcessLatencyBuckets = staticBucketConfig{ + category: "BatchProcessLatencyBuckets", + min: 500e6, // 500ms + max: 300e9, // 5m + count: 60, + units: LATENCY, + distribution: Exponential, } -// Count1KBuckets are prometheus histogram buckets suitable for a histogram that -// records a quantity that is a count (unit-less) in which most measurements are -// in the 1 to ~1000 range during normal operation. -var Count1KBuckets = []float64{ - // Generated via TestHistogramBuckets/Count1KBuckets. - 1.000000, - 2.000000, - 4.000000, - 8.000000, - 16.000000, - 32.000000, - 64.000000, - 128.000000, - 256.000000, - 512.000000, - 1024.000000, +var LongRunning60mLatencyBuckets = staticBucketConfig{ + category: "LongRunning60mLatencyBuckets", + min: 500e6, // 500ms + max: 3600e9, // 1h + count: 60, + units: LATENCY, + distribution: Exponential, } -// Percent100Buckets are prometheus histogram buckets suitable for a histogram that -// records a percent quantity [0,100] -var Percent100Buckets = []float64{ - // Generated via TestHistogramBuckets/Percent100Buckets. 
- 10.000000, - 20.000000, - 30.000000, - 40.000000, - 50.000000, - 60.000000, - 70.000000, - 80.000000, - 90.000000, - 100.000000, +var DataSize16MBBuckets = staticBucketConfig{ + category: "DataSize16MBBuckets", + min: 1e3, // 1kB + max: 16384e3, // 16MB + count: 15, + units: SIZE, + distribution: Exponential, } -// DataSize16MBBuckets are prometheus histogram buckets suitable for a histogram that -// records a quantity that is a size (byte-denominated) in which most measurements are -// in the kB to MB range during normal operation. -var DataSize16MBBuckets = []float64{ - // Generated via TestHistogramBuckets/DataSize16MBBuckets. - 1000.000000, // 1.0 kB - 2000.000000, // 2.0 kB - 4000.000000, // 4.0 kB - 8000.000000, // 8.0 kB - 16000.000000, // 16 kB - 32000.000000, // 32 kB - 64000.000000, // 64 kB - 128000.000000, // 128 kB - 256000.000000, // 256 kB - 512000.000000, // 512 kB - 1024000.000000, // 1.0 MB - 2048000.000000, // 2.0 MB - 4096000.000000, // 4.1 MB - 8192000.000000, // 8.2 MB - 16384000.000000, // 16 MB +var MemoryUsage64MBBuckets = staticBucketConfig{ + category: "MemoryUsage64MBBuckets", + min: 1, // 1B + max: 64e6, // 64MB + count: 15, + units: SIZE, + distribution: Exponential, } -// MemoryUsage64MBBuckets are prometheus histogram buckets suitable for a histogram that -// records memory usage (in Bytes) -var MemoryUsage64MBBuckets = []float64{ - // Generated via TestHistogramBuckets/MemoryUsage64MBBuckets. - 1.000000, // 1 B - 3.610641, // 3 B - 13.036727, // 13 B - 47.070938, // 47 B - 169.956248, // 169 B - 613.650962, // 613 B - 2215.673192, // 2.2 kB - 8000.000000, // 8.0 kB - 28885.126301, // 29 kB - 104293.815179, // 104 kB - 376567.502984, // 377 kB - 1359649.985574, // 1.4 MB - 4909207.694830, // 4.9 MB - 17725385.537954, // 18 MB - 64000000.000000, // 64 MB +var ReplicaCPUTimeBuckets = staticBucketConfig{ + category: "ReplicaCPUTimeBuckets", + min: 50e4, // 500µs + max: 5e9, // 5s + count: 20, + units: LATENCY, + distribution: Exponential, } -var ReplicaCPUTimeBuckets = []float64{ - 500000.000000, // 500µs - 811888.369594, // 811.888µs - 1318325.449365, // 1.318325ms - 2140666.199360, // 2.140666ms - 3475963.980888, // 3.475963ms - 5644189.458423, // 5.644189ms - 9164903.554162, // 9.164903ms - 14881757.208157, // 14.881757ms - 24164651.192859, // 24.164651ms - 39237998.517573, // 39.237998ms - 63713749.285157, // 63.713749ms - 103456904.055739, // 103.456904ms - 167990914.314189, // 167.990914ms - 272779739.058426, // 272.779739ms - 442933395.205041, // 442.933395ms - 719224944.143830, // 719.224944ms - 1167860734.545059, // 1.167860734s - 1896345095.366121, // 1.896345095s - 3079241055.330125, // 3.079241055s - 4999999999.999990, // 4.999999999s +var ReplicaBatchRequestCountBuckets = staticBucketConfig{ + category: "ReplicaBatchRequestCountBuckets", + min: 1, + max: 16e3, + count: 20, + units: COUNT, + distribution: Exponential, } -// ReplicaBatchRequestCountBuckets are prometheus histogram buckets suitable -// for a histogram that records request counts to a replica. NOTE: The default -// load based split threshold is 2500 Requests (>= BatchRequests) when QPS -// splitting is enabled. We don't expect more than 2500 batch requests for a -// replica in most clusters. However with CPU splits (default), this no longer -// holds. 
-var ReplicaBatchRequestCountBuckets = []float64{
-	1.000000,
-	1.664445,
-	2.770377,
-	4.611141,
-	7.674991,
-	12.774602,
-	21.262623,
-	35.390468,
-	58.905491,
-	98.044956,
-	163.190445,
-	271.621536,
-	452.099132,
-	752.494181,
-	1252.485246,
-	2084.692921,
-	3469.856899,
-	5775.386284,
-	9612.813352,
-	16000.000000,
+var Count1KBuckets = staticBucketConfig{
+	category:     "Count1KBuckets",
+	min:          1,
+	max:          1024,
+	count:        11,
+	units:        COUNT,
+	distribution: Exponential,
+}
+var Percent100Buckets = staticBucketConfig{
+	category:     "Percent100Buckets",
+	min:          0,
+	max:          100,
+	count:        10,
+	units:        COUNT,
+	distribution: Uniform,
+}
+
+var StaticBucketConfigs = []staticBucketConfig{IOLatencyBuckets,
+	BatchProcessLatencyBuckets, LongRunning60mLatencyBuckets,
+	DataSize16MBBuckets, MemoryUsage64MBBuckets, ReplicaCPUTimeBuckets,
+	ReplicaBatchRequestCountBuckets, Count1KBuckets, Percent100Buckets}
+
+func (config staticBucketConfig) GetBucketsFromBucketConfig() []float64 {
+	var buckets []float64
+	if config.distribution == Uniform {
+		width := (config.max - config.min) / float64(config.count)
+		buckets = prometheus.LinearBuckets(config.min, width, config.count)
+	} else if config.distribution == Exponential {
+		buckets = prometheus.ExponentialBucketsRange(config.min, config.max,
+			config.count)
+	}
+	return buckets
 }
diff --git a/pkg/util/metric/histogram_buckets_test.go b/pkg/util/metric/histogram_buckets_test.go
index 54e7b11dc4d4..65dd2f3be734 100644
--- a/pkg/util/metric/histogram_buckets_test.go
+++ b/pkg/util/metric/histogram_buckets_test.go
@@ -14,82 +14,32 @@ import (
 	"fmt"
 	"strings"
 	"testing"
-	"time"
 
-	"github.com/dustin/go-humanize"
-	"github.com/prometheus/client_golang/prometheus"
-	"github.com/stretchr/testify/require"
+	"github.com/cockroachdb/cockroach/pkg/testutils/datapathutils"
+	"github.com/cockroachdb/cockroach/pkg/testutils/echotest"
 )
 
-const LATENCY = "LATENCY"
-const SIZE = "SIZE"
-
 // TestHistogramBuckets is used to generate additional prometheus buckets to be
 // used with Histogram. Please include obs-inf in the review process of new
 // buckets.
 func TestHistogramBuckets(t *testing.T) {
-	verifyAndPrint := func(t *testing.T, exp, act []float64, histType string) {
+	verifyAndPrint := func(t *testing.T, exp []float64, category string) string {
 		t.Helper()
 		var buf strings.Builder
 		for idx, f := range exp {
 			if idx == 0 {
-				fmt.Fprintf(&buf, "// Generated via %s.", t.Name())
-			}
-			switch histType {
-			case LATENCY:
-				fmt.Fprintf(&buf, "\n%f, // %s", f, time.Duration(f))
-			case SIZE:
-				fmt.Fprintf(&buf, "\n%f, // %s", f, humanize.Bytes(uint64(f)))
-			default:
-				fmt.Fprintf(&buf, "\n%f,", f)
+				fmt.Fprintf(&buf, "%s", category)
 			}
+			fmt.Fprintf(&buf, "\n%f", f)
 		}
-		t.Logf("%s", &buf)
-		require.InDeltaSlice(t, exp, act, 1 /* delta */, "Please update the bucket boundaries for %s", t.Name())
+		return buf.String()
 	}
-	t.Run("IOLatencyBuckets", func(t *testing.T) {
-		exp := prometheus.ExponentialBucketsRange(10e3, 10e9, 60)
-		verifyAndPrint(t, exp, IOLatencyBuckets, LATENCY)
-	})
-
-	t.Run("BatchProcessLatencyBuckets", func(t *testing.T) {
-		exp := prometheus.ExponentialBucketsRange(500e6, 300e9, 60)
-		verifyAndPrint(t, exp, BatchProcessLatencyBuckets, LATENCY)
-	})
-
-	t.Run("LongRunning60mLatencyBuckets", func(t *testing.T) {
-		exp := prometheus.ExponentialBucketsRange(500e6, 3600e9, 60)
-		verifyAndPrint(t, exp, LongRunning60mLatencyBuckets, LATENCY)
-	})
-
-	t.Run("Count1KBuckets", func(t *testing.T) {
-		exp := prometheus.ExponentialBuckets(1, 2, 11)
-		verifyAndPrint(t, exp, Count1KBuckets, "")
-	})
-	t.Run("Percent100Buckets", func(t *testing.T) {
-		exp := prometheus.LinearBuckets(10, 10, 10)
-		verifyAndPrint(t, exp, Percent100Buckets, "")
-	})
+	for _, config := range StaticBucketConfigs {
+		exp := config.GetBucketsFromBucketConfig()
+		buf := verifyAndPrint(t, exp, config.category)
 
-	t.Run("DataSize16MBBuckets", func(t *testing.T) {
-		exp := prometheus.ExponentialBuckets(1e3, 2, 15)
-		verifyAndPrint(t, exp, DataSize16MBBuckets, SIZE)
-	})
-
-	t.Run("MemoryUsage64MBBuckets", func(t *testing.T) {
-		exp := prometheus.ExponentialBucketsRange(1, 64e6, 15)
-		verifyAndPrint(t, exp, MemoryUsage64MBBuckets, SIZE)
-	})
-
-	t.Run("ReplicaCPUTimeBuckets", func(t *testing.T) {
-		exp := prometheus.ExponentialBucketsRange(50e4 /* 50µs */, 5e9 /* 5s */, 20)
-		verifyAndPrint(t, exp, ReplicaCPUTimeBuckets, LATENCY)
-	})
-
-	t.Run("ReplicaBatchRequestCountBuckets", func(t *testing.T) {
-		exp := prometheus.ExponentialBucketsRange(1, 16e3, 20)
-		verifyAndPrint(t, exp, ReplicaBatchRequestCountBuckets, "")
-	})
+		echotest.Require(t, buf, datapathutils.TestDataPath(t, config.category))
+	}
 }
diff --git a/pkg/util/metric/metric.go b/pkg/util/metric/metric.go
index e3508c2ebdb3..9c62c0cf641f 100644
--- a/pkg/util/metric/metric.go
+++ b/pkg/util/metric/metric.go
@@ -246,6 +246,9 @@ type HistogramOptions struct {
 	// Buckets are only relevant to Prometheus histograms, and represent
 	// the pre-defined histogram bucket boundaries to be used.
 	Buckets []float64
+	// BucketConfig is only relevant to Prometheus histograms, and represents
+	// the pre-defined histogram bucket configuration used to generate buckets.
+	BucketConfig staticBucketConfig
 	// Mode defines the type of histogram to be used. See individual
 	// comments on each HistogramMode value for details.
 	Mode HistogramMode
@@ -259,16 +262,24 @@ func NewHistogram(opt HistogramOptions) IHistogram {
 			return NewHdrHistogram(opt.Metadata, opt.Duration, opt.MaxVal, opt.SigFigs)
 		}
 	} else {
-		return newHistogram(opt.Metadata, opt.Duration, opt.Buckets)
+		return newHistogram(opt.Metadata, opt.Duration, opt.Buckets,
+			opt.BucketConfig)
 	}
 }
 
 // NewHistogram is a prometheus-backed histogram. Depending on the value of
 // opts.Buckets, this is suitable for recording any kind of quantity. Common
 // sensible choices are {IO,Network}LatencyBuckets.
-func newHistogram(meta Metadata, duration time.Duration, buckets []float64) *Histogram {
+func newHistogram(
+	meta Metadata, duration time.Duration, buckets []float64, bucketConfig staticBucketConfig,
+) *Histogram {
 	// TODO(obs-inf): prometheus supports labeled histograms but they require more
 	// plumbing and don't fit into the PrometheusObservable interface any more.
+
+	// If no buckets are provided, generate buckets from bucket configuration
+	if buckets == nil && bucketConfig.count != 0 {
+		buckets = bucketConfig.GetBucketsFromBucketConfig()
+	}
 	opts := prometheus.HistogramOpts{
 		Buckets: buckets,
 	}
diff --git a/pkg/util/metric/metric_test.go b/pkg/util/metric/metric_test.go
index a0f9cde324f3..c58c5a79fe21 100644
--- a/pkg/util/metric/metric_test.go
+++ b/pkg/util/metric/metric_test.go
@@ -276,7 +276,6 @@ func TestNewHistogramRotate(t *testing.T) {
 		Mode:     HistogramModePrometheus,
 		Metadata: emptyMetadata,
 		Duration: 10 * time.Second,
-		Buckets:  nil,
 	})
 	for i := 0; i < 4; i++ {
 		// Windowed histogram is initially empty.
@@ -306,10 +305,10 @@ func TestHistogramWindowed(t *testing.T) {
 	duration := 10 * time.Second
 
 	h := NewHistogram(HistogramOptions{
-		Mode:     HistogramModePrometheus,
-		Metadata: Metadata{},
-		Duration: duration,
-		Buckets:  IOLatencyBuckets,
+		Mode:         HistogramModePrometheus,
+		Metadata:     Metadata{},
+		Duration:     duration,
+		BucketConfig: IOLatencyBuckets,
 	})
 
 	measurements := []int64{200000000, 0, 4000000, 5000000, 10000000, 20000000,
@@ -327,6 +326,8 @@ func TestHistogramWindowed(t *testing.T) {
 	// greater than each measurement.
 	count := 0
 	j := 0
+	IOLatencyBuckets := IOLatencyBuckets.
+		GetBucketsFromBucketConfig()
 	var expQuantileValues []float64
 	for i := range IOLatencyBuckets {
 		if j < len(sortedMeasurements) && IOLatencyBuckets[i] > float64(
@@ -400,7 +401,8 @@ func TestHistogramWindowed(t *testing.T) {
 func TestMergeWindowedHistogram(t *testing.T) {
 	measurements := []int64{4000000, 90000000}
 	opts := prometheus.HistogramOpts{
-		Buckets: IOLatencyBuckets,
+		Buckets: IOLatencyBuckets.
+			GetBucketsFromBucketConfig(),
 	}
 	prevWindow := prometheus.NewHistogram(opts)
diff --git a/pkg/util/metric/registry_test.go b/pkg/util/metric/registry_test.go
index 48f4aba216bd..0162aeba7573 100644
--- a/pkg/util/metric/registry_test.go
+++ b/pkg/util/metric/registry_test.go
@@ -77,10 +77,10 @@ func TestRegistry(t *testing.T) {
 	r.AddMetric(topCounter)
 	r.AddMetric(NewHistogram(HistogramOptions{
-		Mode:     HistogramModePrometheus,
-		Metadata: Metadata{Name: "top.histogram"},
-		Duration: time.Minute,
-		Buckets:  Count1KBuckets,
+		Mode:         HistogramModePrometheus,
+		Metadata:     Metadata{Name: "top.histogram"},
+		Duration:     time.Minute,
+		BucketConfig: Count1KBuckets,
 	}))
 	r.AddMetric(NewGauge(Metadata{Name: "bottom.gauge"}))
@@ -109,10 +109,10 @@ func TestRegistry(t *testing.T) {
 		StructGauge64: NewGaugeFloat64(Metadata{Name: "struct.gauge64"}),
 		StructCounter: NewCounter(Metadata{Name: "struct.counter"}),
 		StructHistogram: NewHistogram(HistogramOptions{
-			Mode:     HistogramModePrometheus,
-			Metadata: Metadata{Name: "struct.histogram"},
-			Duration: time.Minute,
-			Buckets:  Count1KBuckets,
+			Mode:         HistogramModePrometheus,
+			Metadata:     Metadata{Name: "struct.histogram"},
+			Duration:     time.Minute,
+			BucketConfig: Count1KBuckets,
 		}),
 		NestedStructGauge: NestedStruct{
 			NestedStructGauge: NewGauge(Metadata{Name: "nested.struct.gauge"}),
@@ -133,10 +133,10 @@ func TestRegistry(t *testing.T) {
 		privateStructGauge64: NewGaugeFloat64(Metadata{Name: "private.struct.gauge64"}),
 		privateStructCounter: NewCounter(Metadata{Name: "private.struct.counter"}),
 		privateStructHistogram: NewHistogram(HistogramOptions{
-			Mode:     HistogramModePrometheus,
-			Metadata: Metadata{Name: "private.struct.histogram"},
-			Duration: time.Minute,
-			Buckets:  Count1KBuckets,
+			Mode:         HistogramModePrometheus,
+			Metadata:     Metadata{Name: "private.struct.histogram"},
+			Duration:     time.Minute,
+			BucketConfig: Count1KBuckets,
 		}),
 		privateNestedStructGauge: NestedStruct{
 			NestedStructGauge: NewGauge(Metadata{Name: "private.nested.struct.gauge"}),
diff --git a/pkg/util/metric/testdata/BatchProcessLatencyBuckets b/pkg/util/metric/testdata/BatchProcessLatencyBuckets
new file mode 100644
index 000000000000..eac33a1e4e98
--- /dev/null
+++ b/pkg/util/metric/testdata/BatchProcessLatencyBuckets
@@ -0,0 +1,63 @@
+echo
+----
+BatchProcessLatencyBuckets
+500000000.000000
+557259285.358743
+621075822.237074
+692200537.706851
+771470353.934916
+859818036.218456
+958283168.803309
+1068024387.637287
+1190333014.000928
+1326648249.442152
+1478574110.813123
+1647898304.683320
+1836613263.223422
+2046939589.088547
+2281352185.176006
+2542609376.725576
+2833785368.441068
+3158306418.555065
+3519991155.495853
+3923095511.561431
+4372362802.333632
+4873079541.115184
+5431137645.156319
+6053103765.649553
+6746296557.296375
+7518872796.674253
+8379923362.755980
+9339580208.980864
+10409135585.614676
+11601174915.283792
+12929724885.225649
+14410418498.852003
+16060679028.781363
+17899925035.909710
+19949798866.972237
+22234421319.319225
+24780675469.538071
+27618523005.723442
+30781356785.666904
+34306393769.506477
+38235112950.461639
+42613743436.770157
+47493808428.070732
+52932731487.183495
+58994512241.268242
+65750479463.313522
+73280130395.441635
+81672066190.318619
+91025034477.977859
+101449091325.905777
+113066896265.136261
+126015155620.881943
+140446231131.326965
+156529932783.144257
+174455516959.974152
+194433913416.010529
+216700207279.419586
+241516405291.241699
+269174518830.019897
+300000000000.000854
diff --git a/pkg/util/metric/testdata/Count1KBuckets b/pkg/util/metric/testdata/Count1KBuckets
new file mode 100644
index 000000000000..c67ca55e52b4
--- /dev/null
+++ b/pkg/util/metric/testdata/Count1KBuckets
@@ -0,0 +1,14 @@
+echo
+----
+Count1KBuckets
+1.000000
+2.000000
+4.000000
+8.000000
+16.000000
+32.000000
+64.000000
+128.000000
+256.000000
+512.000000
+1024.000000
diff --git a/pkg/util/metric/testdata/DataSize16MBBuckets b/pkg/util/metric/testdata/DataSize16MBBuckets
new file mode 100644
index 000000000000..332273fa6d4b
--- /dev/null
+++ b/pkg/util/metric/testdata/DataSize16MBBuckets
@@ -0,0 +1,18 @@
+echo
+----
+DataSize16MBBuckets
+1000.000000
+2000.000000
+4000.000000
+8000.000000
+16000.000000
+32000.000000
+64000.000000
+128000.000000
+256000.000000
+512000.000000
+1024000.000000
+2048000.000000
+4096000.000000
+8192000.000000
+16384000.000000
diff --git a/pkg/util/metric/testdata/IOLatencyBuckets b/pkg/util/metric/testdata/IOLatencyBuckets
new file mode 100644
index 000000000000..5efb03f6dd0b
--- /dev/null
+++ b/pkg/util/metric/testdata/IOLatencyBuckets
@@ -0,0 +1,63 @@
+echo
+----
+IOLatencyBuckets
+10000.000000
+12638.482029
+15973.122801
+20187.602547
+25514.065200
+32245.905453
+40753.929659
+51506.780762
+65096.752305
+82272.413417
+103979.841848
+131414.736261
+166088.278263
+209910.372011
+265294.846443
+335292.414925
+423758.716060
+535566.691771
+676875.000946
+855467.253557
+1081180.751077
+1366448.349295
+1726983.290659
+2182644.728397
+2758531.617629
+3486365.227678
+4406236.427774
+5568813.990945
+7038135.554932
+8895134.973108
+11242100.350621
+14208308.325339
+17957144.943716
+22695105.366947
+28683168.133420
+36251170.499885
+45815976.690545
+57904439.806025
+73182422.190762
+92491472.772173
+116895181.649858
+147737765.259851
+186718109.129192
+235983346.678219
+298247128.621688
+376939097.538835
+476393801.040133
+602089449.333611
+760949668.545986
+961724871.115294
+1215474250.076283
+1536174946.671824
+1941491945.743876
+2453751106.639811
+3101168926.574770
+3919406774.847209
+4953535208.959157
+6260516572.014802
+7912342618.981298
+9999999999.999969
diff --git a/pkg/util/metric/testdata/LongRunning60mLatencyBuckets b/pkg/util/metric/testdata/LongRunning60mLatencyBuckets
new file mode 100644
index 000000000000..21f6895607e2
--- /dev/null
+++ b/pkg/util/metric/testdata/LongRunning60mLatencyBuckets
@@ -0,0 +1,63 @@
+echo
+----
+LongRunning60mLatencyBuckets
+500000000.000000
+581230667.894489
+675658178.602148
+785426508.834601
+913027948.623944
+1061359688.770060
+1233789601.560218
+1434232708.312242
+1667240069.936893
+1938102118.779750
+2252968777.892157
+2618989095.039379
+3044473561.836243
+3539082803.466387
+4114046923.185338
+4782420481.824564
+5559378901.606352
+6462563024.118382
+7512479645.637113
+8732967123.954826
+10151736628.313759
+11801001321.527510
+13718207759.870365
+15946886117.169632
+18537638537.439724
+21549288056.605419
+25050214179.583008
+29119905436.998066
+33850764172.341507
+39350204537.257782
+45743091329.950188
+53174575050.531136
+61813387543.251701
+71855673053.170151
+83529441681.404266
+97099746354.672745
+112874700852.223846
+131212475529.457443
+152529429576.151703
+177309564452.224213
+206115513141.294464
+239601314733.059875
+278527264381.388123
+323777175806.438293
+376378448285.935181
+437525393756.650940
+508606353667.955078
+591235221275.612671
+687288085089.540771
+798945825465.036499
+928743631493.114136
+1079628562470.991943
+1255026460885.963623
+1458919736172.010742
+1695937785319.419434
+1971462103337.413574
+2291748470102.958496
+2664068987848.231934
+3096877194248.046875
+3600000000000.007812
diff --git a/pkg/util/metric/testdata/MemoryUsage64MBBuckets b/pkg/util/metric/testdata/MemoryUsage64MBBuckets
new file mode 100644
index 000000000000..14d20759b92e
--- /dev/null
+++ b/pkg/util/metric/testdata/MemoryUsage64MBBuckets
@@ -0,0 +1,18 @@
+echo
+----
+MemoryUsage64MBBuckets
+1.000000
+3.610641
+13.036727
+47.070938
+169.956248
+613.650962
+2215.673192
+8000.000000
+28885.126301
+104293.815179
+376567.502984
+1359649.985574
+4909207.694830
+17725385.537954
+64000000.000000
diff --git a/pkg/util/metric/testdata/Percent100Buckets b/pkg/util/metric/testdata/Percent100Buckets
new file mode 100644
index 000000000000..8b7d4b5bf693
--- /dev/null
+++ b/pkg/util/metric/testdata/Percent100Buckets
@@ -0,0 +1,13 @@
+echo
+----
+Percent100Buckets
+0.000000
+10.000000
+20.000000
+30.000000
+40.000000
+50.000000
+60.000000
+70.000000
+80.000000
+90.000000
diff --git a/pkg/util/metric/testdata/ReplicaBatchRequestCountBuckets b/pkg/util/metric/testdata/ReplicaBatchRequestCountBuckets
new file mode 100644
index 000000000000..7e86fbdf5219
--- /dev/null
+++ b/pkg/util/metric/testdata/ReplicaBatchRequestCountBuckets
@@ -0,0 +1,23 @@
+echo
+----
+ReplicaBatchRequestCountBuckets
+1.000000
+1.664445
+2.770377
+4.611141
+7.674991
+12.774602
+21.262623
+35.390468
+58.905491
+98.044956
+163.190445
+271.621536
+452.099132
+752.494181
+1252.485246
+2084.692921
+3469.856899
+5775.386284
+9612.813352
+16000.000000
diff --git a/pkg/util/metric/testdata/ReplicaCPUTimeBuckets b/pkg/util/metric/testdata/ReplicaCPUTimeBuckets
new file mode 100644
index 000000000000..e9c3c6ce24cf
--- /dev/null
+++ b/pkg/util/metric/testdata/ReplicaCPUTimeBuckets
@@ -0,0 +1,23 @@
+echo
+----
+ReplicaCPUTimeBuckets
+500000.000000
+811888.369594
+1318325.449365
+2140666.199360
+3475963.980888
+5644189.458423
+9164903.554162
+14881757.208157
+24164651.192859
+39237998.517573
+63713749.285157
+103456904.055739
+167990914.314189
+272779739.058426
+442933395.205041
+719224944.143830
+1167860734.545059
+1896345095.366121
+3079241055.330125
+4999999999.999990
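
For reviewers: below is a minimal, self-contained sketch of the bucket-generation
scheme this patch introduces, runnable outside the cockroach tree. The
staticBucketConfig fields and GetBucketsFromBucketConfig mirror the patch; the
units field and the LATENCY/SIZE/COUNT constants are omitted for brevity, and
main() is illustrative only.

	package main

	import (
		"fmt"

		"github.com/prometheus/client_golang/prometheus"
	)

	type distribution int

	const (
		Uniform distribution = iota
		Exponential
	)

	type staticBucketConfig struct {
		category     string
		min          float64
		max          float64
		count        int
		distribution distribution
	}

	// GetBucketsFromBucketConfig expands a config into concrete bucket
	// boundaries: Uniform configs produce count equal-width lower bounds
	// starting at min; Exponential configs produce count exponentially
	// spaced lower bounds between min and max.
	func (config staticBucketConfig) GetBucketsFromBucketConfig() []float64 {
		var buckets []float64
		if config.distribution == Uniform {
			width := (config.max - config.min) / float64(config.count)
			buckets = prometheus.LinearBuckets(config.min, width, config.count)
		} else if config.distribution == Exponential {
			buckets = prometheus.ExponentialBucketsRange(config.min, config.max,
				config.count)
		}
		return buckets
	}

	func main() {
		count1K := staticBucketConfig{
			category:     "Count1KBuckets",
			min:          1,
			max:          1024,
			count:        11,
			distribution: Exponential,
		}
		// Prints the 11 boundaries 1, 2, 4, ..., 1024, matching
		// testdata/Count1KBuckets.
		fmt.Println(count1K.GetBucketsFromBucketConfig())
	}

One behavioral note on the Uniform path: it yields count lower bounds starting
at min, so Percent100Buckets (min 0, max 100, count 10) now generates 0 through
90 rather than the previous 10 through 100, as the testdata above reflects.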