From a28aa6cbb4b0de3ff92b4dcc74e319d2e2a220c2 Mon Sep 17 00:00:00 2001 From: David Hartunian Date: Thu, 26 Jan 2023 12:35:30 -0500 Subject: [PATCH 1/2] pkg/util/metric: optionally reintroduce legacy hdrhistogram model Addresses https://github.com/cockroachdb/cockroach/issues/95833 This patch reeintroduces the old HdrHistogram model to optionally be enabled in favor of the new Prometheus model, gated behind an environment variable called `COCKROACH_ENABLE_HDR_HISTOGRAMS`, allowing users a means to "fall back" to the old model in the event that the new model does not adequately serve their needs (think of this as an "insurance policy" to protect against this from happening again with no real mitigation - ideally, this environment variable should never have to be used). Note: some histograms were introduced *after* the new Prometheus histograms were added to CockroachDB. In this case, we use the `ForceUsePrometheus` option in the `HistogramOptions` struct to ignore the value of the env var, since there never was a time where these specific histograms used the HdrHistogram model. Release note (ops change): Histogram metrics can now optionally use the legacy HdrHistogram model by setting the environment var `COCKROACH_ENABLE_HDR_HISTOGRAMS=true` on CockroachDB nodes. **Note that this is not recommended** unless users are having difficulties with the newer Prometheus-backed histogram model. Enabling can cause performance issues with timeseries databases like Prometheus, as processing and storing the increased number of buckets is taxing on both CPU and storage. Note that the HdrHistogram model is slated for full deprecation in upcoming releases. --- pkg/ccl/changefeedccl/metrics.go | 96 +++++-- pkg/ccl/sqlproxyccl/connector.go | 2 +- pkg/ccl/sqlproxyccl/connector_test.go | 27 +- pkg/ccl/sqlproxyccl/metrics.go | 61 +++-- pkg/ccl/streamingccl/streamingest/metrics.go | 42 ++- pkg/kv/bulk/bulk_metrics.go | 14 +- pkg/kv/kvclient/kvcoord/txn_metrics.go | 45 ++-- pkg/kv/kvprober/kvprober.go | 22 +- pkg/kv/kvserver/liveness/liveness.go | 11 +- pkg/kv/kvserver/logstore/logstore.go | 2 +- .../kvserver/logstore/logstore_bench_test.go | 7 +- pkg/kv/kvserver/metrics.go | 89 ++++--- pkg/kv/kvserver/scheduler.go | 2 +- pkg/kv/kvserver/txnwait/metrics.go | 28 +- pkg/rpc/clock_offset.go | 11 +- pkg/server/node.go | 11 +- pkg/server/status/recorder_test.go | 7 +- pkg/sql/conn_executor.go | 87 +++--- pkg/sql/execinfra/metrics.go | 35 ++- pkg/sql/executor_statement_metrics.go | 18 +- pkg/sql/mem_metrics.go | 48 +++- pkg/sql/pgwire/pre_serve.go | 11 +- pkg/sql/pgwire/server.go | 11 +- .../sqlstats/persistedsqlstats/provider.go | 2 +- pkg/sql/sqlstats/sslocal/sql_stats.go | 2 +- pkg/sql/sqlstats/sslocal/sslocal_provider.go | 2 +- pkg/sql/ttl/ttljob/ttljob_metrics.go | 37 +-- pkg/util/admission/work_queue.go | 11 +- pkg/util/metric/BUILD.bazel | 4 + pkg/util/metric/aggmetric/BUILD.bazel | 1 - pkg/util/metric/aggmetric/agg_metric.go | 7 +- pkg/util/metric/aggmetric/agg_metric_test.go | 43 ++- pkg/util/metric/aggmetric/histogram.go | 27 +- .../testdata/add_after_destroy_hdr.txt | 23 ++ .../metric/aggmetric/testdata/basic_hdr.txt | 24 ++ .../metric/aggmetric/testdata/destroy_hdr.txt | 17 ++ pkg/util/metric/hdrhistogram.go | 248 ++++++++++++++++++ pkg/util/metric/metric.go | 101 ++++++- pkg/util/metric/metric_ext_test.go | 7 +- pkg/util/metric/metric_test.go | 28 +- pkg/util/metric/registry_test.go | 37 ++- pkg/util/mon/bytes_usage.go | 10 +- .../scheduler_latency_test.go | 7 +- 43 files changed, 1017 insertions(+), 308 deletions(-) create mode 100644 pkg/util/metric/aggmetric/testdata/add_after_destroy_hdr.txt create mode 100644 pkg/util/metric/aggmetric/testdata/basic_hdr.txt create mode 100644 pkg/util/metric/aggmetric/testdata/destroy_hdr.txt create mode 100644 pkg/util/metric/hdrhistogram.go diff --git a/pkg/ccl/changefeedccl/metrics.go b/pkg/ccl/changefeedccl/metrics.go index 4b042273df8a..45578bc6645e 100644 --- a/pkg/ccl/changefeedccl/metrics.go +++ b/pkg/ccl/changefeedccl/metrics.go @@ -28,6 +28,14 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/timeutil" ) +const ( + changefeedCheckpointHistMaxLatency = 30 * time.Second + changefeedBatchHistMaxLatency = 30 * time.Second + changefeedFlushHistMaxLatency = 1 * time.Minute + admitLatencyMaxValue = 1 * time.Minute + commitLatencyMaxValue = 10 * time.Minute +) + // max length for the scope name. const maxSLIScopeNameLen = 128 @@ -488,16 +496,46 @@ func newAggregateMetrics(histogramWindow time.Duration) *AggMetrics { ErrorRetries: b.Counter(metaChangefeedErrorRetries), EmittedMessages: b.Counter(metaChangefeedEmittedMessages), FilteredMessages: b.Counter(metaChangefeedFilteredMessages), - MessageSize: b.Histogram(metaMessageSize, histogramWindow, metric.DataSize16MBBuckets), + MessageSize: b.Histogram(metric.HistogramOptions{ + Metadata: metaMessageSize, + Duration: histogramWindow, + MaxVal: 10 << 20, /* 10MB max message size */ + SigFigs: 1, + Buckets: metric.DataSize16MBBuckets, + }), EmittedBytes: b.Counter(metaChangefeedEmittedBytes), FlushedBytes: b.Counter(metaChangefeedFlushedBytes), Flushes: b.Counter(metaChangefeedFlushes), SizeBasedFlushes: b.Counter(metaSizeBasedFlushes), - BatchHistNanos: b.Histogram(metaChangefeedBatchHistNanos, histogramWindow, metric.BatchProcessLatencyBuckets), - FlushHistNanos: b.Histogram(metaChangefeedFlushHistNanos, histogramWindow, metric.BatchProcessLatencyBuckets), - CommitLatency: b.Histogram(metaCommitLatency, histogramWindow, metric.BatchProcessLatencyBuckets), - AdmitLatency: b.Histogram(metaAdmitLatency, histogramWindow, metric.BatchProcessLatencyBuckets), + BatchHistNanos: b.Histogram(metric.HistogramOptions{ + Metadata: metaChangefeedBatchHistNanos, + Duration: histogramWindow, + MaxVal: changefeedBatchHistMaxLatency.Nanoseconds(), + SigFigs: 1, + Buckets: metric.BatchProcessLatencyBuckets, + }), + FlushHistNanos: b.Histogram(metric.HistogramOptions{ + Metadata: metaChangefeedFlushHistNanos, + Duration: histogramWindow, + MaxVal: changefeedFlushHistMaxLatency.Nanoseconds(), + SigFigs: 2, + Buckets: metric.BatchProcessLatencyBuckets, + }), + CommitLatency: b.Histogram(metric.HistogramOptions{ + Metadata: metaCommitLatency, + Duration: histogramWindow, + MaxVal: commitLatencyMaxValue.Nanoseconds(), + SigFigs: 1, + Buckets: metric.BatchProcessLatencyBuckets, + }), + AdmitLatency: b.Histogram(metric.HistogramOptions{ + Metadata: metaAdmitLatency, + Duration: histogramWindow, + MaxVal: admitLatencyMaxValue.Nanoseconds(), + SigFigs: 1, + Buckets: metric.BatchProcessLatencyBuckets, + }), BackfillCount: b.Gauge(metaChangefeedBackfillCount), BackfillPendingRanges: b.Gauge(metaChangefeedBackfillPendingRanges), RunningCount: b.Gauge(metaChangefeedRunning), @@ -572,12 +610,12 @@ type Metrics struct { Failures *metric.Counter ResolvedMessages *metric.Counter QueueTimeNanos *metric.Counter - CheckpointHistNanos *metric.Histogram + CheckpointHistNanos metric.IHistogram FrontierUpdates *metric.Counter ThrottleMetrics cdcutils.Metrics ReplanCount *metric.Counter - ParallelConsumerFlushNanos *metric.Histogram - ParallelConsumerConsumeNanos *metric.Histogram + ParallelConsumerFlushNanos metric.IHistogram + ParallelConsumerConsumeNanos metric.IHistogram ParallelConsumerInFlightEvents *metric.Gauge mu struct { @@ -599,18 +637,36 @@ func (m *Metrics) getSLIMetrics(scope string) (*sliMetrics, error) { // MakeMetrics makes the metrics for changefeed monitoring. func MakeMetrics(histogramWindow time.Duration) metric.Struct { m := &Metrics{ - AggMetrics: newAggregateMetrics(histogramWindow), - KVFeedMetrics: kvevent.MakeMetrics(histogramWindow), - SchemaFeedMetrics: schemafeed.MakeMetrics(histogramWindow), - ResolvedMessages: metric.NewCounter(metaChangefeedForwardedResolvedMessages), - Failures: metric.NewCounter(metaChangefeedFailures), - QueueTimeNanos: metric.NewCounter(metaEventQueueTime), - CheckpointHistNanos: metric.NewHistogram(metaChangefeedCheckpointHistNanos, histogramWindow, metric.IOLatencyBuckets), - FrontierUpdates: metric.NewCounter(metaChangefeedFrontierUpdates), - ThrottleMetrics: cdcutils.MakeMetrics(histogramWindow), - ReplanCount: metric.NewCounter(metaChangefeedReplanCount), - ParallelConsumerFlushNanos: metric.NewHistogram(metaChangefeedEventConsumerFlushNanos, histogramWindow, metric.IOLatencyBuckets), - ParallelConsumerConsumeNanos: metric.NewHistogram(metaChangefeedEventConsumerConsumeNanos, histogramWindow, metric.IOLatencyBuckets), + AggMetrics: newAggregateMetrics(histogramWindow), + KVFeedMetrics: kvevent.MakeMetrics(histogramWindow), + SchemaFeedMetrics: schemafeed.MakeMetrics(histogramWindow), + ResolvedMessages: metric.NewCounter(metaChangefeedForwardedResolvedMessages), + Failures: metric.NewCounter(metaChangefeedFailures), + QueueTimeNanos: metric.NewCounter(metaEventQueueTime), + CheckpointHistNanos: metric.NewHistogram(metric.HistogramOptions{ + Metadata: metaChangefeedCheckpointHistNanos, + Duration: histogramWindow, + MaxVal: changefeedCheckpointHistMaxLatency.Nanoseconds(), + SigFigs: 2, + Buckets: metric.IOLatencyBuckets, + }), + FrontierUpdates: metric.NewCounter(metaChangefeedFrontierUpdates), + ThrottleMetrics: cdcutils.MakeMetrics(histogramWindow), + ReplanCount: metric.NewCounter(metaChangefeedReplanCount), + // Below two metrics were never implemented using the hdr histogram. Set ForceUsePrometheus + // to true. + ParallelConsumerFlushNanos: metric.NewHistogram(metric.HistogramOptions{ + Metadata: metaChangefeedEventConsumerFlushNanos, + Duration: histogramWindow, + Buckets: metric.IOLatencyBuckets, + Mode: metric.HistogramModePrometheus, + }), + ParallelConsumerConsumeNanos: metric.NewHistogram(metric.HistogramOptions{ + Metadata: metaChangefeedEventConsumerConsumeNanos, + Duration: histogramWindow, + Buckets: metric.IOLatencyBuckets, + Mode: metric.HistogramModePrometheus, + }), ParallelConsumerInFlightEvents: metric.NewGauge(metaChangefeedEventConsumerInFlightEvents), } diff --git a/pkg/ccl/sqlproxyccl/connector.go b/pkg/ccl/sqlproxyccl/connector.go index 8329efecc8ae..6a891a7ab53f 100644 --- a/pkg/ccl/sqlproxyccl/connector.go +++ b/pkg/ccl/sqlproxyccl/connector.go @@ -77,7 +77,7 @@ type connector struct { // DialTenantLatency tracks how long it takes to retrieve the address for // a tenant and set up a tcp connection to the address. - DialTenantLatency *metric.Histogram + DialTenantLatency metric.IHistogram // DialTenantRetries counts how often dialing a tenant is retried. DialTenantRetries *metric.Counter diff --git a/pkg/ccl/sqlproxyccl/connector_test.go b/pkg/ccl/sqlproxyccl/connector_test.go index 8a28955b71a6..6fa21c56afe3 100644 --- a/pkg/ccl/sqlproxyccl/connector_test.go +++ b/pkg/ccl/sqlproxyccl/connector_test.go @@ -380,9 +380,12 @@ func TestConnector_dialTenantCluster(t *testing.T) { c := &connector{ TenantID: roachpb.MustMakeTenantID(42), - DialTenantLatency: metric.NewHistogram( - metaDialTenantLatency, time.Millisecond, metric.NetworkLatencyBuckets, - ), + DialTenantLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePrometheus, + Metadata: metaDialTenantLatency, + Duration: time.Millisecond, + Buckets: metric.NetworkLatencyBuckets, + }), DialTenantRetries: metric.NewCounter(metaDialTenantRetries), } dc := &testTenantDirectoryCache{} @@ -460,9 +463,12 @@ func TestConnector_dialTenantCluster(t *testing.T) { defer cancel() c := &connector{ - DialTenantLatency: metric.NewHistogram( - metaDialTenantLatency, time.Millisecond, metric.NetworkLatencyBuckets, - ), + DialTenantLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaDialTenantLatency, + Duration: time.Millisecond, + Buckets: metric.NetworkLatencyBuckets, + }), DialTenantRetries: metric.NewCounter(metaDialTenantRetries), } c.testingKnobs.lookupAddr = func(ctx context.Context) (string, error) { @@ -491,9 +497,12 @@ func TestConnector_dialTenantCluster(t *testing.T) { var reportFailureFnCount int c := &connector{ TenantID: roachpb.MustMakeTenantID(42), - DialTenantLatency: metric.NewHistogram( - metaDialTenantLatency, time.Millisecond, metric.NetworkLatencyBuckets, - ), + DialTenantLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaDialTenantLatency, + Duration: time.Millisecond, + Buckets: metric.NetworkLatencyBuckets, + }), DialTenantRetries: metric.NewCounter(metaDialTenantRetries), } c.DirectoryCache = &testTenantDirectoryCache{ diff --git a/pkg/ccl/sqlproxyccl/metrics.go b/pkg/ccl/sqlproxyccl/metrics.go index 2fe0d8e16131..2432f9b1f668 100644 --- a/pkg/ccl/sqlproxyccl/metrics.go +++ b/pkg/ccl/sqlproxyccl/metrics.go @@ -23,19 +23,19 @@ type metrics struct { RoutingErrCount *metric.Counter RefusedConnCount *metric.Counter SuccessfulConnCount *metric.Counter - ConnectionLatency *metric.Histogram + ConnectionLatency metric.IHistogram AuthFailedCount *metric.Counter ExpiredClientConnCount *metric.Counter - DialTenantLatency *metric.Histogram + DialTenantLatency metric.IHistogram DialTenantRetries *metric.Counter ConnMigrationSuccessCount *metric.Counter ConnMigrationErrorFatalCount *metric.Counter ConnMigrationErrorRecoverableCount *metric.Counter ConnMigrationAttemptedCount *metric.Counter - ConnMigrationAttemptedLatency *metric.Histogram - ConnMigrationTransferResponseMessageSize *metric.Histogram + ConnMigrationAttemptedLatency metric.IHistogram + ConnMigrationTransferResponseMessageSize metric.IHistogram QueryCancelReceivedPGWire *metric.Counter QueryCancelReceivedHTTP *metric.Counter @@ -49,6 +49,16 @@ func (metrics) MetricStruct() {} var _ metric.Struct = metrics{} +const ( + // maxExpectedTransferResponseMessageSize corresponds to maximum expected + // response message size for the SHOW TRANSFER STATE query. We choose 16MB + // here to match the defaultMaxReadBufferSize used for ingesting SQL + // statements in the SQL server (see pkg/sql/pgwire/pgwirebase/encoding.go). + // + // This will be used to tune sql.session_transfer.max_session_size. + maxExpectedTransferResponseMessageSize = 1 << 24 // 16MB +) + var ( metaCurConnCount = metric.Metadata{ Name: "proxy.sql.conns", @@ -213,18 +223,20 @@ func makeProxyMetrics() metrics { RoutingErrCount: metric.NewCounter(metaRoutingErrCount), RefusedConnCount: metric.NewCounter(metaRefusedConnCount), SuccessfulConnCount: metric.NewCounter(metaSuccessfulConnCount), - ConnectionLatency: metric.NewHistogram( - metaConnMigrationAttemptedCount, - base.DefaultHistogramWindowInterval(), - metric.NetworkLatencyBuckets, - ), + ConnectionLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaConnMigrationAttemptedCount, + Duration: base.DefaultHistogramWindowInterval(), + Buckets: metric.NetworkLatencyBuckets, + }), AuthFailedCount: metric.NewCounter(metaAuthFailedCount), ExpiredClientConnCount: metric.NewCounter(metaExpiredClientConnCount), // Connector metrics. - DialTenantLatency: metric.NewHistogram( - metaDialTenantLatency, - base.DefaultHistogramWindowInterval(), - metric.NetworkLatencyBuckets, + DialTenantLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaDialTenantLatency, + Duration: base.DefaultHistogramWindowInterval(), + Buckets: metric.NetworkLatencyBuckets}, ), DialTenantRetries: metric.NewCounter(metaDialTenantRetries), // Connection migration metrics. @@ -232,16 +244,19 @@ func makeProxyMetrics() metrics { ConnMigrationErrorFatalCount: metric.NewCounter(metaConnMigrationErrorFatalCount), ConnMigrationErrorRecoverableCount: metric.NewCounter(metaConnMigrationErrorRecoverableCount), ConnMigrationAttemptedCount: metric.NewCounter(metaConnMigrationAttemptedCount), - ConnMigrationAttemptedLatency: metric.NewHistogram( - metaConnMigrationAttemptedLatency, - base.DefaultHistogramWindowInterval(), - metric.NetworkLatencyBuckets, - ), - ConnMigrationTransferResponseMessageSize: metric.NewHistogram( - metaConnMigrationTransferResponseMessageSize, - base.DefaultHistogramWindowInterval(), - metric.DataSize16MBBuckets, - ), + ConnMigrationAttemptedLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaConnMigrationAttemptedLatency, + Duration: base.DefaultHistogramWindowInterval(), + Buckets: metric.NetworkLatencyBuckets, + }), + ConnMigrationTransferResponseMessageSize: metric.NewHistogram(metric.HistogramOptions{ + Metadata: metaConnMigrationTransferResponseMessageSize, + Duration: base.DefaultHistogramWindowInterval(), + Buckets: metric.DataSize16MBBuckets, + MaxVal: maxExpectedTransferResponseMessageSize, + SigFigs: 1, + }), QueryCancelReceivedPGWire: metric.NewCounter(metaQueryCancelReceivedPGWire), QueryCancelReceivedHTTP: metric.NewCounter(metaQueryCancelReceivedHTTP), QueryCancelIgnored: metric.NewCounter(metaQueryCancelIgnored), diff --git a/pkg/ccl/streamingccl/streamingest/metrics.go b/pkg/ccl/streamingccl/streamingest/metrics.go index f9f28f38a247..c2858cd5ad36 100644 --- a/pkg/ccl/streamingccl/streamingest/metrics.go +++ b/pkg/ccl/streamingccl/streamingest/metrics.go @@ -15,6 +15,12 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/metric" ) +const ( + streamingFlushHistMaxLatency = 1 * time.Minute + streamingAdmitLatencyMaxValue = 3 * time.Minute + streamingCommitLatencyMaxValue = 10 * time.Minute +) + var ( metaReplicationEventsIngested = metric.Metadata{ Name: "replication.events_ingested", @@ -120,9 +126,9 @@ type Metrics struct { Flushes *metric.Counter JobProgressUpdates *metric.Counter ResolvedEvents *metric.Counter - FlushHistNanos *metric.Histogram - CommitLatency *metric.Histogram - AdmitLatency *metric.Histogram + FlushHistNanos metric.IHistogram + CommitLatency metric.IHistogram + AdmitLatency metric.IHistogram RunningCount *metric.Gauge EarliestDataCheckpointSpan *metric.Gauge LatestDataCheckpointSpan *metric.Gauge @@ -143,12 +149,30 @@ func MakeMetrics(histogramWindow time.Duration) metric.Struct { Flushes: metric.NewCounter(metaReplicationFlushes), ResolvedEvents: metric.NewCounter(metaReplicationResolvedEventsIngested), JobProgressUpdates: metric.NewCounter(metaJobProgressUpdates), - FlushHistNanos: metric.NewHistogram(metaReplicationFlushHistNanos, - histogramWindow, metric.BatchProcessLatencyBuckets), - CommitLatency: metric.NewHistogram(metaReplicationCommitLatency, - histogramWindow, metric.BatchProcessLatencyBuckets), - AdmitLatency: metric.NewHistogram(metaReplicationAdmitLatency, - histogramWindow, metric.BatchProcessLatencyBuckets), + FlushHistNanos: metric.NewHistogram(metric.HistogramOptions{ + Metadata: metaReplicationFlushHistNanos, + Duration: histogramWindow, + Buckets: metric.BatchProcessLatencyBuckets, + MaxVal: streamingFlushHistMaxLatency.Nanoseconds(), + SigFigs: 1, + Mode: metric.HistogramModePreferHdrLatency, + }), + CommitLatency: metric.NewHistogram(metric.HistogramOptions{ + Metadata: metaReplicationCommitLatency, + Duration: histogramWindow, + Buckets: metric.BatchProcessLatencyBuckets, + MaxVal: streamingCommitLatencyMaxValue.Nanoseconds(), + SigFigs: 1, + Mode: metric.HistogramModePreferHdrLatency, + }), + AdmitLatency: metric.NewHistogram(metric.HistogramOptions{ + Metadata: metaReplicationAdmitLatency, + Duration: histogramWindow, + Buckets: metric.BatchProcessLatencyBuckets, + MaxVal: streamingAdmitLatencyMaxValue.Nanoseconds(), + SigFigs: 1, + Mode: metric.HistogramModePreferHdrLatency, + }), RunningCount: metric.NewGauge(metaStreamsRunning), EarliestDataCheckpointSpan: metric.NewGauge(metaEarliestDataCheckpointSpan), LatestDataCheckpointSpan: metric.NewGauge(metaLatestDataCheckpointSpan), diff --git a/pkg/kv/bulk/bulk_metrics.go b/pkg/kv/bulk/bulk_metrics.go index f3390d54733e..7cbbc748a20b 100644 --- a/pkg/kv/bulk/bulk_metrics.go +++ b/pkg/kv/bulk/bulk_metrics.go @@ -20,7 +20,7 @@ import ( // Metrics contains pointers to the metrics for // monitoring bulk operations. type Metrics struct { - MaxBytesHist *metric.Histogram + MaxBytesHist metric.IHistogram CurBytesCount *metric.Gauge } @@ -44,10 +44,20 @@ var ( } ) +// See pkg/sql/mem_metrics.go +// log10int64times1000 = log10(math.MaxInt64) * 1000, rounded up somewhat +const log10int64times1000 = 19 * 1000 + // MakeBulkMetrics instantiates the metrics holder for bulk operation monitoring. func MakeBulkMetrics(histogramWindow time.Duration) Metrics { return Metrics{ - MaxBytesHist: metric.NewHistogram(metaMemMaxBytes, histogramWindow, metric.MemoryUsage64MBBuckets), + MaxBytesHist: metric.NewHistogram(metric.HistogramOptions{ + Metadata: metaMemMaxBytes, + Duration: histogramWindow, + MaxVal: log10int64times1000, + SigFigs: 3, + Buckets: metric.MemoryUsage64MBBuckets, + }), CurBytesCount: metric.NewGauge(metaMemCurBytes), } } diff --git a/pkg/kv/kvclient/kvcoord/txn_metrics.go b/pkg/kv/kvclient/kvcoord/txn_metrics.go index eb6313012717..fcca64aa74ae 100644 --- a/pkg/kv/kvclient/kvcoord/txn_metrics.go +++ b/pkg/kv/kvclient/kvcoord/txn_metrics.go @@ -31,14 +31,14 @@ type TxnMetrics struct { RefreshMemoryLimitExceeded *metric.Counter RefreshAutoRetries *metric.Counter - Durations *metric.Histogram + Durations metric.IHistogram TxnsWithCondensedIntents *metric.Counter TxnsWithCondensedIntentsGauge *metric.Gauge TxnsRejectedByLockSpanBudget *metric.Counter // Restarts is the number of times we had to restart the transaction. - Restarts *metric.Histogram + Restarts metric.IHistogram // Counts of restart types. RestartsWriteTooOld telemetry.CounterWithMetric @@ -264,21 +264,32 @@ var ( // windowed portions retain data for approximately histogramWindow. func MakeTxnMetrics(histogramWindow time.Duration) TxnMetrics { return TxnMetrics{ - Aborts: metric.NewCounter(metaAbortsRates), - Commits: metric.NewCounter(metaCommitsRates), - Commits1PC: metric.NewCounter(metaCommits1PCRates), - ParallelCommits: metric.NewCounter(metaParallelCommitsRates), - CommitWaits: metric.NewCounter(metaCommitWaitCount), - RefreshSuccess: metric.NewCounter(metaRefreshSuccess), - RefreshFail: metric.NewCounter(metaRefreshFail), - RefreshFailWithCondensedSpans: metric.NewCounter(metaRefreshFailWithCondensedSpans), - RefreshMemoryLimitExceeded: metric.NewCounter(metaRefreshMemoryLimitExceeded), - RefreshAutoRetries: metric.NewCounter(metaRefreshAutoRetries), - Durations: metric.NewHistogram(metaDurationsHistograms, histogramWindow, metric.IOLatencyBuckets), - TxnsWithCondensedIntents: metric.NewCounter(metaTxnsWithCondensedIntentSpans), - TxnsWithCondensedIntentsGauge: metric.NewGauge(metaTxnsWithCondensedIntentSpansGauge), - TxnsRejectedByLockSpanBudget: metric.NewCounter(metaTxnsRejectedByLockSpanBudget), - Restarts: metric.NewHistogram(metaRestartsHistogram, histogramWindow, metric.Count1KBuckets), + Aborts: metric.NewCounter(metaAbortsRates), + Commits: metric.NewCounter(metaCommitsRates), + Commits1PC: metric.NewCounter(metaCommits1PCRates), + ParallelCommits: metric.NewCounter(metaParallelCommitsRates), + CommitWaits: metric.NewCounter(metaCommitWaitCount), + RefreshSuccess: metric.NewCounter(metaRefreshSuccess), + RefreshFail: metric.NewCounter(metaRefreshFail), + RefreshFailWithCondensedSpans: metric.NewCounter(metaRefreshFailWithCondensedSpans), + RefreshMemoryLimitExceeded: metric.NewCounter(metaRefreshMemoryLimitExceeded), + RefreshAutoRetries: metric.NewCounter(metaRefreshAutoRetries), + Durations: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaDurationsHistograms, + Duration: histogramWindow, + Buckets: metric.IOLatencyBuckets, + }), + TxnsWithCondensedIntents: metric.NewCounter(metaTxnsWithCondensedIntentSpans), + TxnsWithCondensedIntentsGauge: metric.NewGauge(metaTxnsWithCondensedIntentSpansGauge), + TxnsRejectedByLockSpanBudget: metric.NewCounter(metaTxnsRejectedByLockSpanBudget), + Restarts: metric.NewHistogram(metric.HistogramOptions{ + Metadata: metaRestartsHistogram, + Duration: histogramWindow, + MaxVal: 100, + SigFigs: 3, + Buckets: metric.Count1KBuckets, + }), RestartsWriteTooOld: telemetry.NewCounterWithMetric(metaRestartsWriteTooOld), RestartsWriteTooOldMulti: telemetry.NewCounterWithMetric(metaRestartsWriteTooOldMulti), RestartsSerializable: telemetry.NewCounterWithMetric(metaRestartsSerializable), diff --git a/pkg/kv/kvprober/kvprober.go b/pkg/kv/kvprober/kvprober.go index 2e0d84529fc9..0ec07006efe1 100644 --- a/pkg/kv/kvprober/kvprober.go +++ b/pkg/kv/kvprober/kvprober.go @@ -140,10 +140,10 @@ var ( type Metrics struct { ReadProbeAttempts *metric.Counter ReadProbeFailures *metric.Counter - ReadProbeLatency *metric.Histogram + ReadProbeLatency metric.IHistogram WriteProbeAttempts *metric.Counter WriteProbeFailures *metric.Counter - WriteProbeLatency *metric.Histogram + WriteProbeLatency metric.IHistogram WriteProbeQuarantineOldestDuration *metric.Gauge ProbePlanAttempts *metric.Counter ProbePlanFailures *metric.Counter @@ -229,14 +229,20 @@ func NewProber(opts Opts) *Prober { metrics: Metrics{ ReadProbeAttempts: metric.NewCounter(metaReadProbeAttempts), ReadProbeFailures: metric.NewCounter(metaReadProbeFailures), - ReadProbeLatency: metric.NewHistogram( - metaReadProbeLatency, opts.HistogramWindowInterval, metric.NetworkLatencyBuckets, - ), + ReadProbeLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaReadProbeLatency, + Duration: opts.HistogramWindowInterval, + Buckets: metric.NetworkLatencyBuckets, + }), WriteProbeAttempts: metric.NewCounter(metaWriteProbeAttempts), WriteProbeFailures: metric.NewCounter(metaWriteProbeFailures), - WriteProbeLatency: metric.NewHistogram( - metaWriteProbeLatency, opts.HistogramWindowInterval, metric.NetworkLatencyBuckets, - ), + WriteProbeLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaWriteProbeLatency, + Duration: opts.HistogramWindowInterval, + Buckets: metric.NetworkLatencyBuckets, + }), WriteProbeQuarantineOldestDuration: metric.NewFunctionalGauge( metaWriteProbeQuarantineOldestDuration, func() int64 { return qPool.oldestDuration().Nanoseconds() }, diff --git a/pkg/kv/kvserver/liveness/liveness.go b/pkg/kv/kvserver/liveness/liveness.go index 3411ea1bbd78..a11f3abc4114 100644 --- a/pkg/kv/kvserver/liveness/liveness.go +++ b/pkg/kv/kvserver/liveness/liveness.go @@ -144,7 +144,7 @@ type Metrics struct { HeartbeatSuccesses *metric.Counter HeartbeatFailures telemetry.CounterWithMetric EpochIncrements telemetry.CounterWithMetric - HeartbeatLatency *metric.Histogram + HeartbeatLatency metric.IHistogram } // IsLiveCallback is invoked when a node's IsLive state changes to true. @@ -310,9 +310,12 @@ func NewNodeLiveness(opts NodeLivenessOptions) *NodeLiveness { HeartbeatSuccesses: metric.NewCounter(metaHeartbeatSuccesses), HeartbeatFailures: telemetry.NewCounterWithMetric(metaHeartbeatFailures), EpochIncrements: telemetry.NewCounterWithMetric(metaEpochIncrements), - HeartbeatLatency: metric.NewHistogram( - metaHeartbeatLatency, opts.HistogramWindowInterval, metric.NetworkLatencyBuckets, - ), + HeartbeatLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaHeartbeatLatency, + Duration: opts.HistogramWindowInterval, + Buckets: metric.NetworkLatencyBuckets, + }), } nl.mu.nodes = make(map[roachpb.NodeID]Record) nl.heartbeatToken <- struct{}{} diff --git a/pkg/kv/kvserver/logstore/logstore.go b/pkg/kv/kvserver/logstore/logstore.go index f1784431b0f5..e1cd72d34483 100644 --- a/pkg/kv/kvserver/logstore/logstore.go +++ b/pkg/kv/kvserver/logstore/logstore.go @@ -90,7 +90,7 @@ type AppendStats struct { // Metrics contains metrics specific to the log storage. type Metrics struct { - RaftLogCommitLatency *metric.Histogram + RaftLogCommitLatency metric.IHistogram } // LogStore is a stub of a separated Raft log storage. diff --git a/pkg/kv/kvserver/logstore/logstore_bench_test.go b/pkg/kv/kvserver/logstore/logstore_bench_test.go index 0c3f8e0473e2..f09472e27385 100644 --- a/pkg/kv/kvserver/logstore/logstore_bench_test.go +++ b/pkg/kv/kvserver/logstore/logstore_bench_test.go @@ -60,7 +60,12 @@ func runBenchmarkLogStore_StoreEntries(b *testing.B, bytes int64) { EntryCache: ec, Settings: cluster.MakeTestingClusterSettings(), Metrics: Metrics{ - RaftLogCommitLatency: metric.NewHistogram(metric.Metadata{}, 10*time.Second, metric.IOLatencyBuckets), + RaftLogCommitLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePrometheus, + Metadata: metric.Metadata{}, + Duration: 10 * time.Second, + Buckets: metric.IOLatencyBuckets, + }), }, } diff --git a/pkg/kv/kvserver/metrics.go b/pkg/kv/kvserver/metrics.go index 38953afe5d50..5476a6c17e9a 100644 --- a/pkg/kv/kvserver/metrics.go +++ b/pkg/kv/kvserver/metrics.go @@ -1850,15 +1850,15 @@ type StoreMetrics struct { // Raft processing metrics. RaftTicks *metric.Counter - RaftQuotaPoolPercentUsed *metric.Histogram + RaftQuotaPoolPercentUsed metric.IHistogram RaftWorkingDurationNanos *metric.Counter RaftTickingDurationNanos *metric.Counter RaftCommandsApplied *metric.Counter - RaftLogCommitLatency *metric.Histogram - RaftCommandCommitLatency *metric.Histogram - RaftHandleReadyLatency *metric.Histogram - RaftApplyCommittedLatency *metric.Histogram - RaftSchedulerLatency *metric.Histogram + RaftLogCommitLatency metric.IHistogram + RaftCommandCommitLatency metric.IHistogram + RaftHandleReadyLatency metric.IHistogram + RaftApplyCommittedLatency metric.IHistogram + RaftSchedulerLatency metric.IHistogram RaftTimeoutCampaign *metric.Counter // Raft message metrics. @@ -1990,8 +1990,8 @@ type StoreMetrics struct { ReplicaCircuitBreakerCumTripped *metric.Counter // Replica batch evaluation metrics. - ReplicaReadBatchEvaluationLatency *metric.Histogram - ReplicaWriteBatchEvaluationLatency *metric.Histogram + ReplicaReadBatchEvaluationLatency metric.IHistogram + ReplicaWriteBatchEvaluationLatency metric.IHistogram ReplicaReadBatchDroppedLatchesBeforeEval *metric.Counter ReplicaReadBatchWithoutInterleavingIter *metric.Counter @@ -2377,27 +2377,46 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics { // Raft processing metrics. RaftTicks: metric.NewCounter(metaRaftTicks), - RaftQuotaPoolPercentUsed: metric.NewHistogram( - metaRaftQuotaPoolPercentUsed, histogramWindow, metric.Percent100Buckets, - ), + RaftQuotaPoolPercentUsed: metric.NewHistogram(metric.HistogramOptions{ + Metadata: metaRaftQuotaPoolPercentUsed, + Duration: histogramWindow, + MaxVal: 100, + SigFigs: 1, + Buckets: metric.Percent100Buckets, + }), RaftWorkingDurationNanos: metric.NewCounter(metaRaftWorkingDurationNanos), RaftTickingDurationNanos: metric.NewCounter(metaRaftTickingDurationNanos), RaftCommandsApplied: metric.NewCounter(metaRaftCommandsApplied), - RaftLogCommitLatency: metric.NewHistogram( - metaRaftLogCommitLatency, histogramWindow, metric.IOLatencyBuckets, - ), - RaftCommandCommitLatency: metric.NewHistogram( - metaRaftCommandCommitLatency, histogramWindow, metric.IOLatencyBuckets, - ), - RaftHandleReadyLatency: metric.NewHistogram( - metaRaftHandleReadyLatency, histogramWindow, metric.IOLatencyBuckets, - ), - RaftApplyCommittedLatency: metric.NewHistogram( - metaRaftApplyCommittedLatency, histogramWindow, metric.IOLatencyBuckets, - ), - RaftSchedulerLatency: metric.NewHistogram( - metaRaftSchedulerLatency, histogramWindow, metric.IOLatencyBuckets, - ), + RaftLogCommitLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaRaftLogCommitLatency, + Duration: histogramWindow, + Buckets: metric.IOLatencyBuckets, + }), + RaftCommandCommitLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaRaftCommandCommitLatency, + Duration: histogramWindow, + Buckets: metric.IOLatencyBuckets, + }), + RaftHandleReadyLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaRaftHandleReadyLatency, + Duration: histogramWindow, + Buckets: metric.IOLatencyBuckets, + }), + RaftApplyCommittedLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaRaftApplyCommittedLatency, + Duration: histogramWindow, + Buckets: metric.IOLatencyBuckets, + }), + RaftSchedulerLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaRaftSchedulerLatency, + Duration: histogramWindow, + Buckets: metric.IOLatencyBuckets, + }), RaftTimeoutCampaign: metric.NewCounter(metaRaftTimeoutCampaign), // Raft message metrics. @@ -2538,12 +2557,18 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics { ReplicaCircuitBreakerCumTripped: metric.NewCounter(metaReplicaCircuitBreakerCumTripped), // Replica batch evaluation. - ReplicaReadBatchEvaluationLatency: metric.NewHistogram( - metaReplicaReadBatchEvaluationLatency, histogramWindow, metric.IOLatencyBuckets, - ), - ReplicaWriteBatchEvaluationLatency: metric.NewHistogram( - metaReplicaWriteBatchEvaluationLatency, histogramWindow, metric.IOLatencyBuckets, - ), + ReplicaReadBatchEvaluationLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaReplicaReadBatchEvaluationLatency, + Duration: histogramWindow, + Buckets: metric.IOLatencyBuckets, + }), + ReplicaWriteBatchEvaluationLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaReplicaWriteBatchEvaluationLatency, + Duration: histogramWindow, + Buckets: metric.IOLatencyBuckets, + }), FlushUtilization: metric.NewGaugeFloat64(metaStorageFlushUtilization), FsyncLatency: metric.NewManualWindowHistogram(metaStorageFsyncLatency, pebble.FsyncLatencyBuckets), diff --git a/pkg/kv/kvserver/scheduler.go b/pkg/kv/kvserver/scheduler.go index 85db3cefa1fb..062bb4562843 100644 --- a/pkg/kv/kvserver/scheduler.go +++ b/pkg/kv/kvserver/scheduler.go @@ -181,7 +181,7 @@ type raftScheduleState struct { type raftScheduler struct { ambientContext log.AmbientContext processor raftProcessor - latency *metric.Histogram + latency metric.IHistogram numWorkers int maxTicks int diff --git a/pkg/kv/kvserver/txnwait/metrics.go b/pkg/kv/kvserver/txnwait/metrics.go index 2e9d1d2a2055..4610fd8e375c 100644 --- a/pkg/kv/kvserver/txnwait/metrics.go +++ b/pkg/kv/kvserver/txnwait/metrics.go @@ -22,8 +22,8 @@ type Metrics struct { PusherWaiting *metric.Gauge QueryWaiting *metric.Gauge PusherSlow *metric.Gauge - PusherWaitTime *metric.Histogram - QueryWaitTime *metric.Histogram + PusherWaitTime metric.IHistogram + QueryWaitTime metric.IHistogram DeadlocksTotal *metric.Counter } @@ -66,27 +66,31 @@ func NewMetrics(histogramWindowInterval time.Duration) *Metrics { }, ), - PusherWaitTime: metric.NewHistogram( - metric.Metadata{ + PusherWaitTime: metric.NewHistogram(metric.HistogramOptions{ + Metadata: metric.Metadata{ Name: "txnwaitqueue.pusher.wait_time", Help: "Histogram of durations spent in queue by pushers", Measurement: "Pusher wait time", Unit: metric.Unit_NANOSECONDS, }, - histogramWindowInterval, - metric.LongRunning60mLatencyBuckets, - ), + MaxVal: time.Hour.Nanoseconds(), + SigFigs: 1, + Duration: histogramWindowInterval, + Buckets: metric.LongRunning60mLatencyBuckets, + }), - QueryWaitTime: metric.NewHistogram( - metric.Metadata{ + QueryWaitTime: metric.NewHistogram(metric.HistogramOptions{ + Metadata: metric.Metadata{ Name: "txnwaitqueue.query.wait_time", Help: "Histogram of durations spent in queue by queries", Measurement: "Query wait time", Unit: metric.Unit_NANOSECONDS, }, - histogramWindowInterval, - metric.LongRunning60mLatencyBuckets, - ), + MaxVal: time.Hour.Nanoseconds(), + SigFigs: 1, + Duration: histogramWindowInterval, + Buckets: metric.LongRunning60mLatencyBuckets, + }), DeadlocksTotal: metric.NewCounter( metric.Metadata{ diff --git a/pkg/rpc/clock_offset.go b/pkg/rpc/clock_offset.go index bd5877f26b6b..99daad030a72 100644 --- a/pkg/rpc/clock_offset.go +++ b/pkg/rpc/clock_offset.go @@ -29,7 +29,7 @@ import ( type RemoteClockMetrics struct { ClockOffsetMeanNanos *metric.Gauge ClockOffsetStdDevNanos *metric.Gauge - LatencyHistogramNanos *metric.Histogram + LatencyHistogramNanos metric.IHistogram } // avgLatencyMeasurementAge determines how to exponentially weight the @@ -136,9 +136,12 @@ func newRemoteClockMonitor( r.metrics = RemoteClockMetrics{ ClockOffsetMeanNanos: metric.NewGauge(metaClockOffsetMeanNanos), ClockOffsetStdDevNanos: metric.NewGauge(metaClockOffsetStdDevNanos), - LatencyHistogramNanos: metric.NewHistogram( - metaLatencyHistogramNanos, histogramWindowInterval, metric.IOLatencyBuckets, - ), + LatencyHistogramNanos: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaLatencyHistogramNanos, + Duration: histogramWindowInterval, + Buckets: metric.IOLatencyBuckets, + }), } return &r } diff --git a/pkg/server/node.go b/pkg/server/node.go index 76019595ca04..2259a512bd75 100644 --- a/pkg/server/node.go +++ b/pkg/server/node.go @@ -167,7 +167,7 @@ var ( ) type nodeMetrics struct { - Latency *metric.Histogram + Latency metric.IHistogram Success *metric.Counter Err *metric.Counter DiskStalls *metric.Counter @@ -178,9 +178,12 @@ type nodeMetrics struct { func makeNodeMetrics(reg *metric.Registry, histogramWindow time.Duration) nodeMetrics { nm := nodeMetrics{ - Latency: metric.NewHistogram( - metaExecLatency, histogramWindow, metric.IOLatencyBuckets, - ), + Latency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaExecLatency, + Duration: histogramWindow, + Buckets: metric.IOLatencyBuckets, + }), Success: metric.NewCounter(metaExecSuccess), Err: metric.NewCounter(metaExecError), DiskStalls: metric.NewCounter(metaDiskStalls), diff --git a/pkg/server/status/recorder_test.go b/pkg/server/status/recorder_test.go index 82016bd8902c..da77954343af 100644 --- a/pkg/server/status/recorder_test.go +++ b/pkg/server/status/recorder_test.go @@ -385,7 +385,12 @@ func TestMetricsRecorder(t *testing.T) { c.Inc((data.val)) addExpected(reg.prefix, data.name, reg.source, 100, data.val, reg.isNode) case "histogram": - h := metric.NewHistogram(metric.Metadata{Name: reg.prefix + data.name}, time.Second, []float64{1.0, 10.0, 100.0, 1000.0}) + h := metric.NewHistogram(metric.HistogramOptions{ + Metadata: metric.Metadata{Name: reg.prefix + data.name}, + Duration: time.Second, + Buckets: []float64{1.0, 10.0, 100.0, 1000.0}, + Mode: metric.HistogramModePrometheus, + }) reg.reg.AddMetric(h) h.RecordValue(data.val) for _, q := range recordHistogramQuantiles { diff --git a/pkg/sql/conn_executor.go b/pkg/sql/conn_executor.go index b593acf1f863..ae3dffb05c35 100644 --- a/pkg/sql/conn_executor.go +++ b/pkg/sql/conn_executor.go @@ -452,21 +452,36 @@ func makeMetrics(internal bool) Metrics { SQLOptPlanCacheHits: metric.NewCounter(getMetricMeta(MetaSQLOptPlanCacheHits, internal)), SQLOptPlanCacheMisses: metric.NewCounter(getMetricMeta(MetaSQLOptPlanCacheMisses, internal)), // TODO(mrtracy): See HistogramWindowInterval in server/config.go for the 6x factor. - DistSQLExecLatency: metric.NewHistogram( - getMetricMeta(MetaDistSQLExecLatency, internal), 6*metricsSampleInterval, metric.IOLatencyBuckets, - ), - SQLExecLatency: metric.NewHistogram( - getMetricMeta(MetaSQLExecLatency, internal), 6*metricsSampleInterval, metric.IOLatencyBuckets, - ), - DistSQLServiceLatency: metric.NewHistogram( - getMetricMeta(MetaDistSQLServiceLatency, internal), 6*metricsSampleInterval, metric.IOLatencyBuckets, - ), - SQLServiceLatency: metric.NewHistogram( - getMetricMeta(MetaSQLServiceLatency, internal), 6*metricsSampleInterval, metric.IOLatencyBuckets, - ), - SQLTxnLatency: metric.NewHistogram( - getMetricMeta(MetaSQLTxnLatency, internal), 6*metricsSampleInterval, metric.IOLatencyBuckets, - ), + DistSQLExecLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: getMetricMeta(MetaDistSQLExecLatency, internal), + Duration: 6 * metricsSampleInterval, + Buckets: metric.IOLatencyBuckets, + }), + SQLExecLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: getMetricMeta(MetaSQLExecLatency, internal), + Duration: 6 * metricsSampleInterval, + Buckets: metric.IOLatencyBuckets, + }), + DistSQLServiceLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: getMetricMeta(MetaDistSQLServiceLatency, internal), + Duration: 6 * metricsSampleInterval, + Buckets: metric.IOLatencyBuckets, + }), + SQLServiceLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: getMetricMeta(MetaSQLServiceLatency, internal), + Duration: 6 * metricsSampleInterval, + Buckets: metric.IOLatencyBuckets, + }), + SQLTxnLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: getMetricMeta(MetaSQLTxnLatency, internal), + Duration: 6 * metricsSampleInterval, + Buckets: metric.IOLatencyBuckets, + }), SQLTxnsOpen: metric.NewGauge(getMetricMeta(MetaSQLTxnsOpen, internal)), SQLActiveStatements: metric.NewGauge(getMetricMeta(MetaSQLActiveQueries, internal)), SQLContendedTxns: metric.NewCounter(getMetricMeta(MetaSQLTxnContended, internal)), @@ -490,28 +505,38 @@ func makeMetrics(internal bool) Metrics { func makeServerMetrics(cfg *ExecutorConfig) ServerMetrics { return ServerMetrics{ StatsMetrics: StatsMetrics{ - SQLStatsMemoryMaxBytesHist: metric.NewHistogram( - MetaSQLStatsMemMaxBytes, - cfg.HistogramWindowInterval, - metric.MemoryUsage64MBBuckets, - ), + SQLStatsMemoryMaxBytesHist: metric.NewHistogram(metric.HistogramOptions{ + Metadata: MetaSQLStatsMemMaxBytes, + Duration: cfg.HistogramWindowInterval, + MaxVal: log10int64times1000, + SigFigs: 3, + Buckets: metric.MemoryUsage64MBBuckets, + }), SQLStatsMemoryCurBytesCount: metric.NewGauge(MetaSQLStatsMemCurBytes), - ReportedSQLStatsMemoryMaxBytesHist: metric.NewHistogram( - MetaReportedSQLStatsMemMaxBytes, - cfg.HistogramWindowInterval, - metric.MemoryUsage64MBBuckets, - ), + ReportedSQLStatsMemoryMaxBytesHist: metric.NewHistogram(metric.HistogramOptions{ + Metadata: MetaReportedSQLStatsMemMaxBytes, + Duration: cfg.HistogramWindowInterval, + MaxVal: log10int64times1000, + SigFigs: 3, + Buckets: metric.MemoryUsage64MBBuckets, + }), ReportedSQLStatsMemoryCurBytesCount: metric.NewGauge(MetaReportedSQLStatsMemCurBytes), DiscardedStatsCount: metric.NewCounter(MetaDiscardedSQLStats), SQLStatsFlushStarted: metric.NewCounter(MetaSQLStatsFlushStarted), SQLStatsFlushFailure: metric.NewCounter(MetaSQLStatsFlushFailure), - SQLStatsFlushDuration: metric.NewHistogram( - MetaSQLStatsFlushDuration, 6*metricsSampleInterval, metric.IOLatencyBuckets, - ), + SQLStatsFlushDuration: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: MetaSQLStatsFlushDuration, + Duration: 6 * metricsSampleInterval, + Buckets: metric.IOLatencyBuckets, + }), SQLStatsRemovedRows: metric.NewCounter(MetaSQLStatsRemovedRows), - SQLTxnStatsCollectionOverhead: metric.NewHistogram( - MetaSQLTxnStatsCollectionOverhead, 6*metricsSampleInterval, metric.IOLatencyBuckets, - ), + SQLTxnStatsCollectionOverhead: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: MetaSQLTxnStatsCollectionOverhead, + Duration: 6 * metricsSampleInterval, + Buckets: metric.IOLatencyBuckets, + }), }, ContentionSubsystemMetrics: txnidcache.NewMetrics(), InsightsMetrics: insights.NewMetrics(), diff --git a/pkg/sql/execinfra/metrics.go b/pkg/sql/execinfra/metrics.go index 17ad207d88db..30123ce27ee6 100644 --- a/pkg/sql/execinfra/metrics.go +++ b/pkg/sql/execinfra/metrics.go @@ -24,11 +24,11 @@ type DistSQLMetrics struct { ContendedQueriesCount *metric.Counter FlowsActive *metric.Gauge FlowsTotal *metric.Counter - MaxBytesHist *metric.Histogram + MaxBytesHist metric.IHistogram CurBytesCount *metric.Gauge VecOpenFDs *metric.Gauge CurDiskBytesCount *metric.Gauge - MaxDiskBytesHist *metric.Histogram + MaxDiskBytesHist metric.IHistogram QueriesSpilled *metric.Counter SpilledBytesWritten *metric.Counter SpilledBytesRead *metric.Counter @@ -120,6 +120,10 @@ var ( } ) +// See pkg/sql/mem_metrics.go +// log10int64times1000 = log10(math.MaxInt64) * 1000, rounded up somewhat +const log10int64times1000 = 19 * 1000 + // MakeDistSQLMetrics instantiates the metrics holder for DistSQL monitoring. func MakeDistSQLMetrics(histogramWindow time.Duration) DistSQLMetrics { return DistSQLMetrics{ @@ -128,14 +132,25 @@ func MakeDistSQLMetrics(histogramWindow time.Duration) DistSQLMetrics { ContendedQueriesCount: metric.NewCounter(metaContendedQueriesCount), FlowsActive: metric.NewGauge(metaFlowsActive), FlowsTotal: metric.NewCounter(metaFlowsTotal), - MaxBytesHist: metric.NewHistogram(metaMemMaxBytes, histogramWindow, metric.MemoryUsage64MBBuckets), - CurBytesCount: metric.NewGauge(metaMemCurBytes), - VecOpenFDs: metric.NewGauge(metaVecOpenFDs), - CurDiskBytesCount: metric.NewGauge(metaDiskCurBytes), - MaxDiskBytesHist: metric.NewHistogram(metaDiskMaxBytes, histogramWindow, metric.MemoryUsage64MBBuckets), - QueriesSpilled: metric.NewCounter(metaQueriesSpilled), - SpilledBytesWritten: metric.NewCounter(metaSpilledBytesWritten), - SpilledBytesRead: metric.NewCounter(metaSpilledBytesRead), + MaxBytesHist: metric.NewHistogram(metric.HistogramOptions{ + Metadata: metaMemMaxBytes, + Duration: histogramWindow, + MaxVal: log10int64times1000, + SigFigs: 3, + Buckets: metric.MemoryUsage64MBBuckets, + }), + CurBytesCount: metric.NewGauge(metaMemCurBytes), + VecOpenFDs: metric.NewGauge(metaVecOpenFDs), + CurDiskBytesCount: metric.NewGauge(metaDiskCurBytes), + MaxDiskBytesHist: metric.NewHistogram(metric.HistogramOptions{ + Metadata: metaDiskMaxBytes, + Duration: histogramWindow, + MaxVal: log10int64times1000, + SigFigs: 3, + Buckets: metric.MemoryUsage64MBBuckets}), + QueriesSpilled: metric.NewCounter(metaQueriesSpilled), + SpilledBytesWritten: metric.NewCounter(metaSpilledBytesWritten), + SpilledBytesRead: metric.NewCounter(metaSpilledBytesRead), } } diff --git a/pkg/sql/executor_statement_metrics.go b/pkg/sql/executor_statement_metrics.go index e9cfcdf12c95..3eb34a83eb75 100644 --- a/pkg/sql/executor_statement_metrics.go +++ b/pkg/sql/executor_statement_metrics.go @@ -37,11 +37,11 @@ type EngineMetrics struct { SQLOptPlanCacheHits *metric.Counter SQLOptPlanCacheMisses *metric.Counter - DistSQLExecLatency *metric.Histogram - SQLExecLatency *metric.Histogram - DistSQLServiceLatency *metric.Histogram - SQLServiceLatency *metric.Histogram - SQLTxnLatency *metric.Histogram + DistSQLExecLatency metric.IHistogram + SQLExecLatency metric.IHistogram + DistSQLServiceLatency metric.IHistogram + SQLServiceLatency metric.IHistogram + SQLTxnLatency metric.IHistogram SQLTxnsOpen *metric.Gauge SQLActiveStatements *metric.Gauge SQLContendedTxns *metric.Counter @@ -70,20 +70,20 @@ func (EngineMetrics) MetricStruct() {} // StatsMetrics groups metrics related to SQL Stats collection. type StatsMetrics struct { - SQLStatsMemoryMaxBytesHist *metric.Histogram + SQLStatsMemoryMaxBytesHist metric.IHistogram SQLStatsMemoryCurBytesCount *metric.Gauge - ReportedSQLStatsMemoryMaxBytesHist *metric.Histogram + ReportedSQLStatsMemoryMaxBytesHist metric.IHistogram ReportedSQLStatsMemoryCurBytesCount *metric.Gauge DiscardedStatsCount *metric.Counter SQLStatsFlushStarted *metric.Counter SQLStatsFlushFailure *metric.Counter - SQLStatsFlushDuration *metric.Histogram + SQLStatsFlushDuration metric.IHistogram SQLStatsRemovedRows *metric.Counter - SQLTxnStatsCollectionOverhead *metric.Histogram + SQLTxnStatsCollectionOverhead metric.IHistogram } // StatsMetrics is part of the metric.Struct interface. diff --git a/pkg/sql/mem_metrics.go b/pkg/sql/mem_metrics.go index db2198d6bfd8..248aff4ddc88 100644 --- a/pkg/sql/mem_metrics.go +++ b/pkg/sql/mem_metrics.go @@ -19,7 +19,7 @@ import ( // BaseMemoryMetrics contains a max histogram and a current count of the // bytes allocated by a sql endpoint. type BaseMemoryMetrics struct { - MaxBytesHist *metric.Histogram + MaxBytesHist metric.IHistogram CurBytesCount *metric.Gauge } @@ -30,9 +30,9 @@ type BaseMemoryMetrics struct { // - "internal" for activities related to leases, schema changes, etc. type MemoryMetrics struct { BaseMemoryMetrics - TxnMaxBytesHist *metric.Histogram + TxnMaxBytesHist metric.IHistogram TxnCurBytesCount *metric.Gauge - SessionMaxBytesHist *metric.Histogram + SessionMaxBytesHist metric.IHistogram SessionCurBytesCount *metric.Gauge } @@ -41,6 +41,22 @@ func (MemoryMetrics) MetricStruct() {} var _ metric.Struct = MemoryMetrics{} +// TODO(knz): Until #10014 is addressed, the UI graphs don't have a +// log scale on the Y axis and the histograms are thus displayed using +// a manual log scale: we store the logarithm in the value in the DB +// and plot that logarithm in the UI. +// +// We could, but do not, store the full value in the DB and compute +// the log in the UI, because the current histogram implementation +// does not deal well with large maxima (#10015). +// +// Since the DB stores an integer, we scale the values by 1000 so that +// a modicum of precision is restored when exponentiating the value. +// + +// log10int64times1000 = log10(math.MaxInt64) * 1000, rounded up somewhat +const log10int64times1000 = 19 * 1000 + func makeMemMetricMetadata(name, help string) metric.Metadata { return metric.Metadata{ Name: name, @@ -57,7 +73,13 @@ func MakeBaseMemMetrics(endpoint string, histogramWindow time.Duration) BaseMemo MetaMemMaxBytes := makeMemMetricMetadata(prefix+".max", "Memory usage per sql statement for "+endpoint) MetaMemCurBytes := makeMemMetricMetadata(prefix+".current", "Current sql statement memory usage for "+endpoint) return BaseMemoryMetrics{ - MaxBytesHist: metric.NewHistogram(MetaMemMaxBytes, histogramWindow, metric.MemoryUsage64MBBuckets), + MaxBytesHist: metric.NewHistogram(metric.HistogramOptions{ + Metadata: MetaMemMaxBytes, + Duration: histogramWindow, + MaxVal: log10int64times1000, + SigFigs: 3, + Buckets: metric.MemoryUsage64MBBuckets, + }), CurBytesCount: metric.NewGauge(MetaMemCurBytes), } } @@ -71,10 +93,20 @@ func MakeMemMetrics(endpoint string, histogramWindow time.Duration) MemoryMetric MetaMemMaxSessionBytes := makeMemMetricMetadata(prefix+".session.max", "Memory usage per sql session for "+endpoint) MetaMemSessionCurBytes := makeMemMetricMetadata(prefix+".session.current", "Current sql session memory usage for "+endpoint) return MemoryMetrics{ - BaseMemoryMetrics: base, - TxnMaxBytesHist: metric.NewHistogram(MetaMemMaxTxnBytes, histogramWindow, metric.MemoryUsage64MBBuckets), - TxnCurBytesCount: metric.NewGauge(MetaMemTxnCurBytes), - SessionMaxBytesHist: metric.NewHistogram(MetaMemMaxSessionBytes, histogramWindow, metric.MemoryUsage64MBBuckets), + BaseMemoryMetrics: base, + TxnMaxBytesHist: metric.NewHistogram(metric.HistogramOptions{ + Metadata: MetaMemMaxTxnBytes, + Duration: histogramWindow, + MaxVal: log10int64times1000, + SigFigs: 3, + Buckets: metric.MemoryUsage64MBBuckets}), + TxnCurBytesCount: metric.NewGauge(MetaMemTxnCurBytes), + SessionMaxBytesHist: metric.NewHistogram(metric.HistogramOptions{ + Metadata: MetaMemMaxSessionBytes, + Duration: histogramWindow, + MaxVal: log10int64times1000, + SigFigs: 3, + Buckets: metric.MemoryUsage64MBBuckets}), SessionCurBytesCount: metric.NewGauge(MetaMemSessionCurBytes), } diff --git a/pkg/sql/pgwire/pre_serve.go b/pkg/sql/pgwire/pre_serve.go index e9d2a975c29e..b87b61ad406b 100644 --- a/pkg/sql/pgwire/pre_serve.go +++ b/pkg/sql/pgwire/pre_serve.go @@ -183,7 +183,7 @@ type tenantIndependentMetrics struct { PreServeBytesOutCount *metric.Counter PreServeConnFailures *metric.Counter PreServeNewConns *metric.Counter - PreServeMaxBytes *metric.Histogram + PreServeMaxBytes metric.IHistogram PreServeCurBytes *metric.Gauge } @@ -193,8 +193,13 @@ func makeTenantIndependentMetrics(histogramWindow time.Duration) tenantIndepende PreServeBytesOutCount: metric.NewCounter(MetaPreServeBytesOut), PreServeNewConns: metric.NewCounter(MetaPreServeNewConns), PreServeConnFailures: metric.NewCounter(MetaPreServeConnFailures), - PreServeMaxBytes: metric.NewHistogram(MetaPreServeMaxBytes, histogramWindow, metric.MemoryUsage64MBBuckets), - PreServeCurBytes: metric.NewGauge(MetaPreServeCurBytes), + PreServeMaxBytes: metric.NewHistogram(metric.HistogramOptions{ + Metadata: MetaPreServeMaxBytes, + Duration: histogramWindow, + Buckets: metric.MemoryUsage64MBBuckets, + Mode: metric.HistogramModePrometheus, + }), + PreServeCurBytes: metric.NewGauge(MetaPreServeCurBytes), } } diff --git a/pkg/sql/pgwire/server.go b/pkg/sql/pgwire/server.go index 1e07fc426b36..56d9a0b14ccc 100644 --- a/pkg/sql/pgwire/server.go +++ b/pkg/sql/pgwire/server.go @@ -256,7 +256,7 @@ type tenantSpecificMetrics struct { BytesOutCount *metric.Counter Conns *metric.Gauge NewConns *metric.Counter - ConnLatency *metric.Histogram + ConnLatency metric.IHistogram ConnFailures *metric.Counter PGWireCancelTotalCount *metric.Counter PGWireCancelIgnoredCount *metric.Counter @@ -273,9 +273,12 @@ func makeTenantSpecificMetrics( BytesOutCount: metric.NewCounter(MetaBytesOut), Conns: metric.NewGauge(MetaConns), NewConns: metric.NewCounter(MetaNewConns), - ConnLatency: metric.NewHistogram( - MetaConnLatency, histogramWindow, metric.IOLatencyBuckets, - ), + ConnLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: MetaConnLatency, + Duration: histogramWindow, + Buckets: metric.IOLatencyBuckets, + }), ConnFailures: metric.NewCounter(MetaConnFailures), PGWireCancelTotalCount: metric.NewCounter(MetaPGWireCancelTotal), PGWireCancelIgnoredCount: metric.NewCounter(MetaPGWireCancelIgnored), diff --git a/pkg/sql/sqlstats/persistedsqlstats/provider.go b/pkg/sql/sqlstats/persistedsqlstats/provider.go index f6581e68eac1..3a95761a8499 100644 --- a/pkg/sql/sqlstats/persistedsqlstats/provider.go +++ b/pkg/sql/sqlstats/persistedsqlstats/provider.go @@ -43,7 +43,7 @@ type Config struct { // Metrics. FlushCounter *metric.Counter - FlushDuration *metric.Histogram + FlushDuration metric.IHistogram FailureCounter *metric.Counter // Testing knobs. diff --git a/pkg/sql/sqlstats/sslocal/sql_stats.go b/pkg/sql/sqlstats/sslocal/sql_stats.go index 90c3350961eb..92fe6dcf3904 100644 --- a/pkg/sql/sqlstats/sslocal/sql_stats.go +++ b/pkg/sql/sqlstats/sslocal/sql_stats.go @@ -75,7 +75,7 @@ func newSQLStats( uniqueStmtFingerprintLimit *settings.IntSetting, uniqueTxnFingerprintLimit *settings.IntSetting, curMemBytesCount *metric.Gauge, - maxMemBytesHist *metric.Histogram, + maxMemBytesHist metric.IHistogram, insightsWriter insights.WriterProvider, parentMon *mon.BytesMonitor, flushTarget Sink, diff --git a/pkg/sql/sqlstats/sslocal/sslocal_provider.go b/pkg/sql/sqlstats/sslocal/sslocal_provider.go index b508533ada09..e375c9c00e27 100644 --- a/pkg/sql/sqlstats/sslocal/sslocal_provider.go +++ b/pkg/sql/sqlstats/sslocal/sslocal_provider.go @@ -35,7 +35,7 @@ func New( maxStmtFingerprints *settings.IntSetting, maxTxnFingerprints *settings.IntSetting, curMemoryBytesCount *metric.Gauge, - maxMemoryBytesHist *metric.Histogram, + maxMemoryBytesHist metric.IHistogram, insightsWriter insights.WriterProvider, pool *mon.BytesMonitor, reportingSink Sink, diff --git a/pkg/sql/ttl/ttljob/ttljob_metrics.go b/pkg/sql/ttl/ttljob/ttljob_metrics.go index d9b450a8d481..ae526f574226 100644 --- a/pkg/sql/ttl/ttljob/ttljob_metrics.go +++ b/pkg/sql/ttl/ttljob/ttljob_metrics.go @@ -96,41 +96,48 @@ func (m *RowLevelTTLAggMetrics) loadMetrics(labelMetrics bool, relation string) } func makeRowLevelTTLAggMetrics(histogramWindowInterval time.Duration) metric.Struct { + sigFigs := 2 b := aggmetric.MakeBuilder("relation") ret := &RowLevelTTLAggMetrics{ - SpanTotalDuration: b.Histogram( - metric.Metadata{ + SpanTotalDuration: b.Histogram(metric.HistogramOptions{ + Metadata: metric.Metadata{ Name: "jobs.row_level_ttl.span_total_duration", Help: "Duration for processing a span during row level TTL.", Measurement: "nanoseconds", Unit: metric.Unit_NANOSECONDS, MetricType: io_prometheus_client.MetricType_HISTOGRAM, }, - histogramWindowInterval, - metric.LongRunning60mLatencyBuckets, - ), - SelectDuration: b.Histogram( - metric.Metadata{ + MaxVal: time.Hour.Nanoseconds(), + SigFigs: sigFigs, + Duration: histogramWindowInterval, + Buckets: metric.LongRunning60mLatencyBuckets, + }), + SelectDuration: b.Histogram(metric.HistogramOptions{ + Metadata: metric.Metadata{ Name: "jobs.row_level_ttl.select_duration", Help: "Duration for select requests during row level TTL.", Measurement: "nanoseconds", Unit: metric.Unit_NANOSECONDS, MetricType: io_prometheus_client.MetricType_HISTOGRAM, }, - histogramWindowInterval, - metric.BatchProcessLatencyBuckets, - ), - DeleteDuration: b.Histogram( - metric.Metadata{ + MaxVal: time.Minute.Nanoseconds(), + SigFigs: sigFigs, + Duration: histogramWindowInterval, + Buckets: metric.BatchProcessLatencyBuckets, + }), + DeleteDuration: b.Histogram(metric.HistogramOptions{ + Metadata: metric.Metadata{ Name: "jobs.row_level_ttl.delete_duration", Help: "Duration for delete requests during row level TTL.", Measurement: "nanoseconds", Unit: metric.Unit_NANOSECONDS, MetricType: io_prometheus_client.MetricType_HISTOGRAM, }, - histogramWindowInterval, - metric.BatchProcessLatencyBuckets, - ), + MaxVal: time.Minute.Nanoseconds(), + SigFigs: sigFigs, + Duration: histogramWindowInterval, + Buckets: metric.BatchProcessLatencyBuckets, + }), RowSelections: b.Counter( metric.Metadata{ Name: "jobs.row_level_ttl.rows_selected", diff --git a/pkg/util/admission/work_queue.go b/pkg/util/admission/work_queue.go index be9b4fabc3d4..d6211bee01ab 100644 --- a/pkg/util/admission/work_queue.go +++ b/pkg/util/admission/work_queue.go @@ -1561,7 +1561,7 @@ type workQueueMetricsSingle struct { Requested *metric.Counter Admitted *metric.Counter Errored *metric.Counter - WaitDurations *metric.Histogram + WaitDurations metric.IHistogram WaitQueueLength *metric.Gauge } @@ -1622,9 +1622,12 @@ func makeWorkQueueMetricsSingle(name string) workQueueMetricsSingle { Requested: metric.NewCounter(addName(name, requestedMeta)), Admitted: metric.NewCounter(addName(name, admittedMeta)), Errored: metric.NewCounter(addName(name, erroredMeta)), - WaitDurations: metric.NewHistogram( - addName(name, waitDurationsMeta), base.DefaultHistogramWindowInterval(), metric.IOLatencyBuckets, - ), + WaitDurations: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: addName(name, waitDurationsMeta), + Duration: base.DefaultHistogramWindowInterval(), + Buckets: metric.IOLatencyBuckets, + }), WaitQueueLength: metric.NewGauge(addName(name, waitQueueLengthMeta)), } } diff --git a/pkg/util/metric/BUILD.bazel b/pkg/util/metric/BUILD.bazel index 7d0473c33b40..e94462787b40 100644 --- a/pkg/util/metric/BUILD.bazel +++ b/pkg/util/metric/BUILD.bazel @@ -8,6 +8,7 @@ go_library( srcs = [ "doc.go", "graphite_exporter.go", + "hdrhistogram.go", "histogram_buckets.go", "metric.go", "prometheus_exporter.go", @@ -22,10 +23,13 @@ go_library( importpath = "github.com/cockroachdb/cockroach/pkg/util/metric", visibility = ["//visibility:public"], deps = [ + "//pkg/util", + "//pkg/util/envutil", "//pkg/util/log", "//pkg/util/syncutil", "//pkg/util/timeutil", "@com_github_cockroachdb_errors//:errors", + "@com_github_codahale_hdrhistogram//:hdrhistogram", "@com_github_gogo_protobuf//proto", "@com_github_prometheus_client_golang//prometheus", "@com_github_prometheus_client_golang//prometheus/graphite", diff --git a/pkg/util/metric/aggmetric/BUILD.bazel b/pkg/util/metric/aggmetric/BUILD.bazel index daf77ba5d5ff..5984de7e5154 100644 --- a/pkg/util/metric/aggmetric/BUILD.bazel +++ b/pkg/util/metric/aggmetric/BUILD.bazel @@ -17,7 +17,6 @@ go_library( "@com_github_cockroachdb_errors//:errors", "@com_github_gogo_protobuf//proto", "@com_github_google_btree//:btree", - "@com_github_prometheus_client_golang//prometheus", "@com_github_prometheus_client_model//go", ], ) diff --git a/pkg/util/metric/aggmetric/agg_metric.go b/pkg/util/metric/aggmetric/agg_metric.go index c9afb965d64f..ab5ad03ce5b6 100644 --- a/pkg/util/metric/aggmetric/agg_metric.go +++ b/pkg/util/metric/aggmetric/agg_metric.go @@ -15,7 +15,6 @@ package aggmetric import ( "strings" - "time" "github.com/cockroachdb/cockroach/pkg/util/metric" "github.com/cockroachdb/cockroach/pkg/util/syncutil" @@ -50,10 +49,8 @@ func (b Builder) Counter(metadata metric.Metadata) *AggCounter { } // Histogram constructs a new AggHistogram with the Builder's labels. -func (b Builder) Histogram( - metadata metric.Metadata, duration time.Duration, buckets []float64, -) *AggHistogram { - return NewHistogram(metadata, duration, buckets, b.labels...) +func (b Builder) Histogram(opts metric.HistogramOptions) *AggHistogram { + return NewHistogram(opts, b.labels...) } type childSet struct { diff --git a/pkg/util/metric/aggmetric/agg_metric_test.go b/pkg/util/metric/aggmetric/agg_metric_test.go index 842a30504889..a139347d38db 100644 --- a/pkg/util/metric/aggmetric/agg_metric_test.go +++ b/pkg/util/metric/aggmetric/agg_metric_test.go @@ -13,6 +13,7 @@ package aggmetric_test import ( "bufio" "bytes" + "fmt" "sort" "strings" "testing" @@ -63,10 +64,15 @@ func TestAggMetric(t *testing.T) { Name: "baz_gauge", }, "tenant_id") r.AddMetric(f) - - h := aggmetric.NewHistogram(metric.Metadata{ - Name: "histo_gram", - }, base.DefaultHistogramWindowInterval(), metric.Count1KBuckets, "tenant_id") + h := aggmetric.NewHistogram(metric.HistogramOptions{ + Metadata: metric.Metadata{ + Name: "histo_gram", + }, + Duration: base.DefaultHistogramWindowInterval(), + MaxVal: 100, + SigFigs: 1, + Buckets: metric.Count1KBuckets, + }, "tenant_id") r.AddMetric(h) tenant2 := roachpb.MustMakeTenantID(2) @@ -87,18 +93,28 @@ func TestAggMetric(t *testing.T) { g3.Inc(3) g3.Dec(1) f2.Update(1.5) + fmt.Println(r) f3.Update(2.5) h2.RecordValue(10) h3.RecordValue(90) - echotest.Require(t, writePrometheusMetrics(t), datapathutils.TestDataPath(t, "basic.txt")) + testFile := "basic.txt" + if metric.HdrEnabled() { + testFile = "basic_hdr.txt" + } + echotest.Require(t, writePrometheusMetrics(t), datapathutils.TestDataPath(t, testFile)) }) t.Run("destroy", func(t *testing.T) { + fmt.Println(r) g3.Unlink() c2.Unlink() f3.Unlink() h3.Unlink() - echotest.Require(t, writePrometheusMetrics(t), datapathutils.TestDataPath(t, "destroy.txt")) + testFile := "destroy.txt" + if metric.HdrEnabled() { + testFile = "destroy_hdr.txt" + } + echotest.Require(t, writePrometheusMetrics(t), datapathutils.TestDataPath(t, testFile)) }) t.Run("panic on already exists", func(t *testing.T) { @@ -119,7 +135,11 @@ func TestAggMetric(t *testing.T) { c2 = c.AddChild(tenant2.String()) f3 = f.AddChild(tenant3.String()) h3 = h.AddChild(tenant3.String()) - echotest.Require(t, writePrometheusMetrics(t), datapathutils.TestDataPath(t, "add_after_destroy.txt")) + testFile := "add_after_destroy.txt" + if metric.HdrEnabled() { + testFile = "add_after_destroy_hdr.txt" + } + echotest.Require(t, writePrometheusMetrics(t), datapathutils.TestDataPath(t, testFile)) }) t.Run("panic on label length mismatch", func(t *testing.T) { @@ -135,8 +155,13 @@ func TestAggMetricBuilder(t *testing.T) { c := b.Counter(metric.Metadata{Name: "foo_counter"}) g := b.Gauge(metric.Metadata{Name: "bar_gauge"}) f := b.GaugeFloat64(metric.Metadata{Name: "baz_gauge"}) - h := b.Histogram(metric.Metadata{Name: "histo_gram"}, - base.DefaultHistogramWindowInterval(), metric.Count1KBuckets) + h := b.Histogram(metric.HistogramOptions{ + Metadata: metric.Metadata{Name: "histo_gram"}, + Duration: base.DefaultHistogramWindowInterval(), + MaxVal: 100, + SigFigs: 1, + Buckets: metric.Count1KBuckets, + }) for i := 5; i < 10; i++ { tenantLabel := roachpb.MustMakeTenantID(uint64(i)).String() diff --git a/pkg/util/metric/aggmetric/histogram.go b/pkg/util/metric/aggmetric/histogram.go index 68d30fab17a2..1c07cc1ef3d3 100644 --- a/pkg/util/metric/aggmetric/histogram.go +++ b/pkg/util/metric/aggmetric/histogram.go @@ -11,10 +11,7 @@ package aggmetric import ( - "time" - "github.com/cockroachdb/cockroach/pkg/util/metric" - "github.com/prometheus/client_golang/prometheus" io_prometheus_client "github.com/prometheus/client_model/go" ) @@ -23,8 +20,8 @@ import ( // children, while its children are additionally exported to prometheus via the // PrometheusIterable interface. type AggHistogram struct { - h metric.Histogram - create func() *metric.Histogram + h metric.IHistogram + create func() metric.IHistogram childSet } @@ -34,14 +31,12 @@ var _ metric.PrometheusExportable = (*AggHistogram)(nil) var _ metric.WindowedHistogram = (*AggHistogram)(nil) // NewHistogram constructs a new AggHistogram. -func NewHistogram( - metadata metric.Metadata, duration time.Duration, buckets []float64, childLabels ...string, -) *AggHistogram { - create := func() *metric.Histogram { - return metric.NewHistogram(metadata, duration, buckets) +func NewHistogram(opts metric.HistogramOptions, childLabels ...string) *AggHistogram { + create := func() metric.IHistogram { + return metric.NewHistogram(opts) } a := &AggHistogram{ - h: *create(), + h: create(), create: create, } a.init(childLabels) @@ -96,19 +91,13 @@ func (a *AggHistogram) ToPrometheusMetric() *io_prometheus_client.Metric { return a.h.ToPrometheusMetric() } -// Windowed returns a copy of the current windowed histogram data and its -// rotation interval. -func (a *AggHistogram) Windowed() prometheus.Histogram { - return a.h.Windowed() -} - // AddChild adds a Counter to this AggCounter. This method panics if a Counter // already exists for this set of labelVals. func (a *AggHistogram) AddChild(labelVals ...string) *Histogram { child := &Histogram{ parent: a, labelValuesSlice: labelValuesSlice(labelVals), - h: *a.create(), + h: a.create(), } a.add(child) return child @@ -121,7 +110,7 @@ func (a *AggHistogram) AddChild(labelVals ...string) *Histogram { type Histogram struct { parent *AggHistogram labelValuesSlice - h metric.Histogram + h metric.IHistogram } // ToPrometheusMetric constructs a prometheus metric for this Histogram. diff --git a/pkg/util/metric/aggmetric/testdata/add_after_destroy_hdr.txt b/pkg/util/metric/aggmetric/testdata/add_after_destroy_hdr.txt new file mode 100644 index 000000000000..ffcbd4571a33 --- /dev/null +++ b/pkg/util/metric/aggmetric/testdata/add_after_destroy_hdr.txt @@ -0,0 +1,23 @@ +echo +---- +bar_gauge 4 +bar_gauge{tenant_id="2"} 2 +bar_gauge{tenant_id="3"} 0 +baz_gauge 4 +baz_gauge{tenant_id="2"} 1.5 +baz_gauge{tenant_id="3"} 0 +foo_counter 6 +foo_counter{tenant_id="2"} 0 +foo_counter{tenant_id="3"} 4 +histo_gram_bucket{le="+Inf"} 2 +histo_gram_bucket{le="10"} 1 +histo_gram_bucket{le="91"} 2 +histo_gram_bucket{tenant_id="2",le="+Inf"} 1 +histo_gram_bucket{tenant_id="2",le="10"} 1 +histo_gram_bucket{tenant_id="3",le="+Inf"} 0 +histo_gram_count 2 +histo_gram_count{tenant_id="2"} 1 +histo_gram_count{tenant_id="3"} 0 +histo_gram_sum 101 +histo_gram_sum{tenant_id="2"} 10 +histo_gram_sum{tenant_id="3"} 0 \ No newline at end of file diff --git a/pkg/util/metric/aggmetric/testdata/basic_hdr.txt b/pkg/util/metric/aggmetric/testdata/basic_hdr.txt new file mode 100644 index 000000000000..a796b8ef3406 --- /dev/null +++ b/pkg/util/metric/aggmetric/testdata/basic_hdr.txt @@ -0,0 +1,24 @@ +echo +---- +bar_gauge 4 +bar_gauge{tenant_id="2"} 2 +bar_gauge{tenant_id="3"} 2 +baz_gauge 4 +baz_gauge{tenant_id="2"} 1.5 +baz_gauge{tenant_id="3"} 2.5 +foo_counter 6 +foo_counter{tenant_id="2"} 2 +foo_counter{tenant_id="3"} 4 +histo_gram_bucket{le="+Inf"} 2 +histo_gram_bucket{le="10"} 1 +histo_gram_bucket{le="91"} 2 +histo_gram_bucket{tenant_id="2",le="+Inf"} 1 +histo_gram_bucket{tenant_id="2",le="10"} 1 +histo_gram_bucket{tenant_id="3",le="+Inf"} 1 +histo_gram_bucket{tenant_id="3",le="91"} 1 +histo_gram_count 2 +histo_gram_count{tenant_id="2"} 1 +histo_gram_count{tenant_id="3"} 1 +histo_gram_sum 101 +histo_gram_sum{tenant_id="2"} 10 +histo_gram_sum{tenant_id="3"} 91 \ No newline at end of file diff --git a/pkg/util/metric/aggmetric/testdata/destroy_hdr.txt b/pkg/util/metric/aggmetric/testdata/destroy_hdr.txt new file mode 100644 index 000000000000..dd17b7aae5bb --- /dev/null +++ b/pkg/util/metric/aggmetric/testdata/destroy_hdr.txt @@ -0,0 +1,17 @@ +echo +---- +bar_gauge 4 +bar_gauge{tenant_id="2"} 2 +baz_gauge 4 +baz_gauge{tenant_id="2"} 1.5 +foo_counter 6 +foo_counter{tenant_id="3"} 4 +histo_gram_bucket{le="+Inf"} 2 +histo_gram_bucket{le="10"} 1 +histo_gram_bucket{le="91"} 2 +histo_gram_bucket{tenant_id="2",le="+Inf"} 1 +histo_gram_bucket{tenant_id="2",le="10"} 1 +histo_gram_count 2 +histo_gram_count{tenant_id="2"} 1 +histo_gram_sum 101 +histo_gram_sum{tenant_id="2"} 10 \ No newline at end of file diff --git a/pkg/util/metric/hdrhistogram.go b/pkg/util/metric/hdrhistogram.go new file mode 100644 index 000000000000..e33582b9c2e3 --- /dev/null +++ b/pkg/util/metric/hdrhistogram.go @@ -0,0 +1,248 @@ +// Copyright 2023 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package metric + +import ( + "time" + + "github.com/cockroachdb/cockroach/pkg/util/syncutil" + "github.com/codahale/hdrhistogram" + prometheusgo "github.com/prometheus/client_model/go" +) + +const ( + // HdrHistogramMaxLatency is the maximum value tracked in latency histograms. Higher + // values will be recorded as this value instead. + HdrHistogramMaxLatency = 10 * time.Second + + // The number of histograms to keep in rolling window. + hdrHistogramHistWrapNum = 2 // TestSampleInterval is passed to histograms during tests which don't +) + +// A HdrHistogram collects observed values by keeping bucketed counts. For +// convenience, internally two sets of buckets are kept: A cumulative set (i.e. +// data is never evicted) and a windowed set (which keeps only recently +// collected samples). +// +// Top-level methods generally apply to the cumulative buckets; the windowed +// variant is exposed through the Windowed method. +// +// TODO(#96357): remove HdrHistogram model entirely once the Prometheus +// backed histogram and its bucket boundaries have been reliably proven in +// production. +type HdrHistogram struct { + Metadata + maxVal int64 + mu struct { + syncutil.Mutex + cumulative *hdrhistogram.Histogram + *tickHelper + sliding *hdrhistogram.WindowedHistogram + } +} + +var _ IHistogram = &HdrHistogram{} +var _ PrometheusExportable = &HdrHistogram{} +var _ Iterable = &HdrHistogram{} + +// NewHdrHistogram initializes a given Histogram. The contained windowed histogram +// rotates every 'duration'; both the windowed and the cumulative histogram +// track nonnegative values up to 'maxVal' with 'sigFigs' decimal points of +// precision. +func NewHdrHistogram( + metadata Metadata, duration time.Duration, maxVal int64, sigFigs int, +) *HdrHistogram { + h := &HdrHistogram{ + Metadata: metadata, + maxVal: maxVal, + } + wHist := hdrhistogram.NewWindowed(hdrHistogramHistWrapNum, 0, maxVal, sigFigs) + h.mu.cumulative = hdrhistogram.New(0, maxVal, sigFigs) + h.mu.sliding = wHist + h.mu.tickHelper = &tickHelper{ + nextT: now(), + tickInterval: duration / hdrHistogramHistWrapNum, + onTick: func() { + wHist.Rotate() + }, + } + return h +} + +// NewHdrLatency is a convenience function which returns a histogram with +// suitable defaults for latency tracking. Values are expressed in ns, +// are truncated into the interval [0, HdrHistogramMaxLatency] and are recorded +// with one digit of precision (i.e. errors of <10ms at 100ms, <6s at 60s). +// +// The windowed portion of the Histogram retains values for approximately +// histogramWindow. +func NewHdrLatency(metadata Metadata, histogramWindow time.Duration) *HdrHistogram { + return NewHdrHistogram( + metadata, histogramWindow, HdrHistogramMaxLatency.Nanoseconds(), 1, + ) +} + +// RecordValue adds the given value to the histogram. Recording a value in +// excess of the configured maximum value for that histogram results in +// recording the maximum value instead. +func (h *HdrHistogram) RecordValue(v int64) { + h.mu.Lock() + defer h.mu.Unlock() + + if h.mu.sliding.Current.RecordValue(v) != nil { + _ = h.mu.sliding.Current.RecordValue(h.maxVal) + } + if h.mu.cumulative.RecordValue(v) != nil { + _ = h.mu.cumulative.RecordValue(h.maxVal) + } +} + +// TotalCount returns the (cumulative) number of samples. +func (h *HdrHistogram) TotalCount() int64 { + h.mu.Lock() + defer h.mu.Unlock() + return h.mu.cumulative.TotalCount() +} + +// Min returns the minimum. +func (h *HdrHistogram) Min() int64 { + h.mu.Lock() + defer h.mu.Unlock() + return h.mu.cumulative.Min() +} + +// Inspect calls the closure with the empty string and the receiver. +func (h *HdrHistogram) Inspect(f func(interface{})) { + h.mu.Lock() + maybeTick(h.mu.tickHelper) + h.mu.Unlock() + f(h) +} + +// GetType returns the prometheus type enum for this metric. +func (h *HdrHistogram) GetType() *prometheusgo.MetricType { + return prometheusgo.MetricType_HISTOGRAM.Enum() +} + +// ToPrometheusMetric returns a filled-in prometheus metric of the right type. +func (h *HdrHistogram) ToPrometheusMetric() *prometheusgo.Metric { + hist := &prometheusgo.Histogram{} + + h.mu.Lock() + maybeTick(h.mu.tickHelper) + bars := h.mu.cumulative.Distribution() + hist.Bucket = make([]*prometheusgo.Bucket, 0, len(bars)) + + var cumCount uint64 + var sum float64 + for _, bar := range bars { + if bar.Count == 0 { + // No need to expose trivial buckets. + continue + } + upperBound := float64(bar.To) + sum += upperBound * float64(bar.Count) + + cumCount += uint64(bar.Count) + curCumCount := cumCount // need a new alloc thanks to bad proto code + + hist.Bucket = append(hist.Bucket, &prometheusgo.Bucket{ + CumulativeCount: &curCumCount, + UpperBound: &upperBound, + }) + } + hist.SampleCount = &cumCount + hist.SampleSum = &sum // can do better here; we approximate in the loop + h.mu.Unlock() + + return &prometheusgo.Metric{ + Histogram: hist, + } +} + +// TotalCountWindowed implements the WindowedHistogram interface. +func (h *HdrHistogram) TotalCountWindowed() int64 { + return int64(h.ToPrometheusMetricWindowed().Histogram.GetSampleCount()) +} + +// TotalSumWindowed implements the WindowedHistogram interface. +func (h *HdrHistogram) TotalSumWindowed() float64 { + return h.ToPrometheusMetricWindowed().Histogram.GetSampleSum() +} + +func (h *HdrHistogram) toPrometheusMetricWindowedLocked() *prometheusgo.Metric { + hist := &prometheusgo.Histogram{} + + maybeTick(h.mu.tickHelper) + bars := h.mu.sliding.Current.Distribution() + hist.Bucket = make([]*prometheusgo.Bucket, 0, len(bars)) + + var cumCount uint64 + var sum float64 + for _, bar := range bars { + if bar.Count == 0 { + // No need to expose trivial buckets. + continue + } + upperBound := float64(bar.To) + sum += upperBound * float64(bar.Count) + + cumCount += uint64(bar.Count) + curCumCount := cumCount // need a new alloc thanks to bad proto code + + hist.Bucket = append(hist.Bucket, &prometheusgo.Bucket{ + CumulativeCount: &curCumCount, + UpperBound: &upperBound, + }) + } + hist.SampleCount = &cumCount + hist.SampleSum = &sum // can do better here; we approximate in the loop + + return &prometheusgo.Metric{ + Histogram: hist, + } +} + +// ToPrometheusMetricWindowed returns a filled-in prometheus metric of the right type. +func (h *HdrHistogram) ToPrometheusMetricWindowed() *prometheusgo.Metric { + h.mu.Lock() + defer h.mu.Unlock() + return h.toPrometheusMetricWindowedLocked() +} + +// GetMetadata returns the metric's metadata including the Prometheus +// MetricType. +func (h *HdrHistogram) GetMetadata() Metadata { + baseMetadata := h.Metadata + baseMetadata.MetricType = prometheusgo.MetricType_HISTOGRAM + return baseMetadata +} + +func (h *HdrHistogram) ValueAtQuantileWindowed(q float64) float64 { + h.mu.Lock() + defer h.mu.Unlock() + + return ValueAtQuantileWindowed(h.toPrometheusMetricWindowedLocked().Histogram, q) +} + +func (h *HdrHistogram) Mean() float64 { + h.mu.Lock() + defer h.mu.Unlock() + + return h.mu.cumulative.Mean() +} + +func (h *HdrHistogram) TotalSum() float64 { + h.mu.Lock() + defer h.mu.Unlock() + + return h.ToPrometheusMetric().GetSummary().GetSampleSum() +} diff --git a/pkg/util/metric/metric.go b/pkg/util/metric/metric.go index a6b794422e4b..afd19cf83638 100644 --- a/pkg/util/metric/metric.go +++ b/pkg/util/metric/metric.go @@ -17,6 +17,8 @@ import ( "sync/atomic" "time" + "github.com/cockroachdb/cockroach/pkg/util" + "github.com/cockroachdb/cockroach/pkg/util/envutil" "github.com/cockroachdb/cockroach/pkg/util/syncutil" "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/gogo/protobuf/proto" @@ -25,11 +27,9 @@ import ( "github.com/rcrowley/go-metrics" ) -const ( - // TestSampleInterval is passed to histograms during tests which don't - // want to concern themselves with supplying a "correct" interval. - TestSampleInterval = time.Duration(math.MaxInt64) -) +// TestSampleInterval is passed to histograms during tests which don't +// want to concern themselves with supplying a "correct" interval. +const TestSampleInterval = time.Duration(math.MaxInt64) // Iterable provides a method for synchronized access to interior objects. type Iterable interface { @@ -176,10 +176,81 @@ func maybeTick(m periodic) { } } +// useHdrHistogramsEnvVar can be used to switch all histograms to use the +// legacy HDR histograms (except for those that explicitly force the use +// of the newer Prometheus via HistogramModePrometheus). HDR Histograms +// dynamically generate bucket boundaries, which can lead to hundreds of +// buckets. This can cause performance issues with timeseries databases +// like Prometheus. +const useHdrHistogramsEnvVar = "COCKROACH_ENABLE_HDR_HISTOGRAMS" + +var hdrEnabled = util.ConstantWithMetamorphicTestBool(useHdrHistogramsEnvVar, envutil.EnvOrDefaultBool(useHdrHistogramsEnvVar, false)) + +// HdrEnabled returns whether or not the HdrHistogram model is enabled +// in the metric package. Primarily useful in tests where we want to validate +// different outputs depending on whether or not HDR is enabled. +func HdrEnabled() bool { + return hdrEnabled +} + +type HistogramMode byte + +const ( + // HistogramModePrometheus will force the constructed histogram to use + // the Prometheus histogram model, regardless of the value of + // useHdrHistogramsEnvVar. This option should be used for all + // newly defined histograms moving forward. + // + // NB: If neither this mode nor the HistogramModePreferHdrLatency mode + // is set, MaxVal and SigFigs must be defined to maintain backwards + // compatibility with the legacy HdrHistogram model. + HistogramModePrometheus HistogramMode = iota + 1 + // HistogramModePreferHdrLatency will cause the returned histogram to + // use the HdrHistgoram model and be configured with suitable defaults + // for latency tracking iff useHdrHistogramsEnvVar is enabled. + // + // NB: If this option is set, no MaxVal or SigFigs are required in the + // HistogramOptions to maintain backwards compatibility with the legacy + // HdrHistogram model, since suitable defaults are used for both. + HistogramModePreferHdrLatency +) + +type HistogramOptions struct { + // Metadata is the metric Metadata associated with the histogram. + Metadata Metadata + // Duration is the histogram's window duration. + Duration time.Duration + // MaxVal is only relevant to the HdrHistogram, and represents the + // highest trackable value in the resulting histogram buckets. + MaxVal int64 + // SigFigs is only relevant to the HdrHistogram, and represents + // the number of significant figures to be used to determine the + // degree of accuracy used in measurements. + SigFigs int + // Buckets are only relevant to Prometheus histograms, and represent + // the pre-defined histogram bucket boundaries to be used. + Buckets []float64 + // Mode defines the type of histogram to be used. See individual + // comments on each HistogramMode value for details. + Mode HistogramMode +} + +func NewHistogram(opt HistogramOptions) IHistogram { + if hdrEnabled && opt.Mode != HistogramModePrometheus { + if opt.Mode == HistogramModePreferHdrLatency { + return NewHdrLatency(opt.Metadata, opt.Duration) + } else { + return NewHdrHistogram(opt.Metadata, opt.Duration, opt.MaxVal, opt.SigFigs) + } + } else { + return newHistogram(opt.Metadata, opt.Duration, opt.Buckets) + } +} + // NewHistogram is a prometheus-backed histogram. Depending on the value of // opts.Buckets, this is suitable for recording any kind of quantity. Common // sensible choices are {IO,Network}LatencyBuckets. -func NewHistogram(meta Metadata, windowDuration time.Duration, buckets []float64) *Histogram { +func newHistogram(meta Metadata, windowDuration time.Duration, buckets []float64) *Histogram { // TODO(obs-inf): prometheus supports labeled histograms but they require more // plumbing and don't fit into the PrometheusObservable interface any more. opts := prometheus.HistogramOpts{ @@ -236,6 +307,21 @@ type Histogram struct { } } +type IHistogram interface { + Iterable + PrometheusExportable + WindowedHistogram + + RecordValue(n int64) + TotalCount() int64 + TotalSum() float64 + TotalCountWindowed() int64 + TotalSumWindowed() float64 + Mean() float64 +} + +var _ IHistogram = &Histogram{} + func (h *Histogram) nextTick() time.Time { h.windowed.RLock() defer h.windowed.RUnlock() @@ -326,7 +412,8 @@ func (h *Histogram) TotalSumWindowed() float64 { // Mean returns the (cumulative) mean of samples. func (h *Histogram) Mean() float64 { - return h.TotalSum() / float64(h.TotalCount()) + pm := h.ToPrometheusMetric() + return pm.Histogram.GetSampleSum() / float64(pm.Histogram.GetSampleCount()) } // ValueAtQuantileWindowed implements the WindowedHistogram interface. diff --git a/pkg/util/metric/metric_ext_test.go b/pkg/util/metric/metric_ext_test.go index a06cdfc0cb63..60c14ba25bc5 100644 --- a/pkg/util/metric/metric_ext_test.go +++ b/pkg/util/metric/metric_ext_test.go @@ -25,7 +25,12 @@ func TestHistogramPrometheus(t *testing.T) { // Regression test against https://github.com/cockroachdb/cockroach/pull/88331. // The output includes buckets for which the upper bound equals the previous // bucket's upper bound. - h := metric.NewHistogram(metric.Metadata{}, time.Second, []float64{1, 2, 3, 4, 5, 6, 10, 20, 30}) + h := metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePrometheus, + Metadata: metric.Metadata{}, + Duration: time.Second, + Buckets: []float64{1, 2, 3, 4, 5, 6, 10, 20, 30}, + }) h.RecordValue(1) h.RecordValue(5) h.RecordValue(5) diff --git a/pkg/util/metric/metric_test.go b/pkg/util/metric/metric_test.go index 2a273bc7fa85..d8a451ec06ad 100644 --- a/pkg/util/metric/metric_test.go +++ b/pkg/util/metric/metric_test.go @@ -112,17 +112,18 @@ func TestHistogram(t *testing.T) { return &n } - h := NewHistogram( - Metadata{}, - time.Hour, - []float64{ + h := NewHistogram(HistogramOptions{ + Mode: HistogramModePrometheus, + Metadata: Metadata{}, + Duration: time.Hour, + Buckets: []float64{ 1.0, 5.0, 10.0, 25.0, 100.0, }, - ) + }) // should return 0 if no observations are made require.Equal(t, 0.0, h.ValueAtQuantileWindowed(0)) @@ -236,23 +237,24 @@ func TestNewHistogramRotate(t *testing.T) { defer TestingSetNow(nil)() setNow(0) - h := NewHistogram(emptyMetadata, 10*time.Second, nil) + h := NewHistogram(HistogramOptions{ + Mode: HistogramModePrometheus, + Metadata: emptyMetadata, + Duration: 10 * time.Second, + Buckets: nil, + }) for i := 0; i < 4; i++ { // Windowed histogram is initially empty. h.Inspect(func(interface{}) {}) // triggers ticking - var m prometheusgo.Metric - require.NoError(t, h.Windowed().Write(&m)) - require.Zero(t, *m.Histogram.SampleSum) + require.Zero(t, h.TotalSumWindowed()) // But cumulative histogram has history (if i > 0). - require.EqualValues(t, i, *h.ToPrometheusMetric().Histogram.SampleCount) + require.EqualValues(t, i, h.TotalCount()) // Add a measurement and verify it's there. { h.RecordValue(12345) f := float64(12345) - var m prometheusgo.Metric - require.NoError(t, h.Windowed().Write(&m)) - require.Equal(t, *m.Histogram.SampleSum, f) + require.Equal(t, h.TotalSumWindowed(), f) } // Tick. This rotates the histogram. setNow(time.Duration(i+1) * 10 * time.Second) diff --git a/pkg/util/metric/registry_test.go b/pkg/util/metric/registry_test.go index 5d2b2a6c4e88..48f4aba216bd 100644 --- a/pkg/util/metric/registry_test.go +++ b/pkg/util/metric/registry_test.go @@ -76,14 +76,19 @@ func TestRegistry(t *testing.T) { topCounter := NewCounter(Metadata{Name: "top.counter"}) r.AddMetric(topCounter) - r.AddMetric(NewHistogram(Metadata{Name: "top.histogram"}, time.Minute, Count1KBuckets)) + r.AddMetric(NewHistogram(HistogramOptions{ + Mode: HistogramModePrometheus, + Metadata: Metadata{Name: "top.histogram"}, + Duration: time.Minute, + Buckets: Count1KBuckets, + })) r.AddMetric(NewGauge(Metadata{Name: "bottom.gauge"})) ms := &struct { StructGauge *Gauge StructGauge64 *GaugeFloat64 StructCounter *Counter - StructHistogram *Histogram + StructHistogram IHistogram NestedStructGauge NestedStruct ArrayStructCounters [4]*Counter // Ensure that nil struct values in arrays are safe. @@ -92,7 +97,7 @@ func TestRegistry(t *testing.T) { privateStructGauge *Gauge privateStructGauge64 *GaugeFloat64 privateStructCounter *Counter - privateStructHistogram *Histogram + privateStructHistogram IHistogram privateNestedStructGauge NestedStruct privateArrayStructCounters [2]*Counter NotAMetric int @@ -100,10 +105,15 @@ func TestRegistry(t *testing.T) { ReallyNotAMetric *Registry DefinitelyNotAnArrayOfMetrics [2]int }{ - StructGauge: NewGauge(Metadata{Name: "struct.gauge"}), - StructGauge64: NewGaugeFloat64(Metadata{Name: "struct.gauge64"}), - StructCounter: NewCounter(Metadata{Name: "struct.counter"}), - StructHistogram: NewHistogram(Metadata{Name: "struct.histogram"}, time.Minute, Count1KBuckets), + StructGauge: NewGauge(Metadata{Name: "struct.gauge"}), + StructGauge64: NewGaugeFloat64(Metadata{Name: "struct.gauge64"}), + StructCounter: NewCounter(Metadata{Name: "struct.counter"}), + StructHistogram: NewHistogram(HistogramOptions{ + Mode: HistogramModePrometheus, + Metadata: Metadata{Name: "struct.histogram"}, + Duration: time.Minute, + Buckets: Count1KBuckets, + }), NestedStructGauge: NestedStruct{ NestedStructGauge: NewGauge(Metadata{Name: "nested.struct.gauge"}), }, @@ -119,10 +129,15 @@ func TestRegistry(t *testing.T) { NestedStructGauge: NewGauge(Metadata{Name: "nested.struct.array.1.gauge"}), }, }, - privateStructGauge: NewGauge(Metadata{Name: "private.struct.gauge"}), - privateStructGauge64: NewGaugeFloat64(Metadata{Name: "private.struct.gauge64"}), - privateStructCounter: NewCounter(Metadata{Name: "private.struct.counter"}), - privateStructHistogram: NewHistogram(Metadata{Name: "private.struct.histogram"}, time.Minute, Count1KBuckets), + privateStructGauge: NewGauge(Metadata{Name: "private.struct.gauge"}), + privateStructGauge64: NewGaugeFloat64(Metadata{Name: "private.struct.gauge64"}), + privateStructCounter: NewCounter(Metadata{Name: "private.struct.counter"}), + privateStructHistogram: NewHistogram(HistogramOptions{ + Mode: HistogramModePrometheus, + Metadata: Metadata{Name: "private.struct.histogram"}, + Duration: time.Minute, + Buckets: Count1KBuckets, + }), privateNestedStructGauge: NestedStruct{ NestedStructGauge: NewGauge(Metadata{Name: "private.nested.struct.gauge"}), }, diff --git a/pkg/util/mon/bytes_usage.go b/pkg/util/mon/bytes_usage.go index 4ea0489afa20..5ba2493730dd 100644 --- a/pkg/util/mon/bytes_usage.go +++ b/pkg/util/mon/bytes_usage.go @@ -195,7 +195,7 @@ type BytesMonitor struct { // maxBytesHist is the metric object used to track the high watermark of bytes // allocated by the monitor during its lifetime. - maxBytesHist *metric.Histogram + maxBytesHist metric.IHistogram } // name identifies this monitor in logging messages. @@ -273,7 +273,7 @@ func NewMonitor( name redact.RedactableString, res Resource, curCount *metric.Gauge, - maxHist *metric.Histogram, + maxHist metric.IHistogram, increment int64, noteworthy int64, settings *cluster.Settings, @@ -289,7 +289,7 @@ func NewMonitorWithLimit( res Resource, limit int64, curCount *metric.Gauge, - maxHist *metric.Histogram, + maxHist metric.IHistogram, increment int64, noteworthy int64, settings *cluster.Settings, @@ -386,7 +386,7 @@ func NewUnlimitedMonitor( name redact.RedactableString, res Resource, curCount *metric.Gauge, - maxHist *metric.Histogram, + maxHist metric.IHistogram, noteworthy int64, settings *cluster.Settings, ) *BytesMonitor { @@ -485,7 +485,7 @@ func (mm *BytesMonitor) AllocBytes() int64 { } // SetMetrics sets the metric objects for the monitor. -func (mm *BytesMonitor) SetMetrics(curCount *metric.Gauge, maxHist *metric.Histogram) { +func (mm *BytesMonitor) SetMetrics(curCount *metric.Gauge, maxHist metric.IHistogram) { mm.mu.Lock() defer mm.mu.Unlock() mm.mu.curBytesCount = curCount diff --git a/pkg/util/schedulerlatency/scheduler_latency_test.go b/pkg/util/schedulerlatency/scheduler_latency_test.go index 1ec15ed46987..2539e6f62adb 100644 --- a/pkg/util/schedulerlatency/scheduler_latency_test.go +++ b/pkg/util/schedulerlatency/scheduler_latency_test.go @@ -170,7 +170,12 @@ func TestComputeSchedulerPercentileAgainstPrometheus(t *testing.T) { } // Compare values against metric.Histogram (prometheus-based implementation) - promhist := metric.NewHistogram(metric.Metadata{}, time.Hour, hist.Buckets) + promhist := metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePrometheus, + Metadata: metric.Metadata{}, + Duration: time.Hour, + Buckets: hist.Buckets, + }) for i := 0; i < len(hist.Counts); i++ { for j := 0; j < int(hist.Counts[i]); j++ { // Since the scheduler buckets are non-inclusive of Upper Bound and prometheus From 4b32a98cbdd44243297847e2291f731554f2fe1d Mon Sep 17 00:00:00 2001 From: Alex Barganier Date: Mon, 30 Jan 2023 16:23:56 -0400 Subject: [PATCH 2/2] pkg/util/metric: increase bucket counts for Prometheus histograms This patch increases the fidelity of the histogram buckets for the new Prometheus model. This is primarily done by increasing the bucket counts for all latency buckets, but may also be manually tweaked according to feedback from various engineering teams for their own use cases. Release note (ops change): Prometheus histograms will now export more buckets across the board to improve precision & fidelity of information reported by histogram metrics, such as quantiles. This will lead to an increase in storage requirements to process these histogram metrics in downstream systems like Prometheus, but should still be a marked improvement when compared to the legacy HdrHistogram model. If users have issues with the precision of these bucket boundaries, they can set the environment variable `COCKROACH_ENABLE_HDR_HISTOGRAMS=true` to revert to using the legacy HdrHistogram model instead, although this is not recommended otherwise as the HdrHistogram strains systems like Prometheus with excessive numbers of histogram buckets. Note that HdrHistograms are slated for full deprecation in upcoming releases. --- pkg/util/metric/histogram_buckets.go | 294 +++++++++++++++++----- pkg/util/metric/histogram_buckets_test.go | 8 +- 2 files changed, 241 insertions(+), 61 deletions(-) diff --git a/pkg/util/metric/histogram_buckets.go b/pkg/util/metric/histogram_buckets.go index 8ce3c9dce884..bf9f70579afb 100644 --- a/pkg/util/metric/histogram_buckets.go +++ b/pkg/util/metric/histogram_buckets.go @@ -17,20 +17,65 @@ package metric var IOLatencyBuckets = []float64{ // Generated via TestHistogramBuckets/IOLatencyBuckets. 10000.000000, // 10µs - 26826.957953, // 26.826µs - 71968.567300, // 71.968µs - 193069.772888, // 193.069µs - 517947.467923, // 517.947µs - 1389495.494373, // 1.389495ms - 3727593.720315, // 3.727593ms - 10000000.000000, // 9.999999ms - 26826957.952797, // 26.826957ms - 71968567.300115, // 71.968567ms - 193069772.888325, // 193.069772ms - 517947467.923120, // 517.947467ms - 1389495494.373135, // 1.389495494s - 3727593720.314933, // 3.72759372s - 9999999999.999981, // 9.999999999s + 12638.482029, // 12.638µs + 15973.122801, // 15.973µs + 20187.602547, // 20.187µs + 25514.065200, // 25.514µs + 32245.905453, // 32.245µs + 40753.929659, // 40.753µs + 51506.780762, // 51.506µs + 65096.752305, // 65.096µs + 82272.413417, // 82.272µs + 103979.841848, // 103.979µs + 131414.736261, // 131.414µs + 166088.278263, // 166.088µs + 209910.372011, // 209.91µs + 265294.846443, // 265.294µs + 335292.414925, // 335.292µs + 423758.716060, // 423.758µs + 535566.691771, // 535.566µs + 676875.000946, // 676.875µs + 855467.253557, // 855.467µs + 1081180.751077, // 1.08118ms + 1366448.349295, // 1.366448ms + 1726983.290659, // 1.726983ms + 2182644.728397, // 2.182644ms + 2758531.617629, // 2.758531ms + 3486365.227678, // 3.486365ms + 4406236.427774, // 4.406236ms + 5568813.990945, // 5.568813ms + 7038135.554932, // 7.038135ms + 8895134.973108, // 8.895134ms + 11242100.350621, // 11.2421ms + 14208308.325339, // 14.208308ms + 17957144.943716, // 17.957144ms + 22695105.366947, // 22.695105ms + 28683168.133420, // 28.683168ms + 36251170.499885, // 36.25117ms + 45815976.690545, // 45.815976ms + 57904439.806025, // 57.904439ms + 73182422.190762, // 73.182422ms + 92491472.772173, // 92.491472ms + 116895181.649858, // 116.895181ms + 147737765.259851, // 147.737765ms + 186718109.129192, // 186.718109ms + 235983346.678219, // 235.983346ms + 298247128.621688, // 298.247128ms + 376939097.538835, // 376.939097ms + 476393801.040133, // 476.393801ms + 602089449.333611, // 602.089449ms + 760949668.545986, // 760.949668ms + 961724871.115294, // 961.724871ms + 1215474250.076283, // 1.21547425s + 1536174946.671824, // 1.536174946s + 1941491945.743876, // 1.941491945s + 2453751106.639811, // 2.453751106s + 3101168926.574770, // 3.101168926s + 3919406774.847209, // 3.919406774s + 4953535208.959157, // 4.953535208s + 6260516572.014802, // 6.260516572s + 7912342618.981298, // 7.912342618s + 9999999999.999969, // 9.999999999s } // NetworkLatencyBuckets are prometheus histogram buckets suitable for a histogram @@ -39,21 +84,66 @@ var IOLatencyBuckets = []float64{ // range during normal operation. var NetworkLatencyBuckets = []float64{ // Generated via TestHistogramBuckets/NetworkLatencyBuckets. - 500000.000000, // 500µs - 860513.842995, // 860.513µs - 1480968.147973, // 1.480968ms - 2548787.184731, // 2.548787ms - 4386533.310619, // 4.386533ms - 7549345.273094, // 7.549345ms - 12992632.226094, // 12.992632ms - 22360679.774998, // 22.360679ms - 38483348.970335, // 38.483348ms - 66230909.027573, // 66.230909ms - 113985228.104760, // 113.985228ms - 196171733.362212, // 196.171733ms - 337616984.325077, // 337.616984ms - 581048177.284016, // 581.048177ms - 999999999.999999, // 999.999999ms + 500000.000000, // 500µs + 568747.715565, // 568.747µs + 646947.927922, // 646.947µs + 735900.312190, // 735.9µs + 837083.242884, // 837.083µs + 952178.364257, // 952.178µs + 1083098.538963, // 1.083098ms + 1232019.639535, // 1.232019ms + 1401416.711034, // 1.401416ms + 1594105.105912, // 1.594105ms + 1813287.274717, // 1.813287ms + 2062605.990318, // 2.062605ms + 2346204.890209, // 2.346204ms + 2668797.343109, // 2.668797ms + 3035744.784401, // 3.035744ms + 3453145.822334, // 3.453145ms + 3927937.595933, // 3.927937ms + 4468011.069141, // 4.468011ms + 5082342.177389, // 5.082342ms + 5781141.006222, // 5.781141ms + 6576021.481300, // 6.576021ms + 7480194.389996, // 7.480194ms + 8508686.942589, // 8.508686ms + 9678592.522117, // 9.678592ms + 11009354.773683, // 11.009354ms + 12523090.754761, // 12.52309ms + 14244958.517175, // 14.244958ms + 16203575.229933, // 16.203575ms + 18431492.792031, // 18.431492ms + 20965738.839853, // 20.965738ms + 23848432.140611, // 23.848432ms + 27127482.599575, // 27.127482ms + 30857387.515093, // 30.857387ms + 35100137.315047, // 35.100137ms + 39926245.827925, // 39.926245ms + 45415922.211464, // 45.415922ms + 51660404.016126, // 51.660404ms + 58763473.538708, // 58.763473ms + 66843182.667648, // 66.843182ms + 76033814.886682, // 76.033814ms + 86488117.045035, // 86.488117ms + 98379837.985822, // 98.379837ms + 111906616.224248, // 111.906616ms + 127293264.668375, // 127.293264ms + 144795506.973983, // 144.795506ms + 164704227.631154, // 164.704227ms + 187350306.418342, // 187.350306ms + 213110117.571795, // 213.110117ms + 242411785.065635, // 242.411785ms + 275742297.964389, // 275.742297ms + 313655604.103963, // 313.655604ms + 356781816.616787, // 356.781816ms + 405837686.312094, // 405.837686ms + 461638513.960647, // 461.638513ms + 525111700.464186, // 525.1117ms + 597312160.111267, // 597.31216ms + 679439853.085354, // 679.439853ms + 772859728.612681, // 772.859728ms + 879124410.201811, // 879.12441ms + 1000000000.000001, // 1s } // BatchProcessLatencyBuckets are prometheus histogram buckets suitable for a @@ -62,20 +152,65 @@ var NetworkLatencyBuckets = []float64{ var BatchProcessLatencyBuckets = []float64{ // Generated via TestHistogramBuckets/BatchProcessLatencyBuckets. 500000000.000000, // 500ms - 789604072.059876, // 789.604072ms - 1246949181.227077, // 1.246949181s - 1969192302.297256, // 1.969192302s - 3109764521.125753, // 3.109764521s - 4910965458.056452, // 4.910965458s - 7755436646.853539, // 7.755436646s - 12247448713.915894, // 12.247448713s - 19341270753.704967, // 19.341270753s - 30543892291.876068, // 30.543892291s - 48235163460.447227, // 48.23516346s - 76173362969.685760, // 1m16.173362969s - 120293595166.717728, // 2m0.293595166s - 189968625172.725128, // 3m9.968625172s - 300000000000.000183, // 5m0s + 557259285.358743, // 557.259285ms + 621075822.237074, // 621.075822ms + 692200537.706851, // 692.200537ms + 771470353.934916, // 771.470353ms + 859818036.218456, // 859.818036ms + 958283168.803309, // 958.283168ms + 1068024387.637287, // 1.068024387s + 1190333014.000928, // 1.190333014s + 1326648249.442152, // 1.326648249s + 1478574110.813123, // 1.47857411s + 1647898304.683320, // 1.647898304s + 1836613263.223422, // 1.836613263s + 2046939589.088547, // 2.046939589s + 2281352185.176006, // 2.281352185s + 2542609376.725576, // 2.542609376s + 2833785368.441068, // 2.833785368s + 3158306418.555065, // 3.158306418s + 3519991155.495853, // 3.519991155s + 3923095511.561431, // 3.923095511s + 4372362802.333632, // 4.372362802s + 4873079541.115184, // 4.873079541s + 5431137645.156319, // 5.431137645s + 6053103765.649553, // 6.053103765s + 6746296557.296375, // 6.746296557s + 7518872796.674253, // 7.518872796s + 8379923362.755980, // 8.379923362s + 9339580208.980864, // 9.339580208s + 10409135585.614676, // 10.409135585s + 11601174915.283792, // 11.601174915s + 12929724885.225649, // 12.929724885s + 14410418498.852003, // 14.410418498s + 16060679028.781363, // 16.060679028s + 17899925035.909710, // 17.899925035s + 19949798866.972237, // 19.949798866s + 22234421319.319225, // 22.234421319s + 24780675469.538071, // 24.780675469s + 27618523005.723442, // 27.618523005s + 30781356785.666904, // 30.781356785s + 34306393769.506477, // 34.306393769s + 38235112950.461639, // 38.23511295s + 42613743436.770157, // 42.613743436s + 47493808428.070732, // 47.493808428s + 52932731487.183495, // 52.932731487s + 58994512241.268242, // 58.994512241s + 65750479463.313522, // 1m5.750479463s + 73280130395.441635, // 1m13.280130395s + 81672066190.318619, // 1m21.67206619s + 91025034477.977859, // 1m31.025034477s + 101449091325.905777, // 1m41.449091325s + 113066896265.136261, // 1m53.066896265s + 126015155620.881943, // 2m6.01515562s + 140446231131.326965, // 2m20.446231131s + 156529932783.144257, // 2m36.529932783s + 174455516959.974152, // 2m54.455516959s + 194433913416.010529, // 3m14.433913416s + 216700207279.419586, // 3m36.700207279s + 241516405291.241699, // 4m1.516405291s + 269174518830.019897, // 4m29.17451883s + 300000000000.000854, // 5m0s } // LongRunning60mLatencyBuckets are prometheus histogram buckets suitable @@ -84,20 +219,65 @@ var BatchProcessLatencyBuckets = []float64{ var LongRunning60mLatencyBuckets = []float64{ // Generated via TestHistogramBuckets/LongRunning60mLatencyBuckets. 500000000.000000, // 500ms - 942961049.923126, // 942.961049ms - 1778351083.344248, // 1.778351083s - 3353831609.364442, // 3.353831609s - 6325065151.263324, // 6.325065151s - 11928580151.734879, // 11.928580151s - 22496372927.944168, // 22.496372927s - 42426406871.192848, // 42.426406871s - 80012898335.451462, // 1m20.012898335s - 150898093243.579315, // 2m30.898093243s - 284582048872.726685, // 4m44.582048872s - 536699575188.601318, // 8m56.699575188s - 1012173589826.278687, // 16m52.173589826s - 1908880541934.094238, // 31m48.880541934s - 3599999999999.998535, // 59m59.999999999s + 581230667.894489, // 581.230667ms + 675658178.602148, // 675.658178ms + 785426508.834601, // 785.426508ms + 913027948.623944, // 913.027948ms + 1061359688.770060, // 1.061359688s + 1233789601.560218, // 1.233789601s + 1434232708.312242, // 1.434232708s + 1667240069.936893, // 1.667240069s + 1938102118.779750, // 1.938102118s + 2252968777.892157, // 2.252968777s + 2618989095.039379, // 2.618989095s + 3044473561.836243, // 3.044473561s + 3539082803.466387, // 3.539082803s + 4114046923.185338, // 4.114046923s + 4782420481.824564, // 4.782420481s + 5559378901.606352, // 5.559378901s + 6462563024.118382, // 6.462563024s + 7512479645.637113, // 7.512479645s + 8732967123.954826, // 8.732967123s + 10151736628.313759, // 10.151736628s + 11801001321.527510, // 11.801001321s + 13718207759.870365, // 13.718207759s + 15946886117.169632, // 15.946886117s + 18537638537.439724, // 18.537638537s + 21549288056.605419, // 21.549288056s + 25050214179.583008, // 25.050214179s + 29119905436.998066, // 29.119905436s + 33850764172.341507, // 33.850764172s + 39350204537.257782, // 39.350204537s + 45743091329.950188, // 45.743091329s + 53174575050.531136, // 53.17457505s + 61813387543.251701, // 1m1.813387543s + 71855673053.170151, // 1m11.855673053s + 83529441681.404266, // 1m23.529441681s + 97099746354.672745, // 1m37.099746354s + 112874700852.223846, // 1m52.874700852s + 131212475529.457443, // 2m11.212475529s + 152529429576.151703, // 2m32.529429576s + 177309564452.224213, // 2m57.309564452s + 206115513141.294464, // 3m26.115513141s + 239601314733.059875, // 3m59.601314733s + 278527264381.388123, // 4m38.527264381s + 323777175806.438293, // 5m23.777175806s + 376378448285.935181, // 6m16.378448285s + 437525393756.650940, // 7m17.525393756s + 508606353667.955078, // 8m28.606353667s + 591235221275.612671, // 9m51.235221275s + 687288085089.540771, // 11m27.288085089s + 798945825465.036499, // 13m18.945825465s + 928743631493.114136, // 15m28.743631493s + 1079628562470.991943, // 17m59.62856247s + 1255026460885.963623, // 20m55.026460885s + 1458919736172.010742, // 24m18.919736172s + 1695937785319.419434, // 28m15.937785319s + 1971462103337.413574, // 32m51.462103337s + 2291748470102.958496, // 38m11.748470102s + 2664068987848.231934, // 44m24.068987848s + 3096877194248.046875, // 51m36.877194248s + 3600000000000.007812, // 1h0m0s } // Count1KBuckets are prometheus histogram buckets suitable for a histogram that diff --git a/pkg/util/metric/histogram_buckets_test.go b/pkg/util/metric/histogram_buckets_test.go index 7fb183d70bf6..6f28454b89ff 100644 --- a/pkg/util/metric/histogram_buckets_test.go +++ b/pkg/util/metric/histogram_buckets_test.go @@ -48,22 +48,22 @@ func TestHistogramBuckets(t *testing.T) { require.InDeltaSlice(t, exp, act, 1 /* delta */, "Please update the bucket boundaries for %s", t.Name()) } t.Run("IOLatencyBuckets", func(t *testing.T) { - exp := prometheus.ExponentialBucketsRange(10e3, 10e9, 15) + exp := prometheus.ExponentialBucketsRange(10e3, 10e9, 60) verifyAndPrint(t, exp, IOLatencyBuckets, LATENCY) }) t.Run("NetworkLatencyBuckets", func(t *testing.T) { - exp := prometheus.ExponentialBucketsRange(500e3, 1e9, 15) + exp := prometheus.ExponentialBucketsRange(500e3, 1e9, 60) verifyAndPrint(t, exp, NetworkLatencyBuckets, LATENCY) }) t.Run("BatchProcessLatencyBuckets", func(t *testing.T) { - exp := prometheus.ExponentialBucketsRange(500e6, 300e9, 15) + exp := prometheus.ExponentialBucketsRange(500e6, 300e9, 60) verifyAndPrint(t, exp, BatchProcessLatencyBuckets, LATENCY) }) t.Run("LongRunning60mLatencyBuckets", func(t *testing.T) { - exp := prometheus.ExponentialBucketsRange(500e6, 3600e9, 15) + exp := prometheus.ExponentialBucketsRange(500e6, 3600e9, 60) verifyAndPrint(t, exp, LongRunning60mLatencyBuckets, LATENCY) })