From 44d0ffa14ebb9fd4e54c7a124209928ce037a80f Mon Sep 17 00:00:00 2001
From: David Hartunian
Date: Thu, 26 Jan 2023 12:35:30 -0500
Subject: [PATCH 1/2] pkg/util/metric: optionally reintroduce legacy hdrhistogram model

Addresses https://github.com/cockroachdb/cockroach/issues/95833

This patch reintroduces the old HdrHistogram model so that it can
optionally be enabled in place of the new Prometheus model, gated behind
an environment variable called `COCKROACH_ENABLE_HDR_HISTOGRAMS`. This
gives users a way to "fall back" to the old model in the event that the
new model does not adequately serve their needs (think of it as an
"insurance policy" against that scenario leaving users with no
mitigation; ideally, this environment variable should never have to be
used).

Note: some histograms were introduced *after* the new Prometheus
histograms were added to CockroachDB. In these cases, we use the
`ForceUsePrometheus` option in the `HistogramOptions` struct to ignore
the value of the env var, since there never was a time when these
specific histograms used the HdrHistogram model.

Release note (ops change): Histogram metrics can now optionally use the
legacy HdrHistogram model by setting the environment variable
`COCKROACH_ENABLE_HDR_HISTOGRAMS=true` on CockroachDB nodes. **Note that
this is not recommended** unless users are having difficulties with the
newer Prometheus-backed histogram model. Enabling this setting can cause
performance issues with time-series databases like Prometheus, as
processing and storing the increased number of buckets is taxing on both
CPU and storage. Note that the HdrHistogram model is slated for full
deprecation in upcoming releases.
---
 pkg/ccl/changefeedccl/metrics.go              |  74 +++--
 pkg/ccl/sqlproxyccl/connector.go              |   2 +-
 pkg/ccl/sqlproxyccl/connector_test.go         |  27 +-
 pkg/ccl/sqlproxyccl/metrics.go                |  61 +++--
 pkg/ccl/streamingccl/streamingest/metrics.go  |  39 ++-
 pkg/kv/bulk/bulk_metrics.go                   |  14 +-
 pkg/kv/kvclient/kvcoord/txn_metrics.go        |  45 ++--
 pkg/kv/kvprober/kvprober.go                   |  22 +-
 pkg/kv/kvserver/liveness/liveness.go          |  11 +-
 pkg/kv/kvserver/metrics.go                    |  89 ++++---
 pkg/kv/kvserver/scheduler.go                  |   2 +-
 pkg/kv/kvserver/txnwait/metrics.go            |  28 +-
 pkg/rpc/clock_offset.go                       |  11 +-
 pkg/server/node.go                            |  11 +-
 pkg/server/status/recorder_test.go            |   7 +-
 pkg/sql/conn_executor.go                      |  87 +++---
 pkg/sql/execinfra/metrics.go                  |  44 ++-
 pkg/sql/executor_statement_metrics.go         |  18 +-
 pkg/sql/mem_metrics.go                        |  48 +++-
 pkg/sql/pgwire/server.go                      |  11 +-
 .../sqlstats/persistedsqlstats/provider.go    |   2 +-
 pkg/sql/sqlstats/sslocal/sql_stats.go         |   2 +-
 pkg/sql/sqlstats/sslocal/sslocal_provider.go  |   2 +-
 pkg/sql/ttl/ttljob/ttljob_metrics.go          |  37 +--
 pkg/util/admission/work_queue.go              |  11 +-
 pkg/util/metric/BUILD.bazel                   |   4 +
 pkg/util/metric/aggmetric/BUILD.bazel         |   4 +-
 pkg/util/metric/aggmetric/agg_metric.go       |   7 +-
 pkg/util/metric/aggmetric/agg_metric_test.go  | 181 +++----------
 pkg/util/metric/aggmetric/histogram.go        |  27 +-
 .../aggmetric/testdata/add_after_destroy.txt  |  53 ++++
 .../testdata/add_after_destroy_hdr.txt        |  23 ++
 pkg/util/metric/aggmetric/testdata/basic.txt  |  54 ++++
 .../metric/aggmetric/testdata/basic_hdr.txt   |  24 ++
 .../metric/aggmetric/testdata/destroy.txt     |  36 +++
 .../metric/aggmetric/testdata/destroy_hdr.txt |  17 ++
 pkg/util/metric/hdrhistogram.go               | 252 ++++++++++++++++++
 pkg/util/metric/metric.go                     | 108 +++++++-
 pkg/util/metric/metric_test.go                |  28 +-
 pkg/util/metric/registry_test.go              |  37 ++-
 pkg/util/mon/bytes_usage.go                   |  10 +-
 41 files changed, 1133 insertions(+), 437 deletions(-)
 create mode 100644
pkg/util/metric/aggmetric/testdata/add_after_destroy.txt create mode 100644 pkg/util/metric/aggmetric/testdata/add_after_destroy_hdr.txt create mode 100644 pkg/util/metric/aggmetric/testdata/basic.txt create mode 100644 pkg/util/metric/aggmetric/testdata/basic_hdr.txt create mode 100644 pkg/util/metric/aggmetric/testdata/destroy.txt create mode 100644 pkg/util/metric/aggmetric/testdata/destroy_hdr.txt create mode 100644 pkg/util/metric/hdrhistogram.go diff --git a/pkg/ccl/changefeedccl/metrics.go b/pkg/ccl/changefeedccl/metrics.go index 6ee871291459..c5d42c801611 100644 --- a/pkg/ccl/changefeedccl/metrics.go +++ b/pkg/ccl/changefeedccl/metrics.go @@ -35,6 +35,14 @@ import ( var enableSLIMetrics = envutil.EnvOrDefaultBool( "COCKROACH_EXPERIMENTAL_ENABLE_PER_CHANGEFEED_METRICS", false) +const ( + changefeedCheckpointHistMaxLatency = 30 * time.Second + changefeedBatchHistMaxLatency = 30 * time.Second + changefeedFlushHistMaxLatency = 1 * time.Minute + admitLatencyMaxValue = 1 * time.Minute + commitLatencyMaxValue = 10 * time.Minute +) + // max length for the scope name. const maxSLIScopeNameLen = 128 @@ -482,18 +490,48 @@ func newAggregateMetrics(histogramWindow time.Duration) *AggMetrics { // retain significant figures of 2. b := aggmetric.MakeBuilder("scope") a := &AggMetrics{ - ErrorRetries: b.Counter(metaChangefeedErrorRetries), - EmittedMessages: b.Counter(metaChangefeedEmittedMessages), - MessageSize: b.Histogram(metaMessageSize, histogramWindow, metric.DataSize16MBBuckets), + ErrorRetries: b.Counter(metaChangefeedErrorRetries), + EmittedMessages: b.Counter(metaChangefeedEmittedMessages), + MessageSize: b.Histogram(metric.HistogramOptions{ + Metadata: metaMessageSize, + Duration: histogramWindow, + MaxVal: 10 << 20, /* 10MB max message size */ + SigFigs: 1, + Buckets: metric.DataSize16MBBuckets, + }), EmittedBytes: b.Counter(metaChangefeedEmittedBytes), FlushedBytes: b.Counter(metaChangefeedFlushedBytes), Flushes: b.Counter(metaChangefeedFlushes), SizeBasedFlushes: b.Counter(metaSizeBasedFlushes), - BatchHistNanos: b.Histogram(metaChangefeedBatchHistNanos, histogramWindow, metric.BatchProcessLatencyBuckets), - FlushHistNanos: b.Histogram(metaChangefeedFlushHistNanos, histogramWindow, metric.BatchProcessLatencyBuckets), - CommitLatency: b.Histogram(metaCommitLatency, histogramWindow, metric.BatchProcessLatencyBuckets), - AdmitLatency: b.Histogram(metaAdmitLatency, histogramWindow, metric.BatchProcessLatencyBuckets), + BatchHistNanos: b.Histogram(metric.HistogramOptions{ + Metadata: metaChangefeedBatchHistNanos, + Duration: histogramWindow, + MaxVal: changefeedBatchHistMaxLatency.Nanoseconds(), + SigFigs: 1, + Buckets: metric.BatchProcessLatencyBuckets, + }), + FlushHistNanos: b.Histogram(metric.HistogramOptions{ + Metadata: metaChangefeedFlushHistNanos, + Duration: histogramWindow, + MaxVal: changefeedFlushHistMaxLatency.Nanoseconds(), + SigFigs: 2, + Buckets: metric.BatchProcessLatencyBuckets, + }), + CommitLatency: b.Histogram(metric.HistogramOptions{ + Metadata: metaCommitLatency, + Duration: histogramWindow, + MaxVal: commitLatencyMaxValue.Nanoseconds(), + SigFigs: 1, + Buckets: metric.BatchProcessLatencyBuckets, + }), + AdmitLatency: b.Histogram(metric.HistogramOptions{ + Metadata: metaAdmitLatency, + Duration: histogramWindow, + MaxVal: admitLatencyMaxValue.Nanoseconds(), + SigFigs: 1, + Buckets: metric.BatchProcessLatencyBuckets, + }), BackfillCount: b.Gauge(metaChangefeedBackfillCount), BackfillPendingRanges: b.Gauge(metaChangefeedBackfillPendingRanges), RunningCount: 
b.Gauge(metaChangefeedRunning), @@ -574,7 +612,7 @@ type Metrics struct { Failures *metric.Counter ResolvedMessages *metric.Counter QueueTimeNanos *metric.Counter - CheckpointHistNanos *metric.Histogram + CheckpointHistNanos metric.IHistogram FrontierUpdates *metric.Counter ThrottleMetrics cdcutils.Metrics ReplanCount *metric.Counter @@ -601,13 +639,19 @@ func (m *Metrics) getSLIMetrics(scope string) (*sliMetrics, error) { // MakeMetrics makes the metrics for changefeed monitoring. func MakeMetrics(histogramWindow time.Duration) metric.Struct { m := &Metrics{ - AggMetrics: newAggregateMetrics(histogramWindow), - KVFeedMetrics: kvevent.MakeMetrics(histogramWindow), - SchemaFeedMetrics: schemafeed.MakeMetrics(histogramWindow), - ResolvedMessages: metric.NewCounter(metaChangefeedForwardedResolvedMessages), - Failures: metric.NewCounter(metaChangefeedFailures), - QueueTimeNanos: metric.NewCounter(metaEventQueueTime), - CheckpointHistNanos: metric.NewHistogram(metaChangefeedCheckpointHistNanos, histogramWindow, metric.IOLatencyBuckets), + AggMetrics: newAggregateMetrics(histogramWindow), + KVFeedMetrics: kvevent.MakeMetrics(histogramWindow), + SchemaFeedMetrics: schemafeed.MakeMetrics(histogramWindow), + ResolvedMessages: metric.NewCounter(metaChangefeedForwardedResolvedMessages), + Failures: metric.NewCounter(metaChangefeedFailures), + QueueTimeNanos: metric.NewCounter(metaEventQueueTime), + CheckpointHistNanos: metric.NewHistogram(metric.HistogramOptions{ + Metadata: metaChangefeedCheckpointHistNanos, + Duration: histogramWindow, + MaxVal: changefeedCheckpointHistMaxLatency.Nanoseconds(), + SigFigs: 2, + Buckets: metric.IOLatencyBuckets, + }), FrontierUpdates: metric.NewCounter(metaChangefeedFrontierUpdates), ThrottleMetrics: cdcutils.MakeMetrics(histogramWindow), ReplanCount: metric.NewCounter(metaChangefeedReplanCount), diff --git a/pkg/ccl/sqlproxyccl/connector.go b/pkg/ccl/sqlproxyccl/connector.go index e8502c37ca7a..3abcdf01189a 100644 --- a/pkg/ccl/sqlproxyccl/connector.go +++ b/pkg/ccl/sqlproxyccl/connector.go @@ -76,7 +76,7 @@ type connector struct { // DialTenantLatency tracks how long it takes to retrieve the address for // a tenant and set up a tcp connection to the address. - DialTenantLatency *metric.Histogram + DialTenantLatency metric.IHistogram // DialTenantRetries counts how often dialing a tenant is retried. 
DialTenantRetries *metric.Counter diff --git a/pkg/ccl/sqlproxyccl/connector_test.go b/pkg/ccl/sqlproxyccl/connector_test.go index 022dbde8300f..6fd847d6ea9a 100644 --- a/pkg/ccl/sqlproxyccl/connector_test.go +++ b/pkg/ccl/sqlproxyccl/connector_test.go @@ -379,9 +379,12 @@ func TestConnector_dialTenantCluster(t *testing.T) { c := &connector{ TenantID: roachpb.MakeTenantID(42), - DialTenantLatency: metric.NewHistogram( - metaDialTenantLatency, time.Millisecond, metric.NetworkLatencyBuckets, - ), + DialTenantLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePrometheus, + Metadata: metaDialTenantLatency, + Duration: time.Millisecond, + Buckets: metric.NetworkLatencyBuckets, + }), DialTenantRetries: metric.NewCounter(metaDialTenantRetries), } dc := &testTenantDirectoryCache{} @@ -459,9 +462,12 @@ func TestConnector_dialTenantCluster(t *testing.T) { defer cancel() c := &connector{ - DialTenantLatency: metric.NewHistogram( - metaDialTenantLatency, time.Millisecond, metric.NetworkLatencyBuckets, - ), + DialTenantLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaDialTenantLatency, + Duration: time.Millisecond, + Buckets: metric.NetworkLatencyBuckets, + }), DialTenantRetries: metric.NewCounter(metaDialTenantRetries), } c.testingKnobs.lookupAddr = func(ctx context.Context) (string, error) { @@ -490,9 +496,12 @@ func TestConnector_dialTenantCluster(t *testing.T) { var reportFailureFnCount int c := &connector{ TenantID: roachpb.MakeTenantID(42), - DialTenantLatency: metric.NewHistogram( - metaDialTenantLatency, time.Millisecond, metric.NetworkLatencyBuckets, - ), + DialTenantLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaDialTenantLatency, + Duration: time.Millisecond, + Buckets: metric.NetworkLatencyBuckets, + }), DialTenantRetries: metric.NewCounter(metaDialTenantRetries), } c.DirectoryCache = &testTenantDirectoryCache{ diff --git a/pkg/ccl/sqlproxyccl/metrics.go b/pkg/ccl/sqlproxyccl/metrics.go index 2fe0d8e16131..2432f9b1f668 100644 --- a/pkg/ccl/sqlproxyccl/metrics.go +++ b/pkg/ccl/sqlproxyccl/metrics.go @@ -23,19 +23,19 @@ type metrics struct { RoutingErrCount *metric.Counter RefusedConnCount *metric.Counter SuccessfulConnCount *metric.Counter - ConnectionLatency *metric.Histogram + ConnectionLatency metric.IHistogram AuthFailedCount *metric.Counter ExpiredClientConnCount *metric.Counter - DialTenantLatency *metric.Histogram + DialTenantLatency metric.IHistogram DialTenantRetries *metric.Counter ConnMigrationSuccessCount *metric.Counter ConnMigrationErrorFatalCount *metric.Counter ConnMigrationErrorRecoverableCount *metric.Counter ConnMigrationAttemptedCount *metric.Counter - ConnMigrationAttemptedLatency *metric.Histogram - ConnMigrationTransferResponseMessageSize *metric.Histogram + ConnMigrationAttemptedLatency metric.IHistogram + ConnMigrationTransferResponseMessageSize metric.IHistogram QueryCancelReceivedPGWire *metric.Counter QueryCancelReceivedHTTP *metric.Counter @@ -49,6 +49,16 @@ func (metrics) MetricStruct() {} var _ metric.Struct = metrics{} +const ( + // maxExpectedTransferResponseMessageSize corresponds to maximum expected + // response message size for the SHOW TRANSFER STATE query. We choose 16MB + // here to match the defaultMaxReadBufferSize used for ingesting SQL + // statements in the SQL server (see pkg/sql/pgwire/pgwirebase/encoding.go). 
+ // + // This will be used to tune sql.session_transfer.max_session_size. + maxExpectedTransferResponseMessageSize = 1 << 24 // 16MB +) + var ( metaCurConnCount = metric.Metadata{ Name: "proxy.sql.conns", @@ -213,18 +223,20 @@ func makeProxyMetrics() metrics { RoutingErrCount: metric.NewCounter(metaRoutingErrCount), RefusedConnCount: metric.NewCounter(metaRefusedConnCount), SuccessfulConnCount: metric.NewCounter(metaSuccessfulConnCount), - ConnectionLatency: metric.NewHistogram( - metaConnMigrationAttemptedCount, - base.DefaultHistogramWindowInterval(), - metric.NetworkLatencyBuckets, - ), + ConnectionLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaConnMigrationAttemptedCount, + Duration: base.DefaultHistogramWindowInterval(), + Buckets: metric.NetworkLatencyBuckets, + }), AuthFailedCount: metric.NewCounter(metaAuthFailedCount), ExpiredClientConnCount: metric.NewCounter(metaExpiredClientConnCount), // Connector metrics. - DialTenantLatency: metric.NewHistogram( - metaDialTenantLatency, - base.DefaultHistogramWindowInterval(), - metric.NetworkLatencyBuckets, + DialTenantLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaDialTenantLatency, + Duration: base.DefaultHistogramWindowInterval(), + Buckets: metric.NetworkLatencyBuckets}, ), DialTenantRetries: metric.NewCounter(metaDialTenantRetries), // Connection migration metrics. @@ -232,16 +244,19 @@ func makeProxyMetrics() metrics { ConnMigrationErrorFatalCount: metric.NewCounter(metaConnMigrationErrorFatalCount), ConnMigrationErrorRecoverableCount: metric.NewCounter(metaConnMigrationErrorRecoverableCount), ConnMigrationAttemptedCount: metric.NewCounter(metaConnMigrationAttemptedCount), - ConnMigrationAttemptedLatency: metric.NewHistogram( - metaConnMigrationAttemptedLatency, - base.DefaultHistogramWindowInterval(), - metric.NetworkLatencyBuckets, - ), - ConnMigrationTransferResponseMessageSize: metric.NewHistogram( - metaConnMigrationTransferResponseMessageSize, - base.DefaultHistogramWindowInterval(), - metric.DataSize16MBBuckets, - ), + ConnMigrationAttemptedLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaConnMigrationAttemptedLatency, + Duration: base.DefaultHistogramWindowInterval(), + Buckets: metric.NetworkLatencyBuckets, + }), + ConnMigrationTransferResponseMessageSize: metric.NewHistogram(metric.HistogramOptions{ + Metadata: metaConnMigrationTransferResponseMessageSize, + Duration: base.DefaultHistogramWindowInterval(), + Buckets: metric.DataSize16MBBuckets, + MaxVal: maxExpectedTransferResponseMessageSize, + SigFigs: 1, + }), QueryCancelReceivedPGWire: metric.NewCounter(metaQueryCancelReceivedPGWire), QueryCancelReceivedHTTP: metric.NewCounter(metaQueryCancelReceivedHTTP), QueryCancelIgnored: metric.NewCounter(metaQueryCancelIgnored), diff --git a/pkg/ccl/streamingccl/streamingest/metrics.go b/pkg/ccl/streamingccl/streamingest/metrics.go index aec88cc3a517..2b491d4b5bbc 100644 --- a/pkg/ccl/streamingccl/streamingest/metrics.go +++ b/pkg/ccl/streamingccl/streamingest/metrics.go @@ -15,6 +15,12 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/metric" ) +const ( + streamingFlushHistMaxLatency = 1 * time.Minute + streamingAdmitLatencyMaxValue = 3 * time.Minute + streamingCommitLatencyMaxValue = 10 * time.Minute +) + var ( metaStreamingEventsIngested = metric.Metadata{ Name: "streaming.events_ingested", @@ -107,9 +113,9 @@ type 
Metrics struct { Flushes *metric.Counter JobProgressUpdates *metric.Counter ResolvedEvents *metric.Counter - FlushHistNanos *metric.Histogram - CommitLatency *metric.Histogram - AdmitLatency *metric.Histogram + FlushHistNanos metric.IHistogram + CommitLatency metric.IHistogram + AdmitLatency metric.IHistogram RunningCount *metric.Gauge EarliestDataCheckpointSpan *metric.Gauge LatestDataCheckpointSpan *metric.Gauge @@ -128,12 +134,27 @@ func MakeMetrics(histogramWindow time.Duration) metric.Struct { Flushes: metric.NewCounter(metaStreamingFlushes), ResolvedEvents: metric.NewCounter(metaStreamingResolvedEventsIngested), JobProgressUpdates: metric.NewCounter(metaJobProgressUpdates), - FlushHistNanos: metric.NewHistogram(metaStreamingFlushHistNanos, - histogramWindow, metric.BatchProcessLatencyBuckets), - CommitLatency: metric.NewHistogram(metaStreamingCommitLatency, - histogramWindow, metric.BatchProcessLatencyBuckets), - AdmitLatency: metric.NewHistogram(metaStreamingAdmitLatency, - histogramWindow, metric.BatchProcessLatencyBuckets), + FlushHistNanos: metric.NewHistogram(metric.HistogramOptions{ + Metadata: metaStreamingFlushHistNanos, + Duration: histogramWindow, + Buckets: metric.BatchProcessLatencyBuckets, + MaxVal: streamingFlushHistMaxLatency.Nanoseconds(), + SigFigs: 1, + }), + CommitLatency: metric.NewHistogram(metric.HistogramOptions{ + Metadata: metaStreamingCommitLatency, + Duration: histogramWindow, + Buckets: metric.BatchProcessLatencyBuckets, + MaxVal: streamingCommitLatencyMaxValue.Nanoseconds(), + SigFigs: 1, + }), + AdmitLatency: metric.NewHistogram(metric.HistogramOptions{ + Metadata: metaStreamingAdmitLatency, + Duration: histogramWindow, + Buckets: metric.BatchProcessLatencyBuckets, + MaxVal: streamingAdmitLatencyMaxValue.Nanoseconds(), + SigFigs: 1, + }), RunningCount: metric.NewGauge(metaStreamsRunning), EarliestDataCheckpointSpan: metric.NewGauge(metaEarliestDataCheckpointSpan), LatestDataCheckpointSpan: metric.NewGauge(metaLatestDataCheckpointSpan), diff --git a/pkg/kv/bulk/bulk_metrics.go b/pkg/kv/bulk/bulk_metrics.go index f3390d54733e..7cbbc748a20b 100644 --- a/pkg/kv/bulk/bulk_metrics.go +++ b/pkg/kv/bulk/bulk_metrics.go @@ -20,7 +20,7 @@ import ( // Metrics contains pointers to the metrics for // monitoring bulk operations. type Metrics struct { - MaxBytesHist *metric.Histogram + MaxBytesHist metric.IHistogram CurBytesCount *metric.Gauge } @@ -44,10 +44,20 @@ var ( } ) +// See pkg/sql/mem_metrics.go +// log10int64times1000 = log10(math.MaxInt64) * 1000, rounded up somewhat +const log10int64times1000 = 19 * 1000 + // MakeBulkMetrics instantiates the metrics holder for bulk operation monitoring. 
func MakeBulkMetrics(histogramWindow time.Duration) Metrics { return Metrics{ - MaxBytesHist: metric.NewHistogram(metaMemMaxBytes, histogramWindow, metric.MemoryUsage64MBBuckets), + MaxBytesHist: metric.NewHistogram(metric.HistogramOptions{ + Metadata: metaMemMaxBytes, + Duration: histogramWindow, + MaxVal: log10int64times1000, + SigFigs: 3, + Buckets: metric.MemoryUsage64MBBuckets, + }), CurBytesCount: metric.NewGauge(metaMemCurBytes), } } diff --git a/pkg/kv/kvclient/kvcoord/txn_metrics.go b/pkg/kv/kvclient/kvcoord/txn_metrics.go index eb6313012717..fcca64aa74ae 100644 --- a/pkg/kv/kvclient/kvcoord/txn_metrics.go +++ b/pkg/kv/kvclient/kvcoord/txn_metrics.go @@ -31,14 +31,14 @@ type TxnMetrics struct { RefreshMemoryLimitExceeded *metric.Counter RefreshAutoRetries *metric.Counter - Durations *metric.Histogram + Durations metric.IHistogram TxnsWithCondensedIntents *metric.Counter TxnsWithCondensedIntentsGauge *metric.Gauge TxnsRejectedByLockSpanBudget *metric.Counter // Restarts is the number of times we had to restart the transaction. - Restarts *metric.Histogram + Restarts metric.IHistogram // Counts of restart types. RestartsWriteTooOld telemetry.CounterWithMetric @@ -264,21 +264,32 @@ var ( // windowed portions retain data for approximately histogramWindow. func MakeTxnMetrics(histogramWindow time.Duration) TxnMetrics { return TxnMetrics{ - Aborts: metric.NewCounter(metaAbortsRates), - Commits: metric.NewCounter(metaCommitsRates), - Commits1PC: metric.NewCounter(metaCommits1PCRates), - ParallelCommits: metric.NewCounter(metaParallelCommitsRates), - CommitWaits: metric.NewCounter(metaCommitWaitCount), - RefreshSuccess: metric.NewCounter(metaRefreshSuccess), - RefreshFail: metric.NewCounter(metaRefreshFail), - RefreshFailWithCondensedSpans: metric.NewCounter(metaRefreshFailWithCondensedSpans), - RefreshMemoryLimitExceeded: metric.NewCounter(metaRefreshMemoryLimitExceeded), - RefreshAutoRetries: metric.NewCounter(metaRefreshAutoRetries), - Durations: metric.NewHistogram(metaDurationsHistograms, histogramWindow, metric.IOLatencyBuckets), - TxnsWithCondensedIntents: metric.NewCounter(metaTxnsWithCondensedIntentSpans), - TxnsWithCondensedIntentsGauge: metric.NewGauge(metaTxnsWithCondensedIntentSpansGauge), - TxnsRejectedByLockSpanBudget: metric.NewCounter(metaTxnsRejectedByLockSpanBudget), - Restarts: metric.NewHistogram(metaRestartsHistogram, histogramWindow, metric.Count1KBuckets), + Aborts: metric.NewCounter(metaAbortsRates), + Commits: metric.NewCounter(metaCommitsRates), + Commits1PC: metric.NewCounter(metaCommits1PCRates), + ParallelCommits: metric.NewCounter(metaParallelCommitsRates), + CommitWaits: metric.NewCounter(metaCommitWaitCount), + RefreshSuccess: metric.NewCounter(metaRefreshSuccess), + RefreshFail: metric.NewCounter(metaRefreshFail), + RefreshFailWithCondensedSpans: metric.NewCounter(metaRefreshFailWithCondensedSpans), + RefreshMemoryLimitExceeded: metric.NewCounter(metaRefreshMemoryLimitExceeded), + RefreshAutoRetries: metric.NewCounter(metaRefreshAutoRetries), + Durations: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaDurationsHistograms, + Duration: histogramWindow, + Buckets: metric.IOLatencyBuckets, + }), + TxnsWithCondensedIntents: metric.NewCounter(metaTxnsWithCondensedIntentSpans), + TxnsWithCondensedIntentsGauge: metric.NewGauge(metaTxnsWithCondensedIntentSpansGauge), + TxnsRejectedByLockSpanBudget: metric.NewCounter(metaTxnsRejectedByLockSpanBudget), + Restarts: 
metric.NewHistogram(metric.HistogramOptions{ + Metadata: metaRestartsHistogram, + Duration: histogramWindow, + MaxVal: 100, + SigFigs: 3, + Buckets: metric.Count1KBuckets, + }), RestartsWriteTooOld: telemetry.NewCounterWithMetric(metaRestartsWriteTooOld), RestartsWriteTooOldMulti: telemetry.NewCounterWithMetric(metaRestartsWriteTooOldMulti), RestartsSerializable: telemetry.NewCounterWithMetric(metaRestartsSerializable), diff --git a/pkg/kv/kvprober/kvprober.go b/pkg/kv/kvprober/kvprober.go index 2e0d84529fc9..0ec07006efe1 100644 --- a/pkg/kv/kvprober/kvprober.go +++ b/pkg/kv/kvprober/kvprober.go @@ -140,10 +140,10 @@ var ( type Metrics struct { ReadProbeAttempts *metric.Counter ReadProbeFailures *metric.Counter - ReadProbeLatency *metric.Histogram + ReadProbeLatency metric.IHistogram WriteProbeAttempts *metric.Counter WriteProbeFailures *metric.Counter - WriteProbeLatency *metric.Histogram + WriteProbeLatency metric.IHistogram WriteProbeQuarantineOldestDuration *metric.Gauge ProbePlanAttempts *metric.Counter ProbePlanFailures *metric.Counter @@ -229,14 +229,20 @@ func NewProber(opts Opts) *Prober { metrics: Metrics{ ReadProbeAttempts: metric.NewCounter(metaReadProbeAttempts), ReadProbeFailures: metric.NewCounter(metaReadProbeFailures), - ReadProbeLatency: metric.NewHistogram( - metaReadProbeLatency, opts.HistogramWindowInterval, metric.NetworkLatencyBuckets, - ), + ReadProbeLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaReadProbeLatency, + Duration: opts.HistogramWindowInterval, + Buckets: metric.NetworkLatencyBuckets, + }), WriteProbeAttempts: metric.NewCounter(metaWriteProbeAttempts), WriteProbeFailures: metric.NewCounter(metaWriteProbeFailures), - WriteProbeLatency: metric.NewHistogram( - metaWriteProbeLatency, opts.HistogramWindowInterval, metric.NetworkLatencyBuckets, - ), + WriteProbeLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaWriteProbeLatency, + Duration: opts.HistogramWindowInterval, + Buckets: metric.NetworkLatencyBuckets, + }), WriteProbeQuarantineOldestDuration: metric.NewFunctionalGauge( metaWriteProbeQuarantineOldestDuration, func() int64 { return qPool.oldestDuration().Nanoseconds() }, diff --git a/pkg/kv/kvserver/liveness/liveness.go b/pkg/kv/kvserver/liveness/liveness.go index d2e92628c4db..04e314dfad00 100644 --- a/pkg/kv/kvserver/liveness/liveness.go +++ b/pkg/kv/kvserver/liveness/liveness.go @@ -144,7 +144,7 @@ type Metrics struct { HeartbeatSuccesses *metric.Counter HeartbeatFailures telemetry.CounterWithMetric EpochIncrements telemetry.CounterWithMetric - HeartbeatLatency *metric.Histogram + HeartbeatLatency metric.IHistogram } // IsLiveCallback is invoked when a node's IsLive state changes to true. 
@@ -309,9 +309,12 @@ func NewNodeLiveness(opts NodeLivenessOptions) *NodeLiveness { HeartbeatSuccesses: metric.NewCounter(metaHeartbeatSuccesses), HeartbeatFailures: telemetry.NewCounterWithMetric(metaHeartbeatFailures), EpochIncrements: telemetry.NewCounterWithMetric(metaEpochIncrements), - HeartbeatLatency: metric.NewHistogram( - metaHeartbeatLatency, opts.HistogramWindowInterval, metric.NetworkLatencyBuckets, - ), + HeartbeatLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaHeartbeatLatency, + Duration: opts.HistogramWindowInterval, + Buckets: metric.NetworkLatencyBuckets, + }), } nl.mu.nodes = make(map[roachpb.NodeID]Record) nl.heartbeatToken <- struct{}{} diff --git a/pkg/kv/kvserver/metrics.go b/pkg/kv/kvserver/metrics.go index ea91e0453217..28df6082e1fb 100644 --- a/pkg/kv/kvserver/metrics.go +++ b/pkg/kv/kvserver/metrics.go @@ -1780,15 +1780,15 @@ type StoreMetrics struct { // Raft processing metrics. RaftTicks *metric.Counter - RaftQuotaPoolPercentUsed *metric.Histogram + RaftQuotaPoolPercentUsed metric.IHistogram RaftWorkingDurationNanos *metric.Counter RaftTickingDurationNanos *metric.Counter RaftCommandsApplied *metric.Counter - RaftLogCommitLatency *metric.Histogram - RaftCommandCommitLatency *metric.Histogram - RaftHandleReadyLatency *metric.Histogram - RaftApplyCommittedLatency *metric.Histogram - RaftSchedulerLatency *metric.Histogram + RaftLogCommitLatency metric.IHistogram + RaftCommandCommitLatency metric.IHistogram + RaftHandleReadyLatency metric.IHistogram + RaftApplyCommittedLatency metric.IHistogram + RaftSchedulerLatency metric.IHistogram RaftTimeoutCampaign *metric.Counter // Raft message metrics. @@ -1920,8 +1920,8 @@ type StoreMetrics struct { ReplicaCircuitBreakerCumTripped *metric.Counter // Replica batch evaluation metrics. - ReplicaReadBatchEvaluationLatency *metric.Histogram - ReplicaWriteBatchEvaluationLatency *metric.Histogram + ReplicaReadBatchEvaluationLatency metric.IHistogram + ReplicaWriteBatchEvaluationLatency metric.IHistogram } type tenantMetricsRef struct { @@ -2287,27 +2287,46 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics { // Raft processing metrics. 
RaftTicks: metric.NewCounter(metaRaftTicks), - RaftQuotaPoolPercentUsed: metric.NewHistogram( - metaRaftQuotaPoolPercentUsed, histogramWindow, metric.Percent100Buckets, - ), + RaftQuotaPoolPercentUsed: metric.NewHistogram(metric.HistogramOptions{ + Metadata: metaRaftQuotaPoolPercentUsed, + Duration: histogramWindow, + MaxVal: 100, + SigFigs: 1, + Buckets: metric.Percent100Buckets, + }), RaftWorkingDurationNanos: metric.NewCounter(metaRaftWorkingDurationNanos), RaftTickingDurationNanos: metric.NewCounter(metaRaftTickingDurationNanos), RaftCommandsApplied: metric.NewCounter(metaRaftCommandsApplied), - RaftLogCommitLatency: metric.NewHistogram( - metaRaftLogCommitLatency, histogramWindow, metric.IOLatencyBuckets, - ), - RaftCommandCommitLatency: metric.NewHistogram( - metaRaftCommandCommitLatency, histogramWindow, metric.IOLatencyBuckets, - ), - RaftHandleReadyLatency: metric.NewHistogram( - metaRaftHandleReadyLatency, histogramWindow, metric.IOLatencyBuckets, - ), - RaftApplyCommittedLatency: metric.NewHistogram( - metaRaftApplyCommittedLatency, histogramWindow, metric.IOLatencyBuckets, - ), - RaftSchedulerLatency: metric.NewHistogram( - metaRaftSchedulerLatency, histogramWindow, metric.IOLatencyBuckets, - ), + RaftLogCommitLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaRaftLogCommitLatency, + Duration: histogramWindow, + Buckets: metric.IOLatencyBuckets, + }), + RaftCommandCommitLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaRaftCommandCommitLatency, + Duration: histogramWindow, + Buckets: metric.IOLatencyBuckets, + }), + RaftHandleReadyLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaRaftHandleReadyLatency, + Duration: histogramWindow, + Buckets: metric.IOLatencyBuckets, + }), + RaftApplyCommittedLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaRaftApplyCommittedLatency, + Duration: histogramWindow, + Buckets: metric.IOLatencyBuckets, + }), + RaftSchedulerLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaRaftSchedulerLatency, + Duration: histogramWindow, + Buckets: metric.IOLatencyBuckets, + }), RaftTimeoutCampaign: metric.NewCounter(metaRaftTimeoutCampaign), // Raft message metrics. @@ -2448,12 +2467,18 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics { ReplicaCircuitBreakerCumTripped: metric.NewCounter(metaReplicaCircuitBreakerCumTripped), // Replica batch evaluation. 
- ReplicaReadBatchEvaluationLatency: metric.NewHistogram( - metaReplicaReadBatchEvaluationLatency, histogramWindow, metric.IOLatencyBuckets, - ), - ReplicaWriteBatchEvaluationLatency: metric.NewHistogram( - metaReplicaWriteBatchEvaluationLatency, histogramWindow, metric.IOLatencyBuckets, - ), + ReplicaReadBatchEvaluationLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaReplicaReadBatchEvaluationLatency, + Duration: histogramWindow, + Buckets: metric.IOLatencyBuckets, + }), + ReplicaWriteBatchEvaluationLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaReplicaWriteBatchEvaluationLatency, + Duration: histogramWindow, + Buckets: metric.IOLatencyBuckets, + }), } { diff --git a/pkg/kv/kvserver/scheduler.go b/pkg/kv/kvserver/scheduler.go index dbfdddb843d6..0dbc5345cf0e 100644 --- a/pkg/kv/kvserver/scheduler.go +++ b/pkg/kv/kvserver/scheduler.go @@ -169,7 +169,7 @@ type raftScheduleState struct { type raftScheduler struct { ambientContext log.AmbientContext processor raftProcessor - latency *metric.Histogram + latency metric.IHistogram numWorkers int mu struct { diff --git a/pkg/kv/kvserver/txnwait/metrics.go b/pkg/kv/kvserver/txnwait/metrics.go index 2e9d1d2a2055..4610fd8e375c 100644 --- a/pkg/kv/kvserver/txnwait/metrics.go +++ b/pkg/kv/kvserver/txnwait/metrics.go @@ -22,8 +22,8 @@ type Metrics struct { PusherWaiting *metric.Gauge QueryWaiting *metric.Gauge PusherSlow *metric.Gauge - PusherWaitTime *metric.Histogram - QueryWaitTime *metric.Histogram + PusherWaitTime metric.IHistogram + QueryWaitTime metric.IHistogram DeadlocksTotal *metric.Counter } @@ -66,27 +66,31 @@ func NewMetrics(histogramWindowInterval time.Duration) *Metrics { }, ), - PusherWaitTime: metric.NewHistogram( - metric.Metadata{ + PusherWaitTime: metric.NewHistogram(metric.HistogramOptions{ + Metadata: metric.Metadata{ Name: "txnwaitqueue.pusher.wait_time", Help: "Histogram of durations spent in queue by pushers", Measurement: "Pusher wait time", Unit: metric.Unit_NANOSECONDS, }, - histogramWindowInterval, - metric.LongRunning60mLatencyBuckets, - ), + MaxVal: time.Hour.Nanoseconds(), + SigFigs: 1, + Duration: histogramWindowInterval, + Buckets: metric.LongRunning60mLatencyBuckets, + }), - QueryWaitTime: metric.NewHistogram( - metric.Metadata{ + QueryWaitTime: metric.NewHistogram(metric.HistogramOptions{ + Metadata: metric.Metadata{ Name: "txnwaitqueue.query.wait_time", Help: "Histogram of durations spent in queue by queries", Measurement: "Query wait time", Unit: metric.Unit_NANOSECONDS, }, - histogramWindowInterval, - metric.LongRunning60mLatencyBuckets, - ), + MaxVal: time.Hour.Nanoseconds(), + SigFigs: 1, + Duration: histogramWindowInterval, + Buckets: metric.LongRunning60mLatencyBuckets, + }), DeadlocksTotal: metric.NewCounter( metric.Metadata{ diff --git a/pkg/rpc/clock_offset.go b/pkg/rpc/clock_offset.go index 48f290dd517c..46895e0001fd 100644 --- a/pkg/rpc/clock_offset.go +++ b/pkg/rpc/clock_offset.go @@ -28,7 +28,7 @@ import ( type RemoteClockMetrics struct { ClockOffsetMeanNanos *metric.Gauge ClockOffsetStdDevNanos *metric.Gauge - LatencyHistogramNanos *metric.Histogram + LatencyHistogramNanos metric.IHistogram } // avgLatencyMeasurementAge determines how to exponentially weight the @@ -122,9 +122,12 @@ func newRemoteClockMonitor( r.metrics = RemoteClockMetrics{ ClockOffsetMeanNanos: metric.NewGauge(metaClockOffsetMeanNanos), ClockOffsetStdDevNanos: 
metric.NewGauge(metaClockOffsetStdDevNanos), - LatencyHistogramNanos: metric.NewHistogram( - metaLatencyHistogramNanos, histogramWindowInterval, metric.IOLatencyBuckets, - ), + LatencyHistogramNanos: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaLatencyHistogramNanos, + Duration: histogramWindowInterval, + Buckets: metric.IOLatencyBuckets, + }), } return &r } diff --git a/pkg/server/node.go b/pkg/server/node.go index b6a339517aef..a752e36ef632 100644 --- a/pkg/server/node.go +++ b/pkg/server/node.go @@ -165,7 +165,7 @@ var ( ) type nodeMetrics struct { - Latency *metric.Histogram + Latency metric.IHistogram Success *metric.Counter Err *metric.Counter DiskStalls *metric.Counter @@ -176,9 +176,12 @@ type nodeMetrics struct { func makeNodeMetrics(reg *metric.Registry, histogramWindow time.Duration) nodeMetrics { nm := nodeMetrics{ - Latency: metric.NewHistogram( - metaExecLatency, histogramWindow, metric.IOLatencyBuckets, - ), + Latency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: metaExecLatency, + Duration: histogramWindow, + Buckets: metric.IOLatencyBuckets, + }), Success: metric.NewCounter(metaExecSuccess), Err: metric.NewCounter(metaExecError), DiskStalls: metric.NewCounter(metaDiskStalls), diff --git a/pkg/server/status/recorder_test.go b/pkg/server/status/recorder_test.go index 25f27021437d..cafa94c88a38 100644 --- a/pkg/server/status/recorder_test.go +++ b/pkg/server/status/recorder_test.go @@ -286,7 +286,12 @@ func TestMetricsRecorder(t *testing.T) { c.Inc((data.val)) addExpected(reg.prefix, data.name, reg.source, 100, data.val, reg.isNode) case "histogram": - h := metric.NewHistogram(metric.Metadata{Name: reg.prefix + data.name}, time.Second, []float64{1.0, 10.0, 100.0, 1000.0}) + h := metric.NewHistogram(metric.HistogramOptions{ + Metadata: metric.Metadata{Name: reg.prefix + data.name}, + Duration: time.Second, + Buckets: []float64{1.0, 10.0, 100.0, 1000.0}, + Mode: metric.HistogramModePrometheus, + }) reg.reg.AddMetric(h) h.RecordValue(data.val) for _, q := range recordHistogramQuantiles { diff --git a/pkg/sql/conn_executor.go b/pkg/sql/conn_executor.go index 18248a60f7f0..4507f73134c7 100644 --- a/pkg/sql/conn_executor.go +++ b/pkg/sql/conn_executor.go @@ -449,21 +449,36 @@ func makeMetrics(internal bool) Metrics { SQLOptPlanCacheHits: metric.NewCounter(getMetricMeta(MetaSQLOptPlanCacheHits, internal)), SQLOptPlanCacheMisses: metric.NewCounter(getMetricMeta(MetaSQLOptPlanCacheMisses, internal)), // TODO(mrtracy): See HistogramWindowInterval in server/config.go for the 6x factor. 
- DistSQLExecLatency: metric.NewHistogram( - getMetricMeta(MetaDistSQLExecLatency, internal), 6*metricsSampleInterval, metric.IOLatencyBuckets, - ), - SQLExecLatency: metric.NewHistogram( - getMetricMeta(MetaSQLExecLatency, internal), 6*metricsSampleInterval, metric.IOLatencyBuckets, - ), - DistSQLServiceLatency: metric.NewHistogram( - getMetricMeta(MetaDistSQLServiceLatency, internal), 6*metricsSampleInterval, metric.IOLatencyBuckets, - ), - SQLServiceLatency: metric.NewHistogram( - getMetricMeta(MetaSQLServiceLatency, internal), 6*metricsSampleInterval, metric.IOLatencyBuckets, - ), - SQLTxnLatency: metric.NewHistogram( - getMetricMeta(MetaSQLTxnLatency, internal), 6*metricsSampleInterval, metric.IOLatencyBuckets, - ), + DistSQLExecLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: getMetricMeta(MetaDistSQLExecLatency, internal), + Duration: 6 * metricsSampleInterval, + Buckets: metric.IOLatencyBuckets, + }), + SQLExecLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: getMetricMeta(MetaSQLExecLatency, internal), + Duration: 6 * metricsSampleInterval, + Buckets: metric.IOLatencyBuckets, + }), + DistSQLServiceLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: getMetricMeta(MetaDistSQLServiceLatency, internal), + Duration: 6 * metricsSampleInterval, + Buckets: metric.IOLatencyBuckets, + }), + SQLServiceLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: getMetricMeta(MetaSQLServiceLatency, internal), + Duration: 6 * metricsSampleInterval, + Buckets: metric.IOLatencyBuckets, + }), + SQLTxnLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: getMetricMeta(MetaSQLTxnLatency, internal), + Duration: 6 * metricsSampleInterval, + Buckets: metric.IOLatencyBuckets, + }), SQLTxnsOpen: metric.NewGauge(getMetricMeta(MetaSQLTxnsOpen, internal)), SQLActiveStatements: metric.NewGauge(getMetricMeta(MetaSQLActiveQueries, internal)), SQLContendedTxns: metric.NewCounter(getMetricMeta(MetaSQLTxnContended, internal)), @@ -487,28 +502,38 @@ func makeMetrics(internal bool) Metrics { func makeServerMetrics(cfg *ExecutorConfig) ServerMetrics { return ServerMetrics{ StatsMetrics: StatsMetrics{ - SQLStatsMemoryMaxBytesHist: metric.NewHistogram( - MetaSQLStatsMemMaxBytes, - cfg.HistogramWindowInterval, - metric.MemoryUsage64MBBuckets, - ), + SQLStatsMemoryMaxBytesHist: metric.NewHistogram(metric.HistogramOptions{ + Metadata: MetaSQLStatsMemMaxBytes, + Duration: cfg.HistogramWindowInterval, + MaxVal: log10int64times1000, + SigFigs: 3, + Buckets: metric.MemoryUsage64MBBuckets, + }), SQLStatsMemoryCurBytesCount: metric.NewGauge(MetaSQLStatsMemCurBytes), - ReportedSQLStatsMemoryMaxBytesHist: metric.NewHistogram( - MetaReportedSQLStatsMemMaxBytes, - cfg.HistogramWindowInterval, - metric.MemoryUsage64MBBuckets, - ), + ReportedSQLStatsMemoryMaxBytesHist: metric.NewHistogram(metric.HistogramOptions{ + Metadata: MetaReportedSQLStatsMemMaxBytes, + Duration: cfg.HistogramWindowInterval, + MaxVal: log10int64times1000, + SigFigs: 3, + Buckets: metric.MemoryUsage64MBBuckets, + }), ReportedSQLStatsMemoryCurBytesCount: metric.NewGauge(MetaReportedSQLStatsMemCurBytes), DiscardedStatsCount: metric.NewCounter(MetaDiscardedSQLStats), SQLStatsFlushStarted: metric.NewCounter(MetaSQLStatsFlushStarted), SQLStatsFlushFailure: 
metric.NewCounter(MetaSQLStatsFlushFailure), - SQLStatsFlushDuration: metric.NewHistogram( - MetaSQLStatsFlushDuration, 6*metricsSampleInterval, metric.IOLatencyBuckets, - ), + SQLStatsFlushDuration: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: MetaSQLStatsFlushDuration, + Duration: 6 * metricsSampleInterval, + Buckets: metric.IOLatencyBuckets, + }), SQLStatsRemovedRows: metric.NewCounter(MetaSQLStatsRemovedRows), - SQLTxnStatsCollectionOverhead: metric.NewHistogram( - MetaSQLTxnStatsCollectionOverhead, 6*metricsSampleInterval, metric.IOLatencyBuckets, - ), + SQLTxnStatsCollectionOverhead: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: MetaSQLTxnStatsCollectionOverhead, + Duration: 6 * metricsSampleInterval, + Buckets: metric.IOLatencyBuckets, + }), }, ContentionSubsystemMetrics: txnidcache.NewMetrics(), InsightsMetrics: insights.NewMetrics(), diff --git a/pkg/sql/execinfra/metrics.go b/pkg/sql/execinfra/metrics.go index 85f36259b959..e27dc2707069 100644 --- a/pkg/sql/execinfra/metrics.go +++ b/pkg/sql/execinfra/metrics.go @@ -26,12 +26,12 @@ type DistSQLMetrics struct { FlowsTotal *metric.Counter FlowsQueued *metric.Gauge FlowsScheduled *metric.Counter - QueueWaitHist *metric.Histogram - MaxBytesHist *metric.Histogram + QueueWaitHist metric.IHistogram + MaxBytesHist metric.IHistogram CurBytesCount *metric.Gauge VecOpenFDs *metric.Gauge CurDiskBytesCount *metric.Gauge - MaxDiskBytesHist *metric.Histogram + MaxDiskBytesHist metric.IHistogram QueriesSpilled *metric.Counter SpilledBytesWritten *metric.Counter SpilledBytesRead *metric.Counter @@ -141,6 +141,10 @@ var ( } ) +// See pkg/sql/mem_metrics.go +// log10int64times1000 = log10(math.MaxInt64) * 1000, rounded up somewhat +const log10int64times1000 = 19 * 1000 + // MakeDistSQLMetrics instantiates the metrics holder for DistSQL monitoring. 
func MakeDistSQLMetrics(histogramWindow time.Duration) DistSQLMetrics { return DistSQLMetrics{ @@ -151,15 +155,31 @@ func MakeDistSQLMetrics(histogramWindow time.Duration) DistSQLMetrics { FlowsTotal: metric.NewCounter(metaFlowsTotal), FlowsQueued: metric.NewGauge(metaFlowsQueued), FlowsScheduled: metric.NewCounter(metaFlowsScheduled), - QueueWaitHist: metric.NewHistogram(metaQueueWaitHist, histogramWindow, metric.IOLatencyBuckets), - MaxBytesHist: metric.NewHistogram(metaMemMaxBytes, histogramWindow, metric.MemoryUsage64MBBuckets), - CurBytesCount: metric.NewGauge(metaMemCurBytes), - VecOpenFDs: metric.NewGauge(metaVecOpenFDs), - CurDiskBytesCount: metric.NewGauge(metaDiskCurBytes), - MaxDiskBytesHist: metric.NewHistogram(metaDiskMaxBytes, histogramWindow, metric.MemoryUsage64MBBuckets), - QueriesSpilled: metric.NewCounter(metaQueriesSpilled), - SpilledBytesWritten: metric.NewCounter(metaSpilledBytesWritten), - SpilledBytesRead: metric.NewCounter(metaSpilledBytesRead), + QueueWaitHist: metric.NewHistogram(metric.HistogramOptions{ + Metadata: metaQueueWaitHist, + Duration: histogramWindow, + Buckets: metric.IOLatencyBuckets, + Mode: metric.HistogramModePreferHdrLatency, + }), + MaxBytesHist: metric.NewHistogram(metric.HistogramOptions{ + Metadata: metaMemMaxBytes, + Duration: histogramWindow, + MaxVal: log10int64times1000, + SigFigs: 3, + Buckets: metric.MemoryUsage64MBBuckets, + }), + CurBytesCount: metric.NewGauge(metaMemCurBytes), + VecOpenFDs: metric.NewGauge(metaVecOpenFDs), + CurDiskBytesCount: metric.NewGauge(metaDiskCurBytes), + MaxDiskBytesHist: metric.NewHistogram(metric.HistogramOptions{ + Metadata: metaDiskMaxBytes, + Duration: histogramWindow, + MaxVal: log10int64times1000, + SigFigs: 3, + Buckets: metric.MemoryUsage64MBBuckets}), + QueriesSpilled: metric.NewCounter(metaQueriesSpilled), + SpilledBytesWritten: metric.NewCounter(metaSpilledBytesWritten), + SpilledBytesRead: metric.NewCounter(metaSpilledBytesRead), } } diff --git a/pkg/sql/executor_statement_metrics.go b/pkg/sql/executor_statement_metrics.go index d650970ac1d2..277f6df3a8c5 100644 --- a/pkg/sql/executor_statement_metrics.go +++ b/pkg/sql/executor_statement_metrics.go @@ -36,11 +36,11 @@ type EngineMetrics struct { SQLOptPlanCacheHits *metric.Counter SQLOptPlanCacheMisses *metric.Counter - DistSQLExecLatency *metric.Histogram - SQLExecLatency *metric.Histogram - DistSQLServiceLatency *metric.Histogram - SQLServiceLatency *metric.Histogram - SQLTxnLatency *metric.Histogram + DistSQLExecLatency metric.IHistogram + SQLExecLatency metric.IHistogram + DistSQLServiceLatency metric.IHistogram + SQLServiceLatency metric.IHistogram + SQLTxnLatency metric.IHistogram SQLTxnsOpen *metric.Gauge SQLActiveStatements *metric.Gauge SQLContendedTxns *metric.Counter @@ -69,20 +69,20 @@ func (EngineMetrics) MetricStruct() {} // StatsMetrics groups metrics related to SQL Stats collection. 
type StatsMetrics struct { - SQLStatsMemoryMaxBytesHist *metric.Histogram + SQLStatsMemoryMaxBytesHist metric.IHistogram SQLStatsMemoryCurBytesCount *metric.Gauge - ReportedSQLStatsMemoryMaxBytesHist *metric.Histogram + ReportedSQLStatsMemoryMaxBytesHist metric.IHistogram ReportedSQLStatsMemoryCurBytesCount *metric.Gauge DiscardedStatsCount *metric.Counter SQLStatsFlushStarted *metric.Counter SQLStatsFlushFailure *metric.Counter - SQLStatsFlushDuration *metric.Histogram + SQLStatsFlushDuration metric.IHistogram SQLStatsRemovedRows *metric.Counter - SQLTxnStatsCollectionOverhead *metric.Histogram + SQLTxnStatsCollectionOverhead metric.IHistogram } // StatsMetrics is part of the metric.Struct interface. diff --git a/pkg/sql/mem_metrics.go b/pkg/sql/mem_metrics.go index db2198d6bfd8..248aff4ddc88 100644 --- a/pkg/sql/mem_metrics.go +++ b/pkg/sql/mem_metrics.go @@ -19,7 +19,7 @@ import ( // BaseMemoryMetrics contains a max histogram and a current count of the // bytes allocated by a sql endpoint. type BaseMemoryMetrics struct { - MaxBytesHist *metric.Histogram + MaxBytesHist metric.IHistogram CurBytesCount *metric.Gauge } @@ -30,9 +30,9 @@ type BaseMemoryMetrics struct { // - "internal" for activities related to leases, schema changes, etc. type MemoryMetrics struct { BaseMemoryMetrics - TxnMaxBytesHist *metric.Histogram + TxnMaxBytesHist metric.IHistogram TxnCurBytesCount *metric.Gauge - SessionMaxBytesHist *metric.Histogram + SessionMaxBytesHist metric.IHistogram SessionCurBytesCount *metric.Gauge } @@ -41,6 +41,22 @@ func (MemoryMetrics) MetricStruct() {} var _ metric.Struct = MemoryMetrics{} +// TODO(knz): Until #10014 is addressed, the UI graphs don't have a +// log scale on the Y axis and the histograms are thus displayed using +// a manual log scale: we store the logarithm in the value in the DB +// and plot that logarithm in the UI. +// +// We could, but do not, store the full value in the DB and compute +// the log in the UI, because the current histogram implementation +// does not deal well with large maxima (#10015). +// +// Since the DB stores an integer, we scale the values by 1000 so that +// a modicum of precision is restored when exponentiating the value. 
+// + +// log10int64times1000 = log10(math.MaxInt64) * 1000, rounded up somewhat +const log10int64times1000 = 19 * 1000 + func makeMemMetricMetadata(name, help string) metric.Metadata { return metric.Metadata{ Name: name, @@ -57,7 +73,13 @@ func MakeBaseMemMetrics(endpoint string, histogramWindow time.Duration) BaseMemo MetaMemMaxBytes := makeMemMetricMetadata(prefix+".max", "Memory usage per sql statement for "+endpoint) MetaMemCurBytes := makeMemMetricMetadata(prefix+".current", "Current sql statement memory usage for "+endpoint) return BaseMemoryMetrics{ - MaxBytesHist: metric.NewHistogram(MetaMemMaxBytes, histogramWindow, metric.MemoryUsage64MBBuckets), + MaxBytesHist: metric.NewHistogram(metric.HistogramOptions{ + Metadata: MetaMemMaxBytes, + Duration: histogramWindow, + MaxVal: log10int64times1000, + SigFigs: 3, + Buckets: metric.MemoryUsage64MBBuckets, + }), CurBytesCount: metric.NewGauge(MetaMemCurBytes), } } @@ -71,10 +93,20 @@ func MakeMemMetrics(endpoint string, histogramWindow time.Duration) MemoryMetric MetaMemMaxSessionBytes := makeMemMetricMetadata(prefix+".session.max", "Memory usage per sql session for "+endpoint) MetaMemSessionCurBytes := makeMemMetricMetadata(prefix+".session.current", "Current sql session memory usage for "+endpoint) return MemoryMetrics{ - BaseMemoryMetrics: base, - TxnMaxBytesHist: metric.NewHistogram(MetaMemMaxTxnBytes, histogramWindow, metric.MemoryUsage64MBBuckets), - TxnCurBytesCount: metric.NewGauge(MetaMemTxnCurBytes), - SessionMaxBytesHist: metric.NewHistogram(MetaMemMaxSessionBytes, histogramWindow, metric.MemoryUsage64MBBuckets), + BaseMemoryMetrics: base, + TxnMaxBytesHist: metric.NewHistogram(metric.HistogramOptions{ + Metadata: MetaMemMaxTxnBytes, + Duration: histogramWindow, + MaxVal: log10int64times1000, + SigFigs: 3, + Buckets: metric.MemoryUsage64MBBuckets}), + TxnCurBytesCount: metric.NewGauge(MetaMemTxnCurBytes), + SessionMaxBytesHist: metric.NewHistogram(metric.HistogramOptions{ + Metadata: MetaMemMaxSessionBytes, + Duration: histogramWindow, + MaxVal: log10int64times1000, + SigFigs: 3, + Buckets: metric.MemoryUsage64MBBuckets}), SessionCurBytesCount: metric.NewGauge(MetaMemSessionCurBytes), } diff --git a/pkg/sql/pgwire/server.go b/pkg/sql/pgwire/server.go index 735b9027a221..1e2ed1e3ef25 100644 --- a/pkg/sql/pgwire/server.go +++ b/pkg/sql/pgwire/server.go @@ -279,7 +279,7 @@ type ServerMetrics struct { BytesOutCount *metric.Counter Conns *metric.Gauge NewConns *metric.Counter - ConnLatency *metric.Histogram + ConnLatency metric.IHistogram ConnFailures *metric.Counter PGWireCancelTotalCount *metric.Counter PGWireCancelIgnoredCount *metric.Counter @@ -296,9 +296,12 @@ func makeServerMetrics( BytesOutCount: metric.NewCounter(MetaBytesOut), Conns: metric.NewGauge(MetaConns), NewConns: metric.NewCounter(MetaNewConns), - ConnLatency: metric.NewHistogram( - MetaConnLatency, histogramWindow, metric.IOLatencyBuckets, - ), + ConnLatency: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: MetaConnLatency, + Duration: histogramWindow, + Buckets: metric.IOLatencyBuckets, + }), ConnFailures: metric.NewCounter(MetaConnFailures), PGWireCancelTotalCount: metric.NewCounter(MetaPGWireCancelTotal), PGWireCancelIgnoredCount: metric.NewCounter(MetaPGWireCancelIgnored), diff --git a/pkg/sql/sqlstats/persistedsqlstats/provider.go b/pkg/sql/sqlstats/persistedsqlstats/provider.go index 4bf1483f1da5..a65544932ae3 100644 --- a/pkg/sql/sqlstats/persistedsqlstats/provider.go +++ 
b/pkg/sql/sqlstats/persistedsqlstats/provider.go @@ -45,7 +45,7 @@ type Config struct { // Metrics. FlushCounter *metric.Counter - FlushDuration *metric.Histogram + FlushDuration metric.IHistogram FailureCounter *metric.Counter // Testing knobs. diff --git a/pkg/sql/sqlstats/sslocal/sql_stats.go b/pkg/sql/sqlstats/sslocal/sql_stats.go index 90c3350961eb..92fe6dcf3904 100644 --- a/pkg/sql/sqlstats/sslocal/sql_stats.go +++ b/pkg/sql/sqlstats/sslocal/sql_stats.go @@ -75,7 +75,7 @@ func newSQLStats( uniqueStmtFingerprintLimit *settings.IntSetting, uniqueTxnFingerprintLimit *settings.IntSetting, curMemBytesCount *metric.Gauge, - maxMemBytesHist *metric.Histogram, + maxMemBytesHist metric.IHistogram, insightsWriter insights.WriterProvider, parentMon *mon.BytesMonitor, flushTarget Sink, diff --git a/pkg/sql/sqlstats/sslocal/sslocal_provider.go b/pkg/sql/sqlstats/sslocal/sslocal_provider.go index bf7d7e439a08..b035941011a7 100644 --- a/pkg/sql/sqlstats/sslocal/sslocal_provider.go +++ b/pkg/sql/sqlstats/sslocal/sslocal_provider.go @@ -37,7 +37,7 @@ func New( maxStmtFingerprints *settings.IntSetting, maxTxnFingerprints *settings.IntSetting, curMemoryBytesCount *metric.Gauge, - maxMemoryBytesHist *metric.Histogram, + maxMemoryBytesHist metric.IHistogram, insightsWriter insights.WriterProvider, pool *mon.BytesMonitor, reportingSink Sink, diff --git a/pkg/sql/ttl/ttljob/ttljob_metrics.go b/pkg/sql/ttl/ttljob/ttljob_metrics.go index 547239a65e59..6f0356ddcc20 100644 --- a/pkg/sql/ttl/ttljob/ttljob_metrics.go +++ b/pkg/sql/ttl/ttljob/ttljob_metrics.go @@ -96,41 +96,48 @@ func (m *RowLevelTTLAggMetrics) loadMetrics(labelMetrics bool, relation string) } func makeRowLevelTTLAggMetrics(histogramWindowInterval time.Duration) metric.Struct { + sigFigs := 2 b := aggmetric.MakeBuilder("relation") ret := &RowLevelTTLAggMetrics{ - SpanTotalDuration: b.Histogram( - metric.Metadata{ + SpanTotalDuration: b.Histogram(metric.HistogramOptions{ + Metadata: metric.Metadata{ Name: "jobs.row_level_ttl.span_total_duration", Help: "Duration for processing a span during row level TTL.", Measurement: "nanoseconds", Unit: metric.Unit_NANOSECONDS, MetricType: io_prometheus_client.MetricType_HISTOGRAM, }, - histogramWindowInterval, - metric.LongRunning60mLatencyBuckets, - ), - SelectDuration: b.Histogram( - metric.Metadata{ + MaxVal: time.Hour.Nanoseconds(), + SigFigs: sigFigs, + Duration: histogramWindowInterval, + Buckets: metric.LongRunning60mLatencyBuckets, + }), + SelectDuration: b.Histogram(metric.HistogramOptions{ + Metadata: metric.Metadata{ Name: "jobs.row_level_ttl.select_duration", Help: "Duration for select requests during row level TTL.", Measurement: "nanoseconds", Unit: metric.Unit_NANOSECONDS, MetricType: io_prometheus_client.MetricType_HISTOGRAM, }, - histogramWindowInterval, - metric.BatchProcessLatencyBuckets, - ), - DeleteDuration: b.Histogram( - metric.Metadata{ + MaxVal: time.Minute.Nanoseconds(), + SigFigs: sigFigs, + Duration: histogramWindowInterval, + Buckets: metric.BatchProcessLatencyBuckets, + }), + DeleteDuration: b.Histogram(metric.HistogramOptions{ + Metadata: metric.Metadata{ Name: "jobs.row_level_ttl.delete_duration", Help: "Duration for delete requests during row level TTL.", Measurement: "nanoseconds", Unit: metric.Unit_NANOSECONDS, MetricType: io_prometheus_client.MetricType_HISTOGRAM, }, - histogramWindowInterval, - metric.BatchProcessLatencyBuckets, - ), + MaxVal: time.Minute.Nanoseconds(), + SigFigs: sigFigs, + Duration: histogramWindowInterval, + Buckets: metric.BatchProcessLatencyBuckets, 
+ }), RowSelections: b.Counter( metric.Metadata{ Name: "jobs.row_level_ttl.rows_selected", diff --git a/pkg/util/admission/work_queue.go b/pkg/util/admission/work_queue.go index 1d20b16f6ce0..f59fe1d8bafa 100644 --- a/pkg/util/admission/work_queue.go +++ b/pkg/util/admission/work_queue.go @@ -1541,7 +1541,7 @@ type WorkQueueMetrics struct { Admitted *metric.Counter Errored *metric.Counter WaitDurationSum *metric.Counter - WaitDurations *metric.Histogram + WaitDurations metric.IHistogram WaitQueueLength *metric.Gauge } @@ -1554,9 +1554,12 @@ func makeWorkQueueMetrics(name string) WorkQueueMetrics { Admitted: metric.NewCounter(addName(name, admittedMeta)), Errored: metric.NewCounter(addName(name, erroredMeta)), WaitDurationSum: metric.NewCounter(addName(name, waitDurationSumMeta)), - WaitDurations: metric.NewHistogram( - addName(name, waitDurationsMeta), base.DefaultHistogramWindowInterval(), metric.IOLatencyBuckets, - ), + WaitDurations: metric.NewHistogram(metric.HistogramOptions{ + Mode: metric.HistogramModePreferHdrLatency, + Metadata: addName(name, waitDurationsMeta), + Duration: base.DefaultHistogramWindowInterval(), + Buckets: metric.IOLatencyBuckets, + }), WaitQueueLength: metric.NewGauge(addName(name, waitQueueLengthMeta)), } } diff --git a/pkg/util/metric/BUILD.bazel b/pkg/util/metric/BUILD.bazel index fe1ad312f84d..575e6e08bc0b 100644 --- a/pkg/util/metric/BUILD.bazel +++ b/pkg/util/metric/BUILD.bazel @@ -8,6 +8,7 @@ go_library( srcs = [ "doc.go", "graphite_exporter.go", + "hdrhistogram.go", "histogram_buckets.go", "metric.go", "prometheus_exporter.go", @@ -22,10 +23,13 @@ go_library( importpath = "github.com/cockroachdb/cockroach/pkg/util/metric", visibility = ["//visibility:public"], deps = [ + "//pkg/util", + "//pkg/util/envutil", "//pkg/util/log", "//pkg/util/syncutil", "//pkg/util/timeutil", "@com_github_cockroachdb_errors//:errors", + "@com_github_codahale_hdrhistogram//:hdrhistogram", "@com_github_gogo_protobuf//proto", "@com_github_prometheus_client_golang//prometheus", "@com_github_prometheus_client_golang//prometheus/graphite", diff --git a/pkg/util/metric/aggmetric/BUILD.bazel b/pkg/util/metric/aggmetric/BUILD.bazel index 4f66e44deba4..a1b468d64742 100644 --- a/pkg/util/metric/aggmetric/BUILD.bazel +++ b/pkg/util/metric/aggmetric/BUILD.bazel @@ -17,7 +17,6 @@ go_library( "@com_github_cockroachdb_errors//:errors", "@com_github_gogo_protobuf//proto", "@com_github_google_btree//:btree", - "@com_github_prometheus_client_golang//prometheus", "@com_github_prometheus_client_model//go", ], ) @@ -27,10 +26,13 @@ go_test( size = "small", srcs = ["agg_metric_test.go"], args = ["-test.timeout=55s"], + data = glob(["testdata/**"]), deps = [ ":aggmetric", "//pkg/base", "//pkg/roachpb", + "//pkg/testutils", + "//pkg/testutils/echotest", "//pkg/util/leaktest", "//pkg/util/metric", "@com_github_prometheus_client_model//go", diff --git a/pkg/util/metric/aggmetric/agg_metric.go b/pkg/util/metric/aggmetric/agg_metric.go index c9afb965d64f..ab5ad03ce5b6 100644 --- a/pkg/util/metric/aggmetric/agg_metric.go +++ b/pkg/util/metric/aggmetric/agg_metric.go @@ -15,7 +15,6 @@ package aggmetric import ( "strings" - "time" "github.com/cockroachdb/cockroach/pkg/util/metric" "github.com/cockroachdb/cockroach/pkg/util/syncutil" @@ -50,10 +49,8 @@ func (b Builder) Counter(metadata metric.Metadata) *AggCounter { } // Histogram constructs a new AggHistogram with the Builder's labels. 
-func (b Builder) Histogram( - metadata metric.Metadata, duration time.Duration, buckets []float64, -) *AggHistogram { - return NewHistogram(metadata, duration, buckets, b.labels...) +func (b Builder) Histogram(opts metric.HistogramOptions) *AggHistogram { + return NewHistogram(opts, b.labels...) } type childSet struct { diff --git a/pkg/util/metric/aggmetric/agg_metric_test.go b/pkg/util/metric/aggmetric/agg_metric_test.go index acb9b9883b97..c245e1d6be84 100644 --- a/pkg/util/metric/aggmetric/agg_metric_test.go +++ b/pkg/util/metric/aggmetric/agg_metric_test.go @@ -19,6 +19,8 @@ import ( "github.com/cockroachdb/cockroach/pkg/base" "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/testutils" + "github.com/cockroachdb/cockroach/pkg/testutils/echotest" "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/cockroachdb/cockroach/pkg/util/metric" "github.com/cockroachdb/cockroach/pkg/util/metric/aggmetric" @@ -61,10 +63,15 @@ func TestAggMetric(t *testing.T) { Name: "baz_gauge", }, "tenant_id") r.AddMetric(f) - - h := aggmetric.NewHistogram(metric.Metadata{ - Name: "histo_gram", - }, base.DefaultHistogramWindowInterval(), metric.Count1KBuckets, "tenant_id") + h := aggmetric.NewHistogram(metric.HistogramOptions{ + Metadata: metric.Metadata{ + Name: "histo_gram", + }, + Duration: base.DefaultHistogramWindowInterval(), + MaxVal: 100, + SigFigs: 1, + Buckets: metric.Count1KBuckets, + }, "tenant_id") r.AddMetric(h) tenant2 := roachpb.MakeTenantID(2) @@ -88,59 +95,11 @@ func TestAggMetric(t *testing.T) { f3.Update(2.5) h2.RecordValue(10) h3.RecordValue(90) - require.Equal(t, - `bar_gauge 4 -bar_gauge{tenant_id="2"} 2 -bar_gauge{tenant_id="3"} 2 -baz_gauge 4 -baz_gauge{tenant_id="2"} 1.5 -baz_gauge{tenant_id="3"} 2.5 -foo_counter 6 -foo_counter{tenant_id="2"} 2 -foo_counter{tenant_id="3"} 4 -histo_gram_bucket{le="+Inf"} 2 -histo_gram_bucket{le="1"} 0 -histo_gram_bucket{le="1024"} 2 -histo_gram_bucket{le="128"} 2 -histo_gram_bucket{le="16"} 1 -histo_gram_bucket{le="2"} 0 -histo_gram_bucket{le="256"} 2 -histo_gram_bucket{le="32"} 1 -histo_gram_bucket{le="4"} 0 -histo_gram_bucket{le="512"} 2 -histo_gram_bucket{le="64"} 1 -histo_gram_bucket{le="8"} 0 -histo_gram_bucket{tenant_id="2",le="+Inf"} 1 -histo_gram_bucket{tenant_id="2",le="1"} 0 -histo_gram_bucket{tenant_id="2",le="1024"} 1 -histo_gram_bucket{tenant_id="2",le="128"} 1 -histo_gram_bucket{tenant_id="2",le="16"} 1 -histo_gram_bucket{tenant_id="2",le="2"} 0 -histo_gram_bucket{tenant_id="2",le="256"} 1 -histo_gram_bucket{tenant_id="2",le="32"} 1 -histo_gram_bucket{tenant_id="2",le="4"} 0 -histo_gram_bucket{tenant_id="2",le="512"} 1 -histo_gram_bucket{tenant_id="2",le="64"} 1 -histo_gram_bucket{tenant_id="2",le="8"} 0 -histo_gram_bucket{tenant_id="3",le="+Inf"} 1 -histo_gram_bucket{tenant_id="3",le="1"} 0 -histo_gram_bucket{tenant_id="3",le="1024"} 1 -histo_gram_bucket{tenant_id="3",le="128"} 1 -histo_gram_bucket{tenant_id="3",le="16"} 0 -histo_gram_bucket{tenant_id="3",le="2"} 0 -histo_gram_bucket{tenant_id="3",le="256"} 1 -histo_gram_bucket{tenant_id="3",le="32"} 0 -histo_gram_bucket{tenant_id="3",le="4"} 0 -histo_gram_bucket{tenant_id="3",le="512"} 1 -histo_gram_bucket{tenant_id="3",le="64"} 0 -histo_gram_bucket{tenant_id="3",le="8"} 0 -histo_gram_count 2 -histo_gram_count{tenant_id="2"} 1 -histo_gram_count{tenant_id="3"} 1 -histo_gram_sum 100 -histo_gram_sum{tenant_id="2"} 10 -histo_gram_sum{tenant_id="3"} 90`, - writePrometheusMetrics(t)) + testFile := "basic.txt" + if metric.HdrEnabled() { + 
testFile = "basic_hdr.txt" + } + echotest.Require(t, writePrometheusMetrics(t), testutils.TestDataPath(t, testFile)) }) t.Run("destroy", func(t *testing.T) { @@ -148,42 +107,11 @@ histo_gram_sum{tenant_id="3"} 90`, c2.Destroy() f3.Destroy() h3.Destroy() - require.Equal(t, - `bar_gauge 2 -bar_gauge{tenant_id="2"} 2 -baz_gauge 1.5 -baz_gauge{tenant_id="2"} 1.5 -foo_counter 6 -foo_counter{tenant_id="3"} 4 -histo_gram_bucket{le="+Inf"} 2 -histo_gram_bucket{le="1"} 0 -histo_gram_bucket{le="1024"} 2 -histo_gram_bucket{le="128"} 2 -histo_gram_bucket{le="16"} 1 -histo_gram_bucket{le="2"} 0 -histo_gram_bucket{le="256"} 2 -histo_gram_bucket{le="32"} 1 -histo_gram_bucket{le="4"} 0 -histo_gram_bucket{le="512"} 2 -histo_gram_bucket{le="64"} 1 -histo_gram_bucket{le="8"} 0 -histo_gram_bucket{tenant_id="2",le="+Inf"} 1 -histo_gram_bucket{tenant_id="2",le="1"} 0 -histo_gram_bucket{tenant_id="2",le="1024"} 1 -histo_gram_bucket{tenant_id="2",le="128"} 1 -histo_gram_bucket{tenant_id="2",le="16"} 1 -histo_gram_bucket{tenant_id="2",le="2"} 0 -histo_gram_bucket{tenant_id="2",le="256"} 1 -histo_gram_bucket{tenant_id="2",le="32"} 1 -histo_gram_bucket{tenant_id="2",le="4"} 0 -histo_gram_bucket{tenant_id="2",le="512"} 1 -histo_gram_bucket{tenant_id="2",le="64"} 1 -histo_gram_bucket{tenant_id="2",le="8"} 0 -histo_gram_count 2 -histo_gram_count{tenant_id="2"} 1 -histo_gram_sum 100 -histo_gram_sum{tenant_id="2"} 10`, - writePrometheusMetrics(t)) + testFile := "destroy.txt" + if metric.HdrEnabled() { + testFile = "destroy_hdr.txt" + } + echotest.Require(t, writePrometheusMetrics(t), testutils.TestDataPath(t, testFile)) }) t.Run("panic on already exists", func(t *testing.T) { @@ -204,59 +132,11 @@ histo_gram_sum{tenant_id="2"} 10`, c2 = c.AddChild(tenant2.String()) f3 = f.AddChild(tenant3.String()) h3 = h.AddChild(tenant3.String()) - require.Equal(t, - `bar_gauge 2 -bar_gauge{tenant_id="2"} 2 -bar_gauge{tenant_id="3"} 0 -baz_gauge 1.5 -baz_gauge{tenant_id="2"} 1.5 -baz_gauge{tenant_id="3"} 0 -foo_counter 6 -foo_counter{tenant_id="2"} 0 -foo_counter{tenant_id="3"} 4 -histo_gram_bucket{le="+Inf"} 2 -histo_gram_bucket{le="1"} 0 -histo_gram_bucket{le="1024"} 2 -histo_gram_bucket{le="128"} 2 -histo_gram_bucket{le="16"} 1 -histo_gram_bucket{le="2"} 0 -histo_gram_bucket{le="256"} 2 -histo_gram_bucket{le="32"} 1 -histo_gram_bucket{le="4"} 0 -histo_gram_bucket{le="512"} 2 -histo_gram_bucket{le="64"} 1 -histo_gram_bucket{le="8"} 0 -histo_gram_bucket{tenant_id="2",le="+Inf"} 1 -histo_gram_bucket{tenant_id="2",le="1"} 0 -histo_gram_bucket{tenant_id="2",le="1024"} 1 -histo_gram_bucket{tenant_id="2",le="128"} 1 -histo_gram_bucket{tenant_id="2",le="16"} 1 -histo_gram_bucket{tenant_id="2",le="2"} 0 -histo_gram_bucket{tenant_id="2",le="256"} 1 -histo_gram_bucket{tenant_id="2",le="32"} 1 -histo_gram_bucket{tenant_id="2",le="4"} 0 -histo_gram_bucket{tenant_id="2",le="512"} 1 -histo_gram_bucket{tenant_id="2",le="64"} 1 -histo_gram_bucket{tenant_id="2",le="8"} 0 -histo_gram_bucket{tenant_id="3",le="+Inf"} 0 -histo_gram_bucket{tenant_id="3",le="1"} 0 -histo_gram_bucket{tenant_id="3",le="1024"} 0 -histo_gram_bucket{tenant_id="3",le="128"} 0 -histo_gram_bucket{tenant_id="3",le="16"} 0 -histo_gram_bucket{tenant_id="3",le="2"} 0 -histo_gram_bucket{tenant_id="3",le="256"} 0 -histo_gram_bucket{tenant_id="3",le="32"} 0 -histo_gram_bucket{tenant_id="3",le="4"} 0 -histo_gram_bucket{tenant_id="3",le="512"} 0 -histo_gram_bucket{tenant_id="3",le="64"} 0 -histo_gram_bucket{tenant_id="3",le="8"} 0 -histo_gram_count 2 -histo_gram_count{tenant_id="2"} 1 
-histo_gram_count{tenant_id="3"} 0 -histo_gram_sum 100 -histo_gram_sum{tenant_id="2"} 10 -histo_gram_sum{tenant_id="3"} 0`, - writePrometheusMetrics(t)) + testFile := "add_after_destroy.txt" + if metric.HdrEnabled() { + testFile = "add_after_destroy_hdr.txt" + } + echotest.Require(t, writePrometheusMetrics(t), testutils.TestDataPath(t, testFile)) }) t.Run("panic on label length mismatch", func(t *testing.T) { @@ -272,8 +152,13 @@ func TestAggMetricBuilder(t *testing.T) { c := b.Counter(metric.Metadata{Name: "foo_counter"}) g := b.Gauge(metric.Metadata{Name: "bar_gauge"}) f := b.GaugeFloat64(metric.Metadata{Name: "baz_gauge"}) - h := b.Histogram(metric.Metadata{Name: "histo_gram"}, - base.DefaultHistogramWindowInterval(), metric.Count1KBuckets) + h := b.Histogram(metric.HistogramOptions{ + Metadata: metric.Metadata{Name: "histo_gram"}, + Duration: base.DefaultHistogramWindowInterval(), + MaxVal: 100, + SigFigs: 1, + Buckets: metric.Count1KBuckets, + }) for i := 5; i < 10; i++ { tenantLabel := roachpb.MakeTenantID(uint64(i)).String() diff --git a/pkg/util/metric/aggmetric/histogram.go b/pkg/util/metric/aggmetric/histogram.go index 66bb7aa3853b..736de6a108c3 100644 --- a/pkg/util/metric/aggmetric/histogram.go +++ b/pkg/util/metric/aggmetric/histogram.go @@ -11,10 +11,7 @@ package aggmetric import ( - "time" - "github.com/cockroachdb/cockroach/pkg/util/metric" - "github.com/prometheus/client_golang/prometheus" io_prometheus_client "github.com/prometheus/client_model/go" ) @@ -23,8 +20,8 @@ import ( // children, while its children are additionally exported to prometheus via the // PrometheusIterable interface. type AggHistogram struct { - h metric.Histogram - create func() *metric.Histogram + h metric.IHistogram + create func() metric.IHistogram childSet } @@ -34,14 +31,12 @@ var _ metric.PrometheusExportable = (*AggHistogram)(nil) var _ metric.WindowedHistogram = (*AggHistogram)(nil) // NewHistogram constructs a new AggHistogram. -func NewHistogram( - metadata metric.Metadata, duration time.Duration, buckets []float64, childLabels ...string, -) *AggHistogram { - create := func() *metric.Histogram { - return metric.NewHistogram(metadata, duration, buckets) +func NewHistogram(opts metric.HistogramOptions, childLabels ...string) *AggHistogram { + create := func() metric.IHistogram { + return metric.NewHistogram(opts) } a := &AggHistogram{ - h: *create(), + h: create(), create: create, } a.init(childLabels) @@ -96,19 +91,13 @@ func (a *AggHistogram) ToPrometheusMetric() *io_prometheus_client.Metric { return a.h.ToPrometheusMetric() } -// Windowed returns a copy of the current windowed histogram data and its -// rotation interval. -func (a *AggHistogram) Windowed() prometheus.Histogram { - return a.h.Windowed() -} - // AddChild adds a Counter to this AggCounter. This method panics if a Counter // already exists for this set of labelVals. func (a *AggHistogram) AddChild(labelVals ...string) *Histogram { child := &Histogram{ parent: a, labelValuesSlice: labelValuesSlice(labelVals), - h: *a.create(), + h: a.create(), } a.add(child) return child @@ -121,7 +110,7 @@ func (a *AggHistogram) AddChild(labelVals ...string) *Histogram { type Histogram struct { parent *AggHistogram labelValuesSlice - h metric.Histogram + h metric.IHistogram } // ToPrometheusMetric constructs a prometheus metric for this Histogram. 
diff --git a/pkg/util/metric/aggmetric/testdata/add_after_destroy.txt b/pkg/util/metric/aggmetric/testdata/add_after_destroy.txt new file mode 100644 index 000000000000..da924a958081 --- /dev/null +++ b/pkg/util/metric/aggmetric/testdata/add_after_destroy.txt @@ -0,0 +1,53 @@ +echo +---- +bar_gauge 2 +bar_gauge{tenant_id="2"} 2 +bar_gauge{tenant_id="3"} 0 +baz_gauge 1.5 +baz_gauge{tenant_id="2"} 1.5 +baz_gauge{tenant_id="3"} 0 +foo_counter 6 +foo_counter{tenant_id="2"} 0 +foo_counter{tenant_id="3"} 4 +histo_gram_bucket{le="+Inf"} 2 +histo_gram_bucket{le="1"} 0 +histo_gram_bucket{le="1024"} 2 +histo_gram_bucket{le="128"} 2 +histo_gram_bucket{le="16"} 1 +histo_gram_bucket{le="2"} 0 +histo_gram_bucket{le="256"} 2 +histo_gram_bucket{le="32"} 1 +histo_gram_bucket{le="4"} 0 +histo_gram_bucket{le="512"} 2 +histo_gram_bucket{le="64"} 1 +histo_gram_bucket{le="8"} 0 +histo_gram_bucket{tenant_id="2",le="+Inf"} 1 +histo_gram_bucket{tenant_id="2",le="1"} 0 +histo_gram_bucket{tenant_id="2",le="1024"} 1 +histo_gram_bucket{tenant_id="2",le="128"} 1 +histo_gram_bucket{tenant_id="2",le="16"} 1 +histo_gram_bucket{tenant_id="2",le="2"} 0 +histo_gram_bucket{tenant_id="2",le="256"} 1 +histo_gram_bucket{tenant_id="2",le="32"} 1 +histo_gram_bucket{tenant_id="2",le="4"} 0 +histo_gram_bucket{tenant_id="2",le="512"} 1 +histo_gram_bucket{tenant_id="2",le="64"} 1 +histo_gram_bucket{tenant_id="2",le="8"} 0 +histo_gram_bucket{tenant_id="3",le="+Inf"} 0 +histo_gram_bucket{tenant_id="3",le="1"} 0 +histo_gram_bucket{tenant_id="3",le="1024"} 0 +histo_gram_bucket{tenant_id="3",le="128"} 0 +histo_gram_bucket{tenant_id="3",le="16"} 0 +histo_gram_bucket{tenant_id="3",le="2"} 0 +histo_gram_bucket{tenant_id="3",le="256"} 0 +histo_gram_bucket{tenant_id="3",le="32"} 0 +histo_gram_bucket{tenant_id="3",le="4"} 0 +histo_gram_bucket{tenant_id="3",le="512"} 0 +histo_gram_bucket{tenant_id="3",le="64"} 0 +histo_gram_bucket{tenant_id="3",le="8"} 0 +histo_gram_count 2 +histo_gram_count{tenant_id="2"} 1 +histo_gram_count{tenant_id="3"} 0 +histo_gram_sum 100 +histo_gram_sum{tenant_id="2"} 10 +histo_gram_sum{tenant_id="3"} 0 \ No newline at end of file diff --git a/pkg/util/metric/aggmetric/testdata/add_after_destroy_hdr.txt b/pkg/util/metric/aggmetric/testdata/add_after_destroy_hdr.txt new file mode 100644 index 000000000000..a891cc503df1 --- /dev/null +++ b/pkg/util/metric/aggmetric/testdata/add_after_destroy_hdr.txt @@ -0,0 +1,23 @@ +echo +---- +bar_gauge 2 +bar_gauge{tenant_id="2"} 2 +bar_gauge{tenant_id="3"} 0 +baz_gauge 1.5 +baz_gauge{tenant_id="2"} 1.5 +baz_gauge{tenant_id="3"} 0 +foo_counter 6 +foo_counter{tenant_id="2"} 0 +foo_counter{tenant_id="3"} 4 +histo_gram_bucket{le="+Inf"} 2 +histo_gram_bucket{le="10"} 1 +histo_gram_bucket{le="91"} 2 +histo_gram_bucket{tenant_id="2",le="+Inf"} 1 +histo_gram_bucket{tenant_id="2",le="10"} 1 +histo_gram_bucket{tenant_id="3",le="+Inf"} 0 +histo_gram_count 2 +histo_gram_count{tenant_id="2"} 1 +histo_gram_count{tenant_id="3"} 0 +histo_gram_sum 101 +histo_gram_sum{tenant_id="2"} 10 +histo_gram_sum{tenant_id="3"} 0 \ No newline at end of file diff --git a/pkg/util/metric/aggmetric/testdata/basic.txt b/pkg/util/metric/aggmetric/testdata/basic.txt new file mode 100644 index 000000000000..3191b6775854 --- /dev/null +++ b/pkg/util/metric/aggmetric/testdata/basic.txt @@ -0,0 +1,54 @@ + +echo +---- +bar_gauge 4 +bar_gauge{tenant_id="2"} 2 +bar_gauge{tenant_id="3"} 2 +baz_gauge 4 +baz_gauge{tenant_id="2"} 1.5 +baz_gauge{tenant_id="3"} 2.5 +foo_counter 6 +foo_counter{tenant_id="2"} 2 
+foo_counter{tenant_id="3"} 4 +histo_gram_bucket{le="+Inf"} 2 +histo_gram_bucket{le="1"} 0 +histo_gram_bucket{le="1024"} 2 +histo_gram_bucket{le="128"} 2 +histo_gram_bucket{le="16"} 1 +histo_gram_bucket{le="2"} 0 +histo_gram_bucket{le="256"} 2 +histo_gram_bucket{le="32"} 1 +histo_gram_bucket{le="4"} 0 +histo_gram_bucket{le="512"} 2 +histo_gram_bucket{le="64"} 1 +histo_gram_bucket{le="8"} 0 +histo_gram_bucket{tenant_id="2",le="+Inf"} 1 +histo_gram_bucket{tenant_id="2",le="1"} 0 +histo_gram_bucket{tenant_id="2",le="1024"} 1 +histo_gram_bucket{tenant_id="2",le="128"} 1 +histo_gram_bucket{tenant_id="2",le="16"} 1 +histo_gram_bucket{tenant_id="2",le="2"} 0 +histo_gram_bucket{tenant_id="2",le="256"} 1 +histo_gram_bucket{tenant_id="2",le="32"} 1 +histo_gram_bucket{tenant_id="2",le="4"} 0 +histo_gram_bucket{tenant_id="2",le="512"} 1 +histo_gram_bucket{tenant_id="2",le="64"} 1 +histo_gram_bucket{tenant_id="2",le="8"} 0 +histo_gram_bucket{tenant_id="3",le="+Inf"} 1 +histo_gram_bucket{tenant_id="3",le="1"} 0 +histo_gram_bucket{tenant_id="3",le="1024"} 1 +histo_gram_bucket{tenant_id="3",le="128"} 1 +histo_gram_bucket{tenant_id="3",le="16"} 0 +histo_gram_bucket{tenant_id="3",le="2"} 0 +histo_gram_bucket{tenant_id="3",le="256"} 1 +histo_gram_bucket{tenant_id="3",le="32"} 0 +histo_gram_bucket{tenant_id="3",le="4"} 0 +histo_gram_bucket{tenant_id="3",le="512"} 1 +histo_gram_bucket{tenant_id="3",le="64"} 0 +histo_gram_bucket{tenant_id="3",le="8"} 0 +histo_gram_count 2 +histo_gram_count{tenant_id="2"} 1 +histo_gram_count{tenant_id="3"} 1 +histo_gram_sum 100 +histo_gram_sum{tenant_id="2"} 10 +histo_gram_sum{tenant_id="3"} 90 \ No newline at end of file diff --git a/pkg/util/metric/aggmetric/testdata/basic_hdr.txt b/pkg/util/metric/aggmetric/testdata/basic_hdr.txt new file mode 100644 index 000000000000..a796b8ef3406 --- /dev/null +++ b/pkg/util/metric/aggmetric/testdata/basic_hdr.txt @@ -0,0 +1,24 @@ +echo +---- +bar_gauge 4 +bar_gauge{tenant_id="2"} 2 +bar_gauge{tenant_id="3"} 2 +baz_gauge 4 +baz_gauge{tenant_id="2"} 1.5 +baz_gauge{tenant_id="3"} 2.5 +foo_counter 6 +foo_counter{tenant_id="2"} 2 +foo_counter{tenant_id="3"} 4 +histo_gram_bucket{le="+Inf"} 2 +histo_gram_bucket{le="10"} 1 +histo_gram_bucket{le="91"} 2 +histo_gram_bucket{tenant_id="2",le="+Inf"} 1 +histo_gram_bucket{tenant_id="2",le="10"} 1 +histo_gram_bucket{tenant_id="3",le="+Inf"} 1 +histo_gram_bucket{tenant_id="3",le="91"} 1 +histo_gram_count 2 +histo_gram_count{tenant_id="2"} 1 +histo_gram_count{tenant_id="3"} 1 +histo_gram_sum 101 +histo_gram_sum{tenant_id="2"} 10 +histo_gram_sum{tenant_id="3"} 91 \ No newline at end of file diff --git a/pkg/util/metric/aggmetric/testdata/destroy.txt b/pkg/util/metric/aggmetric/testdata/destroy.txt new file mode 100644 index 000000000000..17b58be1c737 --- /dev/null +++ b/pkg/util/metric/aggmetric/testdata/destroy.txt @@ -0,0 +1,36 @@ +echo +---- +bar_gauge 2 +bar_gauge{tenant_id="2"} 2 +baz_gauge 1.5 +baz_gauge{tenant_id="2"} 1.5 +foo_counter 6 +foo_counter{tenant_id="3"} 4 +histo_gram_bucket{le="+Inf"} 2 +histo_gram_bucket{le="1"} 0 +histo_gram_bucket{le="1024"} 2 +histo_gram_bucket{le="128"} 2 +histo_gram_bucket{le="16"} 1 +histo_gram_bucket{le="2"} 0 +histo_gram_bucket{le="256"} 2 +histo_gram_bucket{le="32"} 1 +histo_gram_bucket{le="4"} 0 +histo_gram_bucket{le="512"} 2 +histo_gram_bucket{le="64"} 1 +histo_gram_bucket{le="8"} 0 +histo_gram_bucket{tenant_id="2",le="+Inf"} 1 +histo_gram_bucket{tenant_id="2",le="1"} 0 +histo_gram_bucket{tenant_id="2",le="1024"} 1 +histo_gram_bucket{tenant_id="2",le="128"} 1 
+histo_gram_bucket{tenant_id="2",le="16"} 1 +histo_gram_bucket{tenant_id="2",le="2"} 0 +histo_gram_bucket{tenant_id="2",le="256"} 1 +histo_gram_bucket{tenant_id="2",le="32"} 1 +histo_gram_bucket{tenant_id="2",le="4"} 0 +histo_gram_bucket{tenant_id="2",le="512"} 1 +histo_gram_bucket{tenant_id="2",le="64"} 1 +histo_gram_bucket{tenant_id="2",le="8"} 0 +histo_gram_count 2 +histo_gram_count{tenant_id="2"} 1 +histo_gram_sum 100 +histo_gram_sum{tenant_id="2"} 10 \ No newline at end of file diff --git a/pkg/util/metric/aggmetric/testdata/destroy_hdr.txt b/pkg/util/metric/aggmetric/testdata/destroy_hdr.txt new file mode 100644 index 000000000000..6b60d90e5a29 --- /dev/null +++ b/pkg/util/metric/aggmetric/testdata/destroy_hdr.txt @@ -0,0 +1,17 @@ +echo +---- +bar_gauge 2 +bar_gauge{tenant_id="2"} 2 +baz_gauge 1.5 +baz_gauge{tenant_id="2"} 1.5 +foo_counter 6 +foo_counter{tenant_id="3"} 4 +histo_gram_bucket{le="+Inf"} 2 +histo_gram_bucket{le="10"} 1 +histo_gram_bucket{le="91"} 2 +histo_gram_bucket{tenant_id="2",le="+Inf"} 1 +histo_gram_bucket{tenant_id="2",le="10"} 1 +histo_gram_count 2 +histo_gram_count{tenant_id="2"} 1 +histo_gram_sum 101 +histo_gram_sum{tenant_id="2"} 10 \ No newline at end of file diff --git a/pkg/util/metric/hdrhistogram.go b/pkg/util/metric/hdrhistogram.go new file mode 100644 index 000000000000..5f12505c4f5a --- /dev/null +++ b/pkg/util/metric/hdrhistogram.go @@ -0,0 +1,252 @@ +// Copyright 2023 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package metric + +import ( + "time" + + "github.com/cockroachdb/cockroach/pkg/util/syncutil" + "github.com/codahale/hdrhistogram" + prometheusgo "github.com/prometheus/client_model/go" +) + +const ( + // HdrHistogramMaxLatency is the maximum value tracked in latency histograms. Higher + // values will be recorded as this value instead. + HdrHistogramMaxLatency = 10 * time.Second + + // The number of histograms to keep in rolling window. + hdrHistogramHistWrapNum = 2 // TestSampleInterval is passed to histograms during tests which don't +) + +// A HdrHistogram collects observed values by keeping bucketed counts. For +// convenience, internally two sets of buckets are kept: A cumulative set (i.e. +// data is never evicted) and a windowed set (which keeps only recently +// collected samples). +// +// Top-level methods generally apply to the cumulative buckets; the windowed +// variant is exposed through the Windowed method. +// +// TODO(#96357): remove HdrHistogram model entirely once the Prometheus +// backed histogram and its bucket boundaries have been reliably proven in +// production. +type HdrHistogram struct { + Metadata + maxVal int64 + mu struct { + syncutil.Mutex + cumulative *hdrhistogram.Histogram + *tickHelper + sliding *hdrhistogram.WindowedHistogram + } +} + +var _ IHistogram = &HdrHistogram{} +var _ PrometheusExportable = &HdrHistogram{} +var _ Iterable = &HdrHistogram{} + +// NewHdrHistogram initializes a given Histogram. The contained windowed histogram +// rotates every 'duration'; both the windowed and the cumulative histogram +// track nonnegative values up to 'maxVal' with 'sigFigs' decimal points of +// precision. 
+func NewHdrHistogram(
+	metadata Metadata, duration time.Duration, maxVal int64, sigFigs int,
+) *HdrHistogram {
+	h := &HdrHistogram{
+		Metadata: metadata,
+		maxVal:   maxVal,
+	}
+	wHist := hdrhistogram.NewWindowed(hdrHistogramHistWrapNum, 0, maxVal, sigFigs)
+	h.mu.cumulative = hdrhistogram.New(0, maxVal, sigFigs)
+	h.mu.sliding = wHist
+	h.mu.tickHelper = &tickHelper{
+		nextT:        now(),
+		tickInterval: duration / hdrHistogramHistWrapNum,
+		onTick: func() {
+			wHist.Rotate()
+		},
+	}
+	return h
+}
+
+// NewHdrLatency is a convenience function which returns a histogram with
+// suitable defaults for latency tracking. Values are expressed in ns,
+// are truncated into the interval [0, HdrHistogramMaxLatency] and are recorded
+// with one digit of precision (i.e. errors of <10ms at 100ms, <6s at 60s).
+//
+// The windowed portion of the Histogram retains values for approximately
+// histogramWindow.
+func NewHdrLatency(metadata Metadata, histogramWindow time.Duration) *HdrHistogram {
+	return NewHdrHistogram(
+		metadata, histogramWindow, HdrHistogramMaxLatency.Nanoseconds(), 1,
+	)
+}
+
+// RecordValue adds the given value to the histogram. Recording a value in
+// excess of the configured maximum value for that histogram results in
+// recording the maximum value instead.
+func (h *HdrHistogram) RecordValue(v int64) {
+	h.mu.Lock()
+	defer h.mu.Unlock()
+
+	if h.mu.sliding.Current.RecordValue(v) != nil {
+		_ = h.mu.sliding.Current.RecordValue(h.maxVal)
+	}
+	if h.mu.cumulative.RecordValue(v) != nil {
+		_ = h.mu.cumulative.RecordValue(h.maxVal)
+	}
+}
+
+// TotalCount returns the (cumulative) number of samples.
+func (h *HdrHistogram) TotalCount() int64 {
+	h.mu.Lock()
+	defer h.mu.Unlock()
+	return h.mu.cumulative.TotalCount()
+}
+
+// Min returns the minimum.
+func (h *HdrHistogram) Min() int64 {
+	h.mu.Lock()
+	defer h.mu.Unlock()
+	return h.mu.cumulative.Min()
+}
+
+// Inspect calls the given closure with the receiver.
+func (h *HdrHistogram) Inspect(f func(interface{})) {
+	h.mu.Lock()
+	maybeTick(h.mu.tickHelper)
+	h.mu.Unlock()
+	f(h)
+}
+
+// GetType returns the prometheus type enum for this metric.
+func (h *HdrHistogram) GetType() *prometheusgo.MetricType {
+	return prometheusgo.MetricType_HISTOGRAM.Enum()
+}
+
+// ToPrometheusMetric returns a filled-in prometheus metric of the right type.
+func (h *HdrHistogram) ToPrometheusMetric() *prometheusgo.Metric {
+	hist := &prometheusgo.Histogram{}
+
+	h.mu.Lock()
+	maybeTick(h.mu.tickHelper)
+	bars := h.mu.cumulative.Distribution()
+	hist.Bucket = make([]*prometheusgo.Bucket, 0, len(bars))
+
+	var cumCount uint64
+	var sum float64
+	for _, bar := range bars {
+		if bar.Count == 0 {
+			// No need to expose trivial buckets.
+			continue
+		}
+		upperBound := float64(bar.To)
+		sum += upperBound * float64(bar.Count)
+
+		cumCount += uint64(bar.Count)
+		curCumCount := cumCount // need a new alloc thanks to bad proto code
+
+		hist.Bucket = append(hist.Bucket, &prometheusgo.Bucket{
+			CumulativeCount: &curCumCount,
+			UpperBound:      &upperBound,
+		})
+	}
+	hist.SampleCount = &cumCount
+	hist.SampleSum = &sum // can do better here; we approximate in the loop
+	h.mu.Unlock()
+
+	return &prometheusgo.Metric{
+		Histogram: hist,
+	}
+}
+
+// TotalCountWindowed implements the WindowedHistogram interface.
+func (h *HdrHistogram) TotalCountWindowed() int64 {
+	return int64(h.ToPrometheusMetricWindowed().Histogram.GetSampleCount())
+}
+
+// TotalSumWindowed implements the WindowedHistogram interface.
+func (h *HdrHistogram) TotalSumWindowed() float64 {
+	return h.ToPrometheusMetricWindowed().Histogram.GetSampleSum()
+}
+
+func (h *HdrHistogram) toPrometheusMetricWindowedLocked() *prometheusgo.Metric {
+	hist := &prometheusgo.Histogram{}
+
+	maybeTick(h.mu.tickHelper)
+	bars := h.mu.sliding.Current.Distribution()
+	hist.Bucket = make([]*prometheusgo.Bucket, 0, len(bars))
+
+	var cumCount uint64
+	var sum float64
+	for _, bar := range bars {
+		if bar.Count == 0 {
+			// No need to expose trivial buckets.
+			continue
+		}
+		upperBound := float64(bar.To)
+		sum += upperBound * float64(bar.Count)
+
+		cumCount += uint64(bar.Count)
+		curCumCount := cumCount // need a new alloc thanks to bad proto code
+
+		hist.Bucket = append(hist.Bucket, &prometheusgo.Bucket{
+			CumulativeCount: &curCumCount,
+			UpperBound:      &upperBound,
+		})
+	}
+	hist.SampleCount = &cumCount
+	hist.SampleSum = &sum // can do better here; we approximate in the loop
+
+	return &prometheusgo.Metric{
+		Histogram: hist,
+	}
+}
+
+// ToPrometheusMetricWindowed returns a filled-in prometheus metric of the right type.
+func (h *HdrHistogram) ToPrometheusMetricWindowed() *prometheusgo.Metric {
+	h.mu.Lock()
+	defer h.mu.Unlock()
+	return h.toPrometheusMetricWindowedLocked()
+}
+
+// GetMetadata returns the metric's metadata including the Prometheus
+// MetricType.
+func (h *HdrHistogram) GetMetadata() Metadata {
+	baseMetadata := h.Metadata
+	baseMetadata.MetricType = prometheusgo.MetricType_HISTOGRAM
+	return baseMetadata
+}
+
+// ValueAtQuantileWindowed calculates the windowed quantile value for
+// this HdrHistogram.
+func (h *HdrHistogram) ValueAtQuantileWindowed(q float64) float64 {
+	h.mu.Lock()
+	defer h.mu.Unlock()
+
+	return ValueAtQuantileWindowed(h.toPrometheusMetricWindowedLocked().Histogram, q)
+}
+
+// Mean calculates the cumulative mean value for this HdrHistogram.
+func (h *HdrHistogram) Mean() float64 {
+	h.mu.Lock()
+	defer h.mu.Unlock()
+
+	return h.mu.cumulative.Mean()
+}
+
+// TotalSum calculates the cumulative sample sum value for this HdrHistogram.
+func (h *HdrHistogram) TotalSum() float64 {
+	// NB: ToPrometheusMetric acquires h.mu itself, so it must not be called
+	// with the lock already held; it also populates the Histogram field of
+	// the metric, not the Summary field.
+	return h.ToPrometheusMetric().Histogram.GetSampleSum()
+}
diff --git a/pkg/util/metric/metric.go b/pkg/util/metric/metric.go
index 715ea2311a75..357b2aa8cfe8 100644
--- a/pkg/util/metric/metric.go
+++ b/pkg/util/metric/metric.go
@@ -17,6 +17,8 @@ import (
 	"sync/atomic"
 	"time"
 
+	"github.com/cockroachdb/cockroach/pkg/util"
+	"github.com/cockroachdb/cockroach/pkg/util/envutil"
 	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
 	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
 	"github.com/gogo/protobuf/proto"
@@ -25,11 +27,9 @@ import (
 	"github.com/rcrowley/go-metrics"
 )
 
-const (
-	// TestSampleInterval is passed to histograms during tests which don't
-	// want to concern themselves with supplying a "correct" interval.
-	TestSampleInterval = time.Duration(math.MaxInt64)
-)
+// TestSampleInterval is passed to histograms during tests which don't
+// want to concern themselves with supplying a "correct" interval.
+const TestSampleInterval = time.Duration(math.MaxInt64)
 
 // Iterable provides a method for synchronized access to interior objects.
 type Iterable interface {
@@ -176,10 +176,86 @@ func maybeTick(m periodic) {
 	}
 }
 
+// useHdrHistogramsEnvVar can be used to switch all histograms to use the
+// legacy HDR histograms (except for those that explicitly force the use
+// of the newer Prometheus model via HistogramModePrometheus).
+// HDR histograms dynamically generate bucket boundaries, which can lead to
+// hundreds of buckets. This can cause performance issues with timeseries
+// databases like Prometheus.
+const useHdrHistogramsEnvVar = "COCKROACH_ENABLE_HDR_HISTOGRAMS"
+
+var hdrEnabled = util.ConstantWithMetamorphicTestBool(useHdrHistogramsEnvVar, envutil.EnvOrDefaultBool(useHdrHistogramsEnvVar, false))
+
+// HdrEnabled returns whether or not the HdrHistogram model is enabled
+// in the metric package. Primarily useful in tests where we want to validate
+// different outputs depending on whether or not HDR is enabled.
+func HdrEnabled() bool {
+	return hdrEnabled
+}
+
+// HistogramMode specifies which type of histogram should be preferred
+// under various circumstances. See the individual constants for details.
+type HistogramMode byte
+
+const (
+	// HistogramModePrometheus will force the constructed histogram to use
+	// the Prometheus histogram model, regardless of the value of
+	// useHdrHistogramsEnvVar. This option should be used for all
+	// newly defined histograms moving forward.
+	//
+	// NB: If neither this mode nor the HistogramModePreferHdrLatency mode
+	// is set, MaxVal and SigFigs must be defined to maintain backwards
+	// compatibility with the legacy HdrHistogram model.
+	HistogramModePrometheus HistogramMode = iota + 1
+	// HistogramModePreferHdrLatency will cause the returned histogram to
+	// use the HdrHistogram model and be configured with suitable defaults
+	// for latency tracking iff useHdrHistogramsEnvVar is enabled.
+	//
+	// NB: If this option is set, no MaxVal or SigFigs are required in the
+	// HistogramOptions to maintain backwards compatibility with the legacy
+	// HdrHistogram model, since suitable defaults are used for both.
+	HistogramModePreferHdrLatency
+)
+
+// HistogramOptions offers various configuration options available when
+// creating a new Histogram.
+type HistogramOptions struct {
+	// Metadata is the metric Metadata associated with the histogram.
+	Metadata Metadata
+	// Duration is the histogram's window duration.
+	Duration time.Duration
+	// MaxVal is only relevant to the HdrHistogram, and represents the
+	// highest trackable value in the resulting histogram buckets.
+	MaxVal int64
+	// SigFigs is only relevant to the HdrHistogram, and represents
+	// the number of significant figures to be used to determine the
+	// degree of accuracy used in measurements.
+	SigFigs int
+	// Buckets are only relevant to Prometheus histograms, and represent
+	// the pre-defined histogram bucket boundaries to be used.
+	Buckets []float64
+	// Mode defines the type of histogram to be used. See the individual
+	// comments on each HistogramMode value for details.
+	Mode HistogramMode
+}
+
+// NewHistogram creates a new IHistogram. The returned type is determined
+// by the provided HistogramOptions and the value of the
+// useHdrHistogramsEnvVar environment variable.
+func NewHistogram(opt HistogramOptions) IHistogram {
+	if hdrEnabled && opt.Mode != HistogramModePrometheus {
+		if opt.Mode == HistogramModePreferHdrLatency {
+			return NewHdrLatency(opt.Metadata, opt.Duration)
+		}
+		return NewHdrHistogram(opt.Metadata, opt.Duration, opt.MaxVal, opt.SigFigs)
+	}
+	return newHistogram(opt.Metadata, opt.Duration, opt.Buckets)
+}
+
 // NewHistogram is a prometheus-backed histogram. Depending on the value of
 // opts.Buckets, this is suitable for recording any kind of quantity. Common
 // sensible choices are {IO,Network}LatencyBuckets.
-func NewHistogram(meta Metadata, windowDuration time.Duration, buckets []float64) *Histogram { +func newHistogram(meta Metadata, windowDuration time.Duration, buckets []float64) *Histogram { // TODO(obs-inf): prometheus supports labeled histograms but they require more // plumbing and don't fit into the PrometheusObservable interface any more. opts := prometheus.HistogramOpts{ @@ -236,6 +312,23 @@ type Histogram struct { } } +// IHistogram is the interface that all core histogram +// implementations should adhere to. +type IHistogram interface { + Iterable + PrometheusExportable + WindowedHistogram + + RecordValue(n int64) + TotalCount() int64 + TotalSum() float64 + TotalCountWindowed() int64 + TotalSumWindowed() float64 + Mean() float64 +} + +var _ IHistogram = &Histogram{} + func (h *Histogram) nextTick() time.Time { h.windowed.RLock() defer h.windowed.RUnlock() @@ -326,7 +419,8 @@ func (h *Histogram) TotalSumWindowed() float64 { // Mean returns the (cumulative) mean of samples. func (h *Histogram) Mean() float64 { - return h.TotalSum() / float64(h.TotalCount()) + pm := h.ToPrometheusMetric() + return pm.Histogram.GetSampleSum() / float64(pm.Histogram.GetSampleCount()) } // ValueAtQuantileWindowed implements the WindowedHistogram interface. diff --git a/pkg/util/metric/metric_test.go b/pkg/util/metric/metric_test.go index 13f3dbe7fc91..dc353c4843e6 100644 --- a/pkg/util/metric/metric_test.go +++ b/pkg/util/metric/metric_test.go @@ -111,17 +111,18 @@ func TestHistogram(t *testing.T) { return &n } - h := NewHistogram( - Metadata{}, - time.Hour, - []float64{ + h := NewHistogram(HistogramOptions{ + Mode: HistogramModePrometheus, + Metadata: Metadata{}, + Duration: time.Hour, + Buckets: []float64{ 1.0, 5.0, 10.0, 25.0, 100.0, }, - ) + }) // should return 0 if no observations are made require.Equal(t, 0.0, h.ValueAtQuantileWindowed(0)) @@ -169,23 +170,24 @@ func TestNewHistogramRotate(t *testing.T) { defer TestingSetNow(nil)() setNow(0) - h := NewHistogram(emptyMetadata, 10*time.Second, nil) + h := NewHistogram(HistogramOptions{ + Mode: HistogramModePrometheus, + Metadata: emptyMetadata, + Duration: 10 * time.Second, + Buckets: nil, + }) for i := 0; i < 4; i++ { // Windowed histogram is initially empty. h.Inspect(func(interface{}) {}) // triggers ticking - var m prometheusgo.Metric - require.NoError(t, h.Windowed().Write(&m)) - require.Zero(t, *m.Histogram.SampleSum) + require.Zero(t, h.TotalSumWindowed()) // But cumulative histogram has history (if i > 0). - require.EqualValues(t, i, *h.ToPrometheusMetric().Histogram.SampleCount) + require.EqualValues(t, i, h.TotalCount()) // Add a measurement and verify it's there. { h.RecordValue(12345) f := float64(12345) - var m prometheusgo.Metric - require.NoError(t, h.Windowed().Write(&m)) - require.Equal(t, *m.Histogram.SampleSum, f) + require.Equal(t, h.TotalSumWindowed(), f) } // Tick. This rotates the histogram. 
setNow(time.Duration(i+1) * 10 * time.Second) diff --git a/pkg/util/metric/registry_test.go b/pkg/util/metric/registry_test.go index 5d2b2a6c4e88..48f4aba216bd 100644 --- a/pkg/util/metric/registry_test.go +++ b/pkg/util/metric/registry_test.go @@ -76,14 +76,19 @@ func TestRegistry(t *testing.T) { topCounter := NewCounter(Metadata{Name: "top.counter"}) r.AddMetric(topCounter) - r.AddMetric(NewHistogram(Metadata{Name: "top.histogram"}, time.Minute, Count1KBuckets)) + r.AddMetric(NewHistogram(HistogramOptions{ + Mode: HistogramModePrometheus, + Metadata: Metadata{Name: "top.histogram"}, + Duration: time.Minute, + Buckets: Count1KBuckets, + })) r.AddMetric(NewGauge(Metadata{Name: "bottom.gauge"})) ms := &struct { StructGauge *Gauge StructGauge64 *GaugeFloat64 StructCounter *Counter - StructHistogram *Histogram + StructHistogram IHistogram NestedStructGauge NestedStruct ArrayStructCounters [4]*Counter // Ensure that nil struct values in arrays are safe. @@ -92,7 +97,7 @@ func TestRegistry(t *testing.T) { privateStructGauge *Gauge privateStructGauge64 *GaugeFloat64 privateStructCounter *Counter - privateStructHistogram *Histogram + privateStructHistogram IHistogram privateNestedStructGauge NestedStruct privateArrayStructCounters [2]*Counter NotAMetric int @@ -100,10 +105,15 @@ func TestRegistry(t *testing.T) { ReallyNotAMetric *Registry DefinitelyNotAnArrayOfMetrics [2]int }{ - StructGauge: NewGauge(Metadata{Name: "struct.gauge"}), - StructGauge64: NewGaugeFloat64(Metadata{Name: "struct.gauge64"}), - StructCounter: NewCounter(Metadata{Name: "struct.counter"}), - StructHistogram: NewHistogram(Metadata{Name: "struct.histogram"}, time.Minute, Count1KBuckets), + StructGauge: NewGauge(Metadata{Name: "struct.gauge"}), + StructGauge64: NewGaugeFloat64(Metadata{Name: "struct.gauge64"}), + StructCounter: NewCounter(Metadata{Name: "struct.counter"}), + StructHistogram: NewHistogram(HistogramOptions{ + Mode: HistogramModePrometheus, + Metadata: Metadata{Name: "struct.histogram"}, + Duration: time.Minute, + Buckets: Count1KBuckets, + }), NestedStructGauge: NestedStruct{ NestedStructGauge: NewGauge(Metadata{Name: "nested.struct.gauge"}), }, @@ -119,10 +129,15 @@ func TestRegistry(t *testing.T) { NestedStructGauge: NewGauge(Metadata{Name: "nested.struct.array.1.gauge"}), }, }, - privateStructGauge: NewGauge(Metadata{Name: "private.struct.gauge"}), - privateStructGauge64: NewGaugeFloat64(Metadata{Name: "private.struct.gauge64"}), - privateStructCounter: NewCounter(Metadata{Name: "private.struct.counter"}), - privateStructHistogram: NewHistogram(Metadata{Name: "private.struct.histogram"}, time.Minute, Count1KBuckets), + privateStructGauge: NewGauge(Metadata{Name: "private.struct.gauge"}), + privateStructGauge64: NewGaugeFloat64(Metadata{Name: "private.struct.gauge64"}), + privateStructCounter: NewCounter(Metadata{Name: "private.struct.counter"}), + privateStructHistogram: NewHistogram(HistogramOptions{ + Mode: HistogramModePrometheus, + Metadata: Metadata{Name: "private.struct.histogram"}, + Duration: time.Minute, + Buckets: Count1KBuckets, + }), privateNestedStructGauge: NestedStruct{ NestedStructGauge: NewGauge(Metadata{Name: "private.nested.struct.gauge"}), }, diff --git a/pkg/util/mon/bytes_usage.go b/pkg/util/mon/bytes_usage.go index 5cb1adf18cfb..96ba3bb5d357 100644 --- a/pkg/util/mon/bytes_usage.go +++ b/pkg/util/mon/bytes_usage.go @@ -195,7 +195,7 @@ type BytesMonitor struct { // maxBytesHist is the metric object used to track the high watermark of bytes // allocated by the monitor during its lifetime. 
-	maxBytesHist *metric.Histogram
+	maxBytesHist metric.IHistogram
 }
 
 // name identifies this monitor in logging messages.
@@ -273,7 +273,7 @@ func NewMonitor(
 	name redact.RedactableString,
 	res Resource,
 	curCount *metric.Gauge,
-	maxHist *metric.Histogram,
+	maxHist metric.IHistogram,
 	increment int64,
 	noteworthy int64,
 	settings *cluster.Settings,
@@ -289,7 +289,7 @@ func NewMonitorWithLimit(
 	res Resource,
 	limit int64,
 	curCount *metric.Gauge,
-	maxHist *metric.Histogram,
+	maxHist metric.IHistogram,
 	increment int64,
 	noteworthy int64,
 	settings *cluster.Settings,
@@ -386,7 +386,7 @@ func NewUnlimitedMonitor(
 	name redact.RedactableString,
 	res Resource,
 	curCount *metric.Gauge,
-	maxHist *metric.Histogram,
+	maxHist metric.IHistogram,
 	noteworthy int64,
 	settings *cluster.Settings,
 ) *BytesMonitor {
@@ -485,7 +485,7 @@ func (mm *BytesMonitor) AllocBytes() int64 {
 }
 
 // SetMetrics sets the metric objects for the monitor.
-func (mm *BytesMonitor) SetMetrics(curCount *metric.Gauge, maxHist *metric.Histogram) {
+func (mm *BytesMonitor) SetMetrics(curCount *metric.Gauge, maxHist metric.IHistogram) {
 	mm.mu.Lock()
 	defer mm.mu.Unlock()
 	mm.mu.curBytesCount = curCount

From b916c5b87e9aa9de6fa3f694f1d8be1fd2673297 Mon Sep 17 00:00:00 2001
From: Alex Barganier
Date: Mon, 30 Jan 2023 16:23:56 -0400
Subject: [PATCH 2/2] pkg/util/metric: increase bucket counts for Prometheus histograms

This patch increases the fidelity of the histogram buckets for the new
Prometheus model. This is primarily done by increasing the bucket counts
for all latency buckets; individual bucket boundaries may also be manually
tweaked according to feedback from various engineering teams about their
own use cases.

Release note (ops change): Prometheus histograms will now export more
buckets across the board to improve the precision and fidelity of
information reported by histogram metrics, such as quantiles. This will
lead to an increase in storage and processing requirements for these
histogram metrics in downstream systems like Prometheus, but should still
be a marked improvement when compared to the legacy HdrHistogram model.
If users have issues with the precision of these bucket boundaries, they
can set the environment variable `COCKROACH_ENABLE_HDR_HISTOGRAMS=true`
to revert to the legacy HdrHistogram model instead, although this is not
recommended otherwise, as the HdrHistogram strains systems like Prometheus
with excessive numbers of histogram buckets. Note that HdrHistograms are
slated for full deprecation in upcoming releases.
---
 pkg/util/metric/histogram_buckets.go      | 294 +++++++++++++++++-----
 pkg/util/metric/histogram_buckets_test.go |   8 +-
 2 files changed, 241 insertions(+), 61 deletions(-)

diff --git a/pkg/util/metric/histogram_buckets.go b/pkg/util/metric/histogram_buckets.go
index 8ce3c9dce884..bf9f70579afb 100644
--- a/pkg/util/metric/histogram_buckets.go
+++ b/pkg/util/metric/histogram_buckets.go
@@ -17,20 +17,65 @@ package metric
 var IOLatencyBuckets = []float64{
 	// Generated via TestHistogramBuckets/IOLatencyBuckets.
10000.000000, // 10µs - 26826.957953, // 26.826µs - 71968.567300, // 71.968µs - 193069.772888, // 193.069µs - 517947.467923, // 517.947µs - 1389495.494373, // 1.389495ms - 3727593.720315, // 3.727593ms - 10000000.000000, // 9.999999ms - 26826957.952797, // 26.826957ms - 71968567.300115, // 71.968567ms - 193069772.888325, // 193.069772ms - 517947467.923120, // 517.947467ms - 1389495494.373135, // 1.389495494s - 3727593720.314933, // 3.72759372s - 9999999999.999981, // 9.999999999s + 12638.482029, // 12.638µs + 15973.122801, // 15.973µs + 20187.602547, // 20.187µs + 25514.065200, // 25.514µs + 32245.905453, // 32.245µs + 40753.929659, // 40.753µs + 51506.780762, // 51.506µs + 65096.752305, // 65.096µs + 82272.413417, // 82.272µs + 103979.841848, // 103.979µs + 131414.736261, // 131.414µs + 166088.278263, // 166.088µs + 209910.372011, // 209.91µs + 265294.846443, // 265.294µs + 335292.414925, // 335.292µs + 423758.716060, // 423.758µs + 535566.691771, // 535.566µs + 676875.000946, // 676.875µs + 855467.253557, // 855.467µs + 1081180.751077, // 1.08118ms + 1366448.349295, // 1.366448ms + 1726983.290659, // 1.726983ms + 2182644.728397, // 2.182644ms + 2758531.617629, // 2.758531ms + 3486365.227678, // 3.486365ms + 4406236.427774, // 4.406236ms + 5568813.990945, // 5.568813ms + 7038135.554932, // 7.038135ms + 8895134.973108, // 8.895134ms + 11242100.350621, // 11.2421ms + 14208308.325339, // 14.208308ms + 17957144.943716, // 17.957144ms + 22695105.366947, // 22.695105ms + 28683168.133420, // 28.683168ms + 36251170.499885, // 36.25117ms + 45815976.690545, // 45.815976ms + 57904439.806025, // 57.904439ms + 73182422.190762, // 73.182422ms + 92491472.772173, // 92.491472ms + 116895181.649858, // 116.895181ms + 147737765.259851, // 147.737765ms + 186718109.129192, // 186.718109ms + 235983346.678219, // 235.983346ms + 298247128.621688, // 298.247128ms + 376939097.538835, // 376.939097ms + 476393801.040133, // 476.393801ms + 602089449.333611, // 602.089449ms + 760949668.545986, // 760.949668ms + 961724871.115294, // 961.724871ms + 1215474250.076283, // 1.21547425s + 1536174946.671824, // 1.536174946s + 1941491945.743876, // 1.941491945s + 2453751106.639811, // 2.453751106s + 3101168926.574770, // 3.101168926s + 3919406774.847209, // 3.919406774s + 4953535208.959157, // 4.953535208s + 6260516572.014802, // 6.260516572s + 7912342618.981298, // 7.912342618s + 9999999999.999969, // 9.999999999s } // NetworkLatencyBuckets are prometheus histogram buckets suitable for a histogram @@ -39,21 +84,66 @@ var IOLatencyBuckets = []float64{ // range during normal operation. var NetworkLatencyBuckets = []float64{ // Generated via TestHistogramBuckets/NetworkLatencyBuckets. 
- 500000.000000, // 500µs - 860513.842995, // 860.513µs - 1480968.147973, // 1.480968ms - 2548787.184731, // 2.548787ms - 4386533.310619, // 4.386533ms - 7549345.273094, // 7.549345ms - 12992632.226094, // 12.992632ms - 22360679.774998, // 22.360679ms - 38483348.970335, // 38.483348ms - 66230909.027573, // 66.230909ms - 113985228.104760, // 113.985228ms - 196171733.362212, // 196.171733ms - 337616984.325077, // 337.616984ms - 581048177.284016, // 581.048177ms - 999999999.999999, // 999.999999ms + 500000.000000, // 500µs + 568747.715565, // 568.747µs + 646947.927922, // 646.947µs + 735900.312190, // 735.9µs + 837083.242884, // 837.083µs + 952178.364257, // 952.178µs + 1083098.538963, // 1.083098ms + 1232019.639535, // 1.232019ms + 1401416.711034, // 1.401416ms + 1594105.105912, // 1.594105ms + 1813287.274717, // 1.813287ms + 2062605.990318, // 2.062605ms + 2346204.890209, // 2.346204ms + 2668797.343109, // 2.668797ms + 3035744.784401, // 3.035744ms + 3453145.822334, // 3.453145ms + 3927937.595933, // 3.927937ms + 4468011.069141, // 4.468011ms + 5082342.177389, // 5.082342ms + 5781141.006222, // 5.781141ms + 6576021.481300, // 6.576021ms + 7480194.389996, // 7.480194ms + 8508686.942589, // 8.508686ms + 9678592.522117, // 9.678592ms + 11009354.773683, // 11.009354ms + 12523090.754761, // 12.52309ms + 14244958.517175, // 14.244958ms + 16203575.229933, // 16.203575ms + 18431492.792031, // 18.431492ms + 20965738.839853, // 20.965738ms + 23848432.140611, // 23.848432ms + 27127482.599575, // 27.127482ms + 30857387.515093, // 30.857387ms + 35100137.315047, // 35.100137ms + 39926245.827925, // 39.926245ms + 45415922.211464, // 45.415922ms + 51660404.016126, // 51.660404ms + 58763473.538708, // 58.763473ms + 66843182.667648, // 66.843182ms + 76033814.886682, // 76.033814ms + 86488117.045035, // 86.488117ms + 98379837.985822, // 98.379837ms + 111906616.224248, // 111.906616ms + 127293264.668375, // 127.293264ms + 144795506.973983, // 144.795506ms + 164704227.631154, // 164.704227ms + 187350306.418342, // 187.350306ms + 213110117.571795, // 213.110117ms + 242411785.065635, // 242.411785ms + 275742297.964389, // 275.742297ms + 313655604.103963, // 313.655604ms + 356781816.616787, // 356.781816ms + 405837686.312094, // 405.837686ms + 461638513.960647, // 461.638513ms + 525111700.464186, // 525.1117ms + 597312160.111267, // 597.31216ms + 679439853.085354, // 679.439853ms + 772859728.612681, // 772.859728ms + 879124410.201811, // 879.12441ms + 1000000000.000001, // 1s } // BatchProcessLatencyBuckets are prometheus histogram buckets suitable for a @@ -62,20 +152,65 @@ var NetworkLatencyBuckets = []float64{ var BatchProcessLatencyBuckets = []float64{ // Generated via TestHistogramBuckets/BatchProcessLatencyBuckets. 
500000000.000000, // 500ms - 789604072.059876, // 789.604072ms - 1246949181.227077, // 1.246949181s - 1969192302.297256, // 1.969192302s - 3109764521.125753, // 3.109764521s - 4910965458.056452, // 4.910965458s - 7755436646.853539, // 7.755436646s - 12247448713.915894, // 12.247448713s - 19341270753.704967, // 19.341270753s - 30543892291.876068, // 30.543892291s - 48235163460.447227, // 48.23516346s - 76173362969.685760, // 1m16.173362969s - 120293595166.717728, // 2m0.293595166s - 189968625172.725128, // 3m9.968625172s - 300000000000.000183, // 5m0s + 557259285.358743, // 557.259285ms + 621075822.237074, // 621.075822ms + 692200537.706851, // 692.200537ms + 771470353.934916, // 771.470353ms + 859818036.218456, // 859.818036ms + 958283168.803309, // 958.283168ms + 1068024387.637287, // 1.068024387s + 1190333014.000928, // 1.190333014s + 1326648249.442152, // 1.326648249s + 1478574110.813123, // 1.47857411s + 1647898304.683320, // 1.647898304s + 1836613263.223422, // 1.836613263s + 2046939589.088547, // 2.046939589s + 2281352185.176006, // 2.281352185s + 2542609376.725576, // 2.542609376s + 2833785368.441068, // 2.833785368s + 3158306418.555065, // 3.158306418s + 3519991155.495853, // 3.519991155s + 3923095511.561431, // 3.923095511s + 4372362802.333632, // 4.372362802s + 4873079541.115184, // 4.873079541s + 5431137645.156319, // 5.431137645s + 6053103765.649553, // 6.053103765s + 6746296557.296375, // 6.746296557s + 7518872796.674253, // 7.518872796s + 8379923362.755980, // 8.379923362s + 9339580208.980864, // 9.339580208s + 10409135585.614676, // 10.409135585s + 11601174915.283792, // 11.601174915s + 12929724885.225649, // 12.929724885s + 14410418498.852003, // 14.410418498s + 16060679028.781363, // 16.060679028s + 17899925035.909710, // 17.899925035s + 19949798866.972237, // 19.949798866s + 22234421319.319225, // 22.234421319s + 24780675469.538071, // 24.780675469s + 27618523005.723442, // 27.618523005s + 30781356785.666904, // 30.781356785s + 34306393769.506477, // 34.306393769s + 38235112950.461639, // 38.23511295s + 42613743436.770157, // 42.613743436s + 47493808428.070732, // 47.493808428s + 52932731487.183495, // 52.932731487s + 58994512241.268242, // 58.994512241s + 65750479463.313522, // 1m5.750479463s + 73280130395.441635, // 1m13.280130395s + 81672066190.318619, // 1m21.67206619s + 91025034477.977859, // 1m31.025034477s + 101449091325.905777, // 1m41.449091325s + 113066896265.136261, // 1m53.066896265s + 126015155620.881943, // 2m6.01515562s + 140446231131.326965, // 2m20.446231131s + 156529932783.144257, // 2m36.529932783s + 174455516959.974152, // 2m54.455516959s + 194433913416.010529, // 3m14.433913416s + 216700207279.419586, // 3m36.700207279s + 241516405291.241699, // 4m1.516405291s + 269174518830.019897, // 4m29.17451883s + 300000000000.000854, // 5m0s } // LongRunning60mLatencyBuckets are prometheus histogram buckets suitable @@ -84,20 +219,65 @@ var BatchProcessLatencyBuckets = []float64{ var LongRunning60mLatencyBuckets = []float64{ // Generated via TestHistogramBuckets/LongRunning60mLatencyBuckets. 
500000000.000000, // 500ms - 942961049.923126, // 942.961049ms - 1778351083.344248, // 1.778351083s - 3353831609.364442, // 3.353831609s - 6325065151.263324, // 6.325065151s - 11928580151.734879, // 11.928580151s - 22496372927.944168, // 22.496372927s - 42426406871.192848, // 42.426406871s - 80012898335.451462, // 1m20.012898335s - 150898093243.579315, // 2m30.898093243s - 284582048872.726685, // 4m44.582048872s - 536699575188.601318, // 8m56.699575188s - 1012173589826.278687, // 16m52.173589826s - 1908880541934.094238, // 31m48.880541934s - 3599999999999.998535, // 59m59.999999999s + 581230667.894489, // 581.230667ms + 675658178.602148, // 675.658178ms + 785426508.834601, // 785.426508ms + 913027948.623944, // 913.027948ms + 1061359688.770060, // 1.061359688s + 1233789601.560218, // 1.233789601s + 1434232708.312242, // 1.434232708s + 1667240069.936893, // 1.667240069s + 1938102118.779750, // 1.938102118s + 2252968777.892157, // 2.252968777s + 2618989095.039379, // 2.618989095s + 3044473561.836243, // 3.044473561s + 3539082803.466387, // 3.539082803s + 4114046923.185338, // 4.114046923s + 4782420481.824564, // 4.782420481s + 5559378901.606352, // 5.559378901s + 6462563024.118382, // 6.462563024s + 7512479645.637113, // 7.512479645s + 8732967123.954826, // 8.732967123s + 10151736628.313759, // 10.151736628s + 11801001321.527510, // 11.801001321s + 13718207759.870365, // 13.718207759s + 15946886117.169632, // 15.946886117s + 18537638537.439724, // 18.537638537s + 21549288056.605419, // 21.549288056s + 25050214179.583008, // 25.050214179s + 29119905436.998066, // 29.119905436s + 33850764172.341507, // 33.850764172s + 39350204537.257782, // 39.350204537s + 45743091329.950188, // 45.743091329s + 53174575050.531136, // 53.17457505s + 61813387543.251701, // 1m1.813387543s + 71855673053.170151, // 1m11.855673053s + 83529441681.404266, // 1m23.529441681s + 97099746354.672745, // 1m37.099746354s + 112874700852.223846, // 1m52.874700852s + 131212475529.457443, // 2m11.212475529s + 152529429576.151703, // 2m32.529429576s + 177309564452.224213, // 2m57.309564452s + 206115513141.294464, // 3m26.115513141s + 239601314733.059875, // 3m59.601314733s + 278527264381.388123, // 4m38.527264381s + 323777175806.438293, // 5m23.777175806s + 376378448285.935181, // 6m16.378448285s + 437525393756.650940, // 7m17.525393756s + 508606353667.955078, // 8m28.606353667s + 591235221275.612671, // 9m51.235221275s + 687288085089.540771, // 11m27.288085089s + 798945825465.036499, // 13m18.945825465s + 928743631493.114136, // 15m28.743631493s + 1079628562470.991943, // 17m59.62856247s + 1255026460885.963623, // 20m55.026460885s + 1458919736172.010742, // 24m18.919736172s + 1695937785319.419434, // 28m15.937785319s + 1971462103337.413574, // 32m51.462103337s + 2291748470102.958496, // 38m11.748470102s + 2664068987848.231934, // 44m24.068987848s + 3096877194248.046875, // 51m36.877194248s + 3600000000000.007812, // 1h0m0s } // Count1KBuckets are prometheus histogram buckets suitable for a histogram that diff --git a/pkg/util/metric/histogram_buckets_test.go b/pkg/util/metric/histogram_buckets_test.go index 7fb183d70bf6..6f28454b89ff 100644 --- a/pkg/util/metric/histogram_buckets_test.go +++ b/pkg/util/metric/histogram_buckets_test.go @@ -48,22 +48,22 @@ func TestHistogramBuckets(t *testing.T) { require.InDeltaSlice(t, exp, act, 1 /* delta */, "Please update the bucket boundaries for %s", t.Name()) } t.Run("IOLatencyBuckets", func(t *testing.T) { - exp := prometheus.ExponentialBucketsRange(10e3, 10e9, 15) + exp := 
prometheus.ExponentialBucketsRange(10e3, 10e9, 60) verifyAndPrint(t, exp, IOLatencyBuckets, LATENCY) }) t.Run("NetworkLatencyBuckets", func(t *testing.T) { - exp := prometheus.ExponentialBucketsRange(500e3, 1e9, 15) + exp := prometheus.ExponentialBucketsRange(500e3, 1e9, 60) verifyAndPrint(t, exp, NetworkLatencyBuckets, LATENCY) }) t.Run("BatchProcessLatencyBuckets", func(t *testing.T) { - exp := prometheus.ExponentialBucketsRange(500e6, 300e9, 15) + exp := prometheus.ExponentialBucketsRange(500e6, 300e9, 60) verifyAndPrint(t, exp, BatchProcessLatencyBuckets, LATENCY) }) t.Run("LongRunning60mLatencyBuckets", func(t *testing.T) { - exp := prometheus.ExponentialBucketsRange(500e6, 3600e9, 15) + exp := prometheus.ExponentialBucketsRange(500e6, 3600e9, 60) verifyAndPrint(t, exp, LongRunning60mLatencyBuckets, LATENCY) })
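
As a closing illustration, the regenerated bucket tables above come from the same Prometheus helper this test exercises. A minimal standalone sketch, with the printing loop added purely for demonstration:

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// 60 exponentially spaced bucket boundaries from 10µs to 10s, expressed
	// in nanoseconds, mirroring the regenerated IOLatencyBuckets table.
	for _, b := range prometheus.ExponentialBucketsRange(10e3, 10e9, 60) {
		fmt.Printf("%.6f\n", b)
	}
}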