Skip to content

Commit

Permalink
metrics: refactor histogram bucket generation and testing
Browse files Browse the repository at this point in the history
This commit refactors histogram bucketing for legibility
and composability. It also introduces a data-driven test
for histogram bucket generation.

This refactor should make it easier to add additional
metric categories, distributions, and bucket types.

Part of #97144.

Release note: None
  • Loading branch information
ericharmeling committed Aug 10, 2023
1 parent 1f8fa96 commit c748c25
Show file tree
Hide file tree
Showing 41 changed files with 736 additions and 675 deletions.
86 changes: 43 additions & 43 deletions pkg/ccl/changefeedccl/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -552,52 +552,52 @@ func newAggregateMetrics(histogramWindow time.Duration) *AggMetrics {
EmittedMessages: b.Counter(metaChangefeedEmittedMessages),
FilteredMessages: b.Counter(metaChangefeedFilteredMessages),
MessageSize: b.Histogram(metric.HistogramOptions{
Metadata: metaMessageSize,
Duration: histogramWindow,
MaxVal: 10 << 20, /* 10MB max message size */
SigFigs: 1,
Buckets: metric.DataSize16MBBuckets,
Metadata: metaMessageSize,
Duration: histogramWindow,
MaxVal: 10 << 20, /* 10MB max message size */
SigFigs: 1,
BucketConfig: metric.DataSize16MBBuckets,
}),
EmittedBytes: b.Counter(metaChangefeedEmittedBytes),
FlushedBytes: b.Counter(metaChangefeedFlushedBytes),
Flushes: b.Counter(metaChangefeedFlushes),
SizeBasedFlushes: b.Counter(metaSizeBasedFlushes),
ParallelIOQueueNanos: b.Histogram(metric.HistogramOptions{
Metadata: metaChangefeedParallelIOQueueNanos,
Duration: histogramWindow,
MaxVal: changefeedIOQueueMaxLatency.Nanoseconds(),
SigFigs: 2,
Buckets: metric.BatchProcessLatencyBuckets,
Metadata: metaChangefeedParallelIOQueueNanos,
Duration: histogramWindow,
MaxVal: changefeedIOQueueMaxLatency.Nanoseconds(),
SigFigs: 2,
BucketConfig: metric.BatchProcessLatencyBuckets,
}),
SinkIOInflight: b.Gauge(metaChangefeedSinkIOInflight),

BatchHistNanos: b.Histogram(metric.HistogramOptions{
Metadata: metaChangefeedBatchHistNanos,
Duration: histogramWindow,
MaxVal: changefeedBatchHistMaxLatency.Nanoseconds(),
SigFigs: 1,
Buckets: metric.BatchProcessLatencyBuckets,
Metadata: metaChangefeedBatchHistNanos,
Duration: histogramWindow,
MaxVal: changefeedBatchHistMaxLatency.Nanoseconds(),
SigFigs: 1,
BucketConfig: metric.BatchProcessLatencyBuckets,
}),
FlushHistNanos: b.Histogram(metric.HistogramOptions{
Metadata: metaChangefeedFlushHistNanos,
Duration: histogramWindow,
MaxVal: changefeedFlushHistMaxLatency.Nanoseconds(),
SigFigs: 2,
Buckets: metric.BatchProcessLatencyBuckets,
Metadata: metaChangefeedFlushHistNanos,
Duration: histogramWindow,
MaxVal: changefeedFlushHistMaxLatency.Nanoseconds(),
SigFigs: 2,
BucketConfig: metric.BatchProcessLatencyBuckets,
}),
CommitLatency: b.Histogram(metric.HistogramOptions{
Metadata: metaCommitLatency,
Duration: histogramWindow,
MaxVal: commitLatencyMaxValue.Nanoseconds(),
SigFigs: 1,
Buckets: metric.BatchProcessLatencyBuckets,
Metadata: metaCommitLatency,
Duration: histogramWindow,
MaxVal: commitLatencyMaxValue.Nanoseconds(),
SigFigs: 1,
BucketConfig: metric.BatchProcessLatencyBuckets,
}),
AdmitLatency: b.Histogram(metric.HistogramOptions{
Metadata: metaAdmitLatency,
Duration: histogramWindow,
MaxVal: admitLatencyMaxValue.Nanoseconds(),
SigFigs: 1,
Buckets: metric.BatchProcessLatencyBuckets,
Metadata: metaAdmitLatency,
Duration: histogramWindow,
MaxVal: admitLatencyMaxValue.Nanoseconds(),
SigFigs: 1,
BucketConfig: metric.BatchProcessLatencyBuckets,
}),
BackfillCount: b.Gauge(metaChangefeedBackfillCount),
BackfillPendingRanges: b.Gauge(metaChangefeedBackfillPendingRanges),
Expand Down Expand Up @@ -712,27 +712,27 @@ func MakeMetrics(histogramWindow time.Duration) metric.Struct {
Failures: metric.NewCounter(metaChangefeedFailures),
QueueTimeNanos: metric.NewCounter(metaEventQueueTime),
CheckpointHistNanos: metric.NewHistogram(metric.HistogramOptions{
Metadata: metaChangefeedCheckpointHistNanos,
Duration: histogramWindow,
MaxVal: changefeedCheckpointHistMaxLatency.Nanoseconds(),
SigFigs: 2,
Buckets: metric.IOLatencyBuckets,
Metadata: metaChangefeedCheckpointHistNanos,
Duration: histogramWindow,
MaxVal: changefeedCheckpointHistMaxLatency.Nanoseconds(),
SigFigs: 2,
BucketConfig: metric.IOLatencyBuckets,
}),
FrontierUpdates: metric.NewCounter(metaChangefeedFrontierUpdates),
ThrottleMetrics: cdcutils.MakeMetrics(histogramWindow),
// The two metrics below were never implemented using the hdr histogram, so
// they set Mode to metric.HistogramModePrometheus.
ParallelConsumerFlushNanos: metric.NewHistogram(metric.HistogramOptions{
Metadata: metaChangefeedEventConsumerFlushNanos,
Duration: histogramWindow,
Buckets: metric.IOLatencyBuckets,
Mode: metric.HistogramModePrometheus,
Metadata: metaChangefeedEventConsumerFlushNanos,
Duration: histogramWindow,
BucketConfig: metric.IOLatencyBuckets,
Mode: metric.HistogramModePrometheus,
}),
ParallelConsumerConsumeNanos: metric.NewHistogram(metric.HistogramOptions{
Metadata: metaChangefeedEventConsumerConsumeNanos,
Duration: histogramWindow,
Buckets: metric.IOLatencyBuckets,
Mode: metric.HistogramModePrometheus,
Metadata: metaChangefeedEventConsumerConsumeNanos,
Duration: histogramWindow,
BucketConfig: metric.IOLatencyBuckets,
Mode: metric.HistogramModePrometheus,
}),
ParallelConsumerInFlightEvents: metric.NewGauge(metaChangefeedEventConsumerInFlightEvents),
}
Expand Down
24 changes: 12 additions & 12 deletions pkg/ccl/sqlproxyccl/connector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -381,10 +381,10 @@ func TestConnector_dialTenantCluster(t *testing.T) {
c := &connector{
TenantID: roachpb.MustMakeTenantID(42),
DialTenantLatency: metric.NewHistogram(metric.HistogramOptions{
Mode: metric.HistogramModePrometheus,
Metadata: metaDialTenantLatency,
Duration: time.Millisecond,
Buckets: metric.IOLatencyBuckets,
Mode: metric.HistogramModePrometheus,
Metadata: metaDialTenantLatency,
Duration: time.Millisecond,
BucketConfig: metric.IOLatencyBuckets,
}),
DialTenantRetries: metric.NewCounter(metaDialTenantRetries),
}
Expand Down Expand Up @@ -466,10 +466,10 @@ func TestConnector_dialTenantCluster(t *testing.T) {

c := &connector{
DialTenantLatency: metric.NewHistogram(metric.HistogramOptions{
Mode: metric.HistogramModePreferHdrLatency,
Metadata: metaDialTenantLatency,
Duration: time.Millisecond,
Buckets: metric.IOLatencyBuckets,
Mode: metric.HistogramModePreferHdrLatency,
Metadata: metaDialTenantLatency,
Duration: time.Millisecond,
BucketConfig: metric.IOLatencyBuckets,
}),
DialTenantRetries: metric.NewCounter(metaDialTenantRetries),
}
Expand Down Expand Up @@ -500,10 +500,10 @@ func TestConnector_dialTenantCluster(t *testing.T) {
c := &connector{
TenantID: roachpb.MustMakeTenantID(42),
DialTenantLatency: metric.NewHistogram(metric.HistogramOptions{
Mode: metric.HistogramModePreferHdrLatency,
Metadata: metaDialTenantLatency,
Duration: time.Millisecond,
Buckets: metric.IOLatencyBuckets,
Mode: metric.HistogramModePreferHdrLatency,
Metadata: metaDialTenantLatency,
Duration: time.Millisecond,
BucketConfig: metric.IOLatencyBuckets,
}),
DialTenantRetries: metric.NewCounter(metaDialTenantRetries),
}
Expand Down
34 changes: 17 additions & 17 deletions pkg/ccl/sqlproxyccl/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -234,19 +234,19 @@ func makeProxyMetrics() metrics {
RefusedConnCount: metric.NewCounter(metaRefusedConnCount),
SuccessfulConnCount: metric.NewCounter(metaSuccessfulConnCount),
ConnectionLatency: metric.NewHistogram(metric.HistogramOptions{
Mode: metric.HistogramModePreferHdrLatency,
Metadata: metaConnMigrationAttemptedCount,
Duration: base.DefaultHistogramWindowInterval(),
Buckets: metric.IOLatencyBuckets,
Mode: metric.HistogramModePreferHdrLatency,
Metadata: metaConnMigrationAttemptedCount,
Duration: base.DefaultHistogramWindowInterval(),
BucketConfig: metric.IOLatencyBuckets,
}),
AuthFailedCount: metric.NewCounter(metaAuthFailedCount),
ExpiredClientConnCount: metric.NewCounter(metaExpiredClientConnCount),
// Connector metrics.
DialTenantLatency: metric.NewHistogram(metric.HistogramOptions{
Mode: metric.HistogramModePreferHdrLatency,
Metadata: metaDialTenantLatency,
Duration: base.DefaultHistogramWindowInterval(),
Buckets: metric.IOLatencyBuckets},
Mode: metric.HistogramModePreferHdrLatency,
Metadata: metaDialTenantLatency,
Duration: base.DefaultHistogramWindowInterval(),
BucketConfig: metric.IOLatencyBuckets},
),
DialTenantRetries: metric.NewCounter(metaDialTenantRetries),
// Connection migration metrics.
Expand All @@ -255,17 +255,17 @@ func makeProxyMetrics() metrics {
ConnMigrationErrorRecoverableCount: metric.NewCounter(metaConnMigrationErrorRecoverableCount),
ConnMigrationAttemptedCount: metric.NewCounter(metaConnMigrationAttemptedCount),
ConnMigrationAttemptedLatency: metric.NewHistogram(metric.HistogramOptions{
Mode: metric.HistogramModePreferHdrLatency,
Metadata: metaConnMigrationAttemptedLatency,
Duration: base.DefaultHistogramWindowInterval(),
Buckets: metric.IOLatencyBuckets,
Mode: metric.HistogramModePreferHdrLatency,
Metadata: metaConnMigrationAttemptedLatency,
Duration: base.DefaultHistogramWindowInterval(),
BucketConfig: metric.IOLatencyBuckets,
}),
ConnMigrationTransferResponseMessageSize: metric.NewHistogram(metric.HistogramOptions{
Metadata: metaConnMigrationTransferResponseMessageSize,
Duration: base.DefaultHistogramWindowInterval(),
Buckets: metric.DataSize16MBBuckets,
MaxVal: maxExpectedTransferResponseMessageSize,
SigFigs: 1,
Metadata: metaConnMigrationTransferResponseMessageSize,
Duration: base.DefaultHistogramWindowInterval(),
BucketConfig: metric.DataSize16MBBuckets,
MaxVal: maxExpectedTransferResponseMessageSize,
SigFigs: 1,
}),
QueryCancelReceivedPGWire: metric.NewCounter(metaQueryCancelReceivedPGWire),
QueryCancelReceivedHTTP: metric.NewCounter(metaQueryCancelReceivedHTTP),
Expand Down
30 changes: 15 additions & 15 deletions pkg/ccl/streamingccl/streamingest/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -171,25 +171,25 @@ func MakeMetrics(histogramWindow time.Duration) metric.Struct {
JobProgressUpdates: metric.NewCounter(metaJobProgressUpdates),
ReplanCount: metric.NewCounter(metaDistSQLReplanCount),
FlushHistNanos: metric.NewHistogram(metric.HistogramOptions{
Metadata: metaReplicationFlushHistNanos,
Duration: histogramWindow,
Buckets: metric.BatchProcessLatencyBuckets,
MaxVal: streamingFlushHistMaxLatency.Nanoseconds(),
SigFigs: 1,
Metadata: metaReplicationFlushHistNanos,
Duration: histogramWindow,
BucketConfig: metric.BatchProcessLatencyBuckets,
MaxVal: streamingFlushHistMaxLatency.Nanoseconds(),
SigFigs: 1,
}),
CommitLatency: metric.NewHistogram(metric.HistogramOptions{
Metadata: metaReplicationCommitLatency,
Duration: histogramWindow,
Buckets: metric.BatchProcessLatencyBuckets,
MaxVal: streamingCommitLatencyMaxValue.Nanoseconds(),
SigFigs: 1,
Metadata: metaReplicationCommitLatency,
Duration: histogramWindow,
BucketConfig: metric.BatchProcessLatencyBuckets,
MaxVal: streamingCommitLatencyMaxValue.Nanoseconds(),
SigFigs: 1,
}),
AdmitLatency: metric.NewHistogram(metric.HistogramOptions{
Metadata: metaReplicationAdmitLatency,
Duration: histogramWindow,
Buckets: metric.BatchProcessLatencyBuckets,
MaxVal: streamingAdmitLatencyMaxValue.Nanoseconds(),
SigFigs: 1,
Metadata: metaReplicationAdmitLatency,
Duration: histogramWindow,
BucketConfig: metric.BatchProcessLatencyBuckets,
MaxVal: streamingAdmitLatencyMaxValue.Nanoseconds(),
SigFigs: 1,
}),
RunningCount: metric.NewGauge(metaStreamsRunning),
EarliestDataCheckpointSpan: metric.NewGauge(metaEarliestDataCheckpointSpan),
Expand Down
10 changes: 5 additions & 5 deletions pkg/kv/bulk/bulk_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,11 @@ const log10int64times1000 = 19 * 1000
func MakeBulkMetrics(histogramWindow time.Duration) Metrics {
return Metrics{
MaxBytesHist: metric.NewHistogram(metric.HistogramOptions{
Metadata: metaMemMaxBytes,
Duration: histogramWindow,
MaxVal: log10int64times1000,
SigFigs: 3,
Buckets: metric.MemoryUsage64MBBuckets,
Metadata: metaMemMaxBytes,
Duration: histogramWindow,
MaxVal: log10int64times1000,
SigFigs: 3,
BucketConfig: metric.MemoryUsage64MBBuckets,
}),
CurBytesCount: metric.NewGauge(metaMemCurBytes),
}
Expand Down
18 changes: 9 additions & 9 deletions pkg/kv/kvclient/kvcoord/txn_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -285,20 +285,20 @@ func MakeTxnMetrics(histogramWindow time.Duration) TxnMetrics {
ClientRefreshAutoRetries: metric.NewCounter(metaClientRefreshAutoRetries),
ServerRefreshSuccess: metric.NewCounter(metaServerRefreshSuccess),
Durations: metric.NewHistogram(metric.HistogramOptions{
Mode: metric.HistogramModePreferHdrLatency,
Metadata: metaDurationsHistograms,
Duration: histogramWindow,
Buckets: metric.IOLatencyBuckets,
Mode: metric.HistogramModePreferHdrLatency,
Metadata: metaDurationsHistograms,
Duration: histogramWindow,
BucketConfig: metric.IOLatencyBuckets,
}),
TxnsWithCondensedIntents: metric.NewCounter(metaTxnsWithCondensedIntentSpans),
TxnsWithCondensedIntentsGauge: metric.NewGauge(metaTxnsWithCondensedIntentSpansGauge),
TxnsRejectedByLockSpanBudget: metric.NewCounter(metaTxnsRejectedByLockSpanBudget),
Restarts: metric.NewHistogram(metric.HistogramOptions{
Metadata: metaRestartsHistogram,
Duration: histogramWindow,
MaxVal: 100,
SigFigs: 3,
Buckets: metric.Count1KBuckets,
Metadata: metaRestartsHistogram,
Duration: histogramWindow,
MaxVal: 100,
SigFigs: 3,
BucketConfig: metric.Count1KBuckets,
}),
RestartsWriteTooOld: telemetry.NewCounterWithMetric(metaRestartsWriteTooOld),
RestartsWriteTooOldMulti: telemetry.NewCounterWithMetric(metaRestartsWriteTooOldMulti),
Expand Down
16 changes: 8 additions & 8 deletions pkg/kv/kvprober/kvprober.go
Original file line number Diff line number Diff line change
Expand Up @@ -276,18 +276,18 @@ func NewProber(opts Opts) *Prober {
ReadProbeAttempts: metric.NewCounter(metaReadProbeAttempts),
ReadProbeFailures: metric.NewCounter(metaReadProbeFailures),
ReadProbeLatency: metric.NewHistogram(metric.HistogramOptions{
Mode: metric.HistogramModePreferHdrLatency,
Metadata: metaReadProbeLatency,
Duration: opts.HistogramWindowInterval,
Buckets: metric.IOLatencyBuckets,
Mode: metric.HistogramModePreferHdrLatency,
Metadata: metaReadProbeLatency,
Duration: opts.HistogramWindowInterval,
BucketConfig: metric.IOLatencyBuckets,
}),
WriteProbeAttempts: metric.NewCounter(metaWriteProbeAttempts),
WriteProbeFailures: metric.NewCounter(metaWriteProbeFailures),
WriteProbeLatency: metric.NewHistogram(metric.HistogramOptions{
Mode: metric.HistogramModePreferHdrLatency,
Metadata: metaWriteProbeLatency,
Duration: opts.HistogramWindowInterval,
Buckets: metric.IOLatencyBuckets,
Mode: metric.HistogramModePreferHdrLatency,
Metadata: metaWriteProbeLatency,
Duration: opts.HistogramWindowInterval,
BucketConfig: metric.IOLatencyBuckets,
}),
WriteProbeQuarantineOldestDuration: metric.NewFunctionalGauge(
metaWriteProbeQuarantineOldestDuration,
Expand Down
8 changes: 4 additions & 4 deletions pkg/kv/kvserver/client_manual_proposal_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -232,10 +232,10 @@ LIMIT
Settings: st,
Metrics: logstore.Metrics{
RaftLogCommitLatency: metric.NewHistogram(metric.HistogramOptions{
Mode: metric.HistogramModePrometheus,
Metadata: fakeMeta,
Duration: time.Millisecond,
Buckets: metric.IOLatencyBuckets,
Mode: metric.HistogramModePrometheus,
Metadata: fakeMeta,
Duration: time.Millisecond,
BucketConfig: metric.IOLatencyBuckets,
}),
},
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -186,10 +186,10 @@ func newMetrics(c *Controller) *metrics {
)
m.WaitDuration[wc] = metric.NewHistogram(
metric.HistogramOptions{
Metadata: annotateMetricTemplateWithWorkClass(wc, waitDuration),
Duration: base.DefaultHistogramWindowInterval(),
Buckets: metric.IOLatencyBuckets,
Mode: metric.HistogramModePrometheus,
Metadata: annotateMetricTemplateWithWorkClass(wc, waitDuration),
Duration: base.DefaultHistogramWindowInterval(),
BucketConfig: metric.IOLatencyBuckets,
Mode: metric.HistogramModePrometheus,
},
)
m.TotalStreamCount[wc] = metric.NewFunctionalGauge(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,10 +109,10 @@ func NewMetrics(registry *metric.Registry) *Metrics {
)
m.WaitDuration[wc] = metric.NewHistogram(
metric.HistogramOptions{
Metadata: annotateMetricTemplateWithWorkClass(wc, waitDuration),
Duration: base.DefaultHistogramWindowInterval(),
Buckets: metric.IOLatencyBuckets,
Mode: metric.HistogramModePrometheus,
Metadata: annotateMetricTemplateWithWorkClass(wc, waitDuration),
Duration: base.DefaultHistogramWindowInterval(),
BucketConfig: metric.IOLatencyBuckets,
Mode: metric.HistogramModePrometheus,
},
)
}
Expand Down
8 changes: 4 additions & 4 deletions pkg/kv/kvserver/liveness/liveness.go
Original file line number Diff line number Diff line change
Expand Up @@ -367,10 +367,10 @@ func NewNodeLiveness(opts NodeLivenessOptions) *NodeLiveness {
HeartbeatFailures: telemetry.NewCounterWithMetric(metaHeartbeatFailures),
EpochIncrements: telemetry.NewCounterWithMetric(metaEpochIncrements),
HeartbeatLatency: metric.NewHistogram(metric.HistogramOptions{
Mode: metric.HistogramModePreferHdrLatency,
Metadata: metaHeartbeatLatency,
Duration: opts.HistogramWindowInterval,
Buckets: metric.IOLatencyBuckets,
Mode: metric.HistogramModePreferHdrLatency,
Metadata: metaHeartbeatLatency,
Duration: opts.HistogramWindowInterval,
BucketConfig: metric.IOLatencyBuckets,
}),
}
nl.cache = newCache(opts.Gossip, opts.Clock, nl.cacheUpdated)
Expand Down
Loading

0 comments on commit c748c25

Please sign in to comment.