From f7443768cac09740a943052689da7c32debbb907 Mon Sep 17 00:00:00 2001 From: Andrii Vorobiov Date: Fri, 3 Jun 2022 16:12:25 +0300 Subject: [PATCH] kvserver: add evaluate batch latency metrics This patch introduces new metrics that measure read/write batch evaluation latency. It should provide better visibility what slows down batch processing (it is a small part of bigger project to measure e2e latency). Release note: None --- pkg/kv/kvserver/metrics.go | 27 +++++++++++++++++++++++++++ pkg/kv/kvserver/replica_evaluate.go | 1 - pkg/kv/kvserver/replica_read.go | 3 +++ pkg/kv/kvserver/replica_write.go | 2 ++ pkg/ts/catalog/chart_catalog.go | 13 +++++++++++++ pkg/ts/catalog/metrics.go | 2 ++ 6 files changed, 47 insertions(+), 1 deletion(-) diff --git a/pkg/kv/kvserver/metrics.go b/pkg/kv/kvserver/metrics.go index 918fac8ed656..9adfaf4e4020 100644 --- a/pkg/kv/kvserver/metrics.go +++ b/pkg/kv/kvserver/metrics.go @@ -1440,6 +1440,25 @@ Replicas in this state will fail-fast all inbound requests. Measurement: "Events", Unit: metric.Unit_COUNT, } + // Replica read batch evaluation. + metaReplicaReadBatchEvaluationLatency = metric.Metadata{ + Name: "kv.replica_read_batch_evaluate.latency", + Help: `Execution duration for evaluating a BatchRequest on the read-only path after latches have been acquired. + +A measurement is recorded regardless of outcome (i.e. also in case of an error). If internal retries occur, each instance is recorded separately.`, + Measurement: "Nanoseconds", + Unit: metric.Unit_NANOSECONDS, + } + // Replica read-write batch evaluation. + metaReplicaWriteBatchEvaluationLatency = metric.Metadata{ + Name: "kv.replica_write_batch_evaluate.latency", + Help: `Execution duration for evaluating a BatchRequest on the read-write path after latches have been acquired. + +A measurement is recorded regardless of outcome (i.e. also in case of an error). If internal retries occur, each instance is recorded separately. +Note that the measurement does not include the duration for replicating the evaluated command.`, + Measurement: "Nanoseconds", + Unit: metric.Unit_NANOSECONDS, + } ) // StoreMetrics is the set of metrics for a given store. @@ -1700,6 +1719,10 @@ type StoreMetrics struct { // Replica circuit breaker. ReplicaCircuitBreakerCurTripped *metric.Gauge ReplicaCircuitBreakerCumTripped *metric.Counter + + // Replica batch evaluation metrics. + ReplicaReadBatchEvaluationLatency *metric.Histogram + ReplicaWriteBatchEvaluationLatency *metric.Histogram } type tenantMetricsRef struct { @@ -2170,6 +2193,10 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics { // Replica circuit breaker. ReplicaCircuitBreakerCurTripped: metric.NewGauge(metaReplicaCircuitBreakerCurTripped), ReplicaCircuitBreakerCumTripped: metric.NewCounter(metaReplicaCircuitBreakerCumTripped), + + // Replica batch evaluation. + ReplicaReadBatchEvaluationLatency: metric.NewLatency(metaReplicaReadBatchEvaluationLatency, histogramWindow), + ReplicaWriteBatchEvaluationLatency: metric.NewLatency(metaReplicaWriteBatchEvaluationLatency, histogramWindow), } { diff --git a/pkg/kv/kvserver/replica_evaluate.go b/pkg/kv/kvserver/replica_evaluate.go index a1c43e3acc57..459e680d2f6a 100644 --- a/pkg/kv/kvserver/replica_evaluate.go +++ b/pkg/kv/kvserver/replica_evaluate.go @@ -153,7 +153,6 @@ func evaluateBatch( ui uncertainty.Interval, readOnly bool, ) (_ *roachpb.BatchResponse, _ result.Result, retErr *roachpb.Error) { - defer func() { // Ensure that errors don't carry the WriteTooOld flag set. The client // handles non-error responses with the WriteTooOld flag set, and errors diff --git a/pkg/kv/kvserver/replica_read.go b/pkg/kv/kvserver/replica_read.go index 4c899652ee70..7df6a5c6ac8e 100644 --- a/pkg/kv/kvserver/replica_read.go +++ b/pkg/kv/kvserver/replica_read.go @@ -26,6 +26,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util" "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/cockroach/pkg/util/mon" + "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/kr/pretty" ) @@ -296,7 +297,9 @@ func (r *Replica) executeReadOnlyBatchWithServersideRefreshes( boundAccount.Clear(ctx) log.VEventf(ctx, 2, "server-side retry of batch") } + now := timeutil.Now() br, res, pErr = evaluateBatch(ctx, kvserverbase.CmdIDKey(""), rw, rec, nil, ba, st, ui, true /* readOnly */) + r.store.metrics.ReplicaReadBatchEvaluationLatency.RecordValue(timeutil.Since(now).Nanoseconds()) // If we can retry, set a higher batch timestamp and continue. // Allow one retry only. if pErr == nil || retries > 0 || !canDoServersideRetry(ctx, pErr, ba, br, g, nil /* deadline */) { diff --git a/pkg/kv/kvserver/replica_write.go b/pkg/kv/kvserver/replica_write.go index a1bd5f41d0f7..6de60bf406b3 100644 --- a/pkg/kv/kvserver/replica_write.go +++ b/pkg/kv/kvserver/replica_write.go @@ -650,7 +650,9 @@ func (r *Replica) evaluateWriteBatchWrapper( g *concurrency.Guard, ) (storage.Batch, *roachpb.BatchResponse, result.Result, *roachpb.Error) { batch, opLogger := r.newBatchedEngine(ba, g) + now := timeutil.Now() br, res, pErr := evaluateBatch(ctx, idKey, batch, rec, ms, ba, st, ui, false /* readOnly */) + r.store.metrics.ReplicaWriteBatchEvaluationLatency.RecordValue(timeutil.Since(now).Nanoseconds()) if pErr == nil { if opLogger != nil { res.LogicalOpLog = &kvserverpb.LogicalOpLog{ diff --git a/pkg/ts/catalog/chart_catalog.go b/pkg/ts/catalog/chart_catalog.go index 7e66a96fd777..11f8745cc5f3 100644 --- a/pkg/ts/catalog/chart_catalog.go +++ b/pkg/ts/catalog/chart_catalog.go @@ -3293,4 +3293,17 @@ var charts = []sectionDescription{ }, }, }, + { + Organization: [][]string{{ReplicationLayer, "Latency"}}, + Charts: []chartDescription{ + { + Title: "Execution duration for read batch evaluation.", + Metrics: []string{"kv.replica_read_batch_evaluate.latency"}, + }, + { + Title: "Execution duration for read batch evaluation.", + Metrics: []string{"kv.replica_write_batch_evaluate.latency"}, + }, + }, + }, } diff --git a/pkg/ts/catalog/metrics.go b/pkg/ts/catalog/metrics.go index 70e5e08e59cb..4614f5f57755 100644 --- a/pkg/ts/catalog/metrics.go +++ b/pkg/ts/catalog/metrics.go @@ -100,6 +100,8 @@ var histogramMetricsNames = map[string]struct{}{ "streaming.admit_latency": {}, "streaming.commit_latency": {}, "streaming.flush_hist_nanos": {}, + "kv.replica_read_batch_evaluate.latency": {}, + "kv.replica_write_batch_evaluate.latency": {}, } func allInternalTSMetricsNames() []string {