From f7443768cac09740a943052689da7c32debbb907 Mon Sep 17 00:00:00 2001
From: Andrii Vorobiov <and.vorobiov@gmail.com>
Date: Fri, 3 Jun 2022 16:12:25 +0300
Subject: [PATCH] kvserver: add evaluate batch latency metrics

This patch introduces new metrics that measure read/write
batch evaluation latency. It should provide better visibility
what slows down batch processing (it is a small part of bigger
project to measure e2e latency).

Release note: None
---
 pkg/kv/kvserver/metrics.go          | 27 +++++++++++++++++++++++++++
 pkg/kv/kvserver/replica_evaluate.go |  1 -
 pkg/kv/kvserver/replica_read.go     |  3 +++
 pkg/kv/kvserver/replica_write.go    |  2 ++
 pkg/ts/catalog/chart_catalog.go     | 13 +++++++++++++
 pkg/ts/catalog/metrics.go           |  2 ++
 6 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/pkg/kv/kvserver/metrics.go b/pkg/kv/kvserver/metrics.go
index 918fac8ed656..9adfaf4e4020 100644
--- a/pkg/kv/kvserver/metrics.go
+++ b/pkg/kv/kvserver/metrics.go
@@ -1440,6 +1440,25 @@ Replicas in this state will fail-fast all inbound requests.
 		Measurement: "Events",
 		Unit:        metric.Unit_COUNT,
 	}
+	// Replica read batch evaluation.
+	metaReplicaReadBatchEvaluationLatency = metric.Metadata{
+		Name: "kv.replica_read_batch_evaluate.latency",
+		Help: `Execution duration for evaluating a BatchRequest on the read-only path after latches have been acquired.
+
+A measurement is recorded regardless of outcome (i.e. also in case of an error). If internal retries occur, each instance is recorded separately.`,
+		Measurement: "Nanoseconds",
+		Unit:        metric.Unit_NANOSECONDS,
+	}
+	// Replica read-write batch evaluation.
+	metaReplicaWriteBatchEvaluationLatency = metric.Metadata{
+		Name: "kv.replica_write_batch_evaluate.latency",
+		Help: `Execution duration for evaluating a BatchRequest on the read-write path after latches have been acquired.
+
+A measurement is recorded regardless of outcome (i.e. also in case of an error). If internal retries occur, each instance is recorded separately.
+Note that the measurement does not include the duration for replicating the evaluated command.`,
+		Measurement: "Nanoseconds",
+		Unit:        metric.Unit_NANOSECONDS,
+	}
 )
 
 // StoreMetrics is the set of metrics for a given store.
@@ -1700,6 +1719,10 @@ type StoreMetrics struct {
 	// Replica circuit breaker.
 	ReplicaCircuitBreakerCurTripped *metric.Gauge
 	ReplicaCircuitBreakerCumTripped *metric.Counter
+
+	// Replica batch evaluation metrics.
+	ReplicaReadBatchEvaluationLatency  *metric.Histogram
+	ReplicaWriteBatchEvaluationLatency *metric.Histogram
 }
 
 type tenantMetricsRef struct {
@@ -2170,6 +2193,10 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
 		// Replica circuit breaker.
 		ReplicaCircuitBreakerCurTripped: metric.NewGauge(metaReplicaCircuitBreakerCurTripped),
 		ReplicaCircuitBreakerCumTripped: metric.NewCounter(metaReplicaCircuitBreakerCumTripped),
+
+		// Replica batch evaluation.
+		ReplicaReadBatchEvaluationLatency:  metric.NewLatency(metaReplicaReadBatchEvaluationLatency, histogramWindow),
+		ReplicaWriteBatchEvaluationLatency: metric.NewLatency(metaReplicaWriteBatchEvaluationLatency, histogramWindow),
 	}
 
 	{
diff --git a/pkg/kv/kvserver/replica_evaluate.go b/pkg/kv/kvserver/replica_evaluate.go
index a1c43e3acc57..459e680d2f6a 100644
--- a/pkg/kv/kvserver/replica_evaluate.go
+++ b/pkg/kv/kvserver/replica_evaluate.go
@@ -153,7 +153,6 @@ func evaluateBatch(
 	ui uncertainty.Interval,
 	readOnly bool,
 ) (_ *roachpb.BatchResponse, _ result.Result, retErr *roachpb.Error) {
-
 	defer func() {
 		// Ensure that errors don't carry the WriteTooOld flag set. The client
 		// handles non-error responses with the WriteTooOld flag set, and errors
diff --git a/pkg/kv/kvserver/replica_read.go b/pkg/kv/kvserver/replica_read.go
index 4c899652ee70..7df6a5c6ac8e 100644
--- a/pkg/kv/kvserver/replica_read.go
+++ b/pkg/kv/kvserver/replica_read.go
@@ -26,6 +26,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/util"
 	"github.com/cockroachdb/cockroach/pkg/util/log"
 	"github.com/cockroachdb/cockroach/pkg/util/mon"
+	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
 	"github.com/kr/pretty"
 )
 
@@ -296,7 +297,9 @@ func (r *Replica) executeReadOnlyBatchWithServersideRefreshes(
 			boundAccount.Clear(ctx)
 			log.VEventf(ctx, 2, "server-side retry of batch")
 		}
+		now := timeutil.Now()
 		br, res, pErr = evaluateBatch(ctx, kvserverbase.CmdIDKey(""), rw, rec, nil, ba, st, ui, true /* readOnly */)
+		r.store.metrics.ReplicaReadBatchEvaluationLatency.RecordValue(timeutil.Since(now).Nanoseconds())
 		// If we can retry, set a higher batch timestamp and continue.
 		// Allow one retry only.
 		if pErr == nil || retries > 0 || !canDoServersideRetry(ctx, pErr, ba, br, g, nil /* deadline */) {
diff --git a/pkg/kv/kvserver/replica_write.go b/pkg/kv/kvserver/replica_write.go
index a1bd5f41d0f7..6de60bf406b3 100644
--- a/pkg/kv/kvserver/replica_write.go
+++ b/pkg/kv/kvserver/replica_write.go
@@ -650,7 +650,9 @@ func (r *Replica) evaluateWriteBatchWrapper(
 	g *concurrency.Guard,
 ) (storage.Batch, *roachpb.BatchResponse, result.Result, *roachpb.Error) {
 	batch, opLogger := r.newBatchedEngine(ba, g)
+	now := timeutil.Now()
 	br, res, pErr := evaluateBatch(ctx, idKey, batch, rec, ms, ba, st, ui, false /* readOnly */)
+	r.store.metrics.ReplicaWriteBatchEvaluationLatency.RecordValue(timeutil.Since(now).Nanoseconds())
 	if pErr == nil {
 		if opLogger != nil {
 			res.LogicalOpLog = &kvserverpb.LogicalOpLog{
diff --git a/pkg/ts/catalog/chart_catalog.go b/pkg/ts/catalog/chart_catalog.go
index 7e66a96fd777..11f8745cc5f3 100644
--- a/pkg/ts/catalog/chart_catalog.go
+++ b/pkg/ts/catalog/chart_catalog.go
@@ -3293,4 +3293,17 @@ var charts = []sectionDescription{
 			},
 		},
 	},
+	{
+		Organization: [][]string{{ReplicationLayer, "Latency"}},
+		Charts: []chartDescription{
+			{
+				Title:   "Execution duration for read batch evaluation.",
+				Metrics: []string{"kv.replica_read_batch_evaluate.latency"},
+			},
+			{
+				Title:   "Execution duration for read batch evaluation.",
+				Metrics: []string{"kv.replica_write_batch_evaluate.latency"},
+			},
+		},
+	},
 }
diff --git a/pkg/ts/catalog/metrics.go b/pkg/ts/catalog/metrics.go
index 70e5e08e59cb..4614f5f57755 100644
--- a/pkg/ts/catalog/metrics.go
+++ b/pkg/ts/catalog/metrics.go
@@ -100,6 +100,8 @@ var histogramMetricsNames = map[string]struct{}{
 	"streaming.admit_latency":                   {},
 	"streaming.commit_latency":                  {},
 	"streaming.flush_hist_nanos":                {},
+	"kv.replica_read_batch_evaluate.latency":    {},
+	"kv.replica_write_batch_evaluate.latency":   {},
 }
 
 func allInternalTSMetricsNames() []string {