Skip to content

Commit

Permalink
kvserver: add evaluate batch latency metrics
Browse files Browse the repository at this point in the history
This patch introduces new metrics that measure read/write
batch evaluation latency. It should provide better visibility
what slows down batch processing (it is a small part of bigger
project to measure e2e latency).

Release note: None
  • Loading branch information
koorosh committed Jun 10, 2022
1 parent e1cdb46 commit f744376
Show file tree
Hide file tree
Showing 6 changed files with 47 additions and 1 deletion.
27 changes: 27 additions & 0 deletions pkg/kv/kvserver/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -1440,6 +1440,25 @@ Replicas in this state will fail-fast all inbound requests.
Measurement: "Events",
Unit: metric.Unit_COUNT,
}
// Replica read batch evaluation.
metaReplicaReadBatchEvaluationLatency = metric.Metadata{
Name: "kv.replica_read_batch_evaluate.latency",
Help: `Execution duration for evaluating a BatchRequest on the read-only path after latches have been acquired.
A measurement is recorded regardless of outcome (i.e. also in case of an error). If internal retries occur, each instance is recorded separately.`,
Measurement: "Nanoseconds",
Unit: metric.Unit_NANOSECONDS,
}
// Replica read-write batch evaluation.
metaReplicaWriteBatchEvaluationLatency = metric.Metadata{
Name: "kv.replica_write_batch_evaluate.latency",
Help: `Execution duration for evaluating a BatchRequest on the read-write path after latches have been acquired.
A measurement is recorded regardless of outcome (i.e. also in case of an error). If internal retries occur, each instance is recorded separately.
Note that the measurement does not include the duration for replicating the evaluated command.`,
Measurement: "Nanoseconds",
Unit: metric.Unit_NANOSECONDS,
}
)

// StoreMetrics is the set of metrics for a given store.
Expand Down Expand Up @@ -1700,6 +1719,10 @@ type StoreMetrics struct {
// Replica circuit breaker.
ReplicaCircuitBreakerCurTripped *metric.Gauge
ReplicaCircuitBreakerCumTripped *metric.Counter

// Replica batch evaluation metrics.
ReplicaReadBatchEvaluationLatency *metric.Histogram
ReplicaWriteBatchEvaluationLatency *metric.Histogram
}

type tenantMetricsRef struct {
Expand Down Expand Up @@ -2170,6 +2193,10 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
// Replica circuit breaker.
ReplicaCircuitBreakerCurTripped: metric.NewGauge(metaReplicaCircuitBreakerCurTripped),
ReplicaCircuitBreakerCumTripped: metric.NewCounter(metaReplicaCircuitBreakerCumTripped),

// Replica batch evaluation.
ReplicaReadBatchEvaluationLatency: metric.NewLatency(metaReplicaReadBatchEvaluationLatency, histogramWindow),
ReplicaWriteBatchEvaluationLatency: metric.NewLatency(metaReplicaWriteBatchEvaluationLatency, histogramWindow),
}

{
Expand Down
1 change: 0 additions & 1 deletion pkg/kv/kvserver/replica_evaluate.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,6 @@ func evaluateBatch(
ui uncertainty.Interval,
readOnly bool,
) (_ *roachpb.BatchResponse, _ result.Result, retErr *roachpb.Error) {

defer func() {
// Ensure that errors don't carry the WriteTooOld flag set. The client
// handles non-error responses with the WriteTooOld flag set, and errors
Expand Down
3 changes: 3 additions & 0 deletions pkg/kv/kvserver/replica_read.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/util"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/mon"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/kr/pretty"
)

Expand Down Expand Up @@ -296,7 +297,9 @@ func (r *Replica) executeReadOnlyBatchWithServersideRefreshes(
boundAccount.Clear(ctx)
log.VEventf(ctx, 2, "server-side retry of batch")
}
now := timeutil.Now()
br, res, pErr = evaluateBatch(ctx, kvserverbase.CmdIDKey(""), rw, rec, nil, ba, st, ui, true /* readOnly */)
r.store.metrics.ReplicaReadBatchEvaluationLatency.RecordValue(timeutil.Since(now).Nanoseconds())
// If we can retry, set a higher batch timestamp and continue.
// Allow one retry only.
if pErr == nil || retries > 0 || !canDoServersideRetry(ctx, pErr, ba, br, g, nil /* deadline */) {
Expand Down
2 changes: 2 additions & 0 deletions pkg/kv/kvserver/replica_write.go
Original file line number Diff line number Diff line change
Expand Up @@ -650,7 +650,9 @@ func (r *Replica) evaluateWriteBatchWrapper(
g *concurrency.Guard,
) (storage.Batch, *roachpb.BatchResponse, result.Result, *roachpb.Error) {
batch, opLogger := r.newBatchedEngine(ba, g)
now := timeutil.Now()
br, res, pErr := evaluateBatch(ctx, idKey, batch, rec, ms, ba, st, ui, false /* readOnly */)
r.store.metrics.ReplicaWriteBatchEvaluationLatency.RecordValue(timeutil.Since(now).Nanoseconds())
if pErr == nil {
if opLogger != nil {
res.LogicalOpLog = &kvserverpb.LogicalOpLog{
Expand Down
13 changes: 13 additions & 0 deletions pkg/ts/catalog/chart_catalog.go
Original file line number Diff line number Diff line change
Expand Up @@ -3293,4 +3293,17 @@ var charts = []sectionDescription{
},
},
},
{
Organization: [][]string{{ReplicationLayer, "Latency"}},
Charts: []chartDescription{
{
Title: "Execution duration for read batch evaluation.",
Metrics: []string{"kv.replica_read_batch_evaluate.latency"},
},
{
Title: "Execution duration for read batch evaluation.",
Metrics: []string{"kv.replica_write_batch_evaluate.latency"},
},
},
},
}
2 changes: 2 additions & 0 deletions pkg/ts/catalog/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ var histogramMetricsNames = map[string]struct{}{
"streaming.admit_latency": {},
"streaming.commit_latency": {},
"streaming.flush_hist_nanos": {},
"kv.replica_read_batch_evaluate.latency": {},
"kv.replica_write_batch_evaluate.latency": {},
}

func allInternalTSMetricsNames() []string {
Expand Down

0 comments on commit f744376

Please sign in to comment.