diff --git a/pkg/kv/kvserver/metrics.go b/pkg/kv/kvserver/metrics.go index 738e0689a080..fd1a11682a79 100644 --- a/pkg/kv/kvserver/metrics.go +++ b/pkg/kv/kvserver/metrics.go @@ -338,6 +338,44 @@ var ( Measurement: "KV Transactions", Unit: metric.Unit_COUNT, } + metaWriteEvaluationServerSideRetrySuccess = metric.Metadata{ + Name: "txn.server_side_retry.write_evaluation.success", + Help: "Number of write batches that were successfully refreshed server side", + Measurement: "KV Transactions", + Unit: metric.Unit_COUNT, + } + metaWriteEvaluationServerSideRetryFailure = metric.Metadata{ + Name: "txn.server_side_retry.write_evaluation.failure", + Help: "Number of write batches that were not successfully refreshed server side", + Measurement: "KV Transactions", + Unit: metric.Unit_COUNT, + } + metaReadEvaluationServerSideRetrySuccess = metric.Metadata{ + Name: "txn.server_side_retry.read_evaluation.success", + Help: "Number of read batches that were successfully refreshed server side", + Measurement: "KV Transactions", + Unit: metric.Unit_COUNT, + } + metaReadEvaluationServerSideRetryFailure = metric.Metadata{ + Name: "txn.server_side_retry.read_evaluation.failure", + Help: "Number of read batches that were not successfully refreshed server side", + Measurement: "KV Transactions", + Unit: metric.Unit_COUNT, + } + metaReadWithinUncertaintyIntervalErrorServerSideRetrySuccess = metric.Metadata{ + Name: "txn.server_side_retry.uncertainty_interval_error.success", + Help: "Number of batches that ran into uncertainty interval errors that were " + + "successfully refreshed server side", + Measurement: "KV Transactions", + Unit: metric.Unit_COUNT, + } + metaReadWithinUncertaintyIntervalErrorServerSideRetryFailure = metric.Metadata{ + Name: "txn.server_side_retry.uncertainty_interval_error.failure", + Help: "Number of batches that ran into uncertainty interval errors that were not " + + "successfully refreshed server side", + Measurement: "KV Transactions", + Unit: 
metric.Unit_COUNT, + } // RocksDB/Pebble metrics. metaRdbBlockCacheHits = metric.Metadata{ @@ -1548,7 +1586,13 @@ type StoreMetrics struct { FollowerReadsCount *metric.Counter // Server-side transaction metrics. - CommitWaitsBeforeCommitTrigger *metric.Counter + CommitWaitsBeforeCommitTrigger *metric.Counter + WriteEvaluationServerSideRetrySuccess *metric.Counter + WriteEvaluationServerSideRetryFailure *metric.Counter + ReadEvaluationServerSideRetrySuccess *metric.Counter + ReadEvaluationServerSideRetryFailure *metric.Counter + ReadWithinUncertaintyIntervalErrorServerSideRetrySuccess *metric.Counter + ReadWithinUncertaintyIntervalErrorServerSideRetryFailure *metric.Counter // Storage (pebble) metrics. Some are named RocksDB which is what we used // before pebble, and this name is kept for backwards compatibility despite @@ -2026,7 +2070,13 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics { FollowerReadsCount: metric.NewCounter(metaFollowerReadsCount), // Server-side transaction metrics. - CommitWaitsBeforeCommitTrigger: metric.NewCounter(metaCommitWaitBeforeCommitTriggerCount), + CommitWaitsBeforeCommitTrigger: metric.NewCounter(metaCommitWaitBeforeCommitTriggerCount), + WriteEvaluationServerSideRetrySuccess: metric.NewCounter(metaWriteEvaluationServerSideRetrySuccess), + WriteEvaluationServerSideRetryFailure: metric.NewCounter(metaWriteEvaluationServerSideRetryFailure), + ReadEvaluationServerSideRetrySuccess: metric.NewCounter(metaReadEvaluationServerSideRetrySuccess), + ReadEvaluationServerSideRetryFailure: metric.NewCounter(metaReadEvaluationServerSideRetryFailure), + ReadWithinUncertaintyIntervalErrorServerSideRetrySuccess: metric.NewCounter(metaReadWithinUncertaintyIntervalErrorServerSideRetrySuccess), + ReadWithinUncertaintyIntervalErrorServerSideRetryFailure: metric.NewCounter(metaReadWithinUncertaintyIntervalErrorServerSideRetryFailure), // RocksDB/Pebble metrics. 
RdbBlockCacheHits: metric.NewGauge(metaRdbBlockCacheHits), diff --git a/pkg/kv/kvserver/replica_read.go b/pkg/kv/kvserver/replica_read.go index 93f1e061a385..fd247735881f 100644 --- a/pkg/kv/kvserver/replica_read.go +++ b/pkg/kv/kvserver/replica_read.go @@ -306,10 +306,16 @@ func (r *Replica) executeReadOnlyBatchWithServersideRefreshes( now := timeutil.Now() br, res, pErr = evaluateBatch(ctx, kvserverbase.CmdIDKey(""), rw, rec, nil, ba, g, st, ui, true /* readOnly */) r.store.metrics.ReplicaReadBatchEvaluationLatency.RecordValue(timeutil.Since(now).Nanoseconds()) + // Allow only one retry. + if pErr == nil || retries > 0 { + break + } // If we can retry, set a higher batch timestamp and continue. - // Allow one retry only. - if pErr == nil || retries > 0 || !canDoServersideRetry(ctx, pErr, ba, br, g, nil /* deadline */) { + if !canDoServersideRetry(ctx, pErr, ba, br, g, nil /* deadline */) { + r.store.Metrics().ReadEvaluationServerSideRetryFailure.Inc(1) break } + // The server-side retry is permitted; record it before continuing. + r.store.Metrics().ReadEvaluationServerSideRetrySuccess.Inc(1) } diff --git a/pkg/kv/kvserver/replica_send.go b/pkg/kv/kvserver/replica_send.go index 522b4d668a1c..defa9408359b 100644 --- a/pkg/kv/kvserver/replica_send.go +++ b/pkg/kv/kvserver/replica_send.go @@ -795,8 +795,11 @@ func (r *Replica) handleReadWithinUncertaintyIntervalError( // latchSpans, because we have already released our latches and plan to // re-acquire them if the retry is allowed.
if !canDoServersideRetry(ctx, pErr, ba, nil /* br */, nil /* g */, nil /* deadline */) { + r.store.Metrics().ReadWithinUncertaintyIntervalErrorServerSideRetryFailure.Inc(1) return nil, pErr } + r.store.Metrics().ReadWithinUncertaintyIntervalErrorServerSideRetrySuccess.Inc(1) + if ba.Txn == nil { // If the request is non-transactional and it was refreshed into the future // after observing a value with a timestamp in the future, immediately sleep diff --git a/pkg/kv/kvserver/replica_write.go b/pkg/kv/kvserver/replica_write.go index 4430d0e8a368..6d75b6d3384e 100644 --- a/pkg/kv/kvserver/replica_write.go +++ b/pkg/kv/kvserver/replica_write.go @@ -629,12 +629,18 @@ func (r *Replica) evaluateWriteBatchWithServersideRefreshes( success = false } - // If we can retry, set a higher batch timestamp and continue. // Allow one retry only; a non-txn batch containing overlapping // spans will always experience WriteTooOldError. - if success || retries > 0 || !canDoServersideRetry(ctx, pErr, ba, br, g, deadline) { + if success || retries > 0 { break } + // If we can retry, set a higher batch timestamp and continue. 
+ if !canDoServersideRetry(ctx, pErr, ba, br, g, deadline) { + r.store.Metrics().WriteEvaluationServerSideRetryFailure.Inc(1) + break + } else { + r.store.Metrics().WriteEvaluationServerSideRetrySuccess.Inc(1) + } } return batch, br, res, pErr } diff --git a/pkg/ts/catalog/chart_catalog.go b/pkg/ts/catalog/chart_catalog.go index 8d58080cc226..5cbd0aca4ddb 100644 --- a/pkg/ts/catalog/chart_catalog.go +++ b/pkg/ts/catalog/chart_catalog.go @@ -1225,6 +1225,17 @@ var charts = []sectionDescription{ "txn.commit_waits.before_commit_trigger", }, }, + { + Title: "Server Side Retry", + Metrics: []string{ + "txn.server_side_retry.write_evaluation.success", + "txn.server_side_retry.write_evaluation.failure", + "txn.server_side_retry.read_evaluation.success", + "txn.server_side_retry.read_evaluation.failure", + "txn.server_side_retry.uncertainty_interval_error.success", + "txn.server_side_retry.uncertainty_interval_error.failure", + }, + }, { Title: "Durations", Metrics: []string{"txn.durations"},