Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

kvserver: add server-side transaction retry metrics #84883

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 52 additions & 2 deletions pkg/kv/kvserver/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,44 @@ var (
Measurement: "KV Transactions",
Unit: metric.Unit_COUNT,
}
// Server-side transaction retry metrics. Each success/failure pair counts
// how often a replica managed (or failed) to refresh a batch's timestamp
// server side instead of propagating a retry error to the client.
// metaWriteEvaluationServerSideRetrySuccess counts write batches whose
// server-side refresh succeeded.
metaWriteEvaluationServerSideRetrySuccess = metric.Metadata{
Name: "txn.server_side_retry.write_evaluation.success",
Help: "Number of write batches that were successfully refreshed server side",
Measurement: "KV Transactions",
Unit: metric.Unit_COUNT,
}
// metaWriteEvaluationServerSideRetryFailure counts write batches whose
// server-side refresh was attempted but could not be performed.
metaWriteEvaluationServerSideRetryFailure = metric.Metadata{
Name: "txn.server_side_retry.write_evaluation.failure",
Help: "Number of write batches that were not successfully refreshed server side",
Measurement: "KV Transactions",
Unit: metric.Unit_COUNT,
}
// metaReadEvaluationServerSideRetrySuccess counts read batches whose
// server-side refresh succeeded.
metaReadEvaluationServerSideRetrySuccess = metric.Metadata{
Name: "txn.server_side_retry.read_evaluation.success",
Help: "Number of read batches that were successfully refreshed server side",
Measurement: "KV Transactions",
Unit: metric.Unit_COUNT,
}
// metaReadEvaluationServerSideRetryFailure counts read batches whose
// server-side refresh was attempted but could not be performed.
metaReadEvaluationServerSideRetryFailure = metric.Metadata{
Name: "txn.server_side_retry.read_evaluation.failure",
Help: "Number of read batches that were not successfully refreshed server side",
Measurement: "KV Transactions",
Unit: metric.Unit_COUNT,
}
// metaReadWithinUncertaintyIntervalErrorServerSideRetrySuccess counts
// batches that hit a ReadWithinUncertaintyIntervalError and were
// successfully refreshed server side.
metaReadWithinUncertaintyIntervalErrorServerSideRetrySuccess = metric.Metadata{
Name: "txn.server_side_retry.uncertainty_interval_error.success",
Help: "Number of batches that ran into uncertainty interval errors that were " +
"successfully refreshed server side",
Measurement: "KV Transactions",
Unit: metric.Unit_COUNT,
}
// metaReadWithinUncertaintyIntervalErrorServerSideRetryFailure counts
// batches that hit a ReadWithinUncertaintyIntervalError and could not be
// refreshed server side.
metaReadWithinUncertaintyIntervalErrorServerSideRetryFailure = metric.Metadata{
Name: "txn.server_side_retry.uncertainty_interval_error.failure",
Help: "Number of batches that ran into uncertainty interval errors that were not " +
"successfully refreshed server side",
Measurement: "KV Transactions",
Unit: metric.Unit_COUNT,
}

// RocksDB/Pebble metrics.
metaRdbBlockCacheHits = metric.Metadata{
Expand Down Expand Up @@ -1548,7 +1586,13 @@ type StoreMetrics struct {
FollowerReadsCount *metric.Counter

// Server-side transaction metrics.
CommitWaitsBeforeCommitTrigger *metric.Counter
CommitWaitsBeforeCommitTrigger *metric.Counter
WriteEvaluationServerSideRetrySuccess *metric.Counter
WriteEvaluationServerSideRetryFailure *metric.Counter
ReadEvaluationServerSideRetrySuccess *metric.Counter
ReadEvaluationServerSideRetryFailure *metric.Counter
ReadWithinUncertaintyIntervalErrorServerSideRetrySuccess *metric.Counter
ReadWithinUncertaintyIntervalErrorServerSideRetryFailure *metric.Counter

// Storage (pebble) metrics. Some are named RocksDB which is what we used
// before pebble, and this name is kept for backwards compatibility despite
Expand Down Expand Up @@ -2026,7 +2070,13 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
FollowerReadsCount: metric.NewCounter(metaFollowerReadsCount),

// Server-side transaction metrics.
CommitWaitsBeforeCommitTrigger: metric.NewCounter(metaCommitWaitBeforeCommitTriggerCount),
CommitWaitsBeforeCommitTrigger: metric.NewCounter(metaCommitWaitBeforeCommitTriggerCount),
WriteEvaluationServerSideRetrySuccess: metric.NewCounter(metaWriteEvaluationServerSideRetrySuccess),
WriteEvaluationServerSideRetryFailure: metric.NewCounter(metaWriteEvaluationServerSideRetryFailure),
ReadEvaluationServerSideRetrySuccess: metric.NewCounter(metaReadEvaluationServerSideRetrySuccess),
ReadEvaluationServerSideRetryFailure: metric.NewCounter(metaReadEvaluationServerSideRetryFailure),
ReadWithinUncertaintyIntervalErrorServerSideRetrySuccess: metric.NewCounter(metaReadWithinUncertaintyIntervalErrorServerSideRetrySuccess),
ReadWithinUncertaintyIntervalErrorServerSideRetryFailure: metric.NewCounter(metaReadWithinUncertaintyIntervalErrorServerSideRetryFailure),

// RocksDB/Pebble metrics.
RdbBlockCacheHits: metric.NewGauge(metaRdbBlockCacheHits),
Expand Down
10 changes: 8 additions & 2 deletions pkg/kv/kvserver/replica_read.go
Original file line number Diff line number Diff line change
Expand Up @@ -306,10 +306,16 @@ func (r *Replica) executeReadOnlyBatchWithServersideRefreshes(
now := timeutil.Now()
br, res, pErr = evaluateBatch(ctx, kvserverbase.CmdIDKey(""), rw, rec, nil, ba, g, st, ui, true /* readOnly */)
r.store.metrics.ReplicaReadBatchEvaluationLatency.RecordValue(timeutil.Since(now).Nanoseconds())
// Allow only one retry.
if pErr == nil || retries > 0 {
break
}
// If we can retry, set a higher batch timestamp and continue.
// Allow one retry only.
if pErr == nil || retries > 0 || !canDoServersideRetry(ctx, pErr, ba, br, g, nil /* deadline */) {
if !canDoServersideRetry(ctx, pErr, ba, br, g, nil /* deadline */) {
r.store.Metrics().ReadEvaluationServerSideRetryFailure.Inc(1)
break
} else {
r.store.Metrics().ReadEvaluationServerSideRetrySuccess.Inc(1)
}
}

Expand Down
3 changes: 3 additions & 0 deletions pkg/kv/kvserver/replica_send.go
Original file line number Diff line number Diff line change
Expand Up @@ -795,8 +795,11 @@ func (r *Replica) handleReadWithinUncertaintyIntervalError(
// latchSpans, because we have already released our latches and plan to
// re-acquire them if the retry is allowed.
if !canDoServersideRetry(ctx, pErr, ba, nil /* br */, nil /* g */, nil /* deadline */) {
r.store.Metrics().ReadWithinUncertaintyIntervalErrorServerSideRetryFailure.Inc(1)
return nil, pErr
}
r.store.Metrics().ReadWithinUncertaintyIntervalErrorServerSideRetrySuccess.Inc(1)

if ba.Txn == nil {
// If the request is non-transactional and it was refreshed into the future
// after observing a value with a timestamp in the future, immediately sleep
Expand Down
10 changes: 8 additions & 2 deletions pkg/kv/kvserver/replica_write.go
Original file line number Diff line number Diff line change
Expand Up @@ -629,12 +629,18 @@ func (r *Replica) evaluateWriteBatchWithServersideRefreshes(
success = false
}

// If we can retry, set a higher batch timestamp and continue.
// Allow one retry only; a non-txn batch containing overlapping
// spans will always experience WriteTooOldError.
if success || retries > 0 || !canDoServersideRetry(ctx, pErr, ba, br, g, deadline) {
if success || retries > 0 {
break
}
// If we can retry, set a higher batch timestamp and continue.
if !canDoServersideRetry(ctx, pErr, ba, br, g, deadline) {
r.store.Metrics().WriteEvaluationServerSideRetryFailure.Inc(1)
break
} else {
r.store.Metrics().WriteEvaluationServerSideRetrySuccess.Inc(1)
}
}
return batch, br, res, pErr
}
Expand Down
11 changes: 11 additions & 0 deletions pkg/ts/catalog/chart_catalog.go
Original file line number Diff line number Diff line change
Expand Up @@ -1225,6 +1225,17 @@ var charts = []sectionDescription{
"txn.commit_waits.before_commit_trigger",
},
},
{
// Charts the six server-side retry counters declared in
// pkg/kv/kvserver/metrics.go (write/read evaluation and
// uncertainty-interval-error refreshes, success and failure each).
Title: "Server Side Retry",
Metrics: []string{
"txn.server_side_retry.write_evaluation.success",
"txn.server_side_retry.write_evaluation.failure",
"txn.server_side_retry.read_evaluation.success",
"txn.server_side_retry.read_evaluation.failure",
"txn.server_side_retry.uncertainty_interval_error.success",
"txn.server_side_retry.uncertainty_interval_error.failure",
},
},
{
Title: "Durations",
Metrics: []string{"txn.durations"},
Expand Down