kvserver: add server-side transaction retry metrics
This patch adds a few new metrics to track successful and failed
server-side transaction retries. Specifically, whenever we attempt
to retry a read or write batch, or run into a read within uncertainty
interval error, we increment specific counters indicating whether the
retry was successful.

Release note: None
arulajmani committed Jul 21, 2022
1 parent 79edfce commit a2f9858
Showing 4 changed files with 72 additions and 6 deletions.
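Before the per-file diffs, here is a minimal, self-contained sketch (not part of the commit) of the bookkeeping pattern that all three call sites below share: allow at most one server-side retry, bump a failure counter when the retry is not possible, and bump a success counter when it is. The evaluate and canDoServersideRetry helpers are hypothetical stand-ins, and the counter type stands in for *metric.Counter from pkg/util/metric:

package main

import (
	"errors"
	"fmt"
	"sync/atomic"
)

// counter is a tiny stand-in for *metric.Counter; only Inc and Count are needed here.
type counter struct{ n int64 }

func (c *counter) Inc(delta int64) { atomic.AddInt64(&c.n, delta) }
func (c *counter) Count() int64    { return atomic.LoadInt64(&c.n) }

var (
	retrySuccess counter // stands in for e.g. txn.server_side_retry.read_evaluation.success
	retryFailure counter // stands in for e.g. txn.server_side_retry.read_evaluation.failure
)

// evaluate is a hypothetical batch evaluation that fails on the first attempt
// and succeeds once the batch is retried at a higher timestamp.
func evaluate(attempt int) error {
	if attempt == 0 {
		return errors.New("retryable evaluation error")
	}
	return nil
}

// canDoServersideRetry is a hypothetical stand-in for the kvserver helper of
// the same name: it decides whether the batch may be retried server side.
func canDoServersideRetry(err error) bool {
	return err != nil
}

func main() {
	for retries := 0; ; retries++ {
		err := evaluate(retries)
		// Allow only one retry.
		if err == nil || retries > 0 {
			break
		}
		// If a server-side retry is not possible, count the failure and give up;
		// otherwise count the success and evaluate once more.
		if !canDoServersideRetry(err) {
			retryFailure.Inc(1)
			break
		}
		retrySuccess.Inc(1)
	}
	fmt.Printf("success=%d failure=%d\n", retrySuccess.Count(), retryFailure.Count())
}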
pkg/kv/kvserver/metrics.go (54 changes: 52 additions & 2 deletions)
@@ -338,6 +338,44 @@ var (
 		Measurement: "KV Transactions",
 		Unit:        metric.Unit_COUNT,
 	}
+	metaWriteEvaluationServerSideRetrySuccess = metric.Metadata{
+		Name:        "txn.server_side_retry.write_evaluation.success",
+		Help:        "Number of write batches that were successfully refreshed server side",
+		Measurement: "KV Transactions",
+		Unit:        metric.Unit_COUNT,
+	}
+	metaWriteEvaluationServerSideRetryFailure = metric.Metadata{
+		Name:        "txn.server_side_retry.write_evaluation.failure",
+		Help:        "Number of write batches that were not successfully refreshed server side",
+		Measurement: "KV Transactions",
+		Unit:        metric.Unit_COUNT,
+	}
+	metaReadEvaluationServerSideRetrySuccess = metric.Metadata{
+		Name:        "txn.server_side_retry.read_evaluation.success",
+		Help:        "Number of read batches that were successfully refreshed server side",
+		Measurement: "KV Transactions",
+		Unit:        metric.Unit_COUNT,
+	}
+	metaReadEvaluationServerSideRetryFailure = metric.Metadata{
+		Name:        "txn.server_side_retry.read_evaluation.failure",
+		Help:        "Number of read batches that were not successfully refreshed server side",
+		Measurement: "KV Transactions",
+		Unit:        metric.Unit_COUNT,
+	}
+	metaReadWithinUncertaintyIntervalErrorServerSideRetrySuccess = metric.Metadata{
+		Name: "txn.server_side_retry.uncertainty_interval_error.success",
+		Help: "Number of batches that ran into uncertainty interval errors that were " +
+			"successfully refreshed server side",
+		Measurement: "KV Transactions",
+		Unit:        metric.Unit_COUNT,
+	}
+	metaReadWithinUncertaintyIntervalErrorServerSideRetryFailure = metric.Metadata{
+		Name: "txn.server_side_retry.uncertainty_interval_error.failure",
+		Help: "Number of batches that ran into uncertainty interval errors that were not " +
+			"successfully refreshed server side",
+		Measurement: "KV Transactions",
+		Unit:        metric.Unit_COUNT,
+	}
 
 	// RocksDB/Pebble metrics.
 	metaRdbBlockCacheHits = metric.Metadata{
@@ -1548,7 +1586,13 @@ type StoreMetrics struct {
 	FollowerReadsCount *metric.Counter
 
 	// Server-side transaction metrics.
-	CommitWaitsBeforeCommitTrigger *metric.Counter
+	CommitWaitsBeforeCommitTrigger                            *metric.Counter
+	WriteEvaluationServerSideRetrySuccess                     *metric.Counter
+	WriteEvaluationServerSideRetryFailure                     *metric.Counter
+	ReadEvaluationServerSideRetrySuccess                      *metric.Counter
+	ReadEvaluationServerSideRetryFailure                      *metric.Counter
+	ReadWithinUncertaintyIntervalErrorServerSideRetrySuccess *metric.Counter
+	ReadWithinUncertaintyIntervalErrorServerSideRetryFailure *metric.Counter
 
 	// Storage (pebble) metrics. Some are named RocksDB which is what we used
 	// before pebble, and this name is kept for backwards compatibility despite
@@ -2026,7 +2070,13 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
 		FollowerReadsCount: metric.NewCounter(metaFollowerReadsCount),
 
 		// Server-side transaction metrics.
-		CommitWaitsBeforeCommitTrigger: metric.NewCounter(metaCommitWaitBeforeCommitTriggerCount),
+		CommitWaitsBeforeCommitTrigger:                            metric.NewCounter(metaCommitWaitBeforeCommitTriggerCount),
+		WriteEvaluationServerSideRetrySuccess:                     metric.NewCounter(metaWriteEvaluationServerSideRetrySuccess),
+		WriteEvaluationServerSideRetryFailure:                     metric.NewCounter(metaWriteEvaluationServerSideRetryFailure),
+		ReadEvaluationServerSideRetrySuccess:                      metric.NewCounter(metaReadEvaluationServerSideRetrySuccess),
+		ReadEvaluationServerSideRetryFailure:                      metric.NewCounter(metaReadEvaluationServerSideRetryFailure),
+		ReadWithinUncertaintyIntervalErrorServerSideRetrySuccess: metric.NewCounter(metaReadWithinUncertaintyIntervalErrorServerSideRetrySuccess),
+		ReadWithinUncertaintyIntervalErrorServerSideRetryFailure: metric.NewCounter(metaReadWithinUncertaintyIntervalErrorServerSideRetryFailure),
 
 		// RocksDB/Pebble metrics.
 		RdbBlockCacheHits: metric.NewGauge(metaRdbBlockCacheHits),
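An aside on how the counters declared above become visible: StoreMetrics is registered with the store's metric.Registry via AddMetricStruct, which walks the struct's fields by reflection, so adding the six *metric.Counter fields and their NewCounter calls is enough for the new metrics to be picked up; no separate registration step appears in this diff. A minimal sketch of that wiring, assuming the pkg/util/metric APIs used elsewhere in kvserver (NewRegistry, NewCounter, Registry.AddMetricStruct, Counter.Inc and Count) and compilation inside the CockroachDB source tree:

package main

import (
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/util/metric"
)

// exampleMetrics mirrors, in miniature, how StoreMetrics groups its counters.
type exampleMetrics struct {
	WriteEvaluationServerSideRetrySuccess *metric.Counter
}

func main() {
	reg := metric.NewRegistry()
	m := &exampleMetrics{
		WriteEvaluationServerSideRetrySuccess: metric.NewCounter(metric.Metadata{
			Name:        "txn.server_side_retry.write_evaluation.success",
			Help:        "Number of write batches that were successfully refreshed server side",
			Measurement: "KV Transactions",
			Unit:        metric.Unit_COUNT,
		}),
	}
	// The registry walks the struct's fields and picks up every metric it finds.
	reg.AddMetricStruct(m)

	m.WriteEvaluationServerSideRetrySuccess.Inc(1)
	fmt.Println(m.WriteEvaluationServerSideRetrySuccess.Count()) // 1
}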
pkg/kv/kvserver/replica_read.go (10 changes: 8 additions & 2 deletions)
@@ -306,10 +306,16 @@ func (r *Replica) executeReadOnlyBatchWithServersideRefreshes(
 		now := timeutil.Now()
 		br, res, pErr = evaluateBatch(ctx, kvserverbase.CmdIDKey(""), rw, rec, nil, ba, g, st, ui, true /* readOnly */)
 		r.store.metrics.ReplicaReadBatchEvaluationLatency.RecordValue(timeutil.Since(now).Nanoseconds())
+		// Allow only one retry.
+		if pErr == nil || retries > 0 {
+			break
+		}
 		// If we can retry, set a higher batch timestamp and continue.
-		// Allow one retry only.
-		if pErr == nil || retries > 0 || !canDoServersideRetry(ctx, pErr, ba, br, g, nil /* deadline */) {
+		if !canDoServersideRetry(ctx, pErr, ba, br, g, nil /* deadline */) {
+			r.store.Metrics().ReadEvaluationServerSideRetryFailure.Inc(1)
 			break
+		} else {
+			r.store.Metrics().ReadEvaluationServerSideRetrySuccess.Inc(1)
 		}
 	}
 
pkg/kv/kvserver/replica_send.go (4 changes: 4 additions & 0 deletions)
@@ -795,8 +795,12 @@ func (r *Replica) handleReadWithinUncertaintyIntervalError(
 	// latchSpans, because we have already released our latches and plan to
 	// re-acquire them if the retry is allowed.
 	if !canDoServersideRetry(ctx, pErr, ba, nil /* br */, nil /* g */, nil /* deadline */) {
+		r.store.Metrics().ReadWithinUncertaintyIntervalErrorServerSideRetryFailure.Inc(1)
 		return nil, pErr
+	} else {
+		r.store.Metrics().ReadWithinUncertaintyIntervalErrorServerSideRetrySuccess.Inc(1)
 	}
+
 	if ba.Txn == nil {
 		// If the request is non-transactional and it was refreshed into the future
 		// after observing a value with a timestamp in the future, immediately sleep
pkg/kv/kvserver/replica_write.go (10 changes: 8 additions & 2 deletions)
@@ -629,12 +629,18 @@ func (r *Replica) evaluateWriteBatchWithServersideRefreshes(
 			success = false
 		}
 
-		// If we can retry, set a higher batch timestamp and continue.
 		// Allow one retry only; a non-txn batch containing overlapping
 		// spans will always experience WriteTooOldError.
-		if success || retries > 0 || !canDoServersideRetry(ctx, pErr, ba, br, g, deadline) {
+		if success || retries > 0 {
 			break
 		}
+		// If we can retry, set a higher batch timestamp and continue.
+		if !canDoServersideRetry(ctx, pErr, ba, br, g, deadline) {
+			r.store.Metrics().WriteEvaluationServerSideRetryFailure.Inc(1)
+			break
+		} else {
+			r.store.Metrics().WriteEvaluationServerSideRetrySuccess.Inc(1)
+		}
 	}
 	return batch, br, res, pErr
 }
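A usage note (not part of the commit): once a node runs with this change, the new counters should be observable wherever store metrics are exported, for example via the crdb_internal.node_metrics virtual table or the DB Console. Below is a hedged sketch of reading them back over SQL; the connection string assumes a local insecure single node, and lib/pq is just one Postgres-wire driver that works with CockroachDB:

package main

import (
	"database/sql"
	"fmt"
	"log"

	_ "github.com/lib/pq" // Postgres-wire driver; CockroachDB speaks the same protocol.
)

func main() {
	// Hypothetical connection string for a local, insecure single node.
	db, err := sql.Open("postgres",
		"postgresql://root@localhost:26257/defaultdb?sslmode=disable")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	rows, err := db.Query(
		`SELECT name, value
		   FROM crdb_internal.node_metrics
		  WHERE name LIKE 'txn.server_side_retry%'
		  ORDER BY name`)
	if err != nil {
		log.Fatal(err)
	}
	defer rows.Close()

	for rows.Next() {
		var name string
		var value float64
		if err := rows.Scan(&name, &value); err != nil {
			log.Fatal(err)
		}
		fmt.Printf("%-60s %.0f\n", name, value)
	}
	if err := rows.Err(); err != nil {
		log.Fatal(err)
	}
}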
