From 933c4cd42da83679fcd460fcdd68ecaee0ad7b10 Mon Sep 17 00:00:00 2001 From: Arul Ajmani Date: Mon, 6 Nov 2023 15:39:08 -0500 Subject: [PATCH] kv: add server-side metrics for {successful,failed} 1PC evaluation This patch adds metrics for successful and failed 1PC evaluations, which together give us the number of attempted 1PC evaluations by a store. We then modify an existing test to use these metrics to verify that it was able to successfully commit using 1PC. Epic: None Release note: None --- docs/generated/metrics/metrics.html | 2 ++ pkg/kv/kvserver/metrics.go | 16 ++++++++++++++++ pkg/kv/kvserver/replica_test.go | 15 +++++++++++++-- pkg/kv/kvserver/replica_write.go | 5 ++++- 4 files changed, 35 insertions(+), 3 deletions(-) diff --git a/docs/generated/metrics/metrics.html b/docs/generated/metrics/metrics.html index 372707cf821b..09fa9e685259 100644 --- a/docs/generated/metrics/metrics.html +++ b/docs/generated/metrics/metrics.html @@ -703,6 +703,8 @@ STORAGEtscache.skl.pagesNumber of pages in the timestamp cachePagesGAUGECOUNTAVGNONE STORAGEtscache.skl.rotationsNumber of page rotations in the timestamp cachePage RotationsCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE STORAGEtxn.commit_waits.before_commit_triggerNumber of KV transactions that had to commit-wait on the server before committing because they had a commit triggerKV TransactionsCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE +STORAGEtxn.server_side.1PC.failureNumber of batches that attempted to commit using 1PC and failedKV TransactionsCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE +STORAGEtxn.server_side.1PC.successNumber of batches that attempted to commit using 1PC and succeededKV TransactionsCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE STORAGEtxn.server_side_retry.read_evaluation.failureNumber of read batches that were not successfully refreshed server sideKV TransactionsCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE STORAGEtxn.server_side_retry.read_evaluation.successNumber of read batches that were successfully refreshed server sideKV TransactionsCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE STORAGEtxn.server_side_retry.uncertainty_interval_error.failureNumber of batches that ran into uncertainty interval errors that were not successfully refreshed server sideKV TransactionsCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE diff --git a/pkg/kv/kvserver/metrics.go b/pkg/kv/kvserver/metrics.go index e1dd41cf83a5..cca2f6b2e20f 100644 --- a/pkg/kv/kvserver/metrics.go +++ b/pkg/kv/kvserver/metrics.go @@ -474,6 +474,18 @@ var ( Measurement: "KV Transactions", Unit: metric.Unit_COUNT, } + metaOnePhaseCommitSuccess = metric.Metadata{ + Name: "txn.server_side.1PC.success", + Help: "Number of batches that attempted to commit using 1PC and succeeded", + Measurement: "KV Transactions", + Unit: metric.Unit_COUNT, + } + metaOnePhaseCommitFailure = metric.Metadata{ + Name: "txn.server_side.1PC.failure", + Help: "Number of batches that attempted to commit using 1PC and failed", + Measurement: "KV Transactions", + Unit: metric.Unit_COUNT, + } //Ingest metrics metaIngestCount = metric.Metadata{ @@ -2357,6 +2369,8 @@ type StoreMetrics struct { ReadEvaluationServerSideRetryFailure *metric.Counter ReadWithinUncertaintyIntervalErrorServerSideRetrySuccess *metric.Counter ReadWithinUncertaintyIntervalErrorServerSideRetryFailure *metric.Counter + OnePhaseCommitSuccess *metric.Counter + OnePhaseCommitFailure *metric.Counter // Storage (pebble) metrics. Some are named RocksDB which is what we used // before pebble, and this name is kept for backwards compatibility despite @@ -3035,6 +3049,8 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics { ReadEvaluationServerSideRetryFailure: metric.NewCounter(metaReadEvaluationServerSideRetryFailure), ReadWithinUncertaintyIntervalErrorServerSideRetrySuccess: metric.NewCounter(metaReadWithinUncertaintyIntervalErrorServerSideRetrySuccess), ReadWithinUncertaintyIntervalErrorServerSideRetryFailure: metric.NewCounter(metaReadWithinUncertaintyIntervalErrorServerSideRetryFailure), + OnePhaseCommitSuccess: metric.NewCounter(metaOnePhaseCommitSuccess), + OnePhaseCommitFailure: metric.NewCounter(metaOnePhaseCommitFailure), // Pebble metrics. // diff --git a/pkg/kv/kvserver/replica_test.go b/pkg/kv/kvserver/replica_test.go index 564bbd121e4a..31090f854361 100644 --- a/pkg/kv/kvserver/replica_test.go +++ b/pkg/kv/kvserver/replica_test.go @@ -11222,9 +11222,14 @@ func TestReplicaAsyncIntentResolutionOn1PC(t *testing.T) { Knobs: base.TestingKnobs{Store: &storeKnobs}}) defer s.Stopper().Stop(ctx) + store, err := s.GetStores().(*Stores).GetStore(1) + require.NoError(t, err) + successfulOnePCBefore := store.Metrics().OnePhaseCommitSuccess.Count() + failedOnePCBefore := store.Metrics().OnePhaseCommitFailure.Count() + // Perform a range split between key A and B. keyA, keyB := roachpb.Key("a"), roachpb.Key("b") - _, _, err := s.SplitRange(keyB) + _, _, err = s.SplitRange(keyB) require.NoError(t, err) // Write a value to a key A and B. @@ -11248,12 +11253,18 @@ func TestReplicaAsyncIntentResolutionOn1PC(t *testing.T) { require.NoError(t, err) // Update the locked value and commit in a single batch. This should hit the - // one-phase commit fast-path and then release the "for update" lock(s). + // one-phase commit fast-path (verified below) and then release the + // "for update" lock(s). b = txn.NewBatch() b.Inc(keyA, 1) err = txn.CommitInBatch(ctx, b) require.NoError(t, err) + successfulOnePCAfter := store.Metrics().OnePhaseCommitSuccess.Count() + failedOnePCAfter := store.Metrics().OnePhaseCommitFailure.Count() + require.Equal(t, failedOnePCBefore, failedOnePCAfter) + require.Greater(t, successfulOnePCAfter, successfulOnePCBefore) + // If an external lock was acquired, we should see its resolution. if external { riReq := <-resIntentC diff --git a/pkg/kv/kvserver/replica_write.go b/pkg/kv/kvserver/replica_write.go index f9d0e61bdf67..b67697a1c813 100644 --- a/pkg/kv/kvserver/replica_write.go +++ b/pkg/kv/kvserver/replica_write.go @@ -503,9 +503,12 @@ func (r *Replica) evaluate1PC( var batch storage.Batch defer func() { // Close the batch unless it's passed to the caller (when the evaluation - // succeeds). + // succeeds). Also increment metrics. if onePCRes.success != onePCSucceeded { batch.Close() + r.store.Metrics().OnePhaseCommitFailure.Inc(1) + } else { + r.store.Metrics().OnePhaseCommitSuccess.Inc(1) } }()