From e61f04ab52ec92cdfd86b06b1ed3b47168cdd0f1 Mon Sep 17 00:00:00 2001 From: Mira Radeva Date: Wed, 1 May 2024 00:55:11 +0000 Subject: [PATCH] kvcoord: observability for in-flight writes and locking reads over intent budget Since #121088, in-flight writes can include locking reads; because we don't estimate the size of the locks accurately for ranged locking reads, it is possible that in-flight writes exceed the max intent tracking budget (`kv.transaction.max_intents_bytes`). That's fine for now, but in this patch we add some observability to be aware of this happening. Fixes: #121471 Release note: None --- docs/generated/metrics/metrics.html | 1 + pkg/kv/kvclient/kvcoord/txn_coord_sender.go | 9 +++-- .../kvcoord/txn_coord_sender_factory.go | 38 ++++++++++--------- .../kvcoord/txn_interceptor_pipeliner.go | 31 +++++++++++---- .../kvcoord/txn_interceptor_pipeliner_test.go | 9 +++-- pkg/kv/kvclient/kvcoord/txn_metrics.go | 21 +++++++--- 6 files changed, 70 insertions(+), 39 deletions(-) diff --git a/docs/generated/metrics/metrics.html b/docs/generated/metrics/metrics.html index 5b513bbc1a42..33caa406cd89 100644 --- a/docs/generated/metrics/metrics.html +++ b/docs/generated/metrics/metrics.html @@ -1596,6 +1596,7 @@ APPLICATIONtxn.condensed_intent_spans_gaugeKV transactions currently running that have exceeded their intent tracking memory budget (kv.transaction.max_intents_bytes). See also txn.condensed_intent_spans for a perpetual counter/rate.KV TransactionsGAUGECOUNTAVGNONE APPLICATIONtxn.condensed_intent_spans_rejectedKV transactions that have been aborted because they exceeded their intent tracking memory budget (kv.transaction.max_intents_bytes). Rejection is caused by kv.transaction.reject_over_max_intents_budget.KV TransactionsCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE APPLICATIONtxn.durationsKV transaction durationsKV Txn DurationHISTOGRAMNANOSECONDSAVGNONE +APPLICATIONtxn.inflight_locks_over_tracking_budgetKV transactions whose in-flight writes and locking reads have exceeded the intent tracking memory budget (kv.transaction.max_intents_bytes).KV TransactionsCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE APPLICATIONtxn.parallelcommitsNumber of KV transaction parallel commitsKV TransactionsCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE APPLICATIONtxn.parallelcommits.auto_retriesNumber of commit tries after successful failed parallel commit attemptsRetriesCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE APPLICATIONtxn.refresh.auto_retriesNumber of request retries after successful client-side refreshesRetriesCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE diff --git a/pkg/kv/kvclient/kvcoord/txn_coord_sender.go b/pkg/kv/kvclient/kvcoord/txn_coord_sender.go index 3956bbff5d22..ac1c2b4d27de 100644 --- a/pkg/kv/kvclient/kvcoord/txn_coord_sender.go +++ b/pkg/kv/kvclient/kvcoord/txn_coord_sender.go @@ -310,10 +310,11 @@ func (tc *TxnCoordSender) initCommonInterceptors( riGen.ds = ds } tc.interceptorAlloc.txnPipeliner = txnPipeliner{ - st: tcf.st, - riGen: riGen, - txnMetrics: &tc.metrics, - condensedIntentsEveryN: &tc.TxnCoordSenderFactory.condensedIntentsEveryN, + st: tcf.st, + riGen: riGen, + txnMetrics: &tc.metrics, + condensedIntentsEveryN: &tc.TxnCoordSenderFactory.condensedIntentsEveryN, + inflightOverBudgetEveryN: &tc.TxnCoordSenderFactory.inflightOverBudgetEveryN, } tc.interceptorAlloc.txnSpanRefresher = txnSpanRefresher{ st: tcf.st, diff --git a/pkg/kv/kvclient/kvcoord/txn_coord_sender_factory.go b/pkg/kv/kvclient/kvcoord/txn_coord_sender_factory.go index 306a8c5521d6..a472280b257f 100644 --- a/pkg/kv/kvclient/kvcoord/txn_coord_sender_factory.go +++ b/pkg/kv/kvclient/kvcoord/txn_coord_sender_factory.go @@ -27,14 +27,15 @@ import ( type TxnCoordSenderFactory struct { log.AmbientContext - st *cluster.Settings - wrapped kv.Sender - clock *hlc.Clock - heartbeatInterval time.Duration - linearizable bool // enables linearizable behavior - stopper *stop.Stopper - metrics TxnMetrics - condensedIntentsEveryN log.EveryN + st *cluster.Settings + wrapped kv.Sender + clock *hlc.Clock + heartbeatInterval time.Duration + linearizable bool // enables linearizable behavior + stopper *stop.Stopper + metrics TxnMetrics + condensedIntentsEveryN log.EveryN + inflightOverBudgetEveryN log.EveryN testingKnobs ClientTestingKnobs } @@ -64,16 +65,17 @@ func NewTxnCoordSenderFactory( cfg TxnCoordSenderFactoryConfig, wrapped kv.Sender, ) *TxnCoordSenderFactory { tcf := &TxnCoordSenderFactory{ - AmbientContext: cfg.AmbientCtx, - st: cfg.Settings, - wrapped: wrapped, - clock: cfg.Clock, - stopper: cfg.Stopper, - linearizable: cfg.Linearizable, - heartbeatInterval: cfg.HeartbeatInterval, - metrics: cfg.Metrics, - condensedIntentsEveryN: log.Every(time.Second), - testingKnobs: cfg.TestingKnobs, + AmbientContext: cfg.AmbientCtx, + st: cfg.Settings, + wrapped: wrapped, + clock: cfg.Clock, + stopper: cfg.Stopper, + linearizable: cfg.Linearizable, + heartbeatInterval: cfg.HeartbeatInterval, + metrics: cfg.Metrics, + condensedIntentsEveryN: log.Every(time.Second), + inflightOverBudgetEveryN: log.Every(time.Second), + testingKnobs: cfg.TestingKnobs, } if tcf.st == nil { tcf.st = cluster.MakeTestingClusterSettings() diff --git a/pkg/kv/kvclient/kvcoord/txn_interceptor_pipeliner.go b/pkg/kv/kvclient/kvcoord/txn_interceptor_pipeliner.go index b39aedaeee55..c978b9e443fe 100644 --- a/pkg/kv/kvclient/kvcoord/txn_interceptor_pipeliner.go +++ b/pkg/kv/kvclient/kvcoord/txn_interceptor_pipeliner.go @@ -221,12 +221,13 @@ var rejectTxnOverTrackedWritesBudget = settings.RegisterBoolSetting( // attached to any end transaction request that is passed through the pipeliner // to ensure that they the locks within them are released. type txnPipeliner struct { - st *cluster.Settings - riGen rangeIteratorFactory // used to condense lock spans, if provided - wrapped lockedSender - disabled bool - txnMetrics *TxnMetrics - condensedIntentsEveryN *log.EveryN + st *cluster.Settings + riGen rangeIteratorFactory // used to condense lock spans, if provided + wrapped lockedSender + disabled bool + txnMetrics *TxnMetrics + condensedIntentsEveryN *log.EveryN + inflightOverBudgetEveryN *log.EveryN // In-flight writes are intent point writes that have not yet been proved // to have succeeded. They will need to be proven before the transaction @@ -686,10 +687,26 @@ func (tp *txnPipeliner) updateLockTracking( return err } + // Because the in-flight writes can include locking reads, and because we + // don't estimate the size of the locks accurately for ranged locking reads, + // it is possible that ifWrites have exceeded the maxBytes threshold. That's + // fine for now, but we add some observability to be aware of this happening. + if tp.ifWrites.byteSize() > maxBytes { + if tp.inflightOverBudgetEveryN.ShouldLog() || log.ExpensiveLogEnabled(ctx, 2) { + log.Warningf(ctx, "a transaction's in-flight writes and locking reads have "+ + "exceeded the intent tracking limit (kv.transaction.max_intents_bytes). "+ + "in-flight writes and locking reads size: %d bytes, txn: %s, ba: %s", + tp.ifWrites.byteSize(), ba.Txn, ba.Summary()) + } + tp.txnMetrics.TxnsInFlightLocksOverTrackingBudget.Inc(1) + } + // Deal with compacting the lock spans. // Compute how many bytes are left for locks after accounting for the - // in-flight writes. + // in-flight writes. It's possible that locksBudget is negative, but the + // remainder of this function and maybeCondense handle this case (each span + // will be maximally condensed). locksBudget := maxBytes - tp.ifWrites.byteSize() // If we're below budget, there's nothing more to do. if tp.lockFootprint.bytesSize() <= locksBudget { diff --git a/pkg/kv/kvclient/kvcoord/txn_interceptor_pipeliner_test.go b/pkg/kv/kvclient/kvcoord/txn_interceptor_pipeliner_test.go index 4066a7211719..42ff77048113 100644 --- a/pkg/kv/kvclient/kvcoord/txn_interceptor_pipeliner_test.go +++ b/pkg/kv/kvclient/kvcoord/txn_interceptor_pipeliner_test.go @@ -89,10 +89,11 @@ func makeMockTxnPipeliner(iter condensableSpanSetRangeIterator) (txnPipeliner, * metrics := MakeTxnMetrics(time.Hour) everyN := log.Every(time.Hour) return txnPipeliner{ - st: cluster.MakeTestingClusterSettings(), - wrapped: mockSender, - txnMetrics: &metrics, - condensedIntentsEveryN: &everyN, + st: cluster.MakeTestingClusterSettings(), + wrapped: mockSender, + txnMetrics: &metrics, + condensedIntentsEveryN: &everyN, + inflightOverBudgetEveryN: &everyN, riGen: rangeIteratorFactory{ factory: func() condensableSpanSetRangeIterator { return iter diff --git a/pkg/kv/kvclient/kvcoord/txn_metrics.go b/pkg/kv/kvclient/kvcoord/txn_metrics.go index b9be7d73fac5..95f152c58912 100644 --- a/pkg/kv/kvclient/kvcoord/txn_metrics.go +++ b/pkg/kv/kvclient/kvcoord/txn_metrics.go @@ -36,9 +36,10 @@ type TxnMetrics struct { Durations metric.IHistogram - TxnsWithCondensedIntents *metric.Counter - TxnsWithCondensedIntentsGauge *metric.Gauge - TxnsRejectedByLockSpanBudget *metric.Counter + TxnsWithCondensedIntents *metric.Counter + TxnsWithCondensedIntentsGauge *metric.Gauge + TxnsRejectedByLockSpanBudget *metric.Counter + TxnsInFlightLocksOverTrackingBudget *metric.Counter // Restarts is the number of times we had to restart the transaction. Restarts metric.IHistogram @@ -179,6 +180,13 @@ var ( Measurement: "KV Transactions", Unit: metric.Unit_COUNT, } + metaTxnsInflightLocksOverTrackingBudget = metric.Metadata{ + Name: "txn.inflight_locks_over_tracking_budget", + Help: "KV transactions whose in-flight writes and locking reads have exceeded " + + "the intent tracking memory budget (kv.transaction.max_intents_bytes).", + Measurement: "KV Transactions", + Unit: metric.Unit_COUNT, + } metaRestartsHistogram = metric.Metadata{ Name: "txn.restarts", @@ -298,9 +306,10 @@ func MakeTxnMetrics(histogramWindow time.Duration) TxnMetrics { Duration: histogramWindow, BucketConfig: metric.IOLatencyBuckets, }), - TxnsWithCondensedIntents: metric.NewCounter(metaTxnsWithCondensedIntentSpans), - TxnsWithCondensedIntentsGauge: metric.NewGauge(metaTxnsWithCondensedIntentSpansGauge), - TxnsRejectedByLockSpanBudget: metric.NewCounter(metaTxnsRejectedByLockSpanBudget), + TxnsWithCondensedIntents: metric.NewCounter(metaTxnsWithCondensedIntentSpans), + TxnsWithCondensedIntentsGauge: metric.NewGauge(metaTxnsWithCondensedIntentSpansGauge), + TxnsRejectedByLockSpanBudget: metric.NewCounter(metaTxnsRejectedByLockSpanBudget), + TxnsInFlightLocksOverTrackingBudget: metric.NewCounter(metaTxnsInflightLocksOverTrackingBudget), Restarts: metric.NewHistogram(metric.HistogramOptions{ Metadata: metaRestartsHistogram, Duration: histogramWindow,