104784: kv/concurrency: batch intent resolution of pushed intents from same txn r=arulajmani a=nvanbenschoten

Fixes cockroachdb#103126.

This commit extends the infrastructure introduced in cockroachdb#49218 for transaction timestamp pushes. It avoids redundant txn pushes of PENDING transactions and batches the resolution of PENDING intents. This breaks the O(num_intents) work performed by high-priority scans (e.g. backups) over intent-heavy keyspaces into something closer to O(num_ranges) work.

The commit accomplishes its goals by adding a second per-Range LRU cache of transactions that are PENDING and are known to have been pushed to higher timestamps. We use this cache for two purposes:

1. when we are a non-locking read and we see a lock at a conflicting timestamp that is held by a pushed txn above our read timestamp, we neither wait out the kv.lock_table.coordinator_liveness_push_delay (50 ms) nor push the transaction's record (an RPC to the leaseholder of the pushee's txn record range).
2. we use the existence of a transaction in the cache as an indication that it may have written multiple intents, so we begin deferring intent resolution to enable batching (see the sketch after this list).
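
As a rough, hedged illustration of the idea (this is not CockroachDB's actual txnStatusCache; the names, types, and integer timestamps below are simplified assumptions), the following Go sketch maintains a small LRU of transactions known to be PENDING but pushed above some timestamp, which a non-locking reader can consult before deciding whether to push or to defer and batch resolution:

```go
package main

import (
	"container/list"
	"fmt"
)

// pushedTxnCache is a hedged sketch of a small per-Range LRU cache of
// transactions that are known to be PENDING but already pushed to a higher
// timestamp. Names and types are illustrative, not CockroachDB's.
type pushedTxnCache struct {
	capacity int
	order    *list.List               // front = most recently used
	entries  map[string]*list.Element // txn ID -> element holding *pushedEntry
}

type pushedEntry struct {
	txnID    string
	pushedTS int64 // timestamp the txn is known to have been pushed above
}

func newPushedTxnCache(capacity int) *pushedTxnCache {
	return &pushedTxnCache{
		capacity: capacity,
		order:    list.New(),
		entries:  make(map[string]*list.Element),
	}
}

// add records that txnID is PENDING and pushed to at least pushedTS,
// evicting the least recently used entry if the cache is full.
func (c *pushedTxnCache) add(txnID string, pushedTS int64) {
	if el, ok := c.entries[txnID]; ok {
		e := el.Value.(*pushedEntry)
		if pushedTS > e.pushedTS {
			e.pushedTS = pushedTS
		}
		c.order.MoveToFront(el)
		return
	}
	if c.order.Len() >= c.capacity {
		oldest := c.order.Back()
		c.order.Remove(oldest)
		delete(c.entries, oldest.Value.(*pushedEntry).txnID)
	}
	c.entries[txnID] = c.order.PushFront(&pushedEntry{txnID: txnID, pushedTS: pushedTS})
}

// pushedAbove reports whether txnID is known to have been pushed above readTS.
// A non-locking reader that gets true can skip the liveness-push delay and the
// PushTxn RPC, and instead defer the intent for batched resolution.
func (c *pushedTxnCache) pushedAbove(txnID string, readTS int64) bool {
	el, ok := c.entries[txnID]
	if !ok {
		return false
	}
	c.order.MoveToFront(el)
	return el.Value.(*pushedEntry).pushedTS > readTS
}

func main() {
	cache := newPushedTxnCache(2)
	cache.add("txn-a", 105) // txn-a was previously pushed to ts=105
	fmt.Println(cache.pushedAbove("txn-a", 100)) // true: skip push, defer resolution
	fmt.Println(cache.pushedAbove("txn-b", 100)) // false: must push normally
}
```

Because the cache is per-Range and bounded, a miss simply falls back to the normal push path; the savings come from repeated conflicts with intents of the same pushed transaction.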

Together, these two changes make us much more effective at handling pushed transactions that hold a large number of intents. The following example (from cockroachdb#103126) demonstrates this:
```sql
-- SETUP: run in a 3-node GCP roachprod cluster

--- session 1 - write 100k intents
CREATE TABLE keys (k BIGINT NOT NULL PRIMARY KEY);
BEGIN; INSERT INTO keys SELECT generate_series(1, 100000);

--- session 2 - push intents with high-priority txn without uncertainty interval
BEGIN PRIORITY HIGH AS OF SYSTEM TIME '-1ms';
SELECT count(*) FROM keys;

--- BEFORE this PR and before cockroachdb#103265 (i.e. v23.1.2): takes ~7.1ms per intent
Time: 714.441s total

--- BEFORE this PR: takes ~1.5ms per intent
Time: 151.880s total

--- AFTER this PR: takes ~24μs per intent
Time: 2.405s
```

The change does have an unfortunate limitation: deferred intent resolution is currently only enabled for non-locking readers without uncertainty intervals. Readers with uncertainty intervals must contend with the possibility of pushing a conflicting intent up into their uncertainty interval and creating more work for themselves. The lockTableWaiter avoids this with care, but it is difficult to coordinate through the txnStatusCache. The limitation is acceptable because the most important case here is optimizing the Export requests issued by backups.
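
To make that gating concrete, here is a minimal, hedged Go sketch of the condition described above (illustrative names only, not the lockTableWaiter's real API):

```go
package main

import "fmt"

// canDeferResolution is a hypothetical helper showing the gating described
// above: deferred, batched resolution of a pushed PENDING txn's intents is
// only used by non-locking readers without an uncertainty interval.
func canDeferResolution(lockingRead, hasUncertaintyInterval, holderPushedAboveReadTS bool) bool {
	return !lockingRead && !hasUncertaintyInterval && holderPushedAboveReadTS
}

func main() {
	// A backup's Export request: non-locking and no uncertainty interval.
	fmt.Println(canDeferResolution(false, false, true)) // true
	// A regular read with an uncertainty interval must still take the careful
	// per-intent path handled by the lockTableWaiter.
	fmt.Println(canDeferResolution(false, true, true)) // false
}
```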

This limitation also hints at the long-term plan for this interaction, which is that non-locking readers can ignore known pending intents without the need to even resolve those intents (see cockroachdb#94730). This will require a request-scoped cache of pending, pushed transactions, which does not have the same problems with uncertainty intervals.

Release note (performance improvement): Backups no longer perform work proportional to the number of pending intents that they encounter, so they are over 100x faster when encountering long-running, bulk-writing transactions.

Co-authored-by: Arul Ajmani <[email protected]>
Co-authored-by: Nathan VanBenschoten <[email protected]>
3 people committed Jun 20, 2023
2 parents 7cbcd3f + 71d6e46 commit 76da6c7
Showing 14 changed files with 1,564 additions and 118 deletions.
21 changes: 11 additions & 10 deletions pkg/kv/kvserver/concurrency/concurrency_control.go
@@ -617,10 +617,10 @@ type lockTable interface {
 // evaluation of this request. It adds the lock and enqueues this requester
 // in its wait-queue. It is required that request evaluation discover such
 // locks before acquiring its own locks, since the request needs to repeat
-// ScanAndEnqueue. When consultFinalizedTxnCache=true, and the transaction
-// holding the lock is finalized, the lock is not added to the lock table
-// and instead tracked in the list of locks to resolve in the
-// lockTableGuard.
+// ScanAndEnqueue. When consultTxnStatusCache=true, and the transaction
+// holding the lock is known to be pushed or finalized, the lock is not added
+// to the lock table and instead tracked in the list of locks to resolve in
+// the lockTableGuard.
 //
 // The lease sequence is used to detect lease changes between the when
 // request that found the lock started evaluating and when the discovered
@@ -637,7 +637,7 @@ type lockTable interface {
 // true) or whether it was ignored because the lockTable is currently
 // disabled (false).
 AddDiscoveredLock(
-intent *roachpb.Intent, seq roachpb.LeaseSequence, consultFinalizedTxnCache bool,
+intent *roachpb.Intent, seq roachpb.LeaseSequence, consultTxnStatusCache bool,
 guard lockTableGuard) (bool, error)

 // AcquireLock informs the lockTable that a new lock was acquired or an
@@ -722,11 +722,12 @@ type lockTable interface {
 // txn.WriteTimestamp.
 UpdateLocks(*roachpb.LockUpdate) error

-// TransactionIsFinalized informs the lock table that a transaction is
-// finalized. This is used by the lock table in a best-effort manner to avoid
-// waiting on locks of finalized transactions and telling the caller via
-// lockTableGuard.ResolveBeforeEvaluation to resolve a batch of intents.
-TransactionIsFinalized(*roachpb.Transaction)
+// PushedTransactionUpdated informs the lock table that a transaction has been
+// pushed and is either finalized or has been moved to a higher timestamp.
+// This is used by the lock table in a best-effort manner to avoid waiting on
+// locks of finalized or pushed transactions and telling the caller via
+// lockTableGuard.ResolveBeforeScanning to resolve a batch of intents.
+PushedTransactionUpdated(*roachpb.Transaction)

 // QueryLockTableState returns detailed metadata on locks managed by the lockTable.
 QueryLockTableState(span roachpb.Span, opts QueryLockTableOptions) ([]roachpb.LockStateInfo, QueryLockTableResumeState)
39 changes: 23 additions & 16 deletions pkg/kv/kvserver/concurrency/concurrency_manager.go
@@ -86,23 +86,37 @@ var MaxLockWaitQueueLength = settings.RegisterIntSetting(
 },
 )

-// DiscoveredLocksThresholdToConsultFinalizedTxnCache sets a threshold as
-// mentioned in the description string. The default of 200 is somewhat
-// arbitrary but should suffice for small OLTP transactions. Given the default
+// DiscoveredLocksThresholdToConsultTxnStatusCache sets a threshold as mentioned
+// in the description string. The default of 200 is somewhat arbitrary but
+// should suffice for small OLTP transactions. Given the default
 // 10,000 lock capacity of the lock table, 200 is small enough to not matter
 // much against the capacity, which is desirable. We have seen examples with
 // discoveredCount > 100,000, caused by stats collection, where we definitely
 // want to avoid adding these locks to the lock table, if possible.
-var DiscoveredLocksThresholdToConsultFinalizedTxnCache = settings.RegisterIntSetting(
+var DiscoveredLocksThresholdToConsultTxnStatusCache = settings.RegisterIntSetting(
 settings.SystemOnly,
+// NOTE: the name of this setting mentions "finalized" for historical reasons.
 "kv.lock_table.discovered_locks_threshold_for_consulting_finalized_txn_cache",
-"the maximum number of discovered locks by a waiter, above which the finalized txn cache"+
+"the maximum number of discovered locks by a waiter, above which the txn status cache"+
 "is consulted and resolvable locks are not added to the lock table -- this should be a small"+
 "fraction of the maximum number of locks in the lock table",
 200,
 settings.NonNegativeInt,
 )

+// BatchPushedLockResolution controls whether the lock table should allow
+// non-locking readers to defer and batch the resolution of conflicting locks
+// whose holder is known to be pending and have been pushed above the reader's
+// timestamp.
+var BatchPushedLockResolution = settings.RegisterBoolSetting(
+settings.SystemOnly,
+"kv.lock_table.batch_pushed_lock_resolution.enabled",
+"whether the lock table should allow non-locking readers to defer and batch the resolution of "+
+"conflicting locks whose holder is known to be pending and have been pushed above the reader's "+
+"timestamp",
+true,
+)
+
 // managerImpl implements the Manager interface.
 type managerImpl struct {
 st *cluster.Settings
@@ -146,7 +160,7 @@ func (c *Config) initDefaults() {
 func NewManager(cfg Config) Manager {
 cfg.initDefaults()
 m := new(managerImpl)
-lt := newLockTable(cfg.MaxLockTableSize, cfg.RangeDesc.RangeID, cfg.Clock)
+lt := newLockTable(cfg.MaxLockTableSize, cfg.RangeDesc.RangeID, cfg.Clock, cfg.Settings)
 *m = managerImpl{
 st: cfg.Settings,
 // TODO(nvanbenschoten): move pkg/storage/spanlatch to a new
@@ -465,11 +479,11 @@ func (m *managerImpl) HandleWriterIntentError(
 //
 // Either way, there is no possibility of the request entering an infinite
 // loop without making progress.
-consultFinalizedTxnCache :=
-int64(len(t.Intents)) > DiscoveredLocksThresholdToConsultFinalizedTxnCache.Get(&m.st.SV)
+consultTxnStatusCache :=
+int64(len(t.Intents)) > DiscoveredLocksThresholdToConsultTxnStatusCache.Get(&m.st.SV)
 for i := range t.Intents {
 intent := &t.Intents[i]
-added, err := m.lt.AddDiscoveredLock(intent, seq, consultFinalizedTxnCache, g.ltg)
+added, err := m.lt.AddDiscoveredLock(intent, seq, consultTxnStatusCache, g.ltg)
 if err != nil {
 log.Fatalf(ctx, "%v", err)
 }
@@ -634,13 +648,6 @@ func (m *managerImpl) TestingSetMaxLocks(maxLocks int64) {
 m.lt.(*lockTableImpl).setMaxLocks(maxLocks)
 }

-func (r *Request) txnMeta() *enginepb.TxnMeta {
-if r.Txn == nil {
-return nil
-}
-return &r.Txn.TxnMeta
-}
-
 func (r *Request) isSingle(m kvpb.Method) bool {
 if len(r.Requests) != 1 {
 return false
21 changes: 16 additions & 5 deletions pkg/kv/kvserver/concurrency/concurrency_manager_test.go
@@ -89,7 +89,8 @@ import (
 // debug-disable-txn-pushes
 // debug-set-clock ts=<secs>
 // debug-advance-clock ts=<secs>
-// debug-set-discovered-locks-threshold-to-consult-finalized-txn-cache n=<count>
+// debug-set-discovered-locks-threshold-to-consult-txn-status-cache n=<count>
+// debug-set-batch-pushed-lock-resolution-enabled ok=<enabled>
 // debug-set-max-locks n=<count>
 // reset
 func TestConcurrencyManagerBasic(t *testing.T) {
@@ -571,10 +572,16 @@ func TestConcurrencyManagerBasic(t *testing.T) {
 c.manual.Advance(time.Duration(secs) * time.Second)
 return ""

-case "debug-set-discovered-locks-threshold-to-consult-finalized-txn-cache":
+case "debug-set-discovered-locks-threshold-to-consult-txn-status-cache":
 var n int
 d.ScanArgs(t, "n", &n)
-c.setDiscoveredLocksThresholdToConsultFinalizedTxnCache(n)
+c.setDiscoveredLocksThresholdToConsultTxnStatusCache(n)
 return ""

+case "debug-set-batch-pushed-lock-resolution-enabled":
+var ok bool
+d.ScanArgs(t, "ok", &ok)
+c.setBatchPushedLockResolutionEnabled(ok)
+return ""
+
 case "debug-set-max-locks":
@@ -954,8 +961,12 @@ func (c *cluster) disableTxnPushes() {
 concurrency.LockTableDeadlockDetectionPushDelay.Override(context.Background(), &c.st.SV, time.Hour)
 }

-func (c *cluster) setDiscoveredLocksThresholdToConsultFinalizedTxnCache(n int) {
-concurrency.DiscoveredLocksThresholdToConsultFinalizedTxnCache.Override(context.Background(), &c.st.SV, int64(n))
+func (c *cluster) setDiscoveredLocksThresholdToConsultTxnStatusCache(n int) {
+concurrency.DiscoveredLocksThresholdToConsultTxnStatusCache.Override(context.Background(), &c.st.SV, int64(n))
 }
+
+func (c *cluster) setBatchPushedLockResolutionEnabled(ok bool) {
+concurrency.BatchPushedLockResolution.Override(context.Background(), &c.st.SV, ok)
+}

 // reset clears all request state in the cluster. This avoids portions of tests