kv: gate batched resolution of pushed locks behind a cluster setting
This commit adds a new `kv.lock_table.batch_pushed_lock_resolution.enabled` cluster
setting which controls whether the lock table should allow non-locking readers
to defer and batch the resolution of conflicting locks whose holder is known to
be pending and have been pushed above the reader's timestamp.

This is a safeguard against bugs or behavior changes as we quickly backport a
fix for cockroachdb#103126.

Epic: None
Release note: None
nvanbenschoten committed Jun 19, 2023
1 parent 23ee0a9 commit 71d6e46
Showing 5 changed files with 386 additions and 2 deletions.
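As a usage sketch (not part of this commit), the new setting can be toggled at runtime over a SQL connection; the connection URL and the lib/pq driver choice below are illustrative assumptions:

package main

import (
	"database/sql"
	"log"

	_ "github.com/lib/pq" // any pgwire-compatible driver works with CockroachDB
)

func main() {
	// Placeholder connection string; adjust for the target cluster.
	db, err := sql.Open("postgres",
		"postgresql://root@localhost:26257/defaultdb?sslmode=disable")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// Disable deferred, batched resolution of pushed locks, reverting
	// non-locking readers to resolving one conflicting lock at a time.
	if _, err := db.Exec(
		"SET CLUSTER SETTING kv.lock_table.batch_pushed_lock_resolution.enabled = false",
	); err != nil {
		log.Fatal(err)
	}
}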
13 changes: 13 additions & 0 deletions pkg/kv/kvserver/concurrency/concurrency_manager.go
@@ -104,6 +104,19 @@ var DiscoveredLocksThresholdToConsultTxnStatusCache = settings.RegisterIntSetting(
settings.NonNegativeInt,
)

// BatchPushedLockResolution controls whether the lock table should allow
// non-locking readers to defer and batch the resolution of conflicting locks
// whose holder is known to be pending and have been pushed above the reader's
// timestamp.
var BatchPushedLockResolution = settings.RegisterBoolSetting(
settings.SystemOnly,
"kv.lock_table.batch_pushed_lock_resolution.enabled",
"whether the lock table should allow non-locking readers to defer and batch the resolution of "+
"conflicting locks whose holder is known to be pending and have been pushed above the reader's "+
"timestamp",
true,
)

// managerImpl implements the Manager interface.
type managerImpl struct {
st *cluster.Settings
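For orientation, a minimal sketch (assuming the cockroach module is importable; not part of the diff) of how a registered BoolSetting like the one above is read and, in tests, overridden:

package main

import (
	"context"
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
)

func main() {
	ctx := context.Background()
	st := cluster.MakeTestingClusterSettings()

	// The setting defaults to true (see the registration above).
	fmt.Println(concurrency.BatchPushedLockResolution.Get(&st.SV)) // true

	// Tests pin a value with Override, which is exactly what the new
	// debug directive in the test file below does.
	concurrency.BatchPushedLockResolution.Override(ctx, &st.SV, false)
	fmt.Println(concurrency.BatchPushedLockResolution.Get(&st.SV)) // false
}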
11 changes: 11 additions & 0 deletions pkg/kv/kvserver/concurrency/concurrency_manager_test.go
@@ -90,6 +90,7 @@ import (
// debug-set-clock ts=<secs>
// debug-advance-clock ts=<secs>
// debug-set-discovered-locks-threshold-to-consult-txn-status-cache n=<count>
// debug-set-batch-pushed-lock-resolution-enabled ok=<enabled>
// debug-set-max-locks n=<count>
// reset
func TestConcurrencyManagerBasic(t *testing.T) {
@@ -577,6 +578,12 @@
c.setDiscoveredLocksThresholdToConsultTxnStatusCache(n)
return ""

case "debug-set-batch-pushed-lock-resolution-enabled":
var ok bool
d.ScanArgs(t, "ok", &ok)
c.setBatchPushedLockResolutionEnabled(ok)
return ""

case "debug-set-max-locks":
var n int
d.ScanArgs(t, "n", &n)
@@ -958,6 +965,10 @@ func (c *cluster) setDiscoveredLocksThresholdToConsultTxnStatusCache(n int) {
concurrency.DiscoveredLocksThresholdToConsultTxnStatusCache.Override(context.Background(), &c.st.SV, int64(n))
}

func (c *cluster) setBatchPushedLockResolutionEnabled(ok bool) {
concurrency.BatchPushedLockResolution.Override(context.Background(), &c.st.SV, ok)
}

// reset clears all request state in the cluster. This avoids portions of tests
// leaking into one another and also serves as an assertion that a sequence of
// commands has completed without leaking any requests.
11 changes: 9 additions & 2 deletions pkg/kv/kvserver/concurrency/lock_table.go
@@ -1734,7 +1734,7 @@ func (l *lockState) tryActiveWait(
// lockTableWaiter but difficult to coordinate through the txnStatusCache.
// This limitation is acceptable because the most important case here is
// optimizing the Export requests issued by backup.
-	if !g.hasUncertaintyInterval() {
+	if !g.hasUncertaintyInterval() && g.lt.batchPushedLockResolution() {
pushedTxn, ok := g.lt.txnStatusCache.pendingTxns.get(lockHolderTxn.ID)
if ok && g.ts.Less(pushedTxn.WriteTimestamp) {
up := roachpb.MakeLockUpdate(pushedTxn, roachpb.Span{Key: l.key})
@@ -2883,7 +2883,7 @@ func (t *lockTableImpl) AddDiscoveredLock(
// holder is known to have been pushed above the reader's timestamp. See the
// comment in tryActiveWait for more details, including why we include the
// hasUncertaintyInterval condition.
-	if str == lock.None && !g.hasUncertaintyInterval() {
+	if str == lock.None && !g.hasUncertaintyInterval() && t.batchPushedLockResolution() {
pushedTxn, ok := g.lt.txnStatusCache.pendingTxns.get(intent.Txn.ID)
if ok && g.ts.Less(pushedTxn.WriteTimestamp) {
g.toResolve = append(
@@ -3143,6 +3143,13 @@ func stepToNextSpan(g *lockTableGuardImpl) *roachpb.Span {
return nil
}

// batchPushedLockResolution returns whether non-locking readers can defer and
// batch resolution of conflicting locks whose holder is known to be pending and
// have been pushed above the reader's timestamp.
func (t *lockTableImpl) batchPushedLockResolution() bool {
return BatchPushedLockResolution.Get(&t.settings.SV)
}

// PushedTransactionUpdated implements the lockTable interface.
func (t *lockTableImpl) PushedTransactionUpdated(txn *roachpb.Transaction) {
// TODO(sumeer): We don't take any action for requests that are already
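To make the gated condition concrete, here is a self-contained sketch with hypothetical, simplified types (the real lockState/lockTableGuardImpl logic is more involved): a non-locking reader defers a conflicting lock into a resolution batch only when the setting is on, the reader has no uncertainty interval, and the holder is a pending transaction already pushed above the reader's timestamp; otherwise it waits and resolves eagerly.

package main

import "fmt"

// Hypothetical stand-ins; not CockroachDB's actual types.
type conflict struct {
	key           string
	holderPending bool // holder txn known to be PENDING
	pushedAbove   bool // holder pushed above the reader's timestamp
}

type reader struct {
	nonLocking     bool
	hasUncertainty bool
	batchingOn     bool // kv.lock_table.batch_pushed_lock_resolution.enabled
}

// shouldDefer mirrors the combined check at both call sites above:
// str == lock.None && !g.hasUncertaintyInterval() && batchPushedLockResolution().
func (r reader) shouldDefer(c conflict) bool {
	return r.nonLocking && !r.hasUncertainty && r.batchingOn &&
		c.holderPending && c.pushedAbove
}

func main() {
	r := reader{nonLocking: true, batchingOn: true}
	var toResolve []string
	for _, c := range []conflict{
		{key: "a", holderPending: true, pushedAbove: true},
		{key: "c", holderPending: true, pushedAbove: false},
	} {
		if r.shouldDefer(c) {
			toResolve = append(toResolve, c.key) // batch for later resolution
			continue
		}
		fmt.Printf("wait in %q's wait-queue, resolve eagerly\n", c.key)
	}
	fmt.Printf("deferred batch: %v\n", toResolve)
}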
@@ -319,3 +319,182 @@ num=0

reset namespace
----

# -------------------------------------------------------------------------
# The kv.lock_table.batch_pushed_lock_resolution.enabled cluster setting can
# be used to disable deferred lock resolution of pushed intents.
# -------------------------------------------------------------------------

debug-set-batch-pushed-lock-resolution-enabled ok=false
----

new-txn name=txn1 ts=10,1 epoch=0 priority=high
----

new-txn name=txn2 ts=10,1 epoch=0
----

new-txn name=txn3 ts=10,1 epoch=0
----

new-request name=req1 txn=txn1 ts=10,1
scan key=a endkey=z
----

sequence req=req1
----
[1] sequence req1: sequencing request
[1] sequence req1: acquiring latches
[1] sequence req1: scanning lock table for conflicting locks
[1] sequence req1: sequencing complete, returned guard

handle-write-intent-error req=req1 lease-seq=1
intent txn=txn2 key=a
intent txn=txn2 key=b
intent txn=txn2 key=c
intent txn=txn2 key=d
intent txn=txn2 key=e
intent txn=txn2 key=f
intent txn=txn2 key=g
intent txn=txn2 key=h
intent txn=txn2 key=i
intent txn=txn2 key=j
----
[2] handle write intent error req1: handled conflicting intents on ‹"a"›, ‹"b"›, ‹"c"›, ‹"d"›, ‹"e"›, ‹"f"›, ‹"g"›, ‹"h"›, ‹"i"›, ‹"j"›, released latches

debug-lock-table
----
num=10
lock: "a"
holder: txn: 00000002-0000-0000-0000-000000000000, ts: 10.000000000,1, info: repl epoch: 0, seqs: [0]
lock: "b"
holder: txn: 00000002-0000-0000-0000-000000000000, ts: 10.000000000,1, info: repl epoch: 0, seqs: [0]
lock: "c"
holder: txn: 00000002-0000-0000-0000-000000000000, ts: 10.000000000,1, info: repl epoch: 0, seqs: [0]
lock: "d"
holder: txn: 00000002-0000-0000-0000-000000000000, ts: 10.000000000,1, info: repl epoch: 0, seqs: [0]
lock: "e"
holder: txn: 00000002-0000-0000-0000-000000000000, ts: 10.000000000,1, info: repl epoch: 0, seqs: [0]
lock: "f"
holder: txn: 00000002-0000-0000-0000-000000000000, ts: 10.000000000,1, info: repl epoch: 0, seqs: [0]
lock: "g"
holder: txn: 00000002-0000-0000-0000-000000000000, ts: 10.000000000,1, info: repl epoch: 0, seqs: [0]
lock: "h"
holder: txn: 00000002-0000-0000-0000-000000000000, ts: 10.000000000,1, info: repl epoch: 0, seqs: [0]
lock: "i"
holder: txn: 00000002-0000-0000-0000-000000000000, ts: 10.000000000,1, info: repl epoch: 0, seqs: [0]
lock: "j"
holder: txn: 00000002-0000-0000-0000-000000000000, ts: 10.000000000,1, info: repl epoch: 0, seqs: [0]

# Before re-scanning and pushing, add a waiter on a single key to demonstrate
# that uncontended, replicated keys are released when pushed, while contended,
# replicated keys are not.
new-request name=req2 txn=txn3 ts=10,1
put key=c value=val
----

sequence req=req2
----
[3] sequence req2: sequencing request
[3] sequence req2: acquiring latches
[3] sequence req2: scanning lock table for conflicting locks
[3] sequence req2: waiting in lock wait-queues
[3] sequence req2: lock wait-queue event: wait for (distinguished) txn 00000002 holding lock @ key ‹"c"› (queuedWriters: 1, queuedReaders: 0)
[3] sequence req2: pushing after 0s for: liveness detection = true, deadlock detection = true, timeout enforcement = false, priority enforcement = false
[3] sequence req2: pushing txn 00000002 to abort
[3] sequence req2: blocked on select in concurrency_test.(*cluster).PushTransaction

# Now re-scan with the high-priority reader.
sequence req=req1
----
[4] sequence req1: re-sequencing request
[4] sequence req1: acquiring latches
[4] sequence req1: scanning lock table for conflicting locks
[4] sequence req1: waiting in lock wait-queues
[4] sequence req1: lock wait-queue event: wait for (distinguished) txn 00000002 holding lock @ key ‹"a"› (queuedWriters: 0, queuedReaders: 1)
[4] sequence req1: pushing after 0s for: liveness detection = true, deadlock detection = true, timeout enforcement = false, priority enforcement = true
[4] sequence req1: pushing timestamp of txn 00000002 above 10.000000000,1
[4] sequence req1: pusher pushed pushee to 10.000000000,2
[4] sequence req1: resolving intent ‹"a"› for txn 00000002 with PENDING status and clock observation {1 123.000000000,13}
[4] sequence req1: lock wait-queue event: wait for (distinguished) txn 00000002 holding lock @ key ‹"b"› (queuedWriters: 0, queuedReaders: 1)
[4] sequence req1: conflicted with ‹00000002-0000-0000-0000-000000000000› on ‹"a"› for 0.000s
[4] sequence req1: pushing after 0s for: liveness detection = true, deadlock detection = true, timeout enforcement = false, priority enforcement = true
[4] sequence req1: pushing timestamp of txn 00000002 above 10.000000000,1
[4] sequence req1: resolving intent ‹"b"› for txn 00000002 with PENDING status and clock observation {1 123.000000000,15}
[4] sequence req1: lock wait-queue event: wait for txn 00000002 holding lock @ key ‹"c"› (queuedWriters: 1, queuedReaders: 1)
[4] sequence req1: conflicted with ‹00000002-0000-0000-0000-000000000000› on ‹"b"› for 0.000s
[4] sequence req1: pushing after 0s for: liveness detection = false, deadlock detection = true, timeout enforcement = false, priority enforcement = true
[4] sequence req1: pushing timestamp of txn 00000002 above 10.000000000,1
[4] sequence req1: resolving intent ‹"c"› for txn 00000002 with PENDING status and clock observation {1 123.000000000,17}
[4] sequence req1: lock wait-queue event: wait for (distinguished) txn 00000002 holding lock @ key ‹"d"› (queuedWriters: 0, queuedReaders: 1)
[4] sequence req1: conflicted with ‹00000002-0000-0000-0000-000000000000› on ‹"c"› for 0.000s
[4] sequence req1: pushing after 0s for: liveness detection = true, deadlock detection = true, timeout enforcement = false, priority enforcement = true
[4] sequence req1: pushing timestamp of txn 00000002 above 10.000000000,1
[4] sequence req1: resolving intent ‹"d"› for txn 00000002 with PENDING status and clock observation {1 123.000000000,19}
[4] sequence req1: lock wait-queue event: wait for (distinguished) txn 00000002 holding lock @ key ‹"e"› (queuedWriters: 0, queuedReaders: 1)
[4] sequence req1: conflicted with ‹00000002-0000-0000-0000-000000000000› on ‹"d"› for 0.000s
[4] sequence req1: pushing after 0s for: liveness detection = true, deadlock detection = true, timeout enforcement = false, priority enforcement = true
[4] sequence req1: pushing timestamp of txn 00000002 above 10.000000000,1
[4] sequence req1: resolving intent ‹"e"› for txn 00000002 with PENDING status and clock observation {1 123.000000000,21}
[4] sequence req1: lock wait-queue event: wait for (distinguished) txn 00000002 holding lock @ key ‹"f"› (queuedWriters: 0, queuedReaders: 1)
[4] sequence req1: conflicted with ‹00000002-0000-0000-0000-000000000000› on ‹"e"› for 0.000s
[4] sequence req1: pushing after 0s for: liveness detection = true, deadlock detection = true, timeout enforcement = false, priority enforcement = true
[4] sequence req1: pushing timestamp of txn 00000002 above 10.000000000,1
[4] sequence req1: resolving intent ‹"f"› for txn 00000002 with PENDING status and clock observation {1 123.000000000,23}
[4] sequence req1: lock wait-queue event: wait for (distinguished) txn 00000002 holding lock @ key ‹"g"› (queuedWriters: 0, queuedReaders: 1)
[4] sequence req1: conflicted with ‹00000002-0000-0000-0000-000000000000› on ‹"f"› for 0.000s
[4] sequence req1: pushing after 0s for: liveness detection = true, deadlock detection = true, timeout enforcement = false, priority enforcement = true
[4] sequence req1: pushing timestamp of txn 00000002 above 10.000000000,1
[4] sequence req1: resolving intent ‹"g"› for txn 00000002 with PENDING status and clock observation {1 123.000000000,25}
[4] sequence req1: lock wait-queue event: wait for (distinguished) txn 00000002 holding lock @ key ‹"h"› (queuedWriters: 0, queuedReaders: 1)
[4] sequence req1: conflicted with ‹00000002-0000-0000-0000-000000000000› on ‹"g"› for 0.000s
[4] sequence req1: pushing after 0s for: liveness detection = true, deadlock detection = true, timeout enforcement = false, priority enforcement = true
[4] sequence req1: pushing timestamp of txn 00000002 above 10.000000000,1
[4] sequence req1: resolving intent ‹"h"› for txn 00000002 with PENDING status and clock observation {1 123.000000000,27}
[4] sequence req1: lock wait-queue event: wait for (distinguished) txn 00000002 holding lock @ key ‹"i"› (queuedWriters: 0, queuedReaders: 1)
[4] sequence req1: conflicted with ‹00000002-0000-0000-0000-000000000000› on ‹"h"› for 0.000s
[4] sequence req1: pushing after 0s for: liveness detection = true, deadlock detection = true, timeout enforcement = false, priority enforcement = true
[4] sequence req1: pushing timestamp of txn 00000002 above 10.000000000,1
[4] sequence req1: resolving intent ‹"i"› for txn 00000002 with PENDING status and clock observation {1 123.000000000,29}
[4] sequence req1: lock wait-queue event: wait for (distinguished) txn 00000002 holding lock @ key ‹"j"› (queuedWriters: 0, queuedReaders: 1)
[4] sequence req1: conflicted with ‹00000002-0000-0000-0000-000000000000› on ‹"i"› for 0.000s
[4] sequence req1: pushing after 0s for: liveness detection = true, deadlock detection = true, timeout enforcement = false, priority enforcement = true
[4] sequence req1: pushing timestamp of txn 00000002 above 10.000000000,1
[4] sequence req1: resolving intent ‹"j"› for txn 00000002 with PENDING status and clock observation {1 123.000000000,31}
[4] sequence req1: lock wait-queue event: done waiting
[4] sequence req1: conflicted with ‹00000002-0000-0000-0000-000000000000› on ‹"j"› for 0.000s
[4] sequence req1: acquiring latches
[4] sequence req1: scanning lock table for conflicting locks
[4] sequence req1: sequencing complete, returned guard

finish req=req1
----
[-] finish req1: finishing request

# Only the contended lock remains.
debug-lock-table
----
num=1
lock: "c"
queued writers:
active: false req: 7, txn: 00000003-0000-0000-0000-000000000000

on-txn-updated txn=txn2 status=aborted
----
[-] update txn: aborting txn2
[3] sequence req2: resolving intent ‹"c"› for txn 00000002 with ABORTED status
[3] sequence req2: lock wait-queue event: done waiting
[3] sequence req2: conflicted with ‹00000002-0000-0000-0000-000000000000› on ‹"c"› for 0.000s
[3] sequence req2: acquiring latches
[3] sequence req2: scanning lock table for conflicting locks
[3] sequence req2: sequencing complete, returned guard

finish req=req2
----
[-] finish req2: finishing request

reset namespace
----

debug-set-batch-pushed-lock-resolution-enabled ok=true
----