From 110247f06ce026ac106ec6e94a4474944ff8e385 Mon Sep 17 00:00:00 2001
From: Nathan VanBenschoten
Date: Mon, 8 Jun 2020 15:25:47 -0400
Subject: [PATCH] kv/concurrency: drop uncontended replicated lock on unreplicated upgrade

Fixes #49658.
Informs #9521.
Informs #49973.
Related to #49684.

This commit tweaks the `lockTable`'s handling of lock acquisition to
drop write-uncontended locks when they are upgraded from the
Unreplicated to the Replicated durability, in much the same way we drop
Replicated locks when first acquired. This is possible because a
Replicated lock is also stored as an MVCC intent, so it does not need
to also be stored in the lockTable if writers are not queuing on it.
This is beneficial because it serves as a mitigation for #49973 and
avoids the 99th percentile latency regression observed in #49658. Since
we aren't currently great at avoiding excessive contention on limited
scans when locks are in the lockTable, it's better to keep locks out of
the lockTable when possible.

If any of the readers do truly contend with this lock even after their
limit has been applied, they will notice during their MVCC scan and
re-enter the queue (possibly recreating the lock through
AddDiscoveredLock). Still, in practice this seems to work well in
avoiding most of the artificial concurrency discussed in #49973.

It's a bit of a hack and I am very interested in fixing this fully in
the future (through an approach like #33373 or by incrementally
consulting the lockTable in a `lockAwareIterator`), but for now, I
don't see a downside to making this change.

I intend to backport this change to v20.1, as the regression it
addresses is causing issues in one of the demos we like to run.

Release note (performance improvement): limited SELECT statements now
do a better job avoiding unnecessary contention with UPDATE and SELECT
FOR UPDATE statements.
---
 pkg/kv/kvserver/concurrency/lock_table.go    |  56 +++++++-
 .../testdata/concurrency_manager/update      |  42 +++++-
 .../testdata/lock_table/lock_changes         |  32 ++++-
 .../testdata/lock_table/lock_dropped         | 129 ++++++++++++++++++
 .../testdata/lock_table/size_limit_exceeded  |  69 ++++++----
 5 files changed, 293 insertions(+), 35 deletions(-)
 create mode 100644 pkg/kv/kvserver/concurrency/testdata/lock_table/lock_dropped

diff --git a/pkg/kv/kvserver/concurrency/lock_table.go b/pkg/kv/kvserver/concurrency/lock_table.go
index 8444f5ad9b5d..55770181b1e3 100644
--- a/pkg/kv/kvserver/concurrency/lock_table.go
+++ b/pkg/kv/kvserver/concurrency/lock_table.go
@@ -746,6 +746,9 @@ func (l *lockState) Format(buf *strings.Builder) {
 	} else {
 		writeHolderInfo(buf, txn, ts)
 	}
+	// TODO(sumeer): Add an optional `description string` field to Request and
+	// lockTableGuardImpl that tests can set to avoid relying on the seqNum to
+	// identify requests.
 	if l.waitingReaders.Len() > 0 {
 		fmt.Fprintln(buf, "   waiting readers:")
 		for e := l.waitingReaders.Front(); e != nil; e = e.Next() {
@@ -1625,13 +1628,52 @@ func (l *lockState) requestDone(g *lockTableGuardImpl) (gc bool) {
 	return false
 }
 
+// tryFreeLockOnReplicatedAcquire attempts to free a write-uncontended lock
+// during the state transition from the Unreplicated durability to the
+// Replicated durability. This is possible because a Replicated lock is also
+// stored as an MVCC intent, so it does not need to also be stored in the
+// lockTable if writers are not queuing on it. This is beneficial because it
+// serves as a mitigation for #49973. Since we aren't currently great at
+// avoiding excessive contention on limited scans when locks are in the
+// lockTable, it's better to keep locks out of the lockTable when possible.
+//
+// If any of the readers do truly contend with this lock even after their limit
+// has been applied, they will notice during their MVCC scan and re-enter the
+// queue (possibly recreating the lock through AddDiscoveredLock). Still, in
+// practice this seems to work well in avoiding most of the artificial
+// concurrency discussed in #49973.
+//
+// Acquires l.mu.
+func (l *lockState) tryFreeLockOnReplicatedAcquire() bool {
+	l.mu.Lock()
+	defer l.mu.Unlock()
+
+	// Bail if not locked with only the Unreplicated durability.
+	if !l.holder.locked || l.holder.holder[lock.Replicated].txn != nil {
+		return false
+	}
+
+	// Bail if the lock has waiting writers. It is not uncontended.
+	if l.queuedWriters.Len() != 0 {
+		return false
+	}
+
+	// The lock is uncontended by other writers, so we're safe to drop it.
+	// This may release readers who were waiting on the lock.
+	if gc := l.lockIsFree(); !gc {
+		panic("expected lockIsFree to return true")
+	}
+	return true
+}
+
 // The lock has transitioned from locked/reserved to unlocked. There could be
 // waiters, but there cannot be a reservation.
 // REQUIRES: l.mu is locked.
 func (l *lockState) lockIsFree() (gc bool) {
 	if l.reservation != nil {
-		panic("lockTable bug")
+		panic("called lockIsFree on lock with reservation")
 	}
 
+	// All waiting readers don't need to wait here anymore.
 	for e := l.waitingReaders.Front(); e != nil; {
 		g := e.Value.(*lockTableGuardImpl)
@@ -1831,8 +1873,8 @@ func (t *lockTableImpl) AcquireLock(
 	iter.FirstOverlap(&lockState{key: key})
 	if !iter.Valid() {
 		if durability == lock.Replicated {
-			tree.mu.Unlock()
 			// Don't remember uncontended replicated locks.
+			tree.mu.Unlock()
 			return nil
 		}
 		l = &lockState{id: tree.nextLockSeqNum(), key: key, ss: ss}
@@ -1843,6 +1885,16 @@ func (t *lockTableImpl) AcquireLock(
 		atomic.AddInt64(&tree.numLocks, 1)
 	} else {
 		l = iter.Cur()
+		if durability == lock.Replicated && l.tryFreeLockOnReplicatedAcquire() {
+			// Don't remember uncontended replicated locks. Just like in the
+			// case where the lock is initially added as replicated, we drop
+			// replicated locks from the lockTable when being upgraded from
+			// Unreplicated to Replicated, whenever possible.
+			tree.Delete(l)
+			tree.mu.Unlock()
+			atomic.AddInt64(&tree.numLocks, -1)
+			return nil
+		}
 	}
 	err := l.acquireLock(strength, durability, txn, txn.WriteTimestamp)
 	tree.mu.Unlock()
diff --git a/pkg/kv/kvserver/concurrency/testdata/concurrency_manager/update b/pkg/kv/kvserver/concurrency/testdata/concurrency_manager/update
index 75b7cef634fa..2ee445ae9ee3 100644
--- a/pkg/kv/kvserver/concurrency/testdata/concurrency_manager/update
+++ b/pkg/kv/kvserver/concurrency/testdata/concurrency_manager/update
@@ -333,6 +333,24 @@
 # Issue another write to the same key for txn1 at its initial timestamp,
 # this time with a replicated durability. The timestamp in the lock
 # table should regress back down to reflect the replicated lock state.
+#
+# NOTE: we currently drop locks from the lockTable when they are
+# upgraded from unreplicated to replicated if they have no active
+# writers waiting on them. So to test what we want to test here, first
+# enqueue a writer on the lock.
+ +new-request name=req4 txn=none ts=10,1 + put key=k value=v4 +---- + +sequence req=req4 +---- +[3] sequence req4: sequencing request +[3] sequence req4: acquiring latches +[3] sequence req4: scanning lock table for conflicting locks +[3] sequence req4: waiting in lock wait-queues +[3] sequence req4: pushing txn 00000001 to abort +[3] sequence req4: blocked on select in concurrency_test.(*cluster).PushTransaction new-request name=req3 txn=txn1 ts=10,1 put key=k value=v2 seq=1 @@ -340,10 +358,10 @@ new-request name=req3 txn=txn1 ts=10,1 sequence req=req3 ---- -[3] sequence req3: sequencing request -[3] sequence req3: acquiring latches -[3] sequence req3: scanning lock table for conflicting locks -[3] sequence req3: sequencing complete, returned guard +[4] sequence req3: sequencing request +[4] sequence req3: acquiring latches +[4] sequence req3: scanning lock table for conflicting locks +[4] sequence req3: sequencing complete, returned guard on-lock-acquired req=req3 key=k seq=1 dur=r ---- @@ -358,7 +376,23 @@ debug-lock-table global: num=1 lock: "k" holder: txn: 00000001-0000-0000-0000-000000000000, ts: 0.000000010,1, info: repl epoch: 0, seqs: [1], unrepl epoch: 0, seqs: [0] + queued writers: + active: true req: 9, txn: none + distinguished req: 9 local: num=0 +# Finish off txn1. Not needed once we can get rid of req4. +on-txn-updated txn=txn1 status=committed +---- +[-] update txn: committing txn1 +[3] sequence req4: resolving intent "k" for txn 00000001 with COMMITTED status +[3] sequence req4: acquiring latches +[3] sequence req4: scanning lock table for conflicting locks +[3] sequence req4: sequencing complete, returned guard + +finish req=req4 +---- +[-] finish req4: finishing request + reset namespace ---- diff --git a/pkg/kv/kvserver/concurrency/testdata/lock_table/lock_changes b/pkg/kv/kvserver/concurrency/testdata/lock_table/lock_changes index ee90e3fc38c6..ca9ae5f3409a 100644 --- a/pkg/kv/kvserver/concurrency/testdata/lock_table/lock_changes +++ b/pkg/kv/kvserver/concurrency/testdata/lock_table/lock_changes @@ -53,10 +53,30 @@ local: num=0 # Lock is reacquired at same epoch with lower timestamp and different durability. # The lock's timestamp is allowed to regress in this case, because it must never # diverge from the replicated state machine. +# +# We first add a queued writer because the lockTable currently does not keep track +# of uncontended replicated locks. 
# --------------------------------------------------------------------------------- +new-request r=reqContend txn=none ts=10 spans=w@a +---- + +scan r=reqContend +---- +start-waiting: true + acquire r=req2 k=a durability=r ---- +global: num=1 + lock: "a" + holder: txn: 00000000-0000-0000-0000-000000000001, ts: 0.000000008,0, info: repl epoch: 0, seqs: [3], unrepl epoch: 0, seqs: [2, 3] + queued writers: + active: true req: 1, txn: none + distinguished req: 1 +local: num=0 + +dequeue r=reqContend +---- global: num=1 lock: "a" holder: txn: 00000000-0000-0000-0000-000000000001, ts: 0.000000008,0, info: repl epoch: 0, seqs: [3], unrepl epoch: 0, seqs: [2, 3] @@ -119,8 +139,8 @@ global: num=1 lock: "a" holder: txn: 00000000-0000-0000-0000-000000000001, ts: 0.000000008,0, info: repl epoch: 0, seqs: [3], unrepl epoch: 2, seqs: [0] waiting readers: - req: 1, txn: 00000000-0000-0000-0000-000000000002 - distinguished req: 1 + req: 2, txn: 00000000-0000-0000-0000-000000000002 + distinguished req: 2 local: num=0 new-txn txn=txn1 ts=14 epoch=1 seq=1 @@ -135,8 +155,8 @@ global: num=1 lock: "a" holder: txn: 00000000-0000-0000-0000-000000000001, ts: 0.000000010,0, info: repl epoch: 1, seqs: [1], unrepl epoch: 2, seqs: [0] waiting readers: - req: 1, txn: 00000000-0000-0000-0000-000000000002 - distinguished req: 1 + req: 2, txn: 00000000-0000-0000-0000-000000000002 + distinguished req: 2 local: num=0 guard-state r=req5 @@ -178,8 +198,8 @@ global: num=1 lock: "a" holder: txn: 00000000-0000-0000-0000-000000000001, ts: 0.000000014,0, info: repl epoch: 1, seqs: [1], unrepl epoch: 1, seqs: [0, 1] waiting readers: - req: 2, txn: 00000000-0000-0000-0000-000000000002 - distinguished req: 2 + req: 3, txn: 00000000-0000-0000-0000-000000000002 + distinguished req: 3 local: num=0 guard-state r=req7 diff --git a/pkg/kv/kvserver/concurrency/testdata/lock_table/lock_dropped b/pkg/kv/kvserver/concurrency/testdata/lock_table/lock_dropped new file mode 100644 index 000000000000..415c5f784c46 --- /dev/null +++ b/pkg/kv/kvserver/concurrency/testdata/lock_table/lock_dropped @@ -0,0 +1,129 @@ +# Misc tests where locks are ignored or dropped during lock acquisitions. + +new-lock-table maxlocks=10000 +---- + +# --------------------------------------------------------------------------------- +# New replicated locks are ignored. +# --------------------------------------------------------------------------------- + +new-txn txn=txn1 ts=10 epoch=0 seq=2 +---- + +new-request r=req1 txn=txn1 ts=10 spans=w@a +---- + +acquire r=req1 k=a durability=r +---- +global: num=0 +local: num=0 + +# --------------------------------------------------------------------------------- +# Upgrading from unreplicated to replicated for an uncontended lock causes that +# lock to be dropped. +# --------------------------------------------------------------------------------- + +acquire r=req1 k=a durability=u +---- +global: num=1 + lock: "a" + holder: txn: 00000000-0000-0000-0000-000000000001, ts: 0.000000010,0, info: unrepl epoch: 0, seqs: [2] +local: num=0 + +acquire r=req1 k=a durability=r +---- +global: num=0 +local: num=0 + +# --------------------------------------------------------------------------------- +# Upgrading from unreplicated to replicated for a lock with only waiting readers +# causes that lock to be dropped and the readers to be released. 
+# --------------------------------------------------------------------------------- + +new-request r=reqContendReader txn=none ts=10 spans=r@a +---- + +acquire r=req1 k=a durability=u +---- +global: num=1 + lock: "a" + holder: txn: 00000000-0000-0000-0000-000000000001, ts: 0.000000010,0, info: unrepl epoch: 0, seqs: [2] +local: num=0 + +scan r=reqContendReader +---- +start-waiting: true + +print +---- +global: num=1 + lock: "a" + holder: txn: 00000000-0000-0000-0000-000000000001, ts: 0.000000010,0, info: unrepl epoch: 0, seqs: [2] + waiting readers: + req: 1, txn: none + distinguished req: 1 +local: num=0 + +acquire r=req1 k=a durability=r +---- +global: num=0 +local: num=0 + +guard-state r=reqContendReader +---- +new: state=doneWaiting + +# --------------------------------------------------------------------------------- +# Upgrading from unreplicated to replicated for a lock with waiting reader and +# writers causes the lock to be retained. +# --------------------------------------------------------------------------------- + +new-request r=reqContendWriter txn=none ts=10 spans=w@a +---- + +acquire r=req1 k=a durability=u +---- +global: num=1 + lock: "a" + holder: txn: 00000000-0000-0000-0000-000000000001, ts: 0.000000010,0, info: unrepl epoch: 0, seqs: [2] +local: num=0 + +scan r=reqContendReader +---- +start-waiting: true + +scan r=reqContendWriter +---- +start-waiting: true + +print +---- +global: num=1 + lock: "a" + holder: txn: 00000000-0000-0000-0000-000000000001, ts: 0.000000010,0, info: unrepl epoch: 0, seqs: [2] + waiting readers: + req: 1, txn: none + queued writers: + active: true req: 2, txn: none + distinguished req: 1 +local: num=0 + +acquire r=req1 k=a durability=r +---- +global: num=1 + lock: "a" + holder: txn: 00000000-0000-0000-0000-000000000001, ts: 0.000000010,0, info: repl epoch: 0, seqs: [2], unrepl epoch: 0, seqs: [2] + waiting readers: + req: 1, txn: none + queued writers: + active: true req: 2, txn: none + distinguished req: 1 +local: num=0 + +guard-state r=reqContendReader +---- +new: state=waitForDistinguished txn=txn1 ts=10 key="a" held=true guard-access=read + +guard-state r=reqContendWriter +---- +new: state=waitFor txn=txn1 ts=10 key="a" held=true guard-access=write diff --git a/pkg/kv/kvserver/concurrency/testdata/lock_table/size_limit_exceeded b/pkg/kv/kvserver/concurrency/testdata/lock_table/size_limit_exceeded index 659947960767..d07fef562aac 100644 --- a/pkg/kv/kvserver/concurrency/testdata/lock_table/size_limit_exceeded +++ b/pkg/kv/kvserver/concurrency/testdata/lock_table/size_limit_exceeded @@ -38,11 +38,13 @@ global: num=2 holder: txn: 00000000-0000-0000-0000-000000000001, ts: 0.000000010,0, info: unrepl epoch: 0, seqs: [0] local: num=0 -# c is locked both replicated and unreplicated, though we really only need it -# replicated locked for the case we want to exercise. This is because the -# lockTable currently does not keep track of uncontended replicated locks. -# When that behavior changes with the segregated lock table, we can remove -# this unreplicated lock acquisition. +# c is first locked as unreplicated and establishes a writer queue +# before being locked as replicated. We really only need it replicated +# locked for the case we want to exercise, but we jump through these +# extra hoops because the lockTable currently does not keep track of +# uncontended replicated locks. When that behavior changes with the +# segregated lock table, we can remove this unreplicated lock +# acquisition and queued writer. 
acquire r=req1 k=c durability=u ---- global: num=3 @@ -54,8 +56,29 @@ global: num=3 holder: txn: 00000000-0000-0000-0000-000000000001, ts: 0.000000010,0, info: unrepl epoch: 0, seqs: [0] local: num=0 +new-request r=reqContend txn=none ts=10 spans=w@c +---- + +scan r=reqContend +---- +start-waiting: true + acquire r=req1 k=c durability=r ---- +global: num=3 + lock: "a" + holder: txn: 00000000-0000-0000-0000-000000000001, ts: 0.000000010,0, info: unrepl epoch: 0, seqs: [0] + lock: "b" + holder: txn: 00000000-0000-0000-0000-000000000001, ts: 0.000000010,0, info: unrepl epoch: 0, seqs: [0] + lock: "c" + holder: txn: 00000000-0000-0000-0000-000000000001, ts: 0.000000010,0, info: repl epoch: 0, seqs: [0], unrepl epoch: 0, seqs: [0] + queued writers: + active: true req: 2, txn: none + distinguished req: 2 +local: num=0 + +dequeue r=reqContend +---- global: num=3 lock: "a" holder: txn: 00000000-0000-0000-0000-000000000001, ts: 0.000000010,0, info: unrepl epoch: 0, seqs: [0] @@ -96,9 +119,9 @@ global: num=3 lock: "a" holder: txn: 00000000-0000-0000-0000-000000000001, ts: 0.000000010,0, info: unrepl epoch: 0, seqs: [0] queued writers: - active: true req: 2, txn: 00000000-0000-0000-0000-000000000002 active: true req: 3, txn: 00000000-0000-0000-0000-000000000002 - distinguished req: 2 + active: true req: 4, txn: 00000000-0000-0000-0000-000000000002 + distinguished req: 3 lock: "b" holder: txn: 00000000-0000-0000-0000-000000000001, ts: 0.000000010,0, info: unrepl epoch: 0, seqs: [0] lock: "c" @@ -109,9 +132,9 @@ release txn=txn1 span=a ---- global: num=3 lock: "a" - res: req: 2, txn: 00000000-0000-0000-0000-000000000002, ts: 0.000000010,0, seq: 0 + res: req: 3, txn: 00000000-0000-0000-0000-000000000002, ts: 0.000000010,0, seq: 0 queued writers: - active: true req: 3, txn: 00000000-0000-0000-0000-000000000002 + active: true req: 4, txn: 00000000-0000-0000-0000-000000000002 lock: "b" holder: txn: 00000000-0000-0000-0000-000000000001, ts: 0.000000010,0, info: unrepl epoch: 0, seqs: [0] lock: "c" @@ -130,14 +153,14 @@ print ---- global: num=3 lock: "a" - res: req: 2, txn: 00000000-0000-0000-0000-000000000002, ts: 0.000000010,0, seq: 0 + res: req: 3, txn: 00000000-0000-0000-0000-000000000002, ts: 0.000000010,0, seq: 0 queued writers: - active: true req: 3, txn: 00000000-0000-0000-0000-000000000002 + active: true req: 4, txn: 00000000-0000-0000-0000-000000000002 lock: "b" holder: txn: 00000000-0000-0000-0000-000000000001, ts: 0.000000010,0, info: unrepl epoch: 0, seqs: [0] queued writers: - active: true req: 2, txn: 00000000-0000-0000-0000-000000000002 - distinguished req: 2 + active: true req: 3, txn: 00000000-0000-0000-0000-000000000002 + distinguished req: 3 lock: "c" holder: txn: 00000000-0000-0000-0000-000000000001, ts: 0.000000010,0, info: repl epoch: 0, seqs: [0], unrepl epoch: 0, seqs: [0] local: num=0 @@ -178,26 +201,26 @@ add-discovered r=req7 k=d txn=txn1 ---- global: num=4 lock: "a" - res: req: 2, txn: 00000000-0000-0000-0000-000000000002, ts: 0.000000010,0, seq: 0 + res: req: 3, txn: 00000000-0000-0000-0000-000000000002, ts: 0.000000010,0, seq: 0 queued writers: - active: true req: 3, txn: 00000000-0000-0000-0000-000000000002 + active: true req: 4, txn: 00000000-0000-0000-0000-000000000002 lock: "b" holder: txn: 00000000-0000-0000-0000-000000000001, ts: 0.000000010,0, info: unrepl epoch: 0, seqs: [0] waiting readers: - req: 4, txn: 00000000-0000-0000-0000-000000000002 + req: 5, txn: 00000000-0000-0000-0000-000000000002 queued writers: - active: true req: 2, txn: 
00000000-0000-0000-0000-000000000002 - active: true req: 5, txn: 00000000-0000-0000-0000-000000000002 - distinguished req: 2 + active: true req: 3, txn: 00000000-0000-0000-0000-000000000002 + active: true req: 6, txn: 00000000-0000-0000-0000-000000000002 + distinguished req: 3 lock: "c" holder: txn: 00000000-0000-0000-0000-000000000001, ts: 0.000000010,0, info: repl epoch: 0, seqs: [0], unrepl epoch: 0, seqs: [0] queued writers: - active: true req: 6, txn: 00000000-0000-0000-0000-000000000002 - distinguished req: 6 + active: true req: 7, txn: 00000000-0000-0000-0000-000000000002 + distinguished req: 7 lock: "d" holder: txn: 00000000-0000-0000-0000-000000000001, ts: 0.000000010,0, info: repl epoch: 0, seqs: [0] queued writers: - active: false req: 7, txn: 00000000-0000-0000-0000-000000000002 + active: false req: 8, txn: 00000000-0000-0000-0000-000000000002 local: num=0 new-request r=req8 txn=txn2 ts=10 spans=w@e @@ -215,7 +238,7 @@ global: num=1 lock: "d" holder: txn: 00000000-0000-0000-0000-000000000001, ts: 0.000000010,0, info: repl epoch: 0, seqs: [0] queued writers: - active: false req: 7, txn: 00000000-0000-0000-0000-000000000002 + active: false req: 8, txn: 00000000-0000-0000-0000-000000000002 local: num=0 guard-state r=req2
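
The rule that `tryFreeLockOnReplicatedAcquire` applies can be stated compactly: on an unreplicated-to-replicated upgrade, the in-memory lock may be discarded as long as no writers are queued on it, because the replicated lock is also persisted as an MVCC intent; waiting readers do not block the drop and are released to rediscover the intent during their MVCC scan if they truly conflict. The Go sketch below illustrates only that decision rule under simplified assumptions; `simpleLock`, `heldAs`, and `canDropOnReplicatedUpgrade` are hypothetical stand-ins and are not part of the patch or the real lockTable types.

// Illustrative sketch only: hypothetical types modeling the drop-on-upgrade
// decision, not the lockTable implementation from the patch above.
package main

import "fmt"

type durability int

const (
	unreplicated durability = iota
	replicated
)

// simpleLock is a stand-in for a lock table entry: whether it is held,
// at which durabilities, and how many requests wait on it.
type simpleLock struct {
	held           bool
	heldAs         map[durability]bool
	queuedWriters  int // writers waiting behind the lock
	waitingReaders int // readers waiting behind the lock
}

// canDropOnReplicatedUpgrade reports whether the in-memory lock can be
// discarded when it is re-acquired with replicated durability: the lock must
// currently be held only as unreplicated, and no writers may be queued.
// Waiting readers do not prevent the drop; they are released and will
// rediscover the MVCC intent if they truly conflict.
func canDropOnReplicatedUpgrade(l *simpleLock) bool {
	if !l.held || l.heldAs[replicated] {
		return false // not held, or already tracked as replicated
	}
	return l.queuedWriters == 0
}

func main() {
	readerOnly := &simpleLock{held: true, heldAs: map[durability]bool{unreplicated: true}, waitingReaders: 1}
	withWriter := &simpleLock{held: true, heldAs: map[durability]bool{unreplicated: true}, queuedWriters: 1}
	fmt.Println(canDropOnReplicatedUpgrade(readerOnly)) // true: lock dropped, reader released
	fmt.Println(canDropOnReplicatedUpgrade(withWriter)) // false: lock retained
}

In the patch itself the same check is expressed against `l.holder.locked`, `l.holder.holder[lock.Replicated].txn`, and `l.queuedWriters.Len()`, and the drop is completed by `lockIsFree`, which releases any waiting readers.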