concurrency,kvserver: limited scans do optimistic latching
Latches for the full spans are inserted up front in the
spanlatch.Manager, and conflict checking happens after
evaluation, only over the spans that were actually read.
If there is a conflict, the request waits on the latches it
already inserted and execution switches to pessimistic latching
and locking. The existing cluster setting for optimistic locking
gates this behavior.

Fixes cockroachdb#9521

Release note (performance improvement): A limited scan now checks
for conflicting latches in an optimistic manner, which means it will
not conflict with latches that overlap the scan's full spans but not
the spans that were actually scanned before the limit was reached.
This behavior can be turned off (along with optimistic locking) by
setting the cluster setting
kv.concurrency.optimistic_eval_limited_scans.enabled to false.
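To make the mechanism concrete, here is a small self-contained sketch (toy types only, not the kvserver or spanlatch API) of why a conflict check over only the spans a limited scan actually read can pass even though a check over the scan's full spans would fail:

// Toy sketch: half-open key spans and a conflict check over only the spans
// that a limited scan actually read. All types and names are illustrative and
// are not part of the CockroachDB API.
package main

import "fmt"

// span is a half-open key interval [start, end).
type span struct{ start, end string }

// overlaps reports whether two half-open intervals intersect.
func (s span) overlaps(o span) bool {
	return s.start < o.end && o.start < s.end
}

// heldLatches models latches already inserted by other in-flight requests.
type heldLatches []span

// conflictsWith reports whether any held latch overlaps one of the given spans.
func (h heldLatches) conflictsWith(spans []span) bool {
	for _, held := range h {
		for _, s := range spans {
			if held.overlaps(s) {
				return true
			}
		}
	}
	return false
}

func main() {
	held := heldLatches{{start: "d", end: "e"}} // a writer holds a latch over [d, e)

	full := []span{{start: "a", end: "e"}} // the limited scan's declared span
	read := []span{{start: "a", end: "c"}} // what was read before the limit was hit

	fmt.Println("full-span check conflicts:", held.conflictsWith(full)) // true
	fmt.Println("read-span check conflicts:", held.conflictsWith(read)) // false
}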
sumeerbhola committed Jun 4, 2021
1 parent 6c2e732 commit 8369922
Showing 10 changed files with 444 additions and 53 deletions.
20 changes: 19 additions & 1 deletion pkg/kv/kvserver/concurrency/concurrency_control.go
@@ -394,6 +394,7 @@ type Request struct {
type Guard struct {
Req Request
lg latchGuard
lm latchManager
ltg lockTableGuard
// The latest RequestEvalKind passed to SequenceReq.
EvalKind RequestEvalKind
@@ -411,13 +412,30 @@ type Error = roachpb.Error
// Internal Structure Interfaces //
///////////////////////////////////

// latchManager serializes access to keys and key ranges.
// latchManager serializes access to keys and key ranges. The
// {AcquireOptimistic,CheckOptimisticNoConflicts,WaitUntilAcquired} methods
// are only for use in optimistic latching.
//
// See additional documentation in pkg/storage/spanlatch.
type latchManager interface {
// Acquires latches, providing mutual exclusion for conflicting requests.
Acquire(context.Context, Request) (latchGuard, *Error)

// AcquireOptimistic is like Acquire in that it inserts latches, but it does
// not wait for conflicting latches on overlapping spans to be released
// before returning. This should be followed by CheckOptimisticNoConflicts
// to validate that not waiting did not violate correctness.
AcquireOptimistic(req Request) latchGuard

// CheckOptimisticNoConflicts returns true iff the spans in the provided
// spanset do not conflict with existing latches.
CheckOptimisticNoConflicts(lg latchGuard, spans *spanset.SpanSet) bool

// WaitUntilAcquired is meant to be called when CheckOptimisticNoConflicts
// returned false, or some other occurrence (like conflicting locks) is
// causing this request to switch to pessimistic latching.
WaitUntilAcquired(ctx context.Context, lg latchGuard) (latchGuard, *Error)

// Releases latches, relinquishing their protection from conflicting requests.
Release(latchGuard)

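As a rough illustration of how a caller is expected to drive the optimistic methods declared above: the stand-in types below mirror only this interface's method set, and evaluate is a hypothetical callback, so this is a sketch of the intended call sequence rather than the real sequencing code.

// Sketch only: stand-in types that mirror the latchManager method set above.
package sketch

import "context"

type latchGuard interface{} // opaque guard, as in the interface above
type spanSet struct{}       // stand-in for *spanset.SpanSet
type request struct{}       // stand-in for Request
type kvError struct{}       // stand-in for Error

type latchManager interface {
	AcquireOptimistic(req request) latchGuard
	CheckOptimisticNoConflicts(lg latchGuard, spans *spanSet) bool
	WaitUntilAcquired(ctx context.Context, lg latchGuard) (latchGuard, *kvError)
	Release(lg latchGuard)
}

// evaluateOptimistically inserts latches without waiting, evaluates, and only
// waits on the already-inserted latches if the spans actually read conflict.
func evaluateOptimistically(
	ctx context.Context,
	lm latchManager,
	req request,
	evaluate func() (result string, spansRead *spanSet), // hypothetical helper
) (string, *kvError) {
	lg := lm.AcquireOptimistic(req) // insert latches, do not wait

	res, read := evaluate()
	if lm.CheckOptimisticNoConflicts(lg, read) {
		lm.Release(lg)
		return res, nil // no conflict over what was actually read
	}

	// Conflict: wait on the latches that were already inserted, then
	// re-evaluate pessimistically.
	lg, err := lm.WaitUntilAcquired(ctx, lg)
	if err != nil {
		return "", err
	}
	res, _ = evaluate()
	lm.Release(lg)
	return res, nil
}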
60 changes: 43 additions & 17 deletions pkg/kv/kvserver/concurrency/concurrency_manager.go
@@ -143,6 +143,7 @@ func (m *managerImpl) SequenceReq(
panic("retry should have non-nil guard")
}
g = newGuard(req)
g.lm = m.lm
} else {
g = prev
switch evalKind {
@@ -189,25 +190,38 @@ func (m *managerImpl) sequenceReqWithGuard(

// Only the first iteration can sometimes already be holding latches -- we
// use this to assert below.
first := true
firstIteration := true
for {
if !first {
if !firstIteration {
g.AssertNoLatches()
}
first = false
if !g.HoldingLatches() {
// TODO(sumeer): optimistic requests could register their need for
// latches, but not actually wait until acquisition.
// https://github.com/cockroachdb/cockroach/issues/9521

// Acquire latches for the request. This synchronizes the request
// with all conflicting in-flight requests.
log.Event(ctx, "acquiring latches")
g.lg, err = m.lm.Acquire(ctx, req)
if err != nil {
return nil, err
if g.EvalKind == OptimisticEval {
if !firstIteration {
// The only way we loop more than once is when conflicting locks are
// found -- see below where that happens and the comment there on
// why it will never happen with OptimisticEval.
panic("optimistic eval should not loop in sequenceReqWithGuard")
}
log.Event(ctx, "optimistically acquiring latches")
g.lg = m.lm.AcquireOptimistic(req)
} else {
// Acquire latches for the request. This synchronizes the request
// with all conflicting in-flight requests.
log.Event(ctx, "acquiring latches")
g.lg, err = m.lm.Acquire(ctx, req)
if err != nil {
return nil, err
}
}
} else {
if g.EvalKind != PessimisticAfterFailedOptimisticEval {
panic("must not be holding latches")
}
log.Event(ctx, "optimistic failed, so waiting for latches")
g.lg, err = m.lm.WaitUntilAcquired(ctx, g.lg)
}
firstIteration = false

// Some requests don't want the wait on locks.
if req.LockSpans.Empty() {
@@ -226,7 +240,9 @@ func (m *managerImpl) sequenceReqWithGuard(
g.ltg = m.lt.ScanAndEnqueue(g.Req, g.ltg)
}

// Wait on conflicting locks, if necessary.
// Wait on conflicting locks, if necessary. Note that this will never be
// true if ScanOptimistic was called above. Therefore it will also never
// be true if latchManager.AcquireOptimistic was called.
if g.ltg.ShouldWait() {
m.lm.Release(g.moveLatchGuard())

@@ -533,14 +549,24 @@ func (g *Guard) AssertNoLatches() {

// CheckOptimisticNoConflicts checks that the latchSpansRead and lockSpansRead
// do not have a conflicting latch or lock, respectively.
func (g *Guard) CheckOptimisticNoConflicts(lockSpansRead *spanset.SpanSet) (ok bool) {
func (g *Guard) CheckOptimisticNoConflicts(
latchSpansRead *spanset.SpanSet, lockSpansRead *spanset.SpanSet,
) (ok bool) {
if g.EvalKind != OptimisticEval {
panic(errors.AssertionFailedf("unexpected EvalKind: %d", g.EvalKind))
}
if g.ltg == nil {
if g.lg == nil && g.ltg == nil {
return true
}
return g.ltg.CheckOptimisticNoConflicts(lockSpansRead)
if g.lg == nil {
panic("expected non-nil latchGuard")
}
// First check the latches, since a conflict there could mean that racing
// requests in the lock table caused a conflicting lock to not be noticed.
if g.lm.CheckOptimisticNoConflicts(g.lg, latchSpansRead) {
return g.ltg.CheckOptimisticNoConflicts(lockSpansRead)
}
return false
}

func (g *Guard) moveLatchGuard() latchGuard {
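The ordering inside Guard.CheckOptimisticNoConflicts above is deliberate: the latch check runs first because a conflicting latch can mean a racing request acquired a lock after the optimistic lock-table scan, so a clean lock-table answer is only trusted when the latch check passes. A minimal sketch of that short-circuit, with hypothetical stand-ins for the two checks:

// Sketch of the two-stage optimistic conflict check. latchesClear and
// locksClear stand in for latchManager.CheckOptimisticNoConflicts and
// lockTableGuard.CheckOptimisticNoConflicts.
package sketch

func optimisticNoConflicts(latchesClear, locksClear func() bool) bool {
	// Only trust a clean lock-table result when the latch check also passes.
	return latchesClear() && locksClear()
}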
4 changes: 2 additions & 2 deletions pkg/kv/kvserver/concurrency/concurrency_manager_test.go
@@ -300,8 +300,8 @@ func TestConcurrencyManagerBasic(t *testing.T) {
d.Fatalf(t, "unknown request: %s", reqName)
}
reqs, _ := scanRequests(t, d, c)
_, lockSpans := c.collectSpans(t, g.Req.Txn, g.Req.Timestamp, reqs)
return fmt.Sprintf("no-conflicts: %t", g.CheckOptimisticNoConflicts(lockSpans))
latchSpans, lockSpans := c.collectSpans(t, g.Req.Txn, g.Req.Timestamp, reqs)
return fmt.Sprintf("no-conflicts: %t", g.CheckOptimisticNoConflicts(latchSpans, lockSpans))

case "on-lock-acquired":
var reqName string
20 changes: 20 additions & 0 deletions pkg/kv/kvserver/concurrency/latch_manager.go
@@ -15,6 +15,7 @@ import (

"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/spanlatch"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/spanset"
"github.com/cockroachdb/cockroach/pkg/roachpb"
)

@@ -31,6 +32,25 @@ func (m *latchManagerImpl) Acquire(ctx context.Context, req Request) (latchGuard
return lg, nil
}

func (m *latchManagerImpl) AcquireOptimistic(req Request) latchGuard {
lg := m.m.AcquireOptimistic(req.LatchSpans)
return lg
}

func (m *latchManagerImpl) CheckOptimisticNoConflicts(lg latchGuard, spans *spanset.SpanSet) bool {
return m.m.CheckOptimisticNoConflicts(lg.(*spanlatch.Guard), spans)
}

func (m *latchManagerImpl) WaitUntilAcquired(
ctx context.Context, lg latchGuard,
) (latchGuard, *Error) {
lg, err := m.m.WaitUntilAcquired(ctx, lg.(*spanlatch.Guard))
if err != nil {
return nil, roachpb.NewError(err)
}
return lg, nil
}

func (m *latchManagerImpl) Release(lg latchGuard) {
m.m.Release(lg.(*spanlatch.Guard))
}
138 changes: 120 additions & 18 deletions pkg/kv/kvserver/concurrency/testdata/concurrency_manager/optimistic
@@ -38,7 +38,7 @@ new-request name=req2 txn=txn2 ts=12,1
sequence req=req2 eval-kind=opt
----
[2] sequence req2: optimistically sequencing request
[2] sequence req2: acquiring latches
[2] sequence req2: optimistically acquiring latches
[2] sequence req2: optimistically scanning lock table for conflicting locks
[2] sequence req2: sequencing complete, returned guard

Expand All @@ -56,33 +56,135 @@ check-opt-no-conflicts req=req2
----
no-conflicts: true

# Wider span has a conflict.
check-opt-no-conflicts req=req2
finish req=req2
----
[-] finish req2: finishing request

new-request name=req3 txn=txn2 ts=12,1
scan key=a endkey=e
----

# Optimistic locking for req3
sequence req=req3 eval-kind=opt
----
[3] sequence req3: optimistically sequencing request
[3] sequence req3: optimistically acquiring latches
[3] sequence req3: optimistically scanning lock table for conflicting locks
[3] sequence req3: sequencing complete, returned guard

debug-lock-table
----
global: num=1
lock: "d"
holder: txn: 00000001-0000-0000-0000-000000000000, ts: 10.000000000,1, info: unrepl epoch: 0, seqs: [0]
local: num=0

# Wider span for req3 has a conflict.
check-opt-no-conflicts req=req3
scan key=a endkey=e
----
no-conflicts: false

# Sequence again -- latches are already held.
sequence req=req2 eval-kind=pess-after-opt
sequence req=req3 eval-kind=pess-after-opt
----
[3] sequence req2: re-sequencing request after optimistic sequencing failed
[3] sequence req2: scanning lock table for conflicting locks
[3] sequence req2: waiting in lock wait-queues
[3] sequence req2: lock wait-queue event: wait for (distinguished) txn 00000001 holding lock @ key "d" (queuedWriters: 0, queuedReaders: 1)
[3] sequence req2: pushing timestamp of txn 00000001 above 12.000000000,1
[3] sequence req2: blocked on select in concurrency_test.(*cluster).PushTransaction
[4] sequence req3: re-sequencing request after optimistic sequencing failed
[4] sequence req3: optimistic failed, so waiting for latches
[4] sequence req3: scanning lock table for conflicting locks
[4] sequence req3: waiting in lock wait-queues
[4] sequence req3: lock wait-queue event: wait for (distinguished) txn 00000001 holding lock @ key "d" (queuedWriters: 0, queuedReaders: 1)
[4] sequence req3: pushing timestamp of txn 00000001 above 12.000000000,1
[4] sequence req3: blocked on select in concurrency_test.(*cluster).PushTransaction

# Conflicting transaction commits.
on-txn-updated txn=txn1 status=committed
----
[-] update txn: committing txn1
[3] sequence req2: resolving intent "d" for txn 00000001 with COMMITTED status
[3] sequence req2: lock wait-queue event: done waiting
[3] sequence req2: conflicted with 00000001-0000-0000-0000-000000000000 on "d" for 1.234s
[3] sequence req2: acquiring latches
[3] sequence req2: scanning lock table for conflicting locks
[3] sequence req2: sequencing complete, returned guard
[4] sequence req3: resolving intent "d" for txn 00000001 with COMMITTED status
[4] sequence req3: lock wait-queue event: done waiting
[4] sequence req3: conflicted with 00000001-0000-0000-0000-000000000000 on "d" for 1.234s
[4] sequence req3: acquiring latches
[4] sequence req3: scanning lock table for conflicting locks
[4] sequence req3: sequencing complete, returned guard

finish req=req2

finish req=req3
----
[-] finish req2: finishing request
[-] finish req3: finishing request

# Another transaction that writes, which will hold latches but not locks.
new-txn name=txn3 ts=10,1 epoch=0
----

new-request name=req4 txn=txn3 ts=10,1
put key=d value=d
----

sequence req=req4
----
[5] sequence req4: sequencing request
[5] sequence req4: acquiring latches
[5] sequence req4: scanning lock table for conflicting locks
[5] sequence req4: sequencing complete, returned guard

debug-lock-table
----
global: num=0
local: num=0

new-request name=req5 txn=txn2 ts=12,1
scan key=a endkey=e
----

sequence req=req5 eval-kind=opt
----
[6] sequence req5: optimistically sequencing request
[6] sequence req5: optimistically acquiring latches
[6] sequence req5: optimistically scanning lock table for conflicting locks
[6] sequence req5: sequencing complete, returned guard

# When checking with a span that does not include the existing latch, there is
# no conflict.
check-opt-no-conflicts req=req5
scan key=a endkey=c
----
no-conflicts: true

finish req=req5
----
[-] finish req5: finishing request

new-request name=req6 txn=txn2 ts=12,1
scan key=a endkey=e
----

sequence req=req6 eval-kind=opt
----
[7] sequence req6: optimistically sequencing request
[7] sequence req6: optimistically acquiring latches
[7] sequence req6: optimistically scanning lock table for conflicting locks
[7] sequence req6: sequencing complete, returned guard

# Wider span for req6 has a conflict with the latch held by req4.
check-opt-no-conflicts req=req6
scan key=a endkey=e
----
no-conflicts: false

sequence req=req6 eval-kind=pess-after-opt
----
[8] sequence req6: re-sequencing request after optimistic sequencing failed
[8] sequence req6: optimistic failed, so waiting for latches
[8] sequence req6: waiting to acquire read latch {a-e}@12.000000000,1, held by write latch [email protected],1
[8] sequence req6: blocked on select in spanlatch.(*Manager).waitForSignal

# req4 finishing releases the latch and allows req6 to proceed.
finish req=req4
----
[-] finish req4: finishing request
[8] sequence req6: scanning lock table for conflicting locks
[8] sequence req6: sequencing complete, returned guard

finish req=req6
----
[-] finish req6: finishing request
18 changes: 8 additions & 10 deletions pkg/kv/kvserver/replica_read.go
@@ -73,11 +73,11 @@ func (r *Replica) executeReadOnlyBatch(
// release latches immediately after we acquire an engine iterator as long
// as we're performing a non-locking read. Note that this also requires that
// the request is not being optimistically evaluated (optimistic evaluation
// does not check locks). It would also be nice, but not required for
// correctness, that the read-only engine eagerly create an iterator (that
// is later cloned) while the latches are held, so that this request does
// not "see" the effect of any later requests that happen after the latches
// are released.
// does not wait for latches or check locks). It would also be nice, but not
// required for correctness, that the read-only engine eagerly create an
// iterator (that is later cloned) while the latches are held, so that this
// request does not "see" the effect of any later requests that happen after
// the latches are released.

var result result.Result
br, result, pErr = r.executeReadOnlyBatchWithServersideRefreshes(
@@ -94,14 +94,12 @@ func (r *Replica) executeReadOnlyBatch(
if pErr == nil && g.EvalKind == concurrency.OptimisticEval {
// Gather the spans that were read -- we distinguish the spans in the
// request from the spans that were actually read, using resume spans in
// the response. For now we ignore the latch spans, but when we stop
// waiting for latches in optimistic evaluation we will use these to check
// latches first.
_, lockSpansRead, err := r.collectSpansRead(ba, br)
// the response.
latchSpansRead, lockSpansRead, err := r.collectSpansRead(ba, br)
if err != nil {
return nil, g, roachpb.NewError(err)
}
if ok := g.CheckOptimisticNoConflicts(lockSpansRead); !ok {
if ok := g.CheckOptimisticNoConflicts(latchSpansRead, lockSpansRead); !ok {
return nil, g, roachpb.NewError(roachpb.NewOptimisticEvalConflictsError())
}
}
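The check above runs over the spans actually read, which collectSpansRead derives from the resume spans in the response. A toy illustration of that derivation for a forward scan; names are illustrative and the real helper handles reverse scans and other request types:

// Toy sketch: deriving the span actually read from the request span and the
// resume span returned when a limited scan stopped early. Half-open intervals
// [Start, End); forward scans only. Not the real collectSpansRead.
package sketch

type keySpan struct{ Start, End string }

// spanActuallyRead returns the prefix of req that was read. An empty resume
// span means the scan completed and the whole request span was read.
func spanActuallyRead(req, resume keySpan) keySpan {
	if resume.Start == "" && resume.End == "" {
		return req
	}
	// The resume span is the untouched suffix, so the part actually read is
	// the prefix ending where the resume span begins.
	return keySpan{Start: req.Start, End: resume.Start}
}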