diff --git a/pkg/kv/kvserver/closed_timestamp_test.go b/pkg/kv/kvserver/closed_timestamp_test.go index 6c948b0a3d4c..4a52b1f37251 100644 --- a/pkg/kv/kvserver/closed_timestamp_test.go +++ b/pkg/kv/kvserver/closed_timestamp_test.go @@ -247,10 +247,10 @@ func TestClosedTimestampCanServeThroughoutLeaseTransfer(t *testing.T) { } } -// TestClosedTimestampCanServeWithConflictingIntent validates that a read served -// from a follower replica will wait on conflicting intents and ensure that they -// are cleaned up if necessary to allow the read to proceed. -func TestClosedTimestampCanServeWithConflictingIntent(t *testing.T) { +// TestClosedTimestampCantServeWithConflictingIntent validates that a read +// served from a follower replica will redirect to the leaseholder if it +// encounters a conflicting intent below the closed timestamp. +func TestClosedTimestampCantServeWithConflictingIntent(t *testing.T) { defer leaktest.AfterTest(t)() defer log.Scope(t).Close(t) @@ -261,8 +261,8 @@ func TestClosedTimestampCanServeWithConflictingIntent(t *testing.T) { ds := tc.Server(0).DistSenderI().(*kvcoord.DistSender) // Write N different intents for the same transaction, where N is the number - // of replicas in the testing range. Each intent will be read and eventually - // resolved by a read on a different replica. + // of replicas in the testing range. Each intent will be read on a different + // replica. txnKey := desc.StartKey.AsRawKey() txnKey = txnKey[:len(txnKey):len(txnKey)] // avoid aliasing txn := roachpb.MakeTransaction("txn", txnKey, 0, tc.Server(0).Clock().Now(), 0) @@ -272,53 +272,89 @@ func TestClosedTimestampCanServeWithConflictingIntent(t *testing.T) { keys = append(keys, key) put := putArgs(key, []byte("val")) resp, err := kv.SendWrappedWith(ctx, ds, roachpb.Header{Txn: &txn}, put) - if err != nil { - t.Fatal(err) - } + require.Nil(t, err) txn.Update(resp.Header().Txn) } - // Read a different intent on each replica. All should begin waiting on the - // intents by pushing the transaction that wrote them. None should complete. - ts := txn.WriteTimestamp - respCh := make(chan error, len(keys)) - for i, key := range keys { - go func(repl *kvserver.Replica, key roachpb.Key) { - baRead := makeTxnReadBatchForDesc(desc, ts) - respCh <- testutils.SucceedsSoonError(func() error { - // Expect 0 rows, because the intents will be aborted. - _, err := expectRows(0)(repl.Send(ctx, baRead)) - return err - }) - }(repls[i], key) + // Set a long txn liveness threshold so that the txn cannot be aborted. + defer txnwait.TestingOverrideTxnLivenessThreshold(time.Hour)() + + // runFollowerReads attempts to perform a follower read on a different key on + // each replica, using the provided timestamp as the request timestamp. + runFollowerReads := func(ts hlc.Timestamp, retryUntilSuccessful bool) chan error { + respCh := make(chan error, len(repls)) + for i := range repls { + go func(repl *kvserver.Replica, key roachpb.Key) { + baRead := makeTxnReadBatchForDesc(desc, ts) + baRead.Requests[0].GetScan().SetSpan(roachpb.Span{ + Key: key, + EndKey: key.Next(), + }) + var err error + if retryUntilSuccessful { + err = testutils.SucceedsSoonError(func() error { + // Expect 0 rows, because the intents are never committed. + _, err := expectRows(0)(repl.Send(ctx, baRead)) + return err + }) + } else { + _, pErr := repl.Send(ctx, baRead) + err = pErr.GoError() + } + respCh <- err + }(repls[i], keys[i]) + } + return respCh + } + + // Follower reads should be possible up to just below the intents' timestamp. 
+ // We use MinTimestamp instead of WriteTimestamp because the WriteTimestamp + // may have been bumped after the txn wrote some intents. + respCh1 := runFollowerReads(txn.MinTimestamp.Prev(), true) + for i := 0; i < len(repls); i++ { + require.NoError(t, <-respCh1) } + // At the intents' timestamp, reads on the leaseholder should block and reads + // on the followers should be redirected to the leaseholder, even though the + // read timestamp is below the closed timestamp. + respCh2 := runFollowerReads(txn.WriteTimestamp, false) + for i := 0; i < len(repls)-1; i++ { + err := <-respCh2 + require.Error(t, err) + var lErr *roachpb.NotLeaseHolderError + require.True(t, errors.As(err, &lErr)) + } select { - case err := <-respCh: + case err := <-respCh2: t.Fatalf("request unexpectedly returned, should block; err: %v", err) case <-time.After(20 * time.Millisecond): } - // Abort the transaction. All pushes should succeed and all intents should - // be resolved, allowing all reads (on the leaseholder and on followers) to - // proceed and finish. + // Abort the transaction. All intents should be rolled back. endTxn := &roachpb.EndTxnRequest{ RequestHeader: roachpb.RequestHeader{Key: txn.Key}, Commit: false, + LockSpans: []roachpb.Span{desc.KeySpan().AsRawSpanWithNoLocals()}, } - if _, err := kv.SendWrappedWith(ctx, ds, roachpb.Header{Txn: &txn}, endTxn); err != nil { - t.Fatal(err) - } - for range keys { - require.NoError(t, <-respCh) + _, err := kv.SendWrappedWith(ctx, ds, roachpb.Header{Txn: &txn}, endTxn) + require.Nil(t, err) + + // The blocked read on the leaseholder should succeed. + require.NoError(t, <-respCh2) + + // Follower reads should now be possible at the intents' timestamp. + respCh3 := runFollowerReads(txn.WriteTimestamp, true) + for i := 0; i < len(repls); i++ { + require.NoError(t, <-respCh3) } } // TestClosedTimestampCanServeAfterSplitsAndMerges validates the invariant that // if a timestamp is safe for reading on both the left side and right side of a -// a merge then it will be safe after the merge and that if a timestamp is safe +// merge then it will be safe after the merge and that if a timestamp is safe // for reading before the beginning of a split it will be safe on both sides of -// of the split. +// the split. func TestClosedTimestampCanServeAfterSplitAndMerges(t *testing.T) { defer leaktest.AfterTest(t)() defer log.Scope(t).Close(t) diff --git a/pkg/kv/kvserver/concurrency/concurrency_control.go b/pkg/kv/kvserver/concurrency/concurrency_control.go index 55e23e6afe7d..883a83e6037c 100644 --- a/pkg/kv/kvserver/concurrency/concurrency_control.go +++ b/pkg/kv/kvserver/concurrency/concurrency_control.go @@ -756,22 +756,6 @@ type lockTableWaiter interface { // again. WaitOn(context.Context, Request, lockTableGuard) *Error - // WaitOnLock waits on the transaction responsible for the specified lock - // and then ensures that the lock is cleared out of the request's way. - // - // The method should be called after dropping any latches that a request has - // acquired. It returns when the lock has been resolved. - // - // NOTE: this method is used when the lockTable is disabled (e.g. on a - // follower replica) and a lock is discovered that must be waited on (e.g. - // during a follower read). 
If/when lockTables are maintained on follower - // replicas by propagating lockTable state transitions through the Raft log - // in the ReplicatedEvalResult instead of through the (leaseholder-only) - // LocalResult, we should be able to remove the lockTable "disabled" state - // and, in turn, remove this method. This will likely fall out of pulling - // all replicated locks into the lockTable. - WaitOnLock(context.Context, Request, *roachpb.Intent) *Error - // ResolveDeferredIntents resolves the batch of intents if the provided // error is nil. The batch of intents may be resolved more efficiently than // if they were resolved individually. diff --git a/pkg/kv/kvserver/concurrency/concurrency_manager.go b/pkg/kv/kvserver/concurrency/concurrency_manager.go index eec701513a06..deec67b4a3af 100644 --- a/pkg/kv/kvserver/concurrency/concurrency_manager.go +++ b/pkg/kv/kvserver/concurrency/concurrency_manager.go @@ -416,11 +416,27 @@ func (m *managerImpl) HandleWriterIntentError( } // Add a discovered lock to lock-table for each intent and enter each lock's - // wait-queue. If the lock-table is disabled and one or more of the intents - // are ignored then we immediately wait on all intents. + // wait-queue. + // + // If the lock-table is disabled and one or more of the intents are ignored + // then we proceed without the intent being added to the lock table. In such + // cases, we know that this replica is no longer the leaseholder. One of two + // things can happen next. + // 1) if the request cannot be served on this follower replica according to + // the closed timestamp then it will be redirected to the leaseholder on + // its next evaluation attempt, where it may discover the same intent and + // wait in the new leaseholder's lock table. + // 2) if the request can be served on this follower replica according to the + // closed timestamp then it will likely re-encounter the same intent on its + // next evaluation attempt. The WriteIntentError will then be mapped to an + // InvalidLeaseError in maybeAttachLease, which will indicate that the + // request cannot be served as a follower read after all and cause the + // request to be redirected to the leaseholder. + // + // Either way, there is no possibility of the request entering an infinite + // loop without making progress. consultFinalizedTxnCache := int64(len(t.Intents)) > DiscoveredLocksThresholdToConsultFinalizedTxnCache.Get(&m.st.SV) - wait := false for i := range t.Intents { intent := &t.Intents[i] added, err := m.lt.AddDiscoveredLock(intent, seq, consultFinalizedTxnCache, g.ltg) @@ -428,7 +444,9 @@ func (m *managerImpl) HandleWriterIntentError( log.Fatalf(ctx, "%v", err) } if !added { - wait = true + log.VEventf(ctx, 2, + "intent on %s discovered but not added to disabled lock table", + intent.Key.String()) } } @@ -438,23 +456,12 @@ func (m *managerImpl) HandleWriterIntentError( // Guard. This is analogous to iterating through the loop in SequenceReq. m.lm.Release(g.moveLatchGuard()) - // If the lockTable was disabled then we need to immediately wait on the - // intents to ensure that they are resolved and moved out of the request's - // way. 
- if wait { - for i := range t.Intents { - intent := &t.Intents[i] - if err := m.ltw.WaitOnLock(ctx, g.Req, intent); err != nil { - m.FinishReq(g) - return nil, err - } - } - } else { - if toResolve := g.ltg.ResolveBeforeScanning(); len(toResolve) > 0 { - if err := m.ltw.ResolveDeferredIntents(ctx, toResolve); err != nil { - m.FinishReq(g) - return nil, err - } + // If the discovery process collected a set of intents to resolve before the + // next evaluation attempt, do so. + if toResolve := g.ltg.ResolveBeforeScanning(); len(toResolve) > 0 { + if err := m.ltw.ResolveDeferredIntents(ctx, toResolve); err != nil { + m.FinishReq(g) + return nil, err } } diff --git a/pkg/kv/kvserver/concurrency/lock_table.go b/pkg/kv/kvserver/concurrency/lock_table.go index ca9c44d3ca8b..1b7c9f4005e5 100644 --- a/pkg/kv/kvserver/concurrency/lock_table.go +++ b/pkg/kv/kvserver/concurrency/lock_table.go @@ -2330,9 +2330,8 @@ func (t *lockTableImpl) AddDiscoveredLock( } if seq < t.enabledSeq { // If the lease sequence is too low, this discovered lock may no longer - // be accurate, so we ignore it. However, we still return true so that - // the request immediately retries, this time under a newer lease. - return true, nil + // be accurate, so we ignore it. + return false, nil } else if seq > t.enabledSeq { // The enableSeq is set synchronously with the application of a new // lease, so it should not be possible for a request to evaluate at a diff --git a/pkg/kv/kvserver/concurrency/lock_table_waiter.go b/pkg/kv/kvserver/concurrency/lock_table_waiter.go index 748c6a31f24f..9a77b137dab6 100644 --- a/pkg/kv/kvserver/concurrency/lock_table_waiter.go +++ b/pkg/kv/kvserver/concurrency/lock_table_waiter.go @@ -435,31 +435,6 @@ func (w *lockTableWaiterImpl) WaitOn( } } -// WaitOnLock implements the lockTableWaiter interface. -func (w *lockTableWaiterImpl) WaitOnLock( - ctx context.Context, req Request, intent *roachpb.Intent, -) *Error { - sa, _, err := findAccessInSpans(intent.Key, req.LockSpans) - if err != nil { - return roachpb.NewError(err) - } - state := waitingState{ - kind: waitFor, - txn: &intent.Txn, - key: intent.Key, - held: true, - guardAccess: sa, - } - if req.LockTimeout != 0 { - return doWithTimeoutAndFallback( - ctx, req.LockTimeout, - func(ctx context.Context) *Error { return w.pushLockTxn(ctx, req, state) }, - func(ctx context.Context) *Error { return w.pushLockTxnAfterTimeout(ctx, req, state) }, - ) - } - return w.pushLockTxn(ctx, req, state) -} - // pushLockTxn pushes the holder of the provided lock. 
// // If a Block wait policy is set on the request, method blocks until the lock diff --git a/pkg/kv/kvserver/concurrency/testdata/concurrency_manager/discover_lock_after_lease_race b/pkg/kv/kvserver/concurrency/testdata/concurrency_manager/discover_lock_after_lease_race index 96ef5e31f442..3e8e9a7855f8 100644 --- a/pkg/kv/kvserver/concurrency/testdata/concurrency_manager/discover_lock_after_lease_race +++ b/pkg/kv/kvserver/concurrency/testdata/concurrency_manager/discover_lock_after_lease_race @@ -156,6 +156,7 @@ sequence req=req4 handle-write-intent-error req=req2 lease-seq=1 intent txn=txn1 key=k ---- +[6] handle write intent error req2: intent on "k" discovered but not added to disabled lock table [6] handle write intent error req2: handled conflicting intents on "k", released latches debug-lock-table diff --git a/pkg/kv/kvserver/concurrency/testdata/concurrency_manager/discovered_lock b/pkg/kv/kvserver/concurrency/testdata/concurrency_manager/discovered_lock index 1e7b9be7eac9..860003f22cef 100644 --- a/pkg/kv/kvserver/concurrency/testdata/concurrency_manager/discovered_lock +++ b/pkg/kv/kvserver/concurrency/testdata/concurrency_manager/discovered_lock @@ -62,7 +62,8 @@ reset namespace # ------------------------------------------------------------- # Read-only request runs into replicated intent while the # lock-table is disabled. The lock-table cannot store the lock, -# so the request is forced to push (PUSH_TIMESTAMP) immediately. +# so the request retries immediately and redirects when it +# notices the incorrect lease. # ------------------------------------------------------------- new-txn name=txn1 ts=10,1 epoch=0 @@ -89,20 +90,9 @@ sequence req=req1 handle-write-intent-error req=req1 lease-seq=2 intent txn=txn1 key=k ---- -[2] handle write intent error req1: pushing timestamp of txn 00000001 above 12.000000000,1 -[2] handle write intent error req1: blocked on select in concurrency_test.(*cluster).PushTransaction - -on-txn-updated txn=txn1 status=aborted ----- -[-] update txn: aborting txn1 -[2] handle write intent error req1: resolving intent "k" for txn 00000001 with ABORTED status +[2] handle write intent error req1: intent on "k" discovered but not added to disabled lock table [2] handle write intent error req1: handled conflicting intents on "k", released latches -debug-lock-table ----- -global: num=0 -local: num=0 - sequence req=req1 ---- [3] sequence req1: re-sequencing request @@ -110,6 +100,7 @@ sequence req=req1 [3] sequence req1: scanning lock table for conflicting locks [3] sequence req1: sequencing complete, returned guard +# NotLeaseHolderError redirect to new leaseholder. finish req=req1 ---- [-] finish req1: finishing request @@ -120,7 +111,8 @@ reset namespace # ------------------------------------------------------------- # Read-write request runs into replicated intent while the # lock-table is disabled. The lock-table cannot store the lock, -# so the request is forced to push (PUSH_ABORT) immediately. +# so the request retries immediately and redirects when it +# notices the incorrect lease. 
# ------------------------------------------------------------- new-txn name=txn1 ts=10,1 epoch=0 @@ -147,20 +139,9 @@ sequence req=req1 handle-write-intent-error req=req1 lease-seq=2 intent txn=txn1 key=k ---- -[2] handle write intent error req1: pushing txn 00000001 to abort -[2] handle write intent error req1: blocked on select in concurrency_test.(*cluster).PushTransaction - -on-txn-updated txn=txn1 status=aborted ----- -[-] update txn: aborting txn1 -[2] handle write intent error req1: resolving intent "k" for txn 00000001 with ABORTED status +[2] handle write intent error req1: intent on "k" discovered but not added to disabled lock table [2] handle write intent error req1: handled conflicting intents on "k", released latches -debug-lock-table ----- -global: num=0 -local: num=0 - sequence req=req1 ---- [3] sequence req1: re-sequencing request @@ -168,57 +149,10 @@ sequence req=req1 [3] sequence req1: scanning lock table for conflicting locks [3] sequence req1: sequencing complete, returned guard +# NotLeaseHolderError redirect to new leaseholder. finish req=req1 ---- [-] finish req1: finishing request reset namespace ---- - -# ------------------------------------------------------------- -# Read-write request runs into replicated intent while the -# lock-table is disabled. The lock-table cannot store the lock, -# so the request is forced to push (PUSH_ABORT) immediately. -# The request's own transaction is aborted while pushing. -# ------------------------------------------------------------- - -new-txn name=txn1 ts=10,1 epoch=0 ----- - -new-txn name=txn2 ts=12,1 epoch=0 ----- - -new-request name=req1 txn=txn2 ts=12,1 - get key=k ----- - -on-lease-updated leaseholder=false lease-seq=2 ----- -[-] transfer lease: released - -sequence req=req1 ----- -[1] sequence req1: sequencing request -[1] sequence req1: acquiring latches -[1] sequence req1: scanning lock table for conflicting locks -[1] sequence req1: sequencing complete, returned guard - -handle-write-intent-error req=req1 lease-seq=2 - intent txn=txn1 key=k ----- -[2] handle write intent error req1: pushing timestamp of txn 00000001 above 12.000000000,1 -[2] handle write intent error req1: blocked on select in concurrency_test.(*cluster).PushTransaction - -on-txn-updated txn=txn2 status=aborted ----- -[-] update txn: aborting txn2 -[2] handle write intent error req1: detected pusher aborted -[2] handle write intent error req1: handled conflicting intents on "k", returned error: TransactionAbortedError(ABORT_REASON_PUSHER_ABORTED): - -debug-lock-table ----- -global: num=0 -local: num=0 - -reset namespace ----- diff --git a/pkg/kv/kvserver/concurrency/testdata/concurrency_manager/range_state_listener b/pkg/kv/kvserver/concurrency/testdata/concurrency_manager/range_state_listener index 88599365d448..e36aaa429c29 100644 --- a/pkg/kv/kvserver/concurrency/testdata/concurrency_manager/range_state_listener +++ b/pkg/kv/kvserver/concurrency/testdata/concurrency_manager/range_state_listener @@ -15,24 +15,24 @@ debug-disable-txn-pushes # Test: txn2 enters lock's wait-queue # replica loses lease # txn2 proceeds -# txn2 discovers txn1's lock on k while writing (not ignored, waits) +# txn2 discovers and ignores txn1's lock on k while writing # txn2 re-sequences -# txn2 discovers txn1's lock on k2 while reading (not ignored, waits) +# +# txn2 redirected to leaseholder (i.e. 
replica acquires lease) # txn2 re-sequences -# txn1 lock is released (ignored) -# txn2 proceeds and acquires lock (ignored) +# txn1 lock is released +# txn2 proceeds and acquires lock # -# replica acquire lease -# txn3 discovers txn1's lock under old lease (ignored) +# txn3 sequences +# replica loses and re-acquires lease +# txn3 discovers and ignores txn1's lock under old lease # txn3 re-sequences -# txn3 discovers txn2's lock (not ignored) +# txn3 discovers txn2's lock # txn3 queue's on txn2's lock -# txn2's lock is released (not ignored) -# txn3 proceeds and acquires lock (not ignored) +# txn2's lock is released +# txn3 proceeds and acquires lock # ------------------------------------------------------------- -subtest on_range_lease_updated - new-txn name=txn1 ts=10,1 epoch=0 ---- @@ -127,13 +127,7 @@ local: num=0 handle-write-intent-error req=req2 lease-seq=1 intent txn=txn1 key=k ---- -[3] handle write intent error req2: pushing txn 00000001 to abort -[3] handle write intent error req2: blocked on select in concurrency_test.(*cluster).PushTransaction - -on-txn-updated txn=txn1 status=committed ----- -[-] update txn: committing txn1 -[3] handle write intent error req2: resolving intent "k" for txn 00000001 with COMMITTED status +[3] handle write intent error req2: intent on "k" discovered but not added to disabled lock table [3] handle write intent error req2: handled conflicting intents on "k", released latches debug-lock-table @@ -148,24 +142,81 @@ sequence req=req2 [4] sequence req2: scanning lock table for conflicting locks [4] sequence req2: sequencing complete, returned guard +# NotLeaseHolderError redirect to new leaseholder. +finish req=req2 +---- +[-] finish req2: finishing request + +on-lease-updated leaseholder=true lease-seq=2 +---- +[-] transfer lease: acquired + +sequence req=req2 +---- +[5] sequence req2: sequencing request +[5] sequence req2: acquiring latches +[5] sequence req2: scanning lock table for conflicting locks +[5] sequence req2: sequencing complete, returned guard + handle-write-intent-error req=req2 lease-seq=2 - intent txn=txn1 key=k2 + intent txn=txn1 key=k ---- -[5] handle write intent error req2: pushing timestamp of txn 00000001 above 10.000000000,1 -[5] handle write intent error req2: resolving intent "k2" for txn 00000001 with COMMITTED status -[5] handle write intent error req2: handled conflicting intents on "k2", released latches +[6] handle write intent error req2: handled conflicting intents on "k", released latches debug-lock-table ---- -global: num=0 +global: num=1 + lock: "k" + holder: txn: 00000001-0000-0000-0000-000000000000, ts: 10.000000000,1, info: repl epoch: 0, seqs: [0] + queued writers: + active: false req: 3, txn: 00000002-0000-0000-0000-000000000000 local: num=0 sequence req=req2 ---- -[6] sequence req2: re-sequencing request -[6] sequence req2: acquiring latches -[6] sequence req2: scanning lock table for conflicting locks -[6] sequence req2: sequencing complete, returned guard +[7] sequence req2: re-sequencing request +[7] sequence req2: acquiring latches +[7] sequence req2: scanning lock table for conflicting locks +[7] sequence req2: waiting in lock wait-queues +[7] sequence req2: lock wait-queue event: wait for (distinguished) txn 00000001 holding lock @ key "k" (queuedWriters: 1, queuedReaders: 0) +[7] sequence req2: blocked on select in concurrency.(*lockTableWaiterImpl).WaitOn + +new-request name=reqRes1 txn=none ts=10,1 + resolve-intent txn=txn1 key=k status=committed + resolve-intent txn=txn1 key=k2 status=committed +---- 
+ +sequence req=reqRes1 +---- +[8] sequence reqRes1: sequencing request +[8] sequence reqRes1: acquiring latches +[8] sequence reqRes1: sequencing complete, returned guard + +on-lock-updated req=reqRes1 txn=txn1 key=k status=committed +---- +[-] update lock: committing txn 00000001 @ k +[7] sequence req2: lock wait-queue event: done waiting +[7] sequence req2: conflicted with 00000001-0000-0000-0000-000000000000 on "k" for 1.234s +[7] sequence req2: acquiring latches +[7] sequence req2: waiting to acquire read latch k2@10.000000000,1, held by write latch k2@10.000000000,1 +[7] sequence req2: blocked on select in spanlatch.(*Manager).waitForSignal + +on-lock-updated req=reqRes1 txn=txn1 key=k2 status=committed +---- +[-] update lock: committing txn 00000001 @ k2 + +finish req=reqRes1 +---- +[-] finish reqRes1: finishing request +[7] sequence req2: scanning lock table for conflicting locks +[7] sequence req2: sequencing complete, returned guard + +debug-lock-table +---- +global: num=1 + lock: "k" + res: req: 3, txn: 00000002-0000-0000-0000-000000000000, ts: 10.000000000,1, seq: 0 +local: num=0 on-lock-acquired req=req2 key=k ---- @@ -173,15 +224,21 @@ on-lock-acquired req=req2 key=k debug-lock-table ---- -global: num=0 +global: num=1 + lock: "k" + holder: txn: 00000002-0000-0000-0000-000000000000, ts: 10.000000000,1, info: unrepl epoch: 0, seqs: [0] local: num=0 finish req=req2 ---- [-] finish req2: finishing request -# Replica acquires lease. -on-lease-updated leaseholder=true lease-seq=3 +# Replica loses and re-acquires leases. +on-lease-updated leaseholder=false lease-seq=3 +---- +[-] transfer lease: released + +on-lease-updated leaseholder=true lease-seq=4 ---- [-] transfer lease: acquired @@ -192,19 +249,20 @@ local: num=0 sequence req=req3 ---- -[7] sequence req3: sequencing request -[7] sequence req3: acquiring latches -[7] sequence req3: scanning lock table for conflicting locks -[7] sequence req3: sequencing complete, returned guard +[9] sequence req3: sequencing request +[9] sequence req3: acquiring latches +[9] sequence req3: scanning lock table for conflicting locks +[9] sequence req3: sequencing complete, returned guard # Discover the initial intent, as if this request had been in-flight # this entire time. This isn't quite realistic, given the setup of this # test, but it is possible (see discover_lock_after_lease_race) and the # discovery should be ignored. 
-handle-write-intent-error req=req3 lease-seq=1 - intent txn=txn2 key=k +handle-write-intent-error req=req3 lease-seq=2 + intent txn=txn1 key=k ---- -[8] handle write intent error req3: handled conflicting intents on "k", released latches +[10] handle write intent error req3: intent on "k" discovered but not added to disabled lock table +[10] handle write intent error req3: handled conflicting intents on "k", released latches debug-lock-table ---- @@ -213,15 +271,15 @@ local: num=0 sequence req=req3 ---- -[9] sequence req3: re-sequencing request -[9] sequence req3: acquiring latches -[9] sequence req3: scanning lock table for conflicting locks -[9] sequence req3: sequencing complete, returned guard +[11] sequence req3: re-sequencing request +[11] sequence req3: acquiring latches +[11] sequence req3: scanning lock table for conflicting locks +[11] sequence req3: sequencing complete, returned guard -handle-write-intent-error req=req3 lease-seq=3 +handle-write-intent-error req=req3 lease-seq=4 intent txn=txn2 key=k ---- -[10] handle write intent error req3: handled conflicting intents on "k", released latches +[12] handle write intent error req3: handled conflicting intents on "k", released latches debug-lock-table ---- @@ -229,27 +287,17 @@ global: num=1 lock: "k" holder: txn: 00000002-0000-0000-0000-000000000000, ts: 10.000000000,1, info: repl epoch: 0, seqs: [0] queued writers: - active: false req: 3, txn: 00000003-0000-0000-0000-000000000000 + active: false req: 4, txn: 00000003-0000-0000-0000-000000000000 local: num=0 sequence req=req3 ---- -[11] sequence req3: re-sequencing request -[11] sequence req3: acquiring latches -[11] sequence req3: scanning lock table for conflicting locks -[11] sequence req3: waiting in lock wait-queues -[11] sequence req3: lock wait-queue event: wait for (distinguished) txn 00000002 holding lock @ key "k" (queuedWriters: 1, queuedReaders: 0) -[11] sequence req3: blocked on select in concurrency.(*lockTableWaiterImpl).WaitOn - -debug-lock-table ----- -global: num=1 - lock: "k" - holder: txn: 00000002-0000-0000-0000-000000000000, ts: 10.000000000,1, info: repl epoch: 0, seqs: [0] - queued writers: - active: true req: 3, txn: 00000003-0000-0000-0000-000000000000 - distinguished req: 3 -local: num=0 +[13] sequence req3: re-sequencing request +[13] sequence req3: acquiring latches +[13] sequence req3: scanning lock table for conflicting locks +[13] sequence req3: waiting in lock wait-queues +[13] sequence req3: lock wait-queue event: wait for (distinguished) txn 00000002 holding lock @ key "k" (queuedWriters: 1, queuedReaders: 0) +[13] sequence req3: blocked on select in concurrency.(*lockTableWaiterImpl).WaitOn new-request name=reqRes2 txn=none ts=10,1 resolve-intent txn=txn2 key=k status=committed @@ -257,30 +305,30 @@ new-request name=reqRes2 txn=none ts=10,1 sequence req=reqRes2 ---- -[12] sequence reqRes2: sequencing request -[12] sequence reqRes2: acquiring latches -[12] sequence reqRes2: sequencing complete, returned guard +[14] sequence reqRes2: sequencing request +[14] sequence reqRes2: acquiring latches +[14] sequence reqRes2: sequencing complete, returned guard on-lock-updated req=reqRes2 txn=txn2 key=k status=committed ---- [-] update lock: committing txn 00000002 @ k -[11] sequence req3: lock wait-queue event: done waiting -[11] sequence req3: conflicted with 00000002-0000-0000-0000-000000000000 on "k" for 1.234s -[11] sequence req3: acquiring latches -[11] sequence req3: waiting to acquire write latch k@10.000000000,1, held by write latch 
k@10.000000000,1 -[11] sequence req3: blocked on select in spanlatch.(*Manager).waitForSignal +[13] sequence req3: lock wait-queue event: done waiting +[13] sequence req3: conflicted with 00000002-0000-0000-0000-000000000000 on "k" for 1.234s +[13] sequence req3: acquiring latches +[13] sequence req3: waiting to acquire write latch k@10.000000000,1, held by write latch k@10.000000000,1 +[13] sequence req3: blocked on select in spanlatch.(*Manager).waitForSignal finish req=reqRes2 ---- [-] finish reqRes2: finishing request -[11] sequence req3: scanning lock table for conflicting locks -[11] sequence req3: sequencing complete, returned guard +[13] sequence req3: scanning lock table for conflicting locks +[13] sequence req3: sequencing complete, returned guard debug-lock-table ---- global: num=1 lock: "k" - res: req: 3, txn: 00000003-0000-0000-0000-000000000000, ts: 10.000000000,1, seq: 0 + res: req: 4, txn: 00000003-0000-0000-0000-000000000000, ts: 10.000000000,1, seq: 0 local: num=0 on-lock-acquired req=req3 key=k @@ -301,8 +349,6 @@ finish req=req3 reset namespace ---- -subtest end - # ------------------------------------------------------------- # OnRangeSplit - a Range split clears the lock-table but does # not disable it. @@ -312,14 +358,12 @@ subtest end # Test: txn2 enters lock's wait-queue # range is split # txn2 proceeds -# txn2 discovers txn1's lock (not ignored) +# txn2 discovers txn1's lock # txn2 queue's on txn1's lock -# txn1 lock is released (not ignored) -# txn2 proceeds and acquires lock (not ignored) +# txn1 lock is released +# txn2 proceeds and acquires lock # ------------------------------------------------------------- -subtest on_range_split - new-txn name=txn1 ts=10,1 epoch=0 ---- @@ -375,8 +419,8 @@ global: num=1 lock: "k" holder: txn: 00000001-0000-0000-0000-000000000000, ts: 10.000000000,1, info: unrepl epoch: 0, seqs: [0] queued writers: - active: true req: 5, txn: 00000002-0000-0000-0000-000000000000 - distinguished req: 5 + active: true req: 6, txn: 00000002-0000-0000-0000-000000000000 + distinguished req: 6 local: num=0 on-split @@ -404,7 +448,7 @@ global: num=1 lock: "k" holder: txn: 00000001-0000-0000-0000-000000000000, ts: 10.000000000,1, info: repl epoch: 0, seqs: [0] queued writers: - active: false req: 5, txn: 00000002-0000-0000-0000-000000000000 + active: false req: 6, txn: 00000002-0000-0000-0000-000000000000 local: num=0 sequence req=req2 @@ -445,7 +489,7 @@ debug-lock-table ---- global: num=1 lock: "k" - res: req: 5, txn: 00000002-0000-0000-0000-000000000000, ts: 10.000000000,1, seq: 0 + res: req: 6, txn: 00000002-0000-0000-0000-000000000000, ts: 10.000000000,1, seq: 0 local: num=0 on-lock-acquired req=req2 key=k @@ -466,41 +510,47 @@ finish req=req2 reset namespace ---- -subtest end - # ------------------------------------------------------------- # OnRangeMerge - a Range merge clears the lock-table and # disables it. # -# Setup: txn1 acquires lock on k and k2 -# +# Setup: txn1 acquires lock on k +# # Test: txn2 enters lock's wait-queue +# txn3 sequences # range is merged +# txn3 proceeds and acquires lock (ignored) # txn2 proceeds -# txn2 discovers txn1's lock on k while writing (not ignored, waits) +# txn2 discovers and ignores txn1's lock on k while writing +# txn2 re-sequences +# +# txn2 redirected to left-hand side (i.e. 
replica acquires lease) # txn2 re-sequences -# txn2 discovers txn1's lock on k2 while reading (not ignored, waits) +# txn2 discovers txn1's lock on k while reading # txn2 re-sequences -# txn1 lock is released (ignored) -# txn2 proceeds and acquires lock (ignored) +# txn1 lock is released +# txn2 proceeds and acquires lock # ------------------------------------------------------------- -subtest on_range_merge - new-txn name=txn1 ts=10,1 epoch=0 ---- new-txn name=txn2 ts=10,1 epoch=0 ---- +new-txn name=txn3 ts=10,1 epoch=0 +---- + new-request name=req1 txn=txn1 ts=10,1 put key=k value=v - put key=k2 value=v ---- new-request name=req2 txn=txn2 ts=10,1 put key=k value=v - get key=k2 +---- + +new-request name=req3 txn=txn3 ts=10,1 + put key=k2 value=v ---- sequence req=req1 @@ -514,21 +564,15 @@ on-lock-acquired req=req1 key=k ---- [-] acquire lock: txn 00000001 @ k -on-lock-acquired req=req1 key=k2 ----- -[-] acquire lock: txn 00000001 @ k2 - finish req=req1 ---- [-] finish req1: finishing request debug-lock-table ---- -global: num=2 +global: num=1 lock: "k" holder: txn: 00000001-0000-0000-0000-000000000000, ts: 10.000000000,1, info: unrepl epoch: 0, seqs: [0] - lock: "k2" - holder: txn: 00000001-0000-0000-0000-000000000000, ts: 10.000000000,1, info: unrepl epoch: 0, seqs: [0] local: num=0 # -------------------------------- @@ -544,16 +588,21 @@ sequence req=req2 [2] sequence req2: lock wait-queue event: wait for (distinguished) txn 00000001 holding lock @ key "k" (queuedWriters: 1, queuedReaders: 0) [2] sequence req2: blocked on select in concurrency.(*lockTableWaiterImpl).WaitOn +sequence req=req3 +---- +[3] sequence req3: sequencing request +[3] sequence req3: acquiring latches +[3] sequence req3: scanning lock table for conflicting locks +[3] sequence req3: sequencing complete, returned guard + debug-lock-table ---- -global: num=2 +global: num=1 lock: "k" holder: txn: 00000001-0000-0000-0000-000000000000, ts: 10.000000000,1, info: unrepl epoch: 0, seqs: [0] queued writers: - active: true req: 7, txn: 00000002-0000-0000-0000-000000000000 - distinguished req: 7 - lock: "k2" - holder: txn: 00000001-0000-0000-0000-000000000000, ts: 10.000000000,1, info: unrepl epoch: 0, seqs: [0] + active: true req: 8, txn: 00000002-0000-0000-0000-000000000000 + distinguished req: 8 local: num=0 on-merge @@ -570,56 +619,112 @@ debug-lock-table global: num=0 local: num=0 -handle-write-intent-error req=req2 lease-seq=1 - intent txn=txn1 key=k ----- -[3] handle write intent error req2: pushing txn 00000001 to abort -[3] handle write intent error req2: blocked on select in concurrency_test.(*cluster).PushTransaction - -on-txn-updated txn=txn1 status=committed +on-lock-acquired req=req3 key=k2 ---- -[-] update txn: committing txn1 -[3] handle write intent error req2: resolving intent "k" for txn 00000001 with COMMITTED status -[3] handle write intent error req2: handled conflicting intents on "k", released latches +[-] acquire lock: txn 00000003 @ k2 debug-lock-table ---- global: num=0 local: num=0 -sequence req=req2 +finish req=req3 ---- -[4] sequence req2: re-sequencing request -[4] sequence req2: acquiring latches -[4] sequence req2: scanning lock table for conflicting locks -[4] sequence req2: sequencing complete, returned guard +[-] finish req3: finishing request handle-write-intent-error req=req2 lease-seq=1 - intent txn=txn1 key=k2 + intent txn=txn1 key=k ---- -[5] handle write intent error req2: pushing timestamp of txn 00000001 above 10.000000000,1 -[5] handle write intent error req2: resolving intent "k2" 
for txn 00000001 with COMMITTED status -[5] handle write intent error req2: handled conflicting intents on "k2", released latches +[4] handle write intent error req2: intent on "k" discovered but not added to disabled lock table +[4] handle write intent error req2: handled conflicting intents on "k", released latches -debug-lock-table +sequence req=req2 ---- -global: num=0 -local: num=0 +[5] sequence req2: re-sequencing request +[5] sequence req2: acquiring latches +[5] sequence req2: scanning lock table for conflicting locks +[5] sequence req2: sequencing complete, returned guard + +# RangeKeyMismatchError redirect to left-hand side range. +finish req=req2 +---- +[-] finish req2: finishing request + +on-lease-updated leaseholder=true lease-seq=2 +---- +[-] transfer lease: acquired sequence req=req2 ---- -[6] sequence req2: re-sequencing request +[6] sequence req2: sequencing request [6] sequence req2: acquiring latches [6] sequence req2: scanning lock table for conflicting locks [6] sequence req2: sequencing complete, returned guard +handle-write-intent-error req=req2 lease-seq=2 + intent txn=txn1 key=k +---- +[7] handle write intent error req2: handled conflicting intents on "k", released latches + +debug-lock-table +---- +global: num=1 + lock: "k" + holder: txn: 00000001-0000-0000-0000-000000000000, ts: 10.000000000,1, info: repl epoch: 0, seqs: [0] + queued writers: + active: false req: 10, txn: 00000002-0000-0000-0000-000000000000 +local: num=0 + +sequence req=req2 +---- +[8] sequence req2: re-sequencing request +[8] sequence req2: acquiring latches +[8] sequence req2: scanning lock table for conflicting locks +[8] sequence req2: waiting in lock wait-queues +[8] sequence req2: lock wait-queue event: wait for (distinguished) txn 00000001 holding lock @ key "k" (queuedWriters: 1, queuedReaders: 0) +[8] sequence req2: blocked on select in concurrency.(*lockTableWaiterImpl).WaitOn + +new-request name=reqRes1 txn=none ts=10,1 + resolve-intent txn=txn1 key=k status=committed +---- + +sequence req=reqRes1 +---- +[9] sequence reqRes1: sequencing request +[9] sequence reqRes1: acquiring latches +[9] sequence reqRes1: sequencing complete, returned guard + +on-lock-updated req=reqRes1 txn=txn1 key=k status=committed +---- +[-] update lock: committing txn 00000001 @ k +[8] sequence req2: lock wait-queue event: done waiting +[8] sequence req2: conflicted with 00000001-0000-0000-0000-000000000000 on "k" for 1.234s +[8] sequence req2: acquiring latches +[8] sequence req2: waiting to acquire write latch k@10.000000000,1, held by write latch k@10.000000000,1 +[8] sequence req2: blocked on select in spanlatch.(*Manager).waitForSignal + +finish req=reqRes1 +---- +[-] finish reqRes1: finishing request +[8] sequence req2: scanning lock table for conflicting locks +[8] sequence req2: sequencing complete, returned guard + +debug-lock-table +---- +global: num=1 + lock: "k" + res: req: 10, txn: 00000002-0000-0000-0000-000000000000, ts: 10.000000000,1, seq: 0 +local: num=0 + on-lock-acquired req=req2 key=k ---- [-] acquire lock: txn 00000002 @ k debug-lock-table ---- -global: num=0 +global: num=1 + lock: "k" + holder: txn: 00000002-0000-0000-0000-000000000000, ts: 10.000000000,1, info: unrepl epoch: 0, seqs: [0] local: num=0 finish req=req2 @@ -629,8 +734,6 @@ finish req=req2 reset namespace ---- -subtest end - # ------------------------------------------------------------- # OnReplicaSnapshotApplied - applying a snapshot clears the # lock-table but does not disable it. 
@@ -640,14 +743,12 @@ subtest end # Test: txn2 enters lock's wait-queue # replica applies snapshot # txn2 proceeds -# txn2 discovers txn1's lock (not ignored) +# txn2 discovers txn1's lock # txn2 queue's on txn1's lock -# txn1 lock is released (not ignored) -# txn2 proceeds and acquires lock (not ignored) +# txn1 lock is released +# txn2 proceeds and acquires lock # ------------------------------------------------------------- -subtest on_replica_snapshot_applied - new-txn name=txn1 ts=10,1 epoch=0 ---- @@ -703,8 +804,8 @@ global: num=1 lock: "k" holder: txn: 00000001-0000-0000-0000-000000000000, ts: 10.000000000,1, info: unrepl epoch: 0, seqs: [0] queued writers: - active: true req: 9, txn: 00000002-0000-0000-0000-000000000000 - distinguished req: 9 + active: true req: 12, txn: 00000002-0000-0000-0000-000000000000 + distinguished req: 12 local: num=0 on-snapshot-applied @@ -732,7 +833,7 @@ global: num=1 lock: "k" holder: txn: 00000001-0000-0000-0000-000000000000, ts: 10.000000000,1, info: repl epoch: 0, seqs: [0] queued writers: - active: false req: 9, txn: 00000002-0000-0000-0000-000000000000 + active: false req: 12, txn: 00000002-0000-0000-0000-000000000000 local: num=0 sequence req=req2 @@ -773,7 +874,7 @@ debug-lock-table ---- global: num=1 lock: "k" - res: req: 9, txn: 00000002-0000-0000-0000-000000000000, ts: 10.000000000,1, seq: 0 + res: req: 12, txn: 00000002-0000-0000-0000-000000000000, ts: 10.000000000,1, seq: 0 local: num=0 on-lock-acquired req=req2 key=k @@ -793,5 +894,3 @@ finish req=req2 reset namespace ---- - -subtest end diff --git a/pkg/kv/kvserver/replica_send.go b/pkg/kv/kvserver/replica_send.go index 068b4618b68f..5314c5f00fb1 100644 --- a/pkg/kv/kvserver/replica_send.go +++ b/pkg/kv/kvserver/replica_send.go @@ -503,9 +503,56 @@ func isConcurrencyRetryError(pErr *roachpb.Error) bool { // maybeAttachLease is used to augment a concurrency retry error with // information about the lease that the operation which hit this error was -// operating under. +// operating under. If the operation was performed on a follower that does not +// hold the lease (e.g. a follower read), the provided lease will be empty. func maybeAttachLease(pErr *roachpb.Error, lease *roachpb.Lease) *roachpb.Error { if wiErr, ok := pErr.GetDetail().(*roachpb.WriteIntentError); ok { + // If we hit an intent on the leaseholder, attach information about the + // lease to WriteIntentErrors, which is necessary to keep the lock-table + // in sync with the applied state. + // + // However, if we hit an intent during a follower read, the lock-table will + // be disabled, so we won't be able to use it to wait for the resolution of + // the intent. Instead of waiting locally, we replace the WriteIntentError + // with an InvalidLeaseError so that the request will be redirected to the + // leaseholder. Beyond implementation constraints, waiting for conflicting + // intents on the leaseholder instead of on a follower is preferable + // because: + // - the leaseholder is notified of and reactive to lock-table state + // transitions. + // - the leaseholder is able to more efficiently resolve intents, if + // necessary, without the risk of multiple follower<->leaseholder + // round-trips compounding. 
If the follower were to attempt to resolve + // multiple intents during a follower read then the PushTxn and + // ResolveIntent requests would quickly be more expensive (in terms of + // latency) than simply redirecting the entire read request to the + // leaseholder and letting the leaseholder coordinate the intent + // resolution. + // - after the leaseholder has received a response from a ResolveIntent + // request, it has a guarantee that the intent resolution has been applied + // locally and that no future read will observe the intent. This is not + // true on follower replicas. Due to the asynchronous nature of Raft, both + // due to quorum voting and due to async commit acknowledgement from + // leaders to followers, it is possible for a ResolveIntent request to + // complete and then for a future read on a follower to observe the + // pre-resolution state of the intent. This effect is transient and will + // eventually disappear once the follower catches up on its Raft log, but + // it creates an opportunity for momentary thrashing if a follower read + // were to resolve an intent and then immediately attempt to read again. + // + // This behavior of redirecting follower read attempts to the leaseholder + // replica if they encounter conflicting intents on a follower means that + // follower read eligibility is a function of the "resolved timestamp" over + // a read's key span, and not just the "closed timestamp" over its key span. + // Architecturally, this is consistent with Google Spanner, which maintains a + // concept of "safe time", "paxos safe time", and "transaction manager safe + // time". "safe time" is analogous to the "resolved timestamp" in + // CockroachDB and "paxos safe time" is analogous to the "closed timestamp" + // in CockroachDB. In Spanner, it is the "safe time" of a replica that + // determines follower read eligibility. + if lease.Empty() /* followerRead */ { + return roachpb.NewErrorWithTxn(&roachpb.InvalidLeaseError{}, pErr.GetTxn()) + } wiErr.LeaseSequence = lease.Sequence return roachpb.NewErrorWithTxn(wiErr, pErr.GetTxn()) }
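The maybeAttachLease comment above frames follower-read eligibility in terms of a "resolved timestamp" over the read's span rather than just the "closed timestamp". The standalone Go sketch below illustrates that framing only; it uses hypothetical names and integer timestamps, not CockroachDB's actual types or API, and it is deliberately conservative: the patch itself only redirects if a conflicting intent is actually encountered during evaluation, whereas the sketch treats the whole resolved/closed gap as ineligible.

package main

import "fmt"

// replicaState is a toy model of what a follower knows. Timestamps are
// simplified to int64 values for illustration.
type replicaState struct {
	closedTS   int64 // highest timestamp closed out by the leaseholder
	resolvedTS int64 // closedTS ratcheted down below any unresolved intent
}

// canServeFollowerRead reports whether a follower read at readTS could be
// served locally under the scheme described in the patch: reads above the
// closed timestamp are never served locally, and reads between the resolved
// and closed timestamps may run into an unresolved intent, in which case the
// request is redirected to the leaseholder instead of waiting on the follower.
func canServeFollowerRead(s replicaState, readTS int64) (bool, string) {
	switch {
	case readTS > s.closedTS:
		return false, "timestamp not closed: redirect to leaseholder"
	case readTS > s.resolvedTS:
		return false, "possible conflicting intent: redirect to leaseholder"
	default:
		return true, "served locally"
	}
}

func main() {
	s := replicaState{closedTS: 100, resolvedTS: 90}
	for _, ts := range []int64{80, 95, 120} {
		ok, why := canServeFollowerRead(s, ts)
		fmt.Printf("read@%d -> ok=%t (%s)\n", ts, ok, why)
	}
}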
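From a caller's perspective, the behavioral change is that a follower read which encounters an intent below the closed timestamp now returns a NotLeaseHolderError (via the InvalidLeaseError mapping) instead of blocking while the follower pushes the intent's transaction, which is also what the updated test asserts. The following minimal helper is purely illustrative; isRedirectToLeaseholder and the package name are hypothetical, though the roachpb and errors APIs it uses appear in the diff above.

package kvclientutil // hypothetical package, for illustration only

import (
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/errors"
)

// isRedirectToLeaseholder reports whether a batch error represents the
// "retry at the leaseholder" outcome that a follower read now produces when
// it runs into a conflicting intent below the closed timestamp.
func isRedirectToLeaseholder(pErr *roachpb.Error) bool {
	if pErr == nil {
		return false
	}
	var nlhe *roachpb.NotLeaseHolderError
	return errors.As(pErr.GoError(), &nlhe)
}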