From c57d6d003b85514ec2295d74dd51d6cd02d32941 Mon Sep 17 00:00:00 2001
From: Nathan VanBenschoten
Date: Wed, 18 Mar 2020 01:08:49 -0400
Subject: [PATCH] kv: immediately push on WriteIntentError when lock-table disabled

Fixes #46148.

This commit fixes a bug where follower reads that hit intents could get
stuck in an indefinite loop: the read would run into the intent during
evaluation, fail to add the intent to the lock-table because the
lock-table was disabled, sequence in the concurrency manager without
issue, and repeat. The new TestClosedTimestampCanServeWithConflictingIntent
test hits exactly this issue when run without this fix.

The fix implemented here is to immediately push the transaction
responsible for an intent when serving a follower read (i.e. when a
replica's lock-table is disabled). This ensures that the intent gets
cleaned up if it was abandoned and avoids the busy loop we see today.
If/when lockTables are maintained on follower replicas by propagating
lockTable state transitions through the Raft log in the
ReplicatedEvalResult instead of through the (leaseholder-only)
LocalResult, we should be able to remove the lockTable "disabled" state
and, in turn, remove this special case.

The alternative approach floated to address this was to simply pass a
NotLeaseHolderError back to the client when an intent is hit on a
follower. This would have avoided the infinite loop, but it seems like a
short-term patch that doesn't get to the root of the issue. As we push
further on follower reads (or even consistent read replicas), we want
non-leaseholders to be able to perform conflict resolution. Falling back
to the leaseholder works counter to this goal. The approach implemented
by this commit works towards the goal by keeping conflict resolution on
the follower, simply falling back to the previous, sub-optimal strategy
of pushing immediately during conflicts.

Release note (bug fix): Follower reads that hit intents no longer have a
chance of entering an infinite loop. This bug was present in earlier
versions of the v20.1 release.

Release justification: fixes a high-priority bug where follower reads
could get stuck indefinitely if they hit an abandoned intent.
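For illustration, the core of the fix is the new fallback path in
HandleWriterIntentError, condensed here from the concurrency_manager.go
change below (latch bookkeeping and assertions simplified; all names are
taken from the diff itself):

    // Try to add each discovered intent to the lock-table. AddDiscoveredLock
    // now reports whether the lock was actually tracked; it returns false
    // when the lock-table is disabled (e.g. on a follower replica).
    wait := false
    for i := range t.Intents {
        added, err := m.lt.AddDiscoveredLock(&t.Intents[i], g.ltg)
        if err != nil {
            log.Fatal(ctx, errors.HandleAsAssertionFailure(err))
        }
        if !added {
            wait = true
        }
    }

    // Drop latches; requests tracked by the lock-table remain in its
    // wait-queues and will re-sequence as before.
    m.lm.Release(g.moveLatchGuard())

    // If the lock-table could not track the intents, push the conflicting
    // transaction immediately so abandoned intents get resolved, instead of
    // re-sequencing, rediscovering the same intents, and looping forever.
    if wait {
        for i := range t.Intents {
            if err := m.ltw.WaitOnLock(ctx, g.Req, &t.Intents[i]); err != nil {
                m.FinishReq(g)
                return nil, err
            }
        }
    }
    return g, nil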
--- pkg/kv/kvserver/closed_timestamp_test.go | 76 +++++++ .../concurrency/concurrency_control.go | 26 ++- .../concurrency/concurrency_manager.go | 28 ++- .../concurrency/concurrency_manager_test.go | 20 +- pkg/kv/kvserver/concurrency/lock_table.go | 32 +-- .../kvserver/concurrency/lock_table_test.go | 2 +- .../kvserver/concurrency/lock_table_waiter.go | 21 ++ .../concurrency_manager/discovered_lock | 187 ++++++++++++++++-- .../concurrency_manager/range_state_listener | 170 ++++++++++------ .../testdata/concurrency_manager/uncertainty | 44 ++--- pkg/kv/kvserver/replica_send.go | 2 +- 11 files changed, 488 insertions(+), 120 deletions(-) diff --git a/pkg/kv/kvserver/closed_timestamp_test.go b/pkg/kv/kvserver/closed_timestamp_test.go index b5dbf085c8af..d30967866698 100644 --- a/pkg/kv/kvserver/closed_timestamp_test.go +++ b/pkg/kv/kvserver/closed_timestamp_test.go @@ -15,11 +15,14 @@ import ( gosql "database/sql" "fmt" "math/rand" + "strconv" "sync/atomic" "testing" "time" "github.com/cockroachdb/cockroach/pkg/base" + "github.com/cockroachdb/cockroach/pkg/kv" + "github.com/cockroachdb/cockroach/pkg/kv/kvclient/kvcoord" "github.com/cockroachdb/cockroach/pkg/kv/kvserver" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" @@ -172,6 +175,79 @@ func TestClosedTimestampCanServeThroughoutLeaseTransfer(t *testing.T) { } } +// TestClosedTimestampCanServeWithConflictingIntent validates that a read served +// from a follower replica will wait on conflicting intents and ensure that they +// are cleaned up if necessary to allow the read to proceed. +func TestClosedTimestampCanServeWithConflictingIntent(t *testing.T) { + defer leaktest.AfterTest(t)() + + ctx := context.Background() + tc, _, desc, repls := setupTestClusterForClosedTimestampTesting(ctx, t, testingTargetDuration) + defer tc.Stopper().Stop(ctx) + ds := tc.Server(0).DistSenderI().(*kvcoord.DistSender) + + // Write N different intents for the same transaction, where N is the number + // of replicas in the testing range. Each intent will be read and eventually + // resolved by a read on a different replica. + txnKey := desc.StartKey.AsRawKey() + txnKey = txnKey[:len(txnKey):len(txnKey)] // avoid aliasing + txn := roachpb.MakeTransaction("txn", txnKey, 0, tc.Server(0).Clock().Now(), 0) + var keys []roachpb.Key + for i := range repls { + key := append(txnKey, []byte(strconv.Itoa(i))...) + keys = append(keys, key) + put := putArgs(key, []byte("val")) + resp, err := kv.SendWrappedWith(ctx, ds, roachpb.Header{Txn: &txn}, put) + if err != nil { + t.Fatal(err) + } + txn.Update(resp.Header().Txn) + } + + // Read a different intent on each replica. All should begin waiting on the + // intents by pushing the transaction that wrote them. None should complete. + ts := txn.WriteTimestamp + respCh := make(chan struct{}, len(keys)) + for i, key := range keys { + go func(repl *kvserver.Replica, key roachpb.Key) { + var baRead roachpb.BatchRequest + r := &roachpb.ScanRequest{} + r.Key = key + r.EndKey = key.Next() + baRead.Add(r) + baRead.Timestamp = ts + baRead.RangeID = desc.RangeID + + testutils.SucceedsSoon(t, func() error { + // Expect 0 rows, because the intents will be aborted. + _, err := expectRows(0)(repl.Send(ctx, baRead)) + return err + }) + respCh <- struct{}{} + }(repls[i], key) + } + + select { + case <-respCh: + t.Fatal("request unexpectedly succeeded, should block") + case <-time.After(20 * time.Millisecond): + } + + // Abort the transaction. 
All pushes should succeed and all intents should + // be resolved, allowing all reads (on the leaseholder and on followers) to + // proceed and finish. + endTxn := &roachpb.EndTxnRequest{ + RequestHeader: roachpb.RequestHeader{Key: txn.Key}, + Commit: false, + } + if _, err := kv.SendWrappedWith(ctx, ds, roachpb.Header{Txn: &txn}, endTxn); err != nil { + t.Fatal(err) + } + for range keys { + <-respCh + } +} + // TestClosedTimestampCanServeAfterSplitsAndMerges validates the invariant that // if a timestamp is safe for reading on both the left side and right side of a // a merge then it will be safe after the merge and that if a timestamp is safe diff --git a/pkg/kv/kvserver/concurrency/concurrency_control.go b/pkg/kv/kvserver/concurrency/concurrency_control.go index 1fc035413fb6..efbb42f907d1 100644 --- a/pkg/kv/kvserver/concurrency/concurrency_control.go +++ b/pkg/kv/kvserver/concurrency/concurrency_control.go @@ -196,6 +196,8 @@ type ContentionHandler interface { // error in the lock's wait-queue (but does not wait) and releases the // guard's latches. It returns an updated guard reflecting this change. // After the method returns, the original guard should no longer be used. + // If an error is returned then the provided guard will be released and no + // guard will be returned. // // Example usage: Txn A scans the lock table and does not see an intent on // key K from txn B because the intent is not being tracked in the lock @@ -204,7 +206,7 @@ type ContentionHandler interface { // method before txn A retries its scan. During the retry, txn A scans the // lock table and observes the lock on key K, so it enters the lock's // wait-queue and waits for it to be resolved. - HandleWriterIntentError(context.Context, *Guard, *roachpb.WriteIntentError) *Guard + HandleWriterIntentError(context.Context, *Guard, *roachpb.WriteIntentError) (*Guard, *Error) // HandleTransactionPushError consumes a TransactionPushError thrown by a // PushTxnRequest by informing the concurrency manager about a transaction @@ -474,7 +476,11 @@ type lockTable interface { // // A latch consistent with the access desired by the guard must be held on // the span containing the discovered lock's key. - AddDiscoveredLock(*roachpb.Intent, lockTableGuard) error + // + // The method returns a boolean indicating whether the discovered lock was + // added to the lockTable (true) or whether it was ignored because the + // lockTable is currently disabled (false). + AddDiscoveredLock(*roachpb.Intent, lockTableGuard) (bool, error) // AcquireLock informs the lockTable that a new lock was acquired or an // existing lock was updated. @@ -610,6 +616,22 @@ type lockTableWaiter interface { // wait-queues and it is safe to re-acquire latches and scan the lockTable // again. WaitOn(context.Context, Request, lockTableGuard) *Error + + // WaitOnLock waits on the transaction responsible for the specified lock + // and then ensures that the lock is cleared out of the request's way. + // + // The method should be called after dropping any latches that a request has + // acquired. It returns when the lock has been resolved. + // + // NOTE: this method is used when the lockTable is disabled (e.g. on a + // follower replica) and a lock is discovered that must be waited on (e.g. + // during a follower read). 
If/when lockTables are maintained on follower + // replicas by propagating lockTable state transitions through the Raft log + // in the ReplicatedEvalResult instead of through the (leaseholder-only) + // LocalResult, we should be able to remove the lockTable "disabled" state + // and, in turn, remove this method. This will likely fall out of pulling + // all replicated locks into the lockTable. + WaitOnLock(context.Context, Request, *roachpb.Intent) *Error } // txnWaitQueue holds a collection of wait-queues for transaction records. diff --git a/pkg/kv/kvserver/concurrency/concurrency_manager.go b/pkg/kv/kvserver/concurrency/concurrency_manager.go index d6f5bf8af095..6e35fc5b14c2 100644 --- a/pkg/kv/kvserver/concurrency/concurrency_manager.go +++ b/pkg/kv/kvserver/concurrency/concurrency_manager.go @@ -244,19 +244,25 @@ func (m *managerImpl) FinishReq(g *Guard) { // HandleWriterIntentError implements the ContentionHandler interface. func (m *managerImpl) HandleWriterIntentError( ctx context.Context, g *Guard, t *roachpb.WriteIntentError, -) *Guard { +) (*Guard, *Error) { if g.ltg == nil { log.Fatalf(ctx, "cannot handle WriteIntentError %v for request without "+ "lockTableGuard; were lock spans declared for this request?", t) } // Add a discovered lock to lock-table for each intent and enter each lock's - // wait-queue. + // wait-queue. If the lock-table is disabled and one or more of the intents + // are ignored then we immediately wait on all intents. + wait := false for i := range t.Intents { intent := &t.Intents[i] - if err := m.lt.AddDiscoveredLock(intent, g.ltg); err != nil { + added, err := m.lt.AddDiscoveredLock(intent, g.ltg) + if err != nil { log.Fatal(ctx, errors.HandleAsAssertionFailure(err)) } + if !added { + wait = true + } } // Release the Guard's latches but continue to remain in lock wait-queues by @@ -264,7 +270,21 @@ func (m *managerImpl) HandleWriterIntentError( // then re-sequence the Request by calling SequenceReq with the un-latched // Guard. This is analogous to iterating through the loop in SequenceReq. m.lm.Release(g.moveLatchGuard()) - return g + + // If the lockTable was disabled then we need to immediately wait on the + // intents to ensure that they are resolved and moved out of the request's + // way. + if wait { + for i := range t.Intents { + intent := &t.Intents[i] + if err := m.ltw.WaitOnLock(ctx, g.Req, intent); err != nil { + m.FinishReq(g) + return nil, err + } + } + } + + return g, nil } // HandleTransactionPushError implements the ContentionHandler interface. 
diff --git a/pkg/kv/kvserver/concurrency/concurrency_manager_test.go b/pkg/kv/kvserver/concurrency/concurrency_manager_test.go index a5155a18b103..12fb8aadc7b5 100644 --- a/pkg/kv/kvserver/concurrency/concurrency_manager_test.go +++ b/pkg/kv/kvserver/concurrency/concurrency_manager_test.go @@ -221,7 +221,7 @@ func TestConcurrencyManagerBasic(t *testing.T) { case "handle-write-intent-error": var reqName string d.ScanArgs(t, "req", &reqName) - guard, ok := c.guardsByReqName[reqName] + prev, ok := c.guardsByReqName[reqName] if !ok { d.Fatalf(t, "unknown request: %s", reqName) } @@ -237,12 +237,22 @@ func TestConcurrencyManagerBasic(t *testing.T) { d.ScanArgs(t, "key", &key) opName := fmt.Sprintf("handle write intent error %s", reqName) - mon.runSync(opName, func(ctx context.Context) { - err := &roachpb.WriteIntentError{Intents: []roachpb.Intent{ + mon.runAsync(opName, func(ctx context.Context) { + wiErr := &roachpb.WriteIntentError{Intents: []roachpb.Intent{ roachpb.MakeIntent(&txn.TxnMeta, roachpb.Key(key)), }} - log.Eventf(ctx, "handling %v", err) - guard = m.HandleWriterIntentError(ctx, guard, err) + guard, err := m.HandleWriterIntentError(ctx, prev, wiErr) + if err != nil { + log.Eventf(ctx, "handled %v, returned error: %v", wiErr, err) + c.mu.Lock() + delete(c.guardsByReqName, reqName) + c.mu.Unlock() + } else { + log.Eventf(ctx, "handled %v, released latches", wiErr) + c.mu.Lock() + c.guardsByReqName[reqName] = guard + c.mu.Unlock() + } }) return c.waitAndCollect(t, mon) diff --git a/pkg/kv/kvserver/concurrency/lock_table.go b/pkg/kv/kvserver/concurrency/lock_table.go index 6dd145d73d4c..31a330451d00 100644 --- a/pkg/kv/kvserver/concurrency/lock_table.go +++ b/pkg/kv/kvserver/concurrency/lock_table.go @@ -1729,22 +1729,20 @@ func (t *lockTableImpl) Dequeue(guard lockTableGuard) { } // AddDiscoveredLock implements the lockTable interface. -func (t *lockTableImpl) AddDiscoveredLock(intent *roachpb.Intent, guard lockTableGuard) error { +func (t *lockTableImpl) AddDiscoveredLock( + intent *roachpb.Intent, guard lockTableGuard, +) (added bool, _ error) { t.enabledMu.RLock() defer t.enabledMu.RUnlock() if !t.enabled { // If not enabled, don't track any locks. - return nil + return false, nil } g := guard.(*lockTableGuardImpl) key := intent.Key - ss := spanset.SpanGlobal - if keys.IsLocal(key) { - ss = spanset.SpanLocal - } - sa, err := findAccessInSpans(key, ss, g.spans) + sa, ss, err := findAccessInSpans(key, g.spans) if err != nil { - return err + return false, err } var l *lockState tree := &t.locks[ss] @@ -1763,7 +1761,7 @@ func (t *lockTableImpl) AddDiscoveredLock(intent *roachpb.Intent, guard lockTabl } else { l = iter.Cur() } - return l.discoveredLock(&intent.Txn, intent.Txn.WriteTimestamp, g, sa) + return true, l.discoveredLock(&intent.Txn, intent.Txn.WriteTimestamp, g, sa) } // AcquireLock implements the lockTable interface. @@ -1856,11 +1854,15 @@ func (t *lockTableImpl) tryClearLocks(force bool) { } } -// Given the key with scope ss must be in spans, returns the strongest access -// specified in the spans. +// Given the key must be in spans, returns the strongest access +// specified in the spans, along with the scope of the key. 
func findAccessInSpans( - key roachpb.Key, ss spanset.SpanScope, spans *spanset.SpanSet, -) (spanset.SpanAccess, error) { + key roachpb.Key, spans *spanset.SpanSet, +) (spanset.SpanAccess, spanset.SpanScope, error) { + ss := spanset.SpanGlobal + if keys.IsLocal(key) { + ss = spanset.SpanLocal + } for sa := spanset.NumSpanAccess - 1; sa >= 0; sa-- { s := spans.GetSpans(sa, ss) // First span that starts after key @@ -1869,10 +1871,10 @@ func findAccessInSpans( }) if i > 0 && ((len(s[i-1].EndKey) > 0 && key.Compare(s[i-1].EndKey) < 0) || key.Equal(s[i-1].Key)) { - return sa, nil + return sa, ss, nil } } - return spanset.NumSpanAccess, errors.Errorf("caller violated contract") + return 0, 0, errors.Errorf("caller violated contract") } // Tries to GC locks that were previously known to have become empty. diff --git a/pkg/kv/kvserver/concurrency/lock_table_test.go b/pkg/kv/kvserver/concurrency/lock_table_test.go index cff11d6b72b5..9c16286b4717 100644 --- a/pkg/kv/kvserver/concurrency/lock_table_test.go +++ b/pkg/kv/kvserver/concurrency/lock_table_test.go @@ -333,7 +333,7 @@ func TestLockTableBasic(t *testing.T) { d.Fatalf(t, "unknown txn %s", txnName) } intent := roachpb.MakeIntent(txnMeta, roachpb.Key(key)) - if err := lt.AddDiscoveredLock(&intent, g); err != nil { + if _, err := lt.AddDiscoveredLock(&intent, g); err != nil { return err.Error() } return lt.(*lockTableImpl).String() diff --git a/pkg/kv/kvserver/concurrency/lock_table_waiter.go b/pkg/kv/kvserver/concurrency/lock_table_waiter.go index 02ff7addb79f..3347c159dede 100644 --- a/pkg/kv/kvserver/concurrency/lock_table_waiter.go +++ b/pkg/kv/kvserver/concurrency/lock_table_waiter.go @@ -15,6 +15,7 @@ import ( "math" "time" + "github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency/lock" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/intentresolver" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/spanset" "github.com/cockroachdb/cockroach/pkg/roachpb" @@ -298,6 +299,26 @@ func (w *lockTableWaiterImpl) WaitOn( } } +// WaitOnLock implements the lockTableWaiter interface. +func (w *lockTableWaiterImpl) WaitOnLock( + ctx context.Context, req Request, intent *roachpb.Intent, +) *Error { + sa, _, err := findAccessInSpans(intent.Key, req.LockSpans) + if err != nil { + return roachpb.NewError(err) + } + return w.pushLockTxn(ctx, req, waitingState{ + stateKind: waitFor, + txn: &intent.Txn, + ts: intent.Txn.WriteTimestamp, + dur: lock.Replicated, + key: intent.Key, + held: true, + access: spanset.SpanReadWrite, + guardAccess: sa, + }) +} + // pushLockTxn pushes the holder of the provided lock. // // The method blocks until the lock holder transaction experiences a state diff --git a/pkg/kv/kvserver/concurrency/testdata/concurrency_manager/discovered_lock b/pkg/kv/kvserver/concurrency/testdata/concurrency_manager/discovered_lock index 3f0df0f6b935..e0fb5f43f13c 100644 --- a/pkg/kv/kvserver/concurrency/testdata/concurrency_manager/discovered_lock +++ b/pkg/kv/kvserver/concurrency/testdata/concurrency_manager/discovered_lock @@ -1,6 +1,6 @@ # ------------------------------------------------------------- # Read-only request runs into replicated intent. It informs the -# lock table and waits for the intent to be resolved. +# lock-table and waits for the intent to be resolved. 
# ------------------------------------------------------------- new-txn name=txn1 ts=10,1 epoch=0 @@ -22,7 +22,7 @@ sequence req=req1 handle-write-intent-error req=req1 txn=txn1 key=k ---- -[-] handle write intent error req1: handling conflicting intents on "k" +[2] handle write intent error req1: handled conflicting intents on "k", released latches debug-lock-table ---- @@ -33,24 +33,185 @@ local: num=0 sequence req=req1 ---- -[2] sequence req1: re-sequencing request -[2] sequence req1: acquiring latches -[2] sequence req1: scanning lock table for conflicting locks -[2] sequence req1: waiting in lock wait-queues -[2] sequence req1: pushing timestamp of txn 00000001 above 0.000000012,1 -[2] sequence req1: blocked on select in concurrency_test.(*cluster).PushTransaction +[3] sequence req1: re-sequencing request +[3] sequence req1: acquiring latches +[3] sequence req1: scanning lock table for conflicting locks +[3] sequence req1: waiting in lock wait-queues +[3] sequence req1: pushing timestamp of txn 00000001 above 0.000000012,1 +[3] sequence req1: blocked on select in concurrency_test.(*cluster).PushTransaction on-txn-updated txn=txn1 status=aborted ---- [-] update txn: aborting txn1 -[2] sequence req1: resolving intent "k" for txn 00000001 with ABORTED status -[2] sequence req1: acquiring latches -[2] sequence req1: scanning lock table for conflicting locks -[2] sequence req1: sequencing complete, returned guard +[3] sequence req1: resolving intent "k" for txn 00000001 with ABORTED status +[3] sequence req1: acquiring latches +[3] sequence req1: scanning lock table for conflicting locks +[3] sequence req1: sequencing complete, returned guard finish req=req1 ---- [-] finish req1: finishing request -reset +reset namespace +---- + +# ------------------------------------------------------------- +# Read-only request runs into replicated intent while the +# lock-table is disabled. The lock-table cannot store the lock, +# so the request is forced to push (PUSH_TIMESTAMP) immediately. 
+# ------------------------------------------------------------- + +new-txn name=txn1 ts=10,1 epoch=0 +---- + +new-txn name=txn2 ts=12,1 epoch=0 +---- + +new-request name=req1 txn=txn2 ts=12,1 + get key=k +---- + +on-lease-updated leaseholder=false +---- +[-] transfer lease: released + +sequence req=req1 +---- +[1] sequence req1: sequencing request +[1] sequence req1: acquiring latches +[1] sequence req1: scanning lock table for conflicting locks +[1] sequence req1: sequencing complete, returned guard + +handle-write-intent-error req=req1 txn=txn1 key=k +---- +[2] handle write intent error req1: pushing timestamp of txn 00000001 above 0.000000012,1 +[2] handle write intent error req1: blocked on select in concurrency_test.(*cluster).PushTransaction + +on-txn-updated txn=txn1 status=aborted +---- +[-] update txn: aborting txn1 +[2] handle write intent error req1: resolving intent "k" for txn 00000001 with ABORTED status +[2] handle write intent error req1: handled conflicting intents on "k", released latches + +debug-lock-table +---- +global: num=0 +local: num=0 + +sequence req=req1 +---- +[3] sequence req1: re-sequencing request +[3] sequence req1: acquiring latches +[3] sequence req1: scanning lock table for conflicting locks +[3] sequence req1: sequencing complete, returned guard + +finish req=req1 +---- +[-] finish req1: finishing request + +reset namespace +---- + +# ------------------------------------------------------------- +# Read-write request runs into replicated intent while the +# lock-table is disabled. The lock-table cannot store the lock, +# so the request is forced to push (PUSH_ABORT) immediately. +# ------------------------------------------------------------- + +new-txn name=txn1 ts=10,1 epoch=0 +---- + +new-txn name=txn2 ts=12,1 epoch=0 +---- + +new-request name=req1 txn=txn2 ts=12,1 + put key=k value=v +---- + +on-lease-updated leaseholder=false +---- +[-] transfer lease: released + +sequence req=req1 +---- +[1] sequence req1: sequencing request +[1] sequence req1: acquiring latches +[1] sequence req1: scanning lock table for conflicting locks +[1] sequence req1: sequencing complete, returned guard + +handle-write-intent-error req=req1 txn=txn1 key=k +---- +[2] handle write intent error req1: pushing txn 00000001 to abort +[2] handle write intent error req1: blocked on select in concurrency_test.(*cluster).PushTransaction + +on-txn-updated txn=txn1 status=aborted +---- +[-] update txn: aborting txn1 +[2] handle write intent error req1: resolving intent "k" for txn 00000001 with ABORTED status +[2] handle write intent error req1: handled conflicting intents on "k", released latches + +debug-lock-table +---- +global: num=0 +local: num=0 + +sequence req=req1 +---- +[3] sequence req1: re-sequencing request +[3] sequence req1: acquiring latches +[3] sequence req1: scanning lock table for conflicting locks +[3] sequence req1: sequencing complete, returned guard + +finish req=req1 +---- +[-] finish req1: finishing request + +reset namespace +---- + +# ------------------------------------------------------------- +# Read-write request runs into replicated intent while the +# lock-table is disabled. The lock-table cannot store the lock, +# so the request is forced to push (PUSH_ABORT) immediately. +# The request's own transaction is aborted while pushing. 
+# ------------------------------------------------------------- + +new-txn name=txn1 ts=10,1 epoch=0 +---- + +new-txn name=txn2 ts=12,1 epoch=0 +---- + +new-request name=req1 txn=txn2 ts=12,1 + get key=k +---- + +on-lease-updated leaseholder=false +---- +[-] transfer lease: released + +sequence req=req1 +---- +[1] sequence req1: sequencing request +[1] sequence req1: acquiring latches +[1] sequence req1: scanning lock table for conflicting locks +[1] sequence req1: sequencing complete, returned guard + +handle-write-intent-error req=req1 txn=txn1 key=k +---- +[2] handle write intent error req1: pushing timestamp of txn 00000001 above 0.000000012,1 +[2] handle write intent error req1: blocked on select in concurrency_test.(*cluster).PushTransaction + +on-txn-updated txn=txn2 status=aborted +---- +[-] update txn: aborting txn2 +[2] handle write intent error req1: detected pusher aborted +[2] handle write intent error req1: handled conflicting intents on "k", returned error: TransactionAbortedError(ABORT_REASON_PUSHER_ABORTED): + +debug-lock-table +---- +global: num=0 +local: num=0 + +reset namespace ---- diff --git a/pkg/kv/kvserver/concurrency/testdata/concurrency_manager/range_state_listener b/pkg/kv/kvserver/concurrency/testdata/concurrency_manager/range_state_listener index 02e13a0f108b..410289b1d21f 100644 --- a/pkg/kv/kvserver/concurrency/testdata/concurrency_manager/range_state_listener +++ b/pkg/kv/kvserver/concurrency/testdata/concurrency_manager/range_state_listener @@ -10,12 +10,14 @@ debug-disable-txn-pushes # OnRangeLeaseUpdated - losing this lease disables the # lock-table and acquiring the lease enables the lock-table. # -# Setup: txn1 acquires lock +# Setup: txn1 acquires locks on k and k2 # # Test: txn2 enters lock's wait-queue # replica loses lease # txn2 proceeds -# txn2 discovers txn1's lock (ignored) +# txn2 discovers txn1's lock on k while writing (not ignored, waits) +# txn2 re-sequences +# txn2 discovers txn1's lock on k2 while reading (not ignored, waits) # txn2 re-sequences # txn1 lock is released (ignored) # txn2 proceeds and acquires lock (ignored) @@ -39,11 +41,13 @@ new-txn name=txn3 ts=10,1 epoch=0 ---- new-request name=req1 txn=txn1 ts=10,1 - put key=k value=v + put key=k value=v + put key=k2 value=v ---- new-request name=req2 txn=txn2 ts=10,1 - put key=k value=v + put key=k value=v + get key=k2 ---- new-request name=req3 txn=txn3 ts=10,1 @@ -61,15 +65,21 @@ on-lock-acquired txn=txn1 key=k ---- [-] acquire lock: txn1 @ k +on-lock-acquired txn=txn1 key=k2 +---- +[-] acquire lock: txn1 @ k2 + finish req=req1 ---- [-] finish req1: finishing request debug-lock-table ---- -global: num=1 +global: num=2 lock: "k" holder: txn: 00000001-0000-0000-0000-000000000000, ts: 0.000000010,1, info: unrepl epoch: 0, seqs: [0] + lock: "k2" + holder: txn: 00000001-0000-0000-0000-000000000000, ts: 0.000000010,1, info: unrepl epoch: 0, seqs: [0] local: num=0 # -------------------------------- @@ -86,12 +96,14 @@ sequence req=req2 debug-lock-table ---- -global: num=1 +global: num=2 lock: "k" holder: txn: 00000001-0000-0000-0000-000000000000, ts: 0.000000010,1, info: unrepl epoch: 0, seqs: [0] queued writers: active: true req: 2, txn: 00000002-0000-0000-0000-000000000000 distinguished req: 2 + lock: "k2" + holder: txn: 00000001-0000-0000-0000-000000000000, ts: 0.000000010,1, info: unrepl epoch: 0, seqs: [0] local: num=0 # Replica loses lease. 
@@ -109,7 +121,14 @@ local: num=0 handle-write-intent-error req=req2 txn=txn1 key=k ---- -[-] handle write intent error req2: handling conflicting intents on "k" +[3] handle write intent error req2: pushing txn 00000001 to abort +[3] handle write intent error req2: blocked on select in concurrency_test.(*cluster).PushTransaction + +on-txn-updated txn=txn1 status=committed +---- +[-] update txn: committing txn1 +[3] handle write intent error req2: resolving intent "k" for txn 00000001 with COMMITTED status +[3] handle write intent error req2: handled conflicting intents on "k", released latches debug-lock-table ---- @@ -118,20 +137,29 @@ local: num=0 sequence req=req2 ---- -[3] sequence req2: re-sequencing request -[3] sequence req2: acquiring latches -[3] sequence req2: scanning lock table for conflicting locks -[3] sequence req2: sequencing complete, returned guard +[4] sequence req2: re-sequencing request +[4] sequence req2: acquiring latches +[4] sequence req2: scanning lock table for conflicting locks +[4] sequence req2: sequencing complete, returned guard -on-lock-updated txn=txn1 key=k status=committed +handle-write-intent-error req=req2 txn=txn1 key=k2 ---- -[-] update lock: committing txn1 @ k +[5] handle write intent error req2: pushing timestamp of txn 00000001 above 0.000000010,1 +[5] handle write intent error req2: resolving intent "k2" for txn 00000001 with COMMITTED status +[5] handle write intent error req2: handled conflicting intents on "k2", released latches debug-lock-table ---- global: num=0 local: num=0 +sequence req=req2 +---- +[6] sequence req2: re-sequencing request +[6] sequence req2: acquiring latches +[6] sequence req2: scanning lock table for conflicting locks +[6] sequence req2: sequencing complete, returned guard + on-lock-acquired txn=txn2 key=k ---- [-] acquire lock: txn2 @ k @@ -157,14 +185,14 @@ local: num=0 sequence req=req3 ---- -[4] sequence req3: sequencing request -[4] sequence req3: acquiring latches -[4] sequence req3: scanning lock table for conflicting locks -[4] sequence req3: sequencing complete, returned guard +[7] sequence req3: sequencing request +[7] sequence req3: acquiring latches +[7] sequence req3: scanning lock table for conflicting locks +[7] sequence req3: sequencing complete, returned guard handle-write-intent-error req=req3 txn=txn2 key=k ---- -[-] handle write intent error req3: handling conflicting intents on "k" +[8] handle write intent error req3: handled conflicting intents on "k", released latches debug-lock-table ---- @@ -177,11 +205,11 @@ local: num=0 sequence req=req3 ---- -[5] sequence req3: re-sequencing request -[5] sequence req3: acquiring latches -[5] sequence req3: scanning lock table for conflicting locks -[5] sequence req3: waiting in lock wait-queues -[5] sequence req3: blocked on select in concurrency.(*lockTableWaiterImpl).WaitOn +[9] sequence req3: re-sequencing request +[9] sequence req3: acquiring latches +[9] sequence req3: scanning lock table for conflicting locks +[9] sequence req3: waiting in lock wait-queues +[9] sequence req3: blocked on select in concurrency.(*lockTableWaiterImpl).WaitOn debug-lock-table ---- @@ -196,9 +224,9 @@ local: num=0 on-lock-updated txn=txn2 key=k status=committed ---- [-] update lock: committing txn2 @ k -[5] sequence req3: acquiring latches -[5] sequence req3: scanning lock table for conflicting locks -[5] sequence req3: sequencing complete, returned guard +[9] sequence req3: acquiring latches +[9] sequence req3: scanning lock table for conflicting locks +[9] sequence req3: 
sequencing complete, returned guard debug-lock-table ---- @@ -316,7 +344,7 @@ local: num=0 handle-write-intent-error req=req2 txn=txn1 key=k ---- -[-] handle write intent error req2: handling conflicting intents on "k" +[3] handle write intent error req2: handled conflicting intents on "k", released latches debug-lock-table ---- @@ -329,18 +357,18 @@ local: num=0 sequence req=req2 ---- -[3] sequence req2: re-sequencing request -[3] sequence req2: acquiring latches -[3] sequence req2: scanning lock table for conflicting locks -[3] sequence req2: waiting in lock wait-queues -[3] sequence req2: blocked on select in concurrency.(*lockTableWaiterImpl).WaitOn +[4] sequence req2: re-sequencing request +[4] sequence req2: acquiring latches +[4] sequence req2: scanning lock table for conflicting locks +[4] sequence req2: waiting in lock wait-queues +[4] sequence req2: blocked on select in concurrency.(*lockTableWaiterImpl).WaitOn on-lock-updated txn=txn1 key=k status=committed ---- [-] update lock: committing txn1 @ k -[3] sequence req2: acquiring latches -[3] sequence req2: scanning lock table for conflicting locks -[3] sequence req2: sequencing complete, returned guard +[4] sequence req2: acquiring latches +[4] sequence req2: scanning lock table for conflicting locks +[4] sequence req2: sequencing complete, returned guard debug-lock-table ---- @@ -373,12 +401,14 @@ subtest end # OnRangeMerge - a Range merge clears the lock-table and # disables it. # -# Setup: txn1 acquires lock +# Setup: txn1 acquires lock on k and k2 # # Test: txn2 enters lock's wait-queue # range is merged # txn2 proceeds -# txn2 discovers txn1's lock (ignored) +# txn2 discovers txn1's lock on k while writing (not ignored, waits) +# txn2 re-sequences +# txn2 discovers txn1's lock on k2 while reading (not ignored, waits) # txn2 re-sequences # txn1 lock is released (ignored) # txn2 proceeds and acquires lock (ignored) @@ -393,11 +423,13 @@ new-txn name=txn2 ts=10,1 epoch=0 ---- new-request name=req1 txn=txn1 ts=10,1 - put key=k value=v + put key=k value=v + put key=k2 value=v ---- new-request name=req2 txn=txn2 ts=10,1 - put key=k value=v + put key=k value=v + get key=k2 ---- sequence req=req1 @@ -411,15 +443,21 @@ on-lock-acquired txn=txn1 key=k ---- [-] acquire lock: txn1 @ k +on-lock-acquired txn=txn1 key=k2 +---- +[-] acquire lock: txn1 @ k2 + finish req=req1 ---- [-] finish req1: finishing request debug-lock-table ---- -global: num=1 +global: num=2 lock: "k" holder: txn: 00000001-0000-0000-0000-000000000000, ts: 0.000000010,1, info: unrepl epoch: 0, seqs: [0] + lock: "k2" + holder: txn: 00000001-0000-0000-0000-000000000000, ts: 0.000000010,1, info: unrepl epoch: 0, seqs: [0] local: num=0 # -------------------------------- @@ -436,12 +474,14 @@ sequence req=req2 debug-lock-table ---- -global: num=1 +global: num=2 lock: "k" holder: txn: 00000001-0000-0000-0000-000000000000, ts: 0.000000010,1, info: unrepl epoch: 0, seqs: [0] queued writers: active: true req: 7, txn: 00000002-0000-0000-0000-000000000000 distinguished req: 7 + lock: "k2" + holder: txn: 00000001-0000-0000-0000-000000000000, ts: 0.000000010,1, info: unrepl epoch: 0, seqs: [0] local: num=0 on-merge @@ -458,7 +498,14 @@ local: num=0 handle-write-intent-error req=req2 txn=txn1 key=k ---- -[-] handle write intent error req2: handling conflicting intents on "k" +[3] handle write intent error req2: pushing txn 00000001 to abort +[3] handle write intent error req2: blocked on select in concurrency_test.(*cluster).PushTransaction + +on-txn-updated txn=txn1 
status=committed +---- +[-] update txn: committing txn1 +[3] handle write intent error req2: resolving intent "k" for txn 00000001 with COMMITTED status +[3] handle write intent error req2: handled conflicting intents on "k", released latches debug-lock-table ---- @@ -467,20 +514,29 @@ local: num=0 sequence req=req2 ---- -[3] sequence req2: re-sequencing request -[3] sequence req2: acquiring latches -[3] sequence req2: scanning lock table for conflicting locks -[3] sequence req2: sequencing complete, returned guard +[4] sequence req2: re-sequencing request +[4] sequence req2: acquiring latches +[4] sequence req2: scanning lock table for conflicting locks +[4] sequence req2: sequencing complete, returned guard -on-lock-updated txn=txn1 key=k status=committed +handle-write-intent-error req=req2 txn=txn1 key=k2 ---- -[-] update lock: committing txn1 @ k +[5] handle write intent error req2: pushing timestamp of txn 00000001 above 0.000000010,1 +[5] handle write intent error req2: resolving intent "k2" for txn 00000001 with COMMITTED status +[5] handle write intent error req2: handled conflicting intents on "k2", released latches debug-lock-table ---- global: num=0 local: num=0 +sequence req=req2 +---- +[6] sequence req2: re-sequencing request +[6] sequence req2: acquiring latches +[6] sequence req2: scanning lock table for conflicting locks +[6] sequence req2: sequencing complete, returned guard + on-lock-acquired txn=txn2 key=k ---- [-] acquire lock: txn2 @ k @@ -588,7 +644,7 @@ local: num=0 handle-write-intent-error req=req2 txn=txn1 key=k ---- -[-] handle write intent error req2: handling conflicting intents on "k" +[3] handle write intent error req2: handled conflicting intents on "k", released latches debug-lock-table ---- @@ -601,18 +657,18 @@ local: num=0 sequence req=req2 ---- -[3] sequence req2: re-sequencing request -[3] sequence req2: acquiring latches -[3] sequence req2: scanning lock table for conflicting locks -[3] sequence req2: waiting in lock wait-queues -[3] sequence req2: blocked on select in concurrency.(*lockTableWaiterImpl).WaitOn +[4] sequence req2: re-sequencing request +[4] sequence req2: acquiring latches +[4] sequence req2: scanning lock table for conflicting locks +[4] sequence req2: waiting in lock wait-queues +[4] sequence req2: blocked on select in concurrency.(*lockTableWaiterImpl).WaitOn on-lock-updated txn=txn1 key=k status=committed ---- [-] update lock: committing txn1 @ k -[3] sequence req2: acquiring latches -[3] sequence req2: scanning lock table for conflicting locks -[3] sequence req2: sequencing complete, returned guard +[4] sequence req2: acquiring latches +[4] sequence req2: scanning lock table for conflicting locks +[4] sequence req2: sequencing complete, returned guard debug-lock-table ---- diff --git a/pkg/kv/kvserver/concurrency/testdata/concurrency_manager/uncertainty b/pkg/kv/kvserver/concurrency/testdata/concurrency_manager/uncertainty index d79abc2b7da2..035cf1e73aee 100644 --- a/pkg/kv/kvserver/concurrency/testdata/concurrency_manager/uncertainty +++ b/pkg/kv/kvserver/concurrency/testdata/concurrency_manager/uncertainty @@ -24,7 +24,7 @@ sequence req=req1 handle-write-intent-error req=req1 txn=txn1 key=k ---- -[-] handle write intent error req1: handling conflicting intents on "k" +[2] handle write intent error req1: handled conflicting intents on "k", released latches debug-lock-table ---- @@ -35,20 +35,20 @@ local: num=0 sequence req=req1 ---- -[2] sequence req1: re-sequencing request -[2] sequence req1: acquiring latches -[2] sequence 
req1: scanning lock table for conflicting locks -[2] sequence req1: waiting in lock wait-queues -[2] sequence req1: pushing timestamp of txn 00000001 above 0.000000015,1 -[2] sequence req1: blocked on select in concurrency_test.(*cluster).PushTransaction +[3] sequence req1: re-sequencing request +[3] sequence req1: acquiring latches +[3] sequence req1: scanning lock table for conflicting locks +[3] sequence req1: waiting in lock wait-queues +[3] sequence req1: pushing timestamp of txn 00000001 above 0.000000015,1 +[3] sequence req1: blocked on select in concurrency_test.(*cluster).PushTransaction on-txn-updated txn=txn1 status=pending ts=15,2 ---- [-] update txn: increasing timestamp of txn1 -[2] sequence req1: resolving intent "k" for txn 00000001 with PENDING status -[2] sequence req1: acquiring latches -[2] sequence req1: scanning lock table for conflicting locks -[2] sequence req1: sequencing complete, returned guard +[3] sequence req1: resolving intent "k" for txn 00000001 with PENDING status +[3] sequence req1: acquiring latches +[3] sequence req1: scanning lock table for conflicting locks +[3] sequence req1: sequencing complete, returned guard finish req=req1 ---- @@ -84,7 +84,7 @@ sequence req=req1 handle-write-intent-error req=req1 txn=txn1 key=k ---- -[-] handle write intent error req1: handling conflicting intents on "k" +[2] handle write intent error req1: handled conflicting intents on "k", released latches debug-lock-table ---- @@ -95,20 +95,20 @@ local: num=0 sequence req=req1 ---- -[2] sequence req1: re-sequencing request -[2] sequence req1: acquiring latches -[2] sequence req1: scanning lock table for conflicting locks -[2] sequence req1: waiting in lock wait-queues -[2] sequence req1: pushing timestamp of txn 00000001 above 0.000000015,1 -[2] sequence req1: blocked on select in concurrency_test.(*cluster).PushTransaction +[3] sequence req1: re-sequencing request +[3] sequence req1: acquiring latches +[3] sequence req1: scanning lock table for conflicting locks +[3] sequence req1: waiting in lock wait-queues +[3] sequence req1: pushing timestamp of txn 00000001 above 0.000000015,1 +[3] sequence req1: blocked on select in concurrency_test.(*cluster).PushTransaction on-txn-updated txn=txn1 status=pending ts=15,2 ---- [-] update txn: increasing timestamp of txn1 -[2] sequence req1: resolving intent "k" for txn 00000001 with PENDING status -[2] sequence req1: acquiring latches -[2] sequence req1: scanning lock table for conflicting locks -[2] sequence req1: sequencing complete, returned guard +[3] sequence req1: resolving intent "k" for txn 00000001 with PENDING status +[3] sequence req1: acquiring latches +[3] sequence req1: scanning lock table for conflicting locks +[3] sequence req1: sequencing complete, returned guard finish req=req1 ---- diff --git a/pkg/kv/kvserver/replica_send.go b/pkg/kv/kvserver/replica_send.go index f4ad0ad794a0..d59a6cf92a2e 100644 --- a/pkg/kv/kvserver/replica_send.go +++ b/pkg/kv/kvserver/replica_send.go @@ -290,7 +290,7 @@ func (r *Replica) handleWriteIntentError( return g, pErr } // g's latches will be dropped, but it retains its spot in lock wait-queues. - return r.concMgr.HandleWriterIntentError(ctx, g, t), nil + return r.concMgr.HandleWriterIntentError(ctx, g, t) } func (r *Replica) handleTransactionPushError(