From f107bcea61e6692a1678f6f9eaac72161ff63955 Mon Sep 17 00:00:00 2001
From: MyonKeminta <9948422+MyonKeminta@users.noreply.github.com>
Date: Tue, 4 Jul 2023 15:17:05 +0800
Subject: [PATCH] Fix the issue that primary pessimistic lock may be left not
 cleared after GC (#866)

* Fix the issue that primary pessimistic lock may be left not cleared after GC

Signed-off-by: MyonKeminta

* Fix a mysteriously-appearing change that broke compilation

Signed-off-by: MyonKeminta

* Fix test effectiveness (txn2 was not set to be a pessimistic txn); add stricter checks

Signed-off-by: MyonKeminta

* Address comments

Signed-off-by: MyonKeminta

---------

Signed-off-by: MyonKeminta

Co-authored-by: MyonKeminta
---
 integration_tests/lock_test.go | 61 +++++++++++++++++++++++++---------
 tikv/test_probe.go             | 37 +++++++++++++++++++++
 txnkv/transaction/txn.go       | 19 ++++++++++-
 txnkv/txnlock/lock_resolver.go | 17 ++++++----
 4 files changed, 111 insertions(+), 23 deletions(-)

diff --git a/integration_tests/lock_test.go b/integration_tests/lock_test.go
index bafe21648..89bead1c9 100644
--- a/integration_tests/lock_test.go
+++ b/integration_tests/lock_test.go
@@ -1472,52 +1472,81 @@ func (s *testLockWithTiKVSuite) TestBatchResolveLocks() {
 	s.NoError(failpoint.Enable("tikvclient/beforeAsyncPessimisticRollback", `return("skip")`))
 	s.NoError(failpoint.Enable("tikvclient/beforeCommitSecondaries", `return("skip")`))
-	s.NoError(failpoint.Enable("tikvclient/twoPCRequestBatchSizeLimit", `return("skip")`))
+	s.NoError(failpoint.Enable("tikvclient/twoPCRequestBatchSizeLimit", `return`))
+	s.NoError(failpoint.Enable("tikvclient/onRollback", `return("skipRollbackPessimisticLock")`))
 	defer func() {
 		s.NoError(failpoint.Disable("tikvclient/beforeAsyncPessimisticRollback"))
 		s.NoError(failpoint.Disable("tikvclient/beforeCommitSecondaries"))
 		s.NoError(failpoint.Disable("tikvclient/twoPCRequestBatchSizeLimit"))
+		s.NoError(failpoint.Disable("tikvclient/onRollback"))
 	}()
 
-	k1, k2, k3 := []byte("k1"), []byte("k2"), []byte("k3")
+	k1, k2, k3, k4 := []byte("k1"), []byte("k2"), []byte("k3"), []byte("k4")
 	v2, v3 := []byte("v2"), []byte("v3")
 
 	ctx := context.WithValue(context.Background(), util.SessionID, uint64(1))
 
-	txn, err := s.store.Begin()
+	txn1, err := s.store.Begin()
 	s.NoError(err)
-	txn.SetPessimistic(true)
+	txn1.SetPessimistic(true)
 
 	{
 		// Produce write conflict on key k2
-		txn2, err := s.store.Begin()
+		helperTxn, err := s.store.Begin()
 		s.NoError(err)
-		s.NoError(txn2.Set(k2, []byte("v0")))
-		s.NoError(txn2.Commit(ctx))
+		s.NoError(helperTxn.Set(k2, []byte("v0")))
+		s.NoError(helperTxn.Commit(ctx))
 	}
 
-	lockCtx := kv.NewLockCtx(txn.StartTS(), 200, time.Now())
-	err = txn.LockKeys(ctx, lockCtx, k1, k2)
+	lockCtx := kv.NewLockCtx(txn1.StartTS(), 200, time.Now())
+	err = txn1.LockKeys(ctx, lockCtx, k1, k2)
 	s.IsType(&tikverr.ErrWriteConflict{}, errors.Cause(err))
 
-	// k1 has txn's stale pessimistic lock now.
+	// k1 has txn1's stale pessimistic lock now.
 
 	forUpdateTS, err := s.store.CurrentTimestamp(oracle.GlobalTxnScope)
 	s.NoError(err)
 	lockCtx = kv.NewLockCtx(forUpdateTS, 200, time.Now())
-	s.NoError(txn.LockKeys(ctx, lockCtx, k2, k3))
+	s.NoError(txn1.LockKeys(ctx, lockCtx, k2, k3))
 
-	s.NoError(txn.Set(k2, v2))
-	s.NoError(txn.Set(k3, v3))
-	s.NoError(txn.Commit(ctx))
+	s.NoError(txn1.Set(k2, v2))
+	s.NoError(txn1.Set(k3, v3))
+	s.NoError(txn1.Commit(ctx))
 
-	// k3 has txn's stale prewrite lock now.
+	// k3 has txn1's stale prewrite lock now.
 
-	// Perform ScanLock - BatchResolveLock.
+	txn2, err := s.store.Begin()
+	s.NoError(err)
+	txn2.SetPessimistic(true)
+	lockCtx = kv.NewLockCtx(txn1.StartTS(), 200, time.Now())
+	err = txn2.LockKeys(ctx, lockCtx, k4)
+	s.NoError(err)
+	s.NoError(txn2.Rollback())
+
+	// k4 has txn2's stale primary pessimistic lock now.
+
 	currentTS, err := s.store.CurrentTimestamp(oracle.GlobalTxnScope)
+
+	remainingLocks, err := s.store.ScanLocks(ctx, []byte("k"), []byte("l"), currentTS)
+	s.NoError(err)
+
+	s.Len(remainingLocks, 3)
+	s.Equal(remainingLocks[0].Key, k1)
+	s.Equal(remainingLocks[0].LockType, kvrpcpb.Op_PessimisticLock)
+	s.Equal(remainingLocks[1].Key, k3)
+	s.Equal(remainingLocks[1].LockType, kvrpcpb.Op_Put)
+	s.Equal(remainingLocks[2].Key, k4)
+	s.Equal(remainingLocks[2].LockType, kvrpcpb.Op_PessimisticLock)
+	s.Equal(remainingLocks[2].Primary, k4)
+
+	// Perform ScanLock - BatchResolveLock.
 	s.NoError(err)
 	s.NoError(s.store.GCResolveLockPhase(ctx, currentTS, 1))
 
+	// Do ScanLock again to make sure no locks are left.
+	remainingLocks, err = s.store.ScanLocks(ctx, []byte("k"), []byte("l"), currentTS)
+	s.NoError(err)
+	s.Empty(remainingLocks)
+
 	// Check data consistency
 	readTS, err := s.store.CurrentTimestamp(oracle.GlobalTxnScope)
 	snapshot := s.store.GetSnapshot(readTS)
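The test above now leaves three stale locks behind before the GC phase runs: a pessimistic lock on k1 (its asynchronous rollback is skipped via `beforeAsyncPessimisticRollback`), a prewrite lock on k3 (its secondary commit is skipped via `beforeCommitSecondaries`), and, new in this patch, a *primary* pessimistic lock on k4 (its rollback is skipped via the new `onRollback` failpoint). The assertions use the `ScanLocks` probe introduced in the next file. As a rough usage sketch of that probe (the `dumpLocks` helper is hypothetical; `ctx`, `store`, and `currentTS` are assumed to be set up as in the suite):

```go
import (
	"context"
	"fmt"

	"github.com/tikv/client-go/v2/tikv"
)

// dumpLocks prints every lock in ["k", "l") visible at currentTS, i.e.
// what the GC resolver would see before resolving.
func dumpLocks(ctx context.Context, store tikv.StoreProbe, currentTS uint64) error {
	locks, err := store.ScanLocks(ctx, []byte("k"), []byte("l"), currentTS)
	if err != nil {
		return err
	}
	for _, l := range locks {
		// LockType distinguishes pessimistic locks (kvrpcpb.Op_PessimisticLock)
		// from prewrite locks (e.g. kvrpcpb.Op_Put); Primary is the key that
		// holds the transaction's primary lock.
		fmt.Printf("key=%q primary=%q type=%v\n", l.Key, l.Primary, l.LockType)
	}
	return nil
}
```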
diff --git a/tikv/test_probe.go b/tikv/test_probe.go
index d470d6919..5971480f3 100644
--- a/tikv/test_probe.go
+++ b/tikv/test_probe.go
@@ -35,6 +35,7 @@
 package tikv
 
 import (
+	"bytes"
 	"context"
 
 	"github.com/pingcap/kvproto/pkg/metapb"
@@ -114,6 +115,42 @@ func (s StoreProbe) GCResolveLockPhase(ctx context.Context, safepoint uint64, co
 	return s.resolveLocks(ctx, safepoint, concurrency)
 }
 
+func (s StoreProbe) ScanLocks(ctx context.Context, startKey, endKey []byte, maxVersion uint64) ([]*txnlock.Lock, error) {
+	bo := NewGcResolveLockMaxBackoffer(ctx)
+	const limit = 1024
+
+	var result []*txnlock.Lock
+
+outerLoop:
+	for {
+		locks, loc, err := s.KVStore.scanLocksInRegionWithStartKey(bo, startKey, maxVersion, limit)
+		if err != nil {
+			return nil, err
+		}
+		for _, l := range locks {
+			if bytes.Compare(endKey, l.Key) <= 0 {
+				// Finished scanning the given range.
+				break outerLoop
+			}
+			result = append(result, l)
+		}
+
+		if len(locks) < limit {
+			if len(loc.EndKey) == 0 {
+				// Scanned to the very end.
+				break outerLoop
+			}
+			// The current region is completely scanned.
+			startKey = loc.EndKey
+		} else {
+			// The current region may still have more locks.
+			startKey = append(locks[len(locks)-1].Key, 0)
+		}
+	}
+
+	return result, nil
+}
+
 // LockResolverProbe wraps a LockResolver and exposes internal stats for testing purpose.
 type LockResolverProbe struct {
 	*txnlock.LockResolverProbe
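One detail of `ScanLocks` worth noting: when a region returns a full page of `limit` locks, the scan cannot simply jump to the region's end key, because more locks may remain in that same region; it has to resume from immediately after the last lock it saw. In bytewise key order, the immediate successor of a key is that key with a zero byte appended, which is what `append(locks[len(locks)-1].Key, 0)` computes. A self-contained sketch of that property (the `nextKey` helper is illustrative, not part of the patch):

```go
package main

import (
	"bytes"
	"fmt"
)

// nextKey returns the smallest key that sorts strictly after k in
// bytewise order: k with a 0x00 byte appended. It copies k first so the
// caller's slice is not mutated through a shared backing array.
func nextKey(k []byte) []byte {
	return append(append(make([]byte, 0, len(k)+1), k...), 0)
}

func main() {
	last := []byte("k4")
	next := nextKey(last)
	fmt.Println(bytes.Compare(next, last) > 0)          // true: next sorts after last
	fmt.Println(bytes.Compare(next, []byte("k40")) < 0) // true: 0x00 sorts before '0'
}
```

The probe itself appends to the lock's `Key` slice in place, which is acceptable in test-only code since that key is not reused afterwards; the defensive copy above is the safer pattern for general use.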
diff --git a/txnkv/transaction/txn.go b/txnkv/transaction/txn.go
index e5067ce8a..f9873a33f 100644
--- a/txnkv/transaction/txn.go
+++ b/txnkv/transaction/txn.go
@@ -556,10 +556,27 @@ func (txn *KVTxn) Rollback() error {
 		txn.CancelAggressiveLocking(context.Background())
 	}
 
+	// `skipPessimisticRollback` may be true only when set by a failpoint in tests.
+	skipPessimisticRollback := false
+	if val, err := util.EvalFailpoint("onRollback"); err == nil {
+		if s, ok := val.(string); ok {
+			if s == "skipRollbackPessimisticLock" {
+				logutil.BgLogger().Info("[failpoint] injected skip pessimistic rollback on explicit rollback",
+					zap.Uint64("txnStartTS", txn.startTS))
+				skipPessimisticRollback = true
+			} else {
+				panic(fmt.Sprintf("unknown instruction %s for failpoint \"onRollback\"", s))
+			}
+		}
+	}
+
 	start := time.Now()
 	// Clean up pessimistic lock.
 	if txn.IsPessimistic() && txn.committer != nil {
-		err := txn.rollbackPessimisticLocks()
+		var err error
+		if !skipPessimisticRollback {
+			err = txn.rollbackPessimisticLocks()
+		}
 		txn.committer.ttlManager.close()
 		if err != nil {
 			logutil.BgLogger().Error(err.Error())
diff --git a/txnkv/txnlock/lock_resolver.go b/txnkv/txnlock/lock_resolver.go
index 74caae652..1aa359eb5 100644
--- a/txnkv/txnlock/lock_resolver.go
+++ b/txnkv/txnlock/lock_resolver.go
@@ -245,6 +245,12 @@ func (lr *LockResolver) BatchResolveLocks(bo *retry.Backoffer, locks []*Lock, lo
 		}
 		metrics.LockResolverCountWithExpired.Inc()
 
+		// Using currentTS = math.MaxUint64 means the txn will be rolled back, no matter whether its lock is expired!
+		status, err := lr.getTxnStatus(bo, l.TxnID, l.Primary, 0, math.MaxUint64, true, false, l)
+		if err != nil {
+			return false, err
+		}
+
 		if l.LockType == kvrpcpb.Op_PessimisticLock {
 			// BatchResolveLocks forces resolving the locks, ignoring whether they are expired.
 			// For pessimistic locks, committing them makes no sense, but it won't affect transaction
@@ -252,6 +258,9 @@ func (lr *LockResolver) BatchResolveLocks(bo *retry.Backoffer, locks []*Lock, lo
 			// Pessimistic locks need special handling logic because their primary may not point
 			// to the real primary of that transaction, and their state cannot be put in `txnInfos`.
 			// (see: https://github.com/pingcap/tidb/issues/42937).
+			//
+			// `resolvePessimisticLock` should be called after calling `getTxnStatus`.
+			// See: https://github.com/pingcap/tidb/issues/45134
 			err := lr.resolvePessimisticLock(bo, l)
 			if err != nil {
 				return false, err
@@ -259,12 +268,6 @@ func (lr *LockResolver) BatchResolveLocks(bo *retry.Backoffer, locks []*Lock, lo
 			continue
 		}
 
-		// Use currentTS = math.MaxUint64 means rollback the txn, no matter the lock is expired or not!
-		status, err := lr.getTxnStatus(bo, l.TxnID, l.Primary, 0, math.MaxUint64, true, false, l)
-		if err != nil {
-			return false, err
-		}
-
 		// If the transaction uses async commit, CheckTxnStatus will reject rolling back the primary lock.
 		// Then we need to check the secondary locks to determine the final status of the transaction.
 		if status.primaryLock != nil && status.primaryLock.UseAsyncCommit {
@@ -1172,6 +1175,8 @@ func (lr *LockResolver) resolveLock(bo *retry.Backoffer, l *Lock, status TxnStat
 		}
 	}
 
+// resolvePessimisticLock handles a pessimistic lock after the transaction's status has been checked.
+// Note that this function assumes `CheckTxnStatus` has been done (or `getTxnStatusFromLock` has been called) on the lock.
 func (lr *LockResolver) resolvePessimisticLock(bo *retry.Backoffer, l *Lock) error {
 	metrics.LockResolverCountWithResolveLocks.Inc()
 	// The lock has been resolved by getTxnStatusFromLock.
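The lock_resolver.go change is the heart of the fix: `getTxnStatus` is hoisted above the pessimistic-lock branch, so the transaction's status is always checked through its primary (forcing a rollback record if needed) before any pessimistic lock is removed. A simplified sketch of the per-lock flow after this patch, written as a hypothetical `batchResolveSketch` helper inside package txnlock (metrics, logging, and the async-commit secondary check are omitted; this is an illustration, not the verbatim implementation):

```go
// batchResolveSketch mirrors the per-lock decision flow of
// BatchResolveLocks after this patch. Illustration only.
func (lr *LockResolver) batchResolveSketch(bo *retry.Backoffer, locks []*Lock) error {
	txnInfos := make(map[uint64]uint64)
	for _, l := range locks {
		if _, ok := txnInfos[l.TxnID]; ok {
			continue // this transaction's fate is already decided
		}

		// Step 1 (hoisted by this patch): determine the transaction's real
		// status via its primary. currentTS = math.MaxUint64 forces a
		// rollback if the transaction is not committed.
		status, err := lr.getTxnStatus(bo, l.TxnID, l.Primary, 0, math.MaxUint64, true, false, l)
		if err != nil {
			return err
		}

		// Step 2: only after the status check, physically remove pessimistic
		// locks. Doing this first could leave a primary pessimistic lock
		// uncleared after GC (tidb#45134), because l.Primary may not point
		// at the transaction's real primary (tidb#42937).
		if l.LockType == kvrpcpb.Op_PessimisticLock {
			if err := lr.resolvePessimisticLock(bo, l); err != nil {
				return err
			}
			continue
		}

		// Step 3: ordinary prewrite locks are resolved in batch afterwards,
		// using the commit/rollback decision recorded here.
		txnInfos[l.TxnID] = status.commitTS
	}
	// txnInfos is then applied region by region in batched ResolveLock requests.
	return nil
}
```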