From ef2f73416bc584264c9a663f9b167f08dab68101 Mon Sep 17 00:00:00 2001
From: Arul Ajmani
Date: Wed, 22 Feb 2023 10:26:20 -0500
Subject: [PATCH] storage: consult intent history for all read-your-own-write
 transactions

Read-only requests that qualify for dropping latches early scan the
lock table under latches to perform conflict resolution. If they don't
conflict with any concurrent transactions, they drop their latches and
scan the MVCC keyspace. However, if a request is trying to read a key
that was written to in its transaction, we may need access to the key's
intent history when performing the scan. If scanning the lock table
uncovers an intent by our own transaction, we need access to the intent
history if:

1. The request is reading at a lower sequence number than the found
intent's sequence number.
2. OR the request is reading at a strictly lower timestamp than the
intent's timestamp.
3. OR the found intent should be ignored because it was written as part
of a savepoint which was subsequently rolled back.
4. OR the found intent and read request belong to different txn epochs.

Prior to this patch, we weren't accounting for 3 and 4. Worse yet, our
check for 2 used a less-than-or-equal-to comparison instead of a
strictly-less-than one, which meant we would end up consulting the
intent history in the common case. As a result, existing tests that
would have exercised 3 and 4 weren't failing on us.

Instead of correcting the determination, this patch makes it such that
all read-your-own-write transactions consult intent history. This
avoids the subtle complexity of getting all these cases right and
keeping them in sync with the MVCC code that reads keys. It also seems
that scanning the lock table twice (once during conflict resolution and
once during the MVCC read) for read-your-own-write transactions in the
common case did not cause noticeable regressions. We can always
optimize this later, if we find there's a need to.

Epic: none

Fixes: #94337

Release note: None
---
 .../tests/3node-tenant/generated_test.go      |  7 +++
 pkg/kv/kvserver/replica_read.go               |  4 +-
 .../logictest/testdata/logic_test/savepoints  | 35 +++++++++++++
 .../tests/fakedist-disk/generated_test.go     |  7 +++
 .../tests/fakedist-vec-off/generated_test.go  |  7 +++
 .../tests/fakedist/generated_test.go          |  7 +++
 .../generated_test.go                         |  7 +++
 .../tests/local-vec-off/generated_test.go     |  7 +++
 .../logictest/tests/local/generated_test.go   |  7 +++
 pkg/storage/engine.go                         | 51 ++++++++++++-------
 pkg/storage/engine_test.go                    | 26 ++++++----
 11 files changed, 136 insertions(+), 29 deletions(-)
 create mode 100644 pkg/sql/logictest/testdata/logic_test/savepoints

diff --git a/pkg/ccl/logictestccl/tests/3node-tenant/generated_test.go b/pkg/ccl/logictestccl/tests/3node-tenant/generated_test.go
index 0af11926fbaf..5ef453cac7ba 100644
--- a/pkg/ccl/logictestccl/tests/3node-tenant/generated_test.go
+++ b/pkg/ccl/logictestccl/tests/3node-tenant/generated_test.go
@@ -1501,6 +1501,13 @@ func TestTenantLogic_save_table(
 	runLogicTest(t, "save_table")
 }
 
+func TestTenantLogic_savepoints(
+	t *testing.T,
+) {
+	defer leaktest.AfterTest(t)()
+	runLogicTest(t, "savepoints")
+}
+
 func TestTenantLogic_scale(
 	t *testing.T,
 ) {
diff --git a/pkg/kv/kvserver/replica_read.go b/pkg/kv/kvserver/replica_read.go
index ed39fc05d42a..04b86024d0b4 100644
--- a/pkg/kv/kvserver/replica_read.go
+++ b/pkg/kv/kvserver/replica_read.go
@@ -303,13 +303,13 @@ func (r *Replica) canDropLatchesBeforeEval(
 	// or if any of them need to use an intent interleaving iterator.
 	for _, req := range ba.Requests {
 		reqHeader := req.GetInner().Header()
-		start, end, seq := reqHeader.Key, reqHeader.EndKey, reqHeader.Sequence
+		start, end := reqHeader.Key, reqHeader.EndKey
 		var txnID uuid.UUID
 		if ba.Txn != nil {
 			txnID = ba.Txn.ID
 		}
 		needsIntentInterleavingForThisRequest, err := storage.ScanConflictingIntentsForDroppingLatchesEarly(
-			ctx, rw, txnID, ba.Header.Timestamp, start, end, seq, &intents, maxIntents,
+			ctx, rw, txnID, ba.Header.Timestamp, start, end, &intents, maxIntents,
 		)
 		if err != nil {
 			return false /* ok */, true /* stillNeedsIntentInterleaving */, kvpb.NewError(
diff --git a/pkg/sql/logictest/testdata/logic_test/savepoints b/pkg/sql/logictest/testdata/logic_test/savepoints
new file mode 100644
index 000000000000..a9786e9ddcbd
--- /dev/null
+++ b/pkg/sql/logictest/testdata/logic_test/savepoints
@@ -0,0 +1,35 @@
+# Regression test for https://github.com/cockroachdb/cockroach/issues/94337.
+# The linked issue only manifests itself if the read request is able to drop
+# latches early. Before this bug was fixed, one of the conditions required for
+# a read request to drop its latches early was that its read timestamp had to
+# be higher than the timestamp at which it had written all the intents the
+# read overlapped with. The sleeps here bump the transaction's read timestamp,
+# by virtue of the closed timestamp interval.
+
+statement ok
+CREATE TABLE a (id INT);
+
+statement ok
+BEGIN
+
+statement ok
+SELECT pg_sleep(1.5)
+
+statement ok
+SAVEPOINT s
+
+statement ok
+SELECT pg_sleep(1.5)
+
+statement ok
+INSERT INTO a(id) VALUES (0);
+
+statement ok
+ROLLBACK TO SAVEPOINT s
+
+query I
+SELECT * FROM a
+----
+
+statement ok
+COMMIT
diff --git a/pkg/sql/logictest/tests/fakedist-disk/generated_test.go b/pkg/sql/logictest/tests/fakedist-disk/generated_test.go
index 5b12a170ac8e..60e9c080d7f9 100644
--- a/pkg/sql/logictest/tests/fakedist-disk/generated_test.go
+++ b/pkg/sql/logictest/tests/fakedist-disk/generated_test.go
@@ -1472,6 +1472,13 @@ func TestLogic_save_table(
 	runLogicTest(t, "save_table")
 }
 
+func TestLogic_savepoints(
+	t *testing.T,
+) {
+	defer leaktest.AfterTest(t)()
+	runLogicTest(t, "savepoints")
+}
+
 func TestLogic_scale(
 	t *testing.T,
 ) {
diff --git a/pkg/sql/logictest/tests/fakedist-vec-off/generated_test.go b/pkg/sql/logictest/tests/fakedist-vec-off/generated_test.go
index c69d7a82a9de..496b46e5502c 100644
--- a/pkg/sql/logictest/tests/fakedist-vec-off/generated_test.go
+++ b/pkg/sql/logictest/tests/fakedist-vec-off/generated_test.go
@@ -1472,6 +1472,13 @@ func TestLogic_save_table(
 	runLogicTest(t, "save_table")
 }
 
+func TestLogic_savepoints(
+	t *testing.T,
+) {
+	defer leaktest.AfterTest(t)()
+	runLogicTest(t, "savepoints")
+}
+
 func TestLogic_scale(
 	t *testing.T,
 ) {
diff --git a/pkg/sql/logictest/tests/fakedist/generated_test.go b/pkg/sql/logictest/tests/fakedist/generated_test.go
index 7e7306cd4092..370cb806df7a 100644
--- a/pkg/sql/logictest/tests/fakedist/generated_test.go
+++ b/pkg/sql/logictest/tests/fakedist/generated_test.go
@@ -1486,6 +1486,13 @@ func TestLogic_save_table(
 	runLogicTest(t, "save_table")
 }
 
+func TestLogic_savepoints(
+	t *testing.T,
+) {
+	defer leaktest.AfterTest(t)()
+	runLogicTest(t, "savepoints")
+}
+
 func TestLogic_scale(
 	t *testing.T,
 ) {
diff --git a/pkg/sql/logictest/tests/local-legacy-schema-changer/generated_test.go b/pkg/sql/logictest/tests/local-legacy-schema-changer/generated_test.go
index 4db9511ec006..ecbf90f63086 100644
--- a/pkg/sql/logictest/tests/local-legacy-schema-changer/generated_test.go
+++ b/pkg/sql/logictest/tests/local-legacy-schema-changer/generated_test.go
@@ -1451,6 +1451,13 @@ func TestLogic_save_table(
 	runLogicTest(t, "save_table")
 }
 
+func TestLogic_savepoints(
+	t *testing.T,
+) {
+	defer leaktest.AfterTest(t)()
+	runLogicTest(t, "savepoints")
+}
+
 func TestLogic_scale(
 	t *testing.T,
 ) {
diff --git a/pkg/sql/logictest/tests/local-vec-off/generated_test.go b/pkg/sql/logictest/tests/local-vec-off/generated_test.go
index 8026e88eda6a..64df77d1252d 100644
--- a/pkg/sql/logictest/tests/local-vec-off/generated_test.go
+++ b/pkg/sql/logictest/tests/local-vec-off/generated_test.go
@@ -1479,6 +1479,13 @@ func TestLogic_save_table(
 	runLogicTest(t, "save_table")
 }
 
+func TestLogic_savepoints(
+	t *testing.T,
+) {
+	defer leaktest.AfterTest(t)()
+	runLogicTest(t, "savepoints")
+}
+
 func TestLogic_scale(
 	t *testing.T,
 ) {
diff --git a/pkg/sql/logictest/tests/local/generated_test.go b/pkg/sql/logictest/tests/local/generated_test.go
index 1c7629e5f9a1..f83880ccbb4c 100644
--- a/pkg/sql/logictest/tests/local/generated_test.go
+++ b/pkg/sql/logictest/tests/local/generated_test.go
@@ -1619,6 +1619,13 @@ func TestLogic_save_table(
 	runLogicTest(t, "save_table")
 }
 
+func TestLogic_savepoints(
+	t *testing.T,
+) {
+	defer leaktest.AfterTest(t)()
+	runLogicTest(t, "savepoints")
+}
+
 func TestLogic_scale(
 	t *testing.T,
 ) {
diff --git a/pkg/storage/engine.go b/pkg/storage/engine.go
index c05b77682735..a4faca7fe423 100644
--- a/pkg/storage/engine.go
+++ b/pkg/storage/engine.go
@@ -1731,14 +1731,10 @@ func assertMVCCIteratorInvariants(iter MVCCIterator) error {
 // proceed until the intents are resolved. Intents that don't conflict with the
 // transaction referenced by txnID[1] at the supplied `ts` are ignored.
 //
-// The caller must supply the sequence number of the request on behalf of which
-// the intents are being scanned. This is used to determine if the caller needs
-// to consult intent history when performing a scan over the MVCC keyspace
-// (indicated by the `needIntentHistory` return parameter). Intent history is
-// required to read the correct provisional value when scanning if we encounter
-// an intent written by the `txn` at a higher sequence number than the one
-// supplied or at a higher timestamp than the `ts` supplied (regardless of the
-// sequence number of the intent).
+// The `needIntentHistory` return value indicates whether the caller needs to
+// consult intent history when performing a scan over the MVCC keyspace to
+// read correct provisional values for at least one of the keys being scanned.
+// Typically, this applies to all transactions that read their own writes.
 //
 // [1] The supplied txnID may be empty (uuid.Nil) if the request on behalf of
 // which the scan is being performed is non-transactional.
@@ -1748,7 +1744,6 @@ func ScanConflictingIntentsForDroppingLatchesEarly(
 	txnID uuid.UUID,
 	ts hlc.Timestamp,
 	start, end roachpb.Key,
-	seq enginepb.TxnSeq,
 	intents *[]roachpb.Intent,
 	maxIntents int64,
 ) (needIntentHistory bool, err error) {
@@ -1791,14 +1786,36 @@ func ScanConflictingIntentsForDroppingLatchesEarly(
 		}
 		ownIntent := txnID != uuid.Nil && txnID == meta.Txn.ID
 		if ownIntent {
-			// If we ran into one of our own intents, check whether the intent has a
-			// higher (or equal) sequence number or a higher (or equal) timestamp. If
If - // either of these conditions is true, a corresponding scan over the MVCC - // key space will need access to the key's intent history in order to read - // the correct provisional value. So we set `needIntentHistory` to true. - if seq <= meta.Txn.Sequence || ts.LessEq(meta.Timestamp.ToTimestamp()) { - needIntentHistory = true - } + // If we ran into one of our own intents, a corresponding scan over the + // MVCC keyspace will need access to the key's intent history in order to + // read the correct provisional value. As such, we set `needsIntentHistory` + // to be true. + // + // This determination is more restrictive than it needs to be. A read + // request needs access to the intent history when performing a scan over + // the MVCC keyspace only if: + // 1. The request is reading at a lower sequence number than the intent's + // sequence number. + // 2. OR the request is reading at a (strictly) lower timestamp than the + // intent timestamp[1]. This can happen if the intent was pushed for some + // reason. + // 3. OR the found intent should be ignored because it was written as part + // of a savepoint which was subsequently rolled back. + // 4. OR the found intent and read request belong to different txn epochs. + // + // The conditions above mirror special case handling for intents by + // pebbleMVCCScanner's getOne method. If we find scanning the lock table + // twice (once during conflict resolution, and once when interleaving + // intents during the MVCC read) is too expensive for transactions that + // read their own writes, there's some optimizations to be had here by + // being smarter about when we decide to interleave intents or not to. + // + // [1] Only relevant if the intent has a sequence number less than or + // equal to the read request's sequence number. Otherwise, we need access + // to the intent history to read the correct provisional value -- one + // written at a lower or equal sequence number compared to the read + // request's. + needIntentHistory = true continue } if conflictingIntent := meta.Timestamp.ToTimestamp().LessEq(ts); !conflictingIntent { diff --git a/pkg/storage/engine_test.go b/pkg/storage/engine_test.go index b0eb2d455e46..b40178849d58 100644 --- a/pkg/storage/engine_test.go +++ b/pkg/storage/engine_test.go @@ -1914,6 +1914,14 @@ func TestEngineIteratorVisibility(t *testing.T) { // 2. Timestamps of the intent/read op. // 3. Epochs of the intent/read op. // 4. Whether any savepoints have been rolled back. +// +// NB: When scanning for conflicting intents to determine if latches can be +// dropped early, we fallback to using the intent interleaving iterator in all +// read-your-own-write cases. However, doing so is more restrictive than it +// needs to be -- the in-line test expectations correspond to what an optimized +// determination for `needsIntentHistory` would look like. However, for the +// purposes of this test, we assert that we always fall back to using the intent +// interleaving iterator in ALL read-your-own-write cases. 
func TestScanConflictingIntentsForDroppingLatchesEarlyReadYourOwnWrites(t *testing.T) { defer leaktest.AfterTest(t)() defer log.Scope(t).Close(t) @@ -1944,7 +1952,7 @@ func TestScanConflictingIntentsForDroppingLatchesEarlyReadYourOwnWrites(t *testi intentTS hlc.Timestamp intentSequenceNumber enginepb.TxnSeq intentEpoch enginepb.TxnEpoch - expNeedsIntentHistory bool + expNeedsIntentHistory bool // unused while testing ignoredSeqNumbers enginepb.IgnoredSeqNumRange }{ { @@ -1952,7 +1960,7 @@ func TestScanConflictingIntentsForDroppingLatchesEarlyReadYourOwnWrites(t *testi intentTS: readTS, intentSequenceNumber: readSeqNumber, intentEpoch: readTxnEpoch, - expNeedsIntentHistory: false, // fails + expNeedsIntentHistory: false, }, { name: "higher {intent seq number} equal {timestamp, epoch}", @@ -2000,7 +2008,7 @@ func TestScanConflictingIntentsForDroppingLatchesEarlyReadYourOwnWrites(t *testi intentTS: readTS, intentSequenceNumber: belowReadSeqNumber, intentEpoch: readTxnEpoch, - expNeedsIntentHistory: false, // fails + expNeedsIntentHistory: false, }, { name: "equal {epoch} lower {intent timestamp, intent seq number}", @@ -2014,7 +2022,7 @@ func TestScanConflictingIntentsForDroppingLatchesEarlyReadYourOwnWrites(t *testi intentTS: belowReadTS, intentSequenceNumber: readSeqNumber, intentEpoch: readTxnEpoch, - expNeedsIntentHistory: false, // fails + expNeedsIntentHistory: false, }, // lower/higher epoch test cases aren't exhaustive. @@ -2037,14 +2045,14 @@ func TestScanConflictingIntentsForDroppingLatchesEarlyReadYourOwnWrites(t *testi intentTS: belowReadTS, intentSequenceNumber: belowReadSeqNumber, intentEpoch: belowReadTxnEpoch, - expNeedsIntentHistory: true, // fails + expNeedsIntentHistory: true, }, { name: "higher {epoch} lower {intent timestamp, intent seq number}", intentTS: belowReadTS, intentSequenceNumber: belowReadSeqNumber, intentEpoch: aboveReadTxnEpoch, - expNeedsIntentHistory: true, // fails + expNeedsIntentHistory: true, }, // Savepoint related tests. { @@ -2054,7 +2062,7 @@ func TestScanConflictingIntentsForDroppingLatchesEarlyReadYourOwnWrites(t *testi intentSequenceNumber: belowReadSeqNumber, intentEpoch: readTxnEpoch, ignoredSeqNumbers: enginepb.IgnoredSeqNumRange{Start: belowReadSeqNumber, End: belowReadSeqNumber}, - expNeedsIntentHistory: true, // fails + expNeedsIntentHistory: true, }, { name: "intent not part of rolled back savepoint", @@ -2101,12 +2109,11 @@ func TestScanConflictingIntentsForDroppingLatchesEarlyReadYourOwnWrites(t *testi txn.ReadTimestamp, keyA, nil, - txn.Sequence, &intents, 0, /* maxIntents */ ) require.NoError(t, err) - require.Equal(t, tc.expNeedsIntentHistory, needsIntentHistory) + require.Equal(t, true, needsIntentHistory) }) } } @@ -2231,7 +2238,6 @@ func TestScanConflictingIntentsForDroppingLatchesEarly(t *testing.T) { txn.ReadTimestamp, tc.start, tc.end, - txn.Sequence, &intents, 0, /* maxIntents */ )
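
For reference, here is a minimal sketch (not part of the patch) of what the
narrower, per-intent determination enumerated in the commit message could look
like, had we optimized instead of always consulting intent history for
read-your-own-write transactions. The helper name `needIntentHistoryOptimized`
and its exact signature are hypothetical; the `enginepb` and `hlc` types and
the `enginepb.TxnSeqIsIgnored` helper exist in the tree, but the wiring shown
here is an assumption, not code from this change.

package storage

import (
	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
)

// needIntentHistoryOptimized reports whether a read at (readSeq, readTS,
// readEpoch) would need the intent history for a key on which the lock table
// scan found the reader's own intent, described by meta. It mirrors
// conditions 1-4 from the commit message. ignoredSeqNums would come from the
// read request's transaction (i.e. its rolled-back savepoint ranges).
func needIntentHistoryOptimized(
	meta *enginepb.MVCCMetadata,
	readSeq enginepb.TxnSeq,
	readTS hlc.Timestamp,
	readEpoch enginepb.TxnEpoch,
	ignoredSeqNums []enginepb.IgnoredSeqNumRange,
) bool {
	// 1. The request is reading at a lower sequence number than the found
	// intent's; the value the read should observe lives in the intent history.
	if readSeq < meta.Txn.Sequence {
		return true
	}
	// 2. The request is reading at a strictly lower timestamp than the
	// intent's, e.g. because the intent was pushed. Because check 1 did not
	// fire, the intent's sequence number is <= readSeq here, matching
	// footnote [1] in the engine.go comment above.
	if readTS.Less(meta.Timestamp.ToTimestamp()) {
		return true
	}
	// 3. The intent was written as part of a savepoint which was subsequently
	// rolled back.
	if enginepb.TxnSeqIsIgnored(meta.Txn.Sequence, ignoredSeqNums) {
		return true
	}
	// 4. The intent and the read request belong to different txn epochs.
	if meta.Txn.Epoch != readEpoch {
		return true
	}
	return false
}

Any determination along these lines would need to stay in sync with
pebbleMVCCScanner's getOne method, which is precisely the coupling the patch
avoids by setting needIntentHistory unconditionally for read-your-own-write
transactions.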