From f16f85412517b1be5a97ccbb013b50b0db05e643 Mon Sep 17 00:00:00 2001
From: Arul Ajmani <arulajmani@gmail.com>
Date: Wed, 22 Feb 2023 10:26:20 -0500
Subject: [PATCH] storage: consult intent history for all read-your-own-write
 transactions

Read-only requests that qualify for dropping latches early scan the
lock table under latches to perform conflict resolution. If they don't
conflict with any concurrent transactions, they drop their latches and
scan the MVCC keyspace. However, if a request is trying to read a key
that was written to in its transaction, we may need access to the key's
intent history when performing the scan.

If scanning the lock table uncovers an intent by our own transaction, we
need access to the intent history if:

1. The request is reading at a lower sequence number than the found
intent's sequence number.
2. OR the request is reading at a strictly lower timestamp than the
intent's timestamp.
3. OR the found intent should be ignored because it was written as part
of a savepoing which was subsequently rolled back.
4. OR the found intent and read request belong to different txn epochs.

Prior to this patch, we weren't accounting for 3 and 4. Worse yet, our
determination for 2 was a less than equal to -- which meant we would
end up consulting the intent history in the common case. This meant
that existing tests which would have exercised 3 and 4 weren't failing
on us.

Instead of correcting the determination, this patch makes it such that
all read-your-own-write transactions consult intent history. This
avoids the subtle complexity of getting all these cases right and
keeping them in sync with the MVCC code that reads keys. It also seems
like scanning the lock table twice (once during conflict resolution
and once during the MVCC read) for read-your-own-write transactions
in the common case did not cause noticable regressions. We can always
optimize this later, if we find there's a need to.

Epic: none
Fixes: #94337

Release note: None
---
 .../tests/3node-tenant/generated_test.go      |  7 +++
 pkg/kv/kvserver/replica_read.go               |  4 +-
 .../logictest/testdata/logic_test/savepoints  | 28 ++++++++++
 .../tests/fakedist-disk/generated_test.go     |  7 +++
 .../tests/fakedist-vec-off/generated_test.go  |  7 +++
 .../tests/fakedist/generated_test.go          |  7 +++
 .../generated_test.go                         |  7 +++
 .../tests/local-vec-off/generated_test.go     |  7 +++
 .../logictest/tests/local/generated_test.go   |  7 +++
 pkg/storage/engine.go                         | 51 ++++++++++++-------
 10 files changed, 113 insertions(+), 19 deletions(-)
 create mode 100644 pkg/sql/logictest/testdata/logic_test/savepoints

diff --git a/pkg/ccl/logictestccl/tests/3node-tenant/generated_test.go b/pkg/ccl/logictestccl/tests/3node-tenant/generated_test.go
index 6090c6f82825..e563e538bbcf 100644
--- a/pkg/ccl/logictestccl/tests/3node-tenant/generated_test.go
+++ b/pkg/ccl/logictestccl/tests/3node-tenant/generated_test.go
@@ -1501,6 +1501,13 @@ func TestTenantLogic_save_table(
 	runLogicTest(t, "save_table")
 }
 
+func TestTenantLogic_savepoints(
+	t *testing.T,
+) {
+	defer leaktest.AfterTest(t)()
+	runLogicTest(t, "savepoints")
+}
+
 func TestTenantLogic_scale(
 	t *testing.T,
 ) {
diff --git a/pkg/kv/kvserver/replica_read.go b/pkg/kv/kvserver/replica_read.go
index ed39fc05d42a..04b86024d0b4 100644
--- a/pkg/kv/kvserver/replica_read.go
+++ b/pkg/kv/kvserver/replica_read.go
@@ -303,13 +303,13 @@ func (r *Replica) canDropLatchesBeforeEval(
 	// or if any of them need to use an intent interleaving iterator.
 	for _, req := range ba.Requests {
 		reqHeader := req.GetInner().Header()
-		start, end, seq := reqHeader.Key, reqHeader.EndKey, reqHeader.Sequence
+		start, end := reqHeader.Key, reqHeader.EndKey
 		var txnID uuid.UUID
 		if ba.Txn != nil {
 			txnID = ba.Txn.ID
 		}
 		needsIntentInterleavingForThisRequest, err := storage.ScanConflictingIntentsForDroppingLatchesEarly(
-			ctx, rw, txnID, ba.Header.Timestamp, start, end, seq, &intents, maxIntents,
+			ctx, rw, txnID, ba.Header.Timestamp, start, end, &intents, maxIntents,
 		)
 		if err != nil {
 			return false /* ok */, true /* stillNeedsIntentInterleaving */, kvpb.NewError(
diff --git a/pkg/sql/logictest/testdata/logic_test/savepoints b/pkg/sql/logictest/testdata/logic_test/savepoints
new file mode 100644
index 000000000000..cecad6044e04
--- /dev/null
+++ b/pkg/sql/logictest/testdata/logic_test/savepoints
@@ -0,0 +1,28 @@
+# Regression test for https://github.com/cockroachdb/cockroach/issues/94337.
+statement ok
+CREATE TABLE a (id INT);
+
+statement ok
+BEGIN
+
+statement ok
+SELECT pg_sleep(1.5)
+
+statement ok
+SAVEPOINT s
+
+statement ok
+SELECT pg_sleep(1.5)
+
+statement ok
+INSERT INTO a(id) VALUES (0);
+
+statement ok
+ROLLBACK TO SAVEPOINT s
+
+query I
+SELECT * FROM a
+----
+
+statement ok
+COMMIT
diff --git a/pkg/sql/logictest/tests/fakedist-disk/generated_test.go b/pkg/sql/logictest/tests/fakedist-disk/generated_test.go
index 2de6ca472886..f22c96144dea 100644
--- a/pkg/sql/logictest/tests/fakedist-disk/generated_test.go
+++ b/pkg/sql/logictest/tests/fakedist-disk/generated_test.go
@@ -1472,6 +1472,13 @@ func TestLogic_save_table(
 	runLogicTest(t, "save_table")
 }
 
+func TestLogic_savepoints(
+	t *testing.T,
+) {
+	defer leaktest.AfterTest(t)()
+	runLogicTest(t, "savepoints")
+}
+
 func TestLogic_scale(
 	t *testing.T,
 ) {
diff --git a/pkg/sql/logictest/tests/fakedist-vec-off/generated_test.go b/pkg/sql/logictest/tests/fakedist-vec-off/generated_test.go
index d0c8224e3e2d..376ddb0a1316 100644
--- a/pkg/sql/logictest/tests/fakedist-vec-off/generated_test.go
+++ b/pkg/sql/logictest/tests/fakedist-vec-off/generated_test.go
@@ -1472,6 +1472,13 @@ func TestLogic_save_table(
 	runLogicTest(t, "save_table")
 }
 
+func TestLogic_savepoints(
+	t *testing.T,
+) {
+	defer leaktest.AfterTest(t)()
+	runLogicTest(t, "savepoints")
+}
+
 func TestLogic_scale(
 	t *testing.T,
 ) {
diff --git a/pkg/sql/logictest/tests/fakedist/generated_test.go b/pkg/sql/logictest/tests/fakedist/generated_test.go
index 7de3685dc9c5..910e2abcdb87 100644
--- a/pkg/sql/logictest/tests/fakedist/generated_test.go
+++ b/pkg/sql/logictest/tests/fakedist/generated_test.go
@@ -1486,6 +1486,13 @@ func TestLogic_save_table(
 	runLogicTest(t, "save_table")
 }
 
+func TestLogic_savepoints(
+	t *testing.T,
+) {
+	defer leaktest.AfterTest(t)()
+	runLogicTest(t, "savepoints")
+}
+
 func TestLogic_scale(
 	t *testing.T,
 ) {
diff --git a/pkg/sql/logictest/tests/local-legacy-schema-changer/generated_test.go b/pkg/sql/logictest/tests/local-legacy-schema-changer/generated_test.go
index a48984f17364..8ccefb70efa7 100644
--- a/pkg/sql/logictest/tests/local-legacy-schema-changer/generated_test.go
+++ b/pkg/sql/logictest/tests/local-legacy-schema-changer/generated_test.go
@@ -1444,6 +1444,13 @@ func TestLogic_save_table(
 	runLogicTest(t, "save_table")
 }
 
+func TestLogic_savepoints(
+	t *testing.T,
+) {
+	defer leaktest.AfterTest(t)()
+	runLogicTest(t, "savepoints")
+}
+
 func TestLogic_scale(
 	t *testing.T,
 ) {
diff --git a/pkg/sql/logictest/tests/local-vec-off/generated_test.go b/pkg/sql/logictest/tests/local-vec-off/generated_test.go
index 046b453d2427..c6e50dc6b545 100644
--- a/pkg/sql/logictest/tests/local-vec-off/generated_test.go
+++ b/pkg/sql/logictest/tests/local-vec-off/generated_test.go
@@ -1479,6 +1479,13 @@ func TestLogic_save_table(
 	runLogicTest(t, "save_table")
 }
 
+func TestLogic_savepoints(
+	t *testing.T,
+) {
+	defer leaktest.AfterTest(t)()
+	runLogicTest(t, "savepoints")
+}
+
 func TestLogic_scale(
 	t *testing.T,
 ) {
diff --git a/pkg/sql/logictest/tests/local/generated_test.go b/pkg/sql/logictest/tests/local/generated_test.go
index d102f07d5dc5..34aacecdcd5e 100644
--- a/pkg/sql/logictest/tests/local/generated_test.go
+++ b/pkg/sql/logictest/tests/local/generated_test.go
@@ -1619,6 +1619,13 @@ func TestLogic_save_table(
 	runLogicTest(t, "save_table")
 }
 
+func TestLogic_savepoints(
+	t *testing.T,
+) {
+	defer leaktest.AfterTest(t)()
+	runLogicTest(t, "savepoints")
+}
+
 func TestLogic_scale(
 	t *testing.T,
 ) {
diff --git a/pkg/storage/engine.go b/pkg/storage/engine.go
index 05caa963cc0f..3b840a790dc1 100644
--- a/pkg/storage/engine.go
+++ b/pkg/storage/engine.go
@@ -1734,14 +1734,10 @@ func assertMVCCIteratorInvariants(iter MVCCIterator) error {
 // proceed until the intents are resolved. Intents that don't conflict with the
 // transaction referenced by txnID[1] at the supplied `ts` are ignored.
 //
-// The caller must supply the sequence number of the request on behalf of which
-// the intents are being scanned. This is used to determine if the caller needs
-// to consult intent history when performing a scan over the MVCC keyspace
-// (indicated by the `needIntentHistory` return parameter). Intent history is
-// required to read the correct provisional value when scanning if we encounter
-// an intent written by the `txn` at a higher sequence number than the one
-// supplied or at a higher timestamp than the `ts` supplied (regardless of the
-// sequence number of the intent).
+// The `needsIntentHistory` return value indicates whether the caller needs to
+// consult intent history when performing a scan over the MVCC keyspace to
+// read correct provisional values for at least one of the keys being scanned.
+// Typically, this applies to all transactions that read their own writes.
 //
 // [1] The supplied txnID may be empty (uuid.Nil) if the request on behalf of
 // which the scan is being performed is non-transactional.
@@ -1751,7 +1747,6 @@ func ScanConflictingIntentsForDroppingLatchesEarly(
 	txnID uuid.UUID,
 	ts hlc.Timestamp,
 	start, end roachpb.Key,
-	seq enginepb.TxnSeq,
 	intents *[]roachpb.Intent,
 	maxIntents int64,
 ) (needIntentHistory bool, err error) {
@@ -1794,14 +1789,36 @@ func ScanConflictingIntentsForDroppingLatchesEarly(
 		}
 		ownIntent := txnID != uuid.Nil && txnID == meta.Txn.ID
 		if ownIntent {
-			// If we ran into one of our own intents, check whether the intent has a
-			// higher (or equal) sequence number or a higher (or equal) timestamp. If
-			// either of these conditions is true, a corresponding scan over the MVCC
-			// key space will need access to the key's intent history in order to read
-			// the correct provisional value. So we set `needIntentHistory` to true.
-			if seq <= meta.Txn.Sequence || ts.LessEq(meta.Timestamp.ToTimestamp()) {
-				needIntentHistory = true
-			}
+			// If we ran into one of our own intents, a corresponding scan over the
+			// MVCC keyspace will need access to the key's intent history in order to
+			// read the correct provisional value. As such, we set `needsIntentHistory`
+			// to be true.
+			//
+			// This determination is more restrictive than it needs to be. A read
+			// request needs access to the intent history when performing a scan over
+			// the MVCC keyspace only if:
+			// 1. The request is reading at a lower sequence number than the intent's
+			// sequence number.
+			// 2. OR the request is reading at a (strictly) lower timestamp than the
+			// intent timestamp[1]. This can happen if the intent was pushed for some
+			// reason.
+			// 3. OR the found intent should be ignored because it was written as part
+			// of a savepoint which was subsequently rolled back.
+			// 4. OR the found intent and read request belong to different txn epochs.
+			//
+			// The conditions above mirror special case handling for intents by
+			// pebbleMVCCScanner's getOne method. If we find scanning the lock table
+			// twice (once during conflict resolution, and once when interleaving
+			// intents during the MVCC read) is too expensive for transactions that
+			// read their own writes, there's some optimizations to be had here by
+			// being smarter about when we decide to interleave intents or not to.
+			//
+			// [1] Only relevant if the intent has a sequence number less than or
+			// equal to the read request's sequence number. Otherwise, we need access
+			// to the intent history to read the correct provisional value -- one
+			// written at a lower or equal sequence number compared to the read
+			// request's.
+			needIntentHistory = true
 			continue
 		}
 		if conflictingIntent := meta.Timestamp.ToTimestamp().LessEq(ts); !conflictingIntent {