storage: write at provisional commit ts, not orig ts

The fact that transactions write at their original timestamp, and not their provisional commit timestamp, allows leaving an intent under a read. The timestamp cache will ensure that the transaction can't actually commit unless it can bump its intents above the read, but it will still leave an intent under the read in the meantime. This can lead to starvation. Intents are meant to function as a sort of lock on a key. Once a writer lays down an intent, no readers should be allowed to read above that intent until that intent is resolved. Otherwise a continual stream of readers could prevent the writer from ever managing to commit by continually bumping the timestamp cache. Now consider how CDC's poller works: it reads (tsmin, ts1], then (ts1, ts2], then (ts2, ts3], and so on, in a tight loop. Since it uses a time-bound iterator under the hood, reading (ts2, ts3], for example, cannot return an intent written at ts2. But the idea was that we inductively guranteed that we never read above an intent. If an intent was written at ts2, even though the read from (ts2, ts3] would fail to observe it, the previous read from (ts1, ts2] would have. Of course, since transactions write at their original timestamp, a transaction with an original timestamp of ts2 can write an intent at ts2 *after* the CDC poller has read (ts1, ts2]. (The transaction will be forced to commit at ts3 or later to be sequenced after the CDC poller's read, but it will leave the intent at ts2.) The CDC poller's next read, from (ts2, ts3], thus won't see the intent, nor will any future reads at higher timestamps. And so the CDC poller will continually bump the timestamp cache, completely starving the writer. Fix the problem by writing at the transaction's provisional commit timestamp (i.e., the timestamp that has been forwarded by the timestamp cache, if necessary) instead of the transaction's original timestamp. Writing at the original timestamp was only necessary to prevent a lost update in snapshot isolation mode, which is no longer supported. In serializable mode, the anomaly is protected against by the read refresh mechanism. Besides fixing the starvation problem, the new behavior is more intuitive than the old behavior. It also might have some performance benefits, as it is less likely that intents will need to be bumped at commit time, which saves on RocksDB writes. Touches #32433. Release note: None
cockroachdb · Dec 10, 2018 · 26ea8f2 · 26ea8f2
1 parent 1aa2dd1
commit 26ea8f2
Show file tree

Hide file tree

Showing 5 changed files with 92 additions and 78 deletions.
diff --git a/pkg/roachpb/data.pb.go b/pkg/roachpb/data.pb.go
diff --git a/pkg/roachpb/data.proto b/pkg/roachpb/data.proto
@@ -263,41 +263,7 @@ message Transaction {
   // refreshed_timestamp.
   //
   // This timestamp is the one at which all transactions will read, unless
-  // refreshed_timestamp is set. It is also, surprisingly, the timestamp at
-  // which transactions will provisionally _write_ (i.e. intents are written at
-  // this orig_timestamp and, after commit, when the intents are resolved,
-  // their timestamps are bumped to the to the commit timestamp), if
-  // refreshed_timestamp isn't set.
-  // This is ultimately because of correctness concerns around SNAPSHOT
-  // transactions.
-  //
-  // Intuitively, one could think that the timestamp at which intents should be
-  // written should be the provisional commit timestamp, and while this is
-  // morally true, consider the following scenario, where txn1 is a SNAPSHOT
-  // txn:
-  //
-  // - txn1 at orig_timestamp=5 reads key1: (value) 1.
-  // - txn1 writes elsewhere, has its commit timestamp increased to 20.
-  // - txn2 at orig_timestamp=10 reads key1: 1
-  // - txn2 increases the value by 5: key1: 6 and commits
-  // - txn1 increases the value by 1: key1: 2, attempts commit
-  //
-  // If txn1 uses its orig_timestamp for updating key1 (as it does), it
-  // conflicts with txn2's committed value (which is at timestamp 10, in the
-  // future of 5), and restarts.
-  // Using instead its candidate commit timestamp, it wouldn't see a conflict
-  // and commit, but this is not the expected outcome (the expected outcome is
-  // {key1: 6} (since txn1 is not expected to commit)) and we would be
-  // experiencing the Lost Update Anomaly.
-  //
-  // Note that in practice, before restarting, txn1 would still lay down an
-  // intent (just above the committed value) not with the intent to commit it,
-  // but to avoid being starved by short-lived transactions on that key which
-  // would otherwise not have to go through conflict resolution with txn1.
-  //
-  // Again, keep in mind that, when the transaction commits, all the intents are
-  // bumped to the commit timestamp (otherwise, pushing a transaction wouldn't
-  // achieve anything).
+  // refreshed_timestamp is set.
   util.hlc.Timestamp orig_timestamp = 6 [(gogoproto.nullable) = false];
   // Initial Timestamp + clock skew. Reads which encounter values with
   // timestamps between timestamp and max_timestamp trigger a txn

diff --git a/pkg/storage/engine/enginepb/mvcc3.proto b/pkg/storage/engine/enginepb/mvcc3.proto
@@ -37,14 +37,24 @@ message TxnMeta {
   // Incremented on txn retry.
   uint32 epoch = 4;
   // The proposed timestamp for the transaction. This starts as the
-  // current wall time on the txn coordinator. This is the timestamp
-  // at which all of the transaction's writes are performed: even if
-  // intents have been laid down at different timestamps, the process
-  // of resolving them (e.g. when the txn commits) will bump them to
-  // this timestamp. SERIALIZABLE transactions only commit when
-  // timestamp == orig_timestamp. SNAPSHOT transactions can commit
-  // even when they've performed their reads (at orig_timestamp) at a
-  // different timestamp than their writes (at timestamp).
+  // current wall time on the txn coordinator, and is forwarded by the
+  // timestamp cache if the txn attempts to write "beneath" another txn.
+  //
+  // Reads and writes within the txn are performed using the most up-
+  // to-date value of this timestamp that is available. For example,
+  // suppose a txn starts at some timestamp, writes a key/value, and
+  // gets pushed forward while doing so. As soon as the txn coordinator
+  // learns of the updated timestamp, it will begin performing reads
+  // and writes at the updated timestamp. The coordinator may, however,
+  // continue issuing reads at the original timestamp before it learns
+  // about the forwarded timestamp.
+  //
+  // Importantly, intents are laid down at this timestamp. The process
+  // of resolving the intents when the txn commits will bump any intents
+  // written at an older timestamp to the final commit timestamp. A
+  // similar process occurs with reads, where any reads that occured at
+  // older timestamps will need to be refreshed at the final commit
+  // timestamp in order for the transactin to commit.
   util.hlc.Timestamp timestamp = 5 [(gogoproto.nullable) = false];
   int32 priority = 6;
   // A one-indexed sequence number which is increased on each request

diff --git a/pkg/storage/replica.go b/pkg/storage/replica.go
@@ -2583,6 +2583,13 @@ func (r *Replica) applyTimestampCache(
 						txn := ba.Txn.Clone()
 						bumped = txn.Timestamp.Forward(nextTS) || bumped
 						ba.Txn = &txn
+						// Update the batch timestamp, and not just the batch's
+						// transaction's timestamp, so that the request will leave intents,
+						// if any, at the forwarded timestamp. This maintains the invariant
+						// that nothing will ever be written beneath a prior read/write, not
+						// even an intent that will never be committed. Time-bound iterators
+						// rely on this guarantee.
+						ba.Timestamp = txn.Timestamp
 					}
 				}
 			} else {
@@ -2601,6 +2608,11 @@ func (r *Replica) applyTimestampCache(
 						bumped = txn.Timestamp.Forward(wTS.Next()) || bumped
 						txn.WriteTooOld = true
 						ba.Txn = &txn
+						// Update the batch timestamp, and not just the batch's
+						// transaction's timestamp, so that the request will leave intents,
+						// if any, at the forwarded timestamp. See the analogous comment in the
+						// read tscache path for rationale.
+						ba.Timestamp = txn.Timestamp
 					}
 				}
 			} else {

diff --git a/pkg/storage/replica_test.go b/pkg/storage/replica_test.go
@@ -2618,6 +2618,66 @@ func TestReplicaUseTSCache(t *testing.T) {
 	}
 }
 
+func TestReplicaTSCacheForwardsIntentTS(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+
+	ctx := context.Background()
+	tc := testContext{}
+	stopper := stop.NewStopper()
+	defer stopper.Stop(ctx)
+	tc.Start(t, stopper)
+
+	tsOld := tc.Clock().Now()
+	tsNew := tsOld.Add(time.Millisecond.Nanoseconds(), 0)
+
+	// Read at tNew to populate the read timestamp cache.
+	// DeleteRange at tNew to populate the write timestamp cache.
+	txnNew := newTransaction("new", roachpb.Key("txn-anchor"), roachpb.NormalUserPriority, tc.Clock())
+	txnNew.OrigTimestamp = tsNew
+	txnNew.Timestamp = tsNew
+	keyGet := roachpb.Key("get")
+	keyDeleteRange := roachpb.Key("delete-range")
+	gArgs := &roachpb.GetRequest{
+		RequestHeader: roachpb.RequestHeader{Key: keyGet},
+	}
+	drArgs := &roachpb.DeleteRangeRequest{
+		RequestHeader: roachpb.RequestHeader{Key: keyDeleteRange, EndKey: keyDeleteRange.Next()},
+	}
+	assignSeqNumsForReqs(txnNew, gArgs, drArgs)
+	var ba roachpb.BatchRequest
+	ba.Header.Txn = txnNew
+	ba.Add(gArgs, drArgs)
+	if _, pErr := tc.Sender().Send(ctx, ba); pErr != nil {
+		t.Fatal(pErr)
+	}
+
+	// Write under the timestamp cache within the transaction, and verify that
+	// the intents are written above the timestamp cache.
+	txnOld := newTransaction("old", roachpb.Key("txn-anchor"), roachpb.NormalUserPriority, tc.Clock())
+	txnOld.OrigTimestamp = tsOld
+	txnOld.Timestamp = tsOld
+	for _, key := range []roachpb.Key{keyGet, keyDeleteRange} {
+		t.Run(string(key), func(t *testing.T) {
+			pArgs := putArgs(key, []byte("foo"))
+			assignSeqNumsForReqs(txnOld, &pArgs)
+			if _, pErr := tc.SendWrappedWith(roachpb.Header{Txn: txnOld}, &pArgs); pErr != nil {
+				t.Fatal(pErr)
+			}
+			var keyMeta enginepb.MVCCMetadata
+			ok, _, _, err := tc.engine.GetProto(engine.MakeMVCCMetadataKey(key), &keyMeta)
+			if err != nil {
+				t.Fatal(err)
+			} else if !ok {
+				t.Fatalf("metadata missing for key %q", key)
+			}
+			if tsNext := tsNew.Next(); hlc.Timestamp(keyMeta.Timestamp) != tsNext {
+				t.Errorf("timestamp not forwarded for %q intent: expected %s but got %s",
+					key, tsNext, keyMeta.Timestamp)
+			}
+		})
+	}
+}
+
 func TestConditionalPutUpdatesTSCacheOnError(t *testing.T) {
 	defer leaktest.AfterTest(t)()
 	tc := testContext{}