Skip to content

Commit

Permalink
mvcc: don't write below provisional commit timestamp on WriteTooOld e…
Browse files Browse the repository at this point in the history
…rrors

Fixes cockroachdb#34853.

This change fixes cockroachdb#34853, which appears to have been broken by
cockroachdb@57d0201#diff-41e9e1b7829f8e25daeb564f42c26017L1514.
That change made it so that intents are now written at a transaction's
provisional commit timestamp instead of its original timestamp (which is
fantastic!). In doing so, it removed special purpose code to forward the
`MVCCLogicalOpDetails` timestamp to the provisional commit timestamp of the
transaction because it assumed that the new `writeTimestamp` variable that the
change introduced already assumed this role.

Unfortunately, the change allowed this `writeTimestamp` variable to regress
below the transaction's provisional commit timestamp in subtle cases where a
committed value ended up between the transaction's original timestamp and its
provisional commit timestamp (a case that generates `WriteTooOldError`s). In
this case, we were replacing the `writeTimestamp` with the committed value's
timestamp + 1 logical tick. This regression allowed `MVCCWriteIntentOp`s to look
like they were ignoring closed timestamps.

The fix was to either introduce the special purpose code again or to avoid
letting `writeTimestamp` regress in this `WriteTooOldError` code path. This PR
chose the latter solution.

In doing so, this PR also improves upon the benefit already provided by cockroachdb#32487 -
it reduces the number of intents that need to be moved on transaction commit.
This is because it ensures that if a transaction has an original (read)
timestamp below a committed value and a provisional commit (write) timestamp
above a committed value, an intent left by that transaction will be in a
"committable" state without a need to rewrite it during intent resolution.

Release note: None
  • Loading branch information
nvanbenschoten committed Feb 21, 2019
1 parent 763e90b commit 6b49279
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 31 deletions.
82 changes: 51 additions & 31 deletions pkg/storage/engine/mvcc.go
Original file line number Diff line number Diff line change
Expand Up @@ -1266,9 +1266,6 @@ func replayTransactionalWrite(
// that the timestamp parameter be set to hlc.Timestamp{} when writing
// transactionally, but hlc.Timestamp{} is already used as a sentinel
// for inline puts.)
//
// TODO(andrei): explore alternate function signatures that reduce
// confusion about which timestamp applies.
func mvccPutInternal(
ctx context.Context,
engine Writer,
Expand Down Expand Up @@ -1326,18 +1323,23 @@ func mvccPutInternal(
return err
}

// Determine the read and write timestamps for the write. For a
// non-transactional write, these will be identical. For a transactional
// write, we read at the transaction's original timestamp (forwarded by any
// refresh timestamp) but write intents at its provisional commit timestamp.
// See the comment on the txn.Timestamp field definition for rationale.
readTimestamp := timestamp
writeTimestamp := timestamp
if txn != nil {
txnReadTimestamp := txn.OrigTimestamp
txnReadTimestamp.Forward(txn.RefreshedTimestamp)
if txnReadTimestamp != timestamp {
readTimestamp = txn.OrigTimestamp
readTimestamp.Forward(txn.RefreshedTimestamp)
if readTimestamp != timestamp {
return errors.Errorf("mvccPutInternal: txn's read timestamp %s does not match timestamp %s",
txn.OrigTimestamp, timestamp)
}
// A txn writes intents at its provisional commit timestamp. See the
// comment on the txn.Timestamp field definition for rationale.
writeTimestamp = txn.Timestamp
}
timestamp = hlc.Timestamp{} // prevent accidental use below

// Determine what the logical operation is. Are we writing an intent
// or a value directly?
Expand Down Expand Up @@ -1371,7 +1373,7 @@ func mvccPutInternal(
// The transaction has executed at this sequence before. This is merely a
// replay of the transactional write. Assert that all is in order and return
// early.
return replayTransactionalWrite(ctx, engine, iter, meta, ms, key, timestamp, value, txn, buf, valueFn)
return replayTransactionalWrite(ctx, engine, iter, meta, ms, key, readTimestamp, value, txn, buf, valueFn)
}

// We're overwriting the intent that was present at this key, before we do
Expand All @@ -1383,11 +1385,11 @@ func mvccPutInternal(
getBuf.meta = buf.meta // initialize get metadata from what we've already read

existingVal, _, _, err := mvccGetInternal(
ctx, iter, metaKey, timestamp, true /* consistent */, safeValue, txn, getBuf)
ctx, iter, metaKey, readTimestamp, true /* consistent */, safeValue, txn, getBuf)
if err != nil {
return err
}
// Its possible that the existing value is nil if the intent on the key
// It's possible that the existing value is nil if the intent on the key
// has a lower epoch. We don't have to deal with this as a special case
// because in this case, the value isn't written to the intent history.
// Instead, the intent history is blown away completely.
Expand Down Expand Up @@ -1460,35 +1462,54 @@ func mvccPutInternal(
} else {
buf.newMeta.IntentHistory = nil
}
} else if !metaTimestamp.Less(timestamp) {
// This is the case where we're trying to write under a
// committed value. Obviously we can't do that, but we can
// increment our timestamp to one logical tick past the existing
// value and go on to write, but then return a write-too-old
// error indicating what the timestamp ended up being. This
// timestamp can then be used to increment the txn timestamp and
// be returned with the response.
writeTimestamp = metaTimestamp.Next()
maybeTooOldErr = &roachpb.WriteTooOldError{Timestamp: timestamp, ActualTimestamp: writeTimestamp}
} else if !metaTimestamp.Less(readTimestamp) {
// This is the case where we're trying to write under a committed
// value. Obviously we can't do that, but we can increment our
// timestamp to one logical tick past the existing value and go on
// to write, but then also return a write-too-old error indicating
// what the timestamp ended up being. This timestamp can then be
// used to increment the txn timestamp and be returned with the
// response.
//
// For this to work, this function needs to complete its mutation to
// the Writer even if it plans to return a write-too-old error. This
// allows logic that lives in evaluateBatch to determine what to do
// with the error and whether it should prevent the batch from being
// proposed or not.
//
// NB: even if metaTimestamp is less than writeTimestamp, we can't
// avoid the WriteTooOld error if metaTimestamp is equal to or
// greater than readTimestamp. This is because certain operations
// like ConditionalPuts and InitPuts avoid ever needing refreshes
// by ensuring that they propagate WriteTooOld errors immediately
// instead of allowing their transactions to continue and be retried
// before committing.
writeTimestamp.Forward(metaTimestamp.Next())
maybeTooOldErr = &roachpb.WriteTooOldError{
Timestamp: readTimestamp, ActualTimestamp: writeTimestamp,
}
// If we're in a transaction, always get the value at the orig
// timestamp.
if txn != nil {
if value, err = maybeGetValue(
ctx, iter, metaKey, value, ok, timestamp, txn, buf, valueFn); err != nil {
ctx, iter, metaKey, value, ok, readTimestamp, txn, buf, valueFn); err != nil {
return err
}
} else {
// Outside of a transaction, read the latest value and advance
// the write timestamp to the latest value's timestamp + 1. The
// new timestamp is returned to the caller in maybeTooOldErr.
// Outside of a transaction, the read timestamp advances to the
// the latest value's timestamp + 1 as well. The new timestamp
// is returned to the caller in maybeTooOldErr. Because we're
// outside of a transaction, we'll never actually commit this
// value, but that's a concern of evaluateBatch and not here.
readTimestamp = writeTimestamp
if value, err = maybeGetValue(
ctx, iter, metaKey, value, ok, writeTimestamp, txn, buf, valueFn); err != nil {
ctx, iter, metaKey, value, ok, readTimestamp, txn, buf, valueFn); err != nil {
return err
}
}
} else {
if value, err = maybeGetValue(
ctx, iter, metaKey, value, ok, timestamp, txn, buf, valueFn); err != nil {
ctx, iter, metaKey, value, ok, readTimestamp, txn, buf, valueFn); err != nil {
return err
}
}
Expand Down Expand Up @@ -1754,7 +1775,8 @@ func mvccInitPutUsingIter(
failOnTombstones bool,
txn *roachpb.Transaction,
) error {
err := mvccPutUsingIter(ctx, engine, iter, ms, key, timestamp, noValue, txn,
return mvccPutUsingIter(
ctx, engine, iter, ms, key, timestamp, noValue, txn,
func(existVal *roachpb.Value) ([]byte, error) {
if failOnTombstones && existVal != nil && len(existVal.RawBytes) == 0 {
// We found a tombstone and failOnTombstones is true: fail.
Expand All @@ -1767,9 +1789,7 @@ func mvccInitPutUsingIter(
}
}
return value.RawBytes, nil
},
)
return err
})
}

// MVCCMerge implements a merge operation. Merge adds integer values,
Expand Down
45 changes: 45 additions & 0 deletions pkg/storage/engine/mvcc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3111,6 +3111,51 @@ func TestMVCCMultiplePutOldTimestamp(t *testing.T) {
}
}

// TestMVCCPutOldOrigTimestampNewCommitTimestamp tests a case where a
// transactional Put occurs to the same key, but with an older original
// timestamp than a pre-existing key. As always, this should result in a
// WriteTooOld error. However, in this case the transaction has a larger
// provisional commit timestamp than the pre-existing key. It should write
// its intent at this timestamp instead of directly above the existing key.
func TestMVCCPutOldOrigTimestampNewCommitTimestamp(t *testing.T) {
defer leaktest.AfterTest(t)()

ctx := context.Background()
engine := createTestEngine()
defer engine.Close()

err := MVCCPut(ctx, engine, nil, testKey1, hlc.Timestamp{WallTime: 3}, value1, nil)
if err != nil {
t.Fatal(err)
}

// Perform a transactional Put with a transaction whose original timestamp is
// below the existing key's timestamp and whose provisional commit timestamp
// is above the existing key's timestamp.
txn := makeTxn(*txn1, hlc.Timestamp{WallTime: 1})
txn.Timestamp = hlc.Timestamp{WallTime: 5}
txn.Sequence++
err = MVCCPut(ctx, engine, nil, testKey1, txn.OrigTimestamp, value2, txn)

// Verify that the Put returned a WriteTooOld with the ActualTime set to the
// transactions provisional commit timestamp.
expTS := txn.Timestamp
if wtoErr, ok := err.(*roachpb.WriteTooOldError); !ok || wtoErr.ActualTimestamp != expTS {
t.Fatalf("expected WriteTooOldError with actual time = %s; got %s", expTS, wtoErr)
}

// Verify new value was actually written at the transaction's provisional
// commit timestamp.
value, _, err := MVCCGet(ctx, engine, testKey1, hlc.MaxTimestamp, MVCCGetOptions{Txn: txn})
if err != nil {
t.Fatal(err)
}
if value.Timestamp != expTS || !bytes.Equal(value2.RawBytes, value.RawBytes) {
t.Fatalf("expected timestamp=%s (got %s), value=%q (got %q)",
value.Timestamp, expTS, value2.RawBytes, value.RawBytes)
}
}

func TestMVCCAbortTxn(t *testing.T) {
defer leaktest.AfterTest(t)()

Expand Down

0 comments on commit 6b49279

Please sign in to comment.