Skip to content

Commit

Permalink
Merge #43580 #43606
Browse files Browse the repository at this point in the history
43580: storage: improve the error for aborted txns after lease transfer r=andreimatei a=andreimatei

Lease transfers wipe the timestamp cache, and so a txn that straddles a
lease transfer will not be allowed to create its txn record. This commit
introduces a specific reason for the TransactionAbortedError
highlighting that there's a new lease.

Release note: None

43606: kv: don't close TCS interceptors on errors r=andreimatei a=andreimatei

Before this patch, the TxnCoordSender was closing all the interceptors
when on non-retriable errors. This was a useless optimization serving to
stop the heartbeat loop early; the client was required to send a
rollback to clean up the intents.
This patch gets rid of the optimization in anticipation of the
savepoints API, which will serve for error recovery.

Release note: None

Co-authored-by: Andrei Matei <[email protected]>
  • Loading branch information
craig[bot] and andreimatei committed Jan 2, 2020
3 parents b720429 + 58d6188 + 6997710 commit d037845
Show file tree
Hide file tree
Showing 8 changed files with 311 additions and 244 deletions.
1 change: 1 addition & 0 deletions c-deps/libroach/protos/roachpb/errors.pb.cc

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 3 additions & 2 deletions c-deps/libroach/protos/roachpb/errors.pb.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 2 additions & 3 deletions pkg/kv/txn_coord_sender.go
Original file line number Diff line number Diff line change
Expand Up @@ -768,15 +768,14 @@ func (tc *TxnCoordSender) updateStateLocked(
return roachpb.NewError(err)
}

// This is the non-retriable error case.
// This is the non-retriable error case. The client is expected to send a
// rollback.
if errTxn := pErr.GetTxn(); errTxn != nil {
tc.mu.txnState = txnError
tc.mu.storedErr = roachpb.NewError(&roachpb.TxnAlreadyEncounteredErrorError{
PrevError: pErr.String(),
})
// Cleanup.
tc.mu.txn.Update(errTxn)
tc.cleanupTxnLocked(ctx)
}
return pErr
}
Expand Down
466 changes: 243 additions & 223 deletions pkg/roachpb/errors.pb.go

Large diffs are not rendered by default.

27 changes: 22 additions & 5 deletions pkg/roachpb/errors.proto
Original file line number Diff line number Diff line change
Expand Up @@ -150,11 +150,28 @@ enum TransactionAbortedReason {
// it must be the latter case, and the transaction can be retried.
ABORT_REASON_ALREADY_COMMITTED_OR_ROLLED_BACK_POSSIBLE_REPLAY = 6;

// Like the above, except the timestamp cache doesn't have a txn id in it.
// This means that it no longer has good accuracy; likely as a result of a
// lease transfer. As above, if the error has not been converted by the time
// it reaches a client, then it's not a replay.
ABORT_REASON_TIMESTAMP_CACHE_REJECTED_POSSIBLE_REPLAY= 7;
// A request attempting to create a transaction record is not allowed to
// proceed by the timestamp cache because it cannot be verified that the
// respective transaction record did not previously exist. As opposed to the
// case above, the timestamp cache does not have a txn id in it, but the lease
// under which the request is evaluated is newer than the transaction's
// minimum timestamp (see CanCreateTxnRecord()). A new lease wipes the
// timestamp cache, so transaction record creation is bound to fail for
// transactions that spanned a lease acquisition.
// As above, if the error has not been converted by the time it reaches a
// client, then it's not a replay.
ABORT_REASON_NEW_LEASE_PREVENTS_TXN = 8;

// Like the above, the timestamp cache rejects the creation of a transaction
// record. But there's no txn id in the ts cache, and also the lease is not
// new. The timestamp cache has lost accuracy because of a range merge or
// because of its memory limit.
// As above, if the error has not been converted by the time it reaches a
// client, then it's not a replay.
//
// TODO(andrei): We should be able to identify the range merge case by saving
// a bit more info in the timestamp cache.
ABORT_REASON_TIMESTAMP_CACHE_REJECTED = 7;

reserved 2;
}
Expand Down
29 changes: 29 additions & 0 deletions pkg/storage/client_lease_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -411,3 +411,32 @@ func TestCannotTransferLeaseToVoterOutgoing(t *testing.T) {
})

}

// Test the error returned by attempts to create a txn record after a lease
// transfer.
func TestTimestampCacheErrorAfterLeaseTransfer(t *testing.T) {
defer leaktest.AfterTest(t)()
ctx := context.Background()
tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{})
defer tc.Stopper().Stop(ctx)

key := []byte("a")
rangeDesc, err := tc.LookupRange(key)
require.NoError(t, err)

// Transfer the lease to Servers[0] so we start in a known state. Otherwise,
// there might be already a lease owned by a random node.
require.NoError(t, tc.TransferRangeLease(rangeDesc, tc.Target(0)))

// Start a txn and perform a write, so that a txn record has to be created by
// the EndTransaction.
txn := tc.Servers[0].DB().NewTxn(ctx, "test")
require.NoError(t, txn.Put(ctx, "a", "val"))
// After starting the transaction, transfer the lease. This will wipe the
// timestamp cache, which means that the txn record will not be able to be
// created (because someone might have already aborted the txn).
require.NoError(t, tc.TransferRangeLease(rangeDesc, tc.Target(1)))

err = txn.Commit(ctx)
require.Error(t, err, "TransactionAbortedError(ABORT_REASON_NEW_LEASE_PREVENTS_TXN)")
}
2 changes: 1 addition & 1 deletion pkg/storage/client_merge_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -516,7 +516,7 @@ func mergeCheckingTimestampCaches(t *testing.T, disjointLeaseholders bool) {
ba.Add(hb)
var expReason roachpb.TransactionAbortedReason
if disjointLeaseholders {
expReason = roachpb.ABORT_REASON_TIMESTAMP_CACHE_REJECTED_POSSIBLE_REPLAY
expReason = roachpb.ABORT_REASON_TIMESTAMP_CACHE_REJECTED
} else {
expReason = roachpb.ABORT_REASON_ABORTED_RECORD_FOUND
}
Expand Down
20 changes: 10 additions & 10 deletions pkg/storage/replica_tscache.go
Original file line number Diff line number Diff line change
Expand Up @@ -476,23 +476,23 @@ func (r *Replica) CanCreateTxnRecord(
// our coordinator has already been processed. We might be a replay,
// or we raced with an asynchronous abort. Either way, return an
// error.
return false, minCommitTS, roachpb.ABORT_REASON_ALREADY_COMMITTED_OR_ROLLED_BACK_POSSIBLE_REPLAY
return false, hlc.Timestamp{},
roachpb.ABORT_REASON_ALREADY_COMMITTED_OR_ROLLED_BACK_POSSIBLE_REPLAY
case uuid.Nil:
// On lease transfers the timestamp cache is reset with the transfer
// time as the low water mark. The timestamp cache may also lose
// information when bumping its low water mark due to memory
// constraints. If this replica recently obtained the lease or if
// the timestamp cache recently bumped its low water mark, this case
// will be true for new txns, even if they're not a replay. We force
// these txns to retry.
return false, minCommitTS, roachpb.ABORT_REASON_TIMESTAMP_CACHE_REJECTED_POSSIBLE_REPLAY
lease, _ /* nextLease */ := r.GetLease()
// Recognize the case where a lease started recently. Lease transfers bump
// the ts cache low water mark.
if wTS == lease.Start {
return false, hlc.Timestamp{}, roachpb.ABORT_REASON_NEW_LEASE_PREVENTS_TXN
}
return false, hlc.Timestamp{}, roachpb.ABORT_REASON_TIMESTAMP_CACHE_REJECTED
default:
// If we find another transaction's ID then that transaction has
// aborted us before our transaction record was written. It obeyed
// the restriction that it couldn't create a transaction record for
// us, so it bumped the write timestamp cache instead to prevent us
// from ever creating a transaction record.
return false, minCommitTS, roachpb.ABORT_REASON_ABORTED_RECORD_FOUND
return false, hlc.Timestamp{}, roachpb.ABORT_REASON_ABORTED_RECORD_FOUND
}
}
return true, minCommitTS, 0
Expand Down

0 comments on commit d037845

Please sign in to comment.