From 15187bb670fdf04a14323e2285b9a1ccac95f6a4 Mon Sep 17 00:00:00 2001
From: Nathan VanBenschoten
Date: Fri, 16 Apr 2021 15:28:54 -0400
Subject: [PATCH] kvnemesis: add backoff to retry loop on txn aborts

This commit adds an exponential backoff to the transaction retry loop
when it detects that a transaction has been aborted. This was observed
to prevent thrashing under heavy read-write contention on
`global_reads` ranges, which are added to kvnemesis in #63747. These
ranges have an added propensity to cause thrashing because every write
to them gets bumped to a higher timestamp, so it is currently
imperative that a transaction be able to refresh its reads after
writing to a `global_reads` range. If other transactions continue to
invalidate a read-write transaction's reads, it may never complete and
will repeatedly abort conflicting txns after detecting deadlocks. This
commit prevents such thrashing from stalling kvnemesis indefinitely.

I see two ways that we can improve this situation in the future.

1. The first option is to introduce some form of pessimistic
   read-locking for long-running read-write transactions, so that they
   can eventually prevent other transactions from invalidating their
   reads as they proceed to write to a `global_reads` range and get
   their write timestamp bumped. This would ensure that when the
   long-running transaction returns to refresh its reads (if it even
   needs to, depending on the durability of the read locks), the
   refresh would have a high likelihood of succeeding. This is
   discussed in #52768.

2. The second option is to allow a transaction to re-write its existing
   intents in new epochs without being bumped by the closed timestamp.
   If a transaction only got bumped by the closed timestamp when
   writing new intents, then after being forced to retry, it would have
   a high likelihood of succeeding on its second epoch as long as it
   didn't write to a new set of keys. This is discussed in #63796.
---
 pkg/kv/kvnemesis/applier.go | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/pkg/kv/kvnemesis/applier.go b/pkg/kv/kvnemesis/applier.go
index bde84f442580..9d43b92b433f 100644
--- a/pkg/kv/kvnemesis/applier.go
+++ b/pkg/kv/kvnemesis/applier.go
@@ -12,6 +12,7 @@ package kvnemesis
 
 import (
 	"context"
+	"time"
 
 	"github.com/cockroachdb/cockroach/pkg/kv"
 	"github.com/cockroachdb/cockroach/pkg/roachpb"
@@ -86,8 +87,17 @@ func applyOp(ctx context.Context, db *kv.DB, op *Operation) {
 		err := db.AdminTransferLease(ctx, o.Key, o.Target)
 		o.Result = resultError(ctx, err)
 	case *ClosureTxnOperation:
+		// Use a backoff loop to avoid thrashing on txn aborts. Don't wait between
+		// epochs of the same transaction to avoid waiting while holding locks.
+		retryOnAbort := retry.StartWithCtx(ctx, retry.Options{
+			InitialBackoff: 1 * time.Millisecond,
+			MaxBackoff:     250 * time.Millisecond,
+		})
 		var savedTxn *kv.Txn
 		txnErr := db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
+			if savedTxn != nil && savedTxn.TestingCloneTxn().Epoch == 0 {
+				retryOnAbort.Next()
+			}
 			savedTxn = txn
 			for i := range o.Ops {
 				op := &o.Ops[i]
@@ -116,7 +126,7 @@ func applyOp(ctx context.Context, db *kv.DB, op *Operation) {
 		})
 		o.Result = resultError(ctx, txnErr)
 		if txnErr == nil {
-			o.Txn = savedTxn.Sender().TestingCloneTxn()
+			o.Txn = savedTxn.TestingCloneTxn()
 		}
 	default:
 		panic(errors.AssertionFailedf(`unknown operation type: %T %v`, o, o))
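
For readers unfamiliar with the pkg/util/retry package used in the
diff, below is a minimal, self-contained sketch of the same
backoff-on-abort pattern. Only retry.StartWithCtx, retry.Options, and
(*Retry).Next come from the real package; runWithAbortBackoff and its
run callback are hypothetical stand-ins for db.Txn and the epoch check.

package main

import (
	"context"
	"fmt"
	"time"

	"github.com/cockroachdb/cockroach/pkg/util/retry"
)

// runWithAbortBackoff mirrors the applier's pattern: back off before
// starting a replacement txn after an abort, but never between retry
// epochs of the same txn, where locks may still be held. The aborted
// result is a stand-in for the Epoch == 0 check in the patch.
func runWithAbortBackoff(ctx context.Context, run func() (aborted bool, err error)) error {
	retryOnAbort := retry.StartWithCtx(ctx, retry.Options{
		InitialBackoff: 1 * time.Millisecond,
		MaxBackoff:     250 * time.Millisecond,
	})
	for {
		aborted, err := run()
		if err != nil || !aborted {
			return err
		}
		// The aborted txn has been discarded and holds no locks, so it
		// is safe to wait here before starting its replacement.
		retryOnAbort.Next()
	}
}

func main() {
	attempts := 0
	_ = runWithAbortBackoff(context.Background(), func() (bool, error) {
		attempts++
		fmt.Println("attempt", attempts)
		return attempts < 3, nil // simulate two aborts, then success
	})
}

The epoch check in the patch serves as the abort signal: db.Txn reuses
the same kv.Txn across retries and bumps its epoch on each restart, so
re-entering the closure with the epoch back at 0 indicates the previous
txn was aborted and replaced rather than restarted at a higher epoch.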
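
On option 1, a rough illustration of the idea (not the design discussed
in #52768): kv.Txn already exposes GetForUpdate, which acquires an
unreplicated exclusive lock at read time. The key "k" and the function
name here are made up for the example.

package example

import (
	"context"

	"github.com/cockroachdb/cockroach/pkg/kv"
)

// lockingReadWrite sketches the shape of pessimistic read locking: lock
// the read so conflicting writers queue behind this txn, making the
// eventual read refresh more likely to succeed. Unreplicated locks are
// best-effort, which is the durability caveat noted in #52768.
func lockingReadWrite(ctx context.Context, db *kv.DB) error {
	return db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
		res, err := txn.GetForUpdate(ctx, "k") // locking read, not txn.Get
		if err != nil {
			return err
		}
		// A write to a global_reads range still bumps the txn's write
		// timestamp, but the lock protects the read from invalidation
		// by other writers in the meantime.
		return txn.Put(ctx, "k", res.ValueBytes())
	})
}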