From 6efba5b13277aa36301037d9cf0d3e77b2534398 Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Fri, 23 Jul 2021 14:59:57 +0200 Subject: [PATCH 1/2] roachtest: don't run zfs/ycsb/* on AWS This threw off the nightly roachtest AWS run. Release note: None --- build/teamcity-nightly-roachtest.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/build/teamcity-nightly-roachtest.sh b/build/teamcity-nightly-roachtest.sh index 5b634258694d..7bd6ebe2ac5b 100755 --- a/build/teamcity-nightly-roachtest.sh +++ b/build/teamcity-nightly-roachtest.sh @@ -97,7 +97,9 @@ case "${CLOUD}" in PARALLELISM=3 CPUQUOTA=384 if [ -z "${TESTS}" ]; then - TESTS="kv(0|95)|ycsb|tpcc/(headroom/n4cpu16)|tpccbench/(nodes=3/cpu=16)|scbench/randomload/(nodes=3/ops=2000/conc=1)|backup/(KMS/n3cpu4)" + # NB: anchor ycsb to beginning of line to avoid matching `zfs/ycsb/*` which + # isn't supported on AWS at time of writing. + TESTS="kv(0|95)|^ycsb|tpcc/(headroom/n4cpu16)|tpccbench/(nodes=3/cpu=16)|scbench/randomload/(nodes=3/ops=2000/conc=1)|backup/(KMS/n3cpu4)" fi ;; *) From 3020d22f8942c0b203d9703a26de8558730f291e Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 23 Jul 2021 16:18:06 +0000 Subject: [PATCH 2/2] kvserver: add GC intent resolution timeout This patch adds a 2-minute timeout for GC intent resolution batches, to ensure progress even when encountering a stuck range (recall that intent GC will attempt to remove intents on other ranges that belong to a txn anchored on the current range). Release note: None --- pkg/kv/kvserver/gc/BUILD.bazel | 1 + pkg/kv/kvserver/gc/gc.go | 21 +++++++++++++++------ pkg/kv/kvserver/gc_queue.go | 7 +++++++ 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/pkg/kv/kvserver/gc/BUILD.bazel b/pkg/kv/kvserver/gc/BUILD.bazel index f2dc050319cc..94f9348274cd 100644 --- a/pkg/kv/kvserver/gc/BUILD.bazel +++ b/pkg/kv/kvserver/gc/BUILD.bazel @@ -20,6 +20,7 @@ go_library( "//pkg/storage", "//pkg/storage/enginepb", "//pkg/util/bufalloc", + "//pkg/util/contextutil", "//pkg/util/hlc", "//pkg/util/log", "//pkg/util/protoutil", diff --git a/pkg/kv/kvserver/gc/gc.go b/pkg/kv/kvserver/gc/gc.go index a2d9dcfd9658..3c7954d17f5b 100644 --- a/pkg/kv/kvserver/gc/gc.go +++ b/pkg/kv/kvserver/gc/gc.go @@ -31,6 +31,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/storage" "github.com/cockroachdb/cockroach/pkg/storage/enginepb" "github.com/cockroachdb/cockroach/pkg/util/bufalloc" + "github.com/cockroachdb/cockroach/pkg/util/contextutil" "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/cockroach/pkg/util/protoutil" @@ -202,6 +203,8 @@ type RunOptions struct { // process in one go. This number should be lower than intent resolver default to // prevent further splitting in resolver. MaxTxnsPerIntentCleanupBatch int64 + // IntentCleanupBatchTimeout is the timeout for processing a batch of intents. 0 to disable. + IntentCleanupBatchTimeout time.Duration } // CleanupIntentsFunc synchronously resolves the supplied intents @@ -252,6 +255,7 @@ func Run( maxIntentsPerIntentCleanupBatch: options.MaxIntentsPerIntentCleanupBatch, maxIntentKeyBytesPerIntentCleanupBatch: options.MaxIntentKeyBytesPerIntentCleanupBatch, maxTxnsPerIntentCleanupBatch: options.MaxTxnsPerIntentCleanupBatch, + intentCleanupBatchTimeout: options.IntentCleanupBatchTimeout, }, cleanupIntentsFn, &info) if err != nil { return Info{}, err @@ -439,6 +443,7 @@ type intentBatcherOptions struct { maxIntentsPerIntentCleanupBatch int64 maxIntentKeyBytesPerIntentCleanupBatch int64 maxTxnsPerIntentCleanupBatch int64 + intentCleanupBatchTimeout time.Duration } // newIntentBatcher initializes an intentBatcher. Batcher will take ownership of @@ -504,12 +509,16 @@ func (b *intentBatcher) maybeFlushPendingIntents(ctx context.Context) error { return ctx.Err() } - // FIXME(erikgrinaker): We should have a timeout for suboperations here now - // that the overall GC timeout has been increased, to make sure we'll make - // progress even with range unavailability. That's probably best done once - // batching is implemented, so we'll wait for that: - // https://github.com/cockroachdb/cockroach/pull/65847 - err := b.cleanupIntentsFn(ctx, b.pendingIntents) + var err error + cleanupIntentsFn := func(ctx context.Context) error { + return b.cleanupIntentsFn(ctx, b.pendingIntents) + } + if b.options.intentCleanupBatchTimeout > 0 { + err = contextutil.RunWithTimeout( + ctx, "intent GC batch", b.options.intentCleanupBatchTimeout, cleanupIntentsFn) + } else { + err = cleanupIntentsFn(ctx) + } if err == nil { // IntentTxns and PushTxn will be equal here, since // pushes to transactions whose record lies in this diff --git a/pkg/kv/kvserver/gc_queue.go b/pkg/kv/kvserver/gc_queue.go index 5b97773d6d1d..9e91816bab23 100644 --- a/pkg/kv/kvserver/gc_queue.go +++ b/pkg/kv/kvserver/gc_queue.go @@ -39,6 +39,12 @@ const ( gcQueueTimerDuration = 1 * time.Second // gcQueueTimeout is the timeout for a single GC run. gcQueueTimeout = 10 * time.Minute + // gcQueueIntentBatchTimeout is the timeout for resolving a single batch of + // intents. It is used to ensure progress in the face of unavailable ranges + // (since intent resolution may touch other ranges), but can prevent progress + // for ranged intent resolution if it exceeds the timeout. + gcQueueIntentBatchTimeout = 2 * time.Minute + // gcQueueIntentCooldownDuration is the duration to wait between GC attempts // of the same range when triggered solely by intents. This is to prevent // continually spinning on intents that belong to active transactions, which @@ -519,6 +525,7 @@ func (gcq *gcQueue) process( MaxIntentsPerIntentCleanupBatch: maxIntentsPerCleanupBatch, MaxIntentKeyBytesPerIntentCleanupBatch: maxIntentKeyBytesPerCleanupBatch, MaxTxnsPerIntentCleanupBatch: intentresolver.MaxTxnsPerIntentCleanupBatch, + IntentCleanupBatchTimeout: gcQueueIntentBatchTimeout, }, *zone.GC, &replicaGCer{repl: repl},