diff --git a/pkg/kv/kvserver/mvcc_gc_queue.go b/pkg/kv/kvserver/mvcc_gc_queue.go
index 294aafda4714..65a0eecdf700 100644
--- a/pkg/kv/kvserver/mvcc_gc_queue.go
+++ b/pkg/kv/kvserver/mvcc_gc_queue.go
@@ -25,6 +25,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
 	"github.com/cockroachdb/cockroach/pkg/spanconfig"
 	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
+	"github.com/cockroachdb/cockroach/pkg/util"
 	"github.com/cockroachdb/cockroach/pkg/util/admission"
 	"github.com/cockroachdb/cockroach/pkg/util/hlc"
 	"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
@@ -46,6 +47,11 @@ const (
 	// for ranged intent resolution if it exceeds the timeout.
 	mvccGCQueueIntentBatchTimeout = 2 * time.Minute

+	// mvccGCQueueCooldownDuration is the duration to wait between MVCC GC
+	// attempts of the same range when triggered by a low score threshold. This
+	// cooldown is reduced proportionally to the score and becomes 0 when the
+	// score reaches mvccGCKeyScoreNoCooldownThreshold.
+	mvccGCQueueCooldownDuration = 2 * time.Hour
 	// mvccGCQueueIntentCooldownDuration is the duration to wait between MVCC GC
 	// attempts of the same range when triggered solely by intents. This is to
 	// prevent continually spinning on intents that belong to active transactions,
@@ -57,8 +63,9 @@ const (

 	// Thresholds used to decide whether to queue for MVCC GC based on keys and
 	// intents.
-	mvccGCKeyScoreThreshold    = 2
-	mvccGCIntentScoreThreshold = 1
+	mvccGCKeyScoreThreshold           = 1
+	mvccGCKeyScoreNoCooldownThreshold = 2
+	mvccGCIntentScoreThreshold        = 1

 	probablyLargeAbortSpanSysCountThreshold = 10000
 	largeAbortSpanBytesThreshold            = 16 * (1 << 20) // 16mb
@@ -402,8 +409,32 @@ func makeMVCCGCQueueScoreImpl(
 	valScore := r.DeadFraction * r.ValuesScalableScore
 	r.FinalScore = r.FuzzFactor * (valScore + r.IntentScore)

+	// Check GC queueing eligibility using a cooldown discounted by score.
+	isGCScoreMet := func(score float64, minThreshold, maxThreshold float64, cooldown time.Duration) bool {
+		if minThreshold > maxThreshold {
+			if util.RaceEnabled {
+				log.Fatalf(ctx,
+					"invalid cooldown score thresholds: min (%f) must be less than or equal to max (%f)",
+					minThreshold, maxThreshold)
+			}
+			// Swap thresholds for non-test builds. This should never happen in practice.
+			minThreshold, maxThreshold = maxThreshold, minThreshold
+		}
+		// The cooldown rate is the fraction of the cooldown period to wait after
+		// the previous GC, based on the score: at minThreshold we wait the full
+		// cooldown, the wait shrinks proportionally as the score grows, and at
+		// maxThreshold it is zero, i.e. no cooldown is necessary.
+		coolDownRate := 1 - (score-minThreshold)/(maxThreshold-minThreshold)
+		adjustedCoolDown := time.Duration(int64(float64(cooldown.Nanoseconds()) * coolDownRate))
+		if score > minThreshold && (r.LastGC == 0 || r.LastGC >= adjustedCoolDown) {
+			return true
+		}
+		return false
+	}
+
 	// First determine whether we should queue based on MVCC score alone.
-	r.ShouldQueue = canAdvanceGCThreshold && r.FuzzFactor*valScore > mvccGCKeyScoreThreshold
+	r.ShouldQueue = canAdvanceGCThreshold && isGCScoreMet(r.FuzzFactor*valScore, mvccGCKeyScoreThreshold,
+		mvccGCKeyScoreNoCooldownThreshold, mvccGCQueueCooldownDuration)

 	// Next, determine whether we should queue based on intent score. For
 	// intents, we also enforce a cooldown time since we may not actually
diff --git a/pkg/kv/kvserver/mvcc_gc_queue_test.go b/pkg/kv/kvserver/mvcc_gc_queue_test.go
index c49a8d0e3d08..48d4f8dfc7ad 100644
--- a/pkg/kv/kvserver/mvcc_gc_queue_test.go
+++ b/pkg/kv/kvserver/mvcc_gc_queue_test.go
@@ -346,12 +346,21 @@ func (cws *cachedWriteSimulator) singleKeySteady(
 }

 func (cws *cachedWriteSimulator) shouldQueue(
-	b bool, prio float64, after time.Duration, ttl time.Duration, ms enginepb.MVCCStats,
+	b bool,
+	prio float64,
+	after time.Duration,
+	ttl time.Duration,
+	ms enginepb.MVCCStats,
+	timeSinceGC time.Duration,
 ) {
 	cws.t.Helper()
 	ts := hlc.Timestamp{}.Add(ms.LastUpdateNanos+after.Nanoseconds(), 0)
+	prevGCTs := hlc.Timestamp{}
+	if timeSinceGC > 0 {
+		prevGCTs = prevGCTs.Add(ms.LastUpdateNanos+(after-timeSinceGC).Nanoseconds(), 0)
+	}
 	r := makeMVCCGCQueueScoreImpl(context.Background(), 0 /* seed */, ts, ms, ttl,
-		hlc.Timestamp{}, true, /* canAdvanceGCThreshold */
+		prevGCTs, true, /* canAdvanceGCThreshold */
 		time.Hour, /* txnCleanupThreshold */
 	)
 	if fmt.Sprintf("%.2f", r.FinalScore) != fmt.Sprintf("%.2f", prio) || b != r.ShouldQueue {
@@ -377,6 +386,7 @@ func TestMVCCGCQueueMakeGCScoreRealistic(t *testing.T) {
 	}

 	minuteTTL, hourTTL := time.Minute, time.Hour
+	const never = time.Duration(0)

 	{
 		// Hammer a key with 1MB blobs for a (simulated) minute. Logically, at
@@ -391,32 +401,39 @@ func TestMVCCGCQueueMakeGCScoreRealistic(t *testing.T) {
 		//
 		// Since at the time of this check the data is already 30s old on
 		// average (i.e. ~30x the TTL), we expect to *really* want GC.
-		cws.shouldQueue(true, 28.94, time.Duration(0), 0, ms)
-		cws.shouldQueue(true, 28.94, time.Duration(0), 0, ms)
+		cws.shouldQueue(true, 28.94, time.Duration(0), 0, ms, never)
+		cws.shouldQueue(true, 28.94, time.Duration(0), 0, ms, never)

 		// Right after we finished writing, we don't want to GC yet with a one-minute TTL.
-		cws.shouldQueue(false, 0.48, time.Duration(0), minuteTTL, ms)
+		cws.shouldQueue(false, 0.48, time.Duration(0), minuteTTL, ms, never)

-		// Neither after a minute. The first values are about to become GC'able, though.
-		cws.shouldQueue(false, 1.46, time.Minute, minuteTTL, ms)
+		// After a minute, the score is already above the minimum threshold and
+		// some data is GC'able, so shouldQueue triggers if there was no recent
+		// enough GC run, but not if GC ran recently.
+		cws.shouldQueue(true, 1.46, time.Minute, minuteTTL, ms, never)
+		cws.shouldQueue(false, 1.46, time.Minute, minuteTTL, ms, 30*time.Second)
 		// 90 seconds in it's really close, but still just shy of GC. Half of the
 		// values could be deleted now (remember that we wrote them over a one
 		// minute period).
-		cws.shouldQueue(false, 1.95, 3*time.Minute/2, minuteTTL, ms)
+		// We are close to the high threshold, so the cooldown has shrunk to just
+		// under 6 minutes.
+		cws.shouldQueue(true, 1.95, 3*time.Minute/2, minuteTTL, ms, 6*time.Minute)
+		cws.shouldQueue(false, 1.95, 3*time.Minute/2, minuteTTL, ms, 5*time.Minute)

 		// Advancing another 1/4 minute does the trick.
-		cws.shouldQueue(true, 2.20, 7*time.Minute/4, minuteTTL, ms)
+		cws.shouldQueue(true, 2.20, 7*time.Minute/4, minuteTTL, ms, never)

 		// After an hour, that's (of course) still true with a very high priority.
-		cws.shouldQueue(true, 59.34, time.Hour, minuteTTL, ms)
+		cws.shouldQueue(true, 59.34, time.Hour, minuteTTL, ms, never)

 		// Let's see what the same would look like with a 1h TTL.
 		// Can't delete anything until 59min have passed, and indeed the score is low.
-		cws.shouldQueue(false, 0.01, time.Duration(0), hourTTL, ms)
-		cws.shouldQueue(false, 0.02, time.Minute, hourTTL, ms)
-		cws.shouldQueue(false, 0.99, time.Hour, hourTTL, ms)
+		cws.shouldQueue(false, 0.01, time.Duration(0), hourTTL, ms, never)
+		cws.shouldQueue(false, 0.02, time.Minute, hourTTL, ms, never)
+		cws.shouldQueue(false, 0.99, time.Hour, hourTTL, ms, never)

 		// After 90 minutes, we're getting closer. After just over two hours,
 		// definitely ripe for GC (and this would delete all the values).
-		cws.shouldQueue(false, 1.48, 90*time.Minute, hourTTL, ms)
-		cws.shouldQueue(true, 2.05, 125*time.Minute, hourTTL, ms)
+		cws.shouldQueue(true, 1.48, 90*time.Minute, hourTTL, ms, never)
+		cws.shouldQueue(false, 1.48, 90*time.Minute, hourTTL, ms, 5*time.Minute)
+		cws.shouldQueue(true, 2.05, 125*time.Minute, hourTTL, ms, never)
 	}

 	{
@@ -426,7 +443,7 @@ func TestMVCCGCQueueMakeGCScoreRealistic(t *testing.T) {
 		cws.multiKey(999, valSize, nil /* no txn */, &ms)

 		// GC shouldn't move at all, even after a long time and short TTL.
-		cws.shouldQueue(false, 0, 24*time.Hour, minuteTTL, ms)
+		cws.shouldQueue(false, 0, 24*time.Hour, minuteTTL, ms, never)

 		// Write a single key twice.
 		cws.singleKeySteady(2, 0, valSize, &ms)
@@ -437,16 +454,16 @@ func TestMVCCGCQueueMakeGCScoreRealistic(t *testing.T) {

 		// If the fact that 99.9% of the replica is live were not taken into
 		// account, this would get us at least close to GC.
-		cws.shouldQueue(false, 0.00, 5*time.Minute, minuteTTL, ms)
+		cws.shouldQueue(false, 0.00, 5*time.Minute, minuteTTL, ms, never)

 		// 12 hours in the score becomes relevant, but not yet large enough.
 		// The key is of course GC'able and has been for a long time, but
 		// to find it we'd have to scan all the other kv pairs as well.
-		cws.shouldQueue(false, 0.71, 12*time.Hour, minuteTTL, ms)
+		cws.shouldQueue(false, 0.71, 12*time.Hour, minuteTTL, ms, never)

 		// Two days in we're more than ready to go, would queue for GC, and
 		// delete.
-		cws.shouldQueue(true, 2.85, 48*time.Hour, minuteTTL, ms)
+		cws.shouldQueue(true, 2.85, 48*time.Hour, minuteTTL, ms, never)
 	}

 	{
@@ -462,9 +479,12 @@ func TestMVCCGCQueueMakeGCScoreRealistic(t *testing.T) {
 		// doesn't matter. In reality, the value-based GC score will often strike first.
 		cws.multiKey(100, valSize, txn, &ms)

-		cws.shouldQueue(false, 0.12, 1*time.Hour, irrelevantTTL, ms)
-		cws.shouldQueue(false, 0.87, 7*time.Hour, irrelevantTTL, ms)
-		cws.shouldQueue(true, 1.12, 9*time.Hour, irrelevantTTL, ms)
+		cws.shouldQueue(false, 0.12, 1*time.Hour, irrelevantTTL, ms, never)
+		cws.shouldQueue(false, 0.87, 7*time.Hour, irrelevantTTL, ms, never)
+		cws.shouldQueue(true, 1.12, 9*time.Hour, irrelevantTTL, ms, never)
+		// Check that the cooldown policy protects us from too-frequent runs
+		// based on intents alone.
+		cws.shouldQueue(false, 1.12, 9*time.Hour, irrelevantTTL, ms, time.Hour)

 	}
 }
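
For reference, the cooldown discount applied by isGCScoreMet is linear in the score. Below is a minimal standalone sketch (not part of the patch; the helper name adjustedCooldown and the clamping are illustrative only) that reproduces the arithmetic with the constants introduced by the diff:

package main

import (
	"fmt"
	"time"
)

const (
	minThreshold = 1.0           // mirrors mvccGCKeyScoreThreshold
	maxThreshold = 2.0           // mirrors mvccGCKeyScoreNoCooldownThreshold
	baseCooldown = 2 * time.Hour // mirrors mvccGCQueueCooldownDuration
)

// adjustedCooldown scales the base cooldown linearly: the full two hours at
// the minimum threshold, shrinking to zero at the no-cooldown threshold.
func adjustedCooldown(score float64) time.Duration {
	rate := 1 - (score-minThreshold)/(maxThreshold-minThreshold)
	if rate < 0 {
		// The patch leaves the value negative, which the LastGC comparison
		// handles the same way; clamping here just keeps the output readable.
		rate = 0
	}
	return time.Duration(int64(float64(baseCooldown.Nanoseconds()) * rate))
}

func main() {
	// Scores taken from the assertions in TestMVCCGCQueueMakeGCScoreRealistic.
	for _, score := range []float64{1.46, 1.95, 2.20} {
		fmt.Printf("score %.2f -> wait %s since last GC\n", score, adjustedCooldown(score))
	}
	// score 1.46 -> wait 1h4m48s since last GC
	// score 1.95 -> wait 6m0s since last GC
	// score 2.20 -> wait 0s since last GC
}

Under these constants a score of 1.95 yields the roughly six-minute cooldown the test asserts (queueable after 6 minutes, not after 5), while any score at or above mvccGCKeyScoreNoCooldownThreshold queues immediately, matching the pre-patch behavior at the old threshold of 2.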