Skip to content

Commit

Permalink
kvserver: lower score threshold for mvcc GC eligibility
Browse files Browse the repository at this point in the history
Previously GC score which is calculated based on average garbage age
and percentage of live data was set to 2 to avoid frequent GC invocations.
Value was set high in abundance of caution.
Lowering it to 1 would guarantee that we would always collect something
and not cause unnecessary frequent GC runs.
To protect against unexpected situations where GC would be firing too
frequently, cooldown period of 2 hours is added. This cooldown will apply
from gc score of 1 and is decreased proportionally as score reaches 2
(the old threshold value).

Release note (performance improvement): GC score triggering mvcc GC run
is lowered. GC Score is ratio is computed from mvcc stats and uses ratio
of live objects and estimated garbage age to estimate collectability of
old data. Reduced score will trigger GC earlier lowering interval between
runs 3 times, giving 2 times reduced peak garbage usage at the expense
of 30% increase of wasteful data scanning on constantly updated data.
  • Loading branch information
aliher1911 committed Dec 1, 2022
1 parent f9730bd commit b013c84
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 25 deletions.
37 changes: 34 additions & 3 deletions pkg/kv/kvserver/mvcc_gc_queue.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/spanconfig"
"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
"github.com/cockroachdb/cockroach/pkg/util"
"github.com/cockroachdb/cockroach/pkg/util/admission"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
Expand All @@ -46,6 +47,11 @@ const (
// for ranged intent resolution if it exceeds the timeout.
mvccGCQueueIntentBatchTimeout = 2 * time.Minute

// mvccGCQueueCooldownDuration is duration to wait between MVCC GC attempts of
// the same rage when triggered by a low score threshold. This cooldown time
// is reduced proportionally to score and becomes 0 when score reaches a
// mvccGCKeyScoreNoCooldownThreshold score.
mvccGCQueueCooldownDuration = 2 * time.Hour
// mvccGCQueueIntentCooldownDuration is the duration to wait between MVCC GC
// attempts of the same range when triggered solely by intents. This is to
// prevent continually spinning on intents that belong to active transactions,
Expand All @@ -57,8 +63,9 @@ const (

// Thresholds used to decide whether to queue for MVCC GC based on keys and
// intents.
mvccGCKeyScoreThreshold = 2
mvccGCIntentScoreThreshold = 1
mvccGCKeyScoreThreshold = 1
mvccGCKeyScoreNoCooldownThreshold = 2
mvccGCIntentScoreThreshold = 1

probablyLargeAbortSpanSysCountThreshold = 10000
largeAbortSpanBytesThreshold = 16 * (1 << 20) // 16mb
Expand Down Expand Up @@ -402,8 +409,32 @@ func makeMVCCGCQueueScoreImpl(
valScore := r.DeadFraction * r.ValuesScalableScore
r.FinalScore = r.FuzzFactor * (valScore + r.IntentScore)

// Check GC queueing eligibility using cooldown discounted by score.
isGCScoreMet := func(score float64, minThreshold, maxThreshold float64, cooldown time.Duration) bool {
if minThreshold > maxThreshold {
if util.RaceEnabled {
log.Fatalf(ctx,
"invalid cooldown score thresholds. min (%f) must be less or equal to max (%f)",
minThreshold, maxThreshold)
}
// Swap thresholds for non test builds. This should never happen in practice.
minThreshold, maxThreshold = maxThreshold, minThreshold
}
// Cool down rate is how much we want to cool down after previous gc based
// on the score. If score is at min threshold we would wait for cooldown
// time, it is proportionally reduced as score reaches maxThreshold and is
// zero at maxThreshold which means no cooldown is necessary.
coolDownRate := 1 - (score-minThreshold)/(maxThreshold-minThreshold)
adjustedCoolDown := time.Duration(int64(float64(cooldown.Nanoseconds()) * coolDownRate))
if score > minThreshold && (r.LastGC == 0 || r.LastGC >= adjustedCoolDown) {
return true
}
return false
}

// First determine whether we should queue based on MVCC score alone.
r.ShouldQueue = canAdvanceGCThreshold && r.FuzzFactor*valScore > mvccGCKeyScoreThreshold
r.ShouldQueue = canAdvanceGCThreshold && isGCScoreMet(r.FuzzFactor*valScore, mvccGCKeyScoreThreshold,
mvccGCKeyScoreNoCooldownThreshold, mvccGCQueueCooldownDuration)

// Next, determine whether we should queue based on intent score. For
// intents, we also enforce a cooldown time since we may not actually
Expand Down
64 changes: 42 additions & 22 deletions pkg/kv/kvserver/mvcc_gc_queue_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -346,12 +346,21 @@ func (cws *cachedWriteSimulator) singleKeySteady(
}

func (cws *cachedWriteSimulator) shouldQueue(
b bool, prio float64, after time.Duration, ttl time.Duration, ms enginepb.MVCCStats,
b bool,
prio float64,
after time.Duration,
ttl time.Duration,
ms enginepb.MVCCStats,
timeSinceGC time.Duration,
) {
cws.t.Helper()
ts := hlc.Timestamp{}.Add(ms.LastUpdateNanos+after.Nanoseconds(), 0)
prevGCTs := hlc.Timestamp{}
if timeSinceGC > 0 {
prevGCTs = prevGCTs.Add(ms.LastUpdateNanos+(after-timeSinceGC).Nanoseconds(), 0)
}
r := makeMVCCGCQueueScoreImpl(context.Background(), 0 /* seed */, ts, ms, ttl,
hlc.Timestamp{}, true, /* canAdvanceGCThreshold */
prevGCTs, true, /* canAdvanceGCThreshold */
time.Hour, /* txnCleanupThreshold */
)
if fmt.Sprintf("%.2f", r.FinalScore) != fmt.Sprintf("%.2f", prio) || b != r.ShouldQueue {
Expand All @@ -377,6 +386,7 @@ func TestMVCCGCQueueMakeGCScoreRealistic(t *testing.T) {
}

minuteTTL, hourTTL := time.Minute, time.Hour
const never = time.Duration(0)

{
// Hammer a key with 1MB blobs for a (simulated) minute. Logically, at
Expand All @@ -391,32 +401,39 @@ func TestMVCCGCQueueMakeGCScoreRealistic(t *testing.T) {
//
// Since at the time of this check the data is already 30s old on
// average (i.e. ~30x the TTL), we expect to *really* want GC.
cws.shouldQueue(true, 28.94, time.Duration(0), 0, ms)
cws.shouldQueue(true, 28.94, time.Duration(0), 0, ms)
cws.shouldQueue(true, 28.94, time.Duration(0), 0, ms, never)
cws.shouldQueue(true, 28.94, time.Duration(0), 0, ms, never)

// Right after we finished writing, we don't want to GC yet with a one-minute TTL.
cws.shouldQueue(false, 0.48, time.Duration(0), minuteTTL, ms)
cws.shouldQueue(false, 0.48, time.Duration(0), minuteTTL, ms, never)

// Neither after a minute. The first values are about to become GC'able, though.
cws.shouldQueue(false, 1.46, time.Minute, minuteTTL, ms)
// After a minute score is already above minimum threshold and some data is
// GC able so it triggers shouldQueue if there were no GC runs recently
// enough, but not queueable if GC run recently.
cws.shouldQueue(true, 1.46, time.Minute, minuteTTL, ms, never)
cws.shouldQueue(false, 1.46, time.Minute, minuteTTL, ms, 30*time.Second)
// 90 seconds in it's really close, but still just shy of GC. Half of the
// values could be deleted now (remember that we wrote them over a one
// minute period).
cws.shouldQueue(false, 1.95, 3*time.Minute/2, minuteTTL, ms)
// We are close to high threshold, so cooldown time shrunk to just under 6
// minutes.
cws.shouldQueue(true, 1.95, 3*time.Minute/2, minuteTTL, ms, 6*time.Minute)
cws.shouldQueue(false, 1.95, 3*time.Minute/2, minuteTTL, ms, 5*time.Minute)
// Advancing another 1/4 minute does the trick.
cws.shouldQueue(true, 2.20, 7*time.Minute/4, minuteTTL, ms)
cws.shouldQueue(true, 2.20, 7*time.Minute/4, minuteTTL, ms, never)
// After an hour, that's (of course) still true with a very high priority.
cws.shouldQueue(true, 59.34, time.Hour, minuteTTL, ms)
cws.shouldQueue(true, 59.34, time.Hour, minuteTTL, ms, never)

// Let's see what the same would look like with a 1h TTL.
// Can't delete anything until 59min have passed, and indeed the score is low.
cws.shouldQueue(false, 0.01, time.Duration(0), hourTTL, ms)
cws.shouldQueue(false, 0.02, time.Minute, hourTTL, ms)
cws.shouldQueue(false, 0.99, time.Hour, hourTTL, ms)
cws.shouldQueue(false, 0.01, time.Duration(0), hourTTL, ms, never)
cws.shouldQueue(false, 0.02, time.Minute, hourTTL, ms, never)
cws.shouldQueue(false, 0.99, time.Hour, hourTTL, ms, never)
// After 90 minutes, we're getting closer. After just over two hours,
// definitely ripe for GC (and this would delete all the values).
cws.shouldQueue(false, 1.48, 90*time.Minute, hourTTL, ms)
cws.shouldQueue(true, 2.05, 125*time.Minute, hourTTL, ms)
cws.shouldQueue(true, 1.48, 90*time.Minute, hourTTL, ms, never)
cws.shouldQueue(false, 1.48, 90*time.Minute, hourTTL, ms, 5*time.Minute)
cws.shouldQueue(true, 2.05, 125*time.Minute, hourTTL, ms, never)
}

{
Expand All @@ -426,7 +443,7 @@ func TestMVCCGCQueueMakeGCScoreRealistic(t *testing.T) {
cws.multiKey(999, valSize, nil /* no txn */, &ms)

// GC shouldn't move at all, even after a long time and short TTL.
cws.shouldQueue(false, 0, 24*time.Hour, minuteTTL, ms)
cws.shouldQueue(false, 0, 24*time.Hour, minuteTTL, ms, never)

// Write a single key twice.
cws.singleKeySteady(2, 0, valSize, &ms)
Expand All @@ -437,16 +454,16 @@ func TestMVCCGCQueueMakeGCScoreRealistic(t *testing.T) {

// If the fact that 99.9% of the replica is live were not taken into
// account, this would get us at least close to GC.
cws.shouldQueue(false, 0.00, 5*time.Minute, minuteTTL, ms)
cws.shouldQueue(false, 0.00, 5*time.Minute, minuteTTL, ms, never)

// 12 hours in the score becomes relevant, but not yet large enough.
// The key is of course GC'able and has been for a long time, but
// to find it we'd have to scan all the other kv pairs as well.
cws.shouldQueue(false, 0.71, 12*time.Hour, minuteTTL, ms)
cws.shouldQueue(false, 0.71, 12*time.Hour, minuteTTL, ms, never)

// Two days in we're more than ready to go, would queue for GC, and
// delete.
cws.shouldQueue(true, 2.85, 48*time.Hour, minuteTTL, ms)
cws.shouldQueue(true, 2.85, 48*time.Hour, minuteTTL, ms, never)
}

{
Expand All @@ -462,9 +479,12 @@ func TestMVCCGCQueueMakeGCScoreRealistic(t *testing.T) {
// doesn't matter. In reality, the value-based GC score will often strike first.
cws.multiKey(100, valSize, txn, &ms)

cws.shouldQueue(false, 0.12, 1*time.Hour, irrelevantTTL, ms)
cws.shouldQueue(false, 0.87, 7*time.Hour, irrelevantTTL, ms)
cws.shouldQueue(true, 1.12, 9*time.Hour, irrelevantTTL, ms)
cws.shouldQueue(false, 0.12, 1*time.Hour, irrelevantTTL, ms, never)
cws.shouldQueue(false, 0.87, 7*time.Hour, irrelevantTTL, ms, never)
cws.shouldQueue(true, 1.12, 9*time.Hour, irrelevantTTL, ms, never)
// Check if cooldown policy protects us from too frequent runs based on
// intents alone.
cws.shouldQueue(false, 1.12, 9*time.Hour, irrelevantTTL, ms, time.Hour)
}
}

Expand Down

0 comments on commit b013c84

Please sign in to comment.