diff --git a/pkg/kv/kvserver/BUILD.bazel b/pkg/kv/kvserver/BUILD.bazel index 3fc0c4e559f4..971ea09c1e06 100644 --- a/pkg/kv/kvserver/BUILD.bazel +++ b/pkg/kv/kvserver/BUILD.bazel @@ -227,6 +227,7 @@ go_test( "client_merge_test.go", "client_metrics_test.go", "client_migration_test.go", + "client_mvcc_gc_test.go", "client_protectedts_test.go", "client_raft_helpers_test.go", "client_raft_log_queue_test.go", diff --git a/pkg/kv/kvserver/client_mvcc_gc_test.go b/pkg/kv/kvserver/client_mvcc_gc_test.go new file mode 100644 index 000000000000..ad266065a5d6 --- /dev/null +++ b/pkg/kv/kvserver/client_mvcc_gc_test.go @@ -0,0 +1,67 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package kvserver_test + +import ( + "context" + "testing" + "time" + + "github.com/cockroachdb/cockroach/pkg/base" + "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/server" + "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" + "github.com/cockroachdb/cockroach/pkg/util/leaktest" + "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/stretchr/testify/require" +) + +// TestMVCCGCCorrectStats verifies that the mvcc gc queue corrects stats +// for a range that has bad ones that would unnecessarily trigger the mvcc +// gc queue. +func TestMVCCGCCorrectStats(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + ctx := context.Background() + serv, _, _ := serverutils.StartServer(t, base.TestServerArgs{}) + s := serv.(*server.TestServer) + defer s.Stopper().Stop(ctx) + + key, err := s.ScratchRange() + require.NoError(t, err) + store, err := s.Stores().GetStore(s.GetFirstStoreID()) + require.NoError(t, err) + + repl := store.LookupReplica(roachpb.RKey(key)) + for i := 0; i < 10; i++ { + if err := store.DB().Put(ctx, key, "foo"); err != nil { + t.Fatal(err) + } + key = key.Next() + } + + // Put some garbage in the stats, so it triggers the mvcc gc queue. + ms := repl.GetMVCCStats() + oldKeyBytes := ms.KeyBytes + oldValBytes := ms.ValBytes + ms.KeyBytes = 16 * (1 << 20) // 16mb + ms.ValBytes = 32 * (1 << 20) // 16mb + ms.GCBytesAge = 48 * (1 << 20) * 100 * int64(time.Hour.Seconds()) + + repl.SetMVCCStatsForTesting(&ms) + require.NoError(t, store.ManualMVCCGC(repl)) + + // Verify that the mvcc gc queue restored the stats. + newStats := repl.GetMVCCStats() + require.Equal(t, oldKeyBytes, newStats.KeyBytes) + require.Equal(t, oldValBytes, newStats.ValBytes) +} diff --git a/pkg/kv/kvserver/gc/BUILD.bazel b/pkg/kv/kvserver/gc/BUILD.bazel index 781c3698fce3..f4324db2de97 100644 --- a/pkg/kv/kvserver/gc/BUILD.bazel +++ b/pkg/kv/kvserver/gc/BUILD.bazel @@ -32,7 +32,7 @@ go_library( go_test( name = "gc_test", - size = "medium", + size = "large", srcs = [ "data_distribution_test.go", "gc_iterator_test.go", diff --git a/pkg/kv/kvserver/mvcc_gc_queue.go b/pkg/kv/kvserver/mvcc_gc_queue.go index 23b3756db078..421c5be31986 100644 --- a/pkg/kv/kvserver/mvcc_gc_queue.go +++ b/pkg/kv/kvserver/mvcc_gc_queue.go @@ -18,6 +18,7 @@ import ( "sync/atomic" "time" + "github.com/cockroachdb/cockroach/pkg/kv" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/gc" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/intentresolver" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase" @@ -614,10 +615,38 @@ func (mgcq *mvccGCQueue) process( return false, err } - log.Eventf(ctx, "MVCC stats after GC: %+v", repl.GetMVCCStats()) - log.Eventf(ctx, "GC score after GC: %s", makeMVCCGCQueueScore( - ctx, repl, repl.store.Clock().Now(), lastGC, conf.TTL(), canAdvanceGCThreshold)) + scoreAfter := makeMVCCGCQueueScore( + ctx, repl, repl.store.Clock().Now(), lastGC, conf.TTL(), canAdvanceGCThreshold) + log.VEventf(ctx, 2, "MVCC stats after GC: %+v", repl.GetMVCCStats()) + log.VEventf(ctx, 2, "GC score after GC: %s", scoreAfter) updateStoreMetricsWithGCInfo(mgcq.store.metrics, info) + // If the score after running through the queue indicates that this + // replica should be re-queued for GC it most likely means that there + // is something wrong with the stats. One such known issue is + // https://github.com/cockroachdb/cockroach/issues/82920. To fix this we + // recompute stats, it's an expensive operation but it's better to recompute + // them then to spin the GC queue. + // Note: the score is not recomputed as if the GC queue was going to run again, + // because we are reusing the old lastGC and canAdvanceGCThreshold. This helps + // avoid issues with e.g. cooldown timers and focuses the recomputation on the + // difference in stats after GC. + + if scoreAfter.ShouldQueue { + // The scores are very long, so splitting into multiple lines manually for + // readability. + log.Infof(ctx, "GC still needed following GC, recomputing MVCC stats") + log.Infof(ctx, "old score %s", r) + log.Infof(ctx, "new score %s", scoreAfter) + req := roachpb.RecomputeStatsRequest{ + RequestHeader: roachpb.RequestHeader{Key: desc.StartKey.AsRawKey()}, + } + var b kv.Batch + b.AddRawRequest(&req) + err := repl.store.db.Run(ctx, &b) + if err != nil { + log.Errorf(ctx, "failed to recompute stats with error=%s", err) + } + } return true, nil } diff --git a/pkg/kv/kvserver/replica.go b/pkg/kv/kvserver/replica.go index b6c520336a5d..9b7537124285 100644 --- a/pkg/kv/kvserver/replica.go +++ b/pkg/kv/kvserver/replica.go @@ -1140,6 +1140,14 @@ func (r *Replica) GetMVCCStats() enginepb.MVCCStats { return *r.mu.state.Stats } +// SetMVCCStatsForTesting updates the MVCC stats on the repl object only, it does +// not affect the on disk state and is only safe to use for testing purposes. +func (r *Replica) SetMVCCStatsForTesting(stats *enginepb.MVCCStats) { + r.mu.RLock() + defer r.mu.RUnlock() + r.mu.state.Stats = stats +} + // GetMaxSplitQPS returns the Replica's maximum queries/s request rate over a // configured measurement period. If the Replica has not been recording QPS for // at least an entire measurement period, the method will return false.