Skip to content

Commit

Permalink
kvserver: recompute stats after mvcc gc
Browse files Browse the repository at this point in the history
Touched cockroachdb#82920

There is at least one known issue in MVCC stats calculation and
there may be more. This could lead to the MVCC GC Queue spinning on
ranges with bad stats. To prevent the queue from spinning, it should
recompute the stats if it detects that they are wrong. The easiest
mechanism to do that is to check whether the GC score wants to queue this
range again after finishing GC; if it does, that likely indicates something
fishy with the stats.

Release note: Change the MVCC GC queue to recompute MVCC stats on a
range, if after doing a GC run it still thinks there is garbage in
the range.
  • Loading branch information
lunevalex committed Jul 21, 2022
1 parent e67e47f commit a5b1d71
Show file tree
Hide file tree
Showing 3 changed files with 98 additions and 3 deletions.
67 changes: 67 additions & 0 deletions pkg/kv/kvserver/client_mvcc_gc_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
// Copyright 2022 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver_test

import (
"context"
"testing"
"time"

"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/server"
"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/stretchr/testify/require"
)

// TestMVCCGCCorrectStats verifies that the MVCC GC queue corrects stats
// for a range that has bad ones that would unnecessarily trigger the MVCC
// GC queue.
//
// The test writes a handful of keys into a scratch range, overwrites the
// replica's in-memory stats with inflated values, runs a manual MVCC GC pass,
// and then checks that KeyBytes and ValBytes are back to their original
// values.
func TestMVCCGCCorrectStats(t *testing.T) {
	defer leaktest.AfterTest(t)()
	defer log.Scope(t).Close(t)

	ctx := context.Background()
	serv, _, _ := serverutils.StartServer(t, base.TestServerArgs{})
	s := serv.(*server.TestServer)
	defer s.Stopper().Stop(ctx)

	key, err := s.ScratchRange()
	require.NoError(t, err)
	store, err := s.Stores().GetStore(s.GetFirstStoreID())
	require.NoError(t, err)

	// Fail with a clear message rather than a nil dereference if the lookup
	// does not find a replica for the scratch range.
	repl := store.LookupReplica(roachpb.RKey(key))
	require.NotNil(t, repl)

	for i := 0; i < 10; i++ {
		require.NoError(t, store.DB().Put(ctx, key, "foo"))
		key = key.Next()
	}

	// Put some garbage in the stats, so it triggers the mvcc gc queue.
	// GetMVCCStats returns a copy, so mutating ms does not touch the replica
	// until SetMVCCStatsForTesting is called.
	ms := repl.GetMVCCStats()
	oldKeyBytes := ms.KeyBytes
	oldValBytes := ms.ValBytes
	ms.KeyBytes = 16 * (1 << 20) // 16mb
	ms.ValBytes = 32 * (1 << 20) // 32mb
	// 48mb (KeyBytes + ValBytes above) aged by 100 hours, in byte-seconds.
	ms.GCBytesAge = 48 * (1 << 20) * 100 * int64(time.Hour.Seconds())

	repl.SetMVCCStatsForTesting(&ms)
	require.NoError(t, store.ManualMVCCGC(repl))

	// Verify that the mvcc gc queue restored the stats.
	newStats := repl.GetMVCCStats()
	require.Equal(t, oldKeyBytes, newStats.KeyBytes)
	require.Equal(t, oldValBytes, newStats.ValBytes)
}
26 changes: 23 additions & 3 deletions pkg/kv/kvserver/mvcc_gc_queue.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"sync/atomic"
"time"

"github.com/cockroachdb/cockroach/pkg/kv"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/gc"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/intentresolver"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
Expand Down Expand Up @@ -611,10 +612,29 @@ func (mgcq *mvccGCQueue) process(
return false, err
}

log.Eventf(ctx, "MVCC stats after GC: %+v", repl.GetMVCCStats())
log.Eventf(ctx, "GC score after GC: %s", makeMVCCGCQueueScore(
ctx, repl, repl.store.Clock().Now(), lastGC, conf.TTL(), canAdvanceGCThreshold))
scoreAfter := makeMVCCGCQueueScore(
ctx, repl, repl.store.Clock().Now(), lastGC, conf.TTL(), canAdvanceGCThreshold)
log.VEventf(ctx, 2, "MVCC stats after GC: %+v", repl.GetMVCCStats())
log.VEventf(ctx, 2, "GC score after GC: %s", scoreAfter)
updateStoreMetricsWithGCInfo(mgcq.store.metrics, info)
// If the score after running through the queue indicates that this
// replica should be re-queued for GC it most likely means that there
// is something wrong with the stats. One such known issue is
// https://github.com/cockroachdb/cockroach/issues/82920. To fix this we
// recompute stats; it's an expensive operation but it's better to recompute
// them than to spin the GC queue.
if scoreAfter.ShouldQueue {
log.Infof(ctx, "triggering stats re-computation")
req := roachpb.RecomputeStatsRequest{
RequestHeader: roachpb.RequestHeader{Key: desc.StartKey.AsRawKey()},
}
var b kv.Batch
b.AddRawRequest(&req)
err := repl.store.db.Run(ctx, &b)
if err != nil {
log.Errorf(ctx, "Failed to recompute stats with error=%s", err)
}
}
return true, nil
}

Expand Down
8 changes: 8 additions & 0 deletions pkg/kv/kvserver/replica.go
Original file line number Diff line number Diff line change
Expand Up @@ -1128,6 +1128,14 @@ func (r *Replica) GetMVCCStats() enginepb.MVCCStats {
return *r.mu.state.Stats
}

// SetMVCCStatsForTesting replaces the MVCC stats held on the Replica object
// only; it does not affect the on-disk state and is only safe to use for
// testing purposes.
//
// The caller must not mutate stats after passing it in, since the Replica
// stores the pointer rather than a copy.
func (r *Replica) SetMVCCStatsForTesting(stats *enginepb.MVCCStats) {
	// This is a write to state guarded by r.mu, so the exclusive lock is
	// required; a read lock would allow a data race with concurrent readers
	// such as GetMVCCStats.
	r.mu.Lock()
	defer r.mu.Unlock()
	r.mu.state.Stats = stats
}

// GetMaxSplitQPS returns the Replica's maximum queries/s request rate over a
// configured measurement period. If the Replica has not been recording QPS for
// at least an entire measurement period, the method will return false.
Expand Down

0 comments on commit a5b1d71

Please sign in to comment.