From fb739440c59ef9b7ca739e1f36f7afe104e99a67 Mon Sep 17 00:00:00 2001 From: Aayush Shah Date: Wed, 22 Jun 2022 14:38:10 -0400 Subject: [PATCH] kvserver: make MVCC GC less disruptive to foreground traffic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit changes GC requests to no longer declare exclusive latches at their BatchRequest's timestamp. This was already incorrect as explained in https://github.com/cockroachdb/cockroach/issues/55293. >The first use is broken because we acquire write latches at the batch header's timestamp, which is set to time.Now(), so we're only serializing with reads in the future and all other writes [1]. So we're disruptive to everyone except who we want to serialize with – reads in the past! This commit makes GC requests only declare a non-mvcc exclusive latch over the `RangeGCThresholdKey`. This is correct because: ``` // 1. We define "correctness" to be the property that a reader reading at / // around the GC threshold will either see the correct results or receive an // error. // 2. Readers perform their command evaluation over a stable snapshot of the // storage engine. This means that the reader will not see the effects of a // subsequent GC run as long as it created a Pebble iterator before the GC // request. // 3. A reader checks the in-memory GC threshold of a Replica after it has // created this snapshot (i.e. after a Pebble iterator has been created). // 4. If the in-memory GC threshold is above the timestamp of the read, the // reader receives an error. Otherwise, the reader is guaranteed to see a // state of the storage engine that hasn't been affected by the GC request [5]. // 5. GC requests bump the in-memory GC threshold of a Replica as a pre-apply // side effect. This means that if a reader checks the in-memory GC threshold // after it has created a Pebble iterator, it is impossible for the iterator // to point to a storage engine state that has been affected by the GC // request. ``` As a result, GC requests should now be much less disruptive to foreground traffic since they're no longer redundantly declaring exclusive latches over global keys. Release note (performance improvement): MVCC garbage collection should now be much less disruptive to foreground traffic than before. --- pkg/kv/kvserver/batcheval/cmd_gc.go | 60 ++++++----- pkg/kv/kvserver/replica_test.go | 150 ++++++++++++++++++---------- pkg/kv/kvserver/spanset/spanset.go | 18 +++- 3 files changed, 152 insertions(+), 76 deletions(-) diff --git a/pkg/kv/kvserver/batcheval/cmd_gc.go b/pkg/kv/kvserver/batcheval/cmd_gc.go index 2f9290bf2015..0570d64bb836 100644 --- a/pkg/kv/kvserver/batcheval/cmd_gc.go +++ b/pkg/kv/kvserver/batcheval/cmd_gc.go @@ -35,35 +35,47 @@ func declareKeysGC( latchSpans, _ *spanset.SpanSet, _ time.Duration, ) { - // Intentionally don't call DefaultDeclareKeys: the key range in the header - // is usually the whole range (pending resolution of #7880). gcr := req.(*roachpb.GCRequest) - for _, key := range gcr.Keys { - if keys.IsLocal(key.Key) { - latchSpans.AddNonMVCC(spanset.SpanReadWrite, roachpb.Span{Key: key.Key}) - } else { - latchSpans.AddMVCC(spanset.SpanReadWrite, roachpb.Span{Key: key.Key}, header.Timestamp) - } - } - // Extend the range key latches by feather to ensure MVCC stats - // computations correctly account for adjacent range keys tombstones if they - // need to be split. + // Extend the range key latches by feather to ensure MVCC stats computations + // correctly account for adjacent range keys tombstones if they need to be + // split. + // // TODO(oleg): These latches are very broad and will be disruptive to read and // write operations despite only accessing "stale" data. We should think of // better integrating it with latchless GC approach. - for _, span := range mergeAdjacentSpans(makeLookupBoundariesForGCRanges(rs.GetStartKey().AsRawKey(), - nil, gcr.RangeKeys)) { - latchSpans.AddMVCC(spanset.SpanReadWrite, span, - header.Timestamp) - } - // Be smart here about blocking on the threshold keys. The MVCC GC queue can - // send an empty request first to bump the thresholds, and then another one - // that actually does work but can avoid declaring these keys below. - if !gcr.Threshold.IsEmpty() { - latchSpans.AddNonMVCC(spanset.SpanReadWrite, roachpb.Span{Key: keys.RangeGCThresholdKey(rs.GetRangeID())}) + for _, span := range mergeAdjacentSpans(makeLookupBoundariesForGCRanges( + rs.GetStartKey().AsRawKey(), nil, gcr.RangeKeys, + )) { + latchSpans.AddMVCC(spanset.SpanReadWrite, span, header.Timestamp) } + // The RangeGCThresholdKey is only written to if the + // req.(*GCRequest).Threshold is set. However, we always declare an exclusive + // access over this key in order to serialize with other GC requests. + // + // Correctness: + // It is correct for a GC request to not declare exclusive access over the + // keys being GCed because of the following: + // 1. We define "correctness" to be the property that a reader reading at / + // around the GC threshold will either see the correct results or receive an + // error. + // 2. Readers perform their command evaluation over a stable snapshot of the + // storage engine. This means that the reader will not see the effects of a + // subsequent GC run as long as it created a Pebble iterator before the GC + // request. + // 3. A reader checks the in-memory GC threshold of a Replica after it has + // created this snapshot (i.e. after a Pebble iterator has been created). + // 4. If the in-memory GC threshold is above the timestamp of the read, the + // reader receives an error. Otherwise, the reader is guaranteed to see a + // state of the storage engine that hasn't been affected by the GC request [5]. + // 5. GC requests bump the in-memory GC threshold of a Replica as a pre-apply + // side effect. This means that if a reader checks the in-memory GC threshold + // after it has created a Pebble iterator, it is impossible for the iterator + // to point to a storage engine state that has been affected by the GC + // request. + latchSpans.AddNonMVCC(spanset.SpanReadWrite, roachpb.Span{Key: keys.RangeGCThresholdKey(rs.GetRangeID())}) // Needed for Range bounds checks in calls to EvalContext.ContainsKey. latchSpans.AddNonMVCC(spanset.SpanReadOnly, roachpb.Span{Key: keys.RangeDescriptorKey(rs.GetStartKey())}) + latchSpans.DisableUndeclaredAccessAssertions() } // Create latches and merge adjacent. @@ -171,9 +183,7 @@ func GC( newThreshold := oldThreshold updated := newThreshold.Forward(args.Threshold) - // Don't write the GC threshold key unless we have to. We also don't - // declare the key unless we have to (to allow the MVCC GC queue to - // batch requests more efficiently), and we must honor what we declare. + // Don't write the GC threshold key unless we have to. if updated { if err := MakeStateLoader(cArgs.EvalCtx).SetGCThreshold( ctx, readWriter, cArgs.Stats, &newThreshold, diff --git a/pkg/kv/kvserver/replica_test.go b/pkg/kv/kvserver/replica_test.go index 9a0207b4d31c..c5d6d9c3c4de 100644 --- a/pkg/kv/kvserver/replica_test.go +++ b/pkg/kv/kvserver/replica_test.go @@ -8398,56 +8398,6 @@ func TestReplicaReproposalWithNewLeaseIndexError(t *testing.T) { } } -// TestGCWithoutThreshold validates that GCRequest only declares the threshold -// key if it is subject to change, and that it does not access this key if it -// does not declare them. -func TestGCWithoutThreshold(t *testing.T) { - defer leaktest.AfterTest(t)() - defer log.Scope(t).Close(t) - - ctx := context.Background() - tc := &testContext{} - stopper := stop.NewStopper() - defer stopper.Stop(ctx) - tc.Start(ctx, t, stopper) - - for _, keyThresh := range []hlc.Timestamp{{}, {Logical: 1}} { - t.Run(fmt.Sprintf("thresh=%s", keyThresh), func(t *testing.T) { - var gc roachpb.GCRequest - var spans spanset.SpanSet - - gc.Threshold = keyThresh - cmd, _ := batcheval.LookupCommand(roachpb.GC) - cmd.DeclareKeys(tc.repl.Desc(), &roachpb.Header{RangeID: tc.repl.RangeID}, &gc, &spans, nil, 0) - - expSpans := 1 - if !keyThresh.IsEmpty() { - expSpans++ - } - if numSpans := spans.Len(); numSpans != expSpans { - t.Fatalf("expected %d declared keys, found %d", expSpans, numSpans) - } - - eng := storage.NewDefaultInMemForTesting() - defer eng.Close() - - batch := eng.NewBatch() - defer batch.Close() - rw := spanset.NewBatch(batch, &spans) - - var resp roachpb.GCResponse - if _, err := batcheval.GC(ctx, rw, batcheval.CommandArgs{ - Args: &gc, - EvalCtx: NewReplicaEvalContext( - ctx, tc.repl, &spans, false, /* requiresClosedTSOlderThanStorageSnap */ - ), - }, &resp); err != nil { - t.Fatal(err) - } - }) - } -} - // Test that, if the Raft command resulting from EndTxn request fails to be // processed/apply, then the LocalResult associated with that command is // cleared. @@ -8518,6 +8468,96 @@ func TestFailureToProcessCommandClearsLocalResult(t *testing.T) { } } +// TestMVCCStatsGCCommutesWithWrites tests that the MVCCStats updates +// corresponding to writes and GCs are commutative. +// +// This test does so by: +// 1. Initially writing N versions of a key. +// 2. Concurrently GC-ing the N-1 versions written in step 1 while writing N-1 +// new versions of the key. +// 3. Checking that the MVCC stats after step 1 and step 2 are identical. +func TestMVCCStatsGCCommutesWithWrites(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + ctx := context.Background() + tc := serverutils.StartNewTestCluster(t, 1, base.TestClusterArgs{}) + defer tc.Stopper().Stop(ctx) + key := tc.ScratchRange(t) + desc := tc.LookupRangeOrFatal(t, key) + store, err := tc.Server(0).GetStores().(*Stores).GetStore(tc.Server(0).GetFirstStoreID()) + require.NoError(t, err) + repl, err := store.GetReplica(desc.RangeID) + require.NoError(t, err) + + write := func() hlc.Timestamp { + var ba roachpb.BatchRequest + put := putArgs(key, []byte("0")) + ba.Add(&put) + resp, pErr := store.TestSender().Send(ctx, ba) + require.Nil(t, pErr) + return resp.Timestamp + } + + // Write `numVersions` versions for a key. + const numVersions = 100 + writeTimestamps := make([]hlc.Timestamp, 0, numVersions) + for i := 0; i < numVersions; i++ { + writeTimestamps = append(writeTimestamps, write()) + } + var statsBefore enginepb.MVCCStats + // NB: We use SucceedsSoon because MVCCStats are only updated on command + // application. + testutils.SucceedsSoon(t, func() error { + statsBefore = repl.GetMVCCStats() + if statsBefore.ValCount != numVersions { + return errors.Newf( + "expected val count to be equal to %d; found %d", numVersions, statsBefore.ValCount, + ) + } + return nil + }) + + // Now, we GC the first `numVersions-1` versions we wrote above while + // concurrently writing `numVersions-1` new versions. + var wg sync.WaitGroup + wg.Add(2) + go func() { + defer wg.Done() + for _, ts := range writeTimestamps[:numVersions-1] { + gcReq := gcArgs(key, key.Next(), gcKey(key, ts)) + _, pErr := kv.SendWrapped(ctx, store.TestSender(), &gcReq) + require.Nil(t, pErr) + } + }() + go func() { + defer wg.Done() + for i := 0; i < numVersions-1; i++ { + write() + } + }() + wg.Wait() + + var statsAfter enginepb.MVCCStats + testutils.SucceedsSoon(t, func() error { + statsAfter = repl.GetMVCCStats() + if statsAfter.ValCount != numVersions { + return errors.Newf( + "expected val count to be equal to %d; found %d", numVersions, statsAfter.ValCount, + ) + } + return nil + }) + + // Zero out the `LastUpdateNanos` and `GCBytesAge` fields and compare the + // stats before and after. They must be identical. + statsBefore.LastUpdateNanos = 0 + statsBefore.GCBytesAge = 0 + statsAfter.LastUpdateNanos = 0 + statsAfter.GCBytesAge = 0 + require.Equal(t, statsBefore, statsAfter, "MVCC stats before and after diverge") +} + // TestBatchTimestampBelowGCThreshold verifies that commands below the replica // GC threshold fail. func TestBatchTimestampBelowGCThreshold(t *testing.T) { @@ -8784,6 +8824,16 @@ func TestGCThresholdRacesWithRead(t *testing.T) { require.Nil(t, err) require.Equal(t, va, b) + // Since the GC request does not acquire latches on the keys being GC'ed, + // they're not guaranteed to wait for these above Puts to get applied. So + // we separately ensure both these Puts have been applied by just trying + // to read the latest value @ ts2. These Get requests do indeed declare + // latches on the keys being read, so by the time they return, subsequent + // GC requests are guaranteed to see the latest keys. + gArgs = getArgs(key) + _, pErr = kv.SendWrappedWith(ctx, reader, h2, &gArgs) + require.Nil(t, pErr) + // Perform two actions concurrently: // 1. GC up to ts2. This should remove the k@ts1 version. // 2. Read @ ts1. diff --git a/pkg/kv/kvserver/spanset/spanset.go b/pkg/kv/kvserver/spanset/spanset.go index fdefe74434e8..44e32aefea5e 100644 --- a/pkg/kv/kvserver/spanset/spanset.go +++ b/pkg/kv/kvserver/spanset/spanset.go @@ -82,7 +82,8 @@ type Span struct { // The Span slice for a particular access and scope contains non-overlapping // spans in increasing key order after calls to SortAndDedup. type SpanSet struct { - spans [NumSpanAccess][NumSpanScope][]Span + spans [NumSpanAccess][NumSpanScope][]Span + allowUndeclared bool } var spanSetPool = sync.Pool{ @@ -152,6 +153,7 @@ func (s *SpanSet) Copy() *SpanSet { n.spans[sa][ss] = append(n.spans[sa][ss], s.spans[sa][ss]...) } } + n.allowUndeclared = s.allowUndeclared return n } @@ -204,6 +206,7 @@ func (s *SpanSet) Merge(s2 *SpanSet) { s.spans[sa][ss] = append(s.spans[sa][ss], s2.spans[sa][ss]...) } } + s.allowUndeclared = s2.allowUndeclared s.SortAndDedup() } @@ -335,6 +338,12 @@ func (s *SpanSet) CheckAllowedAt( func (s *SpanSet) checkAllowed( access SpanAccess, span roachpb.Span, check func(SpanAccess, Span) bool, ) error { + if s.allowUndeclared { + // If the request has specified that undeclared spans are allowed, do + // nothing. + return nil + } + scope := SpanGlobal if (span.Key != nil && keys.IsLocal(span.Key)) || (span.EndKey != nil && keys.IsLocal(span.EndKey)) { @@ -387,3 +396,10 @@ func (s *SpanSet) Validate() error { return nil } + +// DisableUndeclaredAccessAssertions disables the assertions that prevent +// undeclared access to spans. This is generally set by requests that rely on +// other forms of synchronization for correctness (e.g. GCRequest). +func (s *SpanSet) DisableUndeclaredAccessAssertions() { + s.allowUndeclared = true +}