Skip to content

Commit

Permalink
kvserver: make MVCC GC less disruptive to foreground traffic
Browse files Browse the repository at this point in the history
This commit changes GC requests to no longer declare exclusive latches
at their BatchRequest's timestamp. This was already incorrect as
explained in cockroachdb#55293.

>The first use is broken because we acquire write latches at the batch
header's timestamp, which is set to time.Now(), so we're only
serializing with reads in the future and all other writes [1]. So we're
disruptive to everyone except who we want to serialize with – reads in
the past!

This commit makes GC requests only declare a non-mvcc exclusive latch
over the `RangeGCThresholdKey`. This is correct because:
```

// 1. We define "correctness" to be the property that a reader reading at /
// around the GC threshold will either see the correct results or receive an
// error.
// 2. Readers perform their command evaluation over a stable snapshot of the
// storage engine. This means that the reader will not see the effects of a
// subsequent GC run as long as it created a Pebble iterator before the GC
// request.
// 3. A reader checks the in-memory GC threshold of a Replica after it has
// created this snapshot (i.e. after a Pebble iterator has been created).
// 4. If the in-memory GC threshold is above the timestamp of the read, the
// reader receives an error. Otherwise, the reader is guaranteed to see a
// state of the storage engine that hasn't been affected by the GC request [5].
// 5. GC requests bump the in-memory GC threshold of a Replica as a pre-apply
// side effect. This means that if a reader checks the in-memory GC threshold
// after it has created a Pebble iterator, it is impossible for the iterator
// to point to a storage engine state that has been affected by the GC
// request.

```

As a result, GC requests should now be much less disruptive to
foreground traffic since they're no longer redundantly declaring
exclusive latches over global keys.

Release note (performance improvement): MVCC garbage collection should
now be much less disruptive to foreground traffic than before.
  • Loading branch information
aayushshah15 committed Jul 22, 2022
1 parent 4ce560f commit fb73944
Show file tree
Hide file tree
Showing 3 changed files with 152 additions and 76 deletions.
60 changes: 35 additions & 25 deletions pkg/kv/kvserver/batcheval/cmd_gc.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,35 +35,47 @@ func declareKeysGC(
latchSpans, _ *spanset.SpanSet,
_ time.Duration,
) {
// Intentionally don't call DefaultDeclareKeys: the key range in the header
// is usually the whole range (pending resolution of #7880).
gcr := req.(*roachpb.GCRequest)
for _, key := range gcr.Keys {
if keys.IsLocal(key.Key) {
latchSpans.AddNonMVCC(spanset.SpanReadWrite, roachpb.Span{Key: key.Key})
} else {
latchSpans.AddMVCC(spanset.SpanReadWrite, roachpb.Span{Key: key.Key}, header.Timestamp)
}
}
// Extend the range key latches by feather to ensure MVCC stats
// computations correctly account for adjacent range keys tombstones if they
// need to be split.
// Extend the range key latches by feather to ensure MVCC stats computations
// correctly account for adjacent range keys tombstones if they need to be
// split.
//
// TODO(oleg): These latches are very broad and will be disruptive to read and
// write operations despite only accessing "stale" data. We should think of
// better integrating it with latchless GC approach.
for _, span := range mergeAdjacentSpans(makeLookupBoundariesForGCRanges(rs.GetStartKey().AsRawKey(),
nil, gcr.RangeKeys)) {
latchSpans.AddMVCC(spanset.SpanReadWrite, span,
header.Timestamp)
}
// Be smart here about blocking on the threshold keys. The MVCC GC queue can
// send an empty request first to bump the thresholds, and then another one
// that actually does work but can avoid declaring these keys below.
if !gcr.Threshold.IsEmpty() {
latchSpans.AddNonMVCC(spanset.SpanReadWrite, roachpb.Span{Key: keys.RangeGCThresholdKey(rs.GetRangeID())})
for _, span := range mergeAdjacentSpans(makeLookupBoundariesForGCRanges(
rs.GetStartKey().AsRawKey(), nil, gcr.RangeKeys,
)) {
latchSpans.AddMVCC(spanset.SpanReadWrite, span, header.Timestamp)
}
// The RangeGCThresholdKey is only written to if the
// req.(*GCRequest).Threshold is set. However, we always declare an exclusive
// access over this key in order to serialize with other GC requests.
//
// Correctness:
// It is correct for a GC request to not declare exclusive access over the
// keys being GCed because of the following:
// 1. We define "correctness" to be the property that a reader reading at /
// around the GC threshold will either see the correct results or receive an
// error.
// 2. Readers perform their command evaluation over a stable snapshot of the
// storage engine. This means that the reader will not see the effects of a
// subsequent GC run as long as it created a Pebble iterator before the GC
// request.
// 3. A reader checks the in-memory GC threshold of a Replica after it has
// created this snapshot (i.e. after a Pebble iterator has been created).
// 4. If the in-memory GC threshold is above the timestamp of the read, the
// reader receives an error. Otherwise, the reader is guaranteed to see a
// state of the storage engine that hasn't been affected by the GC request [5].
// 5. GC requests bump the in-memory GC threshold of a Replica as a pre-apply
// side effect. This means that if a reader checks the in-memory GC threshold
// after it has created a Pebble iterator, it is impossible for the iterator
// to point to a storage engine state that has been affected by the GC
// request.
latchSpans.AddNonMVCC(spanset.SpanReadWrite, roachpb.Span{Key: keys.RangeGCThresholdKey(rs.GetRangeID())})
// Needed for Range bounds checks in calls to EvalContext.ContainsKey.
latchSpans.AddNonMVCC(spanset.SpanReadOnly, roachpb.Span{Key: keys.RangeDescriptorKey(rs.GetStartKey())})
latchSpans.DisableUndeclaredAccessAssertions()
}

// Create latches and merge adjacent.
Expand Down Expand Up @@ -171,9 +183,7 @@ func GC(
newThreshold := oldThreshold
updated := newThreshold.Forward(args.Threshold)

// Don't write the GC threshold key unless we have to. We also don't
// declare the key unless we have to (to allow the MVCC GC queue to
// batch requests more efficiently), and we must honor what we declare.
// Don't write the GC threshold key unless we have to.
if updated {
if err := MakeStateLoader(cArgs.EvalCtx).SetGCThreshold(
ctx, readWriter, cArgs.Stats, &newThreshold,
Expand Down
150 changes: 100 additions & 50 deletions pkg/kv/kvserver/replica_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8398,56 +8398,6 @@ func TestReplicaReproposalWithNewLeaseIndexError(t *testing.T) {
}
}

// TestGCWithoutThreshold validates that GCRequest only declares the threshold
// key if it is subject to change, and that it does not access this key if it
// does not declare them.
func TestGCWithoutThreshold(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)

ctx := context.Background()
tc := &testContext{}
stopper := stop.NewStopper()
defer stopper.Stop(ctx)
tc.Start(ctx, t, stopper)

for _, keyThresh := range []hlc.Timestamp{{}, {Logical: 1}} {
t.Run(fmt.Sprintf("thresh=%s", keyThresh), func(t *testing.T) {
var gc roachpb.GCRequest
var spans spanset.SpanSet

gc.Threshold = keyThresh
cmd, _ := batcheval.LookupCommand(roachpb.GC)
cmd.DeclareKeys(tc.repl.Desc(), &roachpb.Header{RangeID: tc.repl.RangeID}, &gc, &spans, nil, 0)

expSpans := 1
if !keyThresh.IsEmpty() {
expSpans++
}
if numSpans := spans.Len(); numSpans != expSpans {
t.Fatalf("expected %d declared keys, found %d", expSpans, numSpans)
}

eng := storage.NewDefaultInMemForTesting()
defer eng.Close()

batch := eng.NewBatch()
defer batch.Close()
rw := spanset.NewBatch(batch, &spans)

var resp roachpb.GCResponse
if _, err := batcheval.GC(ctx, rw, batcheval.CommandArgs{
Args: &gc,
EvalCtx: NewReplicaEvalContext(
ctx, tc.repl, &spans, false, /* requiresClosedTSOlderThanStorageSnap */
),
}, &resp); err != nil {
t.Fatal(err)
}
})
}
}

// Test that, if the Raft command resulting from EndTxn request fails to be
// processed/apply, then the LocalResult associated with that command is
// cleared.
Expand Down Expand Up @@ -8518,6 +8468,96 @@ func TestFailureToProcessCommandClearsLocalResult(t *testing.T) {
}
}

// TestMVCCStatsGCCommutesWithWrites tests that the MVCCStats updates
// corresponding to writes and GCs are commutative.
//
// This test does so by:
// 1. Initially writing N versions of a key.
// 2. Concurrently GC-ing the N-1 versions written in step 1 while writing N-1
// new versions of the key.
// 3. Checking that the MVCC stats after step 1 and step 2 are identical.
func TestMVCCStatsGCCommutesWithWrites(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)

ctx := context.Background()
tc := serverutils.StartNewTestCluster(t, 1, base.TestClusterArgs{})
defer tc.Stopper().Stop(ctx)
key := tc.ScratchRange(t)
desc := tc.LookupRangeOrFatal(t, key)
store, err := tc.Server(0).GetStores().(*Stores).GetStore(tc.Server(0).GetFirstStoreID())
require.NoError(t, err)
repl, err := store.GetReplica(desc.RangeID)
require.NoError(t, err)

write := func() hlc.Timestamp {
var ba roachpb.BatchRequest
put := putArgs(key, []byte("0"))
ba.Add(&put)
resp, pErr := store.TestSender().Send(ctx, ba)
require.Nil(t, pErr)
return resp.Timestamp
}

// Write `numVersions` versions for a key.
const numVersions = 100
writeTimestamps := make([]hlc.Timestamp, 0, numVersions)
for i := 0; i < numVersions; i++ {
writeTimestamps = append(writeTimestamps, write())
}
var statsBefore enginepb.MVCCStats
// NB: We use SucceedsSoon because MVCCStats are only updated on command
// application.
testutils.SucceedsSoon(t, func() error {
statsBefore = repl.GetMVCCStats()
if statsBefore.ValCount != numVersions {
return errors.Newf(
"expected val count to be equal to %d; found %d", numVersions, statsBefore.ValCount,
)
}
return nil
})

// Now, we GC the first `numVersions-1` versions we wrote above while
// concurrently writing `numVersions-1` new versions.
var wg sync.WaitGroup
wg.Add(2)
go func() {
defer wg.Done()
for _, ts := range writeTimestamps[:numVersions-1] {
gcReq := gcArgs(key, key.Next(), gcKey(key, ts))
_, pErr := kv.SendWrapped(ctx, store.TestSender(), &gcReq)
require.Nil(t, pErr)
}
}()
go func() {
defer wg.Done()
for i := 0; i < numVersions-1; i++ {
write()
}
}()
wg.Wait()

var statsAfter enginepb.MVCCStats
testutils.SucceedsSoon(t, func() error {
statsAfter = repl.GetMVCCStats()
if statsAfter.ValCount != numVersions {
return errors.Newf(
"expected val count to be equal to %d; found %d", numVersions, statsAfter.ValCount,
)
}
return nil
})

// Zero out the `LastUpdateNanos` and `GCBytesAge` fields and compare the
// stats before and after. They must be identical.
statsBefore.LastUpdateNanos = 0
statsBefore.GCBytesAge = 0
statsAfter.LastUpdateNanos = 0
statsAfter.GCBytesAge = 0
require.Equal(t, statsBefore, statsAfter, "MVCC stats before and after diverge")
}

// TestBatchTimestampBelowGCThreshold verifies that commands below the replica
// GC threshold fail.
func TestBatchTimestampBelowGCThreshold(t *testing.T) {
Expand Down Expand Up @@ -8784,6 +8824,16 @@ func TestGCThresholdRacesWithRead(t *testing.T) {
require.Nil(t, err)
require.Equal(t, va, b)

// Since the GC request does not acquire latches on the keys being GC'ed,
// they're not guaranteed to wait for these above Puts to get applied. So
// we separately ensure both these Puts have been applied by just trying
// to read the latest value @ ts2. These Get requests do indeed declare
// latches on the keys being read, so by the time they return, subsequent
// GC requests are guaranteed to see the latest keys.
gArgs = getArgs(key)
_, pErr = kv.SendWrappedWith(ctx, reader, h2, &gArgs)
require.Nil(t, pErr)

// Perform two actions concurrently:
// 1. GC up to ts2. This should remove the k@ts1 version.
// 2. Read @ ts1.
Expand Down
18 changes: 17 additions & 1 deletion pkg/kv/kvserver/spanset/spanset.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,8 @@ type Span struct {
// The Span slice for a particular access and scope contains non-overlapping
// spans in increasing key order after calls to SortAndDedup.
type SpanSet struct {
spans [NumSpanAccess][NumSpanScope][]Span
spans [NumSpanAccess][NumSpanScope][]Span
allowUndeclared bool
}

var spanSetPool = sync.Pool{
Expand Down Expand Up @@ -152,6 +153,7 @@ func (s *SpanSet) Copy() *SpanSet {
n.spans[sa][ss] = append(n.spans[sa][ss], s.spans[sa][ss]...)
}
}
n.allowUndeclared = s.allowUndeclared
return n
}

Expand Down Expand Up @@ -204,6 +206,7 @@ func (s *SpanSet) Merge(s2 *SpanSet) {
s.spans[sa][ss] = append(s.spans[sa][ss], s2.spans[sa][ss]...)
}
}
s.allowUndeclared = s2.allowUndeclared
s.SortAndDedup()
}

Expand Down Expand Up @@ -335,6 +338,12 @@ func (s *SpanSet) CheckAllowedAt(
func (s *SpanSet) checkAllowed(
access SpanAccess, span roachpb.Span, check func(SpanAccess, Span) bool,
) error {
if s.allowUndeclared {
// If the request has specified that undeclared spans are allowed, do
// nothing.
return nil
}

scope := SpanGlobal
if (span.Key != nil && keys.IsLocal(span.Key)) ||
(span.EndKey != nil && keys.IsLocal(span.EndKey)) {
Expand Down Expand Up @@ -387,3 +396,10 @@ func (s *SpanSet) Validate() error {

return nil
}

// DisableUndeclaredAccessAssertions disables the assertions that prevent
// undeclared access to spans. This is generally set by requests that rely on
// other forms of synchronization for correctness (e.g. GCRequest).
func (s *SpanSet) DisableUndeclaredAccessAssertions() {
s.allowUndeclared = true
}

0 comments on commit fb73944

Please sign in to comment.