From 50aac2dc2f1f151fb6dfa590add967a4270ebf8f Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Sun, 1 May 2022 16:08:37 +0000 Subject: [PATCH] batcheval: handle MVCC range tombstones in `ClearRange` This patch makes `ClearRange` account for MVCC range tombstones when updating MVCC stats. Release note: None --- pkg/kv/kvserver/batcheval/cmd_clear_range.go | 84 +++++- .../batcheval/cmd_clear_range_test.go | 279 +++++++++++------- 2 files changed, 248 insertions(+), 115 deletions(-) diff --git a/pkg/kv/kvserver/batcheval/cmd_clear_range.go b/pkg/kv/kvserver/batcheval/cmd_clear_range.go index 5d66f6f63741..9ecea57c93af 100644 --- a/pkg/kv/kvserver/batcheval/cmd_clear_range.go +++ b/pkg/kv/kvserver/batcheval/cmd_clear_range.go @@ -48,6 +48,17 @@ func declareKeysClearRange( // We look up the range descriptor key to check whether the span // is equal to the entire range for fast stats updating. latchSpans.AddNonMVCC(spanset.SpanReadOnly, roachpb.Span{Key: keys.RangeDescriptorKey(rs.GetStartKey())}) + + // We must peek beyond the span for MVCC range tombstones that straddle the + // span bounds, to update MVCC stats with their new bounds. But we make sure + // to stay within the range. + // + // NB: The range end key is not available, so this will pessimistically latch + // up to args.EndKey.Next(). If EndKey falls on the range end key, the span + // will be tightened during evaluation. + args := req.(*roachpb.ClearRangeRequest) + l, r := rangeTombstonePeekBounds(args.Key, args.EndKey, rs.GetStartKey().AsRawKey(), nil) + latchSpans.AddMVCC(spanset.SpanReadOnly, roachpb.Span{Key: l, EndKey: r}, header.Timestamp) } // ClearRange wipes all MVCC versions of keys covered by the specified @@ -144,8 +155,8 @@ func computeStatsDelta( // We can avoid manually computing the stats delta if we're clearing // the entire range. - fast := desc.StartKey.Equal(from) && desc.EndKey.Equal(to) - if fast { + entireRange := desc.StartKey.Equal(from) && desc.EndKey.Equal(to) + if entireRange { // Note this it is safe to use the full range MVCC stats, as // opposed to the usual method of computing only a localizied // stats delta, because a full-range clear prevents any concurrent @@ -155,11 +166,11 @@ func computeStatsDelta( delta.SysCount, delta.SysBytes, delta.AbortSpanBytes = 0, 0, 0 // no change to system stats } - // If we can't use the fast stats path, or race test is enabled, - // compute stats across the key span to be cleared. - // - // TODO(erikgrinaker): This must handle range key stats adjustments. - if !fast || util.RaceEnabled { + // If we can't use the fast stats path, or race test is enabled, compute stats + // across the key span to be cleared. In this case we must also look for MVCC + // range tombstones that straddle the span bounds, since we must adjust the + // stats for their new key bounds. + if !entireRange || util.RaceEnabled { iter := readWriter.NewMVCCIterator(storage.MVCCKeyAndIntentsIterKind, storage.IterOptions{ KeyTypes: storage.IterKeyTypePointsAndRanges, LowerBound: from, @@ -171,7 +182,7 @@ func computeStatsDelta( return enginepb.MVCCStats{}, err } // If we took the fast path but race is enabled, assert stats were correctly computed. 
- if fast { + if entireRange { computed.ContainsEstimates = delta.ContainsEstimates // retained for tests under race if !delta.Equal(computed) { log.Fatalf(ctx, "fast-path MVCCStats computation gave wrong result: diff(fast, computed) = %s", @@ -179,6 +190,63 @@ func computeStatsDelta( } } delta = computed + + // If we're not clearing the whole range, we need to adjust for any MVCC + // range tombstones that straddle the span bounds. These will now be + // truncated, or possibly split into two. We take care not to peek outside + // the range bounds. + // + // Conveniently, due to the symmetry of the range keys and their start/end + // bounds around the truncation point, this is equivalent to twice what was + // removed at each bound. This applies both in the truncation and + // split-in-two cases, again due to symmetry. + // + // TODO(erikgrinaker): Consolidate this logic with the corresponding logic + // during range splits/merges and MVCC range tombstone writes. + if !entireRange { + leftPeekBound, rightPeekBound := rangeTombstonePeekBounds( + from, to, desc.StartKey.AsRawKey(), desc.EndKey.AsRawKey()) + iter = readWriter.NewMVCCIterator(storage.MVCCKeyIterKind, storage.IterOptions{ + KeyTypes: storage.IterKeyTypeRangesOnly, + LowerBound: leftPeekBound, + UpperBound: rightPeekBound, + }) + defer iter.Close() + + addTruncatedRangeKeyStats := func(bound roachpb.Key) error { + iter.SeekGE(storage.MVCCKey{Key: bound}) + if ok, err := iter.Valid(); err != nil { + return err + } else if ok && iter.RangeBounds().Key.Compare(bound) < 0 { + for i, rkv := range iter.RangeKeys() { + keyBytes := int64(storage.EncodedMVCCTimestampSuffixLength(rkv.RangeKey.Timestamp)) + valBytes := int64(len(rkv.Value)) + if i == 0 { + delta.RangeKeyCount-- + keyBytes += 2 * int64(storage.EncodedMVCCKeyPrefixLength(bound)) + } + delta.RangeKeyBytes -= keyBytes + delta.RangeValCount-- + delta.RangeValBytes -= valBytes + delta.GCBytesAge -= (keyBytes + valBytes) * + (delta.LastUpdateNanos/1e9 - rkv.RangeKey.Timestamp.WallTime/1e9) + } + } + return nil + } + + if !leftPeekBound.Equal(from) { + if err := addTruncatedRangeKeyStats(from); err != nil { + return enginepb.MVCCStats{}, err + } + } + + if !rightPeekBound.Equal(to) { + if err := addTruncatedRangeKeyStats(to); err != nil { + return enginepb.MVCCStats{}, err + } + } + } } return delta, nil diff --git a/pkg/kv/kvserver/batcheval/cmd_clear_range_test.go b/pkg/kv/kvserver/batcheval/cmd_clear_range_test.go index a4caf52b195f..9c35dea41c52 100644 --- a/pkg/kv/kvserver/batcheval/cmd_clear_range_test.go +++ b/pkg/kv/kvserver/batcheval/cmd_clear_range_test.go @@ -17,6 +17,8 @@ import ( "testing" "time" + "github.com/cockroachdb/cockroach/pkg/keys" + "github.com/cockroachdb/cockroach/pkg/kv/kvserver/spanset" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/storage" @@ -25,7 +27,6 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/cockroachdb/cockroach/pkg/util/log" - "github.com/cockroachdb/errors" "github.com/stretchr/testify/require" ) @@ -45,129 +46,193 @@ func (wb *wrappedBatch) ClearMVCCRange(start, end roachpb.Key) error { return wb.Batch.ClearMVCCRange(start, end) } -// TestCmdClearRangeBytesThreshold verifies that clear range resorts to -// clearing keys individually if under the bytes threshold and issues a -// clear range command to the batch otherwise. 
-func TestCmdClearRangeBytesThreshold(t *testing.T) { +// TestCmdClearRange verifies that ClearRange clears point and range keys in the +// given span, and that MVCC stats are updated correctly (both when clearing a +// complete range and just parts of it). It should clear keys using an iterator +// if under the bytes threshold, or using a Pebble range tombstone otherwise. +func TestCmdClearRange(t *testing.T) { defer leaktest.AfterTest(t)() defer log.Scope(t).Close(t) - startKey := roachpb.Key("0000") + nowNanos := int64(100e9) + startKey := roachpb.Key("000") // NB: not 0000, different bound lengths for MVCC stats testing endKey := roachpb.Key("9999") - desc := roachpb.RangeDescriptor{ - RangeID: 99, - StartKey: roachpb.RKey(startKey), - EndKey: roachpb.RKey(endKey), - } valueStr := strings.Repeat("0123456789", 1024) var value roachpb.Value value.SetString(valueStr) // 10KiB + halfFull := ClearRangeBytesThreshold / (2 * len(valueStr)) overFull := ClearRangeBytesThreshold/len(valueStr) + 1 - tests := []struct { - keyCount int - estimatedStats bool - expClearIterCount int - expClearRangeCount int + testcases := map[string]struct { + keyCount int + estimatedStats bool + partialRange bool + expClearIter bool }{ - { - keyCount: 1, - expClearIterCount: 1, - expClearRangeCount: 0, + "single key": { + keyCount: 1, + expClearIter: true, + }, + "below threshold": { + keyCount: halfFull, + expClearIter: true, }, - // More than a single key, but not enough to use ClearRange. - { - keyCount: halfFull, - expClearIterCount: 1, - expClearRangeCount: 0, + "below threshold partial range": { + keyCount: halfFull, + partialRange: true, + expClearIter: true, }, - // With key sizes requiring additional space, this will overshoot. - { - keyCount: overFull, - expClearIterCount: 0, - expClearRangeCount: 1, + "above threshold": { + keyCount: overFull, + expClearIter: false, }, - // Estimated stats always use ClearRange. - { - keyCount: 1, - estimatedStats: true, - expClearIterCount: 0, - expClearRangeCount: 1, + "above threshold partial range": { + keyCount: overFull, + partialRange: true, + expClearIter: false, + }, + "estimated stats": { // must not use iterator, since we can't trust stats + keyCount: 1, + estimatedStats: true, + expClearIter: false, + }, + "estimated stats and partial range": { // stats get computed for partial ranges + keyCount: 1, + estimatedStats: true, + partialRange: true, + expClearIter: true, }, } - for _, test := range tests { - t.Run("", func(t *testing.T) { - ctx := context.Background() - eng := storage.NewDefaultInMemForTesting() - defer eng.Close() - - var stats enginepb.MVCCStats - for i := 0; i < test.keyCount; i++ { - key := roachpb.Key(fmt.Sprintf("%04d", i)) - if err := storage.MVCCPut(ctx, eng, &stats, key, hlc.Timestamp{WallTime: int64(i % 2)}, hlc.ClockTimestamp{}, value, nil); err != nil { - t.Fatal(err) + for name, tc := range testcases { + t.Run(name, func(t *testing.T) { + testutils.RunTrueAndFalse(t, "spanningRangeTombstones", func(t *testing.T, spanningRangeTombstones bool) { + ctx := context.Background() + eng := storage.NewDefaultInMemForTesting() + defer eng.Close() + + // Set up range descriptor. If partialRange is true, we make the range + // wider than the cleared span, which disabled the MVCC stats fast path. 
+ desc := roachpb.RangeDescriptor{ + RangeID: 99, + StartKey: roachpb.RKey(startKey), + EndKey: roachpb.RKey(endKey), + } + if tc.partialRange { + desc.StartKey = roachpb.RKey(keys.LocalMax) + desc.EndKey = roachpb.RKey(keys.MaxKey) + } + + // Write some range tombstones at the bottom of the keyspace, some of + // which straddle the clear span bounds. In particular, we need to + // ensure MVCC stats are updated correctly for range tombstones that + // get truncated by the ClearRange. + // + // If spanningRangeTombstone is true, we write very wide range + // tombstones that engulf the entire cleared span. Otherwise, we write + // additional range tombstones that span the start/end bounds as well as + // some in the middle -- these will fragment the very wide range + // tombstones, which is why we need to test both cases separately. + rangeTombstones := []storage.MVCCRangeKey{ + {StartKey: roachpb.Key("0"), EndKey: roachpb.Key("a"), Timestamp: hlc.Timestamp{WallTime: 1e9}}, + {StartKey: roachpb.Key("0"), EndKey: roachpb.Key("a"), Timestamp: hlc.Timestamp{WallTime: 2e9}}, + } + if !spanningRangeTombstones { + rangeTombstones = append(rangeTombstones, []storage.MVCCRangeKey{ + {StartKey: roachpb.Key("00"), EndKey: roachpb.Key("111"), Timestamp: hlc.Timestamp{WallTime: 3e9}}, + {StartKey: roachpb.Key("2"), EndKey: roachpb.Key("4"), Timestamp: hlc.Timestamp{WallTime: 3e9}}, + {StartKey: roachpb.Key("6"), EndKey: roachpb.Key("8"), Timestamp: hlc.Timestamp{WallTime: 3e9}}, + {StartKey: roachpb.Key("999"), EndKey: roachpb.Key("aa"), Timestamp: hlc.Timestamp{WallTime: 3e9}}, + }...) + } + for _, rk := range rangeTombstones { + localTS := hlc.ClockTimestamp{WallTime: rk.Timestamp.WallTime - 1e9} // give range key a value if > 0 + require.NoError(t, storage.ExperimentalMVCCDeleteRangeUsingTombstone( + ctx, eng, nil, rk.StartKey, rk.EndKey, rk.Timestamp, localTS, nil, nil, 0)) + } + + // Write some random point keys within the cleared span, above the range tombstones. + for i := 0; i < tc.keyCount; i++ { + key := roachpb.Key(fmt.Sprintf("%04d", i)) + require.NoError(t, storage.MVCCPut(ctx, eng, nil, key, + hlc.Timestamp{WallTime: int64(4+i%2) * 1e9}, hlc.ClockTimestamp{}, value, nil)) + } + + // Calculate the range stats. + stats := computeStats(t, eng, desc.StartKey.AsRawKey(), desc.EndKey.AsRawKey(), nowNanos) + if tc.estimatedStats { + stats.ContainsEstimates++ + } + + // Set up the evaluation context. + cArgs := CommandArgs{ + EvalCtx: (&MockEvalCtx{ + ClusterSettings: cluster.MakeTestingClusterSettings(), + Desc: &desc, + Clock: hlc.NewClockWithSystemTimeSource(time.Nanosecond), + Stats: stats, + }).EvalContext(), + Header: roachpb.Header{ + RangeID: desc.RangeID, + Timestamp: hlc.Timestamp{WallTime: nowNanos}, + }, + Args: &roachpb.ClearRangeRequest{ + RequestHeader: roachpb.RequestHeader{ + Key: startKey, + EndKey: endKey, + }, + }, + Stats: &enginepb.MVCCStats{}, + } + + // Use a spanset batch to assert latching of all accesses. In + // particular, to test the additional seeks necessary to peek for + // adjacent range keys that we may truncate (for stats purposes) which + // should not cross the range bounds. + var latchSpans, lockSpans spanset.SpanSet + declareKeysClearRange(&desc, &cArgs.Header, cArgs.Args, &latchSpans, &lockSpans, 0) + batch := &wrappedBatch{Batch: spanset.NewBatchAt(eng.NewBatch(), &latchSpans, cArgs.Header.Timestamp)} + defer batch.Close() + + // Run the request. 
+ result, err := ClearRange(ctx, batch, cArgs, &roachpb.ClearRangeResponse{}) + require.NoError(t, err) + require.NotNil(t, result.Replicated.MVCCHistoryMutation) + require.Equal(t, result.Replicated.MVCCHistoryMutation.Spans, []roachpb.Span{{Key: startKey, EndKey: endKey}}) + + require.NoError(t, batch.Commit(true /* sync */)) + + // Verify that we see the correct counts for ClearMVCCIteratorRange and ClearMVCCRange. + require.Equal(t, tc.expClearIter, batch.clearIterCount == 1) + require.Equal(t, tc.expClearIter, batch.clearRangeCount == 0) + + // Ensure that the data is gone. + iter := eng.NewMVCCIterator(storage.MVCCKeyAndIntentsIterKind, storage.IterOptions{ + KeyTypes: storage.IterKeyTypePointsAndRanges, + LowerBound: startKey, + UpperBound: endKey, + }) + defer iter.Close() + iter.SeekGE(storage.MVCCKey{Key: keys.LocalMax}) + ok, err := iter.Valid() + require.NoError(t, err) + require.False(t, ok, "expected empty span, found key %s", iter.UnsafeKey()) + + // Verify the stats delta by adding it to the original range stats and + // comparing with the computed range stats. If we're clearing the entire + // range then the new stats should be empty. + newStats := stats + newStats.ContainsEstimates, cArgs.Stats.ContainsEstimates = 0, 0 + newStats.SysBytes, cArgs.Stats.SysBytes = 0, 0 + newStats.SysCount, cArgs.Stats.SysCount = 0, 0 + newStats.AbortSpanBytes, cArgs.Stats.AbortSpanBytes = 0, 0 + newStats.Add(*cArgs.Stats) + require.Equal(t, newStats, computeStats(t, eng, desc.StartKey.AsRawKey(), desc.EndKey.AsRawKey(), nowNanos)) + if !tc.partialRange { + newStats.LastUpdateNanos = 0 + require.Empty(t, newStats) } - } - if test.estimatedStats { - stats.ContainsEstimates++ - } - - batch := &wrappedBatch{Batch: eng.NewBatch()} - defer batch.Close() - - var h roachpb.Header - h.RangeID = desc.RangeID - - cArgs := CommandArgs{Header: h} - cArgs.EvalCtx = (&MockEvalCtx{ - ClusterSettings: cluster.MakeTestingClusterSettings(), - Desc: &desc, - Clock: hlc.NewClockWithSystemTimeSource(time.Nanosecond /* maxOffset */), - Stats: stats, - }).EvalContext() - cArgs.Args = &roachpb.ClearRangeRequest{ - RequestHeader: roachpb.RequestHeader{ - Key: startKey, - EndKey: endKey, - }, - } - cArgs.Stats = &enginepb.MVCCStats{} - - result, err := ClearRange(ctx, batch, cArgs, &roachpb.ClearRangeResponse{}) - require.NoError(t, err) - require.NotNil(t, result.Replicated.MVCCHistoryMutation) - require.Equal(t, result.Replicated.MVCCHistoryMutation.Spans, []roachpb.Span{{Key: startKey, EndKey: endKey}}) - - // Verify cArgs.Stats is equal to the stats we wrote, ignoring some values. - newStats := stats - newStats.ContainsEstimates, cArgs.Stats.ContainsEstimates = 0, 0 - newStats.SysBytes, cArgs.Stats.SysBytes = 0, 0 - newStats.SysCount, cArgs.Stats.SysCount = 0, 0 - newStats.AbortSpanBytes, cArgs.Stats.AbortSpanBytes = 0, 0 - newStats.Add(*cArgs.Stats) - newStats.AgeTo(0) // pin at LastUpdateNanos==0 - if !newStats.Equal(enginepb.MVCCStats{}) { - t.Errorf("expected stats on original writes to be negated on clear range: %+v vs %+v", stats, *cArgs.Stats) - } - - // Verify we see the correct counts for Clear and ClearRange. - if a, e := batch.clearIterCount, test.expClearIterCount; a != e { - t.Errorf("expected %d iter range clears; got %d", e, a) - } - if a, e := batch.clearRangeCount, test.expClearRangeCount; a != e { - t.Errorf("expected %d clear ranges; got %d", e, a) - } - - // Now ensure that the data is gone, whether it was a ClearRange or individual calls to clear. 
- if err := batch.Commit(true /* commit */); err != nil { - t.Fatal(err) - } - if err := eng.MVCCIterate(startKey, endKey, storage.MVCCKeyAndIntentsIterKind, func(kv storage.MVCCKeyValue) error { - return errors.New("expected no data in underlying engine") - }); err != nil { - t.Fatal(err) - } + }) }) } }
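
Illustration (not part of the patch): the peek bounds used in declareKeysClearRange and computeStatsDelta widen the cleared span just enough for an iterator to observe MVCC range tombstones that straddle its bounds, while clamping to the range bounds so latches and reads never leave the range. The standalone sketch below shows the idea only; peekBounds and prevish are hypothetical names, and the real helper (rangeTombstonePeekBounds) clamps an approximate predecessor of the start key and the successor of the end key in a broadly similar way.

package main

import (
	"bytes"
	"fmt"
)

// peekBounds is a hypothetical, simplified stand-in for the patch's
// rangeTombstonePeekBounds helper: widen [start, end) so that an iterator can
// see MVCC range tombstones straddling the span bounds, while never stepping
// outside the range bounds. A nil rangeEnd means the range end is unknown
// (as in declareKeysClearRange, where it is not available).
func peekBounds(start, end, rangeStart, rangeEnd []byte) (left, right []byte) {
	left = prevish(start)
	if bytes.Compare(left, rangeStart) < 0 {
		left = rangeStart
	}
	right = append(append([]byte{}, end...), 0) // successor of end: append a zero byte
	if rangeEnd != nil && bytes.Compare(right, rangeEnd) > 0 {
		right = rangeEnd
	}
	return left, right
}

// prevish returns some key that sorts at or before the predecessor of key.
// This is a crude approximation for illustration; it only needs to be "close
// enough", since the result is clamped to the range start anyway.
func prevish(key []byte) []byte {
	if len(key) == 0 {
		return key
	}
	return key[:len(key)-1] // a strict prefix sorts before the key itself
}

func main() {
	l, r := peekBounds([]byte("0000"), []byte("9999"), []byte("000"), []byte("a"))
	fmt.Printf("peek span: [%q, %q)\n", l, r) // ["000", "9999\x00")
}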
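
Illustration (not part of the patch): to make the "twice what was removed at each bound" comment in computeStatsDelta concrete, the following standalone sketch works through the RangeKeyBytes bookkeeping for a range tombstone truncated at the left bound of the cleared span, under an assumed simplified cost model. The same correction applies symmetrically at the right bound and in the split-in-two case, and per version for the value bytes and GC age.

package main

import "fmt"

// Simplified cost model (assumptions, for illustration only): an encoded MVCC
// key prefix costs len(key)+1 bytes and a timestamp suffix costs 9 bytes. The
// real values come from storage.EncodedMVCCKeyPrefixLength and
// storage.EncodedMVCCTimestampSuffixLength.
func prefixLen(key string) int64 { return int64(len(key)) + 1 }

const tsSuffixLen = int64(9)

// rangeKeyBytes is the RangeKeyBytes contribution of a single-version MVCC
// range key [start, end): both bound keys plus one timestamp suffix.
func rangeKeyBytes(start, end string) int64 {
	return prefixLen(start) + prefixLen(end) + tsSuffixLen
}

func main() {
	// A range tombstone [a, t) straddles the left bound `from` of the cleared
	// span [from, to), i.e. a < from < t <= to.
	a, from, t := "0", "5", "aaaa"

	before := rangeKeyBytes(a, t)   // whole range key before the clear
	after := rangeKeyBytes(a, from) // surviving fragment [a, from)
	trueDelta := before - after     // what the stats delta should remove

	// The stats iterator over [from, to) only sees the fragment [from, t) and
	// counts its full cost, overstating the removal.
	computed := rangeKeyBytes(from, t)

	// The correction subtracted at the bound: twice the bound key's prefix
	// plus one timestamp suffix, because the surviving fragment keeps a copy
	// of the bound key as its new end key and keeps the version's timestamp
	// (its value is retained too, handled analogously via RangeValBytes and
	// RangeValCount).
	correction := 2*prefixLen(from) + tsSuffixLen

	fmt.Println(trueDelta == computed-correction) // true
}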