diff --git a/pkg/keys/constants.go b/pkg/keys/constants.go
index 4390db95ef57..caee9bcdec3b 100644
--- a/pkg/keys/constants.go
+++ b/pkg/keys/constants.go
@@ -133,6 +133,10 @@ var (
 	// LocalRangeLastReplicaGCTimestampSuffix is the suffix for a range's last
 	// replica GC timestamp (for GC of old replicas).
 	LocalRangeLastReplicaGCTimestampSuffix = []byte("rlrt")
+	// LocalRangeMVCCRangeKeyGCLockSuffix is the suffix for a lock obtained
+	// by range tombstone operations to ensure they don't overlap with
+	// GC requests while allowing point traffic to go through unobstructed.
+	LocalRangeMVCCRangeKeyGCLockSuffix = []byte("rltu")
 	// localRangeLastVerificationTimestampSuffix is DEPRECATED and remains to
 	// prevent reuse.
 	localRangeLastVerificationTimestampSuffix = []byte("rlvt")
diff --git a/pkg/keys/keys.go b/pkg/keys/keys.go
index 1d39b9c0e700..57d4a132d12a 100644
--- a/pkg/keys/keys.go
+++ b/pkg/keys/keys.go
@@ -279,6 +279,12 @@ func RangeGCThresholdKey(rangeID roachpb.RangeID) roachpb.Key {
 	return MakeRangeIDPrefixBuf(rangeID).RangeGCThresholdKey()
 }
 
+// MVCCRangeKeyGCKey returns a range-local key protecting range
+// tombstone MVCC stats calculations during range tombstone GC.
+func MVCCRangeKeyGCKey(rangeID roachpb.RangeID) roachpb.Key {
+	return MakeRangeIDPrefixBuf(rangeID).MVCCRangeKeyGCKey()
+}
+
 // RangeVersionKey returns a system-local for the range version.
 func RangeVersionKey(rangeID roachpb.RangeID) roachpb.Key {
 	return MakeRangeIDPrefixBuf(rangeID).RangeVersionKey()
@@ -1055,3 +1061,9 @@ func (b RangeIDPrefixBuf) RaftReplicaIDKey() roachpb.Key {
 func (b RangeIDPrefixBuf) RangeLastReplicaGCTimestampKey() roachpb.Key {
 	return append(b.unreplicatedPrefix(), LocalRangeLastReplicaGCTimestampSuffix...)
 }
+
+// MVCCRangeKeyGCKey returns a range-local key protecting range
+// tombstone MVCC stats calculations during range tombstone GC.
+func (b RangeIDPrefixBuf) MVCCRangeKeyGCKey() roachpb.Key {
+	return append(b.unreplicatedPrefix(), LocalRangeMVCCRangeKeyGCLockSuffix...)
+}
diff --git a/pkg/kv/kvserver/batcheval/cmd_add_sstable.go b/pkg/kv/kvserver/batcheval/cmd_add_sstable.go
index 4a794e33b21a..4f9f1a246cf1 100644
--- a/pkg/kv/kvserver/batcheval/cmd_add_sstable.go
+++ b/pkg/kv/kvserver/batcheval/cmd_add_sstable.go
@@ -57,8 +57,17 @@ func declareKeysAddSSTable(
 		// NB: The range end key is not available, so this will pessimistically
 		// latch up to args.EndKey.Next(). If EndKey falls on the range end key, the
 		// span will be tightened during evaluation.
+		// Even if we obtain latches beyond the range end here, it won't cause
+		// contention with the subsequent range because latches are enforced per
+		// range.
 		l, r := rangeTombstonePeekBounds(args.Key, args.EndKey, rs.GetStartKey().AsRawKey(), nil)
 		latchSpans.AddMVCC(spanset.SpanReadOnly, roachpb.Span{Key: l, EndKey: r}, header.Timestamp)
+
+		// Obtain a read-only latch on the range key GC key to serialize with
+		// range tombstone GC requests.
+		latchSpans.AddNonMVCC(spanset.SpanReadOnly, roachpb.Span{
+			Key: keys.MVCCRangeKeyGCKey(rs.GetRangeID()),
+		})
 	}
 }
 
diff --git a/pkg/kv/kvserver/batcheval/cmd_clear_range.go b/pkg/kv/kvserver/batcheval/cmd_clear_range.go
index 88ca15ed92a8..11b06f026c94 100644
--- a/pkg/kv/kvserver/batcheval/cmd_clear_range.go
+++ b/pkg/kv/kvserver/batcheval/cmd_clear_range.go
@@ -56,9 +56,18 @@ func declareKeysClearRange(
 	// NB: The range end key is not available, so this will pessimistically latch
 	// up to args.EndKey.Next(). If EndKey falls on the range end key, the span
 	// will be tightened during evaluation.
+	// Even if we obtain latches beyond the range end here, it won't cause
+	// contention with the subsequent range because latches are enforced per
+	// range.
 	args := req.(*roachpb.ClearRangeRequest)
 	l, r := rangeTombstonePeekBounds(args.Key, args.EndKey, rs.GetStartKey().AsRawKey(), nil)
 	latchSpans.AddMVCC(spanset.SpanReadOnly, roachpb.Span{Key: l, EndKey: r}, header.Timestamp)
+
+	// Obtain a read-only latch on the range key GC key to serialize with
+	// range tombstone GC requests.
+	latchSpans.AddNonMVCC(spanset.SpanReadOnly, roachpb.Span{
+		Key: keys.MVCCRangeKeyGCKey(rs.GetRangeID()),
+	})
 }
 
 // ClearRange wipes all MVCC versions of keys covered by the specified
diff --git a/pkg/kv/kvserver/batcheval/cmd_delete_range.go b/pkg/kv/kvserver/batcheval/cmd_delete_range.go
index 54acdf8a0744..8369732305d6 100644
--- a/pkg/kv/kvserver/batcheval/cmd_delete_range.go
+++ b/pkg/kv/kvserver/batcheval/cmd_delete_range.go
@@ -49,6 +49,9 @@ func declareKeysDeleteRange(
 		// NB: The range end key is not available, so this will pessimistically
 		// latch up to args.EndKey.Next(). If EndKey falls on the range end key, the
 		// span will be tightened during evaluation.
+		// Even if we obtain latches beyond the range end here, it won't cause
+		// contention with the subsequent range because latches are enforced per
+		// range.
 		l, r := rangeTombstonePeekBounds(args.Key, args.EndKey, rs.GetStartKey().AsRawKey(), nil)
 		latchSpans.AddMVCC(spanset.SpanReadOnly, roachpb.Span{Key: l, EndKey: r}, header.Timestamp)
 
@@ -56,6 +59,12 @@ func declareKeysDeleteRange(
 		latchSpans.AddNonMVCC(spanset.SpanReadOnly, roachpb.Span{
 			Key: keys.RangeDescriptorKey(rs.GetStartKey()),
 		})
+
+		// Obtain a read-only latch on the range key GC key to serialize with
+		// range tombstone GC requests.
+		latchSpans.AddNonMVCC(spanset.SpanReadOnly, roachpb.Span{
+			Key: keys.MVCCRangeKeyGCKey(rs.GetRangeID()),
+		})
 	}
 }
 
diff --git a/pkg/kv/kvserver/batcheval/cmd_end_transaction.go b/pkg/kv/kvserver/batcheval/cmd_end_transaction.go
index 3d68657c5f24..e664a1b827b1 100644
--- a/pkg/kv/kvserver/batcheval/cmd_end_transaction.go
+++ b/pkg/kv/kvserver/batcheval/cmd_end_transaction.go
@@ -151,6 +151,12 @@ func declareKeysEndTxn(
 				Key:    abortspan.MinKey(rs.GetRangeID()),
 				EndKey: abortspan.MaxKey(rs.GetRangeID()),
 			})
+
+			// Protect range tombstones from collection by GC to avoid interference
+			// with MVCCStats calculation.
+			latchSpans.AddNonMVCC(spanset.SpanReadOnly, roachpb.Span{
+				Key: keys.MVCCRangeKeyGCKey(rs.GetRangeID()),
+			})
 		}
 		if mt := et.InternalCommitTrigger.MergeTrigger; mt != nil {
 			// Merges copy over the RHS abort span to the LHS, and compute
@@ -183,6 +189,11 @@ func declareKeysEndTxn(
 				Key:    leftPeekBound,
 				EndKey: rightPeekBound,
 			})
+			// Protect range tombstones from collection by GC to avoid interference
+			// with MVCCStats calculation.
+			latchSpans.AddNonMVCC(spanset.SpanReadOnly, roachpb.Span{
+				Key: keys.MVCCRangeKeyGCKey(mt.LeftDesc.RangeID),
+			})
 		}
 	}
 }
diff --git a/pkg/kv/kvserver/batcheval/cmd_gc.go b/pkg/kv/kvserver/batcheval/cmd_gc.go
index b148481d6d3b..4bb69417e3b2 100644
--- a/pkg/kv/kvserver/batcheval/cmd_gc.go
+++ b/pkg/kv/kvserver/batcheval/cmd_gc.go
@@ -37,22 +37,25 @@ func declareKeysGC(
 	_ time.Duration,
 ) {
 	gcr := req.(*roachpb.GCRequest)
-	// When GC-ing MVCC range key tombstones or individual range keys, we need to
-	// serialize with all writes that overlap the MVCC range tombstone, as well as
-	// the immediate left/right neighboring keys. This is because a range key
-	// write has non-local effects, i.e. it can fragment or merge other range keys
-	// at other timestamps and at its boundaries, and this has a non-commutative
-	// effect on MVCC stats -- if someone writes a new range key while we're GCing
-	// one below, the stats would come out wrong.
-	// Note that we only need to serialize with writers (including other GC
-	// processes) and not with readers (that are guaranteed to be above the GC
-	// threshold). To achieve this, we declare read-write access at
-	// hlc.MaxTimestamp which will not block any readers.
-	for _, span := range mergeAdjacentSpans(makeLookupBoundariesForGCRanges(
-		rs.GetStartKey().AsRawKey(), nil, gcr.RangeKeys,
-	)) {
-		latchSpans.AddMVCC(spanset.SpanReadWrite, span, hlc.MaxTimestamp)
+	if gcr.RangeKeys != nil {
+		// When GC-ing MVCC range key tombstones, we need to serialize with
+		// range key writes that overlap the MVCC range tombstone, as well as
+		// the immediate left/right neighboring keys. This is because a range
+		// key write has non-local effects, i.e. it can fragment or merge other
+		// range keys at other timestamps and at its boundaries, and this has
+		// a non-commutative effect on MVCC stats -- if someone writes a new
+		// range key while we're GCing one below, the stats would come out wrong.
+		//
+		// To achieve this, we use a virtual latch key that will prevent any
+		// range key write operations until the GC request is done. All other
+		// range tombstone write operations obtain a read latch on this key and
+		// can run concurrently.
+		latchSpans.AddNonMVCC(spanset.SpanReadWrite, roachpb.Span{
+			Key: keys.MVCCRangeKeyGCKey(rs.GetRangeID()),
+		})
 	}
+	// For a ClearRangeKey request we still obtain a wide write latch as we
+	// don't expect any other operations to be running on the range.
 	if rk := gcr.ClearRangeKey; rk != nil {
 		latchSpans.AddMVCC(spanset.SpanReadWrite, roachpb.Span{Key: rk.StartKey, EndKey: rk.EndKey},
 			hlc.MaxTimestamp)
diff --git a/pkg/kv/kvserver/batcheval/cmd_revert_range.go b/pkg/kv/kvserver/batcheval/cmd_revert_range.go
index bbf862dc5cd0..0f8f52ccb7a6 100644
--- a/pkg/kv/kvserver/batcheval/cmd_revert_range.go
+++ b/pkg/kv/kvserver/batcheval/cmd_revert_range.go
@@ -54,8 +54,17 @@ func declareKeysRevertRange(
 	// NB: The range end key is not available, so this will pessimistically
 	// latch up to args.EndKey.Next(). If EndKey falls on the range end key, the
 	// span will be tightened during evaluation.
+	// Even if we obtain latches beyond the range end here, it won't cause
+	// contention with the subsequent range because latches are enforced per
+	// range.
 	l, r := rangeTombstonePeekBounds(args.Key, args.EndKey, rs.GetStartKey().AsRawKey(), nil)
 	latchSpans.AddMVCC(spanset.SpanReadOnly, roachpb.Span{Key: l, EndKey: r}, header.Timestamp)
+
+	// Obtain a read-only latch on the range key GC key to serialize with
+	// range tombstone GC requests.
+	latchSpans.AddNonMVCC(spanset.SpanReadOnly, roachpb.Span{
+		Key: keys.MVCCRangeKeyGCKey(rs.GetRangeID()),
+	})
 }
 
 // isEmptyKeyTimeRange checks if the span has no writes in (since,until].
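
Note on the latching scheme above (explanatory, not part of the patch): GC of MVCC range tombstones declares an exclusive (write) non-MVCC latch on the per-range key keys.MVCCRangeKeyGCKey(rangeID), while range tombstone writers (DeleteRange with a range tombstone, ClearRange, RevertRange, AddSSTable, and the EndTxn split/merge triggers) declare a shared (read) latch on the same key. Below is a minimal, self-contained Go sketch of that serialization, using sync.RWMutex as a stand-in for the latch manager; the names rangeKeyGCLock, runRangeTombstoneWrite, and runRangeKeyGC are illustrative only and are not CockroachDB APIs.

// Sketch only: models the intended serialization with a per-range RWMutex.
package main

import (
	"fmt"
	"sync"
)

// rangeKeyGCLock stands in for the non-MVCC latch declared on
// keys.MVCCRangeKeyGCKey(rangeID).
type rangeKeyGCLock struct {
	mu sync.RWMutex
}

// runRangeTombstoneWrite models DeleteRange/ClearRange/RevertRange/AddSSTable:
// they declare a *read* (shared) latch on the GC key, so they run concurrently
// with each other but not with range-key GC.
func (l *rangeKeyGCLock) runRangeTombstoneWrite(name string, wg *sync.WaitGroup) {
	defer wg.Done()
	l.mu.RLock()
	defer l.mu.RUnlock()
	fmt.Println(name, "writing range tombstone")
}

// runRangeKeyGC models a GCRequest with RangeKeys set: it declares a *write*
// (exclusive) latch on the GC key, excluding all range tombstone writers
// while it runs.
func (l *rangeKeyGCLock) runRangeKeyGC(wg *sync.WaitGroup) {
	defer wg.Done()
	l.mu.Lock()
	defer l.mu.Unlock()
	fmt.Println("GC collecting range tombstones")
}

func main() {
	var l rangeKeyGCLock
	var wg sync.WaitGroup
	wg.Add(3)
	go l.runRangeTombstoneWrite("DeleteRange", &wg)
	go l.runRangeTombstoneWrite("AddSSTable", &wg)
	go l.runRangeKeyGC(&wg)
	wg.Wait()
}

Because writers only take shared access on this key, range tombstone traffic does not contend with other range tombstone writers, and point reads and writes never touch the key at all; only the comparatively rare range-key GC takes exclusive access and briefly blocks range tombstone writers on that range.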