From d15851a0fb0bd83b1872e49718e6b81e2e762e2b Mon Sep 17 00:00:00 2001
From: Oleg Afanasyev
Date: Mon, 22 Aug 2022 20:55:53 +0100
Subject: [PATCH] batcheval: add latch key protecting range key stats update

Previously, GC needed to obtain a latch at the maximum timestamp to
ensure that range tombstones were not modified during GC. This caused
all writers to get stuck in the latch queue while GC was validating
the request and removing range tombstones.

This commit adds a dedicated latch key,
LocalRangeMVCCRangeKeyGCLockSuffix, to address the problem. All range
tombstone writers obtain a read latch on this key on top of the write
latches for the key ranges they intend to update. GC, on the other
hand, obtains a write latch on that key. This approach allows point
writers to proceed during GC, but blocks new range tombstones from
being written. Non-conflicting range tombstone writes can still
proceed concurrently with each other since their write latch spans
don't overlap.

Release justification: this is a safe change, as range tombstone
behaviour is not enabled yet, and it is needed to address potential
performance regressions.

Release note: None
---
 pkg/keys/constants.go                         |  4 +++
 pkg/keys/keys.go                              | 12 +++++++
 pkg/kv/kvserver/batcheval/cmd_add_sstable.go  |  9 +++++
 pkg/kv/kvserver/batcheval/cmd_clear_range.go  |  9 +++++
 pkg/kv/kvserver/batcheval/cmd_delete_range.go |  9 +++++
 .../kvserver/batcheval/cmd_end_transaction.go | 11 +++++++
 pkg/kv/kvserver/batcheval/cmd_gc.go           | 33 ++++++++++---------
 pkg/kv/kvserver/batcheval/cmd_revert_range.go |  9 +++++
 8 files changed, 81 insertions(+), 15 deletions(-)

diff --git a/pkg/keys/constants.go b/pkg/keys/constants.go
index 4390db95ef57..caee9bcdec3b 100644
--- a/pkg/keys/constants.go
+++ b/pkg/keys/constants.go
@@ -133,6 +133,10 @@ var (
 	// LocalRangeLastReplicaGCTimestampSuffix is the suffix for a range's last
 	// replica GC timestamp (for GC of old replicas).
 	LocalRangeLastReplicaGCTimestampSuffix = []byte("rlrt")
+	// LocalRangeMVCCRangeKeyGCLockSuffix is the suffix for a lock obtained
+	// by range tombstone operations to ensure they don't overlap with
+	// GC requests while allowing point traffic to go through unobstructed.
+	LocalRangeMVCCRangeKeyGCLockSuffix = []byte("rltu")
 	// localRangeLastVerificationTimestampSuffix is DEPRECATED and remains to
 	// prevent reuse.
 	localRangeLastVerificationTimestampSuffix = []byte("rlvt")
diff --git a/pkg/keys/keys.go b/pkg/keys/keys.go
index 1d39b9c0e700..57d4a132d12a 100644
--- a/pkg/keys/keys.go
+++ b/pkg/keys/keys.go
@@ -279,6 +279,12 @@ func RangeGCThresholdKey(rangeID roachpb.RangeID) roachpb.Key {
 	return MakeRangeIDPrefixBuf(rangeID).RangeGCThresholdKey()
 }
 
+// MVCCRangeKeyGCKey returns a range local key protecting range
+// tombstone mvcc stats calculations during range tombstone GC.
+func MVCCRangeKeyGCKey(rangeID roachpb.RangeID) roachpb.Key {
+	return MakeRangeIDPrefixBuf(rangeID).MVCCRangeKeyGCKey()
+}
+
 // RangeVersionKey returns a system-local for the range version.
 func RangeVersionKey(rangeID roachpb.RangeID) roachpb.Key {
 	return MakeRangeIDPrefixBuf(rangeID).RangeVersionKey()
@@ -1055,3 +1061,9 @@ func (b RangeIDPrefixBuf) RaftReplicaIDKey() roachpb.Key {
 func (b RangeIDPrefixBuf) RangeLastReplicaGCTimestampKey() roachpb.Key {
 	return append(b.unreplicatedPrefix(), LocalRangeLastReplicaGCTimestampSuffix...)
 }
+
+// MVCCRangeKeyGCKey returns a range local key protecting range
+// tombstone mvcc stats calculations during range tombstone GC.
+func (b RangeIDPrefixBuf) MVCCRangeKeyGCKey() roachpb.Key {
+	return append(b.unreplicatedPrefix(), LocalRangeMVCCRangeKeyGCLockSuffix...)
+}
diff --git a/pkg/kv/kvserver/batcheval/cmd_add_sstable.go b/pkg/kv/kvserver/batcheval/cmd_add_sstable.go
index 4a794e33b21a..4f9f1a246cf1 100644
--- a/pkg/kv/kvserver/batcheval/cmd_add_sstable.go
+++ b/pkg/kv/kvserver/batcheval/cmd_add_sstable.go
@@ -57,8 +57,17 @@ func declareKeysAddSSTable(
 		// NB: The range end key is not available, so this will pessimistically
 		// latch up to args.EndKey.Next(). If EndKey falls on the range end key, the
 		// span will be tightened during evaluation.
+		// Even if we obtain latches beyond the range end here, it won't cause
+		// contention with the subsequent range because latches are enforced per
+		// range.
 		l, r := rangeTombstonePeekBounds(args.Key, args.EndKey, rs.GetStartKey().AsRawKey(), nil)
 		latchSpans.AddMVCC(spanset.SpanReadOnly, roachpb.Span{Key: l, EndKey: r}, header.Timestamp)
+
+		// Obtain a read-only latch on the range key GC key to serialize with
+		// range tombstone GC requests.
+		latchSpans.AddNonMVCC(spanset.SpanReadOnly, roachpb.Span{
+			Key: keys.MVCCRangeKeyGCKey(rs.GetRangeID()),
+		})
 	}
 }
 
diff --git a/pkg/kv/kvserver/batcheval/cmd_clear_range.go b/pkg/kv/kvserver/batcheval/cmd_clear_range.go
index 88ca15ed92a8..11b06f026c94 100644
--- a/pkg/kv/kvserver/batcheval/cmd_clear_range.go
+++ b/pkg/kv/kvserver/batcheval/cmd_clear_range.go
@@ -56,9 +56,18 @@ func declareKeysClearRange(
 	// NB: The range end key is not available, so this will pessimistically latch
 	// up to args.EndKey.Next(). If EndKey falls on the range end key, the span
 	// will be tightened during evaluation.
+	// Even if we obtain latches beyond the range end here, it won't cause
+	// contention with the subsequent range because latches are enforced per
+	// range.
 	args := req.(*roachpb.ClearRangeRequest)
 	l, r := rangeTombstonePeekBounds(args.Key, args.EndKey, rs.GetStartKey().AsRawKey(), nil)
 	latchSpans.AddMVCC(spanset.SpanReadOnly, roachpb.Span{Key: l, EndKey: r}, header.Timestamp)
+
+	// Obtain a read-only latch on the range key GC key to serialize with
+	// range tombstone GC requests.
+	latchSpans.AddNonMVCC(spanset.SpanReadOnly, roachpb.Span{
+		Key: keys.MVCCRangeKeyGCKey(rs.GetRangeID()),
+	})
 }
 
 // ClearRange wipes all MVCC versions of keys covered by the specified
diff --git a/pkg/kv/kvserver/batcheval/cmd_delete_range.go b/pkg/kv/kvserver/batcheval/cmd_delete_range.go
index 54acdf8a0744..8369732305d6 100644
--- a/pkg/kv/kvserver/batcheval/cmd_delete_range.go
+++ b/pkg/kv/kvserver/batcheval/cmd_delete_range.go
@@ -49,6 +49,9 @@ func declareKeysDeleteRange(
 		// NB: The range end key is not available, so this will pessimistically
 		// latch up to args.EndKey.Next(). If EndKey falls on the range end key, the
 		// span will be tightened during evaluation.
+		// Even if we obtain latches beyond the range end here, it won't cause
+		// contention with the subsequent range because latches are enforced per
+		// range.
 		l, r := rangeTombstonePeekBounds(args.Key, args.EndKey, rs.GetStartKey().AsRawKey(), nil)
 		latchSpans.AddMVCC(spanset.SpanReadOnly, roachpb.Span{Key: l, EndKey: r}, header.Timestamp)
 
@@ -56,6 +59,12 @@ func declareKeysDeleteRange(
 		latchSpans.AddNonMVCC(spanset.SpanReadOnly, roachpb.Span{
 			Key: keys.RangeDescriptorKey(rs.GetStartKey()),
 		})
+
+		// Obtain a read-only latch on the range key GC key to serialize with
+		// range tombstone GC requests.
+		latchSpans.AddNonMVCC(spanset.SpanReadOnly, roachpb.Span{
+			Key: keys.MVCCRangeKeyGCKey(rs.GetRangeID()),
+		})
 	}
 }
diff --git a/pkg/kv/kvserver/batcheval/cmd_end_transaction.go b/pkg/kv/kvserver/batcheval/cmd_end_transaction.go
index 3d68657c5f24..e664a1b827b1 100644
--- a/pkg/kv/kvserver/batcheval/cmd_end_transaction.go
+++ b/pkg/kv/kvserver/batcheval/cmd_end_transaction.go
@@ -151,6 +151,12 @@ func declareKeysEndTxn(
 				Key:    abortspan.MinKey(rs.GetRangeID()),
 				EndKey: abortspan.MaxKey(rs.GetRangeID()),
 			})
+
+			// Protect range tombstones from collection by GC to avoid interference
+			// with MVCCStats calculation.
+			latchSpans.AddNonMVCC(spanset.SpanReadOnly, roachpb.Span{
+				Key: keys.MVCCRangeKeyGCKey(rs.GetRangeID()),
+			})
 		}
 		if mt := et.InternalCommitTrigger.MergeTrigger; mt != nil {
 			// Merges copy over the RHS abort span to the LHS, and compute
@@ -183,6 +189,11 @@ func declareKeysEndTxn(
 				Key:    leftPeekBound,
 				EndKey: rightPeekBound,
 			})
+			// Protect range tombstones from collection by GC to avoid interference
+			// with MVCCStats calculation.
+			latchSpans.AddNonMVCC(spanset.SpanReadOnly, roachpb.Span{
+				Key: keys.MVCCRangeKeyGCKey(mt.LeftDesc.RangeID),
+			})
 		}
 	}
 }
diff --git a/pkg/kv/kvserver/batcheval/cmd_gc.go b/pkg/kv/kvserver/batcheval/cmd_gc.go
index b148481d6d3b..4bb69417e3b2 100644
--- a/pkg/kv/kvserver/batcheval/cmd_gc.go
+++ b/pkg/kv/kvserver/batcheval/cmd_gc.go
@@ -37,22 +37,25 @@ func declareKeysGC(
 	_ time.Duration,
 ) {
 	gcr := req.(*roachpb.GCRequest)
-	// When GC-ing MVCC range key tombstones or individual range keys, we need to
-	// serialize with all writes that overlap the MVCC range tombstone, as well as
-	// the immediate left/right neighboring keys. This is because a range key
-	// write has non-local effects, i.e. it can fragment or merge other range keys
-	// at other timestamps and at its boundaries, and this has a non-commutative
-	// effect on MVCC stats -- if someone writes a new range key while we're GCing
-	// one below, the stats would come out wrong.
-	// Note that we only need to serialize with writers (including other GC
-	// processes) and not with readers (that are guaranteed to be above the GC
-	// threshold). To achieve this, we declare read-write access at
-	// hlc.MaxTimestamp which will not block any readers.
-	for _, span := range mergeAdjacentSpans(makeLookupBoundariesForGCRanges(
-		rs.GetStartKey().AsRawKey(), nil, gcr.RangeKeys,
-	)) {
-		latchSpans.AddMVCC(spanset.SpanReadWrite, span, hlc.MaxTimestamp)
+	if gcr.RangeKeys != nil {
+		// When GC-ing MVCC range key tombstones, we need to serialize with
+		// range key writes that overlap the MVCC range tombstone, as well as
+		// the immediate left/right neighboring keys. This is because a range
+		// key write has non-local effects, i.e. it can fragment or merge other
+		// range keys at other timestamps and at its boundaries, and this has
+		// a non-commutative effect on MVCC stats -- if someone writes a new
+		// range key while we're GCing one below, the stats would come out wrong.
+		//
+		// To achieve this, we use a virtual latch key that blocks all range
+		// key write operations until the GC request is done. Range tombstone
+		// write operations obtain a read latch on this key and can run
+		// concurrently with each other, but not with GC.
+		latchSpans.AddNonMVCC(spanset.SpanReadWrite, roachpb.Span{
+			Key: keys.MVCCRangeKeyGCKey(rs.GetRangeID()),
+		})
 	}
+	// For ClearRangeKey requests we still obtain a wide write latch, as we
+	// don't expect any other operations to be running on the range.
 	if rk := gcr.ClearRangeKey; rk != nil {
 		latchSpans.AddMVCC(spanset.SpanReadWrite, roachpb.Span{Key: rk.StartKey, EndKey: rk.EndKey},
 			hlc.MaxTimestamp)
diff --git a/pkg/kv/kvserver/batcheval/cmd_revert_range.go b/pkg/kv/kvserver/batcheval/cmd_revert_range.go
index bbf862dc5cd0..0f8f52ccb7a6 100644
--- a/pkg/kv/kvserver/batcheval/cmd_revert_range.go
+++ b/pkg/kv/kvserver/batcheval/cmd_revert_range.go
@@ -54,8 +54,17 @@ func declareKeysRevertRange(
 	// NB: The range end key is not available, so this will pessimistically
 	// latch up to args.EndKey.Next(). If EndKey falls on the range end key, the
 	// span will be tightened during evaluation.
+	// Even if we obtain latches beyond the range end here, it won't cause
+	// contention with the subsequent range because latches are enforced per
+	// range.
 	l, r := rangeTombstonePeekBounds(args.Key, args.EndKey, rs.GetStartKey().AsRawKey(), nil)
 	latchSpans.AddMVCC(spanset.SpanReadOnly, roachpb.Span{Key: l, EndKey: r}, header.Timestamp)
+
+	// Obtain a read-only latch on the range key GC key to serialize with
+	// range tombstone GC requests.
+	latchSpans.AddNonMVCC(spanset.SpanReadOnly, roachpb.Span{
+		Key: keys.MVCCRangeKeyGCKey(rs.GetRangeID()),
+	})
 }
 
 // isEmptyKeyTimeRange checks if the span has no writes in (since,until].
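
Illustrative note (not part of the applied diff): the sketch below restates the latching protocol this patch establishes, using only identifiers that already appear in the diff (latchSpans, spanset, keys, rs). It is a reviewer-facing summary of the two sides of the protocol, not additional code to apply.

	// Range tombstone writers (DeleteRange, ClearRange, AddSSTable, RevertRange,
	// and the EndTxn split/merge trigger declarations) take a *shared* latch on
	// the per-range GC lock key, on top of their usual latches:
	latchSpans.AddNonMVCC(spanset.SpanReadOnly, roachpb.Span{
		Key: keys.MVCCRangeKeyGCKey(rs.GetRangeID()),
	})

	// GC of range tombstones takes an *exclusive* latch on the same key, so it
	// serializes with range tombstone writers, while point writers (which never
	// declare this key) proceed through the latch manager unimpeded:
	latchSpans.AddNonMVCC(spanset.SpanReadWrite, roachpb.Span{
		Key: keys.MVCCRangeKeyGCKey(rs.GetRangeID()),
	})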