From 06fe17cfcad5122f4e8db961018a83a68a54818c Mon Sep 17 00:00:00 2001 From: Jackson Owens Date: Thu, 11 Jan 2024 13:43:57 -0500 Subject: [PATCH] db: remove compactionIter's sameStripeNonSkippable This commit removes the compaction iterator's concept of a key that lies within the same snapshot stripe but is non-skippable. These "non-skippable" keys lead to bizarre behavior, such as a sequence like: a.MERGE.5 a.RANGEDEL.4 a.MERGE.4 yielding the separate unmerged a.MERGE.5 and a.MERGE.4 internal keys (Note a.RANGEDEL.4 does not delete a.MERGE.4 because they share the same sequence number.) The sameStripeNonSkippable fell out of the fact that range deletions were previously interleaved by the input iterator at their start key with their sequence number. These sameStripeNonSkippable could interrupt any logic iterating across the keys of a stripe, complicating the compaction iterator logic. With #3219, range deletions began to be interleaved at their start key with the maximal sequence number, ensuring they never interrupt the keys of a snapshot stripe. In some instances INVALID keys were also returned with sameStripeNonSkippable. These keys are now treated similarly to other errors encountered during iteration: i.err is set and newStripeNewKey is returned. Close #3082. --- compaction_iter.go | 274 +++++++------------------- compaction_iter_test.go | 64 ++++-- testdata/compaction_iter_delete_sized | 232 ++++++++++++---------- testdata/compaction_iter_set_with_del | 164 ++++++++------- 4 files changed, 340 insertions(+), 394 deletions(-) diff --git a/compaction_iter.go b/compaction_iter.go index c86173e711..1667474e24 100644 --- a/compaction_iter.go +++ b/compaction_iter.go @@ -180,16 +180,9 @@ type compactionIter struct { iterKey *InternalKey iterValue []byte iterStripeChange stripeChangeType - // `skip` indicates whether the remaining skippable entries in the current - // snapshot stripe should be skipped or processed. An example of a non- - // skippable entry is a range tombstone as we need to return it from the - // `compactionIter`, even if a key covering its start key has already been - // seen in the same stripe. `skip` has no effect when `pos == iterPosNext`. - // - // TODO(jackson): If we use keyspan.InterleavingIter for range deletions, - // like we do for range keys, the only remaining 'non-skippable' key is - // the invalid key. We should be able to simplify this logic and remove this - // field. + // `skip` indicates whether the remaining entries in the current snapshot + // stripe should be skipped or processed. `skip` has no effect when `pos == + // iterPosNext`. skip bool // `pos` indicates the iterator position at the top of `Next()`. Its type's // (`iterPos`) values take on the following meanings in the context of @@ -206,13 +199,6 @@ type compactionIter struct { // compaction iterator was only returned because an open snapshot prevents // its elision. This field only applies to point keys, and not to range // deletions or range keys. - // - // For MERGE, it is possible that doing the merge is interrupted even when - // the next point key is in the same stripe. This can happen if the loop in - // mergeNext gets interrupted by sameStripeNonSkippable. - // sameStripeNonSkippable occurs due to RANGEDELs that sort before - // SET/MERGE/DEL with the same seqnum, so the RANGEDEL does not necessarily - // delete the subsequent SET/MERGE/DEL keys. snapshotPinned bool // forceObsoleteDueToRangeDel is set to true in a subset of the cases that // snapshotPinned is true. This value is true when the point is obsolete due @@ -249,6 +235,8 @@ type compactionIter struct { frontiers frontiers // Reference to the range deletion tombstone fragmenter (e.g., // `compaction.rangeDelFrag`). + // TODO(jackson): We can eliminate range{Del,Key}Frag now that fragmentation + // is performed upfront by keyspan.MergingIters. rangeDelFrag *keyspan.Fragmenter rangeKeyFrag *keyspan.Fragmenter // The fragmented tombstones. @@ -348,24 +336,6 @@ func (i *compactionIter) Next() (*InternalKey, []byte) { // snapshot stripe. // - `skip && pos == iterPosCurForward`: We are at the key that has been returned. // To move forward we skip skippable entries in the stripe. - // - `skip && pos == iterPosNext && i.iterStripeChange == sameStripeNonSkippable`: - // This case may occur when skipping within a snapshot stripe and we - // encounter either: - // a) an invalid key kind; The previous call will have returned - // whatever key it was processing and deferred handling of the - // invalid key to this invocation of Next(). We're responsible for - // ignoring skip=true and falling into the invalid key kind case - // down below. - // b) an interleaved range delete; This is a wart of the current code - // structure. While skipping within a snapshot stripe, a range - // delete interleaved at its start key and sequence number - // interrupts the sequence of point keys. After we return the range - // delete to the caller, we need to pick up skipping at where we - // left off, so we preserve skip=true. - // TODO(jackson): This last case is confusing and can be removed if we - // interleave range deletions at the maximal sequence number using the - // keyspan interleaving iterator. This is the treatment given to range - // keys today. if i.pos == iterPosCurForward { if i.skip { i.skipInStripe() @@ -373,9 +343,7 @@ func (i *compactionIter) Next() (*InternalKey, []byte) { i.nextInStripe() } } else if i.skip { - if i.iterStripeChange != sameStripeNonSkippable { - panic(errors.AssertionFailedf("compaction iterator has skip=true, but iterator is at iterPosNext")) - } + panic(errors.AssertionFailedf("compaction iterator has skip=true, but iterator is at iterPosNext")) } i.pos = iterPosCurForward @@ -395,15 +363,16 @@ func (i *compactionIter) Next() (*InternalKey, []byte) { if i.iterKey.Kind() == InternalKeyKindRangeDelete || rangekey.IsRangeKey(i.iterKey.Kind()) { // Return the span so the compaction can use it for file truncation and add - // it to the relevant fragmenter. We do not set `skip` to true before - // returning as there may be a forthcoming point key with the same user key - // and sequence number. Such a point key must be visible (i.e., not skipped + // it to the relevant fragmenter. In the case of range deletions, we do not + // set `skip` to true before returning as there may be any number of point + // keys with the same user key and sequence numbers ≥ the range deletion's + // sequence number. Such point keys must be visible (i.e., not skipped // over) since we promise point keys are not deleted by range tombstones at - // the same sequence number. + // the same sequence number (or higher). // - // Although, note that `skip` may already be true before reaching here - // due to an earlier key in the stripe. Then it is fine to leave it set - // to true, as the earlier key must have had a higher sequence number. + // Note that `skip` must already be false here, because range keys and range + // deletions are interleaved at the maximal sequence numbers and neither will + // set `skip`=true. // // NOTE: there is a subtle invariant violation here in that calling // saveKey and returning a reference to the temporary slice violates @@ -526,9 +495,8 @@ func (i *compactionIter) Next() (*InternalKey, []byte) { origSnapshotIdx := i.curSnapshotIdx var valueMerger ValueMerger valueMerger, i.err = i.merge(i.iterKey.UserKey, i.iterValue) - var change stripeChangeType if i.err == nil { - change = i.mergeNext(valueMerger) + i.mergeNext(valueMerger) } var needDelete bool if i.err == nil { @@ -553,20 +521,8 @@ func (i *compactionIter) Next() (*InternalKey, []byte) { } continue } - // A non-skippable entry does not necessarily cover later merge - // operands, so we must not zero the current merge result's seqnum. - // - // For example, suppose the forthcoming two keys are a range - // tombstone, `[a, b)#3`, and a merge operand, `a#3`. Recall that - // range tombstones do not cover point keys at the same seqnum, so - // `a#3` is not deleted. The range tombstone will be seen first due - // to its larger value type. Since it is a non-skippable key, the - // current merge will not include `a#3`. If we zeroed the current - // merge result's seqnum, then it would conflict with the upcoming - // merge including `a#3`, whose seqnum will also be zeroed. - if change != sameStripeNonSkippable { - i.maybeZeroSeqnum(origSnapshotIdx) - } + + i.maybeZeroSeqnum(origSnapshotIdx) return &i.key, i.value } if i.err != nil { @@ -618,18 +574,13 @@ func (i *compactionIter) skipInStripe() { i.skip = true // TODO(sumeer): we can avoid the overhead of calling i.rangeDelFrag.Covers, // in this case of nextInStripe, since we are skipping all of them anyway. - for i.nextInStripe() == sameStripeSkippable { + for i.nextInStripe() == sameStripe { if i.err != nil { panic(i.err) } } - // Reset skip if we landed outside the original stripe. Otherwise, we landed - // in the same stripe on a non-skippable key. In that case we should preserve - // `i.skip == true` such that later keys in the stripe will continue to be - // skipped. - if i.iterStripeChange == newStripeNewKey || i.iterStripeChange == newStripeSameKey { - i.skip = false - } + // We landed outside the original stripe, so reset skip. + i.skip = false } func (i *compactionIter) iterNext() bool { @@ -643,24 +594,22 @@ func (i *compactionIter) iterNext() bool { } // stripeChangeType indicates how the snapshot stripe changed relative to the -// previous key. If no change, it also indicates whether the current entry is -// skippable. If the snapshot stripe changed, it also indicates whether the new -// stripe was entered because the iterator progressed onto an entirely new key -// or entered a new stripe within the same key. +// previous key. If the snapshot stripe changed, it also indicates whether the +// new stripe was entered because the iterator progressed onto an entirely new +// key or entered a new stripe within the same key. type stripeChangeType int const ( newStripeNewKey stripeChangeType = iota newStripeSameKey - sameStripeSkippable - sameStripeNonSkippable + sameStripe ) // nextInStripe advances the iterator and returns one of the above const ints // indicating how its state changed. // -// All sameStripeSkippable keys that are covered by a RANGEDEL will be skipped -// and not returned. +// All sameStripe keys that are covered by a RANGEDEL will be skipped and not +// returned. // // Calls to nextInStripe must be preceded by a call to saveKey to retain a // temporary reference to the original key, so that forward iteration can @@ -709,29 +658,28 @@ func (i *compactionIter) nextInStripeHelper() stripeChangeType { i.curSnapshotIdx, i.curSnapshotSeqNum = snapshotIndex(key.SeqNum(), i.snapshots) switch key.Kind() { - case InternalKeyKindRangeDelete: - // Range tombstones need to be exposed by the compactionIter to the upper level - // `compaction` object, so return them regardless of whether they are in the same - // snapshot stripe. - if i.curSnapshotIdx == origSnapshotIdx { - return sameStripeNonSkippable - } - return newStripeSameKey case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete: - // Range keys are interleaved at the max sequence number for a given user - // key, so we should not see any more range keys in this stripe. + // Range keys are interleaved once, at the max sequence number + // before all other keys with the saem user key, so we should not + // see any more in this stripe. panic("unreachable") - case InternalKeyKindInvalid: - if i.curSnapshotIdx == origSnapshotIdx { - return sameStripeNonSkippable + case InternalKeyKindRangeDelete: + // Range deletions are also interleaved at the max sequence number, + // but after any range keys beginning at the same user key (this + // falls out of the key kind numbering). Since they're interleaved + // at the max sequence number, they must fall within the first + // snapshot stripe. + if i.curSnapshotIdx != origSnapshotIdx { + panic(errors.AssertionFailedf("pebble: invariant violation: RANGEDEL in separate snapshot stripe")) } - return newStripeSameKey + return sameStripe case InternalKeyKindDelete, InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindSingleDelete, InternalKeyKindSetWithDelete, InternalKeyKindDeleteSized: // Fall through default: + kind := i.iterKey.Kind() i.iterKey = nil - i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKey.Kind())) + i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(kind)) i.valid = false return newStripeNewKey } @@ -740,7 +688,7 @@ func (i *compactionIter) nextInStripeHelper() stripeChangeType { if i.rangeDelFrag.Covers(*i.iterKey, i.curSnapshotSeqNum) == keyspan.CoversVisibly { continue } - return sameStripeSkippable + return sameStripe } return newStripeSameKey } @@ -767,56 +715,15 @@ func (i *compactionIter) setNext() { // Else, we continue to loop through entries in the stripe looking for a // DEL. Note that we may stop *before* encountering a DEL, if one exists. // - // NB: nextInStripe will skip sameStripeSkippable keys that are visibly - // covered by a RANGEDEL. This can include DELs -- this is fine since such - // DELs don't need to be combined with SET to make SETWITHDEL. + // NB: nextInStripe will skip sameStripe keys that are visibly covered by a + // RANGEDEL. This can include DELs -- this is fine since such DELs don't + // need to be combined with SET to make SETWITHDEL. for { switch i.nextInStripe() { case newStripeNewKey, newStripeSameKey: i.pos = iterPosNext return - case sameStripeNonSkippable: - i.pos = iterPosNext - // We iterated onto a key that we cannot skip. We can - // conservatively transform the original SET into a SETWITHDEL - // as an indication that there *may* still be a DEL/SINGLEDEL - // under this SET, even if we did not actually encounter one. - // - // This is safe to do, as: - // - // - in the case that there *is not* actually a DEL/SINGLEDEL - // under this entry, any SINGLEDEL above this now-transformed - // SETWITHDEL will become a DEL when the two encounter in a - // compaction. The DEL will eventually be elided in a - // subsequent compaction. The cost for ensuring correctness is - // that this entry is kept around for an additional compaction - // cycle(s). - // - // - in the case there *is* indeed a DEL/SINGLEDEL under us - // (but in a different stripe or sstable), then we will have - // already done the work to transform the SET into a - // SETWITHDEL, and we will skip any additional iteration when - // this entry is encountered again in a subsequent compaction. - // - // Ideally, this codepath would be smart enough to handle the - // case of SET <- RANGEDEL <- ... <- DEL/SINGLEDEL <- .... - // This requires preserving any RANGEDEL entries we encounter - // along the way, then emitting the original (possibly - // transformed) key, followed by the RANGEDELs. This requires - // a sizable refactoring of the existing code, as nextInStripe - // currently returns a sameStripeNonSkippable when it - // encounters a RANGEDEL. - // TODO(travers): optimize to handle the RANGEDEL case if it - // turns out to be a performance problem. - i.key.SetKind(InternalKeyKindSetWithDelete) - - // By setting i.skip=true, we are saying that after the - // non-skippable key is emitted (which is likely a RANGEDEL), - // the remaining point keys that share the same user key as this - // saved key should be skipped. - i.skip = true - return - case sameStripeSkippable: + case sameStripe: // We're still in the same stripe. If this is a // DEL/SINGLEDEL/DELSIZED, we stop looking and emit a SETWITHDEL. // Subsequent keys are eligible for skipping. @@ -837,7 +744,7 @@ func (i *compactionIter) setNext() { } } -func (i *compactionIter) mergeNext(valueMerger ValueMerger) stripeChangeType { +func (i *compactionIter) mergeNext(valueMerger ValueMerger) { // Save the current key. i.saveKey() i.valid = true @@ -845,20 +752,20 @@ func (i *compactionIter) mergeNext(valueMerger ValueMerger) stripeChangeType { // Loop looking for older values in the current snapshot stripe and merge // them. for { - if i.nextInStripe() != sameStripeSkippable { + if i.nextInStripe() != sameStripe { i.pos = iterPosNext - return i.iterStripeChange + return } if i.err != nil { panic(i.err) } // NB: MERGE#10+RANGEDEL#9 stays a MERGE, since nextInStripe skips - // sameStripeSkippable keys that are visibly covered by a RANGEDEL. There - // may be MERGE#7 that is invisibly covered and will be preserved, but - // there is no risk that MERGE#10 and MERGE#7 will get merged in the - // future as the RANGEDEL still exists and will be used in user-facing - // reads that see MERGE#10, and will also eventually cause MERGE#7 to be - // deleted in a compaction. + // sameStripe keys that are visibly covered by a RANGEDEL. There may be + // MERGE#7 that is invisibly covered and will be preserved, but there is + // no risk that MERGE#10 and MERGE#7 will get merged in the future as + // the RANGEDEL still exists and will be used in user-facing reads that + // see MERGE#10, and will also eventually cause MERGE#7 to be deleted in + // a compaction. key := i.iterKey switch key.Kind() { case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized: @@ -881,7 +788,7 @@ func (i *compactionIter) mergeNext(valueMerger ValueMerger) stripeChangeType { // use of SingleDelete. i.key.SetKind(InternalKeyKindSetWithDelete) i.skip = true - return sameStripeSkippable + return case InternalKeyKindSet, InternalKeyKindSetWithDelete: // We've hit a Set or SetWithDel value. Merge with the existing @@ -891,11 +798,11 @@ func (i *compactionIter) mergeNext(valueMerger ValueMerger) stripeChangeType { i.err = valueMerger.MergeOlder(i.iterValue) if i.err != nil { i.valid = false - return sameStripeSkippable + return } i.key.SetKind(InternalKeyKindSet) i.skip = true - return sameStripeSkippable + return case InternalKeyKindMerge: // We've hit another Merge value. Merge with the existing value and @@ -903,13 +810,13 @@ func (i *compactionIter) mergeNext(valueMerger ValueMerger) stripeChangeType { i.err = valueMerger.MergeOlder(i.iterValue) if i.err != nil { i.valid = false - return sameStripeSkippable + return } default: i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKey.Kind())) i.valid = false - return sameStripeSkippable + return } } } @@ -934,7 +841,7 @@ func (i *compactionIter) singleDeleteNext() bool { for { // If we find a key that can't be skipped, return true so that the // caller yields the SingleDelete to the caller. - if i.nextInStripe() != sameStripeSkippable { + if i.nextInStripe() != sameStripe { // This defers additional error checking regarding single delete // invariants to the compaction where the keys with the same user key as // the single delete are in the same stripe. @@ -944,7 +851,7 @@ func (i *compactionIter) singleDeleteNext() bool { if i.err != nil { panic(i.err) } - // INVARIANT: sameStripeSkippable. + // INVARIANT: sameStripe. key := i.iterKey kind := key.Kind() switch kind { @@ -970,33 +877,26 @@ func (i *compactionIter) singleDeleteNext() bool { // since we are at the key after the Set/Merge that was single deleted. change := i.nextInStripe() switch change { - case sameStripeSkippable, newStripeSameKey: + case sameStripe, newStripeSameKey: // On the same user key. nextKind := i.iterKey.Kind() switch nextKind { case InternalKeyKindSet, InternalKeyKindSetWithDelete, InternalKeyKindMerge: if i.singleDeleteInvariantViolationCallback != nil { - // sameStripeSkippable keys returned by nextInStripe() are already + // sameStripe keys returned by nextInStripe() are already // known to not be covered by a RANGEDEL, so it is an invariant // violation. The rare case is newStripeSameKey, where it is a // violation if not covered by a RANGEDEL. - if change == sameStripeSkippable || + if change == sameStripe || i.rangeDelFrag.Covers(*i.iterKey, i.curSnapshotSeqNum) == keyspan.NoCover { i.singleDeleteInvariantViolationCallback(i.key.UserKey) } } - case InternalKeyKindDelete, InternalKeyKindDeleteSized, InternalKeyKindSingleDelete, - InternalKeyKindRangeDelete: + case InternalKeyKindDelete, InternalKeyKindDeleteSized, InternalKeyKindSingleDelete: default: panic(errors.AssertionFailedf( "unexpected internal key kind: %d", errors.Safe(i.iterKey.Kind()))) } - case sameStripeNonSkippable: - // No ability to check whether there is another Set/Merge below with - // the same user key. - // - // TODO(sumeer): once range deletions are interleaved at the maximal - // sequence number, this case will go away. case newStripeNewKey: default: panic("unreachable") @@ -1054,25 +954,7 @@ func (i *compactionIter) skipDueToSingleDeleteElision() { // determined that the tombstone is in the final snapshot stripe, but we // stepped into a new stripe of the same key. panic(errors.AssertionFailedf("eliding single delete followed by same key in new stripe")) - case sameStripeNonSkippable: - // There's a key that we cannot skip. There are two possible cases: - // a. The key is invalid. This is an error. - // b. The key is a range deletion. - // The second case may also be an ineffectual single delete. However, it - // is possible that there is a SET that is at the same seqnum as the - // RANGEDEL, and so is not deleted by that RANGEDEL, and will be deleted - // by this single delete. So we cannot be certain that this is an - // ineffectual single delete. - // - // TODO(sumeer): the existing todo to interleave range deletions at the - // maximal sequence number will allow us to address this ambiguity. - // - // TODO(sumeer): by setting skip to true, the compactionIter is making a - // single delete stronger (like a del), which will hide bugs in the use of - // single delete. - i.skip = true - return - case sameStripeSkippable: + case sameStripe: kind := i.iterKey.Kind() switch kind { case InternalKeyKindDelete, InternalKeyKindDeleteSized, InternalKeyKindSingleDelete: @@ -1113,7 +995,7 @@ func (i *compactionIter) skipDueToSingleDeleteElision() { case newStripeSameKey: panic(errors.AssertionFailedf("eliding single delete followed by same key in new stripe")) case newStripeNewKey: - case sameStripeSkippable: + case sameStripe: // On the same key. nextKind := i.iterKey.Kind() switch nextKind { @@ -1121,18 +1003,11 @@ func (i *compactionIter) skipDueToSingleDeleteElision() { if i.singleDeleteInvariantViolationCallback != nil { i.singleDeleteInvariantViolationCallback(i.key.UserKey) } - case InternalKeyKindDelete, InternalKeyKindDeleteSized, InternalKeyKindSingleDelete, - InternalKeyKindRangeDelete: + case InternalKeyKindDelete, InternalKeyKindDeleteSized, InternalKeyKindSingleDelete: default: panic(errors.AssertionFailedf( "unexpected internal key kind: %d", errors.Safe(i.iterKey.Kind()))) } - case sameStripeNonSkippable: - // No ability to check whether there is another Set/Merge below with - // the same user key. - // - // TODO(sumeer): once range deletions are interleaved at the maximal - // sequence number, this case will go away. default: panic("unreachable") } @@ -1174,7 +1049,7 @@ func (i *compactionIter) deleteSizedNext() (*base.InternalKey, []byte) { // Loop through all the keys within this stripe that are skippable. i.pos = iterPosNext - for i.nextInStripe() == sameStripeSkippable { + for i.nextInStripe() == sameStripe { if i.err != nil { panic(i.err) } @@ -1305,13 +1180,12 @@ func (i *compactionIter) deleteSizedNext() (*base.InternalKey, []byte) { return nil, nil } } - // Reset skip if we landed outside the original stripe. Otherwise, we landed - // in the same stripe on a non-skippable key. In that case we should preserve - // `i.skip == true` such that later keys in the stripe will continue to be - // skipped. - if i.iterStripeChange == newStripeNewKey || i.iterStripeChange == newStripeSameKey { - i.skip = false + + if i.iterStripeChange == sameStripe { + panic(errors.AssertionFailedf("unexpectedly found iter stripe change = %d", i.iterStripeChange)) } + // We landed outside the original stripe. Reset skip. + i.skip = false if i.err != nil { return nil, nil } diff --git a/compaction_iter_test.go b/compaction_iter_test.go index 932abde16e..73743448f5 100644 --- a/compaction_iter_test.go +++ b/compaction_iter_test.go @@ -81,11 +81,13 @@ func TestCompactionIter(t *testing.T) { var merge Merge var keys []InternalKey var rangeKeys []keyspan.Span + var rangeDels []keyspan.Span var vals [][]byte var snapshots []uint64 var elideTombstones bool var allowZeroSeqnum bool - var interleavingIter *keyspan.InterleavingIter + var rangeKeyInterleaving *keyspan.InterleavingIter + var rangeDelInterleaving *keyspan.InterleavingIter // The input to the data-driven test is dependent on the format major // version we are testing against. @@ -108,13 +110,19 @@ func TestCompactionIter(t *testing.T) { // susceptible to use-after-free bugs, we skip the zeroing of // RangeDelete keys. fi := &fakeIter{keys: keys, vals: vals} - interleavingIter = &keyspan.InterleavingIter{} - interleavingIter.Init( + rangeDelInterleaving = &keyspan.InterleavingIter{} + rangeDelInterleaving.Init( base.DefaultComparer, fi, + keyspan.NewIter(base.DefaultComparer.Compare, rangeDels), + keyspan.InterleavingIterOpts{}) + rangeKeyInterleaving = &keyspan.InterleavingIter{} + rangeKeyInterleaving.Init( + base.DefaultComparer, + rangeDelInterleaving, keyspan.NewIter(base.DefaultComparer.Compare, rangeKeys), keyspan.InterleavingIterOpts{}) - iter := invalidating.NewIter(interleavingIter, invalidating.IgnoreKinds(InternalKeyKindRangeDelete)) + iter := invalidating.NewIter(rangeKeyInterleaving, invalidating.IgnoreKinds(InternalKeyKindRangeDelete)) if merge == nil { merge = func(key, value []byte) (base.ValueMerger, error) { m := &debugMerger{} @@ -161,9 +169,37 @@ func TestCompactionIter(t *testing.T) { keys = keys[:0] vals = vals[:0] rangeKeys = rangeKeys[:0] + rangeDels = rangeDels[:0] + rangeDelFragmenter := keyspan.Fragmenter{ + Cmp: DefaultComparer.Compare, + Format: DefaultComparer.FormatKey, + Emit: func(s keyspan.Span) { + rangeDels = append(rangeDels, s) + }, + } for _, key := range strings.Split(d.Input, "\n") { + // If the line ends in a '}' assume it's a span. + if strings.HasSuffix(key, "}") { + s := keyspan.ParseSpan(strings.TrimSpace(key)) + rangeKeys = append(rangeKeys, s) + continue + } + j := strings.Index(key, ":") - keys = append(keys, base.ParseInternalKey(key[:j])) + ik := base.ParseInternalKey(key[:j]) + if rangekey.IsRangeKey(ik.Kind()) { + panic("range keys must be pre-fragmented and formatted as spans") + } + if ik.Kind() == base.InternalKeyKindRangeDelete { + rangeDelFragmenter.Add(keyspan.Span{ + Start: ik.UserKey, + End: []byte(key[j+1:]), + Keys: []keyspan.Key{{Trailer: ik.Trailer}}, + }) + continue + } + + keys = append(keys, ik) if strings.HasPrefix(key[j+1:], "varint(") { valueStr := strings.TrimSuffix(strings.TrimPrefix(key[j+1:], "varint("), ")") @@ -175,13 +211,7 @@ func TestCompactionIter(t *testing.T) { vals = append(vals, []byte(key[j+1:])) } } - return "" - - case "define-range-keys": - for _, key := range strings.Split(d.Input, "\n") { - s := keyspan.ParseSpan(strings.TrimSpace(key)) - rangeKeys = append(rangeKeys, s) - } + rangeDelFragmenter.Finish() return "" case "iter": @@ -288,16 +318,10 @@ func TestCompactionIter(t *testing.T) { } fmt.Fprintf(&b, "%s:%s%s%s\n", iter.Key(), v, snapshotPinned, forceObsolete) if iter.Key().Kind() == InternalKeyKindRangeDelete { - iter.rangeDelFrag.Add(keyspan.Span{ - Start: append([]byte{}, iter.Key().UserKey...), - End: append([]byte{}, iter.Value()...), - Keys: []keyspan.Key{ - {Trailer: iter.Key().Trailer}, - }, - }) + iter.rangeDelFrag.Add(*rangeDelInterleaving.Span()) } if rangekey.IsRangeKey(iter.Key().Kind()) { - iter.rangeKeyFrag.Add(*interleavingIter.Span()) + iter.rangeKeyFrag.Add(*rangeKeyInterleaving.Span()) } } else if err := iter.Error(); err != nil { fmt.Fprintf(&b, "err=%v\n", err) diff --git a/testdata/compaction_iter_delete_sized b/testdata/compaction_iter_delete_sized index 950ec82cf6..209c5df448 100644 --- a/testdata/compaction_iter_delete_sized +++ b/testdata/compaction_iter_delete_sized @@ -247,9 +247,7 @@ a.INVALID.1:c iter first -next ---- -a#2,18:b err=invalid internal key kind: INVALID define @@ -259,9 +257,7 @@ a.INVALID.1:c iter first -next ---- -a#2,2:b err=invalid internal key kind: INVALID define @@ -271,10 +267,10 @@ a.RANGEDEL.1:d iter first -tombstones +next ---- +a#72057594037927935,15: err=invalid internal key kind: INVALID -. define a.MERGE.2:b @@ -306,11 +302,13 @@ first next next next +next tombstones ---- -a#2,18:b -a#1,15:c -b#4,15:d +a#72057594037927935,15: +a#2,1:b +b#72057594037927935,15: +c#72057594037927935,15: . a-b#1 b-c#4 @@ -322,11 +320,13 @@ first next next next +next tombstones ---- +a#72057594037927935,15: a#2,1:b -a#1,15:c -b#4,15:d +b#72057594037927935,15: +c#72057594037927935,15: . a-b#1 b-c#4 @@ -340,12 +340,14 @@ next next next next +next tombstones ---- -a#2,18:b (not pinned) (not force obsolete) -a#1,15:c (not pinned) (not force obsolete) -b#4,15:d (not pinned) (not force obsolete) +a#72057594037927935,15: (not pinned) (not force obsolete) +a#2,1:b (pinned) (not force obsolete) +b#72057594037927935,15: (not pinned) (not force obsolete) b#2,1:e (pinned) (force obsolete) +c#72057594037927935,15: (not pinned) (force obsolete) . a-b#1 b-c#4 @@ -360,12 +362,14 @@ next next next next +next tombstones ---- -a#2,18:b (not pinned) (not force obsolete) -a#1,15:c (not pinned) (not force obsolete) -b#4,15:d (not pinned) (not force obsolete) +a#72057594037927935,15: (not pinned) (not force obsolete) +a#2,1:b (pinned) (not force obsolete) +b#72057594037927935,15: (not pinned) (not force obsolete) b#2,1:e (pinned) (force obsolete) +c#72057594037927935,15: (not pinned) (force obsolete) c#3,1:f (pinned) (force obsolete) . a-b#1 @@ -390,7 +394,7 @@ next next tombstones ---- -a#3,15:e +a#72057594037927935,15: b#4,1:b c#3,1:c e#1,1:e @@ -414,7 +418,7 @@ next next tombstones ---- -a#3,15:e +a#72057594037927935,15: b#4,2:b c#3,2:c e#1,2:e @@ -443,10 +447,10 @@ next next tombstones ---- -a#3,15:c +a#72057594037927935,15: b#5,2:de +d#72057594037927935,15: d#5,2:bc -d#3,15:f . a-c#3 d-f#3 @@ -463,11 +467,15 @@ first next next next +next +next tombstones ---- -a#3,15:d -b#2,15:e -c#1,15:f +a#72057594037927935,15: +b#72057594037927935,15: +c#72057594037927935,15: +d#72057594037927935,15: +e#72057594037927935,15: . a-b#3 b-c#3 @@ -483,17 +491,16 @@ next next tombstones ---- -a#3,15:d -b#2,15:e -c#1,15:f -. +a#72057594037927935,15: +b#72057594037927935,15: +c#72057594037927935,15: +d#72057594037927935,15: a-b#3 b-c#3 c-d#3 c-d#1 d-e#2 d-e#1 -e-f#1 . iter snapshots=3 @@ -501,11 +508,15 @@ first next next next +next +next tombstones ---- -a#3,15:d -b#2,15:e -c#1,15:f +a#72057594037927935,15: +b#72057594037927935,15: +c#72057594037927935,15: +d#72057594037927935,15: +e#72057594037927935,15: . a-b#3 b-c#3 @@ -521,11 +532,15 @@ first next next next +next +next tombstones ---- -a#3,15:d -b#2,15:e -c#1,15:f +a#72057594037927935,15: +b#72057594037927935,15: +c#72057594037927935,15: +d#72057594037927935,15: +e#72057594037927935,15: . a-b#3 b-c#3 @@ -551,7 +566,7 @@ tombstones f next tombstones ---- -a#10,15:k (not pinned) (not force obsolete) +a#72057594037927935,15: (not pinned) (not force obsolete) f#9,1:f (pinned) (force obsolete) a-f#10 . @@ -572,7 +587,7 @@ tombstones f next tombstones ---- -f#10,15:k +f#72057594037927935,15: f#9,1:f . f#8,1:f @@ -589,17 +604,21 @@ d.SET.4:d iter first next +tombstones c +next +next next next -tombstones c tombstones ---- a#1,1:a -b#2,15:d -c#3,15:e -d#4,1:d +b#72057594037927935,15: b-c#2 . +c#72057594037927935,15: +d#72057594037927935,15: +d#4,1:d +. c-d#3 d-e#3 . @@ -607,17 +626,21 @@ d-e#3 iter snapshots=3 first next +tombstones c +next +next next next -tombstones c tombstones ---- a#1,1:a -b#2,15:d -c#3,15:e -d#4,1:d +b#72057594037927935,15: b-c#2 . +c#72057594037927935,15: +d#72057594037927935,15: +d#4,1:d +. c-d#3 c-d#2 d-e#3 @@ -633,12 +656,14 @@ iter first next next +next tombstones c tombstones ---- a#1,1:a -b#2,15:d +b#72057594037927935,15: c#4,1:d +. b-c#2 . c-d#2 @@ -658,7 +683,7 @@ next next next ---- -a#2,15:d +a#72057594037927935,15: a#2,1:a b#2,1:b c#2,1:c @@ -979,8 +1004,8 @@ next next tombstones ---- +a#72057594037927935,15: a#3,7: -a#2,15:c . a-c#2 . @@ -998,7 +1023,7 @@ next next tombstones ---- -a#3,15:d +a#72057594037927935,15: d#2,0: . a-d#3 @@ -1010,7 +1035,7 @@ next next next ---- -a#3,15:d +a#72057594037927935,15: a#2,0: d#2,0: . @@ -1021,7 +1046,7 @@ next next next ---- -a#3,15:d +a#72057594037927935,15: a#1,1:a d#2,0: . @@ -1031,7 +1056,7 @@ first next next ---- -a#3,15:d +a#72057594037927935,15: d#2,0: . @@ -1049,7 +1074,7 @@ tombstones ---- a#2,2:a . -b#1,15:c +b#72057594037927935,15: . b-c#1 . @@ -1064,12 +1089,10 @@ iter allow-zero-seqnum=true first next next -next tombstones ---- -a#2,2:v2 -a#1,15:b -a#0,2:v1 +a#72057594037927935,15: +a#0,2:v1v2 . a-b#1 . @@ -1135,7 +1158,7 @@ first next next ---- -a#3,15:c +a#72057594037927935,15: b#5,2:5 . @@ -1144,7 +1167,7 @@ first next next ---- -a#3,15:c +a#72057594037927935,15: b#0,2:5 . @@ -1152,10 +1175,12 @@ iter snapshots=2 first next next +next ---- -a#3,15:c +a#72057594037927935,15: b#5,2:5 b#1,2:1 +. define a.RANGEDEL.3:c @@ -1169,7 +1194,7 @@ first next next ---- -a#3,15:c +a#72057594037927935,15: b#5,2:5 . @@ -1177,10 +1202,12 @@ iter snapshots=2 first next next +next ---- -a#3,15:c +a#72057594037927935,15: b#5,2:5 b#1,2:1 +. # SET that meets a DEL is transformed into a SETWITHDEL. @@ -1303,8 +1330,8 @@ first next next ---- -a#0,18:c -a#2,15:z +a#72057594037927935,15: +a#0,1:c . iter allow-zero-seqnum=true snapshots=3 @@ -1313,8 +1340,8 @@ next next next ---- +a#72057594037927935,15: a#3,1:c -a#2,15:z a#0,1:b . @@ -1324,8 +1351,8 @@ next next next ---- -a#3,18:c -a#2,15:z +a#72057594037927935,15: +a#3,1:c a#1,0: . @@ -1341,8 +1368,8 @@ first next next ---- -a#4,18:c -a#3,15:z +a#72057594037927935,15: +a#4,1:c . # Invalid keys are emitted under SETWITHDEL. @@ -1354,9 +1381,7 @@ a.INVALID.1: iter first -next ---- -a#2,18:b err=invalid internal key kind: INVALID define @@ -1367,9 +1392,7 @@ a.SET.1:b iter first -next ---- -a#3,18:c err=invalid internal key kind: INVALID # SINGLEDEL that meets a SETWITHDEL is transformed into a DEL. @@ -1653,9 +1676,9 @@ tombstones # Test a very subtle sequence where a elision of tombstones is active, and a # unskippable RANGEDEL sits between a DELSIZED and the key it was intended to -# delete. The unskippable RANGEDEL breaks the skipping of keys within the -# snapshot stripe, but it's ultimately okay because we preserve skip=true across -# the RANGEDEL return. +# delete. In previous versions of the code, the RANGEDEL was interleaved at its +# sequence number and "unskippable" breaking the skipping of keys within the +# snapshot stripe. define a.DELSIZED.5:varint(4) @@ -1668,7 +1691,7 @@ first next tombstones ---- -a#4,15:d +a#72057594037927935,15: . . @@ -1679,7 +1702,7 @@ first next tombstones ---- -a#4,15:d +a#72057594037927935,15: . . @@ -1696,7 +1719,7 @@ first next tombstones ---- -a#4,15:d +a#72057594037927935,15: . . @@ -1713,9 +1736,10 @@ first next tombstones ---- -a#4,15:d +a#72057594037927935,15: . . +ineffectual-single-deletes: a # Perform a few variants of the above but with a range del with a seqnum equal to # keys. NB: When seqnums are equal, the order of keys with various kinds is: @@ -1726,15 +1750,12 @@ a#4,15:d # compaction iterator should always observe them first. define +a-z:{(#5,RANGEKEYDEL)} a.SINGLEDEL.6: a.SETWITHDEL.5:foo a.RANGEDEL.5:z ---- -define-range-keys -a-z:{(#5,RANGEKEYDEL)} ----- - # In the following case, the SINGLEDEL meets a SETWITHDEL, promoting the # SINGLEDEL into a DEL. @@ -1746,8 +1767,8 @@ next tombstones ---- a#72057594037927935,19: +a#72057594037927935,15: a#6,0: -a#5,15:z . a-z#5 . @@ -1762,7 +1783,7 @@ next tombstones ---- a#72057594037927935,19: -a#5,15:z +a#72057594037927935,15: . . @@ -1772,21 +1793,17 @@ a.RANGEDEL.5:d a.SET.5:foo ---- -# NB: In this case, the RANGEDEL acts as an unintentional snapshot stripe -# change. This is a code artifact, and we will be able to remove this behavior -# when range deletes are interleaved at the maximal sequence number by an -# interleaving iterator (like range keys are). +# NB: In this case, in previous versions of the code, the RANGEDEL acted as an +# unintentional snapshot stripe change. This was a code artifact that was fixed +# when we began interleaving range deletions at the maximal sequence number +# using an interleaving iterator (like range keys are). iter first next -next -next tombstones ---- -a#6,7: -a#5,15:d -a#5,1:foo +a#72057594037927935,15: . a-d#5 . @@ -1796,7 +1813,7 @@ first next tombstones ---- -a#5,15:d +a#72057594037927935,15: . . @@ -1807,17 +1824,20 @@ a.RANGEDEL.4:d a.SET.4:bar ---- -# The SINGLEDEL invariant checking can't see past the RANGEDEL and see that -# the a.SET.4 violates the invariant. This is a code artifact that will be -# improved when range deletes are interleaved at the maximal sequence number. +# In previous versions of the code, the SINGLEDEL invariant checking could not +# see past the RANGEDEL and see that the a.SET.4 violates the invariant. This +# was a code artifact that has been improved when we began interleaving range +# deletes at the maximal sequence number + iter first next next ---- -a#4,15:d +a#72057594037927935,15: a#4,1:bar . +invariant-violation-single-deletes: a define a.SINGLEDEL.6: @@ -1832,8 +1852,8 @@ first next tombstones ---- +a#72057594037927935,15: a#6,0: -a#5,15:d a-d#5 . @@ -1842,7 +1862,7 @@ first next tombstones ---- -a#5,15:d +a#72057594037927935,15: . . @@ -1855,10 +1875,12 @@ a.SET.5:foo iter first next +next tombstones ---- -a#6,23:varint(3) -a#5,15:d +a#72057594037927935,15: +a#6,0: +. a-d#5 . @@ -1867,7 +1889,7 @@ first next tombstones ---- -a#5,15:d +a#72057594037927935,15: . . diff --git a/testdata/compaction_iter_set_with_del b/testdata/compaction_iter_set_with_del index 620c1cf3b8..d412ad79d6 100644 --- a/testdata/compaction_iter_set_with_del +++ b/testdata/compaction_iter_set_with_del @@ -247,9 +247,7 @@ a.INVALID.1:c iter first -next ---- -a#2,18:b err=invalid internal key kind: INVALID define @@ -259,9 +257,7 @@ a.INVALID.1:c iter first -next ---- -a#2,2:b err=invalid internal key kind: INVALID define @@ -271,9 +267,12 @@ a.RANGEDEL.1:d iter first +next tombstones ---- +a#72057594037927935,15: err=invalid internal key kind: INVALID +a-d#1 . define @@ -306,11 +305,13 @@ first next next next +next tombstones ---- -a#2,18:b -a#1,15:c -b#4,15:d +a#72057594037927935,15: +a#2,1:b +b#72057594037927935,15: +c#72057594037927935,15: . a-b#1 b-c#4 @@ -322,11 +323,13 @@ first next next next +next tombstones ---- +a#72057594037927935,15: a#2,1:b -a#1,15:c -b#4,15:d +b#72057594037927935,15: +c#72057594037927935,15: . a-b#1 b-c#4 @@ -340,12 +343,14 @@ next next next next +next tombstones ---- -a#2,18:b -a#1,15:c -b#4,15:d +a#72057594037927935,15: +a#2,1:b +b#72057594037927935,15: b#2,1:e +c#72057594037927935,15: . a-b#1 b-c#4 @@ -360,12 +365,14 @@ next next next next +next tombstones ---- -a#2,18:b -a#1,15:c -b#4,15:d +a#72057594037927935,15: +a#2,1:b +b#72057594037927935,15: b#2,1:e +c#72057594037927935,15: c#3,1:f . a-b#1 @@ -390,7 +397,7 @@ next next tombstones ---- -a#3,15:e +a#72057594037927935,15: b#4,1:b c#3,1:c e#1,1:e @@ -414,7 +421,7 @@ next next tombstones ---- -a#3,15:e +a#72057594037927935,15: b#4,2:b c#3,2:c e#1,2:e @@ -443,10 +450,10 @@ next next tombstones ---- -a#3,15:c +a#72057594037927935,15: b#5,2:de +d#72057594037927935,15: d#5,2:bc -d#3,15:f . a-c#3 d-f#3 @@ -463,11 +470,15 @@ first next next next +next +next tombstones ---- -a#3,15:d -b#2,15:e -c#1,15:f +a#72057594037927935,15: +b#72057594037927935,15: +c#72057594037927935,15: +d#72057594037927935,15: +e#72057594037927935,15: . a-b#3 b-c#3 @@ -481,11 +492,15 @@ first next next next +next +next tombstones ---- -a#3,15:d -b#2,15:e -c#1,15:f +a#72057594037927935,15: +b#72057594037927935,15: +c#72057594037927935,15: +d#72057594037927935,15: +e#72057594037927935,15: . a-b#3 b-c#3 @@ -501,11 +516,15 @@ first next next next +next +next tombstones ---- -a#3,15:d -b#2,15:e -c#1,15:f +a#72057594037927935,15: +b#72057594037927935,15: +c#72057594037927935,15: +d#72057594037927935,15: +e#72057594037927935,15: . a-b#3 b-c#3 @@ -521,11 +540,15 @@ first next next next +next +next tombstones ---- -a#3,15:d -b#2,15:e -c#1,15:f +a#72057594037927935,15: +b#72057594037927935,15: +c#72057594037927935,15: +d#72057594037927935,15: +e#72057594037927935,15: . a-b#3 b-c#3 @@ -551,7 +574,7 @@ tombstones f next tombstones ---- -a#10,15:k +a#72057594037927935,15: f#9,1:f a-f#10 . @@ -572,7 +595,7 @@ tombstones f next tombstones ---- -f#10,15:k +f#72057594037927935,15: f#9,1:f . f#8,1:f @@ -589,17 +612,21 @@ d.SET.4:d iter first next +tombstones c +next +next next next -tombstones c tombstones ---- a#1,1:a -b#2,15:d -c#3,15:e -d#4,1:d +b#72057594037927935,15: b-c#2 . +c#72057594037927935,15: +d#72057594037927935,15: +d#4,1:d +. c-d#3 d-e#3 . @@ -607,17 +634,21 @@ d-e#3 iter snapshots=3 first next +tombstones c +next +next next next -tombstones c tombstones ---- a#1,1:a -b#2,15:d -c#3,15:e -d#4,1:d +b#72057594037927935,15: b-c#2 . +c#72057594037927935,15: +d#72057594037927935,15: +d#4,1:d +. c-d#3 c-d#2 d-e#3 @@ -637,7 +668,7 @@ tombstones c tombstones ---- a#1,1:a -b#2,15:d +b#72057594037927935,15: c#4,1:d b-c#2 . @@ -658,7 +689,7 @@ next next next ---- -a#2,15:d +a#72057594037927935,15: a#2,1:a b#2,1:b c#2,1:c @@ -979,8 +1010,8 @@ next next tombstones ---- +a#72057594037927935,15: a#3,7: -a#2,15:c . a-c#2 . @@ -998,7 +1029,7 @@ next next tombstones ---- -a#3,15:d +a#72057594037927935,15: d#2,0: . a-d#3 @@ -1010,7 +1041,7 @@ next next next ---- -a#3,15:d +a#72057594037927935,15: a#2,0: d#2,0: . @@ -1021,7 +1052,7 @@ next next next ---- -a#3,15:d +a#72057594037927935,15: a#1,1:a d#2,0: . @@ -1031,7 +1062,7 @@ first next next ---- -a#3,15:d +a#72057594037927935,15: d#2,0: . @@ -1049,7 +1080,7 @@ tombstones ---- a#2,2:a . -b#1,15:c +b#72057594037927935,15: . b-c#1 . @@ -1064,12 +1095,10 @@ iter allow-zero-seqnum=true first next next -next tombstones ---- -a#2,2:v2 -a#1,15:b -a#0,2:v1 +a#72057594037927935,15: +a#0,2:v1v2 . a-b#1 . @@ -1135,7 +1164,7 @@ first next next ---- -a#3,15:c +a#72057594037927935,15: b#5,2:5 . @@ -1144,7 +1173,7 @@ first next next ---- -a#3,15:c +a#72057594037927935,15: b#0,2:5 . @@ -1153,7 +1182,7 @@ first next next ---- -a#3,15:c +a#72057594037927935,15: b#5,2:5 b#1,2:1 @@ -1169,7 +1198,7 @@ first next next ---- -a#3,15:c +a#72057594037927935,15: b#5,2:5 . @@ -1178,7 +1207,7 @@ first next next ---- -a#3,15:c +a#72057594037927935,15: b#5,2:5 b#1,2:1 @@ -1291,6 +1320,7 @@ a#1,0: # SETWITHDEL-eligible entries at or under a RANGEDEL at the same user key should # be skipped. + define a.SET.3:c a.RANGEDEL.2:z @@ -1303,8 +1333,8 @@ first next next ---- -a#0,18:c -a#2,15:z +a#72057594037927935,15: +a#0,1:c . iter allow-zero-seqnum=true snapshots=3 @@ -1313,8 +1343,8 @@ next next next ---- +a#72057594037927935,15: a#3,1:c -a#2,15:z a#0,1:b . @@ -1324,8 +1354,8 @@ next next next ---- -a#3,18:c -a#2,15:z +a#72057594037927935,15: +a#3,1:c a#1,0: . @@ -1341,8 +1371,8 @@ first next next ---- -a#4,18:c -a#3,15:z +a#72057594037927935,15: +a#4,1:c . # Invalid keys are emitted under SETWITHDEL. @@ -1354,9 +1384,7 @@ a.INVALID.1: iter first -next ---- -a#2,18:b err=invalid internal key kind: INVALID define @@ -1367,9 +1395,7 @@ a.SET.1:b iter first -next ---- -a#3,18:c err=invalid internal key kind: INVALID # SINGLEDEL that meets a SETWITHDEL is transformed into a DEL.