From a860bbad07c19bb9ab314138b7ed23d14a2a610d Mon Sep 17 00:00:00 2001 From: Jackson Owens Date: Mon, 24 Jan 2022 13:56:56 -0500 Subject: [PATCH] compaction: don't split outputs within a user key During a compaction, if the current sstable hits the file size limit, defer finishing the sstable if the next sstable would share a user key. This is the current behavior of flushes, and this change brings parity between the two. This change is motivated by introduction of range keys (see #1339). This ensures we can always cleanly truncate range keys that span range-key boundaries. This commit also removes (keyspan.Fragmenter).FlushTo. Now that we prohibit splitting sstables in the middle of a user key, the Fragmenter's FlushTo function is unnecessary. Compactions and flushes always use the TruncateAndFlushTo variant. This change required a tweak to the way grandparent limits are applied, in order to switch the grandparent splitter's comparison into a >= comparsion. This was necessary due to the shift in interpreting `splitterSuggestion`s as exclusive boundaries. Close #734. --- compaction.go | 330 ++++-------------- compaction_iter.go | 9 +- compaction_iter_test.go | 2 +- compaction_picker.go | 5 + docs/range_deletions.md | 4 + internal/keyspan/fragmenter.go | 85 +---- internal/keyspan/fragmenter_test.go | 33 +- internal/keyspan/testdata/fragmenter_flush_to | 107 ------ internal/keyspan/testdata/fragmenter_values | 23 -- range_del_test.go | 37 +- testdata/compaction_find_grandparent_limit | 42 ++- testdata/compaction_iter | 8 +- testdata/compaction_iter_set_with_del | 8 +- testdata/manual_compaction | 55 ++- testdata/manual_compaction_set_with_del | 56 ++- 15 files changed, 180 insertions(+), 624 deletions(-) delete mode 100644 internal/keyspan/testdata/fragmenter_flush_to diff --git a/compaction.go b/compaction.go index ae09ea09ab..fd74a361b0 100644 --- a/compaction.go +++ b/compaction.go @@ -145,124 +145,30 @@ func (f *fileSizeSplitter) onNewOutput(key *InternalKey) []byte { return nil } -type grandparentLimitSplitter struct { - c *compaction - ve *versionEdit - limit []byte +type limitFuncSplitter struct { + c *compaction + limitFunc func(userKey []byte) []byte + limit []byte } -func (g *grandparentLimitSplitter) shouldSplitBefore( +func (lf *limitFuncSplitter) shouldSplitBefore( key *InternalKey, tw *sstable.Writer, ) compactionSplitSuggestion { - if g.limit != nil && g.c.cmp(key.UserKey, g.limit) > 0 { + // NB: The limit must be applied using >= since lf.limit may be used as the + // `splitterSuggestion` ultimately passed to `compactionIter.Tombstones` to + // serve as an *exclusive* end boundary truncation point. If we used > then, + // we may have already added a key with the user key `lf.limit` to the + // previous sstable. + if lf.limit != nil && lf.c.cmp(key.UserKey, lf.limit) >= 0 { return splitNow } return noSplit } -func (g *grandparentLimitSplitter) onNewOutput(key *InternalKey) []byte { - g.limit = nil - if g.c.rangeDelFrag.Empty() || len(g.ve.NewFiles) == 0 { - // In this case, `limit` will be a larger user key than `key.UserKey`, or - // nil. In either case, the inner loop will execute at least once to - // process `key`, and the input iterator will be advanced. - if key != nil { - g.limit = g.c.findGrandparentLimit(key.UserKey) - } else { // !c.rangeDelFrag.Empty() - g.limit = g.c.findGrandparentLimit(g.c.rangeDelFrag.Start()) - } - } else { - // There is a range tombstone spanning from the last file into the - // current one. Therefore this file's smallest boundary will overlap the - // last file's largest boundary. - // - // In this case, `limit` will be a larger user key than the previous - // file's largest key and correspond to a grandparent file's largest user - // key, or nil. Then, it is possible the inner loop executes zero times, - // and the output file contains only range tombstones. That is fine as - // long as the number of times we execute this case is bounded. Since - // `findGrandparentLimit()` returns a strictly larger user key each time - // and it corresponds to a grandparent file largest key, the number of - // times this case can execute is bounded by the number of grandparent - // files (plus one for the final time it returns nil). - // - // n > 0 since we cannot have seen range tombstones at the - // beginning of the first file. - n := len(g.ve.NewFiles) - g.limit = g.c.findGrandparentLimit(g.ve.NewFiles[n-1].Meta.Largest.UserKey) - // This conditional is necessary to maintain the invariant that - // successive calls to finishOutput() have an increasing - // limit, and that the fragmenter's *FlushTo() and Add() calls are - // always made with successively increasing user keys. It's possible - // for the last file's Meta.Largest.UserKey - // to be substantially less than the last key returned by - // the compactionIter, such as in a sequence of consecutive - // rangedel tombstones, of which some but not all get elided. - // Consider this example: - // - // Compaction input (all range tombstones in this snippet): - // a-b - // c-d - // e-f (elided) - // g-h (elided) <-- grandparent limit before this (at ff) - // i-j (elided) - // k-q <-- grandparent limit at k - // m-q - // - // Note that elided tombstones are added to the fragmenter, but - // removed before they make their way from the fragmenter onto - // iter.tombstones. They still affect the fragmenter's internal - // tracking of the key up to which all tombstones have been flushed. - // After the first output is cut with limit = g, the fragmenter - // is empty, so the next grandparent calculation happens with - // key = g, and returns k. We continue adding range tombstones to - // the fragmenter between [g,k], which includes k-q as we only - // switch outputs after the limit is exceeded. So key = m and - // limit = k when finishOutput is called. Since the start key in - // the fragmenter (k) matches the limit, and no point keys were - // added, we actually don't produce a new output (see the - // conditional in finishOutput() on why). - // - // When we try to calculate the next grandparent limit, since - // c.rangeDelFrag.Empty() == false (as k-q is sitting in it), - // we fall into this case, and use the end key of the last written - // sstable (which was [a-d]) to calculate the grandparent limit. - // That gets us g again, which gets us to call - // c.rangeDelFrag.FlushTo(g), which violates the - // invariant that the current flush-to key (g) be greater than the - // last one (k). - // - // To solve this, if the grandparent limit falls "in between" - // ve.NewFiles[n-1].Meta.Largest.UserKey and c.rangeDelFrag.Start(), - // re-calculate a higher limit ahead of that key. - if g.c.rangeDelFrag.Start() != nil && g.c.cmp(g.limit, g.c.rangeDelFrag.Start()) <= 0 { - g.limit = g.c.findGrandparentLimit(g.c.rangeDelFrag.Start()) - } - } - return g.limit -} - -type l0LimitSplitter struct { - c *compaction - ve *versionEdit - limit []byte -} - -func (l *l0LimitSplitter) shouldSplitBefore( - key *InternalKey, tw *sstable.Writer, -) compactionSplitSuggestion { - if l.limit != nil && l.c.cmp(key.UserKey, l.limit) > 0 { - return splitNow - } - return noSplit -} - -func (l *l0LimitSplitter) onNewOutput(key *InternalKey) []byte { - l.limit = nil - // For flushes being split across multiple sstables, call - // findL0Limit to find the next L0 limit. +func (lf *limitFuncSplitter) onNewOutput(key *InternalKey) []byte { + lf.limit = nil if key != nil { - l.limit = l.c.findL0Limit(key.UserKey) + lf.limit = lf.limitFunc(key.UserKey) } else { // Use the start key of the first pending tombstone to find the // next limit. All pending tombstones have the same start key. @@ -279,12 +185,11 @@ func (l *l0LimitSplitter) onNewOutput(key *InternalKey) []byte { // and finishOutput() would not advance any further because // the next range tombstone to write does not start until after // the L0 split point. - startKey := l.c.rangeDelFrag.Start() - if startKey != nil { - l.limit = l.c.findL0Limit(startKey) + if startKey := lf.c.rangeDelFrag.Start(); startKey != nil { + lf.limit = lf.limitFunc(startKey) } } - return l.limit + return lf.limit } // splitterGroup is a compactionOutputSplitter that splits whenever one of its @@ -858,14 +763,6 @@ func seekGT(iter *manifest.LevelIterator, cmp base.Compare, key []byte) *manifes // current heuristic stops output of a table if the addition of another key // would cause the table to overlap more than 10x the target file size at // level N. See maxGrandparentOverlapBytes. -// -// TODO(peter): Stopping compaction output in the middle of a user-key creates -// 2 sstables that need to be compacted together as an "atomic compaction -// unit". This is unfortunate as it removes the benefit of stopping output to -// an sstable in order to prevent a large compaction with the next level. Seems -// better to adjust findGrandparentLimit to not stop output in the middle of a -// user-key. Perhaps this isn't a problem if the compaction picking heuristics -// always pick the right (older) sibling for compaction first. func (c *compaction) findGrandparentLimit(start []byte) []byte { iter := c.grandparents.Iter() var overlappedBytes uint64 @@ -874,8 +771,8 @@ func (c *compaction) findGrandparentLimit(start []byte) []byte { // To ensure forward progress we always return a larger user // key than where we started. See comments above clients of // this function for how this is used. - if overlappedBytes > c.maxOverlapBytes && c.cmp(start, f.Largest.UserKey) < 0 { - return f.Largest.UserKey + if overlappedBytes > c.maxOverlapBytes && c.cmp(start, f.Smallest.UserKey) < 0 { + return f.Smallest.UserKey } } return nil @@ -1060,13 +957,15 @@ func (c *compaction) newInputIter(newIters tableNewIters) (_ internalIterator, r c.closers = append(c.closers, rangeDelIter) rangeDelIter = noCloseIter{rangeDelIter} - // Truncate the range tombstones returned by the iterator to the upper - // bound of the atomic compaction unit. Note that we need do this - // truncation at read time in order to handle RocksDB generated sstables - // which do not truncate range tombstones to atomic compaction unit - // boundaries at write time. Because we're doing the truncation at read - // time, we follow RocksDB's lead and do not truncate tombstones to - // atomic unit boundaries at compaction time. + // Truncate the range tombstones returned by the iterator to the + // upper bound of the atomic compaction unit. Note that we need do + // this truncation at read time in order to handle sstables + // generated by RocksDB and earlier versions of Pebble which do not + // truncate range tombstones to atomic compaction unit boundaries at + // write time. + // + // The current Pebble compaction logic DOES truncate tombstones to + // atomic unit boundaries at compaction time too. atomicUnit, _ := expandToAtomicUnit(c.cmp, f.Slice(), true /* disableIsCompacting */) lowerBound, upperBound := manifest.KeyRange(c.cmp, atomicUnit.Iter()) // Range deletion tombstones are often written to sstables @@ -2213,7 +2112,7 @@ func (d *DB) runCompaction( // compactionIter.Tombstones via keyspan.Fragmenter.FlushTo, and by the // WriterMetadata.LargestRangeDel.UserKey. splitKey = append([]byte(nil), splitKey...) - for _, v := range iter.Tombstones(splitKey, splitL0Outputs) { + for _, v := range iter.Tombstones(splitKey) { if tw == nil { if err := newOutput(); err != nil { return err @@ -2280,10 +2179,9 @@ func (d *DB) runCompaction( outputMetrics.Size += int64(meta.Size) outputMetrics.NumFiles++ - // The handling of range boundaries is a bit complicated. if n := len(ve.NewFiles); n > 1 { - // This is not the first output file. Bound the smallest range key by the - // previous tables largest key. + // This is not the first output file. Ensure the sstable boundaries + // are nonoverlapping. prevMeta := ve.NewFiles[n-2].Meta if writerMeta.SmallestRangeDel.UserKey != nil { c := d.cmp(writerMeta.SmallestRangeDel.UserKey, prevMeta.Largest.UserKey) @@ -2292,57 +2190,30 @@ func (d *DB) runCompaction( "pebble: smallest range tombstone start key is less than previous sstable largest key: %s < %s", writerMeta.SmallestRangeDel.Pretty(d.opts.Comparer.FormatKey), prevMeta.Largest.Pretty(d.opts.Comparer.FormatKey)) - } - if c == 0 && prevMeta.Largest.SeqNum() <= writerMeta.SmallestRangeDel.SeqNum() { - // The user key portion of the range boundary start key is equal to - // the previous table's largest key. We need the tables to be - // key-space partitioned, so force the boundary to a key that we know - // is larger than the previous table's largest key. - if prevMeta.Largest.SeqNum() == 0 { - // If the seqnum of the previous table's largest key is 0, we can't - // decrement it. This should never happen as we take care in the - // main compaction loop to avoid generating an sstable with a - // largest key containing a zero seqnum. - return errors.Errorf( - "pebble: previous sstable largest key unexpectedly has 0 seqnum: %s", - prevMeta.Largest.Pretty(d.opts.Comparer.FormatKey)) - } - // TODO(peter): Technically, this produces a small gap with the - // previous sstable. The largest key of the previous table may be - // b#5.SET. The smallest key of the new sstable may then become - // b#4.RANGEDEL even though the tombstone is [b,z)#6. If iteration - // ever precisely truncates sstable boundaries, the key b#5.DEL at - // a lower level could slip through. Note that this can't ever - // actually happen, though, because the only way for two records to - // have the same seqnum is via ingestion. And even if it did happen - // revealing a deletion tombstone is not problematic. Slightly more - // worrisome is the combination of b#5.MERGE, b#5.SET and - // b#4.RANGEDEL, but we can't ever see b#5.MERGE and b#5.SET at - // different levels in the tree due to the ingestion argument and - // atomic compaction units. - // - // TODO(sumeer): Incorporate the the comment in - // https://github.com/cockroachdb/pebble/pull/479#pullrequestreview-340600654 - // into docs/range_deletions.md and reference the correctness - // argument here. Note that that comment might be slightly incorrect. - writerMeta.SmallestRangeDel.SetSeqNum(prevMeta.Largest.SeqNum() - 1) + } else if c == 0 && !prevMeta.Largest.IsExclusiveSentinel() { + // The user key portion of the range boundary start key is + // equal to the previous table's largest key user key, and + // the previous table's largest key is not exclusive. This + // violates the invariant that tables are key-space + // partitioned. + return errors.Errorf( + "pebble: invariant violation: previous sstable largest key %s, current sstable smallest rangedel: %s", + prevMeta.Largest.Pretty(d.opts.Comparer.FormatKey), + writerMeta.SmallestRangeDel.Pretty(d.opts.Comparer.FormatKey), + ) } } } - if splitKey != nil && writerMeta.LargestRangeDel.UserKey != nil { - // The current file is not the last output file and there is a range tombstone in it. - // If the tombstone extends into the next file, then truncate it for the purposes of - // computing meta.Largest. For example, say the next file's first key is c#7,1 and the - // current file's last key is c#10,1 and the current file has a range tombstone - // [b, d)#12,15. For purposes of the bounds we pretend that the range tombstone ends at - // c#inf where inf is the InternalKeyRangeDeleteSentinel. Note that this is just for - // purposes of bounds computation -- the current sstable will end up with a Largest key - // of c#7,1 so the range tombstone in the current file will be able to delete c#7. - if d.cmp(writerMeta.LargestRangeDel.UserKey, splitKey) >= 0 { - writerMeta.LargestRangeDel.UserKey = splitKey - writerMeta.LargestRangeDel.Trailer = InternalKeyRangeDeleteSentinel - } + // Verify that all range deletions outputted to the sstable are + // truncated to split key. + if splitKey != nil && writerMeta.LargestRangeDel.UserKey != nil && + d.cmp(writerMeta.LargestRangeDel.UserKey, splitKey) > 0 { + return errors.Errorf( + "pebble: invariant violation: rangedel largest key %q extends beyond split key %q", + writerMeta.LargestRangeDel.Pretty(d.opts.Comparer.FormatKey), + d.opts.Comparer.FormatKey(splitKey), + ) } meta.Smallest = writerMeta.Smallest(d.cmp) @@ -2373,12 +2244,10 @@ func (d *DB) runCompaction( c.largest.Pretty(d.opts.Comparer.FormatKey)) } } - // Verify that when splitting an output to L0, we never split different - // revisions of the same user key across two different sstables. - if splitL0Outputs { - if err := c.errorOnUserKeyOverlap(ve); err != nil { - return err - } + // Verify that we never split different revisions of the same user key + // across two different sstables. + if err := c.errorOnUserKeyOverlap(ve); err != nil { + return err } if err := meta.Validate(d.cmp, d.opts.Comparer.FormatKey); err != nil { return err @@ -2392,24 +2261,14 @@ func (d *DB) runCompaction( // the splitterGroup can be composed of multiple splitters. In this case, // we start off with splitters for file sizes, grandparent limits, and (for // L0 splits) L0 limits, before wrapping them in an splitterGroup. - // - // There is a complication here: We may not be able to switch SSTables right - // away when we are splitting an L0 output. We do not split the same user - // key across different sstables within one flush or intra-L0 compaction, so - // the userKeyChangeSplitter ensures we are at a user key change boundary - // when doing a split. outputSplitters := []compactionOutputSplitter{ - &fileSizeSplitter{maxFileSize: c.maxOutputFileSize}, - &grandparentLimitSplitter{c: c, ve: ve}, - } - var splitter compactionOutputSplitter - if splitL0Outputs { - // outputSplitters[0] is the file size splitter, which doesn't guarantee - // that all advised splits will be at user key change boundaries. Wrap - // it in a userKeyChangeSplitter. - outputSplitters[0] = &userKeyChangeSplitter{ + // We do not split the same user key across different sstables within + // one flush or compaction. The fileSizeSplitter may request a split in + // the middle of a user key, so the userKeyChangeSplitter ensures we are + // at a user key change boundary when doing a split. + &userKeyChangeSplitter{ cmp: c.cmp, - splitter: outputSplitters[0], + splitter: &fileSizeSplitter{maxFileSize: c.maxOutputFileSize}, unsafePrevUserKey: func() []byte { // Return the largest point key written to tw or the start of // the current range deletion in the fragmenter, whichever is @@ -2420,13 +2279,13 @@ func (d *DB) runCompaction( } return c.rangeDelFrag.Start() }, - } - outputSplitters = append(outputSplitters, &l0LimitSplitter{c: c, ve: ve}) + }, + &limitFuncSplitter{c: c, limitFunc: c.findGrandparentLimit}, } - splitter = &splitterGroup{ - cmp: c.cmp, - splitters: outputSplitters, + if splitL0Outputs { + outputSplitters = append(outputSplitters, &limitFuncSplitter{c: c, limitFunc: c.findL0Limit}) } + splitter := &splitterGroup{cmp: c.cmp, splitters: outputSplitters} // NB: we avoid calling maybeThrottle on a nilPacer because the cost of // dynamic dispatch in the hot loop below is pronounced in CPU profiles (see @@ -2448,7 +2307,6 @@ func (d *DB) runCompaction( splitterSuggestion := splitter.onNewOutput(key) // Each inner loop iteration processes one key from the input iterator. - prevPointSeqNum := InternalKeySeqNumMax for ; key != nil; key, val = iter.Next() { if split := splitter.shouldSplitBefore(key, tw); split == splitNow { break @@ -2476,21 +2334,11 @@ func (d *DB) runCompaction( if err := tw.Add(*key, val); err != nil { return nil, pendingOutputs, err } - prevPointSeqNum = key.SeqNum() } // A splitter requested a split, and we're ready to finish the output. // We need to choose the key at which to split any pending range // tombstones. - // - // There's a complication here. We need to ensure that for a user key - // k we never end up with one output's largest key as k#0 and the - // next output's smallest key as k#RANGEDEL,#x where x > 0. This is a - // problem because k#RANGEDEL,#x sorts before k#0. Normally, we just - // adjust the seqnum of the next output's smallest boundary to be - // less, but that's not possible with the zero seqnum. We can avoid - // this case with careful picking of where to split pending range - // tombstones. var splitKey []byte switch { case key != nil: @@ -2504,50 +2352,10 @@ func (d *DB) runCompaction( // Splitting at the current key will cause this output to overlap // a potentially unbounded number of grandparents. splitKey = key.UserKey - case key == nil && splitL0Outputs: - // We ran out of keys with flush splits enabled. Set splitKey to - // nil so all range tombstones get flushed in the current sstable. - // Consider this example: - // - // a.SET.4 - // d.MERGE.5 - // d.RANGEDEL.3:f - // (no more keys remaining) - // - // Where d is a flush split key (i.e. splitterSuggestion = 'd'). - // Since d.MERGE.5 has already been written to this output by this - // point (as it's <= splitterSuggestion), and outputs written to L0 - // cannot have user keys split across multiple sstables, we have to - // set splitKey to a key greater than 'd' to ensure the range - // deletion also gets flushed. Setting the splitKey to nil is the - // simplest way to ensure that. - // - // TODO(jackson): This case is only problematic if splitKey equals - // the user key of the last point key added. We don't need to - // flush *all* range tombstones to the current sstable. We could - // flush up to the next grandparent limit greater than - // `splitterSuggestion` instead. - splitKey = nil - case key == nil && prevPointSeqNum != 0: - // The last key added did not have a zero sequence number, so - // we'll always be able to adjust the next table's smallest key. - // NB: Because of the splitter's `onNewOutput` contract, - // `splitterSuggestion` must be >= any key previously added to the - // current output sstable. + case key == nil: + // NB: Because of the userKeyChangeSplitter, `splitterSuggestion` + // must be > any key previously added to the current output sstable. splitKey = splitterSuggestion - case key == nil && prevPointSeqNum == 0: - // The last key added did have a zero sequence number. The - // splitters' suggested split point might have the same user key, - // which would cause the next output to have an unadjustable - // smallest key. To prevent that, we ignore the splitter's - // suggestion, leaving splitKey nil to flush all pending range - // tombstones. - // TODO(jackson): This case is only problematic if splitKey equals - // the user key of the last point key added. We don't need to - // flush *all* range tombstones to the current sstable. We could - // flush up to the next grandparent limit greater than - // `splitterSuggestion` instead. - splitKey = nil default: return nil, nil, errors.New("pebble: not reached") } diff --git a/compaction_iter.go b/compaction_iter.go index 9ec825fd50..e7747fd4d6 100644 --- a/compaction_iter.go +++ b/compaction_iter.go @@ -770,17 +770,14 @@ func (i *compactionIter) Close() error { // exclude specifies if the specified key is exclusive or inclusive. // When exclude = true, all returned range tombstones are truncated to the // specified key. -func (i *compactionIter) Tombstones(key []byte, exclude bool) []keyspan.Span { - switch { - case key == nil: +func (i *compactionIter) Tombstones(key []byte) []keyspan.Span { + if key == nil { i.rangeDelFrag.Finish() - case exclude: + } else { // The specified end key is exclusive; no versions of the specified // user key (including range tombstones covering that key) should // be flushed yet. i.rangeDelFrag.TruncateAndFlushTo(key) - default: - i.rangeDelFrag.FlushTo(key) } tombstones := i.tombstones i.tombstones = nil diff --git a/compaction_iter_test.go b/compaction_iter_test.go index 918b7560da..02b604872d 100644 --- a/compaction_iter_test.go +++ b/compaction_iter_test.go @@ -192,7 +192,7 @@ func TestCompactionIter(t *testing.T) { if len(parts) == 2 { key = []byte(parts[1]) } - for _, v := range iter.Tombstones(key, false) { + for _, v := range iter.Tombstones(key) { fmt.Fprintf(&b, "%s-%s#%d\n", v.Start.UserKey, v.End, v.Start.SeqNum()) } diff --git a/compaction_picker.go b/compaction_picker.go index 6a3cd47726..4155477db4 100644 --- a/compaction_picker.go +++ b/compaction_picker.go @@ -374,6 +374,11 @@ func (pc *pickedCompaction) grow(sm, la InternalKey, maxExpandedBytes uint64) bo // disableIsCompacting is true, isCompacting always returns false. This helps // avoid spurious races from being detected when this method is used outside // of compaction picking code. +// +// TODO(jackson): Compactions and flushes no longer split a user key between two +// sstables. We could perform a migration, re-compacting any sstables with split +// user keys, which would allow us to remove atomic compaction unit expansion +// code. func expandToAtomicUnit( cmp Compare, inputs manifest.LevelSlice, disableIsCompacting bool, ) (slice manifest.LevelSlice, isCompacting bool) { diff --git a/docs/range_deletions.md b/docs/range_deletions.md index e1c9ac6564..19b8f06a38 100644 --- a/docs/range_deletions.md +++ b/docs/range_deletions.md @@ -1,5 +1,9 @@ # Range Deletions +TODO: The following explanation of range deletions does not take into account +the recent change to prohibit splitting of a user key between sstables. This +change simplifies the logic, removing 'improperly truncated range tombstones.' + TODO: The following explanation of range deletions ignores the kind/trailer that appears at the end of keys after the sequence number. This should be harmless but need to add a justification on why diff --git a/internal/keyspan/fragmenter.go b/internal/keyspan/fragmenter.go index 76e5319443..276709a146 100644 --- a/internal/keyspan/fragmenter.go +++ b/internal/keyspan/fragmenter.go @@ -88,8 +88,8 @@ type Fragmenter struct { // flushed. All pending spans have the same Start.UserKey. pending []Span // doneBuf is used to buffer completed span fragments when flushing to a - // specific key (e.g. FlushTo). It is cached in the Fragmenter to allow - // reuse. + // specific key (e.g. TruncateAndFlushTo). It is cached in the Fragmenter to + // allow reuse. doneBuf []Span // sortBuf is used to sort fragments by end key when flushing. sortBuf spansByEndKey @@ -266,88 +266,15 @@ func (f *Fragmenter) Empty() bool { return f.finished || len(f.pending) == 0 } -// FlushTo flushes all of the fragments with a start key <= key. Used during -// compaction to force emitting of spans which straddle an sstable boundary. -// Note that the emitted spans are not truncated to the specified key. Consider +// TruncateAndFlushTo flushes all of the fragments with a start key <= key, +// truncating spans to the specified end key. Used during compaction to force +// emitting of spans which straddle an sstable boundary. Consider // the scenario: // // a---------k#10 // f#8 // f#7 -// -// If the compaction logic splits f#8 and f#7 into different sstables, we can't -// truncate the span [a,k) at f. Doing so could produce an sstable with the -// records: -// -// a----f#10 -// f#8 -// -// The span [a,f) does not cover the key f. -func (f *Fragmenter) FlushTo(key []byte) { - if f.finished { - panic("pebble: span fragmenter already finished") - } - - if f.flushedKey != nil { - switch c := f.Cmp(key, f.flushedKey); { - case c < 0: - panic(fmt.Sprintf("pebble: flush-to key (%s) < flushed key (%s)", - f.Format(key), f.Format(f.flushedKey))) - } - } - f.flushedKey = append(f.flushedKey[:0], key...) - - if len(f.pending) > 0 { - // Since all of the pending spans have the same start key, we only need - // to compare against the first one. - switch c := f.Cmp(f.pending[0].Start.UserKey, key); { - case c > 0: - panic(fmt.Sprintf("pebble: keys must be in order: %s > %s", - f.Format(f.pending[0].Start.UserKey), f.Format(key))) - } - // Note that we explicitly do not return early here if Start.UserKey == - // key. Similar to the scenario described above, consider: - // - // f----k#10 - // f#8 - // f#7 - // - // If the compaction logic splits f#8 and f#7 into different sstables, we - // have to emit the span [f,k) in both sstables. - } - - // At this point we know that the new start key is greater than the pending - // spans start keys. We flush all span fragments with a start key - // <= key. - f.flush(f.pending, key) - - // Truncate the pending spans to start with key, filtering any which - // would become empty. - pending := f.pending - f.pending = f.pending[:0] - for _, s := range pending { - if f.Cmp(key, s.End) < 0 { - // s: a--+--e - // new: c------ - f.pending = append(f.pending, Span{ - Start: base.MakeInternalKey(key, s.Start.SeqNum(), s.Start.Kind()), - End: s.End, - Value: s.Value, - }) - } - } -} - -// TruncateAndFlushTo is similar to FlushTo, except it also truncates spans to -// the specified end key by calling truncateAndFlush. Only called in compactions -// where we can guarantee that all versions of UserKeys < key have been written, -// or in other words, where we can ensure we don't split a user key across two -// sstables. Going back to the scenario from above: -// -// a---------k#10 -// f#8 -// f#7 -// +/// // Let's say the next user key after f is g. Calling TruncateAndFlushTo(g) will // flush this span: // diff --git a/internal/keyspan/fragmenter_test.go b/internal/keyspan/fragmenter_test.go index 16782b221f..86d9cfb3d5 100644 --- a/internal/keyspan/fragmenter_test.go +++ b/internal/keyspan/fragmenter_test.go @@ -45,14 +45,7 @@ func buildSpans( }, } for _, line := range strings.Split(s, "\n") { - if strings.HasPrefix(line, "flush-to ") { - parts := strings.Split(line, " ") - if len(parts) != 2 { - t.Fatalf("expected 2 components, but found %d: %s", len(parts), line) - } - f.FlushTo([]byte(parts[1])) - continue - } else if strings.HasPrefix(line, "truncate-and-flush-to ") { + if strings.HasPrefix(line, "truncate-and-flush-to ") { parts := strings.Split(line, " ") if len(parts) != 2 { t.Fatalf("expected 2 components, but found %d: %s", len(parts), line) @@ -204,30 +197,6 @@ func TestFragmenterDeleted(t *testing.T) { }) } -func TestFragmenterFlushTo(t *testing.T) { - cmp := base.DefaultComparer.Compare - fmtKey := base.DefaultComparer.FormatKey - - datadriven.RunTest(t, "testdata/fragmenter_flush_to", func(d *datadriven.TestData) string { - switch d.Cmd { - case "build": - return func() (result string) { - defer func() { - if r := recover(); r != nil { - result = fmt.Sprint(r) - } - }() - - spans := buildSpans(t, cmp, fmtKey, d.Input, base.InternalKeyKindRangeDelete) - return formatSpans(spans) - }() - - default: - return fmt.Sprintf("unknown command: %s", d.Cmd) - } - }) -} - func TestFragmenterTruncateAndFlushTo(t *testing.T) { cmp := base.DefaultComparer.Compare fmtKey := base.DefaultComparer.FormatKey diff --git a/internal/keyspan/testdata/fragmenter_flush_to b/internal/keyspan/testdata/fragmenter_flush_to deleted file mode 100644 index a4b935358a..0000000000 --- a/internal/keyspan/testdata/fragmenter_flush_to +++ /dev/null @@ -1,107 +0,0 @@ -build -2: a--c -flush-to c -1: b--d ----- -pebble: start key (b) < flushed key (c) - -build -flush-to c -1: b--d ----- -pebble: start key (b) < flushed key (c) - -build -flush-to c -flush-to b ----- -pebble: flush-to key (b) < flushed key (c) - -build -3: a--d -2: d--g -flush-to c ----- -pebble: flush-to key (c) < flushed key (d) - -# Note that the duplication of the fragments is expected. The flush-to -# is emitting tombstones to different sstables. - -build -3: a--d -flush-to a ----- -3: a--d -3: a--d - -build -3: a--d -2: d--g -flush-to d ----- -3: a--d -2: d--g -2: d--g - -build -2: a----f -flush-to c ----- -2: a----f -2: c--f - -build -2: a----f -flush-to f ----- -2: a----f - -build -3: a-c -1: a-----g -flush-to d ----- -3: a-c -1: a-c -1: c---g -1: d--g - -build -2: a---e -1: a------h -flush-to c ----- -2: a---e -1: a---e -2: c-e -1: c-e -1: e--h - -build -3: a-c -2: a---e -1: a-----g -flush-to d -3: d----i ----- -3: a-c -2: a-c -1: a-c -2: c-e -1: c-e -3: de -2: de -1: de -3: e-g -1: e-g -3: g-i - -build -3: a-c -2: a-----g -flush-to e ----- -3: a-c -2: a-c -2: c---g -2: e-g diff --git a/internal/keyspan/testdata/fragmenter_values b/internal/keyspan/testdata/fragmenter_values index 8e2e5fcbc8..7462ae8b4b 100644 --- a/internal/keyspan/testdata/fragmenter_values +++ b/internal/keyspan/testdata/fragmenter_values @@ -63,26 +63,3 @@ truncate-and-flush-to d 3: e-g orange 1: e-g coconut 3: g-i orange - -# NB: Unlike the above truncate-and-flush-to calls, a flush-to does not truncate -# the end boundary. In this case, the fragments beginning at `c` are not -# truncated to `d`, they're flushed with the bounadries formed by fragmentation -# (`e`) -build -3: a-c apple -2: a---e banana -1: a-----g coconut -flush-to d -3: d----i orange ----- -3: a-c apple -2: a-c banana -1: a-c coconut -2: c-e banana -1: c-e coconut -3: de orange -2: de banana -1: de coconut -3: e-g orange -1: e-g coconut -3: g-i orange diff --git a/range_del_test.go b/range_del_test.go index e2d7389c75..dc94a79388 100644 --- a/range_del_test.go +++ b/range_del_test.go @@ -267,20 +267,18 @@ func TestRangeDelCompactionTruncation(t *testing.T) { 1: 000008:[a#3,RANGEDEL-b#72057594037927935,RANGEDEL] 2: - 000012:[b#4,SET-b#4,SET] - 000013:[b#3,RANGEDEL-c#72057594037927935,RANGEDEL] + 000012:[b#4,SET-c#72057594037927935,RANGEDEL] 3: - 000014:[c#5,SET-d#72057594037927935,RANGEDEL] + 000013:[c#5,SET-d#72057594037927935,RANGEDEL] `) } else { expectLSM(` 1: 000008:[a#3,RANGEDEL-b#72057594037927935,RANGEDEL] 2: - 000012:[b#4,SETWITHDEL-b#4,SETWITHDEL] - 000013:[b#3,RANGEDEL-c#72057594037927935,RANGEDEL] + 000012:[b#4,SETWITHDEL-c#72057594037927935,RANGEDEL] 3: - 000014:[c#5,SET-d#72057594037927935,RANGEDEL] + 000013:[c#5,SET-d#72057594037927935,RANGEDEL] `) } @@ -359,17 +357,15 @@ func TestRangeDelCompactionTruncation2(t *testing.T) { require.NoError(t, d.Compact([]byte("b"), []byte("b\x00"))) expectLSM(` 6: - 000008:[a#3,RANGEDEL-b#2,SET] - 000009:[b#1,RANGEDEL-d#72057594037927935,RANGEDEL] + 000009:[a#3,RANGEDEL-d#72057594037927935,RANGEDEL] `) require.NoError(t, d.Set([]byte("c"), bytes.Repeat([]byte("d"), 100), nil)) require.NoError(t, d.Compact([]byte("c"), []byte("c\x00"))) expectLSM(` 6: - 000012:[a#3,RANGEDEL-b#2,SET] - 000013:[b#1,RANGEDEL-c#72057594037927935,RANGEDEL] - 000014:[c#4,SET-d#72057594037927935,RANGEDEL] + 000012:[a#3,RANGEDEL-c#72057594037927935,RANGEDEL] + 000013:[c#4,SET-d#72057594037927935,RANGEDEL] `) } @@ -435,8 +431,7 @@ func TestRangeDelCompactionTruncation3(t *testing.T) { } expectLSM(` 3: - 000012:[a#3,RANGEDEL-b#2,SET] - 000013:[b#1,RANGEDEL-d#72057594037927935,RANGEDEL] + 000009:[a#3,RANGEDEL-d#72057594037927935,RANGEDEL] `) require.NoError(t, d.Set([]byte("c"), bytes.Repeat([]byte("d"), 100), nil)) @@ -444,19 +439,18 @@ func TestRangeDelCompactionTruncation3(t *testing.T) { require.NoError(t, d.Compact([]byte("c"), []byte("c\x00"))) expectLSM(` 3: - 000017:[a#3,RANGEDEL-b#2,SET] - 000018:[b#1,RANGEDEL-c#72057594037927935,RANGEDEL] + 000013:[a#3,RANGEDEL-c#72057594037927935,RANGEDEL] 4: - 000019:[c#4,SET-d#72057594037927935,RANGEDEL] + 000014:[c#4,SET-d#72057594037927935,RANGEDEL] `) require.NoError(t, d.Compact([]byte("c"), []byte("c\x00"))) expectLSM(` 3: - 000017:[a#3,RANGEDEL-b#2,SET] - 000018:[b#1,RANGEDEL-c#72057594037927935,RANGEDEL] + 000013:[a#3,RANGEDEL-c#72057594037927935,RANGEDEL] 5: - 000019:[c#4,SET-d#72057594037927935,RANGEDEL]`) + 000014:[c#4,SET-d#72057594037927935,RANGEDEL] +`) if _, _, err := d.Get([]byte("b")); err != ErrNotFound { t.Fatalf("expected not found, but found %v", err) @@ -465,10 +459,9 @@ func TestRangeDelCompactionTruncation3(t *testing.T) { require.NoError(t, d.Compact([]byte("a"), []byte("a\x00"))) expectLSM(` 4: - 000020:[a#3,RANGEDEL-b#2,SET] - 000021:[b#1,RANGEDEL-c#72057594037927935,RANGEDEL] + 000013:[a#3,RANGEDEL-c#72057594037927935,RANGEDEL] 5: - 000019:[c#4,SET-d#72057594037927935,RANGEDEL] + 000014:[c#4,SET-d#72057594037927935,RANGEDEL] `) if v, _, err := d.Get([]byte("b")); err != ErrNotFound { diff --git a/testdata/compaction_find_grandparent_limit b/testdata/compaction_find_grandparent_limit index 6950848fe8..e1c88ba1c2 100644 --- a/testdata/compaction_find_grandparent_limit +++ b/testdata/compaction_find_grandparent_limit @@ -17,27 +17,29 @@ e-f 2 compact max-overlap=1 a b c d e f ---- -a-b -c-d -e-f +a-c +d-e +f-f compact max-overlap=2 a b c d e f ---- -a-d -e-f +a-c +d-e +f-f compact max-overlap=4 a b c d e f ---- -a-f +a-e +f-f compact max-overlap=1 b c d e f g h i j ---- -b-d -e-f -g-j +b-c +d-e +f-j compact max-overlap=1 a g h i j @@ -54,8 +56,8 @@ e-f compact max-overlap=1 c d e f ---- -c-d -e-f +c-e +f-f # Unequal size grandparents define @@ -67,13 +69,15 @@ e-f 3 compact max-overlap=1 a b c d e f ---- -a-d -e-f +a-c +d-e +f-f compact max-overlap=3 a b c d e f ---- -a-f +a-e +f-f # Unequal size grandparents define @@ -85,12 +89,12 @@ e-f 1 compact max-overlap=1 a b c d e f ---- -a-b -c-d -e-f +a-c +d-e +f-f compact max-overlap=3 a b c d e f ---- -a-d -e-f +a-c +d-f diff --git a/testdata/compaction_iter b/testdata/compaction_iter index 34bf1c9599..2f17e2bcff 100644 --- a/testdata/compaction_iter +++ b/testdata/compaction_iter @@ -553,7 +553,7 @@ tombstones ---- a#10,15:k f#9,1:f -a-k#10 +a-f#10 . f#8,1:f f-k#10 @@ -574,7 +574,6 @@ tombstones ---- f#10,15:k f#9,1:f -f-k#10 . f#8,1:f f-k#10 @@ -600,7 +599,6 @@ b#2,15:d c#3,15:e d#4,1:d b-c#2 -c-d#3 . c-d#3 d-e#3 @@ -619,8 +617,6 @@ b#2,15:d c#3,15:e d#4,1:d b-c#2 -c-d#3 -c-d#2 . c-d#3 c-d#2 @@ -643,7 +639,7 @@ tombstones a#1,1:a b#2,15:d c#4,1:d -b-d#2 +b-c#2 . c-d#2 . diff --git a/testdata/compaction_iter_set_with_del b/testdata/compaction_iter_set_with_del index da78252543..796984e0c7 100644 --- a/testdata/compaction_iter_set_with_del +++ b/testdata/compaction_iter_set_with_del @@ -553,7 +553,7 @@ tombstones ---- a#10,15:k f#9,1:f -a-k#10 +a-f#10 . f#8,1:f f-k#10 @@ -574,7 +574,6 @@ tombstones ---- f#10,15:k f#9,1:f -f-k#10 . f#8,1:f f-k#10 @@ -600,7 +599,6 @@ b#2,15:d c#3,15:e d#4,1:d b-c#2 -c-d#3 . c-d#3 d-e#3 @@ -619,8 +617,6 @@ b#2,15:d c#3,15:e d#4,1:d b-c#2 -c-d#3 -c-d#2 . c-d#3 c-d#2 @@ -643,7 +639,7 @@ tombstones a#1,1:a b#2,15:d c#4,1:d -b-d#2 +b-c#2 . c-d#2 . diff --git a/testdata/manual_compaction b/testdata/manual_compaction index d6111e3c17..a68d35f2f0 100644 --- a/testdata/manual_compaction +++ b/testdata/manual_compaction @@ -92,9 +92,8 @@ range-deletions-bytes-estimate: 1552 compact a-e L1 ---- 2: - 000008:[a#3,SET-b#72057594037927935,RANGEDEL] - 000009:[b#2,RANGEDEL-d#72057594037927935,RANGEDEL] - 000010:[d#2,RANGEDEL-e#72057594037927935,RANGEDEL] + 000008:[a#3,SET-c#72057594037927935,RANGEDEL] + 000009:[c#2,RANGEDEL-e#72057594037927935,RANGEDEL] 3: 000006:[a#0,SET-b#0,SET] 000007:[c#0,SET-d#0,SET] @@ -105,7 +104,7 @@ wait-pending-table-stats num-entries: 2 num-deletions: 1 point-deletions-bytes-estimate: 0 -range-deletions-bytes-estimate: 31 +range-deletions-bytes-estimate: 776 # Same as above, except range tombstone covers multiple grandparent file boundaries. @@ -140,9 +139,9 @@ L3 compact a-e L1 ---- 2: - 000010:[a#3,SET-b#72057594037927935,RANGEDEL] - 000011:[b#2,RANGEDEL-d#72057594037927935,RANGEDEL] - 000012:[d#2,RANGEDEL-f#72057594037927935,RANGEDEL] + 000010:[a#3,SET-c#72057594037927935,RANGEDEL] + 000011:[c#2,RANGEDEL-e#72057594037927935,RANGEDEL] + 000012:[e#2,RANGEDEL-f#72057594037927935,RANGEDEL] 000013:[f#2,RANGEDEL-g#72057594037927935,RANGEDEL] 3: 000006:[a#0,SET-b#0,SET] @@ -215,9 +214,8 @@ L3 compact a-e L1 ---- 2: - 000008:[a#3,RANGEDEL-b#72057594037927935,RANGEDEL] - 000009:[b#3,RANGEDEL-d#72057594037927935,RANGEDEL] - 000010:[d#3,RANGEDEL-e#72057594037927935,RANGEDEL] + 000008:[a#3,RANGEDEL-c#72057594037927935,RANGEDEL] + 000009:[c#3,RANGEDEL-e#72057594037927935,RANGEDEL] 3: 000006:[a#0,SET-b#0,SET] 000007:[c#0,SET-d#0,SET] @@ -417,13 +415,10 @@ c:4 a:3 . -# This is a similar scenario to the one above, but we've arranged for -# the compaction to produce an sstable (9) for which the smallest -# range tombstone overlaps in key space with the largest key in the -# previous sstable (8). This necessitates adjusting the seqnum of the -# range tombstone. There was no actual iteration bug from earlier -# behavior of setting the seqnum to 0, but now we set the seqnum to -# prev.LargestKey.SeqNum-1. +# This is a similar scenario to the one above. In older versions of Pebble this +# case necessitated adjusting the seqnum of the range tombstone to +# prev.LargestKey.SeqNum-1. We no longer allow user keys to be split across +# sstables, and the seqnum adjustment is no longer necessary. # # Note the target-file-size of 26 is specially tailored to get the # desired compaction output. @@ -446,8 +441,8 @@ L3 compact a-e L1 ---- 2: - 000007:[a#4,SET-b#2,SET] - 000008:[b#1,RANGEDEL-e#72057594037927935,RANGEDEL] + 000007:[a#4,SET-a#4,SET] + 000008:[b#3,RANGEDEL-e#72057594037927935,RANGEDEL] 3: 000006:[b#1,SET-b#1,SET] @@ -483,8 +478,8 @@ L3 compact a-e L1 ---- 2: - 000007:[a#4,SET-b#3,SET] - 000008:[b#2,RANGEDEL-e#72057594037927935,RANGEDEL] + 000007:[a#4,SET-a#4,SET] + 000008:[b#3,RANGEDEL-e#72057594037927935,RANGEDEL] 3: 000006:[b#1,SET-b#1,SET] @@ -524,8 +519,8 @@ L3 compact a-e L1 ---- 2: - 000007:[a#4,SET-b#4,SET] - 000008:[b#2,RANGEDEL-e#72057594037927935,RANGEDEL] + 000007:[a#4,SET-a#4,SET] + 000008:[b#4,SET-e#72057594037927935,RANGEDEL] 3: 000006:[b#1,SET-b#1,SET] @@ -545,10 +540,7 @@ a:4 . # Test a scenario where the last point key in an sstable has a seqnum -# of 0, and the smallest range tombstone in the next sstable starts -# with the same key. We have to tweak the kind of the boundary key in -# sstable 8 to be a DEL rather than a RANGEDEL so that the two tables -# are disjoint in the key space. +# of 0. define target-file-sizes=(1, 1, 26) snapshots=(2) L1 @@ -577,7 +569,8 @@ a:3 compact a-e L1 ---- 2: - 000007:[a#3,SET-e#72057594037927935,RANGEDEL] + 000007:[a#3,SET-c#72057594037927935,RANGEDEL] + 000008:[c#3,RANGEDEL-e#72057594037927935,RANGEDEL] 3: 000005:[a#2,RANGEDEL-b#72057594037927935,RANGEDEL] 000006:[c#0,SET-d#0,SET] @@ -602,12 +595,12 @@ L1 c.SET.1:1 d.SET.1:1 L3 - a.RANGEDEL.2:b + c.RANGEDEL.2:d ---- 1: 000004:[a#3,SET-e#72057594037927935,RANGEDEL] 3: - 000005:[a#2,RANGEDEL-b#72057594037927935,RANGEDEL] + 000005:[c#2,RANGEDEL-d#72057594037927935,RANGEDEL] compact a-f L1 ---- @@ -615,7 +608,7 @@ compact a-f L1 000006:[a#3,SET-c#72057594037927935,RANGEDEL] 000007:[c#3,RANGEDEL-e#72057594037927935,RANGEDEL] 3: - 000005:[a#2,RANGEDEL-b#72057594037927935,RANGEDEL] + 000005:[c#2,RANGEDEL-d#72057594037927935,RANGEDEL] # Test a scenario where we the last point key in an sstable has a # seqnum of 0, but there is another range tombstone later in the diff --git a/testdata/manual_compaction_set_with_del b/testdata/manual_compaction_set_with_del index e73e310266..7bd4ec9c12 100644 --- a/testdata/manual_compaction_set_with_del +++ b/testdata/manual_compaction_set_with_del @@ -92,9 +92,8 @@ range-deletions-bytes-estimate: 1552 compact a-e L1 ---- 2: - 000008:[a#3,SETWITHDEL-b#72057594037927935,RANGEDEL] - 000009:[b#2,RANGEDEL-d#72057594037927935,RANGEDEL] - 000010:[d#2,RANGEDEL-e#72057594037927935,RANGEDEL] + 000008:[a#3,SETWITHDEL-c#72057594037927935,RANGEDEL] + 000009:[c#2,RANGEDEL-e#72057594037927935,RANGEDEL] 3: 000006:[a#0,SET-b#0,SET] 000007:[c#0,SET-d#0,SET] @@ -105,7 +104,7 @@ wait-pending-table-stats num-entries: 2 num-deletions: 1 point-deletions-bytes-estimate: 0 -range-deletions-bytes-estimate: 31 +range-deletions-bytes-estimate: 776 # Same as above, except range tombstone covers multiple grandparent file boundaries. @@ -140,9 +139,9 @@ L3 compact a-e L1 ---- 2: - 000010:[a#3,SETWITHDEL-b#72057594037927935,RANGEDEL] - 000011:[b#2,RANGEDEL-d#72057594037927935,RANGEDEL] - 000012:[d#2,RANGEDEL-f#72057594037927935,RANGEDEL] + 000010:[a#3,SETWITHDEL-c#72057594037927935,RANGEDEL] + 000011:[c#2,RANGEDEL-e#72057594037927935,RANGEDEL] + 000012:[e#2,RANGEDEL-f#72057594037927935,RANGEDEL] 000013:[f#2,RANGEDEL-g#72057594037927935,RANGEDEL] 3: 000006:[a#0,SET-b#0,SET] @@ -215,9 +214,8 @@ L3 compact a-e L1 ---- 2: - 000008:[a#3,RANGEDEL-b#72057594037927935,RANGEDEL] - 000009:[b#3,RANGEDEL-d#72057594037927935,RANGEDEL] - 000010:[d#3,RANGEDEL-e#72057594037927935,RANGEDEL] + 000008:[a#3,RANGEDEL-c#72057594037927935,RANGEDEL] + 000009:[c#3,RANGEDEL-e#72057594037927935,RANGEDEL] 3: 000006:[a#0,SET-b#0,SET] 000007:[c#0,SET-d#0,SET] @@ -417,13 +415,10 @@ c:4 a:3 . -# This is a similar scenario to the one above, but we've arranged for -# the compaction to produce an sstable (9) for which the smallest -# range tombstone overlaps in key space with the largest key in the -# previous sstable (8). This necessitates adjusting the seqnum of the -# range tombstone. There was no actual iteration bug from earlier -# behavior of setting the seqnum to 0, but now we set the seqnum to -# prev.LargestKey.SeqNum-1. +# This is a similar scenario to the one above. In older versions of Pebble this +# case necessitated adjusting the seqnum of the range tombstone to +# prev.LargestKey.SeqNum-1. We no longer allow user keys to be split across +# sstables, and the seqnum adjustment is no longer necessary. # # Note the target-file-size of 26 is specially tailored to get the # desired compaction output. @@ -446,8 +441,8 @@ L3 compact a-e L1 ---- 2: - 000007:[a#4,SET-b#2,SET] - 000008:[b#1,RANGEDEL-e#72057594037927935,RANGEDEL] + 000007:[a#4,SET-a#4,SET] + 000008:[b#3,RANGEDEL-e#72057594037927935,RANGEDEL] 3: 000006:[b#1,SET-b#1,SET] @@ -483,8 +478,8 @@ L3 compact a-e L1 ---- 2: - 000007:[a#4,SET-b#3,SET] - 000008:[b#2,RANGEDEL-e#72057594037927935,RANGEDEL] + 000007:[a#4,SET-a#4,SET] + 000008:[b#3,RANGEDEL-e#72057594037927935,RANGEDEL] 3: 000006:[b#1,SET-b#1,SET] @@ -524,8 +519,8 @@ L3 compact a-e L1 ---- 2: - 000007:[a#4,SET-b#4,SET] - 000008:[b#2,RANGEDEL-e#72057594037927935,RANGEDEL] + 000007:[a#4,SET-a#4,SET] + 000008:[b#4,SET-e#72057594037927935,RANGEDEL] 3: 000006:[b#1,SET-b#1,SET] @@ -545,10 +540,7 @@ a:4 . # Test a scenario where the last point key in an sstable has a seqnum -# of 0, and the smallest range tombstone in the next sstable starts -# with the same key. We have to tweak the kind of the boundary key in -# sstable 8 to be a DEL rather than a RANGEDEL so that the two tables -# are disjoint in the key space. +# of 0. define target-file-sizes=(1, 1, 26) snapshots=(2) L1 @@ -577,7 +569,8 @@ a:3 compact a-e L1 ---- 2: - 000007:[a#3,SET-e#72057594037927935,RANGEDEL] + 000007:[a#3,SET-c#72057594037927935,RANGEDEL] + 000008:[c#3,RANGEDEL-e#72057594037927935,RANGEDEL] 3: 000005:[a#2,RANGEDEL-b#72057594037927935,RANGEDEL] 000006:[c#0,SET-d#0,SET] @@ -602,12 +595,12 @@ L1 c.SET.1:1 d.SET.1:1 L3 - a.RANGEDEL.2:b + c.RANGEDEL.2:d ---- 1: 000004:[a#3,SET-e#72057594037927935,RANGEDEL] 3: - 000005:[a#2,RANGEDEL-b#72057594037927935,RANGEDEL] + 000005:[c#2,RANGEDEL-d#72057594037927935,RANGEDEL] compact a-f L1 ---- @@ -615,7 +608,8 @@ compact a-f L1 000006:[a#3,SET-c#72057594037927935,RANGEDEL] 000007:[c#3,RANGEDEL-e#72057594037927935,RANGEDEL] 3: - 000005:[a#2,RANGEDEL-b#72057594037927935,RANGEDEL] + 000005:[c#2,RANGEDEL-d#72057594037927935,RANGEDEL] + # Test a scenario where we the last point key in an sstable has a # seqnum of 0, but there is another range tombstone later in the