From a860bbad07c19bb9ab314138b7ed23d14a2a610d Mon Sep 17 00:00:00 2001
From: Jackson Owens <jackson@cockroachlabs.com>
Date: Mon, 24 Jan 2022 13:56:56 -0500
Subject: [PATCH] compaction: don't split outputs within a user key

During a compaction, if the current sstable hits the file size limit, defer
finishing the sstable if the next sstable would share a user key. This is the
current behavior of flushes, and this change brings parity between the two.

This change is motivated by introduction of range keys (see #1339). This
ensures we can always cleanly truncate range keys that span range-key boundaries.

This commit also removes (keyspan.Fragmenter).FlushTo. Now that we prohibit
splitting sstables in the middle of a user key, the Fragmenter's FlushTo
function is unnecessary. Compactions and flushes always use the
TruncateAndFlushTo variant.

This change required a tweak to the way grandparent limits are applied, in
order to switch the grandparent splitter's comparison into a >= comparsion.
This was necessary due to the shift in interpreting `splitterSuggestion`s as
exclusive boundaries.

Close #734.
---
 compaction.go                                 | 330 ++++--------------
 compaction_iter.go                            |   9 +-
 compaction_iter_test.go                       |   2 +-
 compaction_picker.go                          |   5 +
 docs/range_deletions.md                       |   4 +
 internal/keyspan/fragmenter.go                |  85 +----
 internal/keyspan/fragmenter_test.go           |  33 +-
 internal/keyspan/testdata/fragmenter_flush_to | 107 ------
 internal/keyspan/testdata/fragmenter_values   |  23 --
 range_del_test.go                             |  37 +-
 testdata/compaction_find_grandparent_limit    |  42 ++-
 testdata/compaction_iter                      |   8 +-
 testdata/compaction_iter_set_with_del         |   8 +-
 testdata/manual_compaction                    |  55 ++-
 testdata/manual_compaction_set_with_del       |  56 ++-
 15 files changed, 180 insertions(+), 624 deletions(-)
 delete mode 100644 internal/keyspan/testdata/fragmenter_flush_to

diff --git a/compaction.go b/compaction.go
index ae09ea09ab..fd74a361b0 100644
--- a/compaction.go
+++ b/compaction.go
@@ -145,124 +145,30 @@ func (f *fileSizeSplitter) onNewOutput(key *InternalKey) []byte {
 	return nil
 }
 
-type grandparentLimitSplitter struct {
-	c     *compaction
-	ve    *versionEdit
-	limit []byte
+type limitFuncSplitter struct {
+	c         *compaction
+	limitFunc func(userKey []byte) []byte
+	limit     []byte
 }
 
-func (g *grandparentLimitSplitter) shouldSplitBefore(
+func (lf *limitFuncSplitter) shouldSplitBefore(
 	key *InternalKey, tw *sstable.Writer,
 ) compactionSplitSuggestion {
-	if g.limit != nil && g.c.cmp(key.UserKey, g.limit) > 0 {
+	// NB: The limit must be applied using >= since lf.limit may be used as the
+	// `splitterSuggestion` ultimately passed to `compactionIter.Tombstones` to
+	// serve as an *exclusive* end boundary truncation point. If we used > then,
+	// we may have already added a key with the user key `lf.limit` to the
+	// previous sstable.
+	if lf.limit != nil && lf.c.cmp(key.UserKey, lf.limit) >= 0 {
 		return splitNow
 	}
 	return noSplit
 }
 
-func (g *grandparentLimitSplitter) onNewOutput(key *InternalKey) []byte {
-	g.limit = nil
-	if g.c.rangeDelFrag.Empty() || len(g.ve.NewFiles) == 0 {
-		// In this case, `limit` will be a larger user key than `key.UserKey`, or
-		// nil. In either case, the inner loop will execute at least once to
-		// process `key`, and the input iterator will be advanced.
-		if key != nil {
-			g.limit = g.c.findGrandparentLimit(key.UserKey)
-		} else { // !c.rangeDelFrag.Empty()
-			g.limit = g.c.findGrandparentLimit(g.c.rangeDelFrag.Start())
-		}
-	} else {
-		// There is a range tombstone spanning from the last file into the
-		// current one. Therefore this file's smallest boundary will overlap the
-		// last file's largest boundary.
-		//
-		// In this case, `limit` will be a larger user key than the previous
-		// file's largest key and correspond to a grandparent file's largest user
-		// key, or nil. Then, it is possible the inner loop executes zero times,
-		// and the output file contains only range tombstones. That is fine as
-		// long as the number of times we execute this case is bounded. Since
-		// `findGrandparentLimit()` returns a strictly larger user key each time
-		// and it corresponds to a grandparent file largest key, the number of
-		// times this case can execute is bounded by the number of grandparent
-		// files (plus one for the final time it returns nil).
-		//
-		// n > 0 since we cannot have seen range tombstones at the
-		// beginning of the first file.
-		n := len(g.ve.NewFiles)
-		g.limit = g.c.findGrandparentLimit(g.ve.NewFiles[n-1].Meta.Largest.UserKey)
-		// This conditional is necessary to maintain the invariant that
-		// successive calls to finishOutput() have an increasing
-		// limit, and that the fragmenter's *FlushTo() and Add() calls are
-		// always made with successively increasing user keys. It's possible
-		// for the last file's Meta.Largest.UserKey
-		// to be substantially less than the last key returned by
-		// the compactionIter, such as in a sequence of consecutive
-		// rangedel tombstones, of which some but not all get elided.
-		// Consider this example:
-		//
-		// Compaction input (all range tombstones in this snippet):
-		// a-b
-		// c-d
-		// e-f  (elided)
-		// g-h  (elided) <-- grandparent limit before this (at ff)
-		// i-j  (elided)
-		// k-q           <-- grandparent limit at k
-		// m-q
-		//
-		// Note that elided tombstones are added to the fragmenter, but
-		// removed before they make their way from the fragmenter onto
-		// iter.tombstones. They still affect the fragmenter's internal
-		// tracking of the key up to which all tombstones have been flushed.
-		// After the first output is cut with limit = g, the fragmenter
-		// is empty, so the next grandparent calculation happens with
-		// key = g, and returns k. We continue adding range tombstones to
-		// the fragmenter between [g,k], which includes k-q as we only
-		// switch outputs after the limit is exceeded. So key = m and
-		// limit = k when finishOutput is called. Since the start key in
-		// the fragmenter (k) matches the limit, and no point keys were
-		// added, we actually don't produce a new output (see the
-		// conditional in finishOutput() on why).
-		//
-		// When we try to calculate the next grandparent limit, since
-		// c.rangeDelFrag.Empty() == false (as k-q is sitting in it),
-		// we fall into this case, and use the end key of the last written
-		// sstable (which was [a-d]) to calculate the grandparent limit.
-		// That gets us g again, which gets us to call
-		// c.rangeDelFrag.FlushTo(g), which violates the
-		// invariant that the current flush-to key (g) be greater than the
-		// last one (k).
-		//
-		// To solve this, if the grandparent limit falls "in between"
-		// ve.NewFiles[n-1].Meta.Largest.UserKey and c.rangeDelFrag.Start(),
-		// re-calculate a higher limit ahead of that key.
-		if g.c.rangeDelFrag.Start() != nil && g.c.cmp(g.limit, g.c.rangeDelFrag.Start()) <= 0 {
-			g.limit = g.c.findGrandparentLimit(g.c.rangeDelFrag.Start())
-		}
-	}
-	return g.limit
-}
-
-type l0LimitSplitter struct {
-	c     *compaction
-	ve    *versionEdit
-	limit []byte
-}
-
-func (l *l0LimitSplitter) shouldSplitBefore(
-	key *InternalKey, tw *sstable.Writer,
-) compactionSplitSuggestion {
-	if l.limit != nil && l.c.cmp(key.UserKey, l.limit) > 0 {
-		return splitNow
-	}
-	return noSplit
-}
-
-func (l *l0LimitSplitter) onNewOutput(key *InternalKey) []byte {
-	l.limit = nil
-	// For flushes being split across multiple sstables, call
-	// findL0Limit to find the next L0 limit.
+func (lf *limitFuncSplitter) onNewOutput(key *InternalKey) []byte {
+	lf.limit = nil
 	if key != nil {
-		l.limit = l.c.findL0Limit(key.UserKey)
+		lf.limit = lf.limitFunc(key.UserKey)
 	} else {
 		// Use the start key of the first pending tombstone to find the
 		// next limit. All pending tombstones have the same start key.
@@ -279,12 +185,11 @@ func (l *l0LimitSplitter) onNewOutput(key *InternalKey) []byte {
 		// and finishOutput() would not advance any further because
 		// the next range tombstone to write does not start until after
 		// the L0 split point.
-		startKey := l.c.rangeDelFrag.Start()
-		if startKey != nil {
-			l.limit = l.c.findL0Limit(startKey)
+		if startKey := lf.c.rangeDelFrag.Start(); startKey != nil {
+			lf.limit = lf.limitFunc(startKey)
 		}
 	}
-	return l.limit
+	return lf.limit
 }
 
 // splitterGroup is a compactionOutputSplitter that splits whenever one of its
@@ -858,14 +763,6 @@ func seekGT(iter *manifest.LevelIterator, cmp base.Compare, key []byte) *manifes
 // current heuristic stops output of a table if the addition of another key
 // would cause the table to overlap more than 10x the target file size at
 // level N. See maxGrandparentOverlapBytes.
-//
-// TODO(peter): Stopping compaction output in the middle of a user-key creates
-// 2 sstables that need to be compacted together as an "atomic compaction
-// unit". This is unfortunate as it removes the benefit of stopping output to
-// an sstable in order to prevent a large compaction with the next level. Seems
-// better to adjust findGrandparentLimit to not stop output in the middle of a
-// user-key. Perhaps this isn't a problem if the compaction picking heuristics
-// always pick the right (older) sibling for compaction first.
 func (c *compaction) findGrandparentLimit(start []byte) []byte {
 	iter := c.grandparents.Iter()
 	var overlappedBytes uint64
@@ -874,8 +771,8 @@ func (c *compaction) findGrandparentLimit(start []byte) []byte {
 		// To ensure forward progress we always return a larger user
 		// key than where we started. See comments above clients of
 		// this function for how this is used.
-		if overlappedBytes > c.maxOverlapBytes && c.cmp(start, f.Largest.UserKey) < 0 {
-			return f.Largest.UserKey
+		if overlappedBytes > c.maxOverlapBytes && c.cmp(start, f.Smallest.UserKey) < 0 {
+			return f.Smallest.UserKey
 		}
 	}
 	return nil
@@ -1060,13 +957,15 @@ func (c *compaction) newInputIter(newIters tableNewIters) (_ internalIterator, r
 			c.closers = append(c.closers, rangeDelIter)
 			rangeDelIter = noCloseIter{rangeDelIter}
 
-			// Truncate the range tombstones returned by the iterator to the upper
-			// bound of the atomic compaction unit. Note that we need do this
-			// truncation at read time in order to handle RocksDB generated sstables
-			// which do not truncate range tombstones to atomic compaction unit
-			// boundaries at write time. Because we're doing the truncation at read
-			// time, we follow RocksDB's lead and do not truncate tombstones to
-			// atomic unit boundaries at compaction time.
+			// Truncate the range tombstones returned by the iterator to the
+			// upper bound of the atomic compaction unit. Note that we need do
+			// this truncation at read time in order to handle sstables
+			// generated by RocksDB and earlier versions of Pebble which do not
+			// truncate range tombstones to atomic compaction unit boundaries at
+			// write time.
+			//
+			// The current Pebble compaction logic DOES truncate tombstones to
+			// atomic unit boundaries at compaction time too.
 			atomicUnit, _ := expandToAtomicUnit(c.cmp, f.Slice(), true /* disableIsCompacting */)
 			lowerBound, upperBound := manifest.KeyRange(c.cmp, atomicUnit.Iter())
 			// Range deletion tombstones are often written to sstables
@@ -2213,7 +2112,7 @@ func (d *DB) runCompaction(
 		// compactionIter.Tombstones via keyspan.Fragmenter.FlushTo, and by the
 		// WriterMetadata.LargestRangeDel.UserKey.
 		splitKey = append([]byte(nil), splitKey...)
-		for _, v := range iter.Tombstones(splitKey, splitL0Outputs) {
+		for _, v := range iter.Tombstones(splitKey) {
 			if tw == nil {
 				if err := newOutput(); err != nil {
 					return err
@@ -2280,10 +2179,9 @@ func (d *DB) runCompaction(
 		outputMetrics.Size += int64(meta.Size)
 		outputMetrics.NumFiles++
 
-		// The handling of range boundaries is a bit complicated.
 		if n := len(ve.NewFiles); n > 1 {
-			// This is not the first output file. Bound the smallest range key by the
-			// previous tables largest key.
+			// This is not the first output file. Ensure the sstable boundaries
+			// are nonoverlapping.
 			prevMeta := ve.NewFiles[n-2].Meta
 			if writerMeta.SmallestRangeDel.UserKey != nil {
 				c := d.cmp(writerMeta.SmallestRangeDel.UserKey, prevMeta.Largest.UserKey)
@@ -2292,57 +2190,30 @@ func (d *DB) runCompaction(
 						"pebble: smallest range tombstone start key is less than previous sstable largest key: %s < %s",
 						writerMeta.SmallestRangeDel.Pretty(d.opts.Comparer.FormatKey),
 						prevMeta.Largest.Pretty(d.opts.Comparer.FormatKey))
-				}
-				if c == 0 && prevMeta.Largest.SeqNum() <= writerMeta.SmallestRangeDel.SeqNum() {
-					// The user key portion of the range boundary start key is equal to
-					// the previous table's largest key. We need the tables to be
-					// key-space partitioned, so force the boundary to a key that we know
-					// is larger than the previous table's largest key.
-					if prevMeta.Largest.SeqNum() == 0 {
-						// If the seqnum of the previous table's largest key is 0, we can't
-						// decrement it. This should never happen as we take care in the
-						// main compaction loop to avoid generating an sstable with a
-						// largest key containing a zero seqnum.
-						return errors.Errorf(
-							"pebble: previous sstable largest key unexpectedly has 0 seqnum: %s",
-							prevMeta.Largest.Pretty(d.opts.Comparer.FormatKey))
-					}
-					// TODO(peter): Technically, this produces a small gap with the
-					// previous sstable. The largest key of the previous table may be
-					// b#5.SET. The smallest key of the new sstable may then become
-					// b#4.RANGEDEL even though the tombstone is [b,z)#6. If iteration
-					// ever precisely truncates sstable boundaries, the key b#5.DEL at
-					// a lower level could slip through. Note that this can't ever
-					// actually happen, though, because the only way for two records to
-					// have the same seqnum is via ingestion. And even if it did happen
-					// revealing a deletion tombstone is not problematic. Slightly more
-					// worrisome is the combination of b#5.MERGE, b#5.SET and
-					// b#4.RANGEDEL, but we can't ever see b#5.MERGE and b#5.SET at
-					// different levels in the tree due to the ingestion argument and
-					// atomic compaction units.
-					//
-					// TODO(sumeer): Incorporate the the comment in
-					// https://github.com/cockroachdb/pebble/pull/479#pullrequestreview-340600654
-					// into docs/range_deletions.md and reference the correctness
-					// argument here. Note that that comment might be slightly incorrect.
-					writerMeta.SmallestRangeDel.SetSeqNum(prevMeta.Largest.SeqNum() - 1)
+				} else if c == 0 && !prevMeta.Largest.IsExclusiveSentinel() {
+					// The user key portion of the range boundary start key is
+					// equal to the previous table's largest key user key, and
+					// the previous table's largest key is not exclusive. This
+					// violates the invariant that tables are key-space
+					// partitioned.
+					return errors.Errorf(
+						"pebble: invariant violation: previous sstable largest key %s, current sstable smallest rangedel: %s",
+						prevMeta.Largest.Pretty(d.opts.Comparer.FormatKey),
+						writerMeta.SmallestRangeDel.Pretty(d.opts.Comparer.FormatKey),
+					)
 				}
 			}
 		}
 
-		if splitKey != nil && writerMeta.LargestRangeDel.UserKey != nil {
-			// The current file is not the last output file and there is a range tombstone in it.
-			// If the tombstone extends into the next file, then truncate it for the purposes of
-			// computing meta.Largest. For example, say the next file's first key is c#7,1 and the
-			// current file's last key is c#10,1 and the current file has a range tombstone
-			// [b, d)#12,15. For purposes of the bounds we pretend that the range tombstone ends at
-			// c#inf where inf is the InternalKeyRangeDeleteSentinel. Note that this is just for
-			// purposes of bounds computation -- the current sstable will end up with a Largest key
-			// of c#7,1 so the range tombstone in the current file will be able to delete c#7.
-			if d.cmp(writerMeta.LargestRangeDel.UserKey, splitKey) >= 0 {
-				writerMeta.LargestRangeDel.UserKey = splitKey
-				writerMeta.LargestRangeDel.Trailer = InternalKeyRangeDeleteSentinel
-			}
+		// Verify that all range deletions outputted to the sstable are
+		// truncated to split key.
+		if splitKey != nil && writerMeta.LargestRangeDel.UserKey != nil &&
+			d.cmp(writerMeta.LargestRangeDel.UserKey, splitKey) > 0 {
+			return errors.Errorf(
+				"pebble: invariant violation: rangedel largest key %q extends beyond split key %q",
+				writerMeta.LargestRangeDel.Pretty(d.opts.Comparer.FormatKey),
+				d.opts.Comparer.FormatKey(splitKey),
+			)
 		}
 
 		meta.Smallest = writerMeta.Smallest(d.cmp)
@@ -2373,12 +2244,10 @@ func (d *DB) runCompaction(
 					c.largest.Pretty(d.opts.Comparer.FormatKey))
 			}
 		}
-		// Verify that when splitting an output to L0, we never split different
-		// revisions of the same user key across two different sstables.
-		if splitL0Outputs {
-			if err := c.errorOnUserKeyOverlap(ve); err != nil {
-				return err
-			}
+		// Verify that we never split different revisions of the same user key
+		// across two different sstables.
+		if err := c.errorOnUserKeyOverlap(ve); err != nil {
+			return err
 		}
 		if err := meta.Validate(d.cmp, d.opts.Comparer.FormatKey); err != nil {
 			return err
@@ -2392,24 +2261,14 @@ func (d *DB) runCompaction(
 	// the splitterGroup can be composed of multiple splitters. In this case,
 	// we start off with splitters for file sizes, grandparent limits, and (for
 	// L0 splits) L0 limits, before wrapping them in an splitterGroup.
-	//
-	// There is a complication here: We may not be able to switch SSTables right
-	// away when we are splitting an L0 output. We do not split the same user
-	// key across different sstables within one flush or intra-L0 compaction, so
-	// the userKeyChangeSplitter ensures we are at a user key change boundary
-	// when doing a split.
 	outputSplitters := []compactionOutputSplitter{
-		&fileSizeSplitter{maxFileSize: c.maxOutputFileSize},
-		&grandparentLimitSplitter{c: c, ve: ve},
-	}
-	var splitter compactionOutputSplitter
-	if splitL0Outputs {
-		// outputSplitters[0] is the file size splitter, which doesn't guarantee
-		// that all advised splits will be at user key change boundaries. Wrap
-		// it in a userKeyChangeSplitter.
-		outputSplitters[0] = &userKeyChangeSplitter{
+		// We do not split the same user key across different sstables within
+		// one flush or compaction. The fileSizeSplitter may request a split in
+		// the middle of a user key, so the userKeyChangeSplitter ensures we are
+		// at a user key change boundary when doing a split.
+		&userKeyChangeSplitter{
 			cmp:      c.cmp,
-			splitter: outputSplitters[0],
+			splitter: &fileSizeSplitter{maxFileSize: c.maxOutputFileSize},
 			unsafePrevUserKey: func() []byte {
 				// Return the largest point key written to tw or the start of
 				// the current range deletion in the fragmenter, whichever is
@@ -2420,13 +2279,13 @@ func (d *DB) runCompaction(
 				}
 				return c.rangeDelFrag.Start()
 			},
-		}
-		outputSplitters = append(outputSplitters, &l0LimitSplitter{c: c, ve: ve})
+		},
+		&limitFuncSplitter{c: c, limitFunc: c.findGrandparentLimit},
 	}
-	splitter = &splitterGroup{
-		cmp:       c.cmp,
-		splitters: outputSplitters,
+	if splitL0Outputs {
+		outputSplitters = append(outputSplitters, &limitFuncSplitter{c: c, limitFunc: c.findL0Limit})
 	}
+	splitter := &splitterGroup{cmp: c.cmp, splitters: outputSplitters}
 
 	// NB: we avoid calling maybeThrottle on a nilPacer because the cost of
 	// dynamic dispatch in the hot loop below is pronounced in CPU profiles (see
@@ -2448,7 +2307,6 @@ func (d *DB) runCompaction(
 		splitterSuggestion := splitter.onNewOutput(key)
 
 		// Each inner loop iteration processes one key from the input iterator.
-		prevPointSeqNum := InternalKeySeqNumMax
 		for ; key != nil; key, val = iter.Next() {
 			if split := splitter.shouldSplitBefore(key, tw); split == splitNow {
 				break
@@ -2476,21 +2334,11 @@ func (d *DB) runCompaction(
 			if err := tw.Add(*key, val); err != nil {
 				return nil, pendingOutputs, err
 			}
-			prevPointSeqNum = key.SeqNum()
 		}
 
 		// A splitter requested a split, and we're ready to finish the output.
 		// We need to choose the key at which to split any pending range
 		// tombstones.
-		//
-		// There's a complication here. We need to ensure that for a user key
-		// k we never end up with one output's largest key as k#0 and the
-		// next output's smallest key as k#RANGEDEL,#x where x > 0. This is a
-		// problem because k#RANGEDEL,#x sorts before k#0.  Normally, we just
-		// adjust the seqnum of the next output's smallest boundary to be
-		// less, but that's not possible with the zero seqnum. We can avoid
-		// this case with careful picking of where to split pending range
-		// tombstones.
 		var splitKey []byte
 		switch {
 		case key != nil:
@@ -2504,50 +2352,10 @@ func (d *DB) runCompaction(
 			// Splitting at the current key will cause this output to overlap
 			// a potentially unbounded number of grandparents.
 			splitKey = key.UserKey
-		case key == nil && splitL0Outputs:
-			// We ran out of keys with flush splits enabled. Set splitKey to
-			// nil so all range tombstones get flushed in the current sstable.
-			// Consider this example:
-			//
-			// a.SET.4
-			// d.MERGE.5
-			// d.RANGEDEL.3:f
-			// (no more keys remaining)
-			//
-			// Where d is a flush split key (i.e. splitterSuggestion = 'd').
-			// Since d.MERGE.5 has already been written to this output by this
-			// point (as it's <= splitterSuggestion), and outputs written to L0
-			// cannot have user keys split across multiple sstables, we have to
-			// set splitKey to a key greater than 'd' to ensure the range
-			// deletion also gets flushed. Setting the splitKey to nil is the
-			// simplest way to ensure that.
-			//
-			// TODO(jackson): This case is only problematic if splitKey equals
-			// the user key of the last point key added. We don't need to
-			// flush *all* range tombstones to the current sstable. We could
-			// flush up to the next grandparent limit greater than
-			// `splitterSuggestion` instead.
-			splitKey = nil
-		case key == nil && prevPointSeqNum != 0:
-			// The last key added did not have a zero sequence number, so
-			// we'll always be able to adjust the next table's smallest key.
-			// NB: Because of the splitter's `onNewOutput` contract,
-			// `splitterSuggestion` must be >= any key previously added to the
-			// current output sstable.
+		case key == nil:
+			// NB: Because of the userKeyChangeSplitter, `splitterSuggestion`
+			// must be > any key previously added to the current output sstable.
 			splitKey = splitterSuggestion
-		case key == nil && prevPointSeqNum == 0:
-			// The last key added did have a zero sequence number. The
-			// splitters' suggested split point might have the same user key,
-			// which would cause the next output to have an unadjustable
-			// smallest key. To prevent that, we ignore the splitter's
-			// suggestion, leaving splitKey nil to flush all pending range
-			// tombstones.
-			// TODO(jackson): This case is only problematic if splitKey equals
-			// the user key of the last point key added. We don't need to
-			// flush *all* range tombstones to the current sstable. We could
-			// flush up to the next grandparent limit greater than
-			// `splitterSuggestion` instead.
-			splitKey = nil
 		default:
 			return nil, nil, errors.New("pebble: not reached")
 		}
diff --git a/compaction_iter.go b/compaction_iter.go
index 9ec825fd50..e7747fd4d6 100644
--- a/compaction_iter.go
+++ b/compaction_iter.go
@@ -770,17 +770,14 @@ func (i *compactionIter) Close() error {
 // exclude specifies if the specified key is exclusive or inclusive.
 // When exclude = true, all returned range tombstones are truncated to the
 // specified key.
-func (i *compactionIter) Tombstones(key []byte, exclude bool) []keyspan.Span {
-	switch {
-	case key == nil:
+func (i *compactionIter) Tombstones(key []byte) []keyspan.Span {
+	if key == nil {
 		i.rangeDelFrag.Finish()
-	case exclude:
+	} else {
 		// The specified end key is exclusive; no versions of the specified
 		// user key (including range tombstones covering that key) should
 		// be flushed yet.
 		i.rangeDelFrag.TruncateAndFlushTo(key)
-	default:
-		i.rangeDelFrag.FlushTo(key)
 	}
 	tombstones := i.tombstones
 	i.tombstones = nil
diff --git a/compaction_iter_test.go b/compaction_iter_test.go
index 918b7560da..02b604872d 100644
--- a/compaction_iter_test.go
+++ b/compaction_iter_test.go
@@ -192,7 +192,7 @@ func TestCompactionIter(t *testing.T) {
 						if len(parts) == 2 {
 							key = []byte(parts[1])
 						}
-						for _, v := range iter.Tombstones(key, false) {
+						for _, v := range iter.Tombstones(key) {
 							fmt.Fprintf(&b, "%s-%s#%d\n",
 								v.Start.UserKey, v.End, v.Start.SeqNum())
 						}
diff --git a/compaction_picker.go b/compaction_picker.go
index 6a3cd47726..4155477db4 100644
--- a/compaction_picker.go
+++ b/compaction_picker.go
@@ -374,6 +374,11 @@ func (pc *pickedCompaction) grow(sm, la InternalKey, maxExpandedBytes uint64) bo
 // disableIsCompacting is true, isCompacting always returns false. This helps
 // avoid spurious races from being detected when this method is used outside
 // of compaction picking code.
+//
+// TODO(jackson): Compactions and flushes no longer split a user key between two
+// sstables. We could perform a migration, re-compacting any sstables with split
+// user keys, which would allow us to remove atomic compaction unit expansion
+// code.
 func expandToAtomicUnit(
 	cmp Compare, inputs manifest.LevelSlice, disableIsCompacting bool,
 ) (slice manifest.LevelSlice, isCompacting bool) {
diff --git a/docs/range_deletions.md b/docs/range_deletions.md
index e1c9ac6564..19b8f06a38 100644
--- a/docs/range_deletions.md
+++ b/docs/range_deletions.md
@@ -1,5 +1,9 @@
 # Range Deletions
 
+TODO: The following explanation of range deletions does not take into account
+the recent change to prohibit splitting of a user key between sstables. This
+change simplifies the logic, removing 'improperly truncated range tombstones.'
+
 TODO: The following explanation of range deletions ignores the
 kind/trailer that appears at the end of keys after the sequence
 number. This should be harmless but need to add a justification on why
diff --git a/internal/keyspan/fragmenter.go b/internal/keyspan/fragmenter.go
index 76e5319443..276709a146 100644
--- a/internal/keyspan/fragmenter.go
+++ b/internal/keyspan/fragmenter.go
@@ -88,8 +88,8 @@ type Fragmenter struct {
 	// flushed. All pending spans have the same Start.UserKey.
 	pending []Span
 	// doneBuf is used to buffer completed span fragments when flushing to a
-	// specific key (e.g. FlushTo). It is cached in the Fragmenter to allow
-	// reuse.
+	// specific key (e.g. TruncateAndFlushTo). It is cached in the Fragmenter to
+	// allow reuse.
 	doneBuf []Span
 	// sortBuf is used to sort fragments by end key when flushing.
 	sortBuf spansByEndKey
@@ -266,88 +266,15 @@ func (f *Fragmenter) Empty() bool {
 	return f.finished || len(f.pending) == 0
 }
 
-// FlushTo flushes all of the fragments with a start key <= key. Used during
-// compaction to force emitting of spans which straddle an sstable boundary.
-// Note that the emitted spans are not truncated to the specified key. Consider
+// TruncateAndFlushTo flushes all of the fragments with a start key <= key,
+// truncating spans to the specified end key. Used during compaction to force
+// emitting of spans which straddle an sstable boundary. Consider
 // the scenario:
 //
 //     a---------k#10
 //          f#8
 //          f#7
-//
-// If the compaction logic splits f#8 and f#7 into different sstables, we can't
-// truncate the span [a,k) at f. Doing so could produce an sstable with the
-// records:
-//
-//     a----f#10
-//          f#8
-//
-// The span [a,f) does not cover the key f.
-func (f *Fragmenter) FlushTo(key []byte) {
-	if f.finished {
-		panic("pebble: span fragmenter already finished")
-	}
-
-	if f.flushedKey != nil {
-		switch c := f.Cmp(key, f.flushedKey); {
-		case c < 0:
-			panic(fmt.Sprintf("pebble: flush-to key (%s) < flushed key (%s)",
-				f.Format(key), f.Format(f.flushedKey)))
-		}
-	}
-	f.flushedKey = append(f.flushedKey[:0], key...)
-
-	if len(f.pending) > 0 {
-		// Since all of the pending spans have the same start key, we only need
-		// to compare against the first one.
-		switch c := f.Cmp(f.pending[0].Start.UserKey, key); {
-		case c > 0:
-			panic(fmt.Sprintf("pebble: keys must be in order: %s > %s",
-				f.Format(f.pending[0].Start.UserKey), f.Format(key)))
-		}
-		// Note that we explicitly do not return early here if Start.UserKey ==
-		// key. Similar to the scenario described above, consider:
-		//
-		//          f----k#10
-		//          f#8
-		//          f#7
-		//
-		// If the compaction logic splits f#8 and f#7 into different sstables, we
-		// have to emit the span [f,k) in both sstables.
-	}
-
-	// At this point we know that the new start key is greater than the pending
-	// spans start keys. We flush all span fragments with a start key
-	// <= key.
-	f.flush(f.pending, key)
-
-	// Truncate the pending spans to start with key, filtering any which
-	// would become empty.
-	pending := f.pending
-	f.pending = f.pending[:0]
-	for _, s := range pending {
-		if f.Cmp(key, s.End) < 0 {
-			//   s: a--+--e
-			// new:    c------
-			f.pending = append(f.pending, Span{
-				Start: base.MakeInternalKey(key, s.Start.SeqNum(), s.Start.Kind()),
-				End:   s.End,
-				Value: s.Value,
-			})
-		}
-	}
-}
-
-// TruncateAndFlushTo is similar to FlushTo, except it also truncates spans to
-// the specified end key by calling truncateAndFlush. Only called in compactions
-// where we can guarantee that all versions of UserKeys < key have been written,
-// or in other words, where we can ensure we don't split a user key across two
-// sstables. Going back to the scenario from above:
-//
-//    a---------k#10
-//         f#8
-//         f#7
-//
+///
 // Let's say the next user key after f is g. Calling TruncateAndFlushTo(g) will
 // flush this span:
 //
diff --git a/internal/keyspan/fragmenter_test.go b/internal/keyspan/fragmenter_test.go
index 16782b221f..86d9cfb3d5 100644
--- a/internal/keyspan/fragmenter_test.go
+++ b/internal/keyspan/fragmenter_test.go
@@ -45,14 +45,7 @@ func buildSpans(
 		},
 	}
 	for _, line := range strings.Split(s, "\n") {
-		if strings.HasPrefix(line, "flush-to ") {
-			parts := strings.Split(line, " ")
-			if len(parts) != 2 {
-				t.Fatalf("expected 2 components, but found %d: %s", len(parts), line)
-			}
-			f.FlushTo([]byte(parts[1]))
-			continue
-		} else if strings.HasPrefix(line, "truncate-and-flush-to ") {
+		if strings.HasPrefix(line, "truncate-and-flush-to ") {
 			parts := strings.Split(line, " ")
 			if len(parts) != 2 {
 				t.Fatalf("expected 2 components, but found %d: %s", len(parts), line)
@@ -204,30 +197,6 @@ func TestFragmenterDeleted(t *testing.T) {
 	})
 }
 
-func TestFragmenterFlushTo(t *testing.T) {
-	cmp := base.DefaultComparer.Compare
-	fmtKey := base.DefaultComparer.FormatKey
-
-	datadriven.RunTest(t, "testdata/fragmenter_flush_to", func(d *datadriven.TestData) string {
-		switch d.Cmd {
-		case "build":
-			return func() (result string) {
-				defer func() {
-					if r := recover(); r != nil {
-						result = fmt.Sprint(r)
-					}
-				}()
-
-				spans := buildSpans(t, cmp, fmtKey, d.Input, base.InternalKeyKindRangeDelete)
-				return formatSpans(spans)
-			}()
-
-		default:
-			return fmt.Sprintf("unknown command: %s", d.Cmd)
-		}
-	})
-}
-
 func TestFragmenterTruncateAndFlushTo(t *testing.T) {
 	cmp := base.DefaultComparer.Compare
 	fmtKey := base.DefaultComparer.FormatKey
diff --git a/internal/keyspan/testdata/fragmenter_flush_to b/internal/keyspan/testdata/fragmenter_flush_to
deleted file mode 100644
index a4b935358a..0000000000
--- a/internal/keyspan/testdata/fragmenter_flush_to
+++ /dev/null
@@ -1,107 +0,0 @@
-build
-2: a--c
-flush-to c
-1:  b--d
-----
-pebble: start key (b) < flushed key (c)
-
-build
-flush-to c
-1:  b--d
-----
-pebble: start key (b) < flushed key (c)
-
-build
-flush-to c
-flush-to b
-----
-pebble: flush-to key (b) < flushed key (c)
-
-build
-3: a--d
-2:    d--g
-flush-to c
-----
-pebble: flush-to key (c) < flushed key (d)
-
-# Note that the duplication of the fragments is expected. The flush-to
-# is emitting tombstones to different sstables.
-
-build
-3: a--d
-flush-to a
-----
-3: a--d
-3: a--d
-
-build
-3: a--d
-2:    d--g
-flush-to d
-----
-3: a--d
-2:    d--g
-2:    d--g
-
-build
-2: a----f
-flush-to c
-----
-2: a----f
-2:   c--f
-
-build
-2: a----f
-flush-to f
-----
-2: a----f
-
-build
-3: a-c
-1: a-----g
-flush-to d
-----
-3: a-c
-1: a-c
-1:   c---g
-1:    d--g
-
-build
-2: a---e
-1: a------h
-flush-to c
-----
-2: a---e
-1: a---e
-2:   c-e
-1:   c-e
-1:     e--h
-
-build
-3: a-c
-2: a---e
-1: a-----g
-flush-to d
-3:    d----i
-----
-3: a-c
-2: a-c
-1: a-c
-2:   c-e
-1:   c-e
-3:    de
-2:    de
-1:    de
-3:     e-g
-1:     e-g
-3:       g-i
-
-build
-3: a-c
-2: a-----g
-flush-to e
-----
-3: a-c
-2: a-c
-2:   c---g
-2:     e-g
diff --git a/internal/keyspan/testdata/fragmenter_values b/internal/keyspan/testdata/fragmenter_values
index 8e2e5fcbc8..7462ae8b4b 100644
--- a/internal/keyspan/testdata/fragmenter_values
+++ b/internal/keyspan/testdata/fragmenter_values
@@ -63,26 +63,3 @@ truncate-and-flush-to d
 3:     e-g                    orange
 1:     e-g                    coconut
 3:       g-i                  orange
-
-# NB: Unlike the above truncate-and-flush-to calls, a flush-to does not truncate
-# the end boundary. In this case, the fragments beginning at `c` are not
-# truncated to `d`, they're flushed with the bounadries formed by fragmentation
-# (`e`)
-build
-3: a-c                        apple
-2: a---e                      banana
-1: a-----g                    coconut
-flush-to d
-3:    d----i                  orange
-----
-3: a-c                        apple
-2: a-c                        banana
-1: a-c                        coconut
-2:   c-e                      banana
-1:   c-e                      coconut
-3:    de                      orange
-2:    de                      banana
-1:    de                      coconut
-3:     e-g                    orange
-1:     e-g                    coconut
-3:       g-i                  orange
diff --git a/range_del_test.go b/range_del_test.go
index e2d7389c75..dc94a79388 100644
--- a/range_del_test.go
+++ b/range_del_test.go
@@ -267,20 +267,18 @@ func TestRangeDelCompactionTruncation(t *testing.T) {
 1:
   000008:[a#3,RANGEDEL-b#72057594037927935,RANGEDEL]
 2:
-  000012:[b#4,SET-b#4,SET]
-  000013:[b#3,RANGEDEL-c#72057594037927935,RANGEDEL]
+  000012:[b#4,SET-c#72057594037927935,RANGEDEL]
 3:
-  000014:[c#5,SET-d#72057594037927935,RANGEDEL]
+  000013:[c#5,SET-d#72057594037927935,RANGEDEL]
 `)
 		} else {
 			expectLSM(`
 1:
   000008:[a#3,RANGEDEL-b#72057594037927935,RANGEDEL]
 2:
-  000012:[b#4,SETWITHDEL-b#4,SETWITHDEL]
-  000013:[b#3,RANGEDEL-c#72057594037927935,RANGEDEL]
+  000012:[b#4,SETWITHDEL-c#72057594037927935,RANGEDEL]
 3:
-  000014:[c#5,SET-d#72057594037927935,RANGEDEL]
+  000013:[c#5,SET-d#72057594037927935,RANGEDEL]
 `)
 		}
 
@@ -359,17 +357,15 @@ func TestRangeDelCompactionTruncation2(t *testing.T) {
 	require.NoError(t, d.Compact([]byte("b"), []byte("b\x00")))
 	expectLSM(`
 6:
-  000008:[a#3,RANGEDEL-b#2,SET]
-  000009:[b#1,RANGEDEL-d#72057594037927935,RANGEDEL]
+  000009:[a#3,RANGEDEL-d#72057594037927935,RANGEDEL]
 `)
 
 	require.NoError(t, d.Set([]byte("c"), bytes.Repeat([]byte("d"), 100), nil))
 	require.NoError(t, d.Compact([]byte("c"), []byte("c\x00")))
 	expectLSM(`
 6:
-  000012:[a#3,RANGEDEL-b#2,SET]
-  000013:[b#1,RANGEDEL-c#72057594037927935,RANGEDEL]
-  000014:[c#4,SET-d#72057594037927935,RANGEDEL]
+  000012:[a#3,RANGEDEL-c#72057594037927935,RANGEDEL]
+  000013:[c#4,SET-d#72057594037927935,RANGEDEL]
 `)
 }
 
@@ -435,8 +431,7 @@ func TestRangeDelCompactionTruncation3(t *testing.T) {
 	}
 	expectLSM(`
 3:
-  000012:[a#3,RANGEDEL-b#2,SET]
-  000013:[b#1,RANGEDEL-d#72057594037927935,RANGEDEL]
+  000009:[a#3,RANGEDEL-d#72057594037927935,RANGEDEL]
 `)
 
 	require.NoError(t, d.Set([]byte("c"), bytes.Repeat([]byte("d"), 100), nil))
@@ -444,19 +439,18 @@ func TestRangeDelCompactionTruncation3(t *testing.T) {
 	require.NoError(t, d.Compact([]byte("c"), []byte("c\x00")))
 	expectLSM(`
 3:
-  000017:[a#3,RANGEDEL-b#2,SET]
-  000018:[b#1,RANGEDEL-c#72057594037927935,RANGEDEL]
+  000013:[a#3,RANGEDEL-c#72057594037927935,RANGEDEL]
 4:
-  000019:[c#4,SET-d#72057594037927935,RANGEDEL]
+  000014:[c#4,SET-d#72057594037927935,RANGEDEL]
 `)
 
 	require.NoError(t, d.Compact([]byte("c"), []byte("c\x00")))
 	expectLSM(`
 3:
-  000017:[a#3,RANGEDEL-b#2,SET]
-  000018:[b#1,RANGEDEL-c#72057594037927935,RANGEDEL]
+  000013:[a#3,RANGEDEL-c#72057594037927935,RANGEDEL]
 5:
-  000019:[c#4,SET-d#72057594037927935,RANGEDEL]`)
+  000014:[c#4,SET-d#72057594037927935,RANGEDEL]
+`)
 
 	if _, _, err := d.Get([]byte("b")); err != ErrNotFound {
 		t.Fatalf("expected not found, but found %v", err)
@@ -465,10 +459,9 @@ func TestRangeDelCompactionTruncation3(t *testing.T) {
 	require.NoError(t, d.Compact([]byte("a"), []byte("a\x00")))
 	expectLSM(`
 4:
-  000020:[a#3,RANGEDEL-b#2,SET]
-  000021:[b#1,RANGEDEL-c#72057594037927935,RANGEDEL]
+  000013:[a#3,RANGEDEL-c#72057594037927935,RANGEDEL]
 5:
-  000019:[c#4,SET-d#72057594037927935,RANGEDEL]
+  000014:[c#4,SET-d#72057594037927935,RANGEDEL]
 `)
 
 	if v, _, err := d.Get([]byte("b")); err != ErrNotFound {
diff --git a/testdata/compaction_find_grandparent_limit b/testdata/compaction_find_grandparent_limit
index 6950848fe8..e1c88ba1c2 100644
--- a/testdata/compaction_find_grandparent_limit
+++ b/testdata/compaction_find_grandparent_limit
@@ -17,27 +17,29 @@ e-f 2
 compact max-overlap=1
 a b c d e f
 ----
-a-b
-c-d
-e-f
+a-c
+d-e
+f-f
 
 compact max-overlap=2
 a b c d e f
 ----
-a-d
-e-f
+a-c
+d-e
+f-f
 
 compact max-overlap=4
 a b c d e f
 ----
-a-f
+a-e
+f-f
 
 compact max-overlap=1
 b c d e f g h i j
 ----
-b-d
-e-f
-g-j
+b-c
+d-e
+f-j
 
 compact max-overlap=1
 a g h i j
@@ -54,8 +56,8 @@ e-f
 compact max-overlap=1
 c d e f
 ----
-c-d
-e-f
+c-e
+f-f
 
 # Unequal size grandparents
 define
@@ -67,13 +69,15 @@ e-f 3
 compact max-overlap=1
 a b c d e f
 ----
-a-d
-e-f
+a-c
+d-e
+f-f
 
 compact max-overlap=3
 a b c d e f
 ----
-a-f
+a-e
+f-f
 
 # Unequal size grandparents
 define
@@ -85,12 +89,12 @@ e-f 1
 compact max-overlap=1
 a b c d e f
 ----
-a-b
-c-d
-e-f
+a-c
+d-e
+f-f
 
 compact max-overlap=3
 a b c d e f
 ----
-a-d
-e-f
+a-c
+d-f
diff --git a/testdata/compaction_iter b/testdata/compaction_iter
index 34bf1c9599..2f17e2bcff 100644
--- a/testdata/compaction_iter
+++ b/testdata/compaction_iter
@@ -553,7 +553,7 @@ tombstones
 ----
 a#10,15:k
 f#9,1:f
-a-k#10
+a-f#10
 .
 f#8,1:f
 f-k#10
@@ -574,7 +574,6 @@ tombstones
 ----
 f#10,15:k
 f#9,1:f
-f-k#10
 .
 f#8,1:f
 f-k#10
@@ -600,7 +599,6 @@ b#2,15:d
 c#3,15:e
 d#4,1:d
 b-c#2
-c-d#3
 .
 c-d#3
 d-e#3
@@ -619,8 +617,6 @@ b#2,15:d
 c#3,15:e
 d#4,1:d
 b-c#2
-c-d#3
-c-d#2
 .
 c-d#3
 c-d#2
@@ -643,7 +639,7 @@ tombstones
 a#1,1:a
 b#2,15:d
 c#4,1:d
-b-d#2
+b-c#2
 .
 c-d#2
 .
diff --git a/testdata/compaction_iter_set_with_del b/testdata/compaction_iter_set_with_del
index da78252543..796984e0c7 100644
--- a/testdata/compaction_iter_set_with_del
+++ b/testdata/compaction_iter_set_with_del
@@ -553,7 +553,7 @@ tombstones
 ----
 a#10,15:k
 f#9,1:f
-a-k#10
+a-f#10
 .
 f#8,1:f
 f-k#10
@@ -574,7 +574,6 @@ tombstones
 ----
 f#10,15:k
 f#9,1:f
-f-k#10
 .
 f#8,1:f
 f-k#10
@@ -600,7 +599,6 @@ b#2,15:d
 c#3,15:e
 d#4,1:d
 b-c#2
-c-d#3
 .
 c-d#3
 d-e#3
@@ -619,8 +617,6 @@ b#2,15:d
 c#3,15:e
 d#4,1:d
 b-c#2
-c-d#3
-c-d#2
 .
 c-d#3
 c-d#2
@@ -643,7 +639,7 @@ tombstones
 a#1,1:a
 b#2,15:d
 c#4,1:d
-b-d#2
+b-c#2
 .
 c-d#2
 .
diff --git a/testdata/manual_compaction b/testdata/manual_compaction
index d6111e3c17..a68d35f2f0 100644
--- a/testdata/manual_compaction
+++ b/testdata/manual_compaction
@@ -92,9 +92,8 @@ range-deletions-bytes-estimate: 1552
 compact a-e L1
 ----
 2:
-  000008:[a#3,SET-b#72057594037927935,RANGEDEL]
-  000009:[b#2,RANGEDEL-d#72057594037927935,RANGEDEL]
-  000010:[d#2,RANGEDEL-e#72057594037927935,RANGEDEL]
+  000008:[a#3,SET-c#72057594037927935,RANGEDEL]
+  000009:[c#2,RANGEDEL-e#72057594037927935,RANGEDEL]
 3:
   000006:[a#0,SET-b#0,SET]
   000007:[c#0,SET-d#0,SET]
@@ -105,7 +104,7 @@ wait-pending-table-stats
 num-entries: 2
 num-deletions: 1
 point-deletions-bytes-estimate: 0
-range-deletions-bytes-estimate: 31
+range-deletions-bytes-estimate: 776
 
 # Same as above, except range tombstone covers multiple grandparent file boundaries.
 
@@ -140,9 +139,9 @@ L3
 compact a-e L1
 ----
 2:
-  000010:[a#3,SET-b#72057594037927935,RANGEDEL]
-  000011:[b#2,RANGEDEL-d#72057594037927935,RANGEDEL]
-  000012:[d#2,RANGEDEL-f#72057594037927935,RANGEDEL]
+  000010:[a#3,SET-c#72057594037927935,RANGEDEL]
+  000011:[c#2,RANGEDEL-e#72057594037927935,RANGEDEL]
+  000012:[e#2,RANGEDEL-f#72057594037927935,RANGEDEL]
   000013:[f#2,RANGEDEL-g#72057594037927935,RANGEDEL]
 3:
   000006:[a#0,SET-b#0,SET]
@@ -215,9 +214,8 @@ L3
 compact a-e L1
 ----
 2:
-  000008:[a#3,RANGEDEL-b#72057594037927935,RANGEDEL]
-  000009:[b#3,RANGEDEL-d#72057594037927935,RANGEDEL]
-  000010:[d#3,RANGEDEL-e#72057594037927935,RANGEDEL]
+  000008:[a#3,RANGEDEL-c#72057594037927935,RANGEDEL]
+  000009:[c#3,RANGEDEL-e#72057594037927935,RANGEDEL]
 3:
   000006:[a#0,SET-b#0,SET]
   000007:[c#0,SET-d#0,SET]
@@ -417,13 +415,10 @@ c:4
 a:3
 .
 
-# This is a similar scenario to the one above, but we've arranged for
-# the compaction to produce an sstable (9) for which the smallest
-# range tombstone overlaps in key space with the largest key in the
-# previous sstable (8). This necessitates adjusting the seqnum of the
-# range tombstone. There was no actual iteration bug from earlier
-# behavior of setting the seqnum to 0, but now we set the seqnum to
-# prev.LargestKey.SeqNum-1.
+# This is a similar scenario to the one above. In older versions of Pebble this
+# case necessitated adjusting the seqnum of the range tombstone to
+# prev.LargestKey.SeqNum-1. We no longer allow user keys to be split across
+# sstables, and the seqnum adjustment is no longer necessary.
 #
 # Note the target-file-size of 26 is specially tailored to get the
 # desired compaction output.
@@ -446,8 +441,8 @@ L3
 compact a-e L1
 ----
 2:
-  000007:[a#4,SET-b#2,SET]
-  000008:[b#1,RANGEDEL-e#72057594037927935,RANGEDEL]
+  000007:[a#4,SET-a#4,SET]
+  000008:[b#3,RANGEDEL-e#72057594037927935,RANGEDEL]
 3:
   000006:[b#1,SET-b#1,SET]
 
@@ -483,8 +478,8 @@ L3
 compact a-e L1
 ----
 2:
-  000007:[a#4,SET-b#3,SET]
-  000008:[b#2,RANGEDEL-e#72057594037927935,RANGEDEL]
+  000007:[a#4,SET-a#4,SET]
+  000008:[b#3,RANGEDEL-e#72057594037927935,RANGEDEL]
 3:
   000006:[b#1,SET-b#1,SET]
 
@@ -524,8 +519,8 @@ L3
 compact a-e L1
 ----
 2:
-  000007:[a#4,SET-b#4,SET]
-  000008:[b#2,RANGEDEL-e#72057594037927935,RANGEDEL]
+  000007:[a#4,SET-a#4,SET]
+  000008:[b#4,SET-e#72057594037927935,RANGEDEL]
 3:
   000006:[b#1,SET-b#1,SET]
 
@@ -545,10 +540,7 @@ a:4
 .
 
 # Test a scenario where the last point key in an sstable has a seqnum
-# of 0, and the smallest range tombstone in the next sstable starts
-# with the same key. We have to tweak the kind of the boundary key in
-# sstable 8 to be a DEL rather than a RANGEDEL so that the two tables
-# are disjoint in the key space.
+# of 0.
 
 define target-file-sizes=(1, 1, 26) snapshots=(2)
 L1
@@ -577,7 +569,8 @@ a:3
 compact a-e L1
 ----
 2:
-  000007:[a#3,SET-e#72057594037927935,RANGEDEL]
+  000007:[a#3,SET-c#72057594037927935,RANGEDEL]
+  000008:[c#3,RANGEDEL-e#72057594037927935,RANGEDEL]
 3:
   000005:[a#2,RANGEDEL-b#72057594037927935,RANGEDEL]
   000006:[c#0,SET-d#0,SET]
@@ -602,12 +595,12 @@ L1
   c.SET.1:1
   d.SET.1:1
 L3
-  a.RANGEDEL.2:b
+  c.RANGEDEL.2:d
 ----
 1:
   000004:[a#3,SET-e#72057594037927935,RANGEDEL]
 3:
-  000005:[a#2,RANGEDEL-b#72057594037927935,RANGEDEL]
+  000005:[c#2,RANGEDEL-d#72057594037927935,RANGEDEL]
 
 compact a-f L1
 ----
@@ -615,7 +608,7 @@ compact a-f L1
   000006:[a#3,SET-c#72057594037927935,RANGEDEL]
   000007:[c#3,RANGEDEL-e#72057594037927935,RANGEDEL]
 3:
-  000005:[a#2,RANGEDEL-b#72057594037927935,RANGEDEL]
+  000005:[c#2,RANGEDEL-d#72057594037927935,RANGEDEL]
 
 # Test a scenario where we the last point key in an sstable has a
 # seqnum of 0, but there is another range tombstone later in the
diff --git a/testdata/manual_compaction_set_with_del b/testdata/manual_compaction_set_with_del
index e73e310266..7bd4ec9c12 100644
--- a/testdata/manual_compaction_set_with_del
+++ b/testdata/manual_compaction_set_with_del
@@ -92,9 +92,8 @@ range-deletions-bytes-estimate: 1552
 compact a-e L1
 ----
 2:
-  000008:[a#3,SETWITHDEL-b#72057594037927935,RANGEDEL]
-  000009:[b#2,RANGEDEL-d#72057594037927935,RANGEDEL]
-  000010:[d#2,RANGEDEL-e#72057594037927935,RANGEDEL]
+  000008:[a#3,SETWITHDEL-c#72057594037927935,RANGEDEL]
+  000009:[c#2,RANGEDEL-e#72057594037927935,RANGEDEL]
 3:
   000006:[a#0,SET-b#0,SET]
   000007:[c#0,SET-d#0,SET]
@@ -105,7 +104,7 @@ wait-pending-table-stats
 num-entries: 2
 num-deletions: 1
 point-deletions-bytes-estimate: 0
-range-deletions-bytes-estimate: 31
+range-deletions-bytes-estimate: 776
 
 # Same as above, except range tombstone covers multiple grandparent file boundaries.
 
@@ -140,9 +139,9 @@ L3
 compact a-e L1
 ----
 2:
-  000010:[a#3,SETWITHDEL-b#72057594037927935,RANGEDEL]
-  000011:[b#2,RANGEDEL-d#72057594037927935,RANGEDEL]
-  000012:[d#2,RANGEDEL-f#72057594037927935,RANGEDEL]
+  000010:[a#3,SETWITHDEL-c#72057594037927935,RANGEDEL]
+  000011:[c#2,RANGEDEL-e#72057594037927935,RANGEDEL]
+  000012:[e#2,RANGEDEL-f#72057594037927935,RANGEDEL]
   000013:[f#2,RANGEDEL-g#72057594037927935,RANGEDEL]
 3:
   000006:[a#0,SET-b#0,SET]
@@ -215,9 +214,8 @@ L3
 compact a-e L1
 ----
 2:
-  000008:[a#3,RANGEDEL-b#72057594037927935,RANGEDEL]
-  000009:[b#3,RANGEDEL-d#72057594037927935,RANGEDEL]
-  000010:[d#3,RANGEDEL-e#72057594037927935,RANGEDEL]
+  000008:[a#3,RANGEDEL-c#72057594037927935,RANGEDEL]
+  000009:[c#3,RANGEDEL-e#72057594037927935,RANGEDEL]
 3:
   000006:[a#0,SET-b#0,SET]
   000007:[c#0,SET-d#0,SET]
@@ -417,13 +415,10 @@ c:4
 a:3
 .
 
-# This is a similar scenario to the one above, but we've arranged for
-# the compaction to produce an sstable (9) for which the smallest
-# range tombstone overlaps in key space with the largest key in the
-# previous sstable (8). This necessitates adjusting the seqnum of the
-# range tombstone. There was no actual iteration bug from earlier
-# behavior of setting the seqnum to 0, but now we set the seqnum to
-# prev.LargestKey.SeqNum-1.
+# This is a similar scenario to the one above. In older versions of Pebble this
+# case necessitated adjusting the seqnum of the range tombstone to
+# prev.LargestKey.SeqNum-1. We no longer allow user keys to be split across
+# sstables, and the seqnum adjustment is no longer necessary.
 #
 # Note the target-file-size of 26 is specially tailored to get the
 # desired compaction output.
@@ -446,8 +441,8 @@ L3
 compact a-e L1
 ----
 2:
-  000007:[a#4,SET-b#2,SET]
-  000008:[b#1,RANGEDEL-e#72057594037927935,RANGEDEL]
+  000007:[a#4,SET-a#4,SET]
+  000008:[b#3,RANGEDEL-e#72057594037927935,RANGEDEL]
 3:
   000006:[b#1,SET-b#1,SET]
 
@@ -483,8 +478,8 @@ L3
 compact a-e L1
 ----
 2:
-  000007:[a#4,SET-b#3,SET]
-  000008:[b#2,RANGEDEL-e#72057594037927935,RANGEDEL]
+  000007:[a#4,SET-a#4,SET]
+  000008:[b#3,RANGEDEL-e#72057594037927935,RANGEDEL]
 3:
   000006:[b#1,SET-b#1,SET]
 
@@ -524,8 +519,8 @@ L3
 compact a-e L1
 ----
 2:
-  000007:[a#4,SET-b#4,SET]
-  000008:[b#2,RANGEDEL-e#72057594037927935,RANGEDEL]
+  000007:[a#4,SET-a#4,SET]
+  000008:[b#4,SET-e#72057594037927935,RANGEDEL]
 3:
   000006:[b#1,SET-b#1,SET]
 
@@ -545,10 +540,7 @@ a:4
 .
 
 # Test a scenario where the last point key in an sstable has a seqnum
-# of 0, and the smallest range tombstone in the next sstable starts
-# with the same key. We have to tweak the kind of the boundary key in
-# sstable 8 to be a DEL rather than a RANGEDEL so that the two tables
-# are disjoint in the key space.
+# of 0.
 
 define target-file-sizes=(1, 1, 26) snapshots=(2)
 L1
@@ -577,7 +569,8 @@ a:3
 compact a-e L1
 ----
 2:
-  000007:[a#3,SET-e#72057594037927935,RANGEDEL]
+  000007:[a#3,SET-c#72057594037927935,RANGEDEL]
+  000008:[c#3,RANGEDEL-e#72057594037927935,RANGEDEL]
 3:
   000005:[a#2,RANGEDEL-b#72057594037927935,RANGEDEL]
   000006:[c#0,SET-d#0,SET]
@@ -602,12 +595,12 @@ L1
   c.SET.1:1
   d.SET.1:1
 L3
-  a.RANGEDEL.2:b
+  c.RANGEDEL.2:d
 ----
 1:
   000004:[a#3,SET-e#72057594037927935,RANGEDEL]
 3:
-  000005:[a#2,RANGEDEL-b#72057594037927935,RANGEDEL]
+  000005:[c#2,RANGEDEL-d#72057594037927935,RANGEDEL]
 
 compact a-f L1
 ----
@@ -615,7 +608,8 @@ compact a-f L1
   000006:[a#3,SET-c#72057594037927935,RANGEDEL]
   000007:[c#3,RANGEDEL-e#72057594037927935,RANGEDEL]
 3:
-  000005:[a#2,RANGEDEL-b#72057594037927935,RANGEDEL]
+  000005:[c#2,RANGEDEL-d#72057594037927935,RANGEDEL]
+
 
 # Test a scenario where we the last point key in an sstable has a
 # seqnum of 0, but there is another range tombstone later in the