diff --git a/compaction.go b/compaction.go index 77c95e1ea4..d9d2e97371 100644 --- a/compaction.go +++ b/compaction.go @@ -3233,7 +3233,10 @@ func (d *DB) runCompaction( return nil, pendingOutputs, stats, err } } - if err := tw.Add(*key, val); err != nil { + // iter.snapshotPinned is broader than whether the point was covered by + // a RANGEDEL, but it is harmless to pass true when the callee will also + // independently discover that the point is obsolete. + if err := tw.AddWithForceObsolete(*key, val, iter.snapshotPinned); err != nil { return nil, pendingOutputs, stats, err } if iter.snapshotPinned { diff --git a/compaction_iter.go b/compaction_iter.go index b013036b6b..6623e9343e 100644 --- a/compaction_iter.go +++ b/compaction_iter.go @@ -383,6 +383,13 @@ func (i *compactionIter) Next() (*InternalKey, []byte) { } else if cover == keyspan.CoversInvisibly { // i.iterKey would be deleted by a range deletion if there weren't // any open snapshots. Mark it as pinned. + // + // TODO(sumeer): there are multiple places in this file where we call + // i.rangeDelFrag.Covers and this is the only one where we are fiddling + // with i.snapshotPinned. i.snapshotPinned was previously being used + // only for stats, where a mistake does not lead to corruption. But it + // is also now being used for the forceObsolete bit in + // Writer.AddWithForceObsolete(). Give this more scrutiny. 
i.snapshotPinned = true } diff --git a/external_iterator.go b/external_iterator.go index 3c45c2cd48..69375401d8 100644 --- a/external_iterator.go +++ b/external_iterator.go @@ -209,15 +209,10 @@ func createExternalPointIter(ctx context.Context, it *Iterator) (internalIterato pointIter internalIterator err error ) - pointIter, err = r.NewIterWithBlockPropertyFiltersAndContext( - ctx, - it.opts.LowerBound, - it.opts.UpperBound, - nil, /* BlockPropertiesFilterer */ - false, /* useFilterBlock */ - &it.stats.InternalStats, - sstable.TrivialReaderProvider{Reader: r}, - ) + pointIter, err = r.NewIterWithBlockPropertyFiltersAndContextEtc( + ctx, it.opts.LowerBound, it.opts.UpperBound, nil, /* BlockPropertiesFilterer */ + false /* hideObsoletePoints */, false, /* useFilterBlock */ + &it.stats.InternalStats, sstable.TrivialReaderProvider{Reader: r}) if err != nil { return nil, err } diff --git a/internal/base/internal.go b/internal/base/internal.go index 6f954bb851..7b55c819a0 100644 --- a/internal/base/internal.go +++ b/internal/base/internal.go @@ -24,6 +24,14 @@ const ( //InternalKeyKindColumnFamilyDeletion InternalKeyKind = 4 //InternalKeyKindColumnFamilyValue InternalKeyKind = 5 //InternalKeyKindColumnFamilyMerge InternalKeyKind = 6 + + // InternalKeyKindSingleDelete (SINGLEDEL) is a performance optimization + // solely for compactions (to reduce write amp and space amp). Readers other + // than compactions should treat SINGLEDEL as equivalent to a DEL. + // Historically, it was simpler for readers other than compactions to treat + // SINGLEDEL as equivalent to DEL, but as of the introduction of + // InternalKeyKindSSTableInternalObsoleteBit, this is also necessary for + // correctness. 
InternalKeyKindSingleDelete InternalKeyKind = 7 //InternalKeyKindColumnFamilySingleDelete InternalKeyKind = 8 //InternalKeyKindBeginPrepareXID InternalKeyKind = 9 @@ -71,7 +79,7 @@ const ( // value indicating the (len(key)+len(value)) of the shadowed entry the // tombstone is expected to delete. This value is used to inform compaction // heuristics, but is not required to be accurate for correctness. - InternalKeyKindDeleteSized = 23 + InternalKeyKindDeleteSized InternalKeyKind = 23 // This maximum value isn't part of the file format. Future extensions may // increase this value. @@ -84,12 +92,17 @@ const ( // seqNum. InternalKeyKindMax InternalKeyKind = 23 + // Internal to the sstable format. Not exposed by any sstable iterator. + // Declared here to prevent definition of valid key kinds that set this bit. + InternalKeyKindSSTableInternalObsoleteBit InternalKeyKind = 64 + InternalKeyKindSSTableInternalObsoleteMask InternalKeyKind = 191 + // InternalKeyZeroSeqnumMaxTrailer is the largest trailer with a // zero sequence number. - InternalKeyZeroSeqnumMaxTrailer = uint64(InternalKeyKindInvalid) + InternalKeyZeroSeqnumMaxTrailer = uint64(255) // A marker for an invalid key. - InternalKeyKindInvalid InternalKeyKind = 255 + InternalKeyKindInvalid InternalKeyKind = InternalKeyKindSSTableInternalObsoleteMask // InternalKeySeqNumBatch is a bit that is set on batch sequence numbers // which prevents those entries from being excluded from iteration. diff --git a/iterator.go b/iterator.go index 996817c78b..9c1ff3eea9 100644 --- a/iterator.go +++ b/iterator.go @@ -569,6 +569,9 @@ func (i *Iterator) findNextEntry(limit []byte) { return case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized: + // NB: treating InternalKeyKindSingleDelete as equivalent to DEL is not + // only simpler, but is also necessary for correctness due to + // InternalKeyKindSSTableInternalObsoleteBit. 
i.nextUserKey() continue @@ -632,6 +635,9 @@ func (i *Iterator) nextPointCurrentUserKey() bool { return false case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized: + // NB: treating InternalKeyKindSingleDelete as equivalent to DEL is not + // only simpler, but is also necessary for correctness due to + // InternalKeyKindSSTableInternalObsoleteBit. return false case InternalKeyKindSet, InternalKeyKindSetWithDelete: @@ -1095,6 +1101,10 @@ func (i *Iterator) mergeNext(key InternalKey, valueMerger ValueMerger) { case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized: // We've hit a deletion tombstone. Return everything up to this // point. + // + // NB: treating InternalKeyKindSingleDelete as equivalent to DEL is not + // only simpler, but is also necessary for correctness due to + // InternalKeyKindSSTableInternalObsoleteBit. return case InternalKeyKindSet, InternalKeyKindSetWithDelete: diff --git a/level_iter_test.go b/level_iter_test.go index af65b943b4..d2ec77b80b 100644 --- a/level_iter_test.go +++ b/level_iter_test.go @@ -163,8 +163,8 @@ func (lt *levelIterTest) newIters( ctx context.Context, file *manifest.FileMetadata, opts *IterOptions, iio internalIterOpts, ) (internalIterator, keyspan.FragmentIterator, error) { lt.itersCreated++ - iter, err := lt.readers[file.FileNum].NewIterWithBlockPropertyFiltersAndContext( - ctx, opts.LowerBound, opts.UpperBound, nil, true, iio.stats, + iter, err := lt.readers[file.FileNum].NewIterWithBlockPropertyFiltersAndContextEtc( + ctx, opts.LowerBound, opts.UpperBound, nil, false, true, iio.stats, sstable.TrivialReaderProvider{Reader: lt.readers[file.FileNum]}) if err != nil { return nil, nil, err diff --git a/options.go b/options.go index 58d09aabbd..cdfc2f904e 100644 --- a/options.go +++ b/options.go @@ -626,18 +626,24 @@ type Options struct { ShortAttributeExtractor ShortAttributeExtractor // RequiredInPlaceValueBound specifies an optional span of user key - // 
prefixes for which the values must be stored with the key. This is - // useful for statically known exclusions to value separation. In - // CockroachDB, this will be used for the lock table key space that has - // non-empty suffixes, but those locks don't represent actual MVCC - // versions (the suffix ordering is arbitrary). We will also need to add - // support for dynamically configured exclusions (we want the default to - // be to allow Pebble to decide whether to separate the value or not, - // hence this is structured as exclusions), for example, for users of - // CockroachDB to dynamically exclude certain tables. + // prefixes that are not-MVCC, but have a suffix. For these the values + // must be stored with the key, since the concept of "older versions" is + // not defined. It is also useful for statically known exclusions to value + // separation. In CockroachDB, this will be used for the lock table key + // space that has non-empty suffixes, but those locks don't represent + // actual MVCC versions (the suffix ordering is arbitrary). We will also + // need to add support for dynamically configured exclusions (we want the + // default to be to allow Pebble to decide whether to separate the value + // or not, hence this is structured as exclusions), for example, for users + // of CockroachDB to dynamically exclude certain tables. // // Any change in exclusion behavior takes effect only on future written // sstables, and does not start rewriting existing sstables. + // + // Even ignoring changes in this setting, exclusions are interpreted as a + // guidance by Pebble, and not necessarily honored. Specifically, user + // keys with multiple Pebble-versions *may* have the older versions stored + // in value blocks. 
RequiredInPlaceValueBound UserKeyPrefixBound // DisableIngestAsFlushable disables lazy ingestion of sstables through diff --git a/sstable/block.go b/sstable/block.go index 27cf77ff8c..82fa3ee394 100644 --- a/sstable/block.go +++ b/sstable/block.go @@ -47,7 +47,10 @@ type blockWriter struct { // ensuring that any of the keys with the same prefix can be used to // assemble the full key when the prefix does change. restarts []uint32 - curKey []byte + // Do not read curKey directly from outside blockWriter since it can have + // the InternalKeyKindSSTableInternalObsoleteBit set. Use getCurKey() or + // getCurUserKey() instead. + curKey []byte // curValue excludes the optional prefix provided to // storeWithOptionalValuePrefix. curValue []byte @@ -79,6 +82,20 @@ const setHasSameKeyPrefixRestartMask uint32 = 1 << 31 const restartMaskLittleEndianHighByteWithoutSetHasSamePrefix byte = 0b0111_1111 const restartMaskLittleEndianHighByteOnlySetHasSamePrefix byte = 0b1000_0000 +func (w *blockWriter) getCurKey() InternalKey { + k := base.DecodeInternalKey(w.curKey) + k.Trailer = k.Trailer & trailerObsoleteMask + return k +} + +func (w *blockWriter) getCurUserKey() []byte { + n := len(w.curKey) - base.InternalTrailerLen + if n < 0 { + panic("corrupt key in blockWriter buffer") + } + return w.curKey[:n:n] +} + // If !addValuePrefix, the valuePrefix is ignored. func (w *blockWriter) storeWithOptionalValuePrefix( keySize int, @@ -191,12 +208,13 @@ func (w *blockWriter) storeWithOptionalValuePrefix( func (w *blockWriter) add(key InternalKey, value []byte) { w.addWithOptionalValuePrefix( - key, value, len(key.UserKey), false, 0, false) + key, false, value, len(key.UserKey), false, 0, false) } // Callers that always set addValuePrefix to false should use add() instead. 
func (w *blockWriter) addWithOptionalValuePrefix( key InternalKey, + isObsolete bool, value []byte, maxSharedKeyLen int, addValuePrefix bool, @@ -210,6 +228,9 @@ func (w *blockWriter) addWithOptionalValuePrefix( w.curKey = make([]byte, 0, size*2) } w.curKey = w.curKey[:size] + if isObsolete { + key.Trailer = key.Trailer | trailerObsoleteBit + } key.Encode(w.curKey) w.storeWithOptionalValuePrefix( @@ -381,6 +402,7 @@ type blockIter struct { vbr *valueBlockReader hasValuePrefix bool } + hideObsoletePoints bool } // blockIter implements the base.InternalIterator interface. @@ -388,14 +410,16 @@ var _ base.InternalIterator = (*blockIter)(nil) func newBlockIter(cmp Compare, block block) (*blockIter, error) { i := &blockIter{} - return i, i.init(cmp, block, 0) + return i, i.init(cmp, block, 0, false) } func (i *blockIter) String() string { return "block" } -func (i *blockIter) init(cmp Compare, block block, globalSeqNum uint64) error { +func (i *blockIter) init( + cmp Compare, block block, globalSeqNum uint64, hideObsoletePoints bool, +) error { numRestarts := int32(binary.LittleEndian.Uint32(block[len(block)-4:])) if numRestarts == 0 { return base.CorruptionErrorf("pebble/table: invalid table (block has no restart points)") @@ -408,6 +432,7 @@ func (i *blockIter) init(cmp Compare, block block, globalSeqNum uint64) error { i.data = block i.fullKey = i.fullKey[:0] i.val = nil + i.hideObsoletePoints = hideObsoletePoints i.clearCache() if i.restarts > 0 { if err := i.readFirstKey(); err != nil { @@ -420,10 +445,16 @@ func (i *blockIter) init(cmp Compare, block block, globalSeqNum uint64) error { return nil } -func (i *blockIter) initHandle(cmp Compare, block cache.Handle, globalSeqNum uint64) error { +// NB: two cases of hideObsoletePoints: +// - Local sstable iteration: globalSeqNum will be set iff the sstable was +// ingested. +// - Foreign sstable iteration: globalSeqNum is always set. 
+func (i *blockIter) initHandle( + cmp Compare, block cache.Handle, globalSeqNum uint64, hideObsoletePoints bool, +) error { i.cacheHandle.Release() i.cacheHandle = block - return i.init(cmp, block.Get(), globalSeqNum) + return i.init(cmp, block.Get(), globalSeqNum, hideObsoletePoints) } func (i *blockIter) invalidate() { @@ -590,7 +621,9 @@ func (i *blockIter) readFirstKey() error { // Manually inlining base.DecodeInternalKey provides a 5-10% speedup on // BlockIter benchmarks. if n := len(firstKey) - 8; n >= 0 { - i.firstKey.Trailer = binary.LittleEndian.Uint64(firstKey[n:]) + // NB: we do not track whether the firstKey is obsolete since the trailer + // of the firstKey is not used. + i.firstKey.Trailer = binary.LittleEndian.Uint64(firstKey[n:]) & trailerObsoleteMask i.firstKey.UserKey = firstKey[:n:n] if i.globalSeqNum != 0 { i.firstKey.SetSeqNum(i.globalSeqNum) @@ -603,11 +636,19 @@ func (i *blockIter) readFirstKey() error { return nil } -func (i *blockIter) decodeInternalKey(key []byte) { +// The sstable internal obsolete bit is set when writing a block and unset by +// blockIter, so no code outside block writing/reading code ever sees it. +const trailerObsoleteBit = uint64(base.InternalKeyKindSSTableInternalObsoleteBit) +const trailerObsoleteMask = (InternalKeySeqNumMax << 8) | uint64(base.InternalKeyKindSSTableInternalObsoleteMask) + +func (i *blockIter) decodeInternalKey(key []byte) (hiddenPoint bool) { // Manually inlining base.DecodeInternalKey provides a 5-10% speedup on // BlockIter benchmarks. 
if n := len(key) - 8; n >= 0 { - i.ikey.Trailer = binary.LittleEndian.Uint64(key[n:]) + trailer := binary.LittleEndian.Uint64(key[n:]) + hiddenPoint = i.hideObsoletePoints && + (trailer&trailerObsoleteBit != 0) + i.ikey.Trailer = trailer & trailerObsoleteMask i.ikey.UserKey = key[:n:n] if i.globalSeqNum != 0 { i.ikey.SetSeqNum(i.globalSeqNum) @@ -616,6 +657,7 @@ func (i *blockIter) decodeInternalKey(key []byte) { i.ikey.Trailer = uint64(InternalKeyKindInvalid) i.ikey.UserKey = nil } + return hiddenPoint } func (i *blockIter) clearCache() { @@ -640,13 +682,15 @@ func (i *blockIter) cacheEntry() { i.cachedBuf = append(i.cachedBuf, i.key...) } +func (i *blockIter) firstUserKey() []byte { + return i.firstKey.UserKey +} + // SeekGE implements internalIterator.SeekGE, as documented in the pebble // package. func (i *blockIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, base.LazyValue) { i.clearCache() - ikey := base.MakeSearchKey(key) - // Find the index of the smallest restart point whose key is > the key // sought; index will be numRestarts if there is no such restart point. i.offset = 0 @@ -700,23 +744,23 @@ func (i *blockIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, ba ptr = unsafe.Pointer(uintptr(ptr) + 5) } - // Manually inlining base.DecodeInternalKey provides a 5-10% speedup on - // BlockIter benchmarks. + // Manually inlining part of base.DecodeInternalKey provides a 5-10% + // speedup on BlockIter benchmarks. s := getBytes(ptr, int(v1)) - var k InternalKey + var k []byte if n := len(s) - 8; n >= 0 { - k.Trailer = binary.LittleEndian.Uint64(s[n:]) - k.UserKey = s[:n:n] - // NB: We can't have duplicate keys if the globalSeqNum != 0, so we - // leave the seqnum on this key as 0 as it won't affect our search - // since ikey has the maximum seqnum. 
- } else { - k.Trailer = uint64(InternalKeyKindInvalid) + k = s[:n:n] } + // Else k is invalid, and left as nil - if base.InternalCompare(i.cmp, ikey, k) >= 0 { + if i.cmp(key, k) > 0 { + // The search key is greater than the user key at this restart point. + // Search beyond this restart point, since we are trying to find the + // first restart point with a user key >= the search key. index = h + 1 // preserves f(i-1) == false } else { + // k >= search key, so prune everything after index (since index + // satisfies the property we are looking for). upper = h // preserves f(j) == true } } @@ -724,21 +768,25 @@ func (i *blockIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, ba // => answer is index. } - // Since keys are strictly increasing, if index > 0 then the restart point at - // index-1 will be the largest whose key is <= the key sought. If index == - // 0, then all keys in this block are larger than the key sought, and offset - // remains at zero. + // index is the first restart point with key >= search key. Define the keys + // between a restart point and the next restart point as belonging to that + // restart point. + // + // Since keys are strictly increasing, if index > 0 then the restart point + // at index-1 will be the first one that has some keys belonging to it that + // could be equal to the search key. If index == 0, then all keys in this + // block are larger than the key sought, and offset remains at zero. if index > 0 { i.offset = decodeRestart(i.data[i.restarts+4*(index-1):]) } i.readEntry() - i.decodeInternalKey(i.key) + hiddenPoint := i.decodeInternalKey(i.key) // Iterate from that restart point to somewhere >= the key sought. 
if !i.valid() { return nil, base.LazyValue{} } - if base.InternalCompare(i.cmp, i.ikey, ikey) >= 0 { + if !hiddenPoint && i.cmp(i.ikey.UserKey, key) >= 0 { // Initialize i.lazyValue if !i.lazyValueHandling.hasValuePrefix || base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet { @@ -751,7 +799,7 @@ func (i *blockIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, ba return &i.ikey, i.lazyValue } for i.Next(); i.valid(); i.Next() { - if base.InternalCompare(i.cmp, i.ikey, ikey) >= 0 { + if i.cmp(i.ikey.UserKey, key) >= 0 { // i.Next() has already initialized i.lazyValue. return &i.ikey, i.lazyValue } @@ -773,8 +821,6 @@ func (i *blockIter) SeekPrefixGE( func (i *blockIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, base.LazyValue) { i.clearCache() - ikey := base.MakeSearchKey(key) - // Find the index of the smallest restart point whose key is >= the key // sought; index will be numRestarts if there is no such restart point. i.offset = 0 @@ -828,23 +874,23 @@ func (i *blockIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, ba ptr = unsafe.Pointer(uintptr(ptr) + 5) } - // Manually inlining base.DecodeInternalKey provides a 5-10% speedup on - // BlockIter benchmarks. + // Manually inlining part of base.DecodeInternalKey provides a 5-10% + // speedup on BlockIter benchmarks. s := getBytes(ptr, int(v1)) - var k InternalKey + var k []byte if n := len(s) - 8; n >= 0 { - k.Trailer = binary.LittleEndian.Uint64(s[n:]) - k.UserKey = s[:n:n] - // NB: We can't have duplicate keys if the globalSeqNum != 0, so we - // leave the seqnum on this key as 0 as it won't affect our search - // since ikey has the maximum seqnum. - } else { - k.Trailer = uint64(InternalKeyKindInvalid) + k = s[:n:n] } + // Else k is invalid, and left as nil - if base.InternalCompare(i.cmp, ikey, k) > 0 { + if i.cmp(key, k) > 0 { + // The search key is greater than the user key at this restart point. 
+ // Search beyond this restart point, since we are trying to find the + // first restart point with a user key >= the search key. index = h + 1 // preserves f(i-1) == false } else { + // k >= search key, so prune everything after index (since index + // satisfies the property we are looking for). upper = h // preserves f(j) == true } } @@ -852,8 +898,15 @@ func (i *blockIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, ba // => answer is index. } - // Since keys are strictly increasing, if index > 0 then the restart point at - // index-1 will be the largest whose key is < the key sought. + // index is the first restart point with key >= search key. Define the keys + // between a restart point and the next restart point as belonging to that + // restart point. Note that index could be equal to i.numRestarts, i.e., we + // are past the last restart. + // + // Since keys are strictly increasing, if index > 0 then the restart point + // at index-1 will be the first one that has some keys belonging to it that + // are less than the search key. If index == 0, then all keys in this block + // are larger than the search key, so there is no match. targetOffset := i.restarts if index > 0 { i.offset = decodeRestart(i.data[i.restarts+4*(index-1):]) @@ -876,24 +929,41 @@ func (i *blockIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, ba for { i.offset = i.nextOffset i.readEntry() - i.decodeInternalKey(i.key) - - if i.cmp(i.ikey.UserKey, ikey.UserKey) >= 0 { + // When hidden keys are common, there is additional optimization possible + // by not caching entries that are hidden (note that some calls to + // cacheEntry don't decode the internal key before caching, but checking + // whether a key is hidden does not require full decoding). 
However, we do + need to use the blockEntry.offset in the cache for the first entry at + the restart point to do the binary search when the cache is empty -- so + we would need to cache that first entry (though not the key) even if + it was hidden. Our current assumption is that if there are large numbers + of hidden keys we will be able to skip whole blocks (using block + property filters) so we don't bother optimizing. + hiddenPoint := i.decodeInternalKey(i.key) + + // NB: we don't use the hiddenPoint return value of decodeInternalKey + // since we want to stop as soon as we reach a key >= the search key, so + // that we can reverse. + if i.cmp(i.ikey.UserKey, key) >= 0 { // The current key is greater than or equal to our search key. Back up to // the previous key which was less than our search key. Note that this for // loop will execute at least once with this if-block not being true, so // the key we are backing up to is the last one this loop cached. - i.Prev() - // i.Prev() has already initialized i.lazyValue. - return &i.ikey, i.lazyValue + return i.Prev() } if i.nextOffset >= targetOffset { - // We've reached the end of the current restart block. Return the current - // key. When the restart interval is 1, the first iteration of the for - // loop will bring us here. In that case ikey is backed by the block so - // we get the desired key stability guarantee for the lifetime of the - // blockIter. + // We've reached the end of the current restart block. Return the + // current key if not hidden, else call Prev(). + // + // When the restart interval is 1, the first iteration of the for loop + // will bring us here. In that case ikey is backed by the block so we + // get the desired key stability guarantee for the lifetime of the + // blockIter. That is, we never cache anything and therefore never + // return a key backed by cachedBuf. 
+ if hiddenPoint { + return i.Prev() + } break } @@ -923,7 +993,10 @@ func (i *blockIter) First() (*InternalKey, base.LazyValue) { } i.clearCache() i.readEntry() - i.decodeInternalKey(i.key) + hiddenPoint := i.decodeInternalKey(i.key) + if hiddenPoint { + return i.Next() + } if !i.lazyValueHandling.hasValuePrefix || base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet { i.lazyValue = base.MakeInPlaceValue(i.val) @@ -958,7 +1031,10 @@ func (i *blockIter) Last() (*InternalKey, base.LazyValue) { i.readEntry() } - i.decodeInternalKey(i.key) + hiddenPoint := i.decodeInternalKey(i.key) + if hiddenPoint { + return i.Prev() + } if !i.lazyValueHandling.hasValuePrefix || base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet { i.lazyValue = base.MakeInPlaceValue(i.val) @@ -988,18 +1064,26 @@ func (i *blockIter) Next() (*InternalKey, base.LazyValue) { i.clearCache() } +start: i.offset = i.nextOffset if !i.valid() { return nil, base.LazyValue{} } i.readEntry() // Manually inlined version of i.decodeInternalKey(i.key). + hiddenPoint := false if n := len(i.key) - 8; n >= 0 { - i.ikey.Trailer = binary.LittleEndian.Uint64(i.key[n:]) + trailer := binary.LittleEndian.Uint64(i.key[n:]) + hiddenPoint = i.hideObsoletePoints && + (trailer&trailerObsoleteBit != 0) + i.ikey.Trailer = trailer & trailerObsoleteMask i.ikey.UserKey = i.key[:n:n] if i.globalSeqNum != 0 { i.ikey.SetSeqNum(i.globalSeqNum) } + if hiddenPoint { + goto start + } } else { i.ikey.Trailer = uint64(InternalKeyKindInvalid) i.ikey.UserKey = nil @@ -1015,8 +1099,8 @@ func (i *blockIter) Next() (*InternalKey, base.LazyValue) { return &i.ikey, i.lazyValue } -// nextPrefix is used for implementing NPrefix. -func (i *blockIter) nextPrefix(succKey []byte) (*InternalKey, base.LazyValue) { +// NextPrefix implements (base.InternalIterator).NextPrefix. 
+func (i *blockIter) NextPrefix(succKey []byte) (*InternalKey, base.LazyValue) { if i.lazyValueHandling.hasValuePrefix { return i.nextPrefixV3(succKey) } @@ -1141,6 +1225,7 @@ func (i *blockIter) nextPrefixV3(succKey []byte) (*InternalKey, base.LazyValue) // The trailer is written in little endian, so the key kind is the first // byte in the trailer that is encoded in the slice [unshared-8:unshared]. keyKind := InternalKeyKind((*[manual.MaxArrayLen]byte)(ptr)[unshared-8]) + keyKind = keyKind & base.InternalKeyKindSSTableInternalObsoleteMask prefixChanged := false if keyKind == InternalKeyKindSet { if invariants.Enabled && value == 0 { @@ -1256,8 +1341,12 @@ func (i *blockIter) nextPrefixV3(succKey []byte) (*InternalKey, base.LazyValue) i.key = i.fullKey } // Manually inlined version of i.decodeInternalKey(i.key). + hiddenPoint := false if n := len(i.key) - 8; n >= 0 { - i.ikey.Trailer = binary.LittleEndian.Uint64(i.key[n:]) + trailer := binary.LittleEndian.Uint64(i.key[n:]) + hiddenPoint = i.hideObsoletePoints && + (trailer&trailerObsoleteBit != 0) + i.ikey.Trailer = trailer & trailerObsoleteMask i.ikey.UserKey = i.key[:n:n] if i.globalSeqNum != 0 { i.ikey.SetSeqNum(i.globalSeqNum) @@ -1273,6 +1362,9 @@ func (i *blockIter) nextPrefixV3(succKey []byte) (*InternalKey, base.LazyValue) } if prefixChanged || i.cmp(i.ikey.UserKey, succKey) >= 0 { // Prefix has changed. 
+ if hiddenPoint { + return i.Next() + } if invariants.Enabled && !i.lazyValueHandling.hasValuePrefix { panic(errors.AssertionFailedf("nextPrefixV3 being run for non-v3 sstable")) } @@ -1294,35 +1386,30 @@ func (i *blockIter) nextPrefixV3(succKey []byte) (*InternalKey, base.LazyValue) return i.SeekGE(succKey, base.SeekGEFlagsNone) } -// NextPrefix implements (base.InternalIterator).NextPrefix -func (i *blockIter) NextPrefix(succKey []byte) (*InternalKey, base.LazyValue) { - const nextsBeforeSeek = 3 - k, v := i.Next() - for j := 1; k != nil && i.cmp(k.UserKey, succKey) < 0; j++ { - if j >= nextsBeforeSeek { - return i.SeekGE(succKey, base.SeekGEFlagsNone) - } - k, v = i.Next() - } - return k, v -} - // Prev implements internalIterator.Prev, as documented in the pebble // package. func (i *blockIter) Prev() (*InternalKey, base.LazyValue) { - if n := len(i.cached) - 1; n >= 0 { +start: + for n := len(i.cached) - 1; n >= 0; n-- { i.nextOffset = i.offset e := &i.cached[n] i.offset = e.offset i.val = getBytes(unsafe.Pointer(uintptr(i.ptr)+uintptr(e.valStart)), int(e.valSize)) // Manually inlined version of i.decodeInternalKey(i.key). i.key = i.cachedBuf[e.keyStart:e.keyEnd] + hiddenPoint := false if n := len(i.key) - 8; n >= 0 { - i.ikey.Trailer = binary.LittleEndian.Uint64(i.key[n:]) + trailer := binary.LittleEndian.Uint64(i.key[n:]) + hiddenPoint = i.hideObsoletePoints && + (trailer&trailerObsoleteBit != 0) + i.ikey.Trailer = trailer & trailerObsoleteMask i.ikey.UserKey = i.key[:n:n] if i.globalSeqNum != 0 { i.ikey.SetSeqNum(i.globalSeqNum) } + if hiddenPoint { + continue + } } else { i.ikey.Trailer = uint64(InternalKeyKindInvalid) i.ikey.UserKey = nil @@ -1360,6 +1447,8 @@ func (i *blockIter) Prev() (*InternalKey, base.LazyValue) { // index ≤ h < upper offset := decodeRestart(i.data[i.restarts+4*h:]) if offset < targetOffset { + // Looking for the first restart that has offset >= targetOffset, so + // ignore h and earlier. 
index = h + 1 // preserves f(i-1) == false } else { upper = h // preserves f(j) == true @@ -1369,20 +1458,35 @@ func (i *blockIter) Prev() (*InternalKey, base.LazyValue) { // => answer is index. } + // index is first restart with offset >= targetOffset. Note that + // targetOffset may not be at a restart point since one can call Prev() + // after Next() (so the cache was not populated) and targetOffset refers to + // the current entry. index-1 must have an offset < targetOffset (it can't + // be equal to targetOffset since the binary search would have selected that + // as the index). i.offset = 0 if index > 0 { i.offset = decodeRestart(i.data[i.restarts+4*(index-1):]) } + // TODO(sumeer): why is the else case not an error given targetOffset is a + // valid offset. i.readEntry() + // We stop when i.nextOffset == targetOffset since the targetOffset is the + // entry we are stepping back from, and we don't need to cache the entry + // before it, since it is the candidate to return. for i.nextOffset < targetOffset { i.cacheEntry() i.offset = i.nextOffset i.readEntry() } - i.decodeInternalKey(i.key) + hiddenPoint := i.decodeInternalKey(i.key) + if hiddenPoint { + // Use the cache. + goto start + } if !i.lazyValueHandling.hasValuePrefix || base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet { i.lazyValue = base.MakeInPlaceValue(i.val) diff --git a/sstable/block_property.go b/sstable/block_property.go index 1e986fbc3e..9bec5294dc 100644 --- a/sstable/block_property.go +++ b/sstable/block_property.go @@ -117,7 +117,7 @@ type BlockPropertyCollector interface { } // SuffixReplaceableBlockCollector is an extension to the BlockPropertyCollector -// interface that allows a block property collector to indicate the it supports +// interface that allows a block property collector to indicate that it supports // being *updated* during suffix replacement, i.e. when an existing SST in which // all keys have the same key suffix is updated to have a new suffix. 
// diff --git a/sstable/block_property_test.go b/sstable/block_property_test.go index ece6884a52..e7298eccb7 100644 --- a/sstable/block_property_test.go +++ b/sstable/block_property_test.go @@ -1320,7 +1320,8 @@ func runBlockPropsCmd(r *Reader, td *datadriven.TestData) string { if err != nil { return err.Error() } - if err := subiter.init(r.Compare, subIndex.Get(), 0 /* globalSeqNum */); err != nil { + if err := subiter.init( + r.Compare, subIndex.Get(), 0 /* globalSeqNum */, false); err != nil { return err.Error() } for key, value := subiter.First(); key != nil; key, value = subiter.Next() { diff --git a/sstable/block_test.go b/sstable/block_test.go index d191247640..7d1aed6e9f 100644 --- a/sstable/block_test.go +++ b/sstable/block_test.go @@ -56,7 +56,7 @@ func TestBlockWriterWithPrefix(t *testing.T) { valuePrefix valuePrefix, setHasSameKeyPrefix bool) { w.addWithOptionalValuePrefix( - key, value, len(key.UserKey), addValuePrefix, valuePrefix, setHasSameKeyPrefix) + key, false, value, len(key.UserKey), addValuePrefix, valuePrefix, setHasSameKeyPrefix) } addAdapter( ikey("apple"), []byte("red"), false, 0, true) diff --git a/sstable/format.go b/sstable/format.go index e8c620028f..919eb69f8d 100644 --- a/sstable/format.go +++ b/sstable/format.go @@ -29,6 +29,49 @@ const ( TableFormatMax = TableFormatPebblev4 ) +// TableFormatPebblev4, in addition to DELSIZED, introduces the use of +// InternalKeyKindSSTableInternalObsoleteBit. See +// https://github.com/cockroachdb/pebble/issues/2465. +// +// TODO(sumeer): write a long comment using the text in that issue, including +// the definition of obsolete and the additional key-kind transformations that +// need to go with it. And that obsolete bit is not permitted for Merge kinds. +// +// Setting the obsolete bit on point keys is advanced usage, so we support 2 +// modes, both of which must be truthful when setting the obsolete bit, but +// vary in when they don't set the obsolete bit. 
+// +// - Non-strict: In this mode, the bit does not need to be set for keys that +// are obsolete. Additionally, any sstable containing MERGE keys can only +// use this mode. An iterator over such an sstable, when configured to +// hideObsoletePoints, can expose multiple internal keys per user key, and +// can expose keys that are deleted by rangedels in the same sstable. This +// is the mode that non-advanced users should use. Pebble without +// disaggregated storage will also use this mode and will best-effort set +// the obsolete bit, to optimize iteration when snapshots have retained many +// obsolete keys. +// +// - Strict: In this mode, every obsolete key must have the obsolete bit set, +// and no MERGE keys are permitted. An iterator over such an sstable, when +// configured to hideObsoletePoints satisfies two properties: +// - S1: will expose at most one internal key per user key, which is the +// most recent one. +// - S2: will never expose keys that are deleted by rangedels in the same +// sstable. +// +// This is the mode for two use cases in disaggregated storage (which will +// exclude parts of the key space that has MERGEs), for levels that contain +// sstables that can become foreign sstables. +// +// - Pebble compaction output to these levels that can become foreign +// sstables. +// +// - CockroachDB ingest operations that can ingest into the levels that can +// become foreign sstables. Note, these are not sstables corresponding to +// copied data for CockroachDB range snapshots. This case occurs for +// operations like index backfills: these trivially satisfy the strictness +// criteria since they only write one key per userkey. + // ParseTableFormat parses the given magic bytes and version into its // corresponding internal TableFormat. 
func ParseTableFormat(magic []byte, version uint32) (TableFormat, error) { diff --git a/sstable/internal.go b/sstable/internal.go index 039da91074..0fe7c9937d 100644 --- a/sstable/internal.go +++ b/sstable/internal.go @@ -23,6 +23,7 @@ const ( InternalKeyKindLogData = base.InternalKeyKindLogData InternalKeyKindSingleDelete = base.InternalKeyKindSingleDelete InternalKeyKindRangeDelete = base.InternalKeyKindRangeDelete + InternalKeyKindSetWithDelete = base.InternalKeyKindSetWithDelete InternalKeyKindDeleteSized = base.InternalKeyKindDeleteSized InternalKeyKindMax = base.InternalKeyKindMax InternalKeyKindInvalid = base.InternalKeyKindInvalid diff --git a/sstable/options.go b/sstable/options.go index 66dbdd4d0a..2fa2d2bf32 100644 --- a/sstable/options.go +++ b/sstable/options.go @@ -210,6 +210,15 @@ type WriterOptions struct { // by a wider range of tools and libraries. TableFormat TableFormat + // IsStrictObsolete is only relevant for >= TableFormatPebblev4. See comment + // in format.go. Must be false if format < TableFormatPebblev4. + IsStrictObsolete bool + + // WritingToLowestLevel is only relevant for >= TableFormatPebblev4. It is + // used to set the obsolete bit on DEL/DELSIZED/SINGLEDEL if they are the + // youngest for a userkey. + WritingToLowestLevel bool + // TablePropertyCollectors is a list of TablePropertyCollector creation // functions. A new TablePropertyCollector is created for each sstable built // and lives for the lifetime of the table. diff --git a/sstable/properties.go b/sstable/properties.go index c85aa1db60..00881cb1ae 100644 --- a/sstable/properties.go +++ b/sstable/properties.go @@ -101,6 +101,9 @@ type Properties struct { IndexType uint32 `prop:"rocksdb.block.based.table.index.type"` // Whether delta encoding is used to encode the index values. IndexValueIsDeltaEncoded uint64 `prop:"rocksdb.index.value.is.delta.encoded"` + // For formats >= TableFormatPebblev4, this is set to true if the obsolete + // bit is strict for all the point keys. 
+ IsStrictObsolete bool `prop:"pebble.obsolete.is_strict"` // The name of the merger used in this table. Empty if no merger is used. MergerName string `prop:"rocksdb.merge.operator"` // The number of blocks in this table. @@ -349,6 +352,9 @@ func (p *Properties) save(tblFormat TableFormat, w *rawBlockWriter) { p.saveUvarint(m, unsafe.Offsetof(p.IndexSize), p.IndexSize) p.saveUint32(m, unsafe.Offsetof(p.IndexType), p.IndexType) p.saveUvarint(m, unsafe.Offsetof(p.IndexValueIsDeltaEncoded), p.IndexValueIsDeltaEncoded) + if p.IsStrictObsolete { + p.saveBool(m, unsafe.Offsetof(p.IsStrictObsolete), p.IsStrictObsolete) + } if p.MergerName != "" { p.saveString(m, unsafe.Offsetof(p.MergerName), p.MergerName) } diff --git a/sstable/reader.go b/sstable/reader.go index 710198edd5..191ca575e4 100644 --- a/sstable/reader.go +++ b/sstable/reader.go @@ -323,6 +323,8 @@ type singleLevelIterator struct { // is high). useFilter bool lastBloomFilterMatched bool + + hideObsoletePoints bool } // singleLevelIterator implements the base.InternalIterator interface. 
@@ -404,7 +406,7 @@ func (i *singleLevelIterator) init( v *virtualState, lower, upper []byte, filterer *BlockPropertiesFilterer, - useFilter bool, + useFilter, hideObsoletePoints bool, stats *base.InternalIteratorStats, rp ReaderProvider, ) error { @@ -428,7 +430,8 @@ func (i *singleLevelIterator) init( i.reader = r i.cmp = r.Compare i.stats = stats - err = i.index.initHandle(i.cmp, indexH, r.Properties.GlobalSeqNum) + i.hideObsoletePoints = hideObsoletePoints + err = i.index.initHandle(i.cmp, indexH, r.Properties.GlobalSeqNum, false) if err != nil { // blockIter.Close releases indexH and always returns a nil error _ = i.index.Close() @@ -566,7 +569,7 @@ func (i *singleLevelIterator) loadBlock(dir int8) loadBlockResult { i.err = err return loadBlockFailed } - i.err = i.data.initHandle(i.cmp, block, i.reader.Properties.GlobalSeqNum) + i.err = i.data.initHandle(i.cmp, block, i.reader.Properties.GlobalSeqNum, i.hideObsoletePoints) if i.err != nil { // The block is partially loaded, and we don't want it to appear valid. i.data.invalidate() @@ -653,12 +656,13 @@ func (i *singleLevelIterator) resolveMaybeExcluded(dir int8) intersectsResult { } func (i *singleLevelIterator) initBoundsForAlreadyLoadedBlock() { - if i.data.firstKey.UserKey == nil { + if i.data.firstUserKey() == nil { panic("initBoundsForAlreadyLoadedBlock must not be called on empty or corrupted block") } i.blockLower = i.lower if i.blockLower != nil { - if i.data.firstKey.UserKey != nil && i.cmp(i.blockLower, i.data.firstKey.UserKey) < 0 { + firstUserKey := i.data.firstUserKey() + if firstUserKey != nil && i.cmp(i.blockLower, firstUserKey) < 0 { // The lower-bound is less than the first key in the block. No need // to check the lower-bound again for this block. 
i.blockLower = nil @@ -1066,7 +1070,7 @@ func (i *singleLevelIterator) SeekLT( var dontSeekWithinBlock bool if !i.data.isDataInvalidated() && !i.index.isDataInvalidated() && i.data.valid() && i.index.valid() && - boundsCmp < 0 && i.cmp(i.data.firstKey.UserKey, key) < 0 { + boundsCmp < 0 && i.cmp(i.data.firstUserKey(), key) < 0 { // Fast-path: The bounds have moved backward, and this SeekLT is // respecting the upper bound (guaranteed by Iterator). We know that // the iterator must already be positioned within or just outside the @@ -1314,7 +1318,7 @@ func (i *singleLevelIterator) NextPrefix(succKey []byte) (*InternalKey, base.Laz if i.err != nil { return nil, base.LazyValue{} } - if key, val := i.data.nextPrefix(succKey); key != nil { + if key, val := i.data.NextPrefix(succKey); key != nil { if i.blockUpper != nil { cmp := i.cmp(key.UserKey, i.blockUpper) if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 { @@ -1763,8 +1767,7 @@ func (i *twoLevelIterator) loadIndex(dir int8) loadBlockResult { i.err = err return loadBlockFailed } - if i.err = i.index.initHandle( - i.cmp, indexBlock, i.reader.Properties.GlobalSeqNum); i.err == nil { + if i.err = i.index.initHandle(i.cmp, indexBlock, i.reader.Properties.GlobalSeqNum, false); i.err == nil { return loadBlockOK } return loadBlockFailed @@ -1842,7 +1845,7 @@ func (i *twoLevelIterator) init( v *virtualState, lower, upper []byte, filterer *BlockPropertiesFilterer, - useFilter bool, + useFilter, hideObsoletePoints bool, stats *base.InternalIteratorStats, rp ReaderProvider, ) error { @@ -1867,7 +1870,8 @@ func (i *twoLevelIterator) init( i.reader = r i.cmp = r.Compare i.stats = stats - err = i.topLevelIndex.initHandle(i.cmp, topLevelIndexH, r.Properties.GlobalSeqNum) + i.hideObsoletePoints = hideObsoletePoints + err = i.topLevelIndex.initHandle(i.cmp, topLevelIndexH, r.Properties.GlobalSeqNum, false) if err != nil { // blockIter.Close releases topLevelIndexH and always returns a nil error _ = i.topLevelIndex.Close() @@ -2907,21 
+2911,20 @@ func (v *VirtualReader) NewCompactionIter( return v.reader.newCompactionIter(bytesIterated, rp, &v.vState) } -// NewIterWithBlockPropertyFiltersAndContext wraps +// NewIterWithBlockPropertyFiltersAndContextEtc wraps // Reader.NewIterWithBlockPropertyFiltersAndContext. We assume that the passed // in [lower, upper) bounds will have at least some overlap with the virtual // sstable bounds. No overlap is not currently supported in the iterator. -func (v *VirtualReader) NewIterWithBlockPropertyFiltersAndContext( +func (v *VirtualReader) NewIterWithBlockPropertyFiltersAndContextEtc( ctx context.Context, lower, upper []byte, filterer *BlockPropertiesFilterer, - useFilterBlock bool, + hideObsoletePoints, useFilterBlock bool, stats *base.InternalIteratorStats, rp ReaderProvider, ) (Iterator, error) { return v.reader.newIterWithBlockPropertyFiltersAndContext( - ctx, - lower, upper, filterer, useFilterBlock, stats, rp, &v.vState, + ctx, lower, upper, filterer, hideObsoletePoints, useFilterBlock, stats, rp, &v.vState, ) } @@ -3060,24 +3063,29 @@ func (r *Reader) NewIterWithBlockPropertyFilters( ) (Iterator, error) { return r.newIterWithBlockPropertyFiltersAndContext( context.Background(), - lower, upper, filterer, useFilterBlock, stats, rp, nil, + lower, upper, filterer, false, useFilterBlock, stats, rp, nil, ) } -// NewIterWithBlockPropertyFiltersAndContext is similar to +// NewIterWithBlockPropertyFiltersAndContextEtc is similar to // NewIterWithBlockPropertyFilters and additionally accepts a context for // tracing. -func (r *Reader) NewIterWithBlockPropertyFiltersAndContext( +// +// If hideObsoletePoints, the callee assumes that filterer already includes +// obsoleteKeyBlockPropertyFilter. +// +// TODO(sumeer): clean this up since the caller should not have to know that +// the table format >= v4. 
+func (r *Reader) NewIterWithBlockPropertyFiltersAndContextEtc( ctx context.Context, lower, upper []byte, filterer *BlockPropertiesFilterer, - useFilterBlock bool, + hideObsoletePoints, useFilterBlock bool, stats *base.InternalIteratorStats, rp ReaderProvider, ) (Iterator, error) { return r.newIterWithBlockPropertyFiltersAndContext( - ctx, - lower, upper, filterer, useFilterBlock, stats, rp, nil, + ctx, lower, upper, filterer, hideObsoletePoints, useFilterBlock, stats, rp, nil, ) } @@ -3085,6 +3093,7 @@ func (r *Reader) newIterWithBlockPropertyFiltersAndContext( ctx context.Context, lower, upper []byte, filterer *BlockPropertiesFilterer, + hideObsoletePoints bool, useFilterBlock bool, stats *base.InternalIteratorStats, rp ReaderProvider, @@ -3095,7 +3104,7 @@ func (r *Reader) newIterWithBlockPropertyFiltersAndContext( // until the final iterator closes. if r.Properties.IndexType == twoLevelIndex { i := twoLevelIterPool.Get().(*twoLevelIterator) - err := i.init(ctx, r, v, lower, upper, filterer, useFilterBlock, stats, rp) + err := i.init(ctx, r, v, lower, upper, filterer, useFilterBlock, hideObsoletePoints, stats, rp) if err != nil { return nil, err } @@ -3103,7 +3112,7 @@ func (r *Reader) newIterWithBlockPropertyFiltersAndContext( } i := singleLevelIterPool.Get().(*singleLevelIterator) - err := i.init(ctx, r, v, lower, upper, filterer, useFilterBlock, stats, rp) + err := i.init(ctx, r, v, lower, upper, filterer, useFilterBlock, hideObsoletePoints, stats, rp) if err != nil { return nil, err } @@ -3132,11 +3141,7 @@ func (r *Reader) newCompactionIter( ) (Iterator, error) { if r.Properties.IndexType == twoLevelIndex { i := twoLevelIterPool.Get().(*twoLevelIterator) - err := i.init( - context.Background(), - r, v, nil /* lower */, nil /* upper */, nil, - false /* useFilter */, nil /* stats */, rp, - ) + err := i.init(context.Background(), r, v, nil, nil, nil, false, false, nil, rp) if err != nil { return nil, err } @@ -3147,10 +3152,7 @@ func (r *Reader) 
newCompactionIter( }, nil } i := singleLevelIterPool.Get().(*singleLevelIterator) - err := i.init( - context.Background(), r, v, nil /* lower */, nil, /* upper */ - nil, false /* useFilter */, nil /* stats */, rp, - ) + err := i.init(context.Background(), r, v, nil, nil, nil, false, false, nil, rp) if err != nil { return nil, err } @@ -3176,7 +3178,7 @@ func (r *Reader) NewRawRangeDelIter() (keyspan.FragmentIterator, error) { return nil, err } i := &fragmentBlockIter{} - if err := i.blockIter.initHandle(r.Compare, h, r.Properties.GlobalSeqNum); err != nil { + if err := i.blockIter.initHandle(r.Compare, h, r.Properties.GlobalSeqNum, false); err != nil { return nil, err } return i, nil @@ -3197,7 +3199,7 @@ func (r *Reader) NewRawRangeKeyIter() (keyspan.FragmentIterator, error) { return nil, err } i := rangeKeyFragmentBlockIterPool.Get().(*rangeKeyFragmentBlockIter) - if err := i.blockIter.initHandle(r.Compare, h, r.Properties.GlobalSeqNum); err != nil { + if err := i.blockIter.initHandle(r.Compare, h, r.Properties.GlobalSeqNum, false); err != nil { return nil, err } return i, nil @@ -3357,7 +3359,7 @@ func (r *Reader) transformRangeDelV1(b []byte) ([]byte, error) { // tombstones. We need properly fragmented and sorted range tombstones in // order to serve from them directly. 
iter := &blockIter{} - if err := iter.init(r.Compare, b, r.Properties.GlobalSeqNum); err != nil { + if err := iter.init(r.Compare, b, r.Properties.GlobalSeqNum, false); err != nil { return nil, err } var tombstones []keyspan.Span @@ -3549,7 +3551,7 @@ func (r *Reader) Layout() (*Layout, error) { if err != nil { return nil, err } - if err := iter.init(r.Compare, subIndex.Get(), 0 /* globalSeqNum */); err != nil { + if err := iter.init(r.Compare, subIndex.Get(), 0, false); err != nil { return nil, err } for key, value := iter.First(); key != nil; key, value = iter.Next() { diff --git a/sstable/reader_test.go b/sstable/reader_test.go index 4d432a9533..fdf377ab0d 100644 --- a/sstable/reader_test.go +++ b/sstable/reader_test.go @@ -414,15 +414,9 @@ func TestVirtualReader(t *testing.T) { } var stats base.InternalIteratorStats - iter, err := v.NewIterWithBlockPropertyFiltersAndContext( - context.Background(), - lower, - upper, - nil, - false, - &stats, - TrivialReaderProvider{Reader: r}, - ) + iter, err := v.NewIterWithBlockPropertyFiltersAndContextEtc( + context.Background(), lower, upper, nil, false, false, + &stats, TrivialReaderProvider{Reader: r}) if err != nil { return err.Error() } @@ -473,7 +467,7 @@ func TestReader(t *testing.T) { "prefixFilter": "testdata/prefixreader", } - for _, format := range []TableFormat{TableFormatPebblev2, TableFormatPebblev3} { + for _, format := range []TableFormat{TableFormatPebblev2, TableFormatPebblev3, TableFormatPebblev4} { for dName, blockSize := range blockSizes { for iName, indexBlockSize := range blockSizes { for lName, tableOpt := range writerOpts { @@ -497,6 +491,29 @@ func TestReader(t *testing.T) { } } +func TestReaderHideObsolete(t *testing.T) { + blockSizes := map[string]int{ + "1bytes": 1, + "5bytes": 5, + "10bytes": 10, + "25bytes": 25, + "Maxbytes": math.MaxInt32, + } + for dName, blockSize := range blockSizes { + opts := WriterOptions{ + TableFormat: TableFormatPebblev4, + BlockSize: blockSize, + IndexBlockSize: 
blockSize, + Comparer: testkeys.Comparer, + } + t.Run(fmt.Sprintf("blockSize=%s", dName), func(t *testing.T) { + runTestReader( + t, opts, "testdata/reader_hide_obsolete", + nil /* Reader */, 0, false, true) + }) + } +} + func TestHamletReader(t *testing.T) { prebuiltSSTs := []string{ "testdata/h.ldb", @@ -722,12 +739,23 @@ func runTestReader( } var stats base.InternalIteratorStats r.Properties.GlobalSeqNum = seqNum - var filterer *BlockPropertiesFilterer + hideObsoletePoints := false + var bpfs []BlockPropertyFilter + if d.HasArg("hide-obsolete-points") { + d.ScanArgs(t, "hide-obsolete-points", &hideObsoletePoints) + if hideObsoletePoints { + bpfs = append(bpfs, obsoleteKeyBlockPropertyFilter{}) + } + } if d.HasArg("block-property-filter") { var filterMin, filterMax uint64 d.ScanArgs(t, "block-property-filter", &filterMin, &filterMax) bpf := NewTestKeysBlockPropertyFilter(filterMin, filterMax) - filterer = newBlockPropertiesFilterer([]BlockPropertyFilter{bpf}, nil) + bpfs = append(bpfs, bpf) + } + var filterer *BlockPropertiesFilterer + if len(bpfs) > 0 { + filterer = newBlockPropertiesFilterer(bpfs, nil) intersects, err := filterer.intersectsUserPropsAndFinishInit(r.Properties.UserProperties) if err != nil { @@ -737,10 +765,12 @@ func runTestReader( return "table does not intersect BlockPropertyFilter" } } - iter, err := r.NewIterWithBlockPropertyFilters( + iter, err := r.NewIterWithBlockPropertyFiltersAndContextEtc( + context.Background(), nil, /* lower */ nil, /* upper */ filterer, + hideObsoletePoints, true, /* use filter block */ &stats, TrivialReaderProvider{Reader: r}, @@ -1897,6 +1927,114 @@ func BenchmarkIteratorScanNextPrefix(b *testing.B) { } } +func BenchmarkIteratorScanObsolete(b *testing.B) { + options := WriterOptions{ + BlockSize: 32 << 10, + BlockRestartInterval: 16, + FilterPolicy: nil, + Compression: SnappyCompression, + Comparer: testkeys.Comparer, + } + const keyCount = 1 << 20 + const keyLen = 10 + + // Take the very large keyspace consisting 
of alphabetic characters of + // lengths up to unsharedPrefixLen and reduce it down to keyCount keys by + // picking every 1 key every keyCount keys. + keys := testkeys.Alpha(keyLen) + keys = keys.EveryN(keys.Count() / keyCount) + if keys.Count() < keyCount { + b.Fatalf("expected %d keys, found %d", keyCount, keys.Count()) + } + expectedKeyCount := keys.Count() + keyBuf := make([]byte, keyLen) + setupBench := func(b *testing.B, tableFormat TableFormat, cacheSize int64) *Reader { + mem := vfs.NewMem() + f0, err := mem.Create("bench") + require.NoError(b, err) + options.TableFormat = tableFormat + w := NewWriter(objstorageprovider.NewFileWritable(f0), options) + val := make([]byte, 100) + rng := rand.New(rand.NewSource(100)) + for i := 0; i < keys.Count(); i++ { + n := testkeys.WriteKey(keyBuf, keys, i) + key := keyBuf[:n] + rng.Read(val) + forceObsolete := true + if i == 0 { + forceObsolete = false + } + require.NoError(b, w.AddWithForceObsolete( + base.MakeInternalKey(key, 0, InternalKeyKindSet), val, forceObsolete)) + } + require.NoError(b, w.Close()) + c := cache.New(cacheSize) + defer c.Unref() + // Re-open the filename for reading. + f0, err = mem.Open("bench") + require.NoError(b, err) + r, err := newReader(f0, ReaderOptions{ + Cache: c, + Comparer: testkeys.Comparer, + }) + require.NoError(b, err) + return r + } + for _, format := range []TableFormat{TableFormatPebblev3, TableFormatPebblev4} { + b.Run(fmt.Sprintf("format=%s", format.String()), func(b *testing.B) { + // 150MiB results in a high cache hit rate for both formats. 
+ for _, cacheSize := range []int64{1, 150 << 20} { + b.Run(fmt.Sprintf("cache-size=%s", humanize.IEC.Int64(cacheSize)), + func(b *testing.B) { + r := setupBench(b, format, cacheSize) + defer func() { + require.NoError(b, r.Close()) + }() + for _, hideObsoletePoints := range []bool{false, true} { + b.Run(fmt.Sprintf("hide-obsolete=%t", hideObsoletePoints), func(b *testing.B) { + var filterer *BlockPropertiesFilterer + if format == TableFormatPebblev4 && hideObsoletePoints { + filterer = newBlockPropertiesFilterer( + []BlockPropertyFilter{obsoleteKeyBlockPropertyFilter{}}, nil) + intersects, err := + filterer.intersectsUserPropsAndFinishInit(r.Properties.UserProperties) + if err != nil { + b.Fatalf("%s", err.Error()) + } + if !intersects { + b.Fatalf("sstable does not intersect") + } + } + iter, err := r.NewIterWithBlockPropertyFiltersAndContextEtc( + context.Background(), nil, nil, filterer, hideObsoletePoints, + true, nil, TrivialReaderProvider{Reader: r}) + require.NoError(b, err) + b.ResetTimer() + for i := 0; i < b.N; i++ { + count := 0 + k, _ := iter.First() + for k != nil { + count++ + k, _ = iter.Next() + } + if format == TableFormatPebblev4 && hideObsoletePoints { + if count != 1 { + b.Fatalf("found %d points", count) + } + } else { + if count != expectedKeyCount { + b.Fatalf("found %d points", count) + } + } + } + }) + } + }) + } + }) + } +} + func newReader(r ReadableFile, o ReaderOptions, extraOpts ...ReaderOption) (*Reader, error) { readable, err := NewSimpleReadable(r) if err != nil { diff --git a/sstable/suffix_rewriter.go b/sstable/suffix_rewriter.go index b1423fddb8..f1d5c5fd13 100644 --- a/sstable/suffix_rewriter.go +++ b/sstable/suffix_rewriter.go @@ -184,7 +184,7 @@ func rewriteBlocks( if err != nil { return err } - if err := iter.init(r.Compare, inputBlock, r.Properties.GlobalSeqNum); err != nil { + if err := iter.init(r.Compare, inputBlock, r.Properties.GlobalSeqNum, false); err != nil { return err } @@ -472,7 +472,8 @@ func 
RewriteKeySuffixesViaWriter( if err != nil { return nil, err } - if w.addPoint(scratch, val); err != nil { + // TODO: correct forceObsolete value. + if w.addPoint(scratch, val, false); err != nil { return nil, err } k, v = i.Next() diff --git a/sstable/suffix_rewriter_test.go b/sstable/suffix_rewriter_test.go index 0d9865c775..c3132d9b70 100644 --- a/sstable/suffix_rewriter_test.go +++ b/sstable/suffix_rewriter_test.go @@ -76,6 +76,9 @@ func TestRewriteSuffixProps(t *testing.T) { t.Run(fmt.Sprintf("byBlocks=%v", byBlocks), func(t *testing.T) { rewrittenSST := &memFile{} if byBlocks { + if format == TableFormatPebblev4 { + expectedProps["obsolete-key"] = string([]byte{3}) + } _, rewriteFormat, err := rewriteKeySuffixesInBlocks( r, rewrittenSST, rwOpts, from, to, 8) // rewriteFormat is equal to the original format, since @@ -83,6 +86,9 @@ func TestRewriteSuffixProps(t *testing.T) { require.Equal(t, wOpts.TableFormat, rewriteFormat) require.NoError(t, err) } else { + if rwOpts.TableFormat != TableFormatPebblev4 { + delete(expectedProps, "obsolete-key") + } _, err := RewriteKeySuffixesViaWriter(r, rewrittenSST, rwOpts, from, to) require.NoError(t, err) } @@ -119,7 +125,9 @@ func TestRewriteSuffixProps(t *testing.T) { for !newDecoder.done() { id, val, err := newDecoder.next() require.NoError(t, err) - newProps[id] = val + if int(id) < len(newProps) { + newProps[id] = val + } } require.Equal(t, oldProps[0], newProps[1]) ival.decode(newProps[0]) diff --git a/sstable/testdata/reader_hide_obsolete/iter b/sstable/testdata/reader_hide_obsolete/iter new file mode 100644 index 0000000000..173a0f6ad8 --- /dev/null +++ b/sstable/testdata/reader_hide_obsolete/iter @@ -0,0 +1,154 @@ +build +a.SET.1:A +b.SINGLEDEL.4: +b.SET.2:B +c.DEL.5: +c.SET.3:C +d.SET.4:D4 +d.SET.2:D2 +---- + +iter +first +next +next +next +next +next +next +next +prev +prev +prev +prev +prev +prev +prev +prev +---- +:A +: +:B +: +:C +:D4 +:D2 +. +:D2 +:D4 +:C +: +:B +: +:A +. 
+ +iter hide-obsolete-points=true +first +next +next +next +next +prev +prev +prev +prev +prev +---- +:A +: +: +:D4 +. +:D4 +: +: +:A +. + +iter hide-obsolete-points=true +seek-ge c +prev +prev +next +next +next +seek-lt c +next +prev +prev +---- +: +: +:A +: +: +:D4 +: +: +: +:A + +build +a.SET.3:A +a.MERGE.2:A2 +b.MERGE.20:B20 +b.MERGE.18:B18 +b.SET.16:B16 +b.SET.14:B14 +c.MERGE.30:C30 +c.MERGE.28:C28 +c.DEL.26: +---- + +iter +first +next +next +next +next +next +next +next +---- +:A +:A2 +:B20 +:B18 +:B16 +:B14 +:C30 +:C28 + +iter hide-obsolete-points=true +first +next +next +next +next +next +next +last +prev +prev +prev +prev +prev +prev +---- +:A +:B20 +:B18 +:B16 +:C30 +:C28 +. +:C28 +:C30 +:B16 +:B18 +:B20 +:A +. + +# TODO(sumeer): writing to lowest level. diff --git a/sstable/testdata/virtual_reader b/sstable/testdata/virtual_reader index b5b507d245..c0765e31a9 100644 --- a/sstable/testdata/virtual_reader +++ b/sstable/testdata/virtual_reader @@ -159,7 +159,7 @@ virtualize dd.SET.5-ddd.SET.6 ---- bounds: [dd#5,1-ddd#6,1] filenum: 000008 -props: 8,1 +props: 7,1 # Check lower bound enforcement during SeekPrefixGE. 
iter @@ -566,7 +566,7 @@ virtualize f.SET.6-h.SET.9 ---- bounds: [f#6,1-h#9,1] filenum: 000020 -props: 6,0 +props: 5,0 iter seek-lt z diff --git a/sstable/testdata/writer_value_blocks b/sstable/testdata/writer_value_blocks index 869b849afb..8590e42925 100644 --- a/sstable/testdata/writer_value_blocks +++ b/sstable/testdata/writer_value_blocks @@ -132,7 +132,7 @@ layout 72 record (21 = 3 [0] + 14 + 4) [restart] blue@8#16,1:value handle {valueLen:6 blockNum:0 offsetInBlock:5} 93 [restart 72] - 101 [trailer compression=none checksum=0xdc74261] + 101 [trailer compression=none checksum=0x4e65b9b6] 106 data (29) 106 record (21 = 3 [0] + 14 + 4) [restart] blue@6#16,1:value handle {valueLen:15 blockNum:1 offsetInBlock:0} @@ -146,77 +146,78 @@ layout 173 block:38/29 [restart] 192 [restart 173] 200 [trailer compression=none checksum=0x21d27815] - 205 index (27) + 205 index (30) 205 block:72/29 [restart] - 224 [restart 205] - 232 [trailer compression=none checksum=0xbae26eb3] - 237 index (22) - 237 block:106/29 [restart] - 251 [restart 237] - 259 [trailer compression=none checksum=0x802be702] - 264 top-index (77) - 264 block:140/28 [restart] - 285 block:173/27 [restart] - 305 block:205/27 [restart] - 325 block:237/22 [restart] - 340 [restart 264] - 344 [restart 285] - 348 [restart 305] - 352 [restart 325] - 341 [trailer compression=snappy checksum=0x6b2d79b] - 346 value-block (11) - 362 value-block (15) - 382 value-index (8) - 395 properties (785) - 395 pebble.num.value-blocks (27) [restart] - 422 pebble.num.values.in.value-blocks (21) - 443 pebble.value-blocks.size (21) - 464 rocksdb.block.based.table.index.type (43) - 507 rocksdb.block.based.table.prefix.filtering (20) - 527 rocksdb.block.based.table.whole.key.filtering (23) - 550 rocksdb.column.family.id (24) - 574 rocksdb.comparator (35) - 609 rocksdb.compression (16) - 625 rocksdb.compression_options (106) - 731 rocksdb.creation.time (16) - 747 rocksdb.data.size (14) - 761 rocksdb.deleted.keys (15) - 776 
rocksdb.external_sst_file.global_seqno (41) - 817 rocksdb.external_sst_file.version (14) - 831 rocksdb.filter.size (15) - 846 rocksdb.fixed.key.length (18) - 864 rocksdb.format.version (17) - 881 rocksdb.index.key.is.user.key (25) - 906 rocksdb.index.partitions (14) - 920 rocksdb.index.size (9) - 929 rocksdb.index.value.is.delta.encoded (26) - 955 rocksdb.merge.operands (18) - 973 rocksdb.merge.operator (24) - 997 rocksdb.num.data.blocks (19) - 1016 rocksdb.num.entries (11) - 1027 rocksdb.num.range-deletions (19) - 1046 rocksdb.oldest.key.time (19) - 1065 rocksdb.prefix.extractor.name (31) - 1096 rocksdb.property.collectors (22) - 1118 rocksdb.raw.key.size (16) - 1134 rocksdb.raw.value.size (14) - 1148 rocksdb.top-level.index.size (24) - 1172 [restart 395] - 1180 [trailer compression=none checksum=0x4352b7fd] - 1185 meta-index (64) - 1185 pebble.value_index block:382/8 value-blocks-index-lengths: 1(num), 2(offset), 1(length) [restart] - 1212 rocksdb.properties block:395/785 [restart] - 1237 [restart 1185] - 1241 [restart 1212] - 1249 [trailer compression=none checksum=0xe7aed935] - 1254 footer (53) - 1254 checksum type: crc32c - 1255 meta: offset=1185, length=64 - 1258 index: offset=264, length=77 - 1261 [padding] - 1295 version: 4 - 1299 magic number: 0xf09faab3f09faab3 - 1307 EOF + 227 [restart 205] + 235 [trailer compression=none checksum=0xba0b26fe] + 240 index (22) + 240 block:106/29 [restart] + 254 [restart 240] + 262 [trailer compression=none checksum=0x802be702] + 267 top-index (85) + 267 block:140/28 [restart] + 288 block:173/27 [restart] + 308 block:205/30 [restart] + 331 block:240/22 [restart] + 346 [restart 267] + 350 [restart 288] + 354 [restart 308] + 358 [restart 331] + 352 [trailer compression=snappy checksum=0x8bd0d63a] + 357 value-block (11) + 373 value-block (15) + 393 value-index (8) + 406 properties (813) + 406 obsolete-key (16) [restart] + 422 pebble.num.value-blocks (27) + 449 pebble.num.values.in.value-blocks (21) + 470 
pebble.value-blocks.size (21) + 491 rocksdb.block.based.table.index.type (43) + 534 rocksdb.block.based.table.prefix.filtering (20) + 554 rocksdb.block.based.table.whole.key.filtering (23) + 577 rocksdb.column.family.id (24) + 601 rocksdb.comparator (35) + 636 rocksdb.compression (16) + 652 rocksdb.compression_options (106) + 758 rocksdb.creation.time (16) + 774 rocksdb.data.size (14) + 788 rocksdb.deleted.keys (15) + 803 rocksdb.external_sst_file.global_seqno (41) + 844 rocksdb.external_sst_file.version (14) + 858 rocksdb.filter.size (15) + 873 rocksdb.fixed.key.length (18) + 891 rocksdb.format.version (17) + 908 rocksdb.index.key.is.user.key (25) + 933 rocksdb.index.partitions (14) + 947 rocksdb.index.size (9) + 956 rocksdb.index.value.is.delta.encoded (26) + 982 rocksdb.merge.operands (18) + 1000 rocksdb.merge.operator (24) + 1024 rocksdb.num.data.blocks (19) + 1043 rocksdb.num.entries (11) + 1054 rocksdb.num.range-deletions (19) + 1073 rocksdb.oldest.key.time (19) + 1092 rocksdb.prefix.extractor.name (31) + 1123 rocksdb.property.collectors (34) + 1157 rocksdb.raw.key.size (16) + 1173 rocksdb.raw.value.size (14) + 1187 rocksdb.top-level.index.size (24) + 1211 [restart 406] + 1219 [trailer compression=none checksum=0xa07c3fa5] + 1224 meta-index (64) + 1224 pebble.value_index block:393/8 value-blocks-index-lengths: 1(num), 2(offset), 1(length) [restart] + 1251 rocksdb.properties block:406/813 [restart] + 1276 [restart 1224] + 1280 [restart 1251] + 1288 [trailer compression=none checksum=0xfe295720] + 1293 footer (53) + 1293 checksum type: crc32c + 1294 meta: offset=1224, length=64 + 1297 index: offset=267, length=85 + 1300 [padding] + 1334 version: 4 + 1338 magic number: 0xf09faab3f09faab3 + 1346 EOF # Require that [c,e) must be in-place. 
build in-place-bound=(c,e) @@ -296,47 +297,48 @@ layout 71 block:0/66 [restart] 85 [restart 71] 93 [trailer compression=none checksum=0xf80f5bcf] - 98 properties (715) - 98 pebble.raw.point-tombstone.key.size (39) [restart] - 137 rocksdb.block.based.table.index.type (43) - 180 rocksdb.block.based.table.prefix.filtering (20) - 200 rocksdb.block.based.table.whole.key.filtering (23) - 223 rocksdb.column.family.id (24) - 247 rocksdb.comparator (35) - 282 rocksdb.compression (16) - 298 rocksdb.compression_options (106) - 404 rocksdb.creation.time (16) - 420 rocksdb.data.size (13) - 433 rocksdb.deleted.keys (15) - 448 rocksdb.external_sst_file.global_seqno (41) - 489 rocksdb.external_sst_file.version (14) - 503 rocksdb.filter.size (15) - 518 rocksdb.fixed.key.length (18) - 536 rocksdb.format.version (17) - 553 rocksdb.index.key.is.user.key (25) - 578 rocksdb.index.size (8) - 586 rocksdb.index.value.is.delta.encoded (26) - 612 rocksdb.merge.operands (18) - 630 rocksdb.merge.operator (24) - 654 rocksdb.num.data.blocks (19) - 673 rocksdb.num.entries (11) - 684 rocksdb.num.range-deletions (19) - 703 rocksdb.oldest.key.time (19) - 722 rocksdb.prefix.extractor.name (31) - 753 rocksdb.property.collectors (22) - 775 rocksdb.raw.key.size (16) - 791 rocksdb.raw.value.size (14) - 805 [restart 98] - 813 [trailer compression=none checksum=0xfb9d6722] - 818 meta-index (32) - 818 rocksdb.properties block:98/715 [restart] - 842 [restart 818] - 850 [trailer compression=none checksum=0x8ca405dc] - 855 footer (53) - 855 checksum type: crc32c - 856 meta: offset=818, length=32 - 859 index: offset=71, length=22 - 861 [padding] - 896 version: 4 - 900 magic number: 0xf09faab3f09faab3 - 908 EOF + 98 properties (743) + 98 obsolete-key (16) [restart] + 114 pebble.raw.point-tombstone.key.size (39) + 153 rocksdb.block.based.table.index.type (43) + 196 rocksdb.block.based.table.prefix.filtering (20) + 216 rocksdb.block.based.table.whole.key.filtering (23) + 239 rocksdb.column.family.id (24) + 263 
rocksdb.comparator (35) + 298 rocksdb.compression (16) + 314 rocksdb.compression_options (106) + 420 rocksdb.creation.time (16) + 436 rocksdb.data.size (13) + 449 rocksdb.deleted.keys (15) + 464 rocksdb.external_sst_file.global_seqno (41) + 505 rocksdb.external_sst_file.version (14) + 519 rocksdb.filter.size (15) + 534 rocksdb.fixed.key.length (18) + 552 rocksdb.format.version (17) + 569 rocksdb.index.key.is.user.key (25) + 594 rocksdb.index.size (8) + 602 rocksdb.index.value.is.delta.encoded (26) + 628 rocksdb.merge.operands (18) + 646 rocksdb.merge.operator (24) + 670 rocksdb.num.data.blocks (19) + 689 rocksdb.num.entries (11) + 700 rocksdb.num.range-deletions (19) + 719 rocksdb.oldest.key.time (19) + 738 rocksdb.prefix.extractor.name (31) + 769 rocksdb.property.collectors (34) + 803 rocksdb.raw.key.size (16) + 819 rocksdb.raw.value.size (14) + 833 [restart 98] + 841 [trailer compression=none checksum=0x71c150c9] + 846 meta-index (32) + 846 rocksdb.properties block:98/743 [restart] + 870 [restart 846] + 878 [trailer compression=none checksum=0x545483dc] + 883 footer (53) + 883 checksum type: crc32c + 884 meta: offset=846, length=32 + 887 index: offset=71, length=22 + 889 [padding] + 924 version: 4 + 928 magic number: 0xf09faab3f09faab3 + 936 EOF diff --git a/sstable/writer.go b/sstable/writer.go index 83ebad4ca0..6252ba12a4 100644 --- a/sstable/writer.go +++ b/sstable/writer.go @@ -132,6 +132,8 @@ type Writer struct { separator Separator successor Successor tableFormat TableFormat + isStrictObsolete bool + writingToLowestLevel bool cache *cache.Cache restartInterval int checksumType ChecksumType @@ -166,6 +168,7 @@ type Writer struct { props Properties propCollectors []TablePropertyCollector blockPropCollectors []BlockPropertyCollector + obsoleteCollector obsoleteKeyBlockPropertyCollector blockPropsEncoder blockPropertiesEncoder // filter accumulates the filter block. If populated, the filter ingests // either the output of w.split (i.e. 
a prefix extractor) if w.split is not @@ -195,6 +198,11 @@ type Writer struct { // blockBuf consists of the state which is owned by and used by the Writer client // goroutine. blockBuf blockBuf + // lastUserKeyInPrevBlock is a buffer with a copy of the last user key in + // the previous block, which is populated when the block is flushed. This is + // useful since in this case we cannot rely on + // dataBlockBuf.dataBlock.getCurUserKey(). + lastUserKeyInPrevBlock []byte coordination coordinationState @@ -673,7 +681,9 @@ func (w *Writer) Set(key, value []byte) error { if w.err != nil { return w.err } - return w.addPoint(base.MakeInternalKey(key, 0, InternalKeyKindSet), value) + // forceObsolete is false based on the assumption that no RANGEDELs in the + // sstable that delete the added points. + return w.addPoint(base.MakeInternalKey(key, 0, InternalKeyKindSet), value, false) } // Delete deletes the value for the given key. The sequence number is set to @@ -685,7 +695,9 @@ func (w *Writer) Delete(key []byte) error { if w.err != nil { return w.err } - return w.addPoint(base.MakeInternalKey(key, 0, InternalKeyKindDelete), nil) + // forceObsolete is false based on the assumption that no RANGEDELs in the + // sstable that delete the added points. + return w.addPoint(base.MakeInternalKey(key, 0, InternalKeyKindDelete), nil, false) } // DeleteRange deletes all of the keys (and values) in the range [start,end) @@ -711,7 +723,10 @@ func (w *Writer) Merge(key, value []byte) error { if w.err != nil { return w.err } - return w.addPoint(base.MakeInternalKey(key, 0, InternalKeyKindMerge), value) + // forceObsolete is false based on the assumption that no RANGEDELs in the + // sstable that delete the added points. If the user configured this writer + // to be strict-obsolete, addPoint will reject the addition of this MERGE. + return w.addPoint(base.MakeInternalKey(key, 0, InternalKeyKindMerge), value, false) } // Add adds a key/value pair to the table being written. 
For a given Writer, @@ -721,6 +736,24 @@ func (w *Writer) Merge(key, value []byte) error { // point entries. Additionally, range deletion tombstones must be fragmented // (i.e. by keyspan.Fragmenter). func (w *Writer) Add(key InternalKey, value []byte) error { + if w.isStrictObsolete { + return errors.Errorf("use AddWithForceObsolete") + } + return w.AddWithForceObsolete(key, value, false) +} + +// AddWithForceObsolete should be used when writing a strict-obsolete sstable. +// +// forceObsolete indicates whether the caller has determined that this key is +// obsolete even though it may be the latest point key for this userkey. This +// should be set to true for keys obsoleted by RANGEDELs, and is required for +// strict-obsolete sstables. +// +// Note that there are two properties, S1 and S2 (see comment in format.go) +// that strict-obsolete ssts must satisfy. S2, due to RANGEDELs, is solely the +// responsibility of the caller. S1 is solely the responsibility of the +// callee. +func (w *Writer) AddWithForceObsolete(key InternalKey, value []byte, forceObsolete bool) error { if w.err != nil { return w.err } @@ -735,7 +768,7 @@ func (w *Writer) Add(key InternalKey, value []byte) error { "pebble: range keys must be added via one of the RangeKey* functions") return w.err } - return w.addPoint(key, value) + return w.addPoint(key, value, forceObsolete) } func (w *Writer) makeAddPointDecisionV2(key InternalKey) error { @@ -745,12 +778,7 @@ func (w *Writer) makeAddPointDecisionV2(key InternalKey) error { return nil } if !w.disableKeyOrderChecks { - prevPointUserKey := w.dataBlockBuf.dataBlock.curKey - n := len(prevPointUserKey) - base.InternalTrailerLen - if n < 0 { - panic("corrupt key in blockWriter buffer") - } - prevPointUserKey = prevPointUserKey[:n:n] + prevPointUserKey := w.dataBlockBuf.dataBlock.getCurUserKey() cmpUser := w.compare(prevPointUserKey, key.UserKey) if cmpUser > 0 || (cmpUser == 0 && prevTrailer <= key.Trailer) { return errors.Errorf( @@ -762,9 
+790,17 @@ func (w *Writer) makeAddPointDecisionV2(key InternalKey) error { return nil } +// REQUIRES: at least one point has been written to the Writer. +func (w *Writer) getLastPointUserKey() []byte { + if w.dataBlockBuf.dataBlock.nEntries == 0 { + return w.lastUserKeyInPrevBlock + } + return w.dataBlockBuf.dataBlock.getCurUserKey() +} + func (w *Writer) makeAddPointDecisionV3( key InternalKey, valueLen int, -) (setHasSamePrefix bool, writeToValueBlock bool, err error) { +) (setHasSamePrefix bool, writeToValueBlock bool, isObsolete bool, err error) { prevPointKeyInfo := w.lastPointKeyInfo w.lastPointKeyInfo.userKeyLen = len(key.UserKey) w.lastPointKeyInfo.prefixLen = w.lastPointKeyInfo.userKeyLen @@ -772,18 +808,15 @@ func (w *Writer) makeAddPointDecisionV3( w.lastPointKeyInfo.prefixLen = w.split(key.UserKey) } w.lastPointKeyInfo.trailer = key.Trailer - if w.dataBlockBuf.dataBlock.nEntries == 0 { - return false, false, nil - } - prevPointUserKey := w.dataBlockBuf.dataBlock.curKey - n := len(prevPointUserKey) - base.InternalTrailerLen - if n < 0 { - panic("corrupt key in blockWriter buffer") + if !w.meta.HasPointKeys { + return false, false, false, nil } - prevPointUserKey = prevPointUserKey[:n:n] + keyKind := base.TrailerKind(key.Trailer) + prevPointUserKey := w.getLastPointUserKey() prevPointKey := InternalKey{UserKey: prevPointUserKey, Trailer: prevPointKeyInfo.trailer} - considerWriteToValueBlock := base.TrailerKind(prevPointKeyInfo.trailer) == InternalKeyKindSet && - base.TrailerKind(key.Trailer) == InternalKeyKindSet + prevKeyKind := base.TrailerKind(prevPointKeyInfo.trailer) + considerWriteToValueBlock := prevKeyKind == InternalKeyKindSet && + keyKind == InternalKeyKindSet if considerWriteToValueBlock && !w.requiredInPlaceValueBound.IsEmpty() { keyPrefix := key.UserKey[:w.lastPointKeyInfo.prefixLen] cmpUpper := w.compare( @@ -812,14 +845,39 @@ func (w *Writer) makeAddPointDecisionV3( } else { cmpUser = w.compare(prevPointUserKey, key.UserKey) } + 
keyKindIsSetOrMerge := keyKind == InternalKeyKindSet || keyKind == InternalKeyKindSetWithDelete || + keyKind == InternalKeyKindMerge + keyKindIsPointDelete := keyKind == InternalKeyKindDelete || + keyKind == InternalKeyKindSingleDelete || keyKind == InternalKeyKindDeleteSized + if !keyKindIsSetOrMerge && !keyKindIsPointDelete { + panic(errors.AssertionFailedf("unexpected key kind %s", keyKind.String())) + } + // If same user key, then the current key is obsolete if: + // - The prev key was not a MERGE. + // - If the prev key was a MERGE, then as long as the current key is not a + // SET or SETWITHDELETE or MERGE, then it is obsolete. SET, SETWITHDELETE, + // MERGE are not obsolete since they will be merged. + // + // If not the same user key, but it is some kind of point delete, and we are + // writing to the lowest level, then it is also obsolete. + isObsolete = (cmpUser == 0 && (prevKeyKind != InternalKeyKindMerge || !keyKindIsSetOrMerge)) || + (w.writingToLowestLevel && + (keyKind == InternalKeyKindDelete || keyKind == InternalKeyKindSingleDelete || + keyKind == InternalKeyKindDeleteSized)) + // TODO(sumeer): storing isObsolete SET and SETWITHDEL in value blocks is + // possible, but requires some care in documenting and checking invariants. + // There is code that assumes nothing in value blocks because of single MVCC + // version (those should be ok). We have to ensure setHasSamePrefix is + // correctly initialized here etc. + if !w.disableKeyOrderChecks && (cmpUser > 0 || (cmpUser == 0 && prevPointKeyInfo.trailer <= key.Trailer)) { - return false, false, errors.Errorf( + return false, false, false, errors.Errorf( "pebble: keys must be added in strictly increasing order: %s, %s", prevPointKey.Pretty(w.formatKey), key.Pretty(w.formatKey)) } if !considerWriteToValueBlock { - return false, false, nil + return false, false, isObsolete, nil } // NB: it is possible that cmpUser == 0, i.e., these two SETs have identical // user keys (because of an open snapshot). 
This should be the rare case. @@ -832,19 +890,24 @@ func (w *Writer) makeAddPointDecisionV3( if considerWriteToValueBlock && valueLen <= tinyValueThreshold { considerWriteToValueBlock = false } - return setHasSamePrefix, considerWriteToValueBlock, nil + return setHasSamePrefix, considerWriteToValueBlock, isObsolete, nil } -func (w *Writer) addPoint(key InternalKey, value []byte) error { +func (w *Writer) addPoint(key InternalKey, value []byte, forceObsolete bool) error { + if w.isStrictObsolete && key.Kind() == InternalKeyKindMerge { + return errors.Errorf("MERGE not supported in a strict-obsolete sstable") + } var err error var setHasSameKeyPrefix, writeToValueBlock, addPrefixToValueStoredWithKey bool + var isObsolete bool maxSharedKeyLen := len(key.UserKey) if w.valueBlockWriter != nil { // maxSharedKeyLen is limited to the prefix of the preceding key. If the // preceding key was in a different block, then the blockWriter will // ignore this maxSharedKeyLen. maxSharedKeyLen = w.lastPointKeyInfo.prefixLen - setHasSameKeyPrefix, writeToValueBlock, err = w.makeAddPointDecisionV3(key, len(value)) + setHasSameKeyPrefix, writeToValueBlock, isObsolete, err = + w.makeAddPointDecisionV3(key, len(value)) addPrefixToValueStoredWithKey = base.TrailerKind(key.Trailer) == InternalKeyKindSet } else { err = w.makeAddPointDecisionV2(key) @@ -852,6 +915,7 @@ func (w *Writer) addPoint(key InternalKey, value []byte) error { if err != nil { return err } + isObsolete = w.tableFormat >= TableFormatPebblev4 && (isObsolete || forceObsolete) var valueStoredWithKey []byte var prefix valuePrefix var valueStoredWithKeyLen int @@ -907,16 +971,19 @@ func (w *Writer) addPoint(key InternalKey, value []byte) error { return err } } + if w.tableFormat >= TableFormatPebblev4 { + w.obsoleteCollector.AddPoint(isObsolete) + } w.maybeAddToFilter(key.UserKey) w.dataBlockBuf.dataBlock.addWithOptionalValuePrefix( - key, valueStoredWithKey, maxSharedKeyLen, addPrefixToValueStoredWithKey, prefix, + key, 
isObsolete, valueStoredWithKey, maxSharedKeyLen, addPrefixToValueStoredWithKey, prefix, setHasSameKeyPrefix) w.meta.updateSeqNum(key.SeqNum()) if !w.meta.HasPointKeys { - k := base.DecodeInternalKey(w.dataBlockBuf.dataBlock.curKey) + k := w.dataBlockBuf.dataBlock.getCurKey() // NB: We need to ensure that SmallestPoint.UserKey is set, so we create // an InternalKey which is semantically identical to the key, but won't // have a nil UserKey. We do this, because key.UserKey could be nil, and @@ -968,7 +1035,7 @@ func (w *Writer) addTombstone(key InternalKey, value []byte) error { if !w.disableKeyOrderChecks && !w.rangeDelV1Format && w.rangeDelBlock.nEntries > 0 { // Check that tombstones are being added in fragmented order. If the two // tombstones overlap, their start and end keys must be identical. - prevKey := base.DecodeInternalKey(w.rangeDelBlock.curKey) + prevKey := w.rangeDelBlock.getCurKey() switch c := w.compare(prevKey.UserKey, key.UserKey); { case c > 0: w.err = errors.Errorf("pebble: keys must be added in order: %s, %s", @@ -1164,7 +1231,7 @@ func (w *Writer) encodeRangeKeySpan(span keyspan.Span) { func (w *Writer) addRangeKey(key InternalKey, value []byte) error { if !w.disableKeyOrderChecks && w.rangeKeyBlock.nEntries > 0 { - prevStartKey := base.DecodeInternalKey(w.rangeKeyBlock.curKey) + prevStartKey := w.rangeKeyBlock.getCurKey() prevEndKey, _, ok := rangekey.DecodeEndKey(prevStartKey.Kind(), w.rangeKeyBlock.curValue) if !ok { // We panic here as we should have previously decoded and validated this @@ -1310,7 +1377,7 @@ func (w *Writer) flush(key InternalKey) error { // dataBlockBuf is added back to a sync.Pool. In this particular case, the // byte slice which supports "sep" will eventually be copied when "sep" is // added to the index block. 
- prevKey := base.DecodeInternalKey(w.dataBlockBuf.dataBlock.curKey) + prevKey := w.dataBlockBuf.dataBlock.getCurKey() sep := w.indexEntrySep(prevKey, key, w.dataBlockBuf) // We determine that we should flush an index block from the Writer client // goroutine, but we actually finish the index block from the writeQueue. @@ -1342,6 +1409,13 @@ func (w *Writer) flush(key InternalKey) error { // BlockPropertyCollector.AddPrevDataBlockToIndexBlock. w.addPrevDataBlockToIndexBlockProps() + // Copy the prevKey before handing off the blockWriter. + // + // TODO(sumeer): since we don't pool the Writer itself, this slice will be + // allocated at least once per Writer. We can improve the allocation story + // here. + w.lastUserKeyInPrevBlock = append(w.lastUserKeyInPrevBlock[:0], prevKey.UserKey...) + // Schedule a write. writeTask := writeTaskPool.Get().(*writeTask) // We're setting compressionDone to indicate that compression of this block @@ -1600,7 +1674,7 @@ func (w *Writer) finishIndexBlock(indexBuf *indexBlockBuf, props []byte) error { nEntries: indexBuf.block.nEntries, properties: props, } w.indexSepAlloc, part.sep = cloneKeyWithBuf( - base.DecodeInternalKey(indexBuf.block.curKey), w.indexSepAlloc, + indexBuf.block.getCurKey(), w.indexSepAlloc, ) bk := indexBuf.finish() if len(w.indexBlockAlloc) < len(bk) { @@ -1804,7 +1878,7 @@ func (w *Writer) Close() (err error) { // however, if a dataBlock is flushed, then we add a key to the new w.dataBlockBuf in the // addPoint function after the flush occurs. 
if w.dataBlockBuf.dataBlock.nEntries >= 1 { - w.meta.SetLargestPointKey(base.DecodeInternalKey(w.dataBlockBuf.dataBlock.curKey).Clone()) + w.meta.SetLargestPointKey(w.dataBlockBuf.dataBlock.getCurKey().Clone()) } // Finish the last data block, or force an empty data block if there @@ -1818,7 +1892,7 @@ func (w *Writer) Close() (err error) { if err != nil { return err } - prevKey := base.DecodeInternalKey(w.dataBlockBuf.dataBlock.curKey) + prevKey := w.dataBlockBuf.dataBlock.getCurKey() if err := w.addIndexEntrySync(prevKey, InternalKey{}, bhp, w.dataBlockBuf.tmp[:]); err != nil { return err } @@ -1897,7 +1971,7 @@ func (w *Writer) Close() (err error) { var rangeKeyBH BlockHandle if w.props.NumRangeKeys() > 0 { - key := base.DecodeInternalKey(w.rangeKeyBlock.curKey) + key := w.rangeKeyBlock.getCurKey() kind := key.Kind() endKey, _, ok := rangekey.DecodeEndKey(kind, w.rangeKeyBlock.curValue) if !ok { @@ -2086,7 +2160,7 @@ func (o PreviousPointKeyOpt) UnsafeKey() base.InternalKey { if o.w.dataBlockBuf.dataBlock.nEntries >= 1 { // o.w.dataBlockBuf.dataBlock.curKey is guaranteed to point to the last point key // which was added to the Writer. 
- return base.DecodeInternalKey(o.w.dataBlockBuf.dataBlock.curKey) + return o.w.dataBlockBuf.dataBlock.getCurKey() } return base.InternalKey{} } @@ -2115,6 +2189,8 @@ func NewWriter(writable objstorage.Writable, o WriterOptions, extraOpts ...Write separator: o.Comparer.Separator, successor: o.Comparer.Successor, tableFormat: o.TableFormat, + isStrictObsolete: o.IsStrictObsolete, + writingToLowestLevel: o.WritingToLowestLevel, cache: o.Cache, restartInterval: o.BlockRestartInterval, checksumType: o.Checksum, @@ -2188,7 +2264,8 @@ func NewWriter(writable objstorage.Writable, o WriterOptions, extraOpts ...Write w.props.PropertyCollectorNames = "[]" w.props.ExternalFormatVersion = rocksDBExternalFormatVersion - if len(o.TablePropertyCollectors) > 0 || len(o.BlockPropertyCollectors) > 0 { + if len(o.TablePropertyCollectors) > 0 || len(o.BlockPropertyCollectors) > 0 || + w.tableFormat >= TableFormatPebblev4 { var buf bytes.Buffer buf.WriteString("[") if len(o.TablePropertyCollectors) > 0 { @@ -2201,16 +2278,22 @@ func NewWriter(writable objstorage.Writable, o WriterOptions, extraOpts ...Write buf.WriteString(w.propCollectors[i].Name()) } } + numBlockPropertyCollectors := len(o.BlockPropertyCollectors) + if w.tableFormat >= TableFormatPebblev4 { + numBlockPropertyCollectors++ + } + // shortID is a uint8, so we cannot exceed that number of block + // property collectors. + if numBlockPropertyCollectors > math.MaxUint8 { + w.err = errors.New("pebble: too many block property collectors") + return w + } + if numBlockPropertyCollectors > 0 { + w.blockPropCollectors = make([]BlockPropertyCollector, numBlockPropertyCollectors) + } if len(o.BlockPropertyCollectors) > 0 { - // shortID is a uint8, so we cannot exceed that number of block - // property collectors. - if len(o.BlockPropertyCollectors) > math.MaxUint8 { - w.err = errors.New("pebble: too many block property collectors") - return w - } // The shortID assigned to a collector is the same as its index in // this slice. 
- w.blockPropCollectors = make([]BlockPropertyCollector, len(o.BlockPropertyCollectors)) for i := range o.BlockPropertyCollectors { w.blockPropCollectors[i] = o.BlockPropertyCollectors[i]() if i > 0 || len(o.TablePropertyCollectors) > 0 { @@ -2219,6 +2302,13 @@ func NewWriter(writable objstorage.Writable, o WriterOptions, extraOpts ...Write buf.WriteString(w.blockPropCollectors[i].Name()) } } + if w.tableFormat >= TableFormatPebblev4 { + if numBlockPropertyCollectors > 1 || len(o.TablePropertyCollectors) > 0 { + buf.WriteString(",") + } + w.blockPropCollectors[numBlockPropertyCollectors-1] = &w.obsoleteCollector + buf.WriteString(w.obsoleteCollector.Name()) + } buf.WriteString("]") w.props.PropertyCollectorNames = buf.String() } @@ -2252,3 +2342,82 @@ func init() { } private.SSTableInternalProperties = internalGetProperties } + +type obsoleteKeyBlockPropertyCollector struct { + blockIsNonObsolete bool + indexIsNonObsolete bool +} + +func (o *obsoleteKeyBlockPropertyCollector) Name() string { + return "obsolete-key" +} + +func (o *obsoleteKeyBlockPropertyCollector) Add(key InternalKey, value []byte) error { + // Ignore. 
+	return nil
+}
+
+func (o *obsoleteKeyBlockPropertyCollector) AddPoint(isObsolete bool) {
+	o.blockIsNonObsolete = o.blockIsNonObsolete || !isObsolete
+}
+
+func (o *obsoleteKeyBlockPropertyCollector) FinishDataBlock(buf []byte) ([]byte, error) {
+	if o.blockIsNonObsolete {
+		return buf, nil
+	}
+	buf = append(buf, 't')
+	return buf, nil
+}
+
+func (o *obsoleteKeyBlockPropertyCollector) AddPrevDataBlockToIndexBlock() {
+	o.indexIsNonObsolete = o.indexIsNonObsolete || o.blockIsNonObsolete
+	o.blockIsNonObsolete = false
+}
+
+func (o *obsoleteKeyBlockPropertyCollector) FinishIndexBlock(buf []byte) ([]byte, error) {
+	indexIsNonObsolete := o.indexIsNonObsolete
+	o.indexIsNonObsolete = false
+	if indexIsNonObsolete {
+		return buf, nil
+	}
+	buf = append(buf, 't')
+	return buf, nil
+}
+
+func (o *obsoleteKeyBlockPropertyCollector) FinishTable(buf []byte) ([]byte, error) {
+	// Table as a whole is never obsolete.
+	return buf, nil
+}
+
+func (o *obsoleteKeyBlockPropertyCollector) UpdateKeySuffixes(
+	oldProp []byte, oldSuffix, newSuffix []byte,
+) error {
+	isNotObsolete, err := propToIsNotObsolete(oldProp)
+	if err != nil {
+		return err
+	}
+	o.blockIsNonObsolete = isNotObsolete
+	return nil
+}
+
+type obsoleteKeyBlockPropertyFilter struct{}
+
+func (o obsoleteKeyBlockPropertyFilter) Name() string {
+	return "obsolete-key"
+}
+
+// Intersects returns true if the set represented by prop intersects with
+// the set in the filter.
+func (o obsoleteKeyBlockPropertyFilter) Intersects(prop []byte) (bool, error) {
+	return propToIsNotObsolete(prop)
+}
+
+// propToIsNotObsolete returns true iff the block has at least one
+// non-obsolete key: an empty property is written for such blocks, while a
+// single 't' byte marks a block whose keys are all obsolete.
+func propToIsNotObsolete(prop []byte) (bool, error) {
+	if len(prop) == 0 {
+		return true, nil
+	}
+	if len(prop) > 1 || prop[0] != 't' {
+		return false, errors.Errorf("unexpected property %x", prop)
+	}
+	return false, nil
+}
diff --git a/sstable/writer_test.go b/sstable/writer_test.go
index 4d504a8778..bb5a47ddfe 100644
--- a/sstable/writer_test.go
+++ b/sstable/writer_test.go
@@ -883,7 +883,7 @@ func TestWriterRace(t *testing.T) {
 				w.Add(base.MakeInternalKey(keys[ki], uint64(ki), InternalKeyKindSet), val),
 			)
 			require.Equal(
-				t, base.DecodeInternalKey(w.dataBlockBuf.dataBlock.curKey).UserKey, keys[ki],
+				t, w.dataBlockBuf.dataBlock.getCurKey().UserKey, keys[ki],
 			)
 		}
 		require.NoError(t, w.Close())
diff --git a/table_cache.go b/table_cache.go
index 9a3c5b5724..6754075e63 100644
--- a/table_cache.go
+++ b/table_cache.go
@@ -387,14 +387,7 @@ func (c *tableCacheShard) newIters(
 	type iterCreator interface {
 		NewRawRangeDelIter() (keyspan.FragmentIterator, error)
-		NewIterWithBlockPropertyFiltersAndContext(
-			ctx context.Context,
-			lower, upper []byte,
-			filterer *sstable.BlockPropertiesFilterer,
-			useFilterBlock bool,
-			stats *base.InternalIteratorStats,
-			rp sstable.ReaderProvider,
-		) (sstable.Iterator, error)
+		NewIterWithBlockPropertyFiltersAndContextEtc(ctx context.Context, lower, upper []byte, filterer *sstable.BlockPropertiesFilterer, hideObsoletePoints, useFilterBlock bool, stats *base.InternalIteratorStats, rp sstable.ReaderProvider) (sstable.Iterator, error)
 		NewCompactionIter(
 			bytesIterated *uint64, rp sstable.ReaderProvider,
@@ -454,11 +447,9 @@ func (c *tableCacheShard) newIters(
 	if internalOpts.bytesIterated != nil {
 		iter, err = ic.NewCompactionIter(internalOpts.bytesIterated, rp)
 	} else {
-		iter, err = ic.NewIterWithBlockPropertyFiltersAndContext(
-			ctx,
-			opts.GetLowerBound(), opts.GetUpperBound(),
-			filterer, useFilter, internalOpts.stats,
rp, - ) + iter, err = ic.NewIterWithBlockPropertyFiltersAndContextEtc( + ctx, opts.GetLowerBound(), opts.GetUpperBound(), filterer, false, useFilter, + internalOpts.stats, rp) } if err != nil { if rangeDelIter != nil { diff --git a/testdata/checkpoint b/testdata/checkpoint index 986d23b111..9c38a7c932 100644 --- a/testdata/checkpoint +++ b/testdata/checkpoint @@ -256,23 +256,23 @@ close: db/000009.sst sync: db sync: db/MANIFEST-000001 open: db/000005.sst -read-at(739, 53): db/000005.sst -read-at(702, 37): db/000005.sst -read-at(74, 628): db/000005.sst +read-at(767, 53): db/000005.sst +read-at(730, 37): db/000005.sst +read-at(74, 656): db/000005.sst read-at(47, 27): db/000005.sst open: db/000005.sst close: db/000005.sst open: db/000009.sst -read-at(734, 53): db/000009.sst -read-at(697, 37): db/000009.sst -read-at(69, 628): db/000009.sst +read-at(762, 53): db/000009.sst +read-at(725, 37): db/000009.sst +read-at(69, 656): db/000009.sst read-at(42, 27): db/000009.sst open: db/000009.sst close: db/000009.sst open: db/000007.sst -read-at(739, 53): db/000007.sst -read-at(702, 37): db/000007.sst -read-at(74, 628): db/000007.sst +read-at(767, 53): db/000007.sst +read-at(730, 37): db/000007.sst +read-at(74, 656): db/000007.sst read-at(47, 27): db/000007.sst open: db/000007.sst close: db/000007.sst @@ -341,15 +341,15 @@ close: checkpoints/checkpoint1/000006.log scan checkpoints/checkpoint1 ---- open: checkpoints/checkpoint1/000007.sst -read-at(739, 53): checkpoints/checkpoint1/000007.sst -read-at(702, 37): checkpoints/checkpoint1/000007.sst -read-at(74, 628): checkpoints/checkpoint1/000007.sst +read-at(767, 53): checkpoints/checkpoint1/000007.sst +read-at(730, 37): checkpoints/checkpoint1/000007.sst +read-at(74, 656): checkpoints/checkpoint1/000007.sst read-at(47, 27): checkpoints/checkpoint1/000007.sst read-at(0, 47): checkpoints/checkpoint1/000007.sst open: checkpoints/checkpoint1/000005.sst -read-at(739, 53): checkpoints/checkpoint1/000005.sst -read-at(702, 37): 
checkpoints/checkpoint1/000005.sst -read-at(74, 628): checkpoints/checkpoint1/000005.sst +read-at(767, 53): checkpoints/checkpoint1/000005.sst +read-at(730, 37): checkpoints/checkpoint1/000005.sst +read-at(74, 656): checkpoints/checkpoint1/000005.sst read-at(47, 27): checkpoints/checkpoint1/000005.sst read-at(0, 47): checkpoints/checkpoint1/000005.sst a 1 @@ -364,9 +364,9 @@ g 10 scan db ---- open: db/000010.sst -read-at(766, 53): db/000010.sst -read-at(729, 37): db/000010.sst -read-at(101, 628): db/000010.sst +read-at(794, 53): db/000010.sst +read-at(757, 37): db/000010.sst +read-at(101, 656): db/000010.sst read-at(74, 27): db/000010.sst read-at(0, 74): db/000010.sst a 1 @@ -406,9 +406,9 @@ close: checkpoints/checkpoint2/000006.log scan checkpoints/checkpoint2 ---- open: checkpoints/checkpoint2/000007.sst -read-at(739, 53): checkpoints/checkpoint2/000007.sst -read-at(702, 37): checkpoints/checkpoint2/000007.sst -read-at(74, 628): checkpoints/checkpoint2/000007.sst +read-at(767, 53): checkpoints/checkpoint2/000007.sst +read-at(730, 37): checkpoints/checkpoint2/000007.sst +read-at(74, 656): checkpoints/checkpoint2/000007.sst read-at(47, 27): checkpoints/checkpoint2/000007.sst read-at(0, 47): checkpoints/checkpoint2/000007.sst b 5 @@ -446,15 +446,15 @@ close: checkpoints/checkpoint3/000006.log scan checkpoints/checkpoint3 ---- open: checkpoints/checkpoint3/000007.sst -read-at(739, 53): checkpoints/checkpoint3/000007.sst -read-at(702, 37): checkpoints/checkpoint3/000007.sst -read-at(74, 628): checkpoints/checkpoint3/000007.sst +read-at(767, 53): checkpoints/checkpoint3/000007.sst +read-at(730, 37): checkpoints/checkpoint3/000007.sst +read-at(74, 656): checkpoints/checkpoint3/000007.sst read-at(47, 27): checkpoints/checkpoint3/000007.sst read-at(0, 47): checkpoints/checkpoint3/000007.sst open: checkpoints/checkpoint3/000005.sst -read-at(739, 53): checkpoints/checkpoint3/000005.sst -read-at(702, 37): checkpoints/checkpoint3/000005.sst -read-at(74, 628): 
checkpoints/checkpoint3/000005.sst +read-at(767, 53): checkpoints/checkpoint3/000005.sst +read-at(730, 37): checkpoints/checkpoint3/000005.sst +read-at(74, 656): checkpoints/checkpoint3/000005.sst read-at(47, 27): checkpoints/checkpoint3/000005.sst read-at(0, 47): checkpoints/checkpoint3/000005.sst a 1 diff --git a/testdata/compaction_delete_only_hints b/testdata/compaction_delete_only_hints index 6bee9d15e6..83606100dd 100644 --- a/testdata/compaction_delete_only_hints +++ b/testdata/compaction_delete_only_hints @@ -88,7 +88,7 @@ maybe-compact Deletion hints: (none) Compactions: - [JOB 100] compacted(delete-only) L2 [000005] (786 B) + L3 [000006] (786 B) -> L6 [] (0 B), in 1.0s (2.0s total), output rate 0 B/s + [JOB 100] compacted(delete-only) L2 [000005] (814 B) + L3 [000006] (814 B) -> L6 [] (0 B), in 1.0s (2.0s total), output rate 0 B/s # Verify that compaction correctly handles the presence of multiple # overlapping hints which might delete a file multiple times. All of the @@ -127,7 +127,7 @@ maybe-compact Deletion hints: (none) Compactions: - [JOB 100] compacted(delete-only) L2 [000006] (786 B) + L3 [000007] (786 B) -> L6 [] (0 B), in 1.0s (2.0s total), output rate 0 B/s + [JOB 100] compacted(delete-only) L2 [000006] (814 B) + L3 [000007] (814 B) -> L6 [] (0 B), in 1.0s (2.0s total), output rate 0 B/s # Test a range tombstone that is already compacted into L6. 
@@ -206,7 +206,7 @@ maybe-compact Deletion hints: (none) Compactions: - [JOB 100] compacted(delete-only) L2 [000005] (786 B) + L3 [000006] (786 B) -> L6 [] (0 B), in 1.0s (2.0s total), output rate 0 B/s + [JOB 100] compacted(delete-only) L2 [000005] (814 B) + L3 [000006] (814 B) -> L6 [] (0 B), in 1.0s (2.0s total), output rate 0 B/s # A deletion hint present on an sstable in a higher level should NOT result in a # deletion-only compaction incorrectly removing an sstable in L6 following an @@ -255,7 +255,7 @@ L0.000001 a-z seqnums(tombstone=5-27, file-smallest=0, type=point-key-only) close-snapshot 10 ---- -[JOB 100] compacted(elision-only) L6 [000004] (850 B) + L6 [] (0 B) -> L6 [000005] (771 B), in 1.0s (2.0s total), output rate 771 B/s +[JOB 100] compacted(elision-only) L6 [000004] (878 B) + L6 [] (0 B) -> L6 [000005] (799 B), in 1.0s (2.0s total), output rate 799 B/s # The deletion hint was removed by the elision-only compaction. get-hints @@ -414,4 +414,4 @@ maybe-compact Deletion hints: (none) Compactions: - [JOB 100] compacted(delete-only) L6 [000006 000007 000008 000009 000011] (4.5 K) -> L6 [] (0 B), in 1.0s (2.0s total), output rate 0 B/s + [JOB 100] compacted(delete-only) L6 [000006 000007 000008 000009 000011] (4.6 K) -> L6 [] (0 B), in 1.0s (2.0s total), output rate 0 B/s diff --git a/testdata/compaction_iter b/testdata/compaction_iter index 016cdbd5bf..a4e73827d5 100644 --- a/testdata/compaction_iter +++ b/testdata/compaction_iter @@ -238,7 +238,7 @@ a.SET.1:c iter first ---- -err=invalid internal key kind: 255 +err=invalid internal key kind: 191 define a.SET.2:b @@ -250,7 +250,7 @@ first next ---- a#2,1:b -err=invalid internal key kind: 255 +err=invalid internal key kind: 191 define a.MERGE.2:b @@ -262,7 +262,7 @@ first next ---- a#2,2:b -err=invalid internal key kind: 255 +err=invalid internal key kind: 191 define a.INVALID.2:c @@ -273,7 +273,7 @@ iter first tombstones ---- -err=invalid internal key kind: 255 +err=invalid internal key kind: 191 . 
define diff --git a/testdata/compaction_iter_delete_sized b/testdata/compaction_iter_delete_sized index c1663fcbfe..0483fd530a 100644 --- a/testdata/compaction_iter_delete_sized +++ b/testdata/compaction_iter_delete_sized @@ -238,7 +238,7 @@ a.SET.1:c iter first ---- -err=invalid internal key kind: 255 +err=invalid internal key kind: 191 define a.SET.2:b @@ -250,7 +250,7 @@ first next ---- a#2,18:b -err=invalid internal key kind: 255 +err=invalid internal key kind: 191 define a.MERGE.2:b @@ -262,7 +262,7 @@ first next ---- a#2,2:b -err=invalid internal key kind: 255 +err=invalid internal key kind: 191 define a.INVALID.2:c @@ -273,7 +273,7 @@ iter first tombstones ---- -err=invalid internal key kind: 255 +err=invalid internal key kind: 191 . define @@ -1338,7 +1338,7 @@ first next ---- a#2,18:b -err=invalid internal key kind: 255 +err=invalid internal key kind: 191 define a.SET.3:c @@ -1351,7 +1351,7 @@ first next ---- a#3,18:c -err=invalid internal key kind: 255 +err=invalid internal key kind: 191 # SINGLEDEL that meets a SETWITHDEL is transformed into a DEL. diff --git a/testdata/compaction_iter_set_with_del b/testdata/compaction_iter_set_with_del index 796984e0c7..0cb35f6b29 100644 --- a/testdata/compaction_iter_set_with_del +++ b/testdata/compaction_iter_set_with_del @@ -238,7 +238,7 @@ a.SET.1:c iter first ---- -err=invalid internal key kind: 255 +err=invalid internal key kind: 191 define a.SET.2:b @@ -250,7 +250,7 @@ first next ---- a#2,18:b -err=invalid internal key kind: 255 +err=invalid internal key kind: 191 define a.MERGE.2:b @@ -262,7 +262,7 @@ first next ---- a#2,2:b -err=invalid internal key kind: 255 +err=invalid internal key kind: 191 define a.INVALID.2:c @@ -273,7 +273,7 @@ iter first tombstones ---- -err=invalid internal key kind: 255 +err=invalid internal key kind: 191 . 
define @@ -1338,7 +1338,7 @@ first next ---- a#2,18:b -err=invalid internal key kind: 255 +err=invalid internal key kind: 191 define a.SET.3:c @@ -1351,7 +1351,7 @@ first next ---- a#3,18:c -err=invalid internal key kind: 255 +err=invalid internal key kind: 191 # SINGLEDEL that meets a SETWITHDEL is transformed into a DEL. diff --git a/testdata/compaction_tombstones b/testdata/compaction_tombstones index c197cff21d..d53b03f994 100644 --- a/testdata/compaction_tombstones +++ b/testdata/compaction_tombstones @@ -41,7 +41,7 @@ range-deletions-bytes-estimate: 0 maybe-compact ---- -[JOB 100] compacted(elision-only) L6 [000004] (853 B) + L6 [] (0 B) -> L6 [] (0 B), in 1.0s (2.0s total), output rate 0 B/s +[JOB 100] compacted(elision-only) L6 [000004] (884 B) + L6 [] (0 B) -> L6 [] (0 B), in 1.0s (2.0s total), output rate 0 B/s # Test a table that straddles a snapshot. It should not be compacted. define snapshots=(50) @@ -80,12 +80,12 @@ wait-pending-table-stats num-entries: 2 num-deletions: 1 num-range-key-sets: 0 -point-deletions-bytes-estimate: 108 +point-deletions-bytes-estimate: 111 range-deletions-bytes-estimate: 0 maybe-compact ---- -[JOB 100] compacted(elision-only) L6 [000004] (823 B) + L6 [] (0 B) -> L6 [000005] (772 B), in 1.0s (2.0s total), output rate 772 B/s +[JOB 100] compacted(elision-only) L6 [000004] (851 B) + L6 [] (0 B) -> L6 [000005] (800 B), in 1.0s (2.0s total), output rate 800 B/s version ---- @@ -119,7 +119,7 @@ wait-pending-table-stats num-entries: 6 num-deletions: 2 num-range-key-sets: 0 -point-deletions-bytes-estimate: 48 +point-deletions-bytes-estimate: 49 range-deletions-bytes-estimate: 66 maybe-compact @@ -134,7 +134,7 @@ close-snapshot close-snapshot 103 ---- -[JOB 100] compacted(elision-only) L6 [000004] (1001 B) + L6 [] (0 B) -> L6 [] (0 B), in 1.0s (2.0s total), output rate 0 B/s +[JOB 100] compacted(elision-only) L6 [000004] (1.0 K) + L6 [] (0 B) -> L6 [] (0 B), in 1.0s (2.0s total), output rate 0 B/s # Test a table that contains both 
deletions and non-deletions, but whose # non-deletions well outnumber its deletions. The table should not be @@ -152,7 +152,7 @@ wait-pending-table-stats num-entries: 11 num-deletions: 1 num-range-key-sets: 0 -point-deletions-bytes-estimate: 26 +point-deletions-bytes-estimate: 27 range-deletions-bytes-estimate: 0 close-snapshot @@ -233,7 +233,7 @@ wait-pending-table-stats num-entries: 3 num-deletions: 3 num-range-key-sets: 0 -point-deletions-bytes-estimate: 6966 +point-deletions-bytes-estimate: 6994 range-deletions-bytes-estimate: 0 # By plain file size, 000005 should be picked because it is larger and @@ -243,7 +243,7 @@ range-deletions-bytes-estimate: 0 maybe-compact ---- -[JOB 100] compacted(default) L5 [000004] (833 B) + L6 [000006] (13 K) -> L6 [] (0 B), in 1.0s (2.0s total), output rate 0 B/s +[JOB 100] compacted(default) L5 [000004] (861 B) + L6 [000006] (13 K) -> L6 [] (0 B), in 1.0s (2.0s total), output rate 0 B/s # A table containing only range keys is not eligible for elision. # RANGEKEYDEL or RANGEKEYUNSET. @@ -323,7 +323,7 @@ range-deletions-bytes-estimate: 41 maybe-compact ---- -[JOB 100] compacted(elision-only) L6 [000004] (1003 B) + L6 [] (0 B) -> L6 [000005] (778 B), in 1.0s (2.0s total), output rate 778 B/s +[JOB 100] compacted(elision-only) L6 [000004] (1.0 K) + L6 [] (0 B) -> L6 [000005] (806 B), in 1.0s (2.0s total), output rate 806 B/s # Close the DB, asserting that the reference counts balance. 
close @@ -359,7 +359,7 @@ wait-pending-table-stats num-entries: 2 num-deletions: 1 num-range-key-sets: 0 -point-deletions-bytes-estimate: 2786 +point-deletions-bytes-estimate: 2797 range-deletions-bytes-estimate: 0 wait-pending-table-stats @@ -373,7 +373,7 @@ range-deletions-bytes-estimate: 8246 maybe-compact ---- -[JOB 100] compacted(default) L5 [000005] (850 B) + L6 [000007] (13 K) -> L6 [000008] (4.8 K), in 1.0s (2.0s total), output rate 4.8 K/s +[JOB 100] compacted(default) L5 [000005] (878 B) + L6 [000007] (13 K) -> L6 [000008] (4.8 K), in 1.0s (2.0s total), output rate 4.8 K/s # The same LSM as above. However, this time, with point tombstone weighting at # 2x, the table with the point tombstone (000004) will be selected as the @@ -402,7 +402,7 @@ wait-pending-table-stats num-entries: 2 num-deletions: 1 num-range-key-sets: 0 -point-deletions-bytes-estimate: 2786 +point-deletions-bytes-estimate: 2797 range-deletions-bytes-estimate: 0 wait-pending-table-stats @@ -416,4 +416,4 @@ range-deletions-bytes-estimate: 8246 maybe-compact ---- -[JOB 100] compacted(default) L5 [000005] (850 B) + L6 [000007] (13 K) -> L6 [000008] (4.8 K), in 1.0s (2.0s total), output rate 4.8 K/s +[JOB 100] compacted(default) L5 [000005] (878 B) + L6 [000007] (13 K) -> L6 [000008] (4.8 K), in 1.0s (2.0s total), output rate 4.8 K/s diff --git a/testdata/event_listener b/testdata/event_listener index 4bfb91792d..571dd723a1 100644 --- a/testdata/event_listener +++ b/testdata/event_listener @@ -127,7 +127,7 @@ close: db/marker.manifest.000002.MANIFEST-000006 remove: db/marker.manifest.000001.MANIFEST-000001 sync: db [JOB 5] MANIFEST created 000006 -[JOB 5] flushed 1 memtable to L0 [000005] (771 B), in 1.0s (2.0s total), output rate 771 B/s +[JOB 5] flushed 1 memtable to L0 [000005] (799 B), in 1.0s (2.0s total), output rate 799 B/s compact ---- @@ -151,21 +151,21 @@ close: db/marker.manifest.000003.MANIFEST-000009 remove: db/marker.manifest.000002.MANIFEST-000006 sync: db [JOB 7] MANIFEST 
created 000009 -[JOB 7] flushed 1 memtable to L0 [000008] (771 B), in 1.0s (2.0s total), output rate 771 B/s +[JOB 7] flushed 1 memtable to L0 [000008] (799 B), in 1.0s (2.0s total), output rate 799 B/s remove: db/MANIFEST-000001 [JOB 7] MANIFEST deleted 000001 -[JOB 8] compacting(default) L0 [000005 000008] (1.5 K) + L6 [] (0 B) +[JOB 8] compacting(default) L0 [000005 000008] (1.6 K) + L6 [] (0 B) open: db/000005.sst -read-at(718, 53): db/000005.sst -read-at(681, 37): db/000005.sst -read-at(53, 628): db/000005.sst +read-at(746, 53): db/000005.sst +read-at(709, 37): db/000005.sst +read-at(53, 656): db/000005.sst read-at(26, 27): db/000005.sst open: db/000005.sst close: db/000005.sst open: db/000008.sst -read-at(718, 53): db/000008.sst -read-at(681, 37): db/000008.sst -read-at(53, 628): db/000008.sst +read-at(746, 53): db/000008.sst +read-at(709, 37): db/000008.sst +read-at(53, 656): db/000008.sst read-at(26, 27): db/000008.sst open: db/000008.sst close: db/000008.sst @@ -188,7 +188,7 @@ close: db/marker.manifest.000004.MANIFEST-000011 remove: db/marker.manifest.000003.MANIFEST-000009 sync: db [JOB 8] MANIFEST created 000011 -[JOB 8] compacted(default) L0 [000005 000008] (1.5 K) + L6 [] (0 B) -> L6 [000010] (771 B), in 1.0s (3.0s total), output rate 771 B/s +[JOB 8] compacted(default) L0 [000005 000008] (1.6 K) + L6 [] (0 B) -> L6 [000010] (799 B), in 1.0s (3.0s total), output rate 799 B/s close: db/000005.sst close: db/000008.sst remove: db/000005.sst @@ -223,7 +223,7 @@ close: db/marker.manifest.000005.MANIFEST-000014 remove: db/marker.manifest.000004.MANIFEST-000011 sync: db [JOB 10] MANIFEST created 000014 -[JOB 10] flushed 1 memtable to L0 [000013] (771 B), in 1.0s (2.0s total), output rate 771 B/s +[JOB 10] flushed 1 memtable to L0 [000013] (799 B), in 1.0s (2.0s total), output rate 799 B/s enable-file-deletions ---- @@ -233,9 +233,9 @@ remove: db/MANIFEST-000009 ingest ---- open: ext/0 -read-at(773, 53): ext/0 -read-at(736, 37): ext/0 -read-at(53, 683): ext/0 
+read-at(801, 53): ext/0 +read-at(764, 37): ext/0 +read-at(53, 711): ext/0 read-at(26, 27): ext/0 read-at(0, 26): ext/0 close: ext/0 @@ -243,9 +243,9 @@ link: ext/0 -> db/000015.sst [JOB 12] ingesting: sstable created 000015 sync: db open: db/000013.sst -read-at(718, 53): db/000013.sst -read-at(681, 37): db/000013.sst -read-at(53, 628): db/000013.sst +read-at(746, 53): db/000013.sst +read-at(709, 37): db/000013.sst +read-at(53, 656): db/000013.sst read-at(26, 27): db/000013.sst read-at(0, 26): db/000013.sst create: db/MANIFEST-000016 @@ -257,28 +257,28 @@ remove: db/marker.manifest.000005.MANIFEST-000014 sync: db [JOB 12] MANIFEST created 000016 remove: ext/0 -[JOB 12] ingested L0:000015 (826 B) +[JOB 12] ingested L0:000015 (854 B) metrics ---- __level_____count____size___score______in__ingest(sz_cnt)____move(sz_cnt)___write(sz_cnt)____read___r-amp___w-amp WAL 1 27 B - 48 B - - - - 108 B - - - 2.2 - 0 2 1.6 K 0.40 81 B 826 B 1 0 B 0 2.3 K 3 0 B 2 28.6 + 0 2 1.6 K 0.40 81 B 854 B 1 0 B 0 2.3 K 3 0 B 2 29.6 1 0 0 B 0.00 0 B 0 B 0 0 B 0 0 B 0 0 B 0 0.0 2 0 0 B 0.00 0 B 0 B 0 0 B 0 0 B 0 0 B 0 0.0 3 0 0 B 0.00 0 B 0 B 0 0 B 0 0 B 0 0 B 0 0.0 4 0 0 B 0.00 0 B 0 B 0 0 B 0 0 B 0 0 B 0 0.0 5 0 0 B 0.00 0 B 0 B 0 0 B 0 0 B 0 0 B 0 0.0 - 6 1 771 B - 1.5 K 0 B 0 0 B 0 771 B 1 1.5 K 1 0.5 - total 3 2.3 K - 934 B 826 B 1 0 B 0 3.9 K 4 1.5 K 3 4.3 + 6 1 799 B - 1.6 K 0 B 0 0 B 0 799 B 1 1.6 K 1 0.5 + total 3 2.4 K - 962 B 854 B 1 0 B 0 4.1 K 4 1.6 K 3 4.3 flush 3 0 B 0 0 (ingest = tables-ingested, move = ingested-as-flushable) -compact 1 2.3 K 0 B 0 (size == estimated-debt, score = in-progress-bytes, in = num-in-progress) +compact 1 2.4 K 0 B 0 (size == estimated-debt, score = in-progress-bytes, in = num-in-progress) ctype 1 0 0 0 0 0 0 (default, delete, elision, move, read, rewrite, multi-level) memtbl 1 256 K zmemtbl 0 0 B ztbl 0 0 B - bcache 8 1.4 K 11.1% (score == hit-rate) - tcache 1 744 B 40.0% (score == hit-rate) + bcache 8 1.5 K 11.1% (score == hit-rate) + tcache 1 752 B 
40.0% (score == hit-rate) snaps 0 - 0 (score == earliest seq num) titers 0 filter - - 0.0% (score == utility) @@ -291,16 +291,16 @@ ingest-flushable ---- sync-data: wal/000012.log open: ext/a -read-at(773, 53): ext/a -read-at(736, 37): ext/a -read-at(53, 683): ext/a +read-at(801, 53): ext/a +read-at(764, 37): ext/a +read-at(53, 711): ext/a read-at(26, 27): ext/a read-at(0, 26): ext/a close: ext/a open: ext/b -read-at(773, 53): ext/b -read-at(736, 37): ext/b -read-at(53, 683): ext/b +read-at(801, 53): ext/b +read-at(764, 37): ext/b +read-at(53, 711): ext/b read-at(26, 27): ext/b read-at(0, 26): ext/b close: ext/b @@ -322,7 +322,7 @@ sync: wal [JOB 15] WAL created 000020 remove: ext/a remove: ext/b -[JOB 13] ingested as flushable 000017 (826 B), 000018 (826 B) +[JOB 13] ingested as flushable 000017 (854 B), 000018 (854 B) sync-data: wal/000020.log close: wal/000020.log create: wal/000021.log @@ -335,7 +335,7 @@ sync-data: db/000022.sst close: db/000022.sst sync: db sync: db/MANIFEST-000016 -[JOB 17] flushed 1 memtable to L0 [000022] (771 B), in 1.0s (2.0s total), output rate 771 B/s +[JOB 17] flushed 1 memtable to L0 [000022] (799 B), in 1.0s (2.0s total), output rate 799 B/s remove: db/MANIFEST-000011 [JOB 17] MANIFEST deleted 000011 [JOB 18] flushing 2 ingested tables @@ -347,7 +347,7 @@ close: db/marker.manifest.000007.MANIFEST-000023 remove: db/marker.manifest.000006.MANIFEST-000016 sync: db [JOB 18] MANIFEST created 000023 -[JOB 18] flushed 2 ingested flushables L0:000017 (826 B) + L6:000018 (826 B) in 1.0s (2.0s total), output rate 1.6 K/s +[JOB 18] flushed 2 ingested flushables L0:000017 (854 B) + L6:000018 (854 B) in 1.0s (2.0s total), output rate 1.7 K/s remove: db/MANIFEST-000014 [JOB 18] MANIFEST deleted 000014 [JOB 19] flushing 1 memtable to L0 @@ -358,22 +358,22 @@ metrics ---- __level_____count____size___score______in__ingest(sz_cnt)____move(sz_cnt)___write(sz_cnt)____read___r-amp___w-amp WAL 1 29 B - 82 B - - - - 110 B - - - 1.3 - 0 4 3.1 K 0.80 81 B 
1.6 K 2 0 B 0 3.0 K 4 0 B 4 38.1 + 0 4 3.2 K 0.80 81 B 1.7 K 2 0 B 0 3.1 K 4 0 B 4 39.5 1 0 0 B 0.00 0 B 0 B 0 0 B 0 0 B 0 0 B 0 0.0 2 0 0 B 0.00 0 B 0 B 0 0 B 0 0 B 0 0 B 0 0.0 3 0 0 B 0.00 0 B 0 B 0 0 B 0 0 B 0 0 B 0 0.0 4 0 0 B 0.00 0 B 0 B 0 0 B 0 0 B 0 0 B 0 0.0 5 0 0 B 0.00 0 B 0 B 0 0 B 0 0 B 0 0 B 0 0.0 - 6 2 1.6 K - 1.5 K 826 B 1 0 B 0 771 B 1 1.5 K 1 0.5 - total 6 4.7 K - 2.5 K 2.4 K 3 0 B 0 6.3 K 5 1.5 K 5 2.5 - flush 6 1.6 K 2 1 (ingest = tables-ingested, move = ingested-as-flushable) -compact 1 4.7 K 0 B 0 (size == estimated-debt, score = in-progress-bytes, in = num-in-progress) + 6 2 1.6 K - 1.6 K 854 B 1 0 B 0 799 B 1 1.6 K 1 0.5 + total 6 4.8 K - 2.6 K 2.5 K 3 0 B 0 6.5 K 5 1.6 K 5 2.5 + flush 6 1.7 K 2 1 (ingest = tables-ingested, move = ingested-as-flushable) +compact 1 4.8 K 0 B 0 (size == estimated-debt, score = in-progress-bytes, in = num-in-progress) ctype 1 0 0 0 0 0 0 (default, delete, elision, move, read, rewrite, multi-level) memtbl 1 512 K zmemtbl 0 0 B ztbl 0 0 B - bcache 16 2.9 K 14.3% (score == hit-rate) - tcache 1 744 B 50.0% (score == hit-rate) + bcache 16 3.0 K 14.3% (score == hit-rate) + tcache 1 752 B 50.0% (score == hit-rate) snaps 0 - 0 (score == earliest seq num) titers 0 filter - - 0.0% (score == utility) diff --git a/testdata/ingest b/testdata/ingest index a811e12a1e..fb95856def 100644 --- a/testdata/ingest +++ b/testdata/ingest @@ -48,7 +48,7 @@ compact 0 0 B 0 B 0 (size == esti zmemtbl 0 0 B ztbl 0 0 B bcache 8 1.5 K 42.9% (score == hit-rate) - tcache 1 744 B 50.0% (score == hit-rate) + tcache 1 752 B 50.0% (score == hit-rate) snaps 0 - 0 (score == earliest seq num) titers 0 filter - - 0.0% (score == utility) @@ -349,7 +349,7 @@ num-entries: 2 num-deletions: 2 num-range-key-sets: 0 point-deletions-bytes-estimate: 0 -range-deletions-bytes-estimate: 1666 +range-deletions-bytes-estimate: 1694 # A set operation takes precedence over a range deletion at the same # sequence number as can occur during ingestion. 
diff --git a/testdata/manual_compaction_file_boundaries_delsized b/testdata/manual_compaction_file_boundaries_delsized index b83e57e265..9ec939d662 100644 --- a/testdata/manual_compaction_file_boundaries_delsized +++ b/testdata/manual_compaction_file_boundaries_delsized @@ -290,60 +290,60 @@ compact a-zz L1 file-sizes ---- L2: - 000095:[a#101,1-az@1#129,1]: 7637 bytes (7.5 K) - 000096:[b@1#130,1-bz@1#156,1]: 6629 bytes (6.5 K) - 000097:[c@1#157,1-cz@1#183,1]: 6629 bytes (6.5 K) - 000098:[d@1#184,1-dz@1#210,1]: 6629 bytes (6.5 K) - 000099:[e@1#211,1-ez@1#237,1]: 6629 bytes (6.5 K) - 000100:[f@1#238,1-fz@1#264,1]: 6629 bytes (6.5 K) - 000101:[g@1#265,1-gz@1#291,1]: 6629 bytes (6.5 K) - 000102:[h@1#292,1-hz@1#318,1]: 6629 bytes (6.5 K) - 000103:[i@1#319,1-iz@1#345,1]: 6629 bytes (6.5 K) - 000104:[j@1#346,1-jz@1#372,1]: 6629 bytes (6.5 K) - 000105:[k@1#373,1-kz@1#399,1]: 6629 bytes (6.5 K) - 000106:[l@1#400,1-lz@1#426,1]: 6629 bytes (6.5 K) - 000107:[m@1#427,1-mz@1#453,1]: 6629 bytes (6.5 K) - 000108:[n@1#454,1-nz@1#480,1]: 6629 bytes (6.5 K) - 000109:[o@1#481,1-oz@1#507,1]: 6629 bytes (6.5 K) - 000110:[p@1#508,1-pz@1#534,1]: 6629 bytes (6.5 K) - 000111:[q@1#535,1-qz@1#561,1]: 6628 bytes (6.5 K) - 000112:[r@1#562,1-rz@1#588,1]: 6629 bytes (6.5 K) - 000113:[s@1#589,1-sz@1#615,1]: 6629 bytes (6.5 K) - 000114:[t@1#616,1-tz@1#642,1]: 6629 bytes (6.5 K) - 000115:[u@1#643,1-uz@1#669,1]: 6629 bytes (6.5 K) - 000116:[v@1#670,1-vz@1#696,1]: 6629 bytes (6.5 K) - 000117:[w@1#697,1-wz@1#723,1]: 6629 bytes (6.5 K) - 000118:[x@1#724,1-xz@1#750,1]: 6629 bytes (6.5 K) - 000119:[y@1#751,1-yz@1#777,1]: 6629 bytes (6.5 K) - 000120:[z#102,1-zr@1#796,1]: 5909 bytes (5.8 K) - 000121:[zs@1#797,1-zz@1#804,1]: 2491 bytes (2.4 K) + 000095:[a#101,1-az@1#129,1]: 7665 bytes (7.5 K) + 000096:[b@1#130,1-bz@1#156,1]: 6657 bytes (6.5 K) + 000097:[c@1#157,1-cz@1#183,1]: 6657 bytes (6.5 K) + 000098:[d@1#184,1-dz@1#210,1]: 6657 bytes (6.5 K) + 000099:[e@1#211,1-ez@1#237,1]: 6657 bytes (6.5 K) + 
000100:[f@1#238,1-fz@1#264,1]: 6657 bytes (6.5 K) + 000101:[g@1#265,1-gz@1#291,1]: 6657 bytes (6.5 K) + 000102:[h@1#292,1-hz@1#318,1]: 6657 bytes (6.5 K) + 000103:[i@1#319,1-iz@1#345,1]: 6657 bytes (6.5 K) + 000104:[j@1#346,1-jz@1#372,1]: 6657 bytes (6.5 K) + 000105:[k@1#373,1-kz@1#399,1]: 6657 bytes (6.5 K) + 000106:[l@1#400,1-lz@1#426,1]: 6657 bytes (6.5 K) + 000107:[m@1#427,1-mz@1#453,1]: 6657 bytes (6.5 K) + 000108:[n@1#454,1-nz@1#480,1]: 6657 bytes (6.5 K) + 000109:[o@1#481,1-oz@1#507,1]: 6657 bytes (6.5 K) + 000110:[p@1#508,1-pz@1#534,1]: 6657 bytes (6.5 K) + 000111:[q@1#535,1-qz@1#561,1]: 6656 bytes (6.5 K) + 000112:[r@1#562,1-rz@1#588,1]: 6657 bytes (6.5 K) + 000113:[s@1#589,1-sz@1#615,1]: 6657 bytes (6.5 K) + 000114:[t@1#616,1-tz@1#642,1]: 6657 bytes (6.5 K) + 000115:[u@1#643,1-uz@1#669,1]: 6657 bytes (6.5 K) + 000116:[v@1#670,1-vz@1#696,1]: 6657 bytes (6.5 K) + 000117:[w@1#697,1-wz@1#723,1]: 6657 bytes (6.5 K) + 000118:[x@1#724,1-xz@1#750,1]: 6657 bytes (6.5 K) + 000119:[y@1#751,1-yz@1#777,1]: 6657 bytes (6.5 K) + 000120:[z#102,1-zr@1#796,1]: 5937 bytes (5.8 K) + 000121:[zs@1#797,1-zz@1#804,1]: 2519 bytes (2.5 K) L3: - 000005:[a#1,1-a#1,1]: 10776 bytes (10 K) - 000006:[b#2,1-b#2,1]: 10776 bytes (10 K) - 000007:[c#3,1-c#3,1]: 10776 bytes (10 K) - 000008:[d#4,1-d#4,1]: 10776 bytes (10 K) - 000009:[e#5,1-e#5,1]: 10776 bytes (10 K) - 000010:[f#6,1-f#6,1]: 10776 bytes (10 K) - 000011:[g#7,1-g#7,1]: 10776 bytes (10 K) - 000012:[h#8,1-h#8,1]: 10776 bytes (10 K) - 000013:[i#9,1-i#9,1]: 10776 bytes (10 K) - 000014:[j#10,1-j#10,1]: 10776 bytes (10 K) - 000015:[k#11,1-k#11,1]: 10776 bytes (10 K) - 000016:[l#12,1-l#12,1]: 10776 bytes (10 K) - 000017:[m#13,1-m#13,1]: 10776 bytes (10 K) - 000018:[n#14,1-n#14,1]: 10776 bytes (10 K) - 000019:[o#15,1-o#15,1]: 10776 bytes (10 K) - 000020:[p#16,1-p#16,1]: 10776 bytes (10 K) - 000021:[q#17,1-q#17,1]: 10776 bytes (10 K) - 000022:[r#18,1-r#18,1]: 10776 bytes (10 K) - 000023:[s#19,1-s#19,1]: 10776 bytes (10 K) - 
000024:[t#20,1-t#20,1]: 10776 bytes (10 K) - 000025:[u#21,1-u#21,1]: 10776 bytes (10 K) - 000026:[v#22,1-v#22,1]: 10776 bytes (10 K) - 000027:[w#23,1-w#23,1]: 10776 bytes (10 K) - 000028:[x#24,1-x#24,1]: 10776 bytes (10 K) - 000029:[y#25,1-y#25,1]: 10776 bytes (10 K) - 000030:[z#26,1-z#26,1]: 10776 bytes (10 K) + 000005:[a#1,1-a#1,1]: 10804 bytes (11 K) + 000006:[b#2,1-b#2,1]: 10804 bytes (11 K) + 000007:[c#3,1-c#3,1]: 10804 bytes (11 K) + 000008:[d#4,1-d#4,1]: 10804 bytes (11 K) + 000009:[e#5,1-e#5,1]: 10804 bytes (11 K) + 000010:[f#6,1-f#6,1]: 10804 bytes (11 K) + 000011:[g#7,1-g#7,1]: 10804 bytes (11 K) + 000012:[h#8,1-h#8,1]: 10804 bytes (11 K) + 000013:[i#9,1-i#9,1]: 10804 bytes (11 K) + 000014:[j#10,1-j#10,1]: 10804 bytes (11 K) + 000015:[k#11,1-k#11,1]: 10804 bytes (11 K) + 000016:[l#12,1-l#12,1]: 10804 bytes (11 K) + 000017:[m#13,1-m#13,1]: 10804 bytes (11 K) + 000018:[n#14,1-n#14,1]: 10804 bytes (11 K) + 000019:[o#15,1-o#15,1]: 10804 bytes (11 K) + 000020:[p#16,1-p#16,1]: 10804 bytes (11 K) + 000021:[q#17,1-q#17,1]: 10804 bytes (11 K) + 000022:[r#18,1-r#18,1]: 10804 bytes (11 K) + 000023:[s#19,1-s#19,1]: 10804 bytes (11 K) + 000024:[t#20,1-t#20,1]: 10804 bytes (11 K) + 000025:[u#21,1-u#21,1]: 10804 bytes (11 K) + 000026:[v#22,1-v#22,1]: 10804 bytes (11 K) + 000027:[w#23,1-w#23,1]: 10804 bytes (11 K) + 000028:[x#24,1-x#24,1]: 10804 bytes (11 K) + 000029:[y#25,1-y#25,1]: 10804 bytes (11 K) + 000030:[z#26,1-z#26,1]: 10804 bytes (11 K) # Test a scenario where there exists a grandparent file (in L3), but the L1->L2 # compaction doesn't reach it until late in the compaction. 
The output file @@ -399,11 +399,11 @@ compact a-zz L1 file-sizes ---- L2: - 000007:[a#201,1-j#210,1]: 10958 bytes (11 K) - 000008:[k#211,1-o#215,1]: 5865 bytes (5.7 K) - 000009:[z#102,1-z#102,1]: 781 bytes (781 B) + 000007:[a#201,1-j#210,1]: 10986 bytes (11 K) + 000008:[k#211,1-o#215,1]: 5893 bytes (5.8 K) + 000009:[z#102,1-z#102,1]: 809 bytes (809 B) L3: - 000006:[m#1,1-m#1,1]: 10776 bytes (10 K) + 000006:[m#1,1-m#1,1]: 10804 bytes (11 K) # Test the file-size splitter's adaptive tolerance for early-splitting at a # grandparent boundary. The L1->L2 compaction has many opportunities to split at @@ -501,20 +501,20 @@ compact a-zz L1 file-sizes ---- L2: - 000019:[a#201,1-e#205,1]: 5865 bytes (5.7 K) - 000020:[f#206,1-l#212,1]: 7893 bytes (7.7 K) - 000021:[m#213,1-z#102,1]: 3827 bytes (3.7 K) + 000019:[a#201,1-e#205,1]: 5893 bytes (5.8 K) + 000020:[f#206,1-l#212,1]: 7921 bytes (7.7 K) + 000021:[m#213,1-z#102,1]: 3855 bytes (3.8 K) L3: - 000006:[a#1,1-a#1,1]: 1776 bytes (1.7 K) - 000007:[ab#2,1-ab#2,1]: 1777 bytes (1.7 K) - 000008:[ac#3,1-ac#3,1]: 1777 bytes (1.7 K) - 000013:[ad#8,1-ad#8,1]: 1777 bytes (1.7 K) - 000012:[ad#7,1-ad#7,1]: 1777 bytes (1.7 K) - 000011:[ad#6,1-ad#6,1]: 1777 bytes (1.7 K) - 000010:[ad#5,1-ad#5,1]: 1777 bytes (1.7 K) - 000009:[ad#4,1-ad#4,1]: 1777 bytes (1.7 K) - 000014:[c#9,1-c#9,1]: 1776 bytes (1.7 K) - 000015:[d#10,1-d#10,1]: 1776 bytes (1.7 K) - 000016:[e#11,1-e#11,1]: 1776 bytes (1.7 K) - 000017:[f#12,1-f#12,1]: 1776 bytes (1.7 K) - 000018:[m#13,1-m#13,1]: 1776 bytes (1.7 K) + 000006:[a#1,1-a#1,1]: 1804 bytes (1.8 K) + 000007:[ab#2,1-ab#2,1]: 1805 bytes (1.8 K) + 000008:[ac#3,1-ac#3,1]: 1805 bytes (1.8 K) + 000013:[ad#8,1-ad#8,1]: 1805 bytes (1.8 K) + 000012:[ad#7,1-ad#7,1]: 1805 bytes (1.8 K) + 000011:[ad#6,1-ad#6,1]: 1805 bytes (1.8 K) + 000010:[ad#5,1-ad#5,1]: 1805 bytes (1.8 K) + 000009:[ad#4,1-ad#4,1]: 1805 bytes (1.8 K) + 000014:[c#9,1-c#9,1]: 1804 bytes (1.8 K) + 000015:[d#10,1-d#10,1]: 1804 bytes (1.8 K) + 000016:[e#11,1-e#11,1]: 
1804 bytes (1.8 K) + 000017:[f#12,1-f#12,1]: 1804 bytes (1.8 K) + 000018:[m#13,1-m#13,1]: 1804 bytes (1.8 K) diff --git a/testdata/marked_for_compaction b/testdata/marked_for_compaction index 6d17017e96..fa99de8b58 100644 --- a/testdata/marked_for_compaction +++ b/testdata/marked_for_compaction @@ -20,8 +20,8 @@ marked L0.000004 maybe-compact ---- -[JOB 100] compacted(rewrite) L1 [000005] (779 B) + L1 [] (0 B) -> L1 [000006] (779 B), in 1.0s (2.0s total), output rate 779 B/s -[JOB 100] compacted(rewrite) L0 [000004] (774 B) + L0 [] (0 B) -> L0 [000007] (774 B), in 1.0s (2.0s total), output rate 774 B/s +[JOB 100] compacted(rewrite) L1 [000005] (807 B) + L1 [] (0 B) -> L1 [000006] (807 B), in 1.0s (2.0s total), output rate 807 B/s +[JOB 100] compacted(rewrite) L0 [000004] (802 B) + L0 [] (0 B) -> L0 [000007] (802 B), in 1.0s (2.0s total), output rate 802 B/s 0.0: 000007:[c#11,SET-c#11,SET] points:[c#11,SET-c#11,SET] 1: diff --git a/testdata/metrics b/testdata/metrics index 81981b62e6..539ad9d87d 100644 --- a/testdata/metrics +++ b/testdata/metrics @@ -34,7 +34,7 @@ compact 0 0 B 0 B 0 (size == esti zmemtbl 1 256 K ztbl 0 0 B bcache 4 697 B 0.0% (score == hit-rate) - tcache 1 744 B 0.0% (score == hit-rate) + tcache 1 752 B 0.0% (score == hit-rate) snaps 0 - 0 (score == earliest seq num) titers 1 filter - - 0.0% (score == utility) @@ -145,7 +145,7 @@ compact 1 0 B 0 B 0 (size == esti zmemtbl 1 256 K ztbl 1 770 B bcache 4 697 B 42.9% (score == hit-rate) - tcache 1 744 B 66.7% (score == hit-rate) + tcache 1 752 B 66.7% (score == hit-rate) snaps 0 - 0 (score == earliest seq num) titers 1 filter - - 0.0% (score == utility) diff --git a/testdata/table_stats b/testdata/table_stats index d370ee88f2..0768426a95 100644 --- a/testdata/table_stats +++ b/testdata/table_stats @@ -542,5 +542,5 @@ wait-pending-table-stats num-entries: 5 num-deletions: 2 num-range-key-sets: 0 -point-deletions-bytes-estimate: 112974 +point-deletions-bytes-estimate: 113036 
range-deletions-bytes-estimate: 0