From f31f26118789d5d456ee91ecf7bd0fd56e282c3b Mon Sep 17 00:00:00 2001 From: Jackson Owens Date: Thu, 28 Mar 2024 18:02:34 -0400 Subject: [PATCH] db: unconditionally interleave ignorable boundaries The levelIter has a concept of ignorable boundary keys. When a levelIter exhausts a file's point keys, it's possible that the same file still contains range deletions relevant to other levels of the LSM. The levelIter will, under some conditions, interleave the largest boundary key of the sstable into iteration as an 'ignorable boundary key,' so that the file's range deletions remain accessible until all other levels progress beyond the file's boundary. When block-property filters are in use, a file's point key iterator may become exhausted early, before the file's range deletions are irrelevant, even if the file's largest key is not a range deletion. To work around this subtlety, the sstable iterator previously would surface knowledge of whether any point keys may have been skipped through a MaybeFilteredKeys method. The levelIter used this method to determine when to interleave an ignorable largest boundary in case there may be additional relevant range deletions. This commit removes the conditioning on the output of MaybeFilteredKeys, instead unconditionally interleaving a synthetic boundary at a file's largest point key if the levelIter user is using the file's range deletion iterator. This simplifies the logic and removes a fragile reliance on the sstable iterator's accounting of when it may have filtered keys. Future work (#2863) will remove the need to interleave synthetic boundaries at all, instead interleaving the range deletion bounds themselves among point keys. Informs #2863. Close #3334.
--- level_checker.go | 78 ++++++++++++++++++---------------- level_iter.go | 44 ++++++++----------- merging_iter.go | 14 +++++- testdata/level_iter_boundaries | 2 + testdata/level_iter_seek | 6 +++ testdata/merging_iter | 2 +- 6 files changed, 80 insertions(+), 66 deletions(-) diff --git a/level_checker.go b/level_checker.go index 96488f3220..716d6f4727 100644 --- a/level_checker.go +++ b/level_checker.go @@ -131,7 +131,8 @@ func (m *simpleMergingIter) step() bool { item := &m.heap.items[0] l := &m.levels[item.index] // Sentinels are not relevant for this point checking. - if !item.key.IsExclusiveSentinel() && item.key.Visible(m.snapshot, base.InternalKeySeqNumMax) { + if !l.isIgnorableBoundaryKey && !item.key.IsExclusiveSentinel() && + item.key.Visible(m.snapshot, base.InternalKeySeqNumMax) { m.numPoints++ keyChanged := m.heap.cmp(item.key.UserKey, m.lastKey.UserKey) != 0 if !keyChanged { @@ -223,46 +224,49 @@ func (m *simpleMergingIter) step() bool { m.lastIterMsg = l.iter.String() // Step to the next point. - if l.iterKey, l.iterValue = l.iter.Next(); l.iterKey != nil { - // Check point keys in an sstable are ordered. Although not required, we check - // for memtables as well. A subtle check here is that successive sstables of - // L1 and higher levels are ordered. This happens when levelIter moves to the - // next sstable in the level, in which case item.key is previous sstable's - // last point key. - if base.InternalCompare(m.heap.cmp, item.key, *l.iterKey) >= 0 { - m.err = errors.Errorf("out of order keys %s >= %s in %s", - item.key.Pretty(m.formatKey), l.iterKey.Pretty(m.formatKey), l.iter) - return false + l.iterKey, l.iterValue = l.iter.Next() + if !l.isIgnorableBoundaryKey { + if l.iterKey != nil { + // Check point keys in an sstable are ordered. Although not required, we check + // for memtables as well. A subtle check here is that successive sstables of + // L1 and higher levels are ordered. 
This happens when levelIter moves to the + // next sstable in the level, in which case item.key is previous sstable's + // last point key. + if base.InternalCompare(m.heap.cmp, item.key, *l.iterKey) >= 0 { + m.err = errors.Errorf("out of order keys %s >= %s in %s", + item.key.Pretty(m.formatKey), l.iterKey.Pretty(m.formatKey), l.iter) + return false + } + item.key.Trailer = l.iterKey.Trailer + item.key.UserKey = append(item.key.UserKey[:0], l.iterKey.UserKey...) + item.value = l.iterValue + if m.heap.len() > 1 { + m.heap.fix(0) + } + } else { + m.err = l.iter.Close() + l.iter = nil + m.heap.pop() } - item.key.Trailer = l.iterKey.Trailer - item.key.UserKey = append(item.key.UserKey[:0], l.iterKey.UserKey...) - item.value = l.iterValue - if m.heap.len() > 1 { - m.heap.fix(0) + if m.err != nil { + return false } - } else { - m.err = l.iter.Close() - l.iter = nil - m.heap.pop() - } - if m.err != nil { - return false - } - if m.heap.len() == 0 { - // Last record was a MERGE record. - if m.valueMerger != nil { - var closer io.Closer - _, closer, m.err = m.valueMerger.Finish(true /* includesBase */) - if m.err == nil && closer != nil { - m.err = closer.Close() - } - if m.err != nil { - m.err = errors.Wrapf(m.err, "merge processing error on key %s in %s", - item.key.Pretty(m.formatKey), m.lastIterMsg) + if m.heap.len() == 0 { + // Last record was a MERGE record. 
+ if m.valueMerger != nil { + var closer io.Closer + _, closer, m.err = m.valueMerger.Finish(true /* includesBase */) + if m.err == nil && closer != nil { + m.err = closer.Close() + } + if m.err != nil { + m.err = errors.Wrapf(m.err, "merge processing error on key %s in %s", + item.key.Pretty(m.formatKey), m.lastIterMsg) + } + m.valueMerger = nil } - m.valueMerger = nil + return false } - return false } m.positionRangeDels() return true diff --git a/level_iter.go b/level_iter.go index c8fc5d256e..f115d8973b 100644 --- a/level_iter.go +++ b/level_iter.go @@ -1018,37 +1018,29 @@ func (l *levelIter) skipEmptyFileForward() (*InternalKey, base.LazyValue) { // bounds. return nil, base.LazyValue{} } - // If the boundary is a range deletion tombstone, return that key. - if l.iterFile.LargestPointKey.Kind() == InternalKeyKindRangeDelete { - l.largestBoundary = &l.iterFile.LargestPointKey - if l.boundaryContext != nil { - l.boundaryContext.isIgnorableBoundaryKey = true - } - return l.largestBoundary, base.LazyValue{} - } - // If the last point iterator positioning op might've skipped keys, - // it's possible the file's range deletions are still relevant to - // other levels. Return the largest boundary as a special ignorable - // marker to avoid advancing to the next file. - // - // The sstable iterator cannot guarantee that keys were skipped. A - // SeekGE that lands on a index separator k only knows that the - // block at the index entry contains keys ≤ k. We can't know whether - // there were actually keys between the seek key and the index - // separator key. If the block is then excluded due to block - // property filters, the iterator does not know whether keys were - // actually skipped by the block's exclusion. + // If the boundary is a range deletion tombstone, or the caller is + // accessing range dels through l.rangeDelIterPtr, pause at an + // ignorable boundary key to avoid advancing to the next file until + // other levels are caught up. 
// - // Since MaybeFilteredKeys cannot guarantee that keys were skipped, - // it's possible l.iterFile.Largest was already returned. Returning - // l.iterFile.Largest again is a violation of the strict + // Note that even if the largest boundary is not a range deletion, + // there may still be range deletions beyond the last point key + // returned. When block-property filters are in use, the sstable + // iterator may have transparently skipped a tail of the point keys + // in the file. If the last point key returned /was/ the largest + // key, then we'll return a key with the same user key and trailer + // twice. Returning it again is a violation of the strict // monotonicity normally provided. The mergingIter's heap can // tolerate this repeat key and in this case will keep the level at // the top of the heap and immediately skip the entry, advancing to // the next file. - if *l.rangeDelIterPtr != nil && l.filteredIter != nil && - l.filteredIter.MaybeFilteredKeys() { - l.largestBoundary = &l.iterFile.Largest + // + // TODO(jackson): We should be able to condition this only on + // *l.rangeDelIterPtr != nil, but the getIter retains tombstones + // returned by the rangeDelIter after it's nil'd the ptr. + if l.iterFile.LargestPointKey.Kind() == InternalKeyKindRangeDelete || + *l.rangeDelIterPtr != nil { + l.largestBoundary = &l.iterFile.LargestPointKey if l.boundaryContext != nil { l.boundaryContext.isIgnorableBoundaryKey = true } diff --git a/merging_iter.go b/merging_iter.go index 741d7687dd..a2a5a8fe1b 100644 --- a/merging_iter.go +++ b/merging_iter.go @@ -952,6 +952,17 @@ func (m *mergingIter) findPrevEntry() (*InternalKey, base.LazyValue) { if m.levels[item.index].isSyntheticIterBoundsKey { break } + // Skip ignorable boundary keys. These are not real keys and exist to + // keep sstables open until we've surpassed their end boundaries so that + // their range deletions are visible.
+ if m.levels[item.index].isIgnorableBoundaryKey { + m.err = m.prevEntry(item) + if m.err != nil { + return nil, base.LazyValue{} + } + continue + } + m.addItemStats(item) if isDeleted, err := m.isPrevEntryDeleted(item); err != nil { m.err = err @@ -960,8 +971,7 @@ func (m *mergingIter) findPrevEntry() (*InternalKey, base.LazyValue) { m.stats.PointsCoveredByRangeTombstones++ continue } - if item.iterKey.Visible(m.snapshot, m.batchSnapshot) && - (!m.levels[item.index].isIgnorableBoundaryKey) { + if item.iterKey.Visible(m.snapshot, m.batchSnapshot) { return item.iterKey, item.iterValue } m.err = m.prevEntry(item) diff --git a/testdata/level_iter_boundaries b/testdata/level_iter_boundaries index c6d57ebdfd..1dcd0395b8 100644 --- a/testdata/level_iter_boundaries +++ b/testdata/level_iter_boundaries @@ -131,8 +131,10 @@ c.SET.2:c iter first next +next ---- c#2,SET:c +c#2,SET: . iter diff --git a/testdata/level_iter_seek b/testdata/level_iter_seek index 1b00fee024..495d72e966 100644 --- a/testdata/level_iter_seek +++ b/testdata/level_iter_seek @@ -105,6 +105,8 @@ next stats next stats +next +stats reset-stats stats ---- @@ -117,6 +119,8 @@ c#7,SET:c {BlockBytes:56 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} f#5,SET:f {BlockBytes:56 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} +f#5,SET: +{BlockBytes:56 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} g#4,SET:g {BlockBytes:112 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} h#3,SET:h @@ -380,8 +384,10 @@ j.SET.3:j iter 
seek-ge f next +next ---- f/#5,SET:f +f#5,SET: i#4,SET:i # The below count should be 2, as we skip over the rangekey-only file. diff --git a/testdata/merging_iter b/testdata/merging_iter index ec61ee1d23..5c93e79e0a 100644 --- a/testdata/merging_iter +++ b/testdata/merging_iter @@ -40,7 +40,7 @@ c#27,SET:27 e#10,SET:10 g#20,SET:20 . -{BlockBytes:116 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:5 ValueBytes:8 PointCount:5 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} +{BlockBytes:116 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:6 ValueBytes:8 PointCount:6 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} {BlockBytes:0 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} # seekGE() should not allow the rangedel to act on points in the lower sstable that are after it.