db: unconditionally interleave ignorable boundaries

The levelIter has a concept of ignorable boundary keys. When a levelIter exhausts a file's point keys, it's possible that the same file still contains range deletions relevant to other levels of the LSM. The levelIter will, under some conditions, interleave the largest boundary key of the sstable into iteration as an 'ignorable boundary key,' so that the file's range deletions remain accessible until all other levels progress beyound the file's boundary. When block-property filters are in use, a file's point key iterator may become exhausted early, before the file's range deletions are irrelevant, even if the file's largest key is not a range deletion. To work around this subtlety, the sstable iterator previously would surface knowledge of whether any point keys may have been skipped through a MaybeFilteredKeys method. The levelIter used this method to determine when to interleave an ignorable largest boundary in case there may be additional relevant range deletions. This commit removes the conditioning on the output of MaybeFilteredKeys, instead unconditionally interleaving a synthetic boundary at a file's largest point key if the levelIter user is using the file's range deletion iterator. This simplifies the logic and removes a fragile reliance on the sstable's iterators accounting of when it may have filtered keys. Future work (#2863) will remove the need to interleave synthetic boundaries at all, instead interleaving the range deletion bounds themselves among point keys. Informs #2863. Close #3334.
cockroachdb · Apr 5, 2024 · f31f261 · f31f261
1 parent 65b27f0
commit f31f261
Show file tree

Hide file tree

Showing 6 changed files with 80 additions and 66 deletions.
diff --git a/level_checker.go b/level_checker.go
@@ -131,7 +131,8 @@ func (m *simpleMergingIter) step() bool {
 	item := &m.heap.items[0]
 	l := &m.levels[item.index]
 	// Sentinels are not relevant for this point checking.
-	if !item.key.IsExclusiveSentinel() && item.key.Visible(m.snapshot, base.InternalKeySeqNumMax) {
+	if !l.isIgnorableBoundaryKey && !item.key.IsExclusiveSentinel() &&
+		item.key.Visible(m.snapshot, base.InternalKeySeqNumMax) {
 		m.numPoints++
 		keyChanged := m.heap.cmp(item.key.UserKey, m.lastKey.UserKey) != 0
 		if !keyChanged {
@@ -223,46 +224,49 @@ func (m *simpleMergingIter) step() bool {
 	m.lastIterMsg = l.iter.String()
 
 	// Step to the next point.
-	if l.iterKey, l.iterValue = l.iter.Next(); l.iterKey != nil {
-		// Check point keys in an sstable are ordered. Although not required, we check
-		// for memtables as well. A subtle check here is that successive sstables of
-		// L1 and higher levels are ordered. This happens when levelIter moves to the
-		// next sstable in the level, in which case item.key is previous sstable's
-		// last point key.
-		if base.InternalCompare(m.heap.cmp, item.key, *l.iterKey) >= 0 {
-			m.err = errors.Errorf("out of order keys %s >= %s in %s",
-				item.key.Pretty(m.formatKey), l.iterKey.Pretty(m.formatKey), l.iter)
-			return false
+	l.iterKey, l.iterValue = l.iter.Next()
+	if !l.isIgnorableBoundaryKey {
+		if l.iterKey != nil {
+			// Check point keys in an sstable are ordered. Although not required, we check
+			// for memtables as well. A subtle check here is that successive sstables of
+			// L1 and higher levels are ordered. This happens when levelIter moves to the
+			// next sstable in the level, in which case item.key is previous sstable's
+			// last point key.
+			if base.InternalCompare(m.heap.cmp, item.key, *l.iterKey) >= 0 {
+				m.err = errors.Errorf("out of order keys %s >= %s in %s",
+					item.key.Pretty(m.formatKey), l.iterKey.Pretty(m.formatKey), l.iter)
+				return false
+			}
+			item.key.Trailer = l.iterKey.Trailer
+			item.key.UserKey = append(item.key.UserKey[:0], l.iterKey.UserKey...)
+			item.value = l.iterValue
+			if m.heap.len() > 1 {
+				m.heap.fix(0)
+			}
+		} else {
+			m.err = l.iter.Close()
+			l.iter = nil
+			m.heap.pop()
 		}
-		item.key.Trailer = l.iterKey.Trailer
-		item.key.UserKey = append(item.key.UserKey[:0], l.iterKey.UserKey...)
-		item.value = l.iterValue
-		if m.heap.len() > 1 {
-			m.heap.fix(0)
+		if m.err != nil {
+			return false
 		}
-	} else {
-		m.err = l.iter.Close()
-		l.iter = nil
-		m.heap.pop()
-	}
-	if m.err != nil {
-		return false
-	}
-	if m.heap.len() == 0 {
-		// Last record was a MERGE record.
-		if m.valueMerger != nil {
-			var closer io.Closer
-			_, closer, m.err = m.valueMerger.Finish(true /* includesBase */)
-			if m.err == nil && closer != nil {
-				m.err = closer.Close()
-			}
-			if m.err != nil {
-				m.err = errors.Wrapf(m.err, "merge processing error on key %s in %s",
-					item.key.Pretty(m.formatKey), m.lastIterMsg)
+		if m.heap.len() == 0 {
+			// Last record was a MERGE record.
+			if m.valueMerger != nil {
+				var closer io.Closer
+				_, closer, m.err = m.valueMerger.Finish(true /* includesBase */)
+				if m.err == nil && closer != nil {
+					m.err = closer.Close()
+				}
+				if m.err != nil {
+					m.err = errors.Wrapf(m.err, "merge processing error on key %s in %s",
+						item.key.Pretty(m.formatKey), m.lastIterMsg)
+				}
+				m.valueMerger = nil
 			}
-			m.valueMerger = nil
+			return false
 		}
-		return false
 	}
 	m.positionRangeDels()
 	return true

diff --git a/level_iter.go b/level_iter.go
@@ -1018,37 +1018,29 @@ func (l *levelIter) skipEmptyFileForward() (*InternalKey, base.LazyValue) {
 				// bounds.
 				return nil, base.LazyValue{}
 			}
-			// If the boundary is a range deletion tombstone, return that key.
-			if l.iterFile.LargestPointKey.Kind() == InternalKeyKindRangeDelete {
-				l.largestBoundary = &l.iterFile.LargestPointKey
-				if l.boundaryContext != nil {
-					l.boundaryContext.isIgnorableBoundaryKey = true
-				}
-				return l.largestBoundary, base.LazyValue{}
-			}
-			// If the last point iterator positioning op might've skipped keys,
-			// it's possible the file's range deletions are still relevant to
-			// other levels. Return the largest boundary as a special ignorable
-			// marker to avoid advancing to the next file.
-			//
-			// The sstable iterator cannot guarantee that keys were skipped. A
-			// SeekGE that lands on a index separator k only knows that the
-			// block at the index entry contains keys ≤ k. We can't know whether
-			// there were actually keys between the seek key and the index
-			// separator key. If the block is then excluded due to block
-			// property filters, the iterator does not know whether keys were
-			// actually skipped by the block's exclusion.
+			// If the boundary is a range deletion tombstone, or the caller is
+			// accessing range dels through l.rangeDelIterPtr, pause at an
+			// ignorable boundary key to avoid advancing to the next file until
+			// other levels are caught up.
 			//
-			// Since MaybeFilteredKeys cannot guarantee that keys were skipped,
-			// it's possible l.iterFile.Largest was already returned. Returning
-			// l.iterFile.Largest again is a violation of the strict
+			// Note that even if the largest boundary is not a range deletion,
+			// there may still be range deletions beyong the last point key
+			// returned. When block-property filters are in use, the sstable
+			// iterator may have transparently skipped a tail of the point keys
+			// in the file. If the last point key returned /was/ the largest
+			// key, then we'll return a key with the same user key and trailer
+			// twice.  Returning it again is a violation of the strict
 			// monotonicity normally provided. The mergingIter's heap can
 			// tolerate this repeat key and in this case will keep the level at
 			// the top of the heap and immediately skip the entry, advancing to
 			// the next file.
-			if *l.rangeDelIterPtr != nil && l.filteredIter != nil &&
-				l.filteredIter.MaybeFilteredKeys() {
-				l.largestBoundary = &l.iterFile.Largest
+			//
+			// TODO(jackson): We should be able to condition this only on
+			// *l.rangeDelIterPtr != nil, but the getIter retains tombstones
+			// returned by the rangeDelIter after it's nil'd the ptr.
+			if l.iterFile.LargestPointKey.Kind() == InternalKeyKindRangeDelete ||
+				*l.rangeDelIterPtr != nil {
+				l.largestBoundary = &l.iterFile.LargestPointKey
 				if l.boundaryContext != nil {
 					l.boundaryContext.isIgnorableBoundaryKey = true
 				}

diff --git a/merging_iter.go b/merging_iter.go
@@ -952,6 +952,17 @@ func (m *mergingIter) findPrevEntry() (*InternalKey, base.LazyValue) {
 		if m.levels[item.index].isSyntheticIterBoundsKey {
 			break
 		}
+		// Skip ignorable boundary keys. These are not real keys and exist to
+		// keep sstables open until we've surpassed their end boundaries so that
+		// their range deletions are visible.
+		if m.levels[item.index].isIgnorableBoundaryKey {
+			m.err = m.prevEntry(item)
+			if m.err != nil {
+				return nil, base.LazyValue{}
+			}
+			continue
+		}
+
 		m.addItemStats(item)
 		if isDeleted, err := m.isPrevEntryDeleted(item); err != nil {
 			m.err = err
@@ -960,8 +971,7 @@ func (m *mergingIter) findPrevEntry() (*InternalKey, base.LazyValue) {
 			m.stats.PointsCoveredByRangeTombstones++
 			continue
 		}
-		if item.iterKey.Visible(m.snapshot, m.batchSnapshot) &&
-			(!m.levels[item.index].isIgnorableBoundaryKey) {
+		if item.iterKey.Visible(m.snapshot, m.batchSnapshot) {
 			return item.iterKey, item.iterValue
 		}
 		m.err = m.prevEntry(item)

diff --git a/testdata/level_iter_boundaries b/testdata/level_iter_boundaries
@@ -131,8 +131,10 @@ c.SET.2:c
 iter
 first
 next
+next
 ----
 c#2,SET:c
+c#2,SET:
 .
 
 iter

diff --git a/testdata/level_iter_seek b/testdata/level_iter_seek
@@ -105,6 +105,8 @@ next
 stats
 next
 stats
+next
+stats
 reset-stats
 stats
 ----
@@ -117,6 +119,8 @@ c#7,SET:c
 {BlockBytes:56 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}}
 f#5,SET:f
 {BlockBytes:56 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}}
+f#5,SET:
+{BlockBytes:56 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}}
 g#4,SET:g
 {BlockBytes:112 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}}
 h#3,SET:h
@@ -380,8 +384,10 @@ j.SET.3:j
 iter
 seek-ge f
 next
+next
 ----
 f/<invalid>#5,SET:f
+f#5,SET:
 i#4,SET:i
 
 # The below count should be 2, as we skip over the rangekey-only file.

diff --git a/testdata/merging_iter b/testdata/merging_iter
@@ -40,7 +40,7 @@ c#27,SET:27
 e#10,SET:10
 g#20,SET:20
 .
-{BlockBytes:116 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:5 ValueBytes:8 PointCount:5 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}}
+{BlockBytes:116 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:6 ValueBytes:8 PointCount:6 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}}
 {BlockBytes:0 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}}
 
 # seekGE() should not allow the rangedel to act on points in the lower sstable that are after it.
-Original file line number
+Diff line change
@@ Expand Up / @@ -131,8 +131,10 @@ c.SET.2:c @@
     iter
     first
     next
+    next
     ----
     c#2,SET:c
+    c#2,SET:
     .
     iter
@@ Expand Down @@