From a19391e252f2da527682da1ab339e8e9488f9dd1 Mon Sep 17 00:00:00 2001 From: Jackson Owens Date: Thu, 2 May 2024 17:31:10 -0400 Subject: [PATCH] db: refactor ingest overlap checking Refactor the overlap checking to avoid using the levelIter to populate the range deletion iterator. This was subtle and confusing. This refactor will also benefit the implementation of #2112. Informs #2112. Informs #2863. --- checkpoint_test.go | 3 + compaction.go | 21 ++++-- flushable.go | 35 ++++++++- ingest.go | 179 ++++++++++----------------------------------- ingest_test.go | 18 ++++- overlap.go | 163 +++++++++++++++++++++++++++++++++++++++++ 6 files changed, 264 insertions(+), 155 deletions(-) create mode 100644 overlap.go diff --git a/checkpoint_test.go b/checkpoint_test.go index 54e3127e019..fa7e7ed65c6 100644 --- a/checkpoint_test.go +++ b/checkpoint_test.go @@ -25,6 +25,9 @@ import ( func testCheckpointImpl(t *testing.T, ddFile string, createOnShared bool) { dbs := make(map[string]*DB) defer func() { + if r := recover(); r != nil { + panic(r) + } for _, db := range dbs { if db.closed.Load() == nil { require.NoError(t, db.Close()) diff --git a/compaction.go b/compaction.go index f5da98d7aaa..6c24f9f3140 100644 --- a/compaction.go +++ b/compaction.go @@ -1245,7 +1245,6 @@ func (d *DB) runIngestFlush(c *compaction) (*manifest.VersionEdit, error) { c.version = d.mu.versions.currentVersion() baseLevel := d.mu.versions.picker.getBaseLevel() - iterOpts := IterOptions{logger: d.opts.Logger} ve := &versionEdit{} var ingestSplitFiles []ingestSplitFile ingestFlushable := c.flushing[0].flushable.(*ingestedFlushable) @@ -1272,6 +1271,19 @@ func (d *DB) runIngestFlush(c *compaction) (*manifest.VersionEdit, error) { ve.DeletedFiles = make(map[manifest.DeletedFileEntry]*manifest.FileMetadata) } + ctx := context.Background() + overlapChecker := &overlapChecker{ + comparer: d.opts.Comparer, + newIters: d.newIters, + opts: IterOptions{ + logger: d.opts.Logger, + CategoryAndQoS: sstable.CategoryAndQoS{ + Category: "pebble-ingest", + QoSLevel: sstable.LatencySensitiveQoSLevel, + }, + }, + v: c.version, + } replacedFiles := make(map[base.FileNum][]newFileEntry) for _, file := range ingestFlushable.files { var fileToSplit *fileMetadata @@ -1284,11 +1296,8 @@ func (d *DB) runIngestFlush(c *compaction) (*manifest.VersionEdit, error) { level = 6 } else { var err error - level, fileToSplit, err = ingestTargetLevel( - d.newIters, d.tableNewRangeKeyIter, iterOpts, d.opts.Comparer, - c.version, baseLevel, d.mu.compact.inProgress, file.FileMetadata, - suggestSplit, - ) + level, fileToSplit, err = ingestTargetLevel(ctx, overlapChecker, + baseLevel, d.mu.compact.inProgress, file.FileMetadata, suggestSplit) if err != nil { return nil, err } diff --git a/flushable.go b/flushable.go index 7103a0de2d3..ce3b74fbb8d 100644 --- a/flushable.go +++ b/flushable.go @@ -301,8 +301,7 @@ func (s *ingestedFlushable) computePossibleOverlaps( fn func(bounded) shouldContinue, bounded ...bounded, ) { for _, b := range bounded { - bounds := b.UserKeyBounds() - if s.anyFileOverlaps(b, &bounds) { + if s.anyFileOverlaps(b.UserKeyBounds()) { // Some file overlaps in key boundaries. The file doesn't necessarily // contain any keys within the key range, but we would need to perform I/O // to know for sure. The flushable interface dictates that we're not @@ -316,7 +315,7 @@ func (s *ingestedFlushable) computePossibleOverlaps( // anyFileBoundsOverlap returns true if there is at least a file in s.files with // bounds that overlap the given bounds. -func (s *ingestedFlushable) anyFileOverlaps(b bounded, bounds *base.UserKeyBounds) bool { +func (s *ingestedFlushable) anyFileOverlaps(bounds base.UserKeyBounds) bool { // Note that s.files are non-overlapping and sorted. for _, f := range s.files { fileBounds := f.UserKeyBounds() @@ -350,7 +349,11 @@ func computePossibleOverlapsGenericImpl[F flushable]( rangeDelIter := f.newRangeDelIter(nil) rkeyIter := f.newRangeKeyIter(nil) for _, b := range bounded { - if overlapWithIterator(iter, &rangeDelIter, rkeyIter, b.UserKeyBounds(), cmp) { + overlap, err := determineOverlapAllIters(cmp, b.UserKeyBounds(), iter, rangeDelIter, rkeyIter) + if invariants.Enabled && err != nil { + panic(errors.AssertionFailedf("expected iterator to be infallible: %v", err)) + } + if overlap { if !fn(b) { break } @@ -367,3 +370,27 @@ func computePossibleOverlapsGenericImpl[F flushable]( } } } + +// determineOverlapAllIters checks for overlap in a point iterator, range +// deletion iterator and range key iterator. +func determineOverlapAllIters( + cmp base.Compare, + bounds base.UserKeyBounds, + pointIter base.InternalIterator, + rangeDelIter, rangeKeyIter keyspan.FragmentIterator, +) (bool, error) { + if pointIter != nil { + if pointOverlap, err := determineOverlapPointIterator(cmp, bounds, pointIter); pointOverlap || err != nil { + return pointOverlap, err + } + } + if rangeDelIter != nil { + if rangeDelOverlap, err := determineOverlapKeyspanIterator(cmp, bounds, rangeDelIter); rangeDelOverlap || err != nil { + return rangeDelOverlap, err + } + } + if rangeKeyIter != nil { + return determineOverlapKeyspanIterator(cmp, bounds, rangeKeyIter) + } + return false, nil +} diff --git a/ingest.go b/ingest.go index 96a9819b1c2..e8100fa2d29 100644 --- a/ingest.go +++ b/ingest.go @@ -14,8 +14,6 @@ import ( "github.com/cockroachdb/errors" "github.com/cockroachdb/pebble/internal/base" "github.com/cockroachdb/pebble/internal/invariants" - "github.com/cockroachdb/pebble/internal/keyspan" - "github.com/cockroachdb/pebble/internal/keyspan/keyspanimpl" "github.com/cockroachdb/pebble/internal/manifest" "github.com/cockroachdb/pebble/internal/private" "github.com/cockroachdb/pebble/objstorage" @@ -888,92 +886,13 @@ func ingestUpdateSeqNum( return nil } -// overlapWIthIterator returns true if the given iterators produce keys or spans -// overlapping the given UserKeyBounds. May return false positives (e.g. in -// error cases). -func overlapWithIterator( - iter internalIterator, - rangeDelIter *keyspan.FragmentIterator, - rkeyIter keyspan.FragmentIterator, - bounds base.UserKeyBounds, - cmp Compare, -) bool { - // Check overlap with point operations. - // - // When using levelIter, it seeks to the SST whose boundaries - // contain keyRange.smallest.UserKey(S). - // It then tries to find a point in that SST that is >= S. - // If there's no such point it means the SST ends in a tombstone in which case - // levelIter.SeekGE generates a boundary range del sentinel. - // The comparison of this boundary with keyRange.largest(L) below - // is subtle but maintains correctness. - // 1) boundary < L, - // since boundary is also > S (initial seek), - // whatever the boundary's start key may be, we're always overlapping. - // 2) boundary > L, - // overlap with boundary cannot be determined since we don't know boundary's start key. - // We require checking for overlap with rangeDelIter. - // 3) boundary == L and L is not sentinel, - // means boundary < L and hence is similar to 1). - // 4) boundary == L and L is sentinel, - // we'll always overlap since for any values of i,j ranges [i, k) and [j, k) always overlap. - kv := iter.SeekGE(bounds.Start, base.SeekGEFlagsNone) - if kv != nil { - if bounds.End.IsUpperBoundForInternalKey(cmp, kv.K) { - return true - } - } - // Assume overlap if iterator errored. - if err := iter.Error(); err != nil { - return true - } - - computeOverlapWithSpans := func(rIter keyspan.FragmentIterator) (bool, error) { - // NB: The spans surfaced by the fragment iterator are non-overlapping. - span, err := rIter.SeekGE(bounds.Start) - if err != nil { - return false, err - } - for ; span != nil; span, err = rIter.Next() { - if !bounds.End.IsUpperBoundFor(cmp, span.Start) { - // The span starts after our bounds. - return false, nil - } - if !span.Empty() { - return true, nil - } - } - return false, err - } - - // rkeyIter is either a range key level iter, or a range key iterator - // over a single file. - if rkeyIter != nil { - // If an error occurs, assume overlap. - if overlap, err := computeOverlapWithSpans(rkeyIter); overlap || err != nil { - return true - } - } - - // Check overlap with range deletions. - if rangeDelIter == nil || *rangeDelIter == nil { - return false - } - overlap, err := computeOverlapWithSpans(*rangeDelIter) - // If an error occurs, assume overlap. - return overlap || err != nil -} - // ingestTargetLevel returns the target level for a file being ingested. // If suggestSplit is true, it accounts for ingest-time splitting as part of // its target level calculation, and if a split candidate is found, that file // is returned as the splitFile. func ingestTargetLevel( - newIters tableNewIters, - newRangeKeyIter keyspanimpl.TableNewSpanIter, - iterOps IterOptions, - comparer *Comparer, - v *version, + ctx context.Context, + overlapChecker *overlapChecker, baseLevel int, compactions map[*compaction]struct{}, meta *fileMetadata, @@ -1047,68 +966,29 @@ func ingestTargetLevel( // This assertion implicitly checks that we have the current version of // the metadata. - if v.L0Sublevels == nil { + if overlapChecker.v.L0Sublevels == nil { return 0, nil, base.AssertionFailedf("could not read L0 sublevels") } - iterOps.CategoryAndQoS = sstable.CategoryAndQoS{ - Category: "pebble-ingest", - QoSLevel: sstable.LatencySensitiveQoSLevel, - } - // Check for overlap over the keys of L0 by iterating over the sublevels. - for subLevel := 0; subLevel < len(v.L0SublevelFiles); subLevel++ { - iter := newLevelIter(context.Background(), - iterOps, comparer, newIters, v.L0Sublevels.Levels[subLevel].Iter(), manifest.Level(0), internalIterOpts{}) - - var rangeDelIter keyspan.FragmentIterator - // Pass in a non-nil pointer to rangeDelIter so that levelIter.findFileGE - // sets it up for the target file. - iter.initRangeDel(&rangeDelIter) + bounds := meta.UserKeyBounds() - levelIter := keyspanimpl.LevelIter{} - levelIter.Init( - keyspan.SpanIterOptions{}, comparer.Compare, newRangeKeyIter, - v.L0Sublevels.Levels[subLevel].Iter(), manifest.Level(0), manifest.KeyTypeRange, - ) - - overlap := overlapWithIterator(iter, &rangeDelIter, &levelIter, meta.UserKeyBounds(), comparer.Compare) - err := iter.Close() // Closes range del iter as well. - err = firstError(err, levelIter.Close()) - if err != nil { - return 0, nil, err - } - if overlap { - return targetLevel, nil, nil - } + // Check for overlap over the keys of L0. + if overlap, err := overlapChecker.determineAnyOverlapInLevel(ctx, bounds, 0); err != nil { + return 0, nil, err + } else if overlap { + return 0, nil, nil } - level := baseLevel - for ; level < numLevels; level++ { - levelIter := newLevelIter(context.Background(), - iterOps, comparer, newIters, v.Levels[level].Iter(), manifest.Level(level), internalIterOpts{}) - var rangeDelIter keyspan.FragmentIterator - // Pass in a non-nil pointer to rangeDelIter so that levelIter.findFileGE - // sets it up for the target file. - levelIter.initRangeDel(&rangeDelIter) - - rkeyLevelIter := &keyspanimpl.LevelIter{} - rkeyLevelIter.Init( - keyspan.SpanIterOptions{}, comparer.Compare, newRangeKeyIter, - v.Levels[level].Iter(), manifest.Level(level), manifest.KeyTypeRange, - ) - - overlap := overlapWithIterator(levelIter, &rangeDelIter, rkeyLevelIter, meta.UserKeyBounds(), comparer.Compare) - err := levelIter.Close() // Closes range del iter as well. - err = firstError(err, rkeyLevelIter.Close()) + for level := baseLevel; level < numLevels; level++ { + dataOverlap, err := overlapChecker.determineAnyOverlapInLevel(ctx, bounds, level) if err != nil { return 0, nil, err - } - if overlap { + } else if dataOverlap { return targetLevel, splitFile, nil } // Check boundary overlap. var candidateSplitFile *fileMetadata - boundaryOverlaps := v.Overlaps(level, meta.UserKeyBounds()) + boundaryOverlaps := overlapChecker.v.Overlaps(level, bounds) if !boundaryOverlaps.Empty() { // We are already guaranteed to not have any data overlaps with files // in boundaryOverlaps, otherwise we'd have returned in the above if @@ -1145,8 +1025,8 @@ func ingestTargetLevel( if c.outputLevel == nil || level != c.outputLevel.level { continue } - if comparer.Compare(meta.Smallest.UserKey, c.largest.UserKey) <= 0 && - comparer.Compare(meta.Largest.UserKey, c.smallest.UserKey) >= 0 { + if overlapChecker.comparer.Compare(meta.Smallest.UserKey, c.largest.UserKey) <= 0 && + overlapChecker.comparer.Compare(meta.Largest.UserKey, c.smallest.UserKey) >= 0 { overlaps = true break } @@ -1513,6 +1393,7 @@ func (d *DB) ingest( } } } + ctx := context.Background() // Allocate file numbers for all of the files being ingested and mark them as // pending in order to prevent them from being deleted. Note that this causes // the file number ordering to be out of alignment with sequence number @@ -1787,7 +1668,7 @@ func (d *DB) ingest( // Assign the sstables to the correct level in the LSM and apply the // version edit. - ve, err = d.ingestApply(jobID, loadResult, mut, exciseSpan, seqNum) + ve, err = d.ingestApply(ctx, jobID, loadResult, mut, exciseSpan, seqNum) } // Only one ingest can occur at a time because if not, one would block waiting @@ -2268,7 +2149,12 @@ func (d *DB) ingestSplit( } func (d *DB) ingestApply( - jobID JobID, lr ingestLoadResult, mut *memTable, exciseSpan KeyRange, exciseSeqNum uint64, + ctx context.Context, + jobID JobID, + lr ingestLoadResult, + mut *memTable, + exciseSpan KeyRange, + exciseSeqNum uint64, ) (*versionEdit, error) { d.mu.Lock() defer d.mu.Unlock() @@ -2301,11 +2187,22 @@ func (d *DB) ingestApply( } } + current := d.mu.versions.currentVersion() + overlapChecker := &overlapChecker{ + comparer: d.opts.Comparer, + newIters: d.newIters, + opts: IterOptions{ + logger: d.opts.Logger, + CategoryAndQoS: sstable.CategoryAndQoS{ + Category: "pebble-ingest", + QoSLevel: sstable.LatencySensitiveQoSLevel, + }, + }, + v: current, + } shouldIngestSplit := d.opts.Experimental.IngestSplit != nil && d.opts.Experimental.IngestSplit() && d.FormatMajorVersion() >= FormatVirtualSSTables - current := d.mu.versions.currentVersion() baseLevel := d.mu.versions.picker.getBaseLevel() - iterOps := IterOptions{logger: d.opts.Logger} // filesToSplit is a list where each element is a pair consisting of a file // being ingested and a file being split to make room for an ingestion into // that level. Each ingested file will appear at most once in this list. It @@ -2370,8 +2267,8 @@ func (d *DB) ingestApply( // not see any version applications while we're at this. The one // complication here would be pulling out the mu.compact.inProgress // check from ingestTargetLevel, as that requires d.mu to be held. - f.Level, splitFile, err = ingestTargetLevel( - d.newIters, d.tableNewRangeKeyIter, iterOps, d.opts.Comparer, current, baseLevel, d.mu.compact.inProgress, m, shouldIngestSplit) + f.Level, splitFile, err = ingestTargetLevel(ctx, overlapChecker, + baseLevel, d.mu.compact.inProgress, m, shouldIngestSplit) } if splitFile != nil { diff --git a/ingest_test.go b/ingest_test.go index 2e76140c47b..edd3e51f3ec 100644 --- a/ingest_test.go +++ b/ingest_test.go @@ -2277,10 +2277,20 @@ func TestIngestTargetLevel(t *testing.T) { for _, target := range strings.Split(td.Input, "\n") { meta, err := manifest.ParseFileMetadataDebug(target) require.NoError(t, err) - level, overlapFile, err := ingestTargetLevel( - d.newIters, d.tableNewRangeKeyIter, IterOptions{logger: d.opts.Logger}, - d.opts.Comparer, d.mu.versions.currentVersion(), 1, d.mu.compact.inProgress, meta, - suggestSplit) + overlapChecker := &overlapChecker{ + comparer: d.opts.Comparer, + newIters: d.newIters, + opts: IterOptions{ + logger: d.opts.Logger, + CategoryAndQoS: sstable.CategoryAndQoS{ + Category: "pebble-ingest", + QoSLevel: sstable.LatencySensitiveQoSLevel, + }, + }, + v: d.mu.versions.currentVersion(), + } + level, overlapFile, err := ingestTargetLevel(context.Background(), overlapChecker, + 1, d.mu.compact.inProgress, meta, suggestSplit) if err != nil { return err.Error() } diff --git a/overlap.go b/overlap.go new file mode 100644 index 00000000000..71e1ad653c7 --- /dev/null +++ b/overlap.go @@ -0,0 +1,163 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +import ( + "context" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/keyspan" + "github.com/cockroachdb/pebble/internal/keyspan/keyspanimpl" + "github.com/cockroachdb/pebble/internal/manifest" +) + +// An overlapChecker provides facilities for checking whether any keys within a +// particular LSM version overlap a set of bounds. +type overlapChecker struct { + comparer *base.Comparer + newIters tableNewIters + opts IterOptions + v *version + + // bufs; reused across calls to avoid allocations. + upperBoundBuf []byte + pointLevelIter levelIter + keyspanLevelIter keyspanimpl.LevelIter +} + +func (c *overlapChecker) determineAnyOverlapInLevel( + ctx context.Context, bounds base.UserKeyBounds, level int, +) (bool, error) { + // Propagating an upper bound can prevent a levelIter from unnecessarily + // opening files that fall outside bounds if no files within a level overlap + // the provided bounds. + c.opts.UpperBound = nil + if bounds.End.Kind == base.Exclusive { + c.opts.UpperBound = bounds.End.Key + } else if c.comparer.ImmediateSuccessor != nil { + si := c.comparer.Split(bounds.End.Key) + c.upperBoundBuf = c.comparer.ImmediateSuccessor(c.upperBoundBuf[:0], bounds.End.Key[:si]) + c.opts.UpperBound = c.upperBoundBuf + } + + // Check for overlap over the keys of L0 by iterating over the sublevels. + // NB: sublevel 0 contains the newest keys, whereas sublevel n contains the + // oldest keys. + if level == 0 { + for subLevel := 0; subLevel < len(c.v.L0SublevelFiles); subLevel++ { + manifestIter := c.v.L0Sublevels.Levels[subLevel].Iter() + pointOverlap, err := c.determinePointKeyOverlapInLevel( + ctx, bounds, manifest.Level(0), manifestIter) + if err != nil { + return false, err + } else if pointOverlap { + return true, nil + } + rangeOverlap, err := c.determineRangeKeyOverlapInLevel( + ctx, bounds, manifest.Level(0), manifestIter) + if err != nil { + return false, err + } else if rangeOverlap { + return true, nil + } + } + return false, nil + } + + // Note that the ordering of checking for point key overlap first is + // significant. When checking for range key overlap, we use + // c.v.RangeKeyLevels which only contains file metadata for files that + // contain range keys. If a level contains both point and range key overlap, + // the sequence number returned by largestSeqNumInBounds using + // rangeManifestIter is not guaranteed to be at least as high as the largest + // overlapping point key. Since the check for point keys uses c.v.Levels + // which contains all files, the inverse is not true. + pointManifestIter := c.v.Levels[level].Iter() + pointOverlap, err := c.determinePointKeyOverlapInLevel( + ctx, bounds, manifest.Level(level), pointManifestIter) + if pointOverlap || err != nil { + return pointOverlap, err + } + + rangeManifestIter := c.v.RangeKeyLevels[level].Iter() + return c.determineRangeKeyOverlapInLevel( + ctx, bounds, manifest.Level(level), rangeManifestIter) +} + +func (c *overlapChecker) determinePointKeyOverlapInLevel( + ctx context.Context, + bounds base.UserKeyBounds, + level manifest.Level, + metadataIter manifest.LevelIterator, +) (bool, error) { + // Check for overlapping point keys. + { + c.pointLevelIter.init(ctx, c.opts, c.comparer, c.newIters, metadataIter, level, internalIterOpts{}) + pointOverlap, err := determineOverlapPointIterator(c.comparer.Compare, bounds, &c.pointLevelIter) + err = errors.CombineErrors(err, c.pointLevelIter.Close()) + if pointOverlap || err != nil { + return pointOverlap, err + } + } + // Check for overlapping range deletions. + { + c.keyspanLevelIter.Init( + keyspan.SpanIterOptions{}, c.comparer.Compare, tableNewRangeDelIter(ctx, c.newIters), + metadataIter, level, manifest.KeyTypePoint, + ) + rangeDeletionOverlap, err := determineOverlapKeyspanIterator(c.comparer.Compare, bounds, &c.keyspanLevelIter) + err = errors.CombineErrors(err, c.keyspanLevelIter.Close()) + if rangeDeletionOverlap || err != nil { + return rangeDeletionOverlap, err + } + } + return false, nil +} + +func (c *overlapChecker) determineRangeKeyOverlapInLevel( + ctx context.Context, + bounds base.UserKeyBounds, + level manifest.Level, + metadataIter manifest.LevelIterator, +) (bool, error) { + // Check for overlapping range keys. + c.keyspanLevelIter.Init( + keyspan.SpanIterOptions{}, c.comparer.Compare, tableNewRangeKeyIter(ctx, c.newIters), + metadataIter, level, manifest.KeyTypeRange, + ) + rangeKeyOverlap, err := determineOverlapKeyspanIterator(c.comparer.Compare, bounds, &c.keyspanLevelIter) + return rangeKeyOverlap, errors.CombineErrors(err, c.keyspanLevelIter.Close()) +} + +func determineOverlapPointIterator( + cmp base.Compare, bounds base.UserKeyBounds, iter internalIterator, +) (bool, error) { + kv := iter.SeekGE(bounds.Start, base.SeekGEFlagsNone) + if kv == nil { + return false, iter.Error() + } + return bounds.End.IsUpperBoundForInternalKey(cmp, kv.K), nil +} + +func determineOverlapKeyspanIterator( + cmp base.Compare, bounds base.UserKeyBounds, iter keyspan.FragmentIterator, +) (bool, error) { + // NB: The spans surfaced by the fragment iterator are non-overlapping. + span, err := iter.SeekGE(bounds.Start) + if err != nil { + return false, err + } + for ; span != nil; span, err = iter.Next() { + if !bounds.End.IsUpperBoundFor(cmp, span.Start) { + // The span starts after our bounds. + return false, nil + } + if !span.Empty() { + return true, nil + } + } + return false, err +}