Skip to content

Commit

Permalink
internal/manifest: narrow Overlaps to exclude RANGEDEL sentinel keys.
Browse files Browse the repository at this point in the history
When the largest key in a sstable is a range tombstone, the sstable's largest
boundary is set to the range deletion sentinel with the tombstone's exclusive
end boundary and the maximum sequence number. The range deletion sentinel
serves as a marker, indicating that the file's bounds end immediately before
the first key with the sentinel's user key.

In many places such as growing compactions or determining atomic compaction
units, Pebble considers the sentinel as exclusive and avoids unnecessarily
including files that share the same user key but only as a sentinel's exclusive
end boundary.

The `(*manifest.Version).Overlaps` method did not share this logic and only
considered user keys. This method is used during compaction picking when
finding the initial compaction inputs, for determining in-use key ranges,
grandparent files, etc.

This change adapts this method to consider a file with a largest user key equal
to the search range's start user key nonoverlapping if the file's largest key
is a range deletion sentinel. Additionally, Overlaps now takes an exclusiveEnd
parameter, indicating whether the end user key provided to Overlaps should
similarly be treated as an exclusive bound.

This change is expected to reduce write amplification in the presence of range
deletions by avoiding unnecessarily pulling in files due to perceived overlap.

Additionally, some of these RangeDeletionSentinel comparisons are updated to
use a new (*InternalKey).IsExclusiveSentinel helper. With the introduction of
range keys, we will have additional exclusive end boundary key kinds.
  • Loading branch information
jbowens committed Jan 11, 2022
1 parent 8ab4358 commit d673586
Show file tree
Hide file tree
Showing 23 changed files with 298 additions and 241 deletions.
19 changes: 12 additions & 7 deletions compaction.go
Original file line number Diff line number Diff line change
Expand Up @@ -537,7 +537,7 @@ func newCompaction(pc *pickedCompaction, opts *Options, bytesCompacted *uint64)
// are the grandparent sstables).
if c.outputLevel.level+1 < numLevels {
c.grandparents = c.version.Overlaps(c.outputLevel.level+1, c.cmp,
c.smallest.UserKey, c.largest.UserKey)
c.smallest.UserKey, c.largest.UserKey, c.largest.IsExclusiveSentinel())
}
c.setupInuseKeyRanges()

Expand Down Expand Up @@ -718,8 +718,8 @@ func newFlush(
if opts.FlushSplitBytes > 0 {
c.maxOutputFileSize = uint64(opts.Level(0).TargetFileSize)
c.maxOverlapBytes = maxGrandparentOverlapBytes(opts, 0)
c.grandparents = c.version.Overlaps(baseLevel, c.cmp,
c.smallest.UserKey, c.largest.UserKey)
c.grandparents = c.version.Overlaps(baseLevel, c.cmp, c.smallest.UserKey,
c.largest.UserKey, c.largest.IsExclusiveSentinel())
adjustGrandparentOverlapBytesForFlush(c, flushingBytes)
}

Expand Down Expand Up @@ -752,7 +752,10 @@ func calculateInuseKeyRanges(
}

for ; level < numLevels; level++ {
overlaps := v.Overlaps(level, cmp, smallest, largest)
// NB: We always treat `largest` as inclusive for simplicity, because
// there's little consequence to calculating slightly broader in-use key
// ranges.
overlaps := v.Overlaps(level, cmp, smallest, largest, false /* exclusiveEnd */)
iter := overlaps.Iter()

// We may already have in-use key ranges from higher levels. Iterate
Expand Down Expand Up @@ -898,7 +901,7 @@ func (c *compaction) errorOnUserKeyOverlap(ve *versionEdit) error {
if n := len(ve.NewFiles); n > 1 {
meta := ve.NewFiles[n-1].Meta
prevMeta := ve.NewFiles[n-2].Meta
if prevMeta.Largest.Trailer != InternalKeyRangeDeleteSentinel &&
if !prevMeta.Largest.IsExclusiveSentinel() &&
c.cmp(prevMeta.Largest.UserKey, meta.Smallest.UserKey) >= 0 {
return errors.Errorf("pebble: compaction split user key across two sstables: %s in %s and %s",
prevMeta.Largest.Pretty(c.formatKey),
Expand Down Expand Up @@ -1694,6 +1697,8 @@ func (d *DB) maybeScheduleCompactionPicker(
// into the same snapshot stripe, a delete-only compaction may delete any
// sstables within the range.
type deleteCompactionHint struct {
// start and end are user keys specifying a key range [start, end) of
// deleted keys.
start []byte
end []byte
// The level of the file containing the range tombstone(s) when the hint
Expand Down Expand Up @@ -1742,7 +1747,7 @@ func (h *deleteCompactionHint) canDelete(cmp Compare, m *fileMetadata, snapshots
return false
}

// The file's keys must be completely contianed within the hint range.
// The file's keys must be completely contained within the hint range.
return cmp(h.start, m.Smallest.UserKey) <= 0 && cmp(m.Largest.UserKey, h.end) < 0
}

Expand Down Expand Up @@ -1854,7 +1859,7 @@ func checkDeleteCompactionHints(
// The hint h will be resolved and dropped, regardless of whether
// there are any tables that can be deleted.
for l := h.tombstoneLevel + 1; l < numLevels; l++ {
overlaps := v.Overlaps(l, cmp, h.start, h.end)
overlaps := v.Overlaps(l, cmp, h.start, h.end, true /* exclusiveEnd */)
iter := overlaps.Iter()
for m := iter.First(); m != nil; m = iter.Next() {
if m.Compacting || !h.canDelete(cmp, m, snapshots) || files[m] {
Expand Down
7 changes: 4 additions & 3 deletions compaction_iter.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
package pebble

import (
"fmt"
"io"
"sort"
"strconv"
Expand Down Expand Up @@ -456,9 +457,9 @@ func (i *compactionIter) skipInStripe() {

func (i *compactionIter) iterNext() bool {
i.iterKey, i.iterValue = i.iter.Next()
// We should never see a range delete sentinel in the compaction input.
if i.iterKey != nil && i.iterKey.Trailer == InternalKeyRangeDeleteSentinel {
panic("pebble: unexpected range delete sentinel in compaction input")
// We should never see an exclusive sentinel in the compaction input.
if i.iterKey != nil && i.iterKey.IsExclusiveSentinel() {
panic(fmt.Sprintf("pebble: unexpected exclusive sentinel in compaction input, trailer = %x", i.iterKey.Trailer))
}
return i.iterKey != nil
}
Expand Down
39 changes: 19 additions & 20 deletions compaction_picker.go
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,8 @@ func (pc *pickedCompaction) setupInputs(opts *Options, diskAvailBytes uint64) bo
// sstables, and then expand those tables to a clean cut. No need to do
// this for intra-L0 compactions; outputLevel.files is left empty for those.
if pc.startLevel.level != pc.outputLevel.level {
pc.outputLevel.files = pc.version.Overlaps(pc.outputLevel.level, pc.cmp, pc.smallest.UserKey, pc.largest.UserKey)
pc.outputLevel.files = pc.version.Overlaps(pc.outputLevel.level, pc.cmp, pc.smallest.UserKey,
pc.largest.UserKey, pc.largest.IsExclusiveSentinel())
pc.outputLevel.files, isCompacting = expandToAtomicUnit(pc.cmp, pc.outputLevel.files, false /* disableIsCompacting */)
if isCompacting {
return false
Expand Down Expand Up @@ -303,7 +304,8 @@ func (pc *pickedCompaction) grow(sm, la InternalKey, maxExpandedBytes uint64) bo
if pc.outputLevel.files.Empty() {
return false
}
grow0 := pc.version.Overlaps(pc.startLevel.level, pc.cmp, sm.UserKey, la.UserKey)
grow0 := pc.version.Overlaps(pc.startLevel.level, pc.cmp, sm.UserKey,
la.UserKey, la.IsExclusiveSentinel())
grow0, isCompacting := expandToAtomicUnit(pc.cmp, grow0, false /* disableIsCompacting */)
if isCompacting {
return false
Expand All @@ -315,7 +317,8 @@ func (pc *pickedCompaction) grow(sm, la InternalKey, maxExpandedBytes uint64) bo
return false
}
sm1, la1 := manifest.KeyRange(pc.cmp, grow0.Iter())
grow1 := pc.version.Overlaps(pc.outputLevel.level, pc.cmp, sm1.UserKey, la1.UserKey)
grow1 := pc.version.Overlaps(pc.outputLevel.level, pc.cmp, sm1.UserKey,
la1.UserKey, la1.IsExclusiveSentinel())
grow1, isCompacting = expandToAtomicUnit(pc.cmp, grow1, false /* disableIsCompacting */)
if isCompacting {
return false
Expand Down Expand Up @@ -392,12 +395,9 @@ func expandToAtomicUnit(
if cmp(prev.Largest.UserKey, cur.Smallest.UserKey) < 0 {
break
}
if prev.Largest.Trailer == InternalKeyRangeDeleteSentinel {
// The range deletion sentinel key is set for the largest key in a
// table when a range deletion tombstone straddles a table. It
// isn't necessary to include the prev table in the atomic
// compaction unit as prev.largest.UserKey does not actually exist
// in the prev table.
if prev.Largest.IsExclusiveSentinel() {
// The table prev has a largest key indicating that the user key
// prev.largest.UserKey doesn't actually exist in the table.
break
}
// prev.Largest.UserKey == cur.Smallest.UserKey, so we need to
Expand All @@ -413,12 +413,9 @@ func expandToAtomicUnit(
if cmp(cur.Largest.UserKey, next.Smallest.UserKey) < 0 {
break
}
if cur.Largest.Trailer == InternalKeyRangeDeleteSentinel {
// The range deletion sentinel key is set for the largest key
// in a table when a range deletion tombstone straddles a
// table. It isn't necessary to include the next table in the
// compaction as PeekPrev().Largest.UserKey does not actually
// exist in the table.
if cur.Largest.IsExclusiveSentinel() {
// The table cur has a largest key indicating that the user key
// cur.largest.UserKey doesn't actually exist in the table.
break
}
// cur.Largest.UserKey == next.Smallest.UserKey, so we need to
Expand Down Expand Up @@ -1169,7 +1166,8 @@ func pickAutoHelper(
if pc.startLevel.level == 0 {
cmp := opts.Comparer.Compare
smallest, largest := manifest.KeyRange(cmp, pc.startLevel.files.Iter())
pc.startLevel.files = vers.Overlaps(0, cmp, smallest.UserKey, largest.UserKey)
pc.startLevel.files = vers.Overlaps(0, cmp, smallest.UserKey,
largest.UserKey, largest.IsExclusiveSentinel())
if pc.startLevel.files.Empty() {
panic("pebble: empty compaction")
}
Expand Down Expand Up @@ -1383,7 +1381,8 @@ func pickManualHelper(
pc = newPickedCompaction(opts, vers, manual.level, baseLevel)
manual.outputLevel = pc.outputLevel.level
cmp := opts.Comparer.Compare
pc.startLevel.files = vers.Overlaps(manual.level, cmp, manual.start.UserKey, manual.end.UserKey)
pc.startLevel.files = vers.Overlaps(manual.level, cmp, manual.start.UserKey,
manual.end.UserKey, manual.end.IsExclusiveSentinel())
if pc.startLevel.files.Empty() {
// Nothing to do
return nil
Expand Down Expand Up @@ -1415,7 +1414,7 @@ func (p *compactionPickerByScore) pickReadTriggeredCompaction(
func pickReadTriggeredCompactionHelper(
p *compactionPickerByScore, rc *readCompaction, env compactionEnv) (pc *pickedCompaction) {
cmp := p.opts.Comparer.Compare
overlapSlice := p.vers.Overlaps(rc.level, cmp, rc.start, rc.end)
overlapSlice := p.vers.Overlaps(rc.level, cmp, rc.start, rc.end, false /* exclusiveEnd */)
if overlapSlice.Empty() {
// If there is no overlap, then the file with the key range
// must have been compacted away. So, we don't proceed to
Expand Down Expand Up @@ -1448,8 +1447,8 @@ func pickReadTriggeredCompactionHelper(

// Prevent read compactions which are too wide.
outputOverlaps := pc.version.Overlaps(
pc.outputLevel.level, pc.cmp, pc.smallest.UserKey, pc.largest.UserKey,
)
pc.outputLevel.level, pc.cmp, pc.smallest.UserKey,
pc.largest.UserKey, pc.largest.IsExclusiveSentinel())
if outputOverlaps.SizeSum() > pc.maxReadCompactionBytes {
return nil
}
Expand Down
2 changes: 1 addition & 1 deletion compaction_picker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1098,7 +1098,7 @@ func TestPickedCompactionSetupInputs(t *testing.T) {
}
pc.version = newVersion(opts, files)
pc.startLevel.files = pc.version.Overlaps(pc.startLevel.level, pc.cmp,
[]byte(args[0].String()), []byte(args[1].String()))
[]byte(args[0].String()), []byte(args[1].String()), false /* exclusiveEnd */)

var isCompacting bool
if !pc.setupInputs(opts, availBytes) {
Expand Down
3 changes: 2 additions & 1 deletion compaction_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1721,7 +1721,8 @@ func TestCompactionDeleteOnlyHints(t *testing.T) {
if !force {
// Find the file in the current version.
v := d.mu.versions.currentVersion()
overlaps := v.Overlaps(tombstoneLevel, d.opts.Comparer.Compare, start, end)
overlaps := v.Overlaps(tombstoneLevel, d.opts.Comparer.Compare, start,
end, true /* exclusiveEnd */)
iter := overlaps.Iter()
for m := iter.First(); m != nil; m = iter.Next() {
if m.FileNum.String() == parts[1] {
Expand Down
4 changes: 2 additions & 2 deletions db.go
Original file line number Diff line number Diff line change
Expand Up @@ -1126,7 +1126,7 @@ func (d *DB) Compact(
maxLevelWithFiles := 1
cur := d.mu.versions.currentVersion()
for level := 0; level < numLevels; level++ {
overlaps := cur.Overlaps(level, d.cmp, start, end)
overlaps := cur.Overlaps(level, d.cmp, start, end, iEnd.IsExclusiveSentinel())
if !overlaps.Empty() {
maxLevelWithFiles = level + 1
}
Expand Down Expand Up @@ -1400,7 +1400,7 @@ func (d *DB) EstimateDiskUsage(start, end []byte) (uint64, error) {
// We can only use `Overlaps` to restrict `files` at L1+ since at L0 it
// expands the range iteratively until it has found a set of files that
// do not overlap any other L0 files outside that set.
overlaps := readState.current.Overlaps(level, d.opts.Comparer.Compare, start, end)
overlaps := readState.current.Overlaps(level, d.opts.Comparer.Compare, start, end, false /* exclusiveEnd */)
iter = overlaps.Iter()
}
for file := iter.First(); file != nil; file = iter.Next() {
Expand Down
3 changes: 2 additions & 1 deletion ingest.go
Original file line number Diff line number Diff line change
Expand Up @@ -428,7 +428,8 @@ func ingestTargetLevel(
}

// Check boundary overlap.
boundaryOverlaps := v.Overlaps(level, cmp, meta.Smallest.UserKey, meta.Largest.UserKey)
boundaryOverlaps := v.Overlaps(level, cmp, meta.Smallest.UserKey,
meta.Largest.UserKey, meta.Largest.IsExclusiveSentinel())
if !boundaryOverlaps.Empty() {
continue
}
Expand Down
11 changes: 10 additions & 1 deletion internal/base/internal.go
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ func MakeInternalKey(userKey []byte, seqNum uint64, kind InternalKeyKind) Intern
}

// MakeSearchKey constructs an internal key that is appropriate for searching
// for a the specified user key. The search key contain the maximual sequence
// for a the specified user key. The search key contain the maximal sequence
// number and kind ensuring that it sorts before any other internal keys for
// the same user key.
func MakeSearchKey(userKey []byte) InternalKey {
Expand Down Expand Up @@ -342,6 +342,15 @@ func (k InternalKey) Pretty(f FormatKey) fmt.Formatter {
return prettyInternalKey{k, f}
}

// IsExclusiveSentinel reports whether this internal key, when used as an end
// boundary, excludes point keys that share its user key. A key qualifies when
// its trailer equals one of the sentinel trailers checked below. See the
// comment on InternalKeyRangeDeleteSentinel.
func (k InternalKey) IsExclusiveSentinel() bool {
	// TODO(jackson): This may need to change to include separate sentinels for
	// range key unsets and deletes.
	if k.Trailer == InternalKeyRangeDeleteSentinel {
		return true
	}
	return k.Trailer == InternalKeyBoundaryRangeKey
}

type prettyInternalKey struct {
InternalKey
formatKey FormatKey
Expand Down
6 changes: 3 additions & 3 deletions internal/manifest/l0_sublevels.go
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,7 @@ func NewL0Sublevels(
keys = append(keys, intervalKey{key: f.Smallest.UserKey})
keys = append(keys, intervalKey{
key: f.Largest.UserKey,
isLargest: f.Largest.Trailer != base.InternalKeyRangeDeleteSentinel,
isLargest: !f.Largest.IsExclusiveSentinel(),
})
}
keys = sortAndDedup(keys, cmp)
Expand Down Expand Up @@ -269,7 +269,7 @@ func NewL0Sublevels(
cmp,
intervalKey{
key: f.Largest.UserKey,
isLargest: f.Largest.Trailer != base.InternalKeyRangeDeleteSentinel},
isLargest: !f.Largest.IsExclusiveSentinel()},
keys[f.minIntervalIndex+index]) <= 0
})
if f.maxIntervalIndex == len(keys) {
Expand Down Expand Up @@ -381,7 +381,7 @@ func (s *L0Sublevels) InitCompactingFileInfo(inProgress []L0Compaction) {
// compacting.
for _, c := range inProgress {
startIK := intervalKey{key: c.Smallest.UserKey, isLargest: false}
endIK := intervalKey{key: c.Largest.UserKey, isLargest: c.Largest.Trailer != base.InternalKeyRangeDeleteSentinel}
endIK := intervalKey{key: c.Largest.UserKey, isLargest: !c.Largest.IsExclusiveSentinel()}
start := sort.Search(len(s.orderedIntervals), func(i int) bool {
return intervalKeyCompare(s.cmp, s.orderedIntervals[i].startKey, startIK) >= 0
})
Expand Down
2 changes: 1 addition & 1 deletion internal/manifest/l0_sublevels_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ func visualizeSublevels(
buf.WriteByte(middleChar)
lastChar++
}
if f.Largest.Trailer == base.InternalKeyRangeDeleteSentinel &&
if f.Largest.IsExclusiveSentinel() &&
j < len(files)-1 && files[j+1].Smallest.UserKey[0] == f.Largest.UserKey[0] {
// This case happens where two successive files have
// matching end/start user keys but where the left-side file
Expand Down
3 changes: 2 additions & 1 deletion internal/manifest/level_metadata.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,8 @@ func (lm *LevelMetadata) Slice() LevelSlice {
// key-sorted (eg, non-L0).
func (lm *LevelMetadata) Find(cmp base.Compare, m *FileMetadata) *LevelFile {
// TODO(jackson): Add an assertion that lm is key-sorted.
o := overlaps(lm.Iter(), cmp, m.Smallest.UserKey, m.Largest.UserKey)
o := overlaps(lm.Iter(), cmp, m.Smallest.UserKey,
m.Largest.UserKey, m.Largest.IsExclusiveSentinel())
iter := o.Iter()
for f := iter.First(); f != nil; f = iter.Next() {
if f == m {
Expand Down
40 changes: 28 additions & 12 deletions internal/manifest/version.go
Original file line number Diff line number Diff line change
Expand Up @@ -242,22 +242,37 @@ func SortBySmallest(files []*FileMetadata, cmp Compare) {
sort.Sort(bySmallest{files, cmp})
}

func overlaps(iter LevelIterator, cmp Compare, start, end []byte) LevelSlice {
func overlaps(iter LevelIterator, cmp Compare, start, end []byte, exclusiveEnd bool) LevelSlice {
startIter := iter.Clone()
startIter.SeekGE(cmp, start)

// SeekGE compares user keys. The user key `start` may be equal to the
// f.Largest because f.Largest is a range deletion sentinel, indicating that
// the user key `start` is NOT contained within the file f. If that's the
// case, we can narrow the overlapping bounds to exclude the file with the
// sentinel.
if f := startIter.Current(); f != nil && f.Largest.IsExclusiveSentinel() &&
cmp(f.Largest.UserKey, start) == 0 {
startIter.Next()
}

endIter := iter.Clone()
endIter.SeekGE(cmp, end)

// endIter is now pointing at the *first* file with a largest key >= end.
// If there are multiple files including the user key `end`, we want all
// of them, so move forward.
for endIter.Current() != nil && cmp(endIter.Current().Largest.UserKey, end) == 0 {
endIter.Next()
if !exclusiveEnd {
// endIter is now pointing at the *first* file with a largest key >= end.
// If there are multiple files including the user key `end`, we want all
// of them, so move forward.
for f := endIter.Current(); f != nil && cmp(f.Largest.UserKey, end) == 0; {
f = endIter.Next()
}
}

// LevelSlice uses inclusive bounds, so if we seeked to the end sentinel
// or nexted too far because Largest.UserKey equaled `end`, go back.
if !endIter.iter.valid() || cmp(endIter.Current().Smallest.UserKey, end) > 0 {
if !endIter.iter.valid() {
endIter.Prev()
} else if c := cmp(endIter.Current().Smallest.UserKey, end); c > 0 || c == 0 && exclusiveEnd {
endIter.Prev()
}

Expand Down Expand Up @@ -484,7 +499,8 @@ func (v *Version) InitL0Sublevels(
func (v *Version) Contains(level int, cmp Compare, m *FileMetadata) bool {
iter := v.Levels[level].Iter()
if level > 0 {
overlaps := v.Overlaps(level, cmp, m.Smallest.UserKey, m.Largest.UserKey)
overlaps := v.Overlaps(level, cmp, m.Smallest.UserKey, m.Largest.UserKey,
m.Largest.IsExclusiveSentinel())
iter = overlaps.Iter()
}
for f := iter.First(); f != nil; f = iter.Next() {
Expand All @@ -503,7 +519,7 @@ func (v *Version) Contains(level int, cmp Compare, m *FileMetadata) bool {
// and the computation is repeated until [start, end] stabilizes.
// The returned files are a subsequence of the input files, i.e., the ordering
// is not changed.
func (v *Version) Overlaps(level int, cmp Compare, start, end []byte) LevelSlice {
func (v *Version) Overlaps(level int, cmp Compare, start, end []byte, exclusiveEnd bool) LevelSlice {
if level == 0 {
// Indices that have been selected as overlapping.
l0 := v.Levels[level]
Expand All @@ -520,11 +536,11 @@ func (v *Version) Overlaps(level int, cmp Compare, start, end []byte) LevelSlice
}
smallest := meta.Smallest.UserKey
largest := meta.Largest.UserKey
if cmp(largest, start) < 0 {
if c := cmp(largest, start); c < 0 || c == 0 && meta.Largest.IsExclusiveSentinel() {
// meta is completely before the specified range; skip it.
continue
}
if cmp(smallest, end) > 0 {
if c := cmp(smallest, end); c > 0 || c == 0 && exclusiveEnd {
// meta is completely after the specified range; skip it.
continue
}
Expand Down Expand Up @@ -571,7 +587,7 @@ func (v *Version) Overlaps(level int, cmp Compare, start, end []byte) LevelSlice
return slice
}

return overlaps(v.Levels[level].Iter(), cmp, start, end)
return overlaps(v.Levels[level].Iter(), cmp, start, end, exclusiveEnd)
}

// CheckOrdering checks that the files are consistent with respect to
Expand Down
Loading

0 comments on commit d673586

Please sign in to comment.