-
Notifications
You must be signed in to change notification settings - Fork 466
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
db: add external sstable merging iterator
Add a pebble.NewExternalIter function that may be used to construct a *pebble.Iterator that reads from a provided slice of sstables rather than committed database state. Input sstables are required to contain all zero-sequence number keys. Shadowing of keys is resolved by treating the files as ordered in reverse chronological order. This iterator is intended to replace the storage package's multiIterator.
- Loading branch information
Showing
11 changed files
with
563 additions
and
58 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,171 @@ | ||
// Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use | ||
// of this source code is governed by a BSD-style license that can be found in | ||
// the LICENSE file. | ||
|
||
package pebble | ||
|
||
import ( | ||
"github.com/cockroachdb/pebble/internal/base" | ||
"github.com/cockroachdb/pebble/internal/keyspan" | ||
"github.com/cockroachdb/pebble/internal/manifest" | ||
"github.com/cockroachdb/pebble/internal/rangekey" | ||
"github.com/cockroachdb/pebble/sstable" | ||
) | ||
|
||
// NewExternalIter takes an input set of sstable files which may overlap | ||
// arbitrarily and returns an Iterator over the merged contents of the sstables. | ||
// Input sstables may contain point keys, range keys, range deletions, etc. The | ||
// input files slice must be sorted in reverse chronological ordering. A key in | ||
// a file at a lower index will shadow a key with an identical user key | ||
// contained within a file at a higher index. | ||
// | ||
// Input sstables must only contain keys with the zero sequence number. | ||
func NewExternalIter( | ||
o *Options, | ||
iterOpts *IterOptions, | ||
files []sstable.ReadableFile, | ||
extraReaderOpts ...sstable.ReaderOption, | ||
) (it *Iterator, err error) { | ||
var readers []*sstable.Reader | ||
|
||
// Ensure we close all the opened readers if we error out. | ||
closeReaders := func() { | ||
for i := range readers { | ||
_ = readers[i].Close() | ||
} | ||
} | ||
defer func() { | ||
if err != nil { | ||
closeReaders() | ||
} | ||
}() | ||
readers, err = openExternalTables(o, files, o.MakeReaderOptions(), extraReaderOpts...) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
buf := iterAllocPool.Get().(*iterAlloc) | ||
dbi := &buf.dbi | ||
*dbi = Iterator{ | ||
alloc: buf, | ||
cmp: o.Comparer.Compare, | ||
equal: o.equal(), | ||
iter: &buf.merging, | ||
merge: o.Merger.Merge, | ||
split: o.Comparer.Split, | ||
readState: nil, | ||
keyBuf: buf.keyBuf, | ||
prefixOrFullSeekKey: buf.prefixOrFullSeekKey, | ||
batch: nil, | ||
newIters: func(f *manifest.FileMetadata, opts *IterOptions, bytesIterated *uint64) (internalIterator, keyspan.FragmentIterator, error) { | ||
// NB: External iterators are currently constructed without any | ||
// `levelIters`. newIters should never be called. When we support | ||
// organizing multiple non-overlapping files into a single level | ||
// (see TODO below), we'll need to adjust this tableNewIters | ||
// implementation to open iterators by looking up f in a map | ||
// of readers indexed by *fileMetadata. | ||
panic("unreachable") | ||
}, | ||
seqNum: base.InternalKeySeqNumMax, | ||
} | ||
if iterOpts != nil { | ||
dbi.opts = *iterOpts | ||
} | ||
dbi.opts.logger = o.Logger | ||
|
||
// TODO(jackson): In some instances we could generate fewer levels by using | ||
// L0Sublevels code to organize nonoverlapping files into the same level. | ||
// This would allow us to use levelIters and keep a smaller set of data and | ||
// files in-memory. However, it would also require us to identify the bounds | ||
// of all the files upfront. | ||
|
||
// Ensure we close all iters if error out early. | ||
mlevels := buf.mlevels[:0] | ||
var rangeKeyIters []keyspan.FragmentIterator | ||
defer func() { | ||
if err != nil { | ||
for i := range rangeKeyIters { | ||
_ = rangeKeyIters[i].Close() | ||
} | ||
for i := range mlevels { | ||
if mlevels[i].iter != nil { | ||
_ = mlevels[i].iter.Close() | ||
} | ||
if mlevels[i].rangeDelIter != nil { | ||
_ = mlevels[i].rangeDelIter.Close() | ||
} | ||
} | ||
} | ||
}() | ||
if iterOpts.pointKeys() { | ||
if len(files) > cap(mlevels) { | ||
mlevels = make([]mergingIterLevel, 0, len(files)) | ||
} | ||
for _, r := range readers { | ||
pointIter, err := r.NewIter(dbi.opts.LowerBound, dbi.opts.UpperBound) | ||
if err != nil { | ||
return nil, err | ||
} | ||
rangeDelIter, err := r.NewRawRangeDelIter() | ||
if err != nil { | ||
_ = pointIter.Close() | ||
return nil, err | ||
} | ||
mlevels = append(mlevels, mergingIterLevel{ | ||
iter: pointIter, | ||
rangeDelIter: rangeDelIter, | ||
}) | ||
} | ||
} | ||
buf.merging.init(&dbi.opts, dbi.cmp, dbi.split, mlevels...) | ||
buf.merging.snapshot = base.InternalKeySeqNumMax | ||
buf.merging.elideRangeTombstones = true | ||
|
||
if dbi.opts.rangeKeys() { | ||
for _, r := range readers { | ||
rki, err := r.NewRawRangeKeyIter() | ||
if err != nil { | ||
return nil, err | ||
} | ||
if rki != nil { | ||
rangeKeyIters = append(rangeKeyIters, rki) | ||
} | ||
} | ||
|
||
// TODO(jackson): Pool range-key iterator objects. | ||
dbi.rangeKey = &iteratorRangeKeyState{} | ||
fragmentedIter := &rangekey.Iter{} | ||
fragmentedIter.Init(o.Comparer.Compare, o.Comparer.FormatKey, base.InternalKeySeqNumMax, rangeKeyIters...) | ||
iter := &rangekey.DefragmentingIter{} | ||
iter.Init(o.Comparer.Compare, fragmentedIter, rangekey.DefragmentLogical) | ||
dbi.rangeKey.rangeKeyIter = iter | ||
|
||
dbi.rangeKey.iter.Init(dbi.cmp, dbi.split, &buf.merging, iter, dbi.opts.RangeKeyMasking.Suffix) | ||
dbi.iter = &dbi.rangeKey.iter | ||
dbi.iter.SetBounds(dbi.opts.LowerBound, dbi.opts.UpperBound) | ||
} | ||
|
||
// Close all the opened sstable.Readers when the Iterator is closed. | ||
dbi.closeHook = closeReaders | ||
return dbi, nil | ||
} | ||
|
||
func openExternalTables( | ||
o *Options, | ||
files []sstable.ReadableFile, | ||
readerOpts sstable.ReaderOptions, | ||
extraReaderOpts ...sstable.ReaderOption, | ||
) (readers []*sstable.Reader, err error) { | ||
readers = make([]*sstable.Reader, 0, len(files)) | ||
for i := range files { | ||
r, err := sstable.NewReader(files[i], readerOpts, extraReaderOpts...) | ||
if err != nil { | ||
return readers, err | ||
} | ||
// Use the index of the file in files as the sequence number for all of | ||
// its keys. | ||
r.Properties.GlobalSeqNum = uint64(len(files) - i) | ||
readers = append(readers, r) | ||
} | ||
return readers, err | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
// Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use | ||
// of this source code is governed by a BSD-style license that can be found in | ||
// the LICENSE file. | ||
|
||
package pebble | ||
|
||
import ( | ||
"fmt" | ||
"testing" | ||
|
||
"github.com/cockroachdb/pebble/internal/datadriven" | ||
"github.com/cockroachdb/pebble/internal/testkeys" | ||
"github.com/cockroachdb/pebble/sstable" | ||
"github.com/cockroachdb/pebble/vfs" | ||
"github.com/stretchr/testify/require" | ||
) | ||
|
||
func TestExternalIterator(t *testing.T) { | ||
mem := vfs.NewMem() | ||
o := &Options{ | ||
FS: mem, | ||
Comparer: testkeys.Comparer, | ||
FormatMajorVersion: FormatRangeKeys, | ||
} | ||
o.EnsureDefaults() | ||
d, err := Open("", o) | ||
require.NoError(t, err) | ||
defer func() { require.NoError(t, d.Close()) }() | ||
|
||
datadriven.RunTest(t, "testdata/external_iterator", func(td *datadriven.TestData) string { | ||
switch td.Cmd { | ||
case "reset": | ||
mem = vfs.NewMem() | ||
return "" | ||
case "build": | ||
if err := runBuildCmd(td, d, mem); err != nil { | ||
return err.Error() | ||
} | ||
return "" | ||
case "iter": | ||
opts := IterOptions{KeyTypes: IterKeyTypePointsAndRanges} | ||
var files []sstable.ReadableFile | ||
for _, arg := range td.CmdArgs { | ||
switch arg.Key { | ||
case "mask-suffix": | ||
opts.RangeKeyMasking.Suffix = []byte(arg.Vals[0]) | ||
case "lower": | ||
opts.LowerBound = []byte(arg.Vals[0]) | ||
case "upper": | ||
opts.UpperBound = []byte(arg.Vals[0]) | ||
case "files": | ||
for _, v := range arg.Vals { | ||
f, err := mem.Open(v) | ||
require.NoError(t, err) | ||
files = append(files, f) | ||
} | ||
} | ||
} | ||
it, err := NewExternalIter(o, &opts, files) | ||
require.NoError(t, err) | ||
return runIterCmd(td, it, true /* close iter */) | ||
default: | ||
return fmt.Sprintf("unknown command: %s", td.Cmd) | ||
} | ||
}) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.