diff --git a/compaction_test.go b/compaction_test.go index a7486db82d..b860d188d5 100644 --- a/compaction_test.go +++ b/compaction_test.go @@ -910,7 +910,7 @@ func TestCompaction(t *testing.T) { return "", "", errors.WithStack(err) } defer r.Close() - iter, err := r.NewIter(nil /* lower */, nil /* upper */) + iter, err := r.NewIter(sstable.NoTransforms, nil /* lower */, nil /* upper */) if err != nil { return "", "", errors.WithStack(err) } diff --git a/external_iterator.go b/external_iterator.go index fbe87eae5e..d53497701c 100644 --- a/external_iterator.go +++ b/external_iterator.go @@ -183,6 +183,11 @@ func createExternalPointIter(ctx context.Context, it *Iterator) (internalIterato if len(it.externalReaders) > cap(mlevels) { mlevels = make([]mergingIterLevel, 0, len(it.externalReaders)) } + // We set a synthetic sequence number, with lower levels having higher numbers. + seqNum := 0 + for _, readers := range it.externalReaders { + seqNum += len(readers) + } for _, readers := range it.externalReaders { var combinedIters []internalIterator for _, r := range readers { // not have obsolete points (so the performance optimization is // unnecessary), and we don't want to bother constructing a // BlockPropertiesFilterer that includes obsoleteKeyBlockPropertyFilter. 
+ transforms := sstable.IterTransforms{SyntheticSeqNum: sstable.SyntheticSeqNum(seqNum)} + seqNum-- pointIter, err = r.NewIterWithBlockPropertyFiltersAndContextEtc( - ctx, it.opts.LowerBound, it.opts.UpperBound, nil, /* BlockPropertiesFilterer */ - false /* hideObsoletePoints */, false, /* useFilterBlock */ + ctx, transforms, it.opts.LowerBound, it.opts.UpperBound, nil, /* BlockPropertiesFilterer */ + false, /* useFilterBlock */ &it.stats.InternalStats, it.opts.CategoryAndQoS, nil, sstable.TrivialReaderProvider{Reader: r}) if err != nil { return nil, err } - rangeDelIter, err = r.NewRawRangeDelIter() + rangeDelIter, err = r.NewRawRangeDelIter(transforms) if err != nil { return nil, err } @@ -268,9 +275,16 @@ func finishInitializingExternal(ctx context.Context, it *Iterator) error { // TODO(bilal): Explore adding a simpleRangeKeyLevelIter that does not // operate on FileMetadatas (similar to simpleLevelIter), and implements // this optimization. + // We set a synthetic sequence number, with lower levels having higher numbers. + seqNum := 0 + for _, readers := range it.externalReaders { + seqNum += len(readers) + } for _, readers := range it.externalReaders { for _, r := range readers { - if rki, err := r.NewRawRangeKeyIter(); err != nil { + transforms := sstable.IterTransforms{SyntheticSeqNum: sstable.SyntheticSeqNum(seqNum)} + seqNum-- + if rki, err := r.NewRawRangeKeyIter(transforms); err != nil { return err } else if rki != nil { rangeKeyIters = append(rangeKeyIters, rki) @@ -322,9 +336,6 @@ func openExternalTables( if err != nil { return readers, err } - // Use the index of the file in files as the sequence number for all of - // its keys. 
- r.Properties.GlobalSeqNum = uint64(len(files) - i + seqNumOffset) readers = append(readers, r) } return readers, err diff --git a/external_iterator_test.go b/external_iterator_test.go index 6d2c87ee6f..2940e0b55b 100644 --- a/external_iterator_test.go +++ b/external_iterator_test.go @@ -115,7 +115,7 @@ func TestSimpleLevelIter(t *testing.T) { }() var internalIters []internalIterator for i := range readers { - iter, err := readers[i].NewIter(nil, nil) + iter, err := readers[i].NewIter(sstable.NoTransforms, nil, nil) require.NoError(t, err) internalIters = append(internalIters, iter) } @@ -241,7 +241,7 @@ func TestIterRandomizedMaybeFilteredKeys(t *testing.T) { var iter sstable.Iterator iter, err = r.NewIterWithBlockPropertyFilters( - nil, nil, filterer, false /* useFilterBlock */, nil, /* stats */ + sstable.NoTransforms, nil, nil, filterer, false /* useFilterBlock */, nil, /* stats */ sstable.CategoryAndQoS{}, nil, sstable.TrivialReaderProvider{Reader: r}) require.NoError(t, err) defer iter.Close() diff --git a/ingest.go b/ingest.go index fda76be9cf..ea18a05939 100644 --- a/ingest.go +++ b/ingest.go @@ -292,7 +292,7 @@ func ingestLoad1( maybeSetStatsFromProperties(meta.PhysicalMeta(), &r.Properties) { - iter, err := r.NewIter(nil /* lower */, nil /* upper */) + iter, err := r.NewIter(sstable.NoTransforms, nil /* lower */, nil /* upper */) if err != nil { return nil, err } @@ -318,7 +318,7 @@ func ingestLoad1( } } - iter, err := r.NewRawRangeDelIter() + iter, err := r.NewRawRangeDelIter(sstable.NoTransforms) if err != nil { return nil, err } @@ -348,7 +348,7 @@ func ingestLoad1( // Update the range-key bounds for the table. 
{ - iter, err := r.NewRawRangeKeyIter() + iter, err := r.NewRawRangeKeyIter(sstable.NoTransforms) if err != nil { return nil, err } diff --git a/internal/manifest/version.go b/internal/manifest/version.go index 3becb44346..4cccff82e7 100644 --- a/internal/manifest/version.go +++ b/internal/manifest/version.go @@ -283,6 +283,23 @@ func (m *FileMetadata) InternalKeyBounds() (InternalKey, InternalKey) { return m.Smallest, m.Largest } +// SyntheticSeqNum returns a SyntheticSeqNum which is set when SmallestSeqNum +// equals LargestSeqNum. +func (m *FileMetadata) SyntheticSeqNum() sstable.SyntheticSeqNum { + if m.SmallestSeqNum == m.LargestSeqNum { + return sstable.SyntheticSeqNum(m.SmallestSeqNum) + } + return sstable.NoSyntheticSeqNum +} + +// IterTransforms returns an sstable.IterTransforms that has SyntheticSeqNum set as needed. +func (m *FileMetadata) IterTransforms() sstable.IterTransforms { + return sstable.IterTransforms{ + SyntheticSeqNum: m.SyntheticSeqNum(), + SyntheticSuffix: m.SyntheticSuffix, + } +} + // PhysicalFileMeta is used by functions which want a guarantee that their input // belongs to a physical sst and not a virtual sst. 
// @@ -335,11 +352,10 @@ func (m VirtualFileMeta) VirtualReaderParams(isShared bool) sstable.VirtualReade Lower: m.Smallest, Upper: m.Largest, FileNum: m.FileNum, - IsShared: isShared, + IsSharedIngested: isShared && m.SyntheticSeqNum() != 0, Size: m.Size, BackingSize: m.FileBacking.Size, PrefixReplacement: m.PrefixReplacement, - SyntheticSuffix: m.SyntheticSuffix, } } diff --git a/level_checker_test.go b/level_checker_test.go index e73ba5ecf3..c063ccc6ad 100644 --- a/level_checker_test.go +++ b/level_checker_test.go @@ -97,11 +97,11 @@ func TestCheckLevelsCornerCases(t *testing.T) { newIters := func(_ context.Context, file *manifest.FileMetadata, _ *IterOptions, _ internalIterOpts, _ iterKinds) (iterSet, error) { r := readers[file.FileNum] - rangeDelIter, err := r.NewRawRangeDelIter() + rangeDelIter, err := r.NewRawRangeDelIter(sstable.NoTransforms) if err != nil { return iterSet{}, err } - iter, err := r.NewIter(nil /* lower */, nil /* upper */) + iter, err := r.NewIter(sstable.NoTransforms, nil /* lower */, nil /* upper */) if err != nil { return iterSet{}, err } diff --git a/level_iter_test.go b/level_iter_test.go index 038f5c88ea..01737f7ef7 100644 --- a/level_iter_test.go +++ b/level_iter_test.go @@ -165,13 +165,15 @@ func (lt *levelIterTest) newIters( kinds iterKinds, ) (iterSet, error) { lt.itersCreated++ + transforms := file.IterTransforms() iter, err := lt.readers[file.FileNum].NewIterWithBlockPropertyFiltersAndContextEtc( - ctx, opts.LowerBound, opts.UpperBound, nil, false, true, iio.stats, sstable.CategoryAndQoS{}, + ctx, transforms, + opts.LowerBound, opts.UpperBound, nil, true /* useFilterBlock */, iio.stats, sstable.CategoryAndQoS{}, nil, sstable.TrivialReaderProvider{Reader: lt.readers[file.FileNum]}) if err != nil { return iterSet{}, err } - rangeDelIter, err := lt.readers[file.FileNum].NewRawRangeDelIter() + rangeDelIter, err := lt.readers[file.FileNum].NewRawRangeDelIter(transforms) if err != nil { return iterSet{}, err } @@ -514,7 +516,7 @@ func 
buildLevelIterTables( meta := make([]*fileMetadata, len(readers)) for i := range readers { - iter, err := readers[i].NewIter(nil /* lower */, nil /* upper */) + iter, err := readers[i].NewIter(sstable.NoTransforms, nil /* lower */, nil /* upper */) require.NoError(b, err) smallest, _ := iter.First() meta[i] = &fileMetadata{} @@ -541,7 +543,7 @@ func BenchmarkLevelIterSeekGE(b *testing.B) { newIters := func( _ context.Context, file *manifest.FileMetadata, _ *IterOptions, _ internalIterOpts, _ iterKinds, ) (iterSet, error) { - iter, err := readers[file.FileNum].NewIter(nil /* lower */, nil /* upper */) + iter, err := readers[file.FileNum].NewIter(sstable.NoTransforms, nil /* lower */, nil /* upper */) return iterSet{point: iter}, err } l := newLevelIter(context.Background(), IterOptions{}, DefaultComparer, newIters, metas.Iter(), manifest.Level(level), internalIterOpts{}) @@ -583,6 +585,7 @@ func BenchmarkLevelIterSeqSeekGEWithBounds(b *testing.B) { _ context.Context, file *manifest.FileMetadata, opts *IterOptions, _ internalIterOpts, _ iterKinds, ) (iterSet, error) { iter, err := readers[file.FileNum].NewIter( + sstable.NoTransforms, opts.LowerBound, opts.UpperBound) return iterSet{point: iter}, err } @@ -625,6 +628,7 @@ func BenchmarkLevelIterSeqSeekPrefixGE(b *testing.B) { _ context.Context, file *manifest.FileMetadata, opts *IterOptions, _ internalIterOpts, _ iterKinds, ) (iterSet, error) { iter, err := readers[file.FileNum].NewIter( + sstable.NoTransforms, opts.LowerBound, opts.UpperBound) return iterSet{point: iter}, err } @@ -675,7 +679,7 @@ func BenchmarkLevelIterNext(b *testing.B) { newIters := func( _ context.Context, file *manifest.FileMetadata, _ *IterOptions, _ internalIterOpts, _ iterKinds, ) (iterSet, error) { - iter, err := readers[file.FileNum].NewIter(nil /* lower */, nil /* upper */) + iter, err := readers[file.FileNum].NewIter(sstable.NoTransforms, nil /* lower */, nil /* upper */) return iterSet{point: iter}, err } l := 
newLevelIter(context.Background(), IterOptions{}, testkeys.Comparer, newIters, metas.Iter(), manifest.Level(level), internalIterOpts{}) @@ -709,7 +713,7 @@ func BenchmarkLevelIterPrev(b *testing.B) { newIters := func( _ context.Context, file *manifest.FileMetadata, _ *IterOptions, _ internalIterOpts, _ iterKinds, ) (iterSet, error) { - iter, err := readers[file.FileNum].NewIter(nil /* lower */, nil /* upper */) + iter, err := readers[file.FileNum].NewIter(sstable.NoTransforms, nil /* lower */, nil /* upper */) return iterSet{point: iter}, err } l := newLevelIter(context.Background(), IterOptions{}, DefaultComparer, newIters, metas.Iter(), manifest.Level(level), internalIterOpts{}) diff --git a/merging_iter_test.go b/merging_iter_test.go index dd32c05412..c4c4656aa1 100644 --- a/merging_iter_test.go +++ b/merging_iter_test.go @@ -166,11 +166,12 @@ func TestMergingIterCornerCases(t *testing.T) { func(_ context.Context, file *manifest.FileMetadata, opts *IterOptions, iio internalIterOpts, kinds iterKinds, ) (iterSet, error) { r := readers[file.FileNum] - rangeDelIter, err := r.NewRawRangeDelIter() + rangeDelIter, err := r.NewRawRangeDelIter(sstable.NoTransforms) if err != nil { return iterSet{}, err } iter, err := r.NewIterWithBlockPropertyFilters( + sstable.NoTransforms, opts.GetLowerBound(), opts.GetUpperBound(), nil, true /* useFilterBlock */, iio.stats, sstable.CategoryAndQoS{}, nil, sstable.TrivialReaderProvider{Reader: r}) if err != nil { @@ -408,7 +409,7 @@ func BenchmarkMergingIterSeekGE(b *testing.B) { iters := make([]internalIterator, len(readers)) for i := range readers { var err error - iters[i], err = readers[i].NewIter(nil /* lower */, nil /* upper */) + iters[i], err = readers[i].NewIter(sstable.NoTransforms, nil /* lower */, nil /* upper */) require.NoError(b, err) } var stats base.InternalIteratorStats @@ -441,7 +442,7 @@ func BenchmarkMergingIterNext(b *testing.B) { iters := make([]internalIterator, len(readers)) for i := range readers { var err 
error - iters[i], err = readers[i].NewIter(nil /* lower */, nil /* upper */) + iters[i], err = readers[i].NewIter(sstable.NoTransforms, nil /* lower */, nil /* upper */) require.NoError(b, err) } var stats base.InternalIteratorStats @@ -477,7 +478,7 @@ func BenchmarkMergingIterPrev(b *testing.B) { iters := make([]internalIterator, len(readers)) for i := range readers { var err error - iters[i], err = readers[i].NewIter(nil /* lower */, nil /* upper */) + iters[i], err = readers[i].NewIter(sstable.NoTransforms, nil /* lower */, nil /* upper */) require.NoError(b, err) } var stats base.InternalIteratorStats @@ -633,7 +634,7 @@ func buildLevelsForMergingIterSeqSeek( for i := range readers { meta := make([]*fileMetadata, len(readers[i])) for j := range readers[i] { - iter, err := readers[i][j].NewIter(nil /* lower */, nil /* upper */) + iter, err := readers[i][j].NewIter(sstable.NoTransforms, nil /* lower */, nil /* upper */) require.NoError(b, err) smallest, _ := iter.First() meta[j] = &fileMetadata{} @@ -659,11 +660,11 @@ func buildMergingIter(readers [][]*sstable.Reader, levelSlices []manifest.LevelS _ context.Context, file *manifest.FileMetadata, opts *IterOptions, _ internalIterOpts, _ iterKinds, ) (iterSet, error) { iter, err := readers[levelIndex][file.FileNum].NewIter( - opts.LowerBound, opts.UpperBound) + sstable.NoTransforms, opts.LowerBound, opts.UpperBound) if err != nil { return iterSet{}, err } - rdIter, err := readers[levelIndex][file.FileNum].NewRawRangeDelIter() + rdIter, err := readers[levelIndex][file.FileNum].NewRawRangeDelIter(sstable.NoTransforms) if err != nil { iter.Close() return iterSet{}, err diff --git a/metamorphic/build.go b/metamorphic/build.go index dae743bf11..42ddc945b0 100644 --- a/metamorphic/build.go +++ b/metamorphic/build.go @@ -242,10 +242,10 @@ func openExternalObj( reader, err = sstable.NewReader(objstorageprovider.NewRemoteReadable(objReader, objSize), opts) panicIfErr(err) - pointIter, err = reader.NewIter(bounds.Start, 
bounds.End) + pointIter, err = reader.NewIter(sstable.NoTransforms, bounds.Start, bounds.End) panicIfErr(err) - rangeDelIter, err = reader.NewRawRangeDelIter() + rangeDelIter, err = reader.NewRawRangeDelIter(sstable.NoTransforms) panicIfErr(err) if rangeDelIter != nil { rangeDelIter = keyspan.Truncate( @@ -256,7 +256,7 @@ func openExternalObj( ) } - rangeKeyIter, err = reader.NewRawRangeKeyIter() + rangeKeyIter, err = reader.NewRawRangeKeyIter(sstable.NoTransforms) panicIfErr(err) if rangeKeyIter != nil { rangeKeyIter = keyspan.Truncate( diff --git a/replay/replay.go b/replay/replay.go index 7b6011b405..86d149115f 100644 --- a/replay/replay.go +++ b/replay/replay.go @@ -999,7 +999,7 @@ func loadFlushedSSTableKeys( defer r.Close() // Load all the point keys. - iter, err := r.NewIter(nil, nil) + iter, err := r.NewIter(sstable.NoTransforms, nil, nil) if err != nil { return err } @@ -1019,7 +1019,7 @@ func loadFlushedSSTableKeys( } // Load all the range tombstones. - if iter, err := r.NewRawRangeDelIter(); err != nil { + if iter, err := r.NewRawRangeDelIter(sstable.NoTransforms); err != nil { return err } else if iter != nil { defer iter.Close() @@ -1042,7 +1042,7 @@ func loadFlushedSSTableKeys( } // Load all the range keys. - if iter, err := r.NewRawRangeKeyIter(); err != nil { + if iter, err := r.NewRawRangeKeyIter(sstable.NoTransforms); err != nil { return err } else if iter != nil { defer iter.Close() diff --git a/sstable/block.go b/sstable/block.go index 0b360cd222..879e678df5 100644 --- a/sstable/block.go +++ b/sstable/block.go @@ -346,6 +346,42 @@ type blockEntry struct { type blockIter struct { cmp Compare split Split + + // Iterator transforms. + // + // SyntheticSuffix, if not nil, will replace the decoded ikey.UserKey suffix + // before the key is returned to the user. A sequence of iter operations on a + // block with a syntheticSuffix rule should return keys as if those operations + // ran on a block with keys that all had the syntheticSuffix. 
As an example: + // any sequence of block iter cmds should return the same keys for the + // following two blocks: + // + // blockA: a@3,b@3,c@3 + // blockB: a@1,b@2,c@1 with syntheticSuffix=3 + // + // To ensure this, Suffix replacement will not change the ordering of keys in + // the block because the iter assumes that no two keys in the block share the + // same prefix. Furthermore, during SeekGE and SeekLT operations, the block + // iterator handles "off by one" errors (explained in more detail in those + // functions) when, for a given key, originalSuffix < searchSuffix < + // replacementSuffix, with integer comparison. To handle these cases, the + // iterator assumes: + // + // pebble.Compare(keyPrefix{replacementSuffix},keyPrefix{originalSuffix}) < 0 + // for keys with a suffix. + // + // NB: it is possible for a block iter to add a synthetic suffix on a key + // without a suffix, which implies + // pebble.Compare(keyPrefix{replacementSuffix},keyPrefix{noSuffix}) > 0 , + // however, the iterator would never need to handle an off by one error in + // this case since originalSuffix (empty) > searchSuffix (non empty), with + // integer comparison. + // + // + // In addition, we also assume that any block with rangekeys will not contain + // a synthetic suffix. + transforms IterTransforms + // offset is the byte index that marks where the current key/value is // encoded in the block. offset int32 @@ -367,10 +403,9 @@ type blockIter struct { restarts int32 // Number of restart points in this block. Encoded at the end of the block // as a uint32. - numRestarts int32 - globalSeqNum uint64 - ptr unsafe.Pointer - data []byte + numRestarts int32 + ptr unsafe.Pointer + data []byte // key contains the raw key the iterator is currently pointed at. 
This may // point directly to data stored in the block (for a key which has no prefix // compression), to fullKey (for a prefix compressed key), or to a slice of @@ -419,81 +454,38 @@ type blockIter struct { vbr *valueBlockReader hasValuePrefix bool } - hideObsoletePoints bool - - // syntheticSuffix, if not nil, will replace the decoded ikey.UserKey suffix - // before the key is returned to the user. A sequence of iter operations on a - // block with a syntheticSuffix rule should return keys as if those operations - // ran on a block with keys that all had the syntheticSuffix. As an example: - // any sequence of block iter cmds should return the same keys for the - // following two blocks: - // - // blockA: a@3,b@3,c@3 - // blockB: a@1,b@2,c@1 with syntheticSuffix=3 - // - // To ensure this, Suffix replacement will not change the ordering of keys in - // the block because the iter assumes that no two keys in the block share the - // same prefix. Furthermore, during SeekGE and SeekLT operations, the block - // iterator handles "off by one" errors (explained in more detail in those - // functions) when, for a given key, originalSuffix < searchSuffix < - // replacementSuffix, with integer comparison. To handle these cases, the - // iterator assumes: - // - // pebble.Compare(keyPrefix{replacementSuffix},keyPrefix{originalSuffix}) < 0 - // for keys with a suffix. - // - // NB: it is possible for a block iter to add a synthetic suffix on a key - // without a suffix, which implies - // pebble.Compare(keyPrefix{replacementSuffix},keyPrefix{noSuffix}) > 0 , - // however, the iterator would never need to handle an off by one error in - // this case since originalSuffix (empty) > searchSuffix (non empty), with - // integer comparison. - // - // - // In addition, we also assume that any block with rangekeys will not contain - // a synthetic suffix. 
- syntheticSuffix SyntheticSuffix - synthSuffixBuf []byte + synthSuffixBuf []byte } // blockIter implements the base.InternalIterator interface. var _ base.InternalIterator = (*blockIter)(nil) func newBlockIter( - cmp Compare, split Split, block block, syntheticSuffix SyntheticSuffix, + cmp Compare, split Split, block block, transforms IterTransforms, ) (*blockIter, error) { i := &blockIter{} - return i, i.init(cmp, split, block, 0, false, syntheticSuffix) + return i, i.init(cmp, split, block, transforms) } func (i *blockIter) String() string { return "block" } -func (i *blockIter) init( - cmp Compare, - split Split, - block block, - globalSeqNum uint64, - hideObsoletePoints bool, - syntheticSuffix SyntheticSuffix, -) error { +func (i *blockIter) init(cmp Compare, split Split, block block, transforms IterTransforms) error { numRestarts := int32(binary.LittleEndian.Uint32(block[len(block)-4:])) if numRestarts == 0 { return base.CorruptionErrorf("pebble/table: invalid table (block has no restart points)") } - i.syntheticSuffix = syntheticSuffix + i.transforms = transforms i.synthSuffixBuf = i.synthSuffixBuf[:0] i.split = split i.cmp = cmp i.restarts = int32(len(block)) - 4*(1+numRestarts) i.numRestarts = numRestarts - i.globalSeqNum = globalSeqNum i.ptr = unsafe.Pointer(&block[0]) i.data = block i.fullKey = i.fullKey[:0] i.val = nil - i.hideObsoletePoints = hideObsoletePoints i.clearCache() if i.restarts > 0 { if err := i.readFirstKey(); err != nil { @@ -507,20 +499,15 @@ func (i *blockIter) init( } // NB: two cases of hideObsoletePoints: -// - Local sstable iteration: globalSeqNum will be set iff the sstable was +// - Local sstable iteration: syntheticSeqNum will be set iff the sstable was // ingested. -// - Foreign sstable iteration: globalSeqNum is always set. +// - Foreign sstable iteration: syntheticSeqNum is always set. 
func (i *blockIter) initHandle( - cmp Compare, - split Split, - block bufferHandle, - globalSeqNum uint64, - hideObsoletePoints bool, - syntheticSuffix SyntheticSuffix, + cmp Compare, split Split, block bufferHandle, transforms IterTransforms, ) error { i.handle.Release() i.handle = block - return i.init(cmp, split, block.Get(), globalSeqNum, hideObsoletePoints, syntheticSuffix) + return i.init(cmp, split, block.Get(), transforms) } func (i *blockIter) invalidate() { @@ -705,12 +692,12 @@ func (i *blockIter) decodeInternalKey(key []byte) (hiddenPoint bool) { // BlockIter benchmarks. if n := len(key) - 8; n >= 0 { trailer := binary.LittleEndian.Uint64(key[n:]) - hiddenPoint = i.hideObsoletePoints && + hiddenPoint = i.transforms.HideObsoletePoints && (trailer&trailerObsoleteBit != 0) i.ikey.Trailer = trailer & trailerObsoleteMask i.ikey.UserKey = key[:n:n] - if i.globalSeqNum != 0 { - i.ikey.SetSeqNum(i.globalSeqNum) + if n := i.transforms.SyntheticSeqNum; n != 0 { + i.ikey.SetSeqNum(uint64(n)) } } else { i.ikey.Trailer = uint64(InternalKeyKindInvalid) @@ -720,20 +707,20 @@ func (i *blockIter) decodeInternalKey(key []byte) (hiddenPoint bool) { } // maybeReplaceSuffix replaces the suffix in i.ikey.UserKey with -// i.syntheticSuffix. allowInPlace is set to false if there's a chance that -// i.ikey.UserKey points to the same buffer as i.cachedBuf (i.e. during reverse -// iteration). +// i.transforms.syntheticSuffix. allowInPlace is set to false if there's a chance +// that i.ikey.UserKey points to the same buffer as i.cachedBuf (i.e. during +// reverse iteration). func (i *blockIter) maybeReplaceSuffix(allowInPlace bool) { - if i.syntheticSuffix != nil && i.ikey.UserKey != nil { + if i.transforms.SyntheticSuffix != nil && i.ikey.UserKey != nil { prefixLen := i.split(i.ikey.UserKey) - if allowInPlace && cap(i.ikey.UserKey) >= prefixLen+len(i.syntheticSuffix) { - i.ikey.UserKey = append(i.ikey.UserKey[:prefixLen], i.syntheticSuffix...) 
+ if allowInPlace && cap(i.ikey.UserKey) >= prefixLen+len(i.transforms.SyntheticSuffix) { + i.ikey.UserKey = append(i.ikey.UserKey[:prefixLen], i.transforms.SyntheticSuffix...) return } // If ikey is cached or may get cached, we must copy // UserKey to a new buffer before prefix replacement. i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikey.UserKey[:prefixLen]...) - i.synthSuffixBuf = append(i.synthSuffixBuf, i.syntheticSuffix...) + i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix...) i.ikey.UserKey = i.synthSuffixBuf } } @@ -1014,7 +1001,7 @@ func (i *blockIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, ba } if index == 0 { - if i.syntheticSuffix != nil { + if i.transforms.SyntheticSuffix != nil { // The binary search was conducted on keys without suffix replacement, // implying the first key in the block may be less than the search key. To // double check, get the first key in the block with suffix replacement @@ -1055,7 +1042,7 @@ func (i *blockIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, ba if index < i.numRestarts { targetOffset = decodeRestart(i.data[i.restarts+4*(index):]) - if i.syntheticSuffix != nil { + if i.transforms.SyntheticSuffix != nil { // The binary search was conducted on keys without suffix replacement, // implying the returned restart point (index) may be less than the search // key, breaking the assumption described above. @@ -1277,24 +1264,24 @@ start: // Manually inlined version of i.decodeInternalKey(i.key). 
if n := len(i.key) - 8; n >= 0 { trailer := binary.LittleEndian.Uint64(i.key[n:]) - hiddenPoint := i.hideObsoletePoints && + hiddenPoint := i.transforms.HideObsoletePoints && (trailer&trailerObsoleteBit != 0) i.ikey.Trailer = trailer & trailerObsoleteMask i.ikey.UserKey = i.key[:n:n] - if i.globalSeqNum != 0 { - i.ikey.SetSeqNum(i.globalSeqNum) + if n := i.transforms.SyntheticSeqNum; n != 0 { + i.ikey.SetSeqNum(uint64(n)) } if hiddenPoint { goto start } - if i.syntheticSuffix != nil { + if i.transforms.SyntheticSuffix != nil { // Inlined version of i.maybeReplaceSuffix(true /* allowInPlace */) prefixLen := i.split(i.ikey.UserKey) - if cap(i.ikey.UserKey) >= prefixLen+len(i.syntheticSuffix) { - i.ikey.UserKey = append(i.ikey.UserKey[:prefixLen], i.syntheticSuffix...) + if cap(i.ikey.UserKey) >= prefixLen+len(i.transforms.SyntheticSuffix) { + i.ikey.UserKey = append(i.ikey.UserKey[:prefixLen], i.transforms.SyntheticSuffix...) } else { i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikey.UserKey[:prefixLen]...) - i.synthSuffixBuf = append(i.synthSuffixBuf, i.syntheticSuffix...) + i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix...) 
i.ikey.UserKey = i.synthSuffixBuf } } @@ -1558,21 +1545,21 @@ func (i *blockIter) nextPrefixV3(succKey []byte) (*InternalKey, base.LazyValue) hiddenPoint := false if n := len(i.key) - 8; n >= 0 { trailer := binary.LittleEndian.Uint64(i.key[n:]) - hiddenPoint = i.hideObsoletePoints && + hiddenPoint = i.transforms.HideObsoletePoints && (trailer&trailerObsoleteBit != 0) i.ikey.Trailer = trailer & trailerObsoleteMask i.ikey.UserKey = i.key[:n:n] - if i.globalSeqNum != 0 { - i.ikey.SetSeqNum(i.globalSeqNum) + if n := i.transforms.SyntheticSeqNum; n != 0 { + i.ikey.SetSeqNum(uint64(n)) } - if i.syntheticSuffix != nil { + if i.transforms.SyntheticSuffix != nil { // Inlined version of i.maybeReplaceSuffix(true /* allowInPlace */) prefixLen := i.split(i.ikey.UserKey) - if cap(i.ikey.UserKey) >= prefixLen+len(i.syntheticSuffix) { - i.ikey.UserKey = append(i.ikey.UserKey[:prefixLen], i.syntheticSuffix...) + if cap(i.ikey.UserKey) >= prefixLen+len(i.transforms.SyntheticSuffix) { + i.ikey.UserKey = append(i.ikey.UserKey[:prefixLen], i.transforms.SyntheticSuffix...) } else { i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikey.UserKey[:prefixLen]...) - i.synthSuffixBuf = append(i.synthSuffixBuf, i.syntheticSuffix...) + i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix...) 
i.ikey.UserKey = i.synthSuffixBuf } } @@ -1624,23 +1611,23 @@ start: i.key = i.cachedBuf[e.keyStart:e.keyEnd] if n := len(i.key) - 8; n >= 0 { trailer := binary.LittleEndian.Uint64(i.key[n:]) - hiddenPoint := i.hideObsoletePoints && + hiddenPoint := i.transforms.HideObsoletePoints && (trailer&trailerObsoleteBit != 0) if hiddenPoint { continue } i.ikey.Trailer = trailer & trailerObsoleteMask i.ikey.UserKey = i.key[:n:n] - if i.globalSeqNum != 0 { - i.ikey.SetSeqNum(i.globalSeqNum) + if n := i.transforms.SyntheticSeqNum; n != 0 { + i.ikey.SetSeqNum(uint64(n)) } - if i.syntheticSuffix != nil { + if i.transforms.SyntheticSuffix != nil { // Inlined version of i.maybeReplaceSuffix(false /* allowInPlace */) prefixLen := i.split(i.ikey.UserKey) // If ikey is cached or may get cached, we must de-reference // UserKey before prefix replacement. i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikey.UserKey[:prefixLen]...) - i.synthSuffixBuf = append(i.synthSuffixBuf, i.syntheticSuffix...) + i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix...) i.ikey.UserKey = i.synthSuffixBuf } } else { @@ -1720,13 +1707,13 @@ start: // Use the cache. goto start } - if i.syntheticSuffix != nil { + if i.transforms.SyntheticSuffix != nil { // Inlined version of i.maybeReplaceSuffix(false /* allowInPlace */) prefixLen := i.split(i.ikey.UserKey) // If ikey is cached or may get cached, we must de-reference // UserKey before prefix replacement. i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikey.UserKey[:prefixLen]...) - i.synthSuffixBuf = append(i.synthSuffixBuf, i.syntheticSuffix...) + i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix...) 
i.ikey.UserKey = i.synthSuffixBuf } if !i.lazyValueHandling.hasValuePrefix || diff --git a/sstable/block_property_test.go b/sstable/block_property_test.go index cf144a64fa..c7ecdcc8a1 100644 --- a/sstable/block_property_test.go +++ b/sstable/block_property_test.go @@ -952,7 +952,7 @@ func TestBlockProperties(t *testing.T) { var blocks []int var i int - iter, _ := newBlockIter(r.Compare, r.Split, indexH.Get(), nil /* syntheticSuffix */) + iter, _ := newBlockIter(r.Compare, r.Split, indexH.Get(), NoTransforms) for key, value := iter.First(); key != nil; key, value = iter.Next() { bh, err := decodeBlockHandleWithProperties(value.InPlaceValue()) if err != nil { @@ -1023,7 +1023,7 @@ func TestBlockProperties(t *testing.T) { return "filter excludes entire table" } iter, err := r.NewIterWithBlockPropertyFilters( - lower, upper, filterer, false /* use (bloom) filter */, &stats, + NoTransforms, lower, upper, filterer, false /* useFilterBlock */, &stats, CategoryAndQoS{}, nil, TrivialReaderProvider{Reader: r}) if err != nil { return err.Error() @@ -1108,7 +1108,7 @@ func TestBlockProperties_BoundLimited(t *testing.T) { return "filter excludes entire table" } iter, err := r.NewIterWithBlockPropertyFilters( - lower, upper, filterer, false /* use (bloom) filter */, &stats, + NoTransforms, lower, upper, filterer, false /* useFilterBlock */, &stats, CategoryAndQoS{}, nil, TrivialReaderProvider{Reader: r}) if err != nil { return err.Error() @@ -1319,7 +1319,7 @@ func runBlockPropsCmd(r *Reader, td *datadriven.TestData) string { return err.Error() } twoLevelIndex := r.Properties.IndexPartitions > 0 - i, err := newBlockIter(r.Compare, r.Split, bh.Get(), nil /* syntheticSuffix */) + i, err := newBlockIter(r.Compare, r.Split, bh.Get(), NoTransforms) if err != nil { return err.Error() } @@ -1366,7 +1366,7 @@ func runBlockPropsCmd(r *Reader, td *datadriven.TestData) string { if err != nil { return err.Error() } - if err := subiter.init(r.Compare, r.Split, subIndex.Get(), 0, false, nil); 
err != nil { + if err := subiter.init(r.Compare, r.Split, subIndex.Get(), NoTransforms); err != nil { return err.Error() } for key, value := subiter.First(); key != nil; key, value = subiter.Next() { diff --git a/sstable/block_test.go b/sstable/block_test.go index 7ced2526e4..31b784a49f 100644 --- a/sstable/block_test.go +++ b/sstable/block_test.go @@ -246,15 +246,16 @@ func TestBlockIter2(t *testing.T) { return "" case "iter": - iter, err := newBlockIter(bytes.Compare, nil, block, nil /* syntheticSuffix */) + globalSeqNum, err := scanGlobalSeqNum(d) + transforms := IterTransforms{SyntheticSeqNum: SyntheticSeqNum(globalSeqNum)} if err != nil { return err.Error() } - - iter.globalSeqNum, err = scanGlobalSeqNum(d) + iter, err := newBlockIter(bytes.Compare, nil, block, transforms) if err != nil { return err.Error() } + return itertest.RunInternalIterCmd(t, d, iter, itertest.Condensed) default: @@ -277,7 +278,7 @@ func TestBlockIterKeyStability(t *testing.T) { } block := w.finish() - i, err := newBlockIter(bytes.Compare, nil, block, nil /* syntheticSuffix */) + i, err := newBlockIter(bytes.Compare, nil, block, NoTransforms) require.NoError(t, err) // Check that the supplied slice resides within the bounds of the block. 
@@ -337,7 +338,7 @@ func TestBlockIterReverseDirections(t *testing.T) { for targetPos := 0; targetPos < w.restartInterval; targetPos++ { t.Run("", func(t *testing.T) { - i, err := newBlockIter(bytes.Compare, nil, block, nil /* syntheticSuffix */) + i, err := newBlockIter(bytes.Compare, nil, block, NoTransforms) require.NoError(t, err) pos := 3 @@ -417,10 +418,10 @@ func TestBlockSyntheticSuffix(t *testing.T) { suffixReplacedBlock := suffixWriter.finish() expectedBlock := expectedSuffixWriter.finish() - expect, err := newBlockIter(cmp, split, expectedBlock, nil /* syntheticSuffix */) + expect, err := newBlockIter(cmp, split, expectedBlock, NoTransforms) require.NoError(t, err) - got, err := newBlockIter(cmp, split, suffixReplacedBlock, synthSuffix) + got, err := newBlockIter(cmp, split, suffixReplacedBlock, IterTransforms{SyntheticSuffix: synthSuffix}) require.NoError(t, err) c := checker{t: t} @@ -547,7 +548,7 @@ func BenchmarkBlockIterSeekGE(b *testing.B) { syntheticSuffix = benchSynthSuffix } - it, err := newBlockIter(benchCmp, benchSplit, w.finish(), syntheticSuffix) + it, err := newBlockIter(benchCmp, benchSplit, w.finish(), IterTransforms{SyntheticSuffix: syntheticSuffix}) if err != nil { b.Fatal(err) } @@ -586,7 +587,7 @@ func BenchmarkBlockIterSeekLT(b *testing.B) { syntheticSuffix = benchSynthSuffix } - it, err := newBlockIter(benchCmp, benchSplit, w.finish(), syntheticSuffix) + it, err := newBlockIter(benchCmp, benchSplit, w.finish(), IterTransforms{SyntheticSuffix: syntheticSuffix}) if err != nil { b.Fatal(err) } @@ -632,7 +633,7 @@ func BenchmarkBlockIterNext(b *testing.B) { syntheticSuffix = benchSynthSuffix } - it, err := newBlockIter(benchCmp, benchSplit, w.finish(), syntheticSuffix) + it, err := newBlockIter(benchCmp, benchSplit, w.finish(), IterTransforms{SyntheticSuffix: syntheticSuffix}) if err != nil { b.Fatal(err) } @@ -666,7 +667,7 @@ func BenchmarkBlockIterPrev(b *testing.B) { syntheticSuffix = benchSynthSuffix } - it, err := 
newBlockIter(benchCmp, benchSplit, w.finish(), syntheticSuffix) + it, err := newBlockIter(benchCmp, benchSplit, w.finish(), IterTransforms{SyntheticSuffix: syntheticSuffix}) if err != nil { b.Fatal(err) } diff --git a/sstable/data_test.go b/sstable/data_test.go index 0c76344a22..c68e6d67d3 100644 --- a/sstable/data_test.go +++ b/sstable/data_test.go @@ -430,7 +430,7 @@ func runIterCmd( } fmt.Fprintf(&b, "| index.isDataInvalidated()=%t\n", si.index.isDataInvalidated()) fmt.Fprintf(&b, "| data.isDataInvalidated()=%t\n", si.data.isDataInvalidated()) - fmt.Fprintf(&b, "| hideObsoletePoints = %t\n", si.hideObsoletePoints) + fmt.Fprintf(&b, "| hideObsoletePoints = %t\n", si.transforms.HideObsoletePoints) fmt.Fprintf(&b, "| dataBH = (Offset: %d, Length: %d)\n", si.dataBH.Offset, si.dataBH.Length) fmt.Fprintf(&b, "| (boundsCmp,positionedUsingLatestBounds) = (%d,%t)\n", si.boundsCmp, si.positionedUsingLatestBounds) fmt.Fprintf(&b, "| exhaustedBounds = %d\n", si.exhaustedBounds) diff --git a/sstable/layout.go b/sstable/layout.go index 7c7a74dd66..63f2030f98 100644 --- a/sstable/layout.go +++ b/sstable/layout.go @@ -186,7 +186,7 @@ func (l *Layout) Describe( var lastKey InternalKey switch b.name { case "data", "range-del", "range-key": - iter, _ := newBlockIter(r.Compare, r.Split, h.Get(), nil /* syntheticSuffix */) + iter, _ := newBlockIter(r.Compare, r.Split, h.Get(), NoTransforms) for key, value := iter.First(); key != nil; key, value = iter.Next() { ptr := unsafe.Pointer(uintptr(iter.ptr) + uintptr(iter.offset)) shared, ptr := decodeVarint(ptr) @@ -238,7 +238,7 @@ func (l *Layout) Describe( formatRestarts(iter.data, iter.restarts, iter.numRestarts) formatTrailer() case "index", "top-index": - iter, _ := newBlockIter(r.Compare, r.Split, h.Get(), nil /* syntheticSuffix */) + iter, _ := newBlockIter(r.Compare, r.Split, h.Get(), NoTransforms) for key, value := iter.First(); key != nil; key, value = iter.Next() { bh, err := decodeBlockHandleWithProperties(value.InPlaceValue()) 
if err != nil { diff --git a/sstable/prefix_replacing_iterator_test.go b/sstable/prefix_replacing_iterator_test.go index 1008133989..e37f420a1b 100644 --- a/sstable/prefix_replacing_iterator_test.go +++ b/sstable/prefix_replacing_iterator_test.go @@ -26,7 +26,7 @@ func TestPrefixReplacingIterator(t *testing.T) { t.Run(fmt.Sprintf("%s_%s", tc.from, tc.to), func(t *testing.T) { r := buildTestTable(t, 20, 256, 256, DefaultCompression, tc.from) defer r.Close() - rawIter, err := r.NewIter(nil, nil) + rawIter, err := r.NewIter(NoTransforms, nil, nil) require.NoError(t, err) defer rawIter.Close() diff --git a/sstable/properties.go b/sstable/properties.go index 3bbf34a8af..2a3a92fb95 100644 --- a/sstable/properties.go +++ b/sstable/properties.go @@ -17,7 +17,6 @@ import ( ) const propertiesBlockRestartInterval = math.MaxInt32 -const propGlobalSeqnumName = "rocksdb.external_sst_file.global_seqno" var propTagMap = make(map[string]reflect.StructField) var propBoolTrue = []byte{'1'} @@ -143,9 +142,6 @@ type Properties struct { FilterPolicyName string `prop:"rocksdb.filter.policy"` // The size of filter block. FilterSize uint64 `prop:"rocksdb.filter.size"` - // The global sequence number to use for all entries in the table. Present if - // the table was created externally and ingested whole. - GlobalSeqNum uint64 `prop:"rocksdb.external_sst_file.global_seqno"` // Total number of index partitions if kTwoLevelIndexSearch is used. IndexPartitions uint64 `prop:"rocksdb.index.partitions"` // The size of index block. 
@@ -287,12 +283,7 @@ func (p *Properties) load( case reflect.Uint32: field.SetUint(uint64(binary.LittleEndian.Uint32(i.Value()))) case reflect.Uint64: - var n uint64 - if string(i.Key().UserKey) == propGlobalSeqnumName { - n = binary.LittleEndian.Uint64(i.Value()) - } else { - n, _ = binary.Uvarint(i.Value()) - } + n, _ := binary.Uvarint(i.Value()) field.SetUint(n) case reflect.String: field.SetString(intern.Bytes(i.Value())) @@ -333,6 +324,8 @@ func (p *Properties) saveUint64(m map[string][]byte, offset uintptr, value uint6 m[propOffsetTagMap[offset]] = buf[:] } +var _ = (*Properties).saveUint64 + func (p *Properties) saveUvarint(m map[string][]byte, offset uintptr, value uint64) { var buf [10]byte n := binary.PutUvarint(buf[:], value) @@ -361,7 +354,6 @@ func (p *Properties) save(tblFormat TableFormat, w *rawBlockWriter) { p.saveUvarint(m, unsafe.Offsetof(p.DataSize), p.DataSize) if p.ExternalFormatVersion != 0 { p.saveUint32(m, unsafe.Offsetof(p.ExternalFormatVersion), p.ExternalFormatVersion) - p.saveUint64(m, unsafe.Offsetof(p.GlobalSeqNum), p.GlobalSeqNum) } if p.FilterPolicyName != "" { p.saveString(m, unsafe.Offsetof(p.FilterPolicyName), p.FilterPolicyName) diff --git a/sstable/properties_test.go b/sstable/properties_test.go index 8794c5382b..7890125003 100644 --- a/sstable/properties_test.go +++ b/sstable/properties_test.go @@ -75,7 +75,6 @@ var testProps = Properties{ ExternalFormatVersion: 4, FilterPolicyName: "filter policy name", FilterSize: 5, - GlobalSeqNum: 8, IndexPartitions: 10, IndexSize: 11, IndexType: 12, diff --git a/sstable/random_test.go b/sstable/random_test.go index f45ba08695..1ad6d4f430 100644 --- a/sstable/random_test.go +++ b/sstable/random_test.go @@ -96,6 +96,7 @@ func runErrorInjectionTest(t *testing.T, seed int64) { // operations on the range deletion and range key iterators? 
var stats base.InternalIteratorStats it, err := r.NewIterWithBlockPropertyFilters( + NoTransforms, nil /* lower TODO */, nil, /* upper TODO */ filterer, rng.Intn(2) == 1, /* use filter block */ diff --git a/sstable/reader.go b/sstable/reader.go index 729b173ac3..8992d52673 100644 --- a/sstable/reader.go +++ b/sstable/reader.go @@ -165,6 +165,11 @@ func (c *cacheOpts) writerApply(w *Writer) { // is not empty). type SyntheticSuffix []byte +// IsSet returns true if the synthetic suffix is not empty. +func (ss SyntheticSuffix) IsSet() bool { + return len(ss) > 0 +} + // rawTombstonesOpt is a Reader open option for specifying that range // tombstones returned by Reader.NewRangeDelIter() should not be // fragmented. Used by debug tools to get a raw view of the tombstones @@ -184,32 +189,6 @@ func init() { private.SSTableRawTombstonesOpt = rawTombstonesOpt{} } -// CommonReader abstracts functionality over a Reader or a VirtualReader. This -// can be used by code which doesn't care to distinguish between a reader and a -// virtual reader. -type CommonReader interface { - NewRawRangeKeyIter() (keyspan.FragmentIterator, error) - NewRawRangeDelIter() (keyspan.FragmentIterator, error) - NewIterWithBlockPropertyFiltersAndContextEtc( - ctx context.Context, lower, upper []byte, - filterer *BlockPropertiesFilterer, - hideObsoletePoints, useFilterBlock bool, - stats *base.InternalIteratorStats, - categoryAndQoS CategoryAndQoS, - statsCollector *CategoryStatsCollector, - rp ReaderProvider, - ) (Iterator, error) - NewCompactionIter( - bytesIterated *uint64, - categoryAndQoS CategoryAndQoS, - statsCollector *CategoryStatsCollector, - rp ReaderProvider, - bufferPool *BufferPool, - ) (Iterator, error) - EstimateDiskUsage(start, end []byte) (uint64, error) - CommonProperties() *CommonProperties -} - // Reader is a table reader. 
type Reader struct { readable objstorage.Readable @@ -248,6 +227,8 @@ type Reader struct { metaBufferPoolAlloc [3]allocedBuffer } +var _ CommonReader = (*Reader)(nil) + // Close implements DB.Close, as documented in the pebble package. func (r *Reader) Close() error { r.opts.Cache.Unref() @@ -269,6 +250,7 @@ func (r *Reader) Close() error { // table. If an error occurs, NewIterWithBlockPropertyFilters cleans up after // itself and returns a nil iterator. func (r *Reader) NewIterWithBlockPropertyFilters( + transforms IterTransforms, lower, upper []byte, filterer *BlockPropertiesFilterer, useFilterBlock bool, @@ -278,30 +260,31 @@ func (r *Reader) NewIterWithBlockPropertyFilters( rp ReaderProvider, ) (Iterator, error) { return r.newIterWithBlockPropertyFiltersAndContext( - context.Background(), lower, upper, filterer, false, useFilterBlock, stats, - categoryAndQoS, statsCollector, rp, nil) + context.Background(), transforms, lower, upper, filterer, useFilterBlock, + stats, categoryAndQoS, statsCollector, rp, nil) } // NewIterWithBlockPropertyFiltersAndContextEtc is similar to // NewIterWithBlockPropertyFilters and additionally accepts a context for // tracing. // -// If hideObsoletePoints, the callee assumes that filterer already includes -// obsoleteKeyBlockPropertyFilter. The caller can satisfy this contract by -// first calling TryAddBlockPropertyFilterForHideObsoletePoints. +// If transform.HideObsoletePoints is set, the callee assumes that filterer +// already includes obsoleteKeyBlockPropertyFilter. The caller can satisfy this +// contract by first calling TryAddBlockPropertyFilterForHideObsoletePoints. 
func (r *Reader) NewIterWithBlockPropertyFiltersAndContextEtc( ctx context.Context, + transforms IterTransforms, lower, upper []byte, filterer *BlockPropertiesFilterer, - hideObsoletePoints, useFilterBlock bool, + useFilterBlock bool, stats *base.InternalIteratorStats, categoryAndQoS CategoryAndQoS, statsCollector *CategoryStatsCollector, rp ReaderProvider, ) (Iterator, error) { return r.newIterWithBlockPropertyFiltersAndContext( - ctx, lower, upper, filterer, hideObsoletePoints, useFilterBlock, stats, categoryAndQoS, - statsCollector, rp, nil) + ctx, transforms, lower, upper, filterer, useFilterBlock, + stats, categoryAndQoS, statsCollector, rp, nil) } // TryAddBlockPropertyFilterForHideObsoletePoints is expected to be called @@ -322,23 +305,23 @@ func (r *Reader) TryAddBlockPropertyFilterForHideObsoletePoints( func (r *Reader) newIterWithBlockPropertyFiltersAndContext( ctx context.Context, + transforms IterTransforms, lower, upper []byte, filterer *BlockPropertiesFilterer, - hideObsoletePoints bool, useFilterBlock bool, stats *base.InternalIteratorStats, categoryAndQoS CategoryAndQoS, statsCollector *CategoryStatsCollector, rp ReaderProvider, - v *virtualState, + vState *virtualState, ) (Iterator, error) { // NB: pebble.tableCache wraps the returned iterator with one which performs // reference counting on the Reader, preventing the Reader from being closed // until the final iterator closes. 
if r.Properties.IndexType == twoLevelIndex { i := twoLevelIterPool.Get().(*twoLevelIterator) - err := i.init(ctx, r, v, lower, upper, filterer, useFilterBlock, hideObsoletePoints, stats, - categoryAndQoS, statsCollector, rp, nil /* bufferPool */) + err := i.init(ctx, r, vState, transforms, lower, upper, filterer, useFilterBlock, + stats, categoryAndQoS, statsCollector, rp, nil /* bufferPool */) if err != nil { return nil, err } @@ -346,8 +329,8 @@ func (r *Reader) newIterWithBlockPropertyFiltersAndContext( } i := singleLevelIterPool.Get().(*singleLevelIterator) - err := i.init(ctx, r, v, lower, upper, filterer, useFilterBlock, hideObsoletePoints, stats, - categoryAndQoS, statsCollector, rp, nil /* bufferPool */) + err := i.init(ctx, r, vState, transforms, lower, upper, filterer, useFilterBlock, + stats, categoryAndQoS, statsCollector, rp, nil /* bufferPool */) if err != nil { return nil, err } @@ -358,40 +341,44 @@ func (r *Reader) newIterWithBlockPropertyFiltersAndContext( // occurs, NewIter cleans up after itself and returns a nil iterator. NewIter // must only be used when the Reader is guaranteed to outlive any LazyValues // returned from the iter. -func (r *Reader) NewIter(lower, upper []byte) (Iterator, error) { +func (r *Reader) NewIter(transforms IterTransforms, lower, upper []byte) (Iterator, error) { return r.NewIterWithBlockPropertyFilters( - lower, upper, nil, true /* useFilterBlock */, nil, /* stats */ - CategoryAndQoS{}, nil /*statsCollector */, TrivialReaderProvider{Reader: r}) + transforms, lower, upper, nil, true, /* useFilterBlock */ + nil /* stats */, CategoryAndQoS{}, nil /* statsCollector */, TrivialReaderProvider{Reader: r}) } // NewCompactionIter returns an iterator similar to NewIter but it also increments // the number of bytes iterated. If an error occurs, NewCompactionIter cleans up // after itself and returns a nil iterator. 
func (r *Reader) NewCompactionIter( + transforms IterTransforms, bytesIterated *uint64, categoryAndQoS CategoryAndQoS, statsCollector *CategoryStatsCollector, rp ReaderProvider, bufferPool *BufferPool, ) (Iterator, error) { - return r.newCompactionIter(bytesIterated, categoryAndQoS, statsCollector, rp, nil, bufferPool) + return r.newCompactionIter(transforms, bytesIterated, categoryAndQoS, statsCollector, rp, nil, bufferPool) } func (r *Reader) newCompactionIter( + transforms IterTransforms, bytesIterated *uint64, categoryAndQoS CategoryAndQoS, statsCollector *CategoryStatsCollector, rp ReaderProvider, - v *virtualState, + vState *virtualState, bufferPool *BufferPool, ) (Iterator, error) { + if vState != nil && vState.isSharedIngested { + transforms.HideObsoletePoints = true + } if r.Properties.IndexType == twoLevelIndex { i := twoLevelIterPool.Get().(*twoLevelIterator) err := i.init( context.Background(), - r, v, nil /* lower */, nil /* upper */, nil, - false /* useFilter */, v != nil && v.isSharedIngested, /* hideObsoletePoints */ - nil /* stats */, categoryAndQoS, statsCollector, rp, bufferPool, + r, vState, transforms, nil /* lower */, nil /* upper */, nil, + false /* useFilter */, nil /* stats */, categoryAndQoS, statsCollector, rp, bufferPool, ) if err != nil { return nil, err @@ -404,9 +391,8 @@ func (r *Reader) newCompactionIter( } i := singleLevelIterPool.Get().(*singleLevelIterator) err := i.init( - context.Background(), r, v, nil /* lower */, nil, /* upper */ - nil, false /* useFilter */, v != nil && v.isSharedIngested, /* hideObsoletePoints */ - nil /* stats */, categoryAndQoS, statsCollector, rp, bufferPool, + context.Background(), r, vState, transforms, nil /* lower */, nil, /* upper */ + nil, false /* useFilter */, nil /* stats */, categoryAndQoS, statsCollector, rp, bufferPool, ) if err != nil { return nil, err @@ -424,10 +410,13 @@ func (r *Reader) newCompactionIter( // // TODO(sumeer): plumb context.Context since this path is relevant in the 
user-facing // iterator. Add WithContext methods since the existing ones are public. -func (r *Reader) NewRawRangeDelIter() (keyspan.FragmentIterator, error) { +func (r *Reader) NewRawRangeDelIter(transforms IterTransforms) (keyspan.FragmentIterator, error) { if r.rangeDelBH.Length == 0 { return nil, nil } + if transforms.SyntheticSuffix.IsSet() { + return nil, errors.AssertionFailedf("synthetic suffix not supported with range del iterator") + } h, err := r.readRangeDel(nil /* stats */, nil /* iterStats */) if err != nil { return nil, err @@ -437,44 +426,37 @@ func (r *Reader) NewRawRangeDelIter() (keyspan.FragmentIterator, error) { // sstables. This is because rangedels do not apply to points in the same // sstable at the same sequence number anyway, so exposing obsolete rangedels // is harmless. - if err := i.blockIter.initHandle(r.Compare, r.Split, h, r.Properties.GlobalSeqNum, false, nil); err != nil { + if err := i.blockIter.initHandle(r.Compare, r.Split, h, transforms); err != nil { return nil, err } return i, nil } -func (r *Reader) newRawRangeKeyIter(vState *virtualState) (keyspan.FragmentIterator, error) { +// NewRawRangeKeyIter returns an internal iterator for the contents of the +// range-key block for the table. Returns nil if the table does not contain any +// range keys. +// +// TODO(sumeer): plumb context.Context since this path is relevant in the user-facing +// iterator. Add WithContext methods since the existing ones are public. 
+func (r *Reader) NewRawRangeKeyIter(transforms IterTransforms) (keyspan.FragmentIterator, error) { if r.rangeKeyBH.Length == 0 { return nil, nil } + if transforms.SyntheticSuffix.IsSet() { + return nil, errors.AssertionFailedf("synthetic suffix not supported with range key iterator") + } h, err := r.readRangeKey(nil /* stats */, nil /* iterStats */) if err != nil { return nil, err } i := rangeKeyFragmentBlockIterPool.Get().(*rangeKeyFragmentBlockIter) - var globalSeqNum uint64 - // Don't pass a global sequence number for shared ingested sstables. The - // virtual reader needs to know the materialized sequence numbers, and will - // do the appropriate sequence number substitution. - if vState == nil || !vState.isSharedIngested { - globalSeqNum = r.Properties.GlobalSeqNum - } - if err := i.blockIter.initHandle(r.Compare, r.Split, h, globalSeqNum, false, nil); err != nil { + + if err := i.blockIter.initHandle(r.Compare, r.Split, h, transforms); err != nil { return nil, err } return i, nil } -// NewRawRangeKeyIter returns an internal iterator for the contents of the -// range-key block for the table. Returns nil if the table does not contain any -// range keys. -// -// TODO(sumeer): plumb context.Context since this path is relevant in the user-facing -// iterator. Add WithContext methods since the existing ones are public. -func (r *Reader) NewRawRangeKeyIter() (keyspan.FragmentIterator, error) { - return r.newRawRangeKeyIter(nil /* vState */) -} - type rangeKeyFragmentBlockIter struct { fragmentBlockIter } @@ -714,7 +696,7 @@ func (r *Reader) transformRangeDelV1(b []byte) ([]byte, error) { // tombstones. We need properly fragmented and sorted range tombstones in // order to serve from them directly. 
iter := &blockIter{} - if err := iter.init(r.Compare, r.Split, b, r.Properties.GlobalSeqNum, false, nil); err != nil { + if err := iter.init(r.Compare, r.Split, b, NoTransforms); err != nil { return nil, err } var tombstones []keyspan.Span @@ -894,7 +876,7 @@ func (r *Reader) Layout() (*Layout, error) { if r.Properties.IndexPartitions == 0 { l.Index = append(l.Index, r.indexBH) - iter, _ := newBlockIter(r.Compare, r.Split, indexH.Get(), nil /* syntheticSuffix */) + iter, _ := newBlockIter(r.Compare, r.Split, indexH.Get(), NoTransforms) for key, value := iter.First(); key != nil; key, value = iter.Next() { dataBH, err := decodeBlockHandleWithProperties(value.InPlaceValue()) if err != nil { @@ -907,7 +889,7 @@ func (r *Reader) Layout() (*Layout, error) { } } else { l.TopIndex = r.indexBH - topIter, _ := newBlockIter(r.Compare, r.Split, indexH.Get(), nil /* syntheticSuffix */) + topIter, _ := newBlockIter(r.Compare, r.Split, indexH.Get(), NoTransforms) iter := &blockIter{} for key, value := topIter.First(); key != nil; key, value = topIter.Next() { indexBH, err := decodeBlockHandleWithProperties(value.InPlaceValue()) @@ -922,7 +904,7 @@ func (r *Reader) Layout() (*Layout, error) { return nil, err } // TODO(msbutler): figure out how to pass virtualState to layout call. - if err := iter.init(r.Compare, r.Split, subIndex.Get(), 0, false, nil); err != nil { + if err := iter.init(r.Compare, r.Split, subIndex.Get(), NoTransforms); err != nil { return nil, err } for key, value := iter.First(); key != nil; key, value = iter.Next() { @@ -1057,14 +1039,14 @@ func (r *Reader) EstimateDiskUsage(start, end []byte) (uint64, error) { // to the same blockIter over the single index in the unpartitioned case. 
var startIdxIter, endIdxIter *blockIter if r.Properties.IndexPartitions == 0 { - iter, err := newBlockIter(r.Compare, r.Split, indexH.Get(), nil /* syntheticSuffix */) + iter, err := newBlockIter(r.Compare, r.Split, indexH.Get(), NoTransforms) if err != nil { return 0, err } startIdxIter = iter endIdxIter = iter } else { - topIter, err := newBlockIter(r.Compare, r.Split, indexH.Get(), nil /* syntheticSuffix */) + topIter, err := newBlockIter(r.Compare, r.Split, indexH.Get(), NoTransforms) if err != nil { return 0, err } @@ -1084,7 +1066,7 @@ func (r *Reader) EstimateDiskUsage(start, end []byte) (uint64, error) { return 0, err } defer startIdxBlock.Release() - startIdxIter, err = newBlockIter(r.Compare, r.Split, startIdxBlock.Get(), nil /* syntheticSuffix */) + startIdxIter, err = newBlockIter(r.Compare, r.Split, startIdxBlock.Get(), NoTransforms) if err != nil { return 0, err } @@ -1105,7 +1087,7 @@ func (r *Reader) EstimateDiskUsage(start, end []byte) (uint64, error) { return 0, err } defer endIdxBlock.Release() - endIdxIter, err = newBlockIter(r.Compare, r.Split, endIdxBlock.Get(), nil /* syntheticSuffix */) + endIdxIter, err = newBlockIter(r.Compare, r.Split, endIdxBlock.Get(), NoTransforms) if err != nil { return 0, err } diff --git a/sstable/reader_common.go b/sstable/reader_common.go new file mode 100644 index 0000000000..573d08f6cc --- /dev/null +++ b/sstable/reader_common.go @@ -0,0 +1,100 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package sstable + +import ( + "bytes" + "context" + "fmt" + + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/keyspan" +) + +// CommonReader abstracts functionality over a Reader or a VirtualReader. This +// can be used by code which doesn't care to distinguish between a reader and a +// virtual reader. 
+type CommonReader interface { + NewRawRangeKeyIter(transforms IterTransforms) (keyspan.FragmentIterator, error) + + NewRawRangeDelIter(transforms IterTransforms) (keyspan.FragmentIterator, error) + + NewIterWithBlockPropertyFiltersAndContextEtc( + ctx context.Context, + transforms IterTransforms, + lower, upper []byte, + filterer *BlockPropertiesFilterer, + useFilterBlock bool, + stats *base.InternalIteratorStats, + categoryAndQoS CategoryAndQoS, + statsCollector *CategoryStatsCollector, + rp ReaderProvider, + ) (Iterator, error) + + NewCompactionIter( + transforms IterTransforms, + bytesIterated *uint64, + categoryAndQoS CategoryAndQoS, + statsCollector *CategoryStatsCollector, + rp ReaderProvider, + bufferPool *BufferPool, + ) (Iterator, error) + + EstimateDiskUsage(start, end []byte) (uint64, error) + + CommonProperties() *CommonProperties +} + +// IterTransforms allow on-the-fly transformation of data at iteration time. +// +// These transformations could in principle be implemented as block transforms +// (at least for non-virtual sstables), but applying them during iteration is +// preferable. +type IterTransforms struct { + SyntheticSeqNum SyntheticSeqNum + HideObsoletePoints bool + SyntheticSuffix SyntheticSuffix +} + +// NoTransforms is the default value for IterTransforms. +var NoTransforms = IterTransforms{} + +// SyntheticSeqNum is used to override all sequence numbers in a table. It is +// set to a non-zero value when the table was created externally and ingested +// whole. +type SyntheticSeqNum uint64 + +// NoSyntheticSeqNum is the default zero value for SyntheticSeqNum, which +// disables overriding the sequence number. +const NoSyntheticSeqNum SyntheticSeqNum = 0 + +// PrefixReplacement represents a read-time replacement of a key prefix. +type PrefixReplacement struct { + // ContentPrefix is the existing prefix that each key is expected to have. + ContentPrefix []byte + // SyntheticPrefix replaces the ContentPrefix in all keys. 
If ContentPrefix is + // empty, we are just prepending the synthetic prefix. + SyntheticPrefix []byte +} + +// ReplaceArg replaces the new prefix in the argument with the original prefix. +func (p *PrefixReplacement) ReplaceArg(src []byte) []byte { + return p.replace(src, p.SyntheticPrefix, p.ContentPrefix) +} + +// ReplaceResult replaces the original prefix in the result with the new prefix. +func (p *PrefixReplacement) ReplaceResult(key []byte) []byte { + return p.replace(key, p.ContentPrefix, p.SyntheticPrefix) +} + +func (p *PrefixReplacement) replace(key, from, to []byte) []byte { + if !bytes.HasPrefix(key, from) { + panic(fmt.Sprintf("unexpected prefix in replace: %s", key)) + } + result := make([]byte, 0, len(to)+(len(key)-len(from))) + result = append(result, to...) + result = append(result, key[len(from):]...) + return result +} diff --git a/sstable/reader_iter.go b/sstable/reader_iter.go index 7fe9c7604b..2b5a267a16 100644 --- a/sstable/reader_iter.go +++ b/sstable/reader_iter.go @@ -5,7 +5,6 @@ package sstable import ( - "bytes" "fmt" "os" "sync" @@ -35,35 +34,6 @@ type Iterator interface { SetCloseHook(fn func(i Iterator) error) } -// PrefixReplacement represents a read-time replacement of a key prefix. -type PrefixReplacement struct { - // ContentPrefix is the existing prefix that each key is expected to have. - ContentPrefix []byte - // SyntheticPrefix replaces the ContentPrefix in all keys. If ContentPrefix is - // empty, we are just prepending the synthetic prefix. - SyntheticPrefix []byte -} - -// ReplaceArg replaces the new prefix in the argument with the original prefix. -func (p *PrefixReplacement) ReplaceArg(src []byte) []byte { - return p.replace(src, p.SyntheticPrefix, p.ContentPrefix) -} - -// ReplaceResult replaces the original prefix in the result with the new prefix. 
-func (p *PrefixReplacement) ReplaceResult(key []byte) []byte { - return p.replace(key, p.ContentPrefix, p.SyntheticPrefix) -} - -func (p *PrefixReplacement) replace(key, from, to []byte) []byte { - if !bytes.HasPrefix(key, from) { - panic(fmt.Sprintf("unexpected prefix in replace: %s", key)) - } - result := make([]byte, 0, len(to)+(len(key)-len(from))) - result = append(result, to...) - result = append(result, key[len(from):]...) - return result -} - // Iterator positioning optimizations and singleLevelIterator and // twoLevelIterator: // diff --git a/sstable/reader_iter_single_lvl.go b/sstable/reader_iter_single_lvl.go index 2cec8be64f..9b7da90edf 100644 --- a/sstable/reader_iter_single_lvl.go +++ b/sstable/reader_iter_single_lvl.go @@ -164,7 +164,7 @@ type singleLevelIterator struct { useFilter bool lastBloomFilterMatched bool - hideObsoletePoints bool + transforms IterTransforms // inPool is set to true before putting the iterator in the reusable pool; // used to detect double-close. 
@@ -185,9 +185,10 @@ func (i *singleLevelIterator) init( ctx context.Context, r *Reader, v *virtualState, + transforms IterTransforms, lower, upper []byte, filterer *BlockPropertiesFilterer, - useFilter, hideObsoletePoints bool, + useFilter bool, stats *base.InternalIteratorStats, categoryAndQoS CategoryAndQoS, statsCollector *CategoryStatsCollector, @@ -216,9 +217,9 @@ func (i *singleLevelIterator) init( i.reader = r i.cmp = r.Compare i.stats = stats - i.hideObsoletePoints = hideObsoletePoints + i.transforms = transforms i.bufferPool = bufferPool - err = i.index.initHandle(i.cmp, r.Split, indexH, r.Properties.GlobalSeqNum, false, i.getSyntheticSuffx()) + err = i.index.initHandle(i.cmp, r.Split, indexH, transforms) if err != nil { // blockIter.Close releases indexH and always returns a nil error _ = i.index.Close() @@ -250,13 +251,6 @@ func (i *singleLevelIterator) init( return nil } -func (i *singleLevelIterator) getSyntheticSuffx() SyntheticSuffix { - if i.vState != nil { - return i.vState.syntheticSuffix - } - return nil -} - // Helper function to check if keys returned from iterator are within virtual bounds. func (i *singleLevelIterator) maybeVerifyKey( iKey *InternalKey, val base.LazyValue, @@ -458,7 +452,7 @@ func (i *singleLevelIterator) loadBlock(dir int8) loadBlockResult { i.err = err return loadBlockFailed } - i.err = i.data.initHandle(i.cmp, i.reader.Split, block, i.reader.Properties.GlobalSeqNum, i.hideObsoletePoints, i.getSyntheticSuffx()) + i.err = i.data.initHandle(i.cmp, i.reader.Split, block, i.transforms) if i.err != nil { // The block is partially loaded, and we don't want it to appear valid. 
i.data.invalidate() diff --git a/sstable/reader_iter_two_lvl.go b/sstable/reader_iter_two_lvl.go index a138bfb474..f59438ae41 100644 --- a/sstable/reader_iter_two_lvl.go +++ b/sstable/reader_iter_two_lvl.go @@ -68,7 +68,7 @@ func (i *twoLevelIterator) loadIndex(dir int8) loadBlockResult { i.err = err return loadBlockFailed } - if i.err = i.index.initHandle(i.cmp, i.reader.Split, indexBlock, i.reader.Properties.GlobalSeqNum, false, i.getSyntheticSuffx()); i.err == nil { + if i.err = i.index.initHandle(i.cmp, i.reader.Split, indexBlock, i.transforms); i.err == nil { return loadBlockOK } return loadBlockFailed @@ -143,9 +143,10 @@ func (i *twoLevelIterator) init( ctx context.Context, r *Reader, v *virtualState, + transforms IterTransforms, lower, upper []byte, filterer *BlockPropertiesFilterer, - useFilter, hideObsoletePoints bool, + useFilter bool, stats *base.InternalIteratorStats, categoryAndQoS CategoryAndQoS, statsCollector *CategoryStatsCollector, @@ -175,9 +176,9 @@ func (i *twoLevelIterator) init( i.reader = r i.cmp = r.Compare i.stats = stats - i.hideObsoletePoints = hideObsoletePoints + i.transforms = transforms i.bufferPool = bufferPool - err = i.topLevelIndex.initHandle(i.cmp, i.reader.Split, topLevelIndexH, r.Properties.GlobalSeqNum, false, i.getSyntheticSuffx()) + err = i.topLevelIndex.initHandle(i.cmp, i.reader.Split, topLevelIndexH, transforms) if err != nil { // blockIter.Close releases topLevelIndexH and always returns a nil error _ = i.topLevelIndex.Close() diff --git a/sstable/reader_test.go b/sstable/reader_test.go index 5caa3707d0..42afd4c47a 100644 --- a/sstable/reader_test.go +++ b/sstable/reader_test.go @@ -59,7 +59,7 @@ func (r *Reader) get(key []byte) (value []byte, err error) { } } - i, err := r.NewIter(nil /* lower */, nil /* upper */) + i, err := r.NewIter(NoTransforms, nil /* lower */, nil /* upper */) if err != nil { return nil, err } @@ -191,6 +191,7 @@ func TestVirtualReader(t *testing.T) { // Set during the latest virtualize command. 
var v *VirtualReader + var transforms IterTransforms defer func() { if r != nil { @@ -289,17 +290,18 @@ func TestVirtualReader(t *testing.T) { return "build must be called at least once before virtualize" } v = nil - var params VirtualReaderParams + var params VirtualReaderParams // Parse the virtualization bounds. bounds := strings.Split(td.CmdArgs[0].String(), "-") params.Lower = base.ParseInternalKey(bounds[0]) params.Upper = base.ParseInternalKey(bounds[1]) + transforms = IterTransforms{} if td.HasArg("suffix") { var synthSuffixStr string td.ScanArgs(t, "suffix", &synthSuffixStr) - params.SyntheticSuffix = []byte(synthSuffixStr) + transforms.SyntheticSuffix = []byte(synthSuffixStr) } params.FileNum = nextFileNum() @@ -323,7 +325,7 @@ func TestVirtualReader(t *testing.T) { var rp ReaderProvider var bytesIterated uint64 - iter, err := v.NewCompactionIter(&bytesIterated, CategoryAndQoS{}, nil, rp, &bp) + iter, err := v.NewCompactionIter(transforms, &bytesIterated, CategoryAndQoS{}, nil, rp, &bp) if err != nil { return err.Error() } @@ -362,7 +364,7 @@ func TestVirtualReader(t *testing.T) { if v == nil { return "virtualize must be called before scan-range-del" } - iter, err := v.NewRawRangeDelIter() + iter, err := v.NewRawRangeDelIter(transforms) if err != nil { return err.Error() } @@ -385,7 +387,7 @@ func TestVirtualReader(t *testing.T) { if v == nil { return "virtualize must be called before scan-range-key" } - iter, err := v.NewRawRangeKeyIter() + iter, err := v.NewRawRangeKeyIter(transforms) if err != nil { return err.Error() } @@ -416,7 +418,7 @@ func TestVirtualReader(t *testing.T) { var stats base.InternalIteratorStats iter, err := v.NewIterWithBlockPropertyFiltersAndContextEtc( - context.Background(), lower, upper, nil, false, false, + context.Background(), transforms, lower, upper, nil, false, &stats, CategoryAndQoS{}, nil, TrivialReaderProvider{Reader: r}) if err != nil { return err.Error() @@ -615,7 +617,7 @@ func TestInjectedErrors(t *testing.T) { 
return err } - iter, err := r.NewIter(nil, nil) + iter, err := r.NewIter(NoTransforms, nil, nil) if err != nil { return err } @@ -680,7 +682,7 @@ func indexLayoutString(t *testing.T, r *Reader) string { var buf strings.Builder twoLevelIndex := r.Properties.IndexType == twoLevelIndex buf.WriteString("index entries:\n") - iter, err := newBlockIter(r.Compare, r.Split, indexH.Get(), nil /* syntheticSuffix */) + iter, err := newBlockIter(r.Compare, r.Split, indexH.Get(), NoTransforms) defer func() { require.NoError(t, iter.Close()) }() @@ -694,7 +696,7 @@ func indexLayoutString(t *testing.T, r *Reader) string { context.Background(), bh.BlockHandle, nil, nil, nil, nil, nil) require.NoError(t, err) defer b.Release() - iter2, err := newBlockIter(r.Compare, r.Split, b.Get(), nil /* syntheticSuffix */) + iter2, err := newBlockIter(r.Compare, r.Split, b.Get(), NoTransforms) defer func() { require.NoError(t, iter2.Close()) }() @@ -748,7 +750,9 @@ func runTestReader(t *testing.T, o WriterOptions, dir string, r *Reader, printVa return err.Error() } var stats base.InternalIteratorStats - r.Properties.GlobalSeqNum = seqNum + transforms := IterTransforms{ + SyntheticSeqNum: SyntheticSeqNum(seqNum), + } var bpfs []BlockPropertyFilter if d.HasArg("block-property-filter") { var filterMin, filterMax uint64 @@ -756,13 +760,13 @@ func runTestReader(t *testing.T, o WriterOptions, dir string, r *Reader, printVa bpf := NewTestKeysBlockPropertyFilter(filterMin, filterMax) bpfs = append(bpfs, bpf) } - hideObsoletePoints := false if d.HasArg("hide-obsolete-points") { - d.ScanArgs(t, "hide-obsolete-points", &hideObsoletePoints) - if hideObsoletePoints { - hideObsoletePoints, bpfs = r.TryAddBlockPropertyFilterForHideObsoletePoints( + d.ScanArgs(t, "hide-obsolete-points", &transforms.HideObsoletePoints) + if transforms.HideObsoletePoints { + var retHideObsoletePoints bool + retHideObsoletePoints, bpfs = r.TryAddBlockPropertyFilterForHideObsoletePoints( InternalKeySeqNumMax, 
InternalKeySeqNumMax-1, bpfs) - require.True(t, hideObsoletePoints) + require.True(t, retHideObsoletePoints) } } var filterer *BlockPropertiesFilterer @@ -779,10 +783,10 @@ func runTestReader(t *testing.T, o WriterOptions, dir string, r *Reader, printVa } iter, err := r.NewIterWithBlockPropertyFiltersAndContextEtc( context.Background(), + transforms, nil, /* lower */ nil, /* upper */ filterer, - hideObsoletePoints, true, /* use filter block */ &stats, CategoryAndQoS{}, @@ -926,7 +930,7 @@ func testBytesIteratedWithCompression( var pool BufferPool pool.Init(5) citer, err := r.NewCompactionIter( - &bytesIterated, CategoryAndQoS{}, nil, TrivialReaderProvider{Reader: r}, &pool) + NoTransforms, &bytesIterated, CategoryAndQoS{}, nil, TrivialReaderProvider{Reader: r}, &pool) require.NoError(t, err) for key, _ := citer.First(); key != nil; key, _ = citer.Next() { @@ -984,7 +988,7 @@ func TestCompactionIteratorSetupForCompaction(t *testing.T) { var pool BufferPool pool.Init(5) citer, err := r.NewCompactionIter( - &bytesIterated, CategoryAndQoS{}, nil, TrivialReaderProvider{Reader: r}, &pool) + NoTransforms, &bytesIterated, CategoryAndQoS{}, nil, TrivialReaderProvider{Reader: r}, &pool) require.NoError(t, err) switch i := citer.(type) { case *compactionIterator: @@ -1040,7 +1044,7 @@ func TestReadaheadSetupForV3TablesWithMultipleVersions(t *testing.T) { var pool BufferPool pool.Init(5) citer, err := r.NewCompactionIter( - nil, CategoryAndQoS{}, nil, TrivialReaderProvider{Reader: r}, &pool) + NoTransforms, nil, CategoryAndQoS{}, nil, TrivialReaderProvider{Reader: r}, &pool) require.NoError(t, err) defer citer.Close() i := citer.(*compactionIterator) @@ -1048,7 +1052,7 @@ func TestReadaheadSetupForV3TablesWithMultipleVersions(t *testing.T) { require.True(t, objstorageprovider.TestingCheckMaxReadahead(i.vbRH)) } { - iter, err := r.NewIter(nil, nil) + iter, err := r.NewIter(NoTransforms, nil, nil) require.NoError(t, err) defer iter.Close() i := iter.(*singleLevelIterator) @@ 
-1220,12 +1224,12 @@ func TestRandomizedSuffixRewriter(t *testing.T) { require.NoError(t, err) iter, err := eReader.newIterWithBlockPropertyFiltersAndContext( context.Background(), - nil, nil, nil, false, + IterTransforms{SyntheticSuffix: syntheticSuffix}, + nil, nil, nil, true, nil, CategoryAndQoS{}, nil, TrivialReaderProvider{Reader: eReader}, &virtualState{ - lower: base.MakeInternalKey([]byte("a"), base.InternalKeySeqNumMax, base.InternalKeyKindSet), - upper: base.MakeRangeDeleteSentinelKey([]byte("zzzzzzzzzzzzzzzzzzz")), - syntheticSuffix: syntheticSuffix, + lower: base.MakeInternalKey([]byte("a"), base.InternalKeySeqNumMax, base.InternalKeyKindSet), + upper: base.MakeRangeDeleteSentinelKey([]byte("zzzzzzzzzzzzzzzzzzz")), }) require.NoError(t, err) return iter, func() { @@ -1374,14 +1378,14 @@ func TestReaderChecksumErrors(t *testing.T) { r, err := newReader(corrupted, ReaderOptions{}) require.NoError(t, err) - iter, err := r.NewIter(nil, nil) + iter, err := r.NewIter(NoTransforms, nil, nil) require.NoError(t, err) for k, _ := iter.First(); k != nil; k, _ = iter.Next() { } require.Regexp(t, `checksum mismatch`, iter.Error()) require.Regexp(t, `checksum mismatch`, iter.Close()) - iter, err = r.NewIter(nil, nil) + iter, err = r.NewIter(NoTransforms, nil, nil) require.NoError(t, err) for k, _ := iter.Last(); k != nil; k, _ = iter.Prev() { } @@ -1748,7 +1752,7 @@ func BenchmarkTableIterSeekGE(b *testing.B) { b.Run(bm.name, func(b *testing.B) { r, keys := buildBenchmarkTable(b, bm.options, false, 0) - it, err := r.NewIter(nil /* lower */, nil /* upper */) + it, err := r.NewIter(NoTransforms, nil /* lower */, nil /* upper */) require.NoError(b, err) rng := rand.New(rand.NewSource(uint64(time.Now().UnixNano()))) @@ -1769,7 +1773,7 @@ func BenchmarkTableIterSeekLT(b *testing.B) { b.Run(bm.name, func(b *testing.B) { r, keys := buildBenchmarkTable(b, bm.options, false, 0) - it, err := r.NewIter(nil /* lower */, nil /* upper */) + it, err := r.NewIter(NoTransforms, nil 
/* lower */, nil /* upper */) require.NoError(b, err) rng := rand.New(rand.NewSource(uint64(time.Now().UnixNano()))) @@ -1790,7 +1794,7 @@ func BenchmarkTableIterNext(b *testing.B) { b.Run(bm.name, func(b *testing.B) { r, _ := buildBenchmarkTable(b, bm.options, false, 0) - it, err := r.NewIter(nil /* lower */, nil /* upper */) + it, err := r.NewIter(NoTransforms, nil /* lower */, nil /* upper */) require.NoError(b, err) b.ResetTimer() @@ -1819,7 +1823,7 @@ func BenchmarkTableIterPrev(b *testing.B) { b.Run(bm.name, func(b *testing.B) { r, _ := buildBenchmarkTable(b, bm.options, false, 0) - it, err := r.NewIter(nil /* lower */, nil /* upper */) + it, err := r.NewIter(NoTransforms, nil /* lower */, nil /* upper */) require.NoError(b, err) b.ResetTimer() @@ -1899,7 +1903,7 @@ func BenchmarkSeqSeekGEExhausted(b *testing.B) { } else { seekKeys = postKeys } - it, err := reader.NewIter(nil /* lower */, upper) + it, err := reader.NewIter(NoTransforms, nil /* lower */, upper) require.NoError(b, err) b.ResetTimer() pos := 0 @@ -2006,7 +2010,7 @@ func BenchmarkIteratorScanManyVersions(b *testing.B) { }() for _, readValue := range []bool{false, true} { b.Run(fmt.Sprintf("read-value=%t", readValue), func(b *testing.B) { - iter, err := r.NewIter(nil, nil) + iter, err := r.NewIter(NoTransforms, nil, nil) require.NoError(b, err) var k *InternalKey var v base.LazyValue @@ -2150,7 +2154,7 @@ func BenchmarkIteratorScanNextPrefix(b *testing.B) { b.Run(fmt.Sprintf("method=%s", method), func(b *testing.B) { for _, readValue := range []bool{false, true} { b.Run(fmt.Sprintf("read-value=%t", readValue), func(b *testing.B) { - iter, err := r.NewIter(nil, nil) + iter, err := r.NewIter(NoTransforms, nil, nil) require.NoError(b, err) var nextFunc func(index int) (*InternalKey, base.LazyValue) switch method { @@ -2281,8 +2285,9 @@ func BenchmarkIteratorScanObsolete(b *testing.B) { b.Fatalf("sstable does not intersect") } } + transforms := IterTransforms{HideObsoletePoints: hideObsoletePoints} 
iter, err := r.NewIterWithBlockPropertyFiltersAndContextEtc( - context.Background(), nil, nil, filterer, hideObsoletePoints, + context.Background(), transforms, nil, nil, filterer, true, nil, CategoryAndQoS{}, nil, TrivialReaderProvider{Reader: r}) require.NoError(b, err) diff --git a/sstable/reader_virtual.go b/sstable/reader_virtual.go index fdacde0bcf..88badcfbc8 100644 --- a/sstable/reader_virtual.go +++ b/sstable/reader_virtual.go @@ -25,6 +25,8 @@ type VirtualReader struct { Properties CommonProperties } +var _ CommonReader = (*VirtualReader)(nil) + // Lightweight virtual sstable state which can be passed to sstable iterators. type virtualState struct { lower InternalKey @@ -33,23 +35,21 @@ type virtualState struct { Compare Compare isSharedIngested bool prefixChange *PrefixReplacement - syntheticSuffix SyntheticSuffix } // VirtualReaderParams are the parameters necessary to create a VirtualReader. type VirtualReaderParams struct { - Lower InternalKey - Upper InternalKey - FileNum base.FileNum - IsShared bool + Lower InternalKey + Upper InternalKey + FileNum base.FileNum + IsSharedIngested bool // Size is an estimate of the size of the [Lower, Upper) section of the table. Size uint64 // BackingSize is the total size of the backing table. The ratio between Size // and BackingSize is used to estimate statistics. BackingSize uint64 - // TODO(radu): this should be passed just to iterators. + // TODO(radu): these should be moved to sstable.IterTransforms. 
PrefixReplacement *PrefixReplacement - SyntheticSuffix SyntheticSuffix } // MakeVirtualReader is used to contruct a reader which can read from virtual @@ -60,9 +60,8 @@ func MakeVirtualReader(reader *Reader, p VirtualReaderParams) VirtualReader { upper: p.Upper, fileNum: p.FileNum, Compare: reader.Compare, - isSharedIngested: p.IsShared && reader.Properties.GlobalSeqNum != 0, + isSharedIngested: p.IsSharedIngested, prefixChange: p.PrefixReplacement, - syntheticSuffix: p.SyntheticSuffix, } v := VirtualReader{ vState: vState, @@ -95,6 +94,7 @@ func MakeVirtualReader(reader *Reader, p VirtualReaderParams) VirtualReader { // NewCompactionIter is the compaction iterator function for virtual readers. func (v *VirtualReader) NewCompactionIter( + transforms IterTransforms, bytesIterated *uint64, categoryAndQoS CategoryAndQoS, statsCollector *CategoryStatsCollector, @@ -102,7 +102,7 @@ func (v *VirtualReader) NewCompactionIter( bufferPool *BufferPool, ) (Iterator, error) { i, err := v.reader.newCompactionIter( - bytesIterated, categoryAndQoS, statsCollector, rp, &v.vState, bufferPool) + transforms, bytesIterated, categoryAndQoS, statsCollector, rp, &v.vState, bufferPool) if err == nil && v.vState.prefixChange != nil { i = newPrefixReplacingIterator( i, v.vState.prefixChange.ContentPrefix, v.vState.prefixChange.SyntheticPrefix, @@ -118,17 +118,18 @@ func (v *VirtualReader) NewCompactionIter( // sstable bounds. No overlap is not currently supported in the iterator. 
func (v *VirtualReader) NewIterWithBlockPropertyFiltersAndContextEtc( ctx context.Context, + transforms IterTransforms, lower, upper []byte, filterer *BlockPropertiesFilterer, - hideObsoletePoints, useFilterBlock bool, + useFilterBlock bool, stats *base.InternalIteratorStats, categoryAndQoS CategoryAndQoS, statsCollector *CategoryStatsCollector, rp ReaderProvider, ) (Iterator, error) { i, err := v.reader.newIterWithBlockPropertyFiltersAndContext( - ctx, lower, upper, filterer, hideObsoletePoints, useFilterBlock, stats, - categoryAndQoS, statsCollector, rp, &v.vState) + ctx, transforms, lower, upper, filterer, useFilterBlock, + stats, categoryAndQoS, statsCollector, rp, &v.vState) if err == nil && v.vState.prefixChange != nil { i = newPrefixReplacingIterator( i, v.vState.prefixChange.ContentPrefix, v.vState.prefixChange.SyntheticPrefix, @@ -145,8 +146,10 @@ func (v *VirtualReader) ValidateBlockChecksumsOnBacking() error { } // NewRawRangeDelIter wraps Reader.NewRawRangeDelIter. -func (v *VirtualReader) NewRawRangeDelIter() (keyspan.FragmentIterator, error) { - iter, err := v.reader.NewRawRangeDelIter() +func (v *VirtualReader) NewRawRangeDelIter( + transforms IterTransforms, +) (keyspan.FragmentIterator, error) { + iter, err := v.reader.NewRawRangeDelIter(transforms) if err != nil { return nil, err } @@ -187,8 +190,17 @@ func (v *VirtualReader) NewRawRangeDelIter() (keyspan.FragmentIterator, error) { } // NewRawRangeKeyIter wraps Reader.NewRawRangeKeyIter. -func (v *VirtualReader) NewRawRangeKeyIter() (keyspan.FragmentIterator, error) { - iter, err := v.reader.newRawRangeKeyIter(&v.vState) +func (v *VirtualReader) NewRawRangeKeyIter( + transforms IterTransforms, +) (keyspan.FragmentIterator, error) { + syntheticSeqNum := transforms.SyntheticSeqNum + if v.vState.isSharedIngested { + // Don't pass a synthetic sequence number for shared ingested sstables. 
We + // need to know the materialized sequence numbers, and we will set up the + // appropriate sequence number substitution below. + transforms.SyntheticSeqNum = 0 + } + iter, err := v.reader.NewRawRangeKeyIter(transforms) if err != nil { return nil, err } @@ -200,13 +212,13 @@ func (v *VirtualReader) NewRawRangeKeyIter() (keyspan.FragmentIterator, error) { if v.vState.isSharedIngested { // We need to coalesce range keys within each sstable, and then apply the - // global sequence number. For this, we use ForeignSSTTransformer. + // synthetic sequence number. For this, we use ForeignSSTTransformer. // // TODO(bilal): Avoid these allocations by hoisting the transformer and // transform iter into VirtualReader. transform := &rangekey.ForeignSSTTransformer{ Equal: v.reader.Equal, - SeqNum: v.reader.Properties.GlobalSeqNum, + SeqNum: uint64(syntheticSeqNum), } transformIter := &keyspan.TransformerIter{ FragmentIterator: iter, diff --git a/sstable/suffix_rewriter.go b/sstable/suffix_rewriter.go index 480bad0818..cd94f88c18 100644 --- a/sstable/suffix_rewriter.go +++ b/sstable/suffix_rewriter.go @@ -201,7 +201,7 @@ func rewriteBlocks( if err != nil { return err } - if err := iter.init(r.Compare, r.Split, inputBlock, r.Properties.GlobalSeqNum, false, nil); err != nil { + if err := iter.init(r.Compare, r.Split, inputBlock, NoTransforms); err != nil { return err } @@ -392,7 +392,7 @@ func rewriteDataBlocksToWriter( } func rewriteRangeKeyBlockToWriter(r *Reader, w *Writer, from, to []byte) error { - iter, err := r.NewRawRangeKeyIter() + iter, err := r.NewRawRangeKeyIter(NoTransforms) if err != nil { return err } @@ -465,7 +465,7 @@ func RewriteKeySuffixesViaWriter( w.Close() } }() - i, err := r.NewIter(nil, nil) + i, err := r.NewIter(NoTransforms, nil, nil) if err != nil { return nil, err } diff --git a/sstable/table_test.go b/sstable/table_test.go index fb970457c9..897a394cfa 100644 --- a/sstable/table_test.go +++ b/sstable/table_test.go @@ -60,7 +60,7 @@ func check(fs 
vfs.FS, filename string, comparer *Comparer, fp FilterPolicy) erro } // Check using SeekGE. - iter, err := r.NewIter(nil /* lower */, nil /* upper */) + iter, err := r.NewIter(NoTransforms, nil /* lower */, nil /* upper */) if err != nil { return err } @@ -110,7 +110,7 @@ func check(fs vfs.FS, filename string, comparer *Comparer, fp FilterPolicy) erro } // Check using Find. - iter, err := r.NewIter(nil /* lower */, nil /* upper */) + iter, err := r.NewIter(NoTransforms, nil /* lower */, nil /* upper */) if err != nil { return err } @@ -140,7 +140,7 @@ func check(fs vfs.FS, filename string, comparer *Comparer, fp FilterPolicy) erro {0, "~"}, } for _, ct := range countTests { - iter, err := r.NewIter(nil /* lower */, nil /* upper */) + iter, err := r.NewIter(NoTransforms, nil /* lower */, nil /* upper */) if err != nil { return err } @@ -193,7 +193,7 @@ func check(fs vfs.FS, filename string, comparer *Comparer, fp FilterPolicy) erro upper = []byte(words[upperIdx]) } - iter, err := r.NewIter(lower, upper) + iter, err := r.NewIter(NoTransforms, lower, upper) if err != nil { return err } @@ -469,7 +469,7 @@ func TestFinalBlockIsWritten(t *testing.T) { if err != nil { t.Errorf("nk=%d, vLen=%d: reader open: %v", nk, vLen, err) } - iter, err := r.NewIter(nil /* lower */, nil /* upper */) + iter, err := r.NewIter(NoTransforms, nil /* lower */, nil /* upper */) require.NoError(t, err) i := newIterAdapter(iter) for valid := i.First(); valid; valid = i.Next() { @@ -494,22 +494,22 @@ func TestFinalBlockIsWritten(t *testing.T) { } } -func TestReaderGlobalSeqNum(t *testing.T) { +func TestReaderSymtheticSeqNum(t *testing.T) { f, err := os.Open(filepath.FromSlash("testdata/h.sst")) require.NoError(t, err) r, err := newReader(f, ReaderOptions{}) require.NoError(t, err) - const globalSeqNum = 42 - r.Properties.GlobalSeqNum = globalSeqNum + const syntheticSeqNum = 42 + transforms := IterTransforms{SyntheticSeqNum: syntheticSeqNum} - iter, err := r.NewIter(nil /* lower */, nil /* upper 
*/) + iter, err := r.NewIter(transforms, nil /* lower */, nil /* upper */) require.NoError(t, err) i := newIterAdapter(iter) for valid := i.First(); valid; valid = i.Next() { - if globalSeqNum != i.Key().SeqNum() { - t.Fatalf("expected %d, but found %d", globalSeqNum, i.Key().SeqNum()) + if syntheticSeqNum != i.Key().SeqNum() { + t.Fatalf("expected %d, but found %d", syntheticSeqNum, i.Key().SeqNum()) } } require.NoError(t, i.Close()) diff --git a/sstable/testdata/h.no-compression.sst b/sstable/testdata/h.no-compression.sst index 34e7e4d33f..1bf742e8ea 100644 Binary files a/sstable/testdata/h.no-compression.sst and b/sstable/testdata/h.no-compression.sst differ diff --git a/sstable/testdata/h.no-compression.two_level_index.sst b/sstable/testdata/h.no-compression.two_level_index.sst index 82289bb1d1..5e9a68b253 100644 Binary files a/sstable/testdata/h.no-compression.two_level_index.sst and b/sstable/testdata/h.no-compression.two_level_index.sst differ diff --git a/sstable/testdata/h.sst b/sstable/testdata/h.sst index 8b792034ff..b7ff583532 100644 Binary files a/sstable/testdata/h.sst and b/sstable/testdata/h.sst differ diff --git a/sstable/testdata/h.table-bloom.no-compression.prefix_extractor.no_whole_key_filter.sst b/sstable/testdata/h.table-bloom.no-compression.prefix_extractor.no_whole_key_filter.sst index 6b299c2211..000cd2e089 100644 Binary files a/sstable/testdata/h.table-bloom.no-compression.prefix_extractor.no_whole_key_filter.sst and b/sstable/testdata/h.table-bloom.no-compression.prefix_extractor.no_whole_key_filter.sst differ diff --git a/sstable/testdata/h.table-bloom.no-compression.sst b/sstable/testdata/h.table-bloom.no-compression.sst index f443bff409..b2e2681985 100644 Binary files a/sstable/testdata/h.table-bloom.no-compression.sst and b/sstable/testdata/h.table-bloom.no-compression.sst differ diff --git a/sstable/testdata/h.table-bloom.sst b/sstable/testdata/h.table-bloom.sst index 5aab598325..3b512372a8 100644 Binary files 
a/sstable/testdata/h.table-bloom.sst and b/sstable/testdata/h.table-bloom.sst differ diff --git a/sstable/testdata/h.zstd-compression.sst b/sstable/testdata/h.zstd-compression.sst index 0f6ccf333c..d0b6e4b927 100644 Binary files a/sstable/testdata/h.zstd-compression.sst and b/sstable/testdata/h.zstd-compression.sst differ diff --git a/sstable/testdata/rewriter b/sstable/testdata/rewriter index 3f2ea580ad..b04882d47d 100644 --- a/sstable/testdata/rewriter +++ b/sstable/testdata/rewriter @@ -48,10 +48,10 @@ layout 191 index (22) 218 index (22) 245 top-index (48) - 298 properties (630) - 933 meta-index (79) - 1017 footer (53) - 1070 EOF + 298 properties (607) + 910 meta-index (79) + 994 footer (53) + 1047 EOF scan ---- @@ -83,10 +83,10 @@ layout 191 index (22) 218 index (22) 245 top-index (48) - 298 properties (630) - 933 meta-index (79) - 1017 footer (53) - 1070 EOF + 298 properties (607) + 910 meta-index (79) + 994 footer (53) + 1047 EOF scan ---- @@ -118,10 +118,10 @@ layout 191 index (22) 218 index (22) 245 top-index (48) - 298 properties (630) - 933 meta-index (79) - 1017 footer (53) - 1070 EOF + 298 properties (607) + 910 meta-index (79) + 994 footer (53) + 1047 EOF scan ---- @@ -153,10 +153,10 @@ layout 191 index (22) 218 index (22) 245 top-index (48) - 298 properties (630) - 933 meta-index (79) - 1017 footer (53) - 1070 EOF + 298 properties (607) + 910 meta-index (79) + 994 footer (53) + 1047 EOF scan ---- @@ -189,10 +189,10 @@ layout 191 index (22) 218 index (22) 245 top-index (48) - 298 properties (630) - 933 meta-index (79) - 1017 footer (53) - 1070 EOF + 298 properties (607) + 910 meta-index (79) + 994 footer (53) + 1047 EOF scan ---- diff --git a/sstable/testdata/rewriter_v3 b/sstable/testdata/rewriter_v3 index 4f37789eb4..6ec3dc558b 100644 --- a/sstable/testdata/rewriter_v3 +++ b/sstable/testdata/rewriter_v3 @@ -48,10 +48,10 @@ layout 194 index (22) 221 index (22) 248 top-index (48) - 301 properties (630) - 936 meta-index (79) - 1020 footer (53) - 1073 
EOF + 301 properties (607) + 913 meta-index (79) + 997 footer (53) + 1050 EOF scan ---- @@ -83,10 +83,10 @@ layout 194 index (22) 221 index (22) 248 top-index (48) - 301 properties (630) - 936 meta-index (79) - 1020 footer (53) - 1073 EOF + 301 properties (607) + 913 meta-index (79) + 997 footer (53) + 1050 EOF scan ---- @@ -118,10 +118,10 @@ layout 194 index (22) 221 index (22) 248 top-index (48) - 301 properties (630) - 936 meta-index (79) - 1020 footer (53) - 1073 EOF + 301 properties (607) + 913 meta-index (79) + 997 footer (53) + 1050 EOF scan ---- @@ -153,10 +153,10 @@ layout 194 index (22) 221 index (22) 248 top-index (48) - 301 properties (630) - 936 meta-index (79) - 1020 footer (53) - 1073 EOF + 301 properties (607) + 913 meta-index (79) + 997 footer (53) + 1050 EOF scan ---- @@ -189,10 +189,10 @@ layout 194 index (22) 221 index (22) 248 top-index (48) - 301 properties (630) - 936 meta-index (79) - 1020 footer (53) - 1073 EOF + 301 properties (607) + 913 meta-index (79) + 997 footer (53) + 1050 EOF scan ---- diff --git a/sstable/testdata/virtual_reader b/sstable/testdata/virtual_reader index 69a1bdd756..5977c6ecd0 100644 --- a/sstable/testdata/virtual_reader +++ b/sstable/testdata/virtual_reader @@ -41,7 +41,7 @@ virtualize b.SET.1-c.SET.1 ---- bounds: [b#1,1-c#1,1] filenum: 000002 -props: NumEntries: 1, RawKeySize: 2, RawValueSize: 1, RawPointTombstoneKeySize: 0, RawPointTombstoneValueSize: 0, NumSizedDeletions: 0, NumDeletions: 0, NumRangeDeletions: 0, NumRangeKeyDels: 0, NumRangeKeySets: 0, ValueBlocksSize: 0 +props: NumEntries: 1, RawKeySize: 3, RawValueSize: 1, RawPointTombstoneKeySize: 0, RawPointTombstoneValueSize: 0, NumSizedDeletions: 0, NumDeletions: 0, NumRangeDeletions: 0, NumRangeKeyDels: 0, NumRangeKeySets: 0, ValueBlocksSize: 0 citer ---- @@ -159,7 +159,7 @@ virtualize dd.SET.5-ddd.SET.6 ---- bounds: [dd#5,1-ddd#6,1] filenum: 000004 -props: NumEntries: 1, RawKeySize: 10, RawValueSize: 2, RawPointTombstoneKeySize: 0, 
RawPointTombstoneValueSize: 0, NumSizedDeletions: 0, NumDeletions: 0, NumRangeDeletions: 0, NumRangeKeyDels: 0, NumRangeKeySets: 0, ValueBlocksSize: 0 +props: NumEntries: 2, RawKeySize: 10, RawValueSize: 2, RawPointTombstoneKeySize: 0, RawPointTombstoneValueSize: 0, NumSizedDeletions: 0, NumDeletions: 0, NumRangeDeletions: 0, NumRangeKeyDels: 0, NumRangeKeySets: 0, ValueBlocksSize: 0 # Check lower bound enforcement during SeekPrefixGE. iter @@ -190,7 +190,7 @@ virtualize c.SET.3-f.SET.6 ---- bounds: [c#3,1-f#6,1] filenum: 000005 -props: NumEntries: 1, RawKeySize: 9, RawValueSize: 1, RawPointTombstoneKeySize: 0, RawPointTombstoneValueSize: 0, NumSizedDeletions: 0, NumDeletions: 0, NumRangeDeletions: 0, NumRangeKeyDels: 0, NumRangeKeySets: 0, ValueBlocksSize: 0 +props: NumEntries: 2, RawKeySize: 10, RawValueSize: 2, RawPointTombstoneKeySize: 0, RawPointTombstoneValueSize: 0, NumSizedDeletions: 0, NumDeletions: 0, NumRangeDeletions: 0, NumRangeKeyDels: 0, NumRangeKeySets: 0, ValueBlocksSize: 0 # Just test a basic iterator once virtual sstable bounds have been set. 
iter @@ -290,7 +290,7 @@ virtualize c.SET.3-f.SET.1:ff ---- bounds: [c#3,1-f#0,1] filenum: 000006 -props: NumEntries: 2, RawKeySize: 11, RawValueSize: 2, RawPointTombstoneKeySize: 0, RawPointTombstoneValueSize: 0, NumSizedDeletions: 0, NumDeletions: 0, NumRangeDeletions: 0, NumRangeKeyDels: 0, NumRangeKeySets: 0, ValueBlocksSize: 3 +props: NumEntries: 2, RawKeySize: 12, RawValueSize: 2, RawPointTombstoneKeySize: 0, RawPointTombstoneValueSize: 0, NumSizedDeletions: 0, NumDeletions: 0, NumRangeDeletions: 0, NumRangeKeyDels: 0, NumRangeKeySets: 0, ValueBlocksSize: 3 iter set-bounds lower=d upper=e @@ -343,7 +343,7 @@ virtualize f.SET.6-h.SET.9 ---- bounds: [f#6,1-h#9,1] filenum: 000007 -props: NumEntries: 2, RawKeySize: 11, RawValueSize: 2, RawPointTombstoneKeySize: 0, RawPointTombstoneValueSize: 0, NumSizedDeletions: 0, NumDeletions: 0, NumRangeDeletions: 0, NumRangeKeyDels: 0, NumRangeKeySets: 0, ValueBlocksSize: 3 +props: NumEntries: 2, RawKeySize: 12, RawValueSize: 2, RawPointTombstoneKeySize: 0, RawPointTombstoneValueSize: 0, NumSizedDeletions: 0, NumDeletions: 0, NumRangeDeletions: 0, NumRangeKeyDels: 0, NumRangeKeySets: 0, ValueBlocksSize: 3 iter seek-lt z diff --git a/sstable/testdata/writer b/sstable/testdata/writer index 61d3a24fa3..085c7978f2 100644 --- a/sstable/testdata/writer +++ b/sstable/testdata/writer @@ -316,10 +316,10 @@ layout 105 index (22) 132 index (22) 159 top-index (50) - 214 properties (580) - 799 meta-index (33) - 837 footer (53) - 890 EOF + 214 properties (557) + 776 meta-index (33) + 814 footer (53) + 867 EOF scan ---- @@ -344,10 +344,10 @@ layout 26 data (21) 52 data (21) 78 index (47) - 130 properties (678) - 813 meta-index (33) - 851 leveldb-footer (48) - 899 EOF + 130 properties (655) + 790 meta-index (33) + 828 leveldb-footer (48) + 876 EOF # Range keys, if present, are shown in the layout. 
@@ -364,7 +364,7 @@ layout 0 data (8) 13 index (21) 39 range-key (82) - 126 properties (628) - 759 meta-index (57) - 821 footer (53) - 874 EOF + 126 properties (605) + 736 meta-index (57) + 798 footer (53) + 851 EOF diff --git a/sstable/testdata/writer_v3 b/sstable/testdata/writer_v3 index a003c5fb96..5ff7a4a046 100644 --- a/sstable/testdata/writer_v3 +++ b/sstable/testdata/writer_v3 @@ -289,10 +289,10 @@ layout 108 index (22) 135 index (22) 162 top-index (51) - 218 properties (580) - 803 meta-index (33) - 841 footer (53) - 894 EOF + 218 properties (557) + 780 meta-index (33) + 818 footer (53) + 871 EOF scan ---- @@ -317,10 +317,10 @@ layout 26 data (21) 52 data (21) 78 index (47) - 130 properties (678) - 813 meta-index (33) - 851 leveldb-footer (48) - 899 EOF + 130 properties (655) + 790 meta-index (33) + 828 leveldb-footer (48) + 876 EOF # Range keys, if present, are shown in the layout. @@ -337,7 +337,7 @@ layout 0 data (8) 13 index (21) 39 range-key (82) - 126 properties (628) - 759 meta-index (57) - 821 footer (53) - 874 EOF + 126 properties (605) + 736 meta-index (57) + 798 footer (53) + 851 EOF diff --git a/sstable/testdata/writer_value_blocks b/sstable/testdata/writer_value_blocks index 9461626665..5e09d6ac34 100644 --- a/sstable/testdata/writer_value_blocks +++ b/sstable/testdata/writer_value_blocks @@ -193,7 +193,7 @@ layout 357 value-block (11) 373 value-block (15) 393 value-index (8) - 406 properties (676) + 406 properties (653) 406 obsolete-key (16) [restart] 422 pebble.num.value-blocks (27) 449 pebble.num.values.in.value-blocks (21) @@ -206,37 +206,36 @@ layout 630 rocksdb.compression_options (106) 736 rocksdb.data.size (14) 750 rocksdb.deleted.keys (15) - 765 rocksdb.external_sst_file.global_seqno (41) - 806 rocksdb.external_sst_file.version (14) - 820 rocksdb.filter.size (15) - 835 rocksdb.index.partitions (20) - 855 rocksdb.index.size (9) - 864 rocksdb.merge.operands (18) - 882 rocksdb.merge.operator (24) - 906 rocksdb.num.data.blocks (19) - 925 
rocksdb.num.entries (11) - 936 rocksdb.num.range-deletions (19) - 955 rocksdb.prefix.extractor.name (31) - 986 rocksdb.property.collectors (34) - 1020 rocksdb.raw.key.size (16) - 1036 rocksdb.raw.value.size (14) - 1050 rocksdb.top-level.index.size (24) - 1074 [restart 406] - 1082 [trailer compression=none checksum=0xbf6fe705] - 1087 meta-index (64) - 1087 pebble.value_index block:393/8 value-blocks-index-lengths: 1(num), 2(offset), 1(length) [restart] - 1114 rocksdb.properties block:406/676 [restart] - 1139 [restart 1087] - 1143 [restart 1114] - 1151 [trailer compression=none checksum=0x5a8a2a98] - 1156 footer (53) - 1156 checksum type: crc32c - 1157 meta: offset=1087, length=64 - 1160 index: offset=267, length=85 - 1163 [padding] - 1197 version: 4 - 1201 magic number: 0xf09faab3f09faab3 - 1209 EOF + 765 rocksdb.external_sst_file.version (32) + 797 rocksdb.filter.size (15) + 812 rocksdb.index.partitions (20) + 832 rocksdb.index.size (9) + 841 rocksdb.merge.operands (18) + 859 rocksdb.merge.operator (24) + 883 rocksdb.num.data.blocks (19) + 902 rocksdb.num.entries (11) + 913 rocksdb.num.range-deletions (19) + 932 rocksdb.prefix.extractor.name (31) + 963 rocksdb.property.collectors (34) + 997 rocksdb.raw.key.size (16) + 1013 rocksdb.raw.value.size (14) + 1027 rocksdb.top-level.index.size (24) + 1051 [restart 406] + 1059 [trailer compression=none checksum=0x98bc4d71] + 1064 meta-index (64) + 1064 pebble.value_index block:393/8 value-blocks-index-lengths: 1(num), 2(offset), 1(length) [restart] + 1091 rocksdb.properties block:406/653 [restart] + 1116 [restart 1064] + 1120 [restart 1091] + 1128 [trailer compression=none checksum=0xdf6cf118] + 1133 footer (53) + 1133 checksum type: crc32c + 1134 meta: offset=1064, length=64 + 1137 index: offset=267, length=85 + 1140 [padding] + 1174 version: 4 + 1178 magic number: 0xf09faab3f09faab3 + 1186 EOF # Require that [c,e) must be in-place. 
build in-place-bound=(c,e) @@ -316,7 +315,7 @@ layout 71 block:0/66 [restart] 85 [restart 71] 93 [trailer compression=none checksum=0xf80f5bcf] - 98 properties (606) + 98 properties (583) 98 obsolete-key (16) [restart] 114 pebble.raw.point-tombstone.key.size (39) 153 rocksdb.block.based.table.index.type (43) @@ -327,30 +326,29 @@ layout 292 rocksdb.compression_options (106) 398 rocksdb.data.size (13) 411 rocksdb.deleted.keys (15) - 426 rocksdb.external_sst_file.global_seqno (41) - 467 rocksdb.external_sst_file.version (14) - 481 rocksdb.filter.size (15) - 496 rocksdb.index.size (14) - 510 rocksdb.merge.operands (18) - 528 rocksdb.merge.operator (24) - 552 rocksdb.num.data.blocks (19) - 571 rocksdb.num.entries (11) - 582 rocksdb.num.range-deletions (19) - 601 rocksdb.prefix.extractor.name (31) - 632 rocksdb.property.collectors (34) - 666 rocksdb.raw.key.size (16) - 682 rocksdb.raw.value.size (14) - 696 [restart 98] - 704 [trailer compression=none checksum=0xb3084f65] - 709 meta-index (32) - 709 rocksdb.properties block:98/606 [restart] - 733 [restart 709] - 741 [trailer compression=none checksum=0x907a9f2c] - 746 footer (53) - 746 checksum type: crc32c - 747 meta: offset=709, length=32 - 750 index: offset=71, length=22 - 752 [padding] - 787 version: 4 - 791 magic number: 0xf09faab3f09faab3 - 799 EOF + 426 rocksdb.external_sst_file.version (32) + 458 rocksdb.filter.size (15) + 473 rocksdb.index.size (14) + 487 rocksdb.merge.operands (18) + 505 rocksdb.merge.operator (24) + 529 rocksdb.num.data.blocks (19) + 548 rocksdb.num.entries (11) + 559 rocksdb.num.range-deletions (19) + 578 rocksdb.prefix.extractor.name (31) + 609 rocksdb.property.collectors (34) + 643 rocksdb.raw.key.size (16) + 659 rocksdb.raw.value.size (14) + 673 [restart 98] + 681 [trailer compression=none checksum=0xeb626b4c] + 686 meta-index (32) + 686 rocksdb.properties block:98/583 [restart] + 710 [restart 686] + 718 [trailer compression=none checksum=0x55c264c2] + 723 footer (53) + 723 checksum type: 
crc32c + 724 meta: offset=686, length=32 + 727 index: offset=71, length=22 + 729 [padding] + 764 version: 4 + 768 magic number: 0xf09faab3f09faab3 + 776 EOF diff --git a/sstable/writer_rangekey_test.go b/sstable/writer_rangekey_test.go index 1968059651..794a693577 100644 --- a/sstable/writer_rangekey_test.go +++ b/sstable/writer_rangekey_test.go @@ -105,7 +105,7 @@ func TestWriter_RangeKeys(t *testing.T) { return err.Error() } - iter, err := r.NewRawRangeKeyIter() + iter, err := r.NewRawRangeKeyIter(NoTransforms) if err != nil { return err.Error() } diff --git a/sstable/writer_test.go b/sstable/writer_test.go index 2a5e9dad0e..32ff87c74d 100644 --- a/sstable/writer_test.go +++ b/sstable/writer_test.go @@ -141,7 +141,7 @@ func runDataDriven(t *testing.T, file string, tableFormat TableFormat, paralleli return format(td, meta) case "scan": - origIter, err := r.NewIter(nil /* lower */, nil /* upper */) + origIter, err := r.NewIter(NoTransforms, nil /* lower */, nil /* upper */) if err != nil { return err.Error() } @@ -167,7 +167,7 @@ func runDataDriven(t *testing.T, file string, tableFormat TableFormat, paralleli return buf.String() case "scan-range-del": - iter, err := r.NewRawRangeDelIter() + iter, err := r.NewRawRangeDelIter(NoTransforms) if err != nil { return err.Error() } @@ -187,7 +187,7 @@ func runDataDriven(t *testing.T, file string, tableFormat TableFormat, paralleli return buf.String() case "scan-range-key": - iter, err := r.NewRawRangeKeyIter() + iter, err := r.NewRawRangeKeyIter(NoTransforms) if err != nil { return err.Error() } @@ -321,7 +321,7 @@ func TestWriterWithValueBlocks(t *testing.T) { case "scan-raw": // Raw scan does not fetch from value blocks. 
- origIter, err := r.NewIter(nil /* lower */, nil /* upper */) + origIter, err := r.NewIter(NoTransforms, nil /* lower */, nil /* upper */) if err != nil { return err.Error() } @@ -360,7 +360,7 @@ func TestWriterWithValueBlocks(t *testing.T) { return buf.String() case "scan": - origIter, err := r.NewIter(nil /* lower */, nil /* upper */) + origIter, err := r.NewIter(NoTransforms, nil /* lower */, nil /* upper */) if err != nil { return err.Error() } @@ -373,7 +373,7 @@ func TestWriterWithValueBlocks(t *testing.T) { return buf.String() case "scan-cloned-lazy-values": - iter, err := r.NewIter(nil /* lower */, nil /* upper */) + iter, err := r.NewIter(NoTransforms, nil /* lower */, nil /* upper */) if err != nil { return err.Error() } @@ -905,7 +905,7 @@ func TestWriterRace(t *testing.T) { r, err := NewMemReader(f.Data(), readerOpts) require.NoError(t, err) defer r.Close() - it, err := r.NewIter(nil, nil) + it, err := r.NewIter(NoTransforms, nil, nil) require.NoError(t, err) defer it.Close() ki := 0 diff --git a/table_cache.go b/table_cache.go index 1c9f554cc2..d6bfb6920f 100644 --- a/table_cache.go +++ b/table_cache.go @@ -513,7 +513,7 @@ func (c *tableCacheShard) newIters( var iters iterSet var err error if kinds.RangeKey() && file.HasRangeKeys { - iters.rangeKey, err = c.newRangeKeyIter(v, cr, opts.SpanIterOptions()) + iters.rangeKey, err = c.newRangeKeyIter(v, file, cr, opts.SpanIterOptions()) } if kinds.RangeDeletion() && file.HasPointKeys && err == nil { iters.rangeDeletion, err = c.newRangeDelIter(ctx, file, cr, dbOpts) @@ -621,24 +621,26 @@ func (c *tableCacheShard) newPointIter( rp = &tableCacheShardReaderProvider{c: c, file: file, dbOpts: dbOpts} } - if v.isShared && v.reader.Properties.GlobalSeqNum != 0 { + if v.isShared && file.SyntheticSeqNum() != 0 { if tableFormat < sstable.TableFormatPebblev4 { return nil, errors.New("pebble: shared ingested sstable has a lower table format than expected") } // The table is shared and ingested. 
hideObsoletePoints = true } + transforms := file.IterTransforms() + transforms.HideObsoletePoints = hideObsoletePoints var categoryAndQoS sstable.CategoryAndQoS if opts != nil { categoryAndQoS = opts.CategoryAndQoS } if internalOpts.bytesIterated != nil { iter, err = cr.NewCompactionIter( - internalOpts.bytesIterated, categoryAndQoS, dbOpts.sstStatsCollector, rp, + transforms, internalOpts.bytesIterated, categoryAndQoS, dbOpts.sstStatsCollector, rp, internalOpts.bufferPool) } else { iter, err = cr.NewIterWithBlockPropertyFiltersAndContextEtc( - ctx, opts.GetLowerBound(), opts.GetUpperBound(), filterer, hideObsoletePoints, useFilter, + ctx, transforms, opts.GetLowerBound(), opts.GetUpperBound(), filterer, useFilter, internalOpts.stats, categoryAndQoS, dbOpts.sstStatsCollector, rp) } if err != nil { @@ -665,7 +667,7 @@ func (c *tableCacheShard) newRangeDelIter( ) (keyspan.FragmentIterator, error) { // NB: range-del iterator does not maintain a reference to the table, nor // does it need to read from it after creation. - rangeDelIter, err := cr.NewRawRangeDelIter() + rangeDelIter, err := cr.NewRawRangeDelIter(file.IterTransforms()) if err != nil { return nil, err } @@ -686,22 +688,24 @@ func (c *tableCacheShard) newRangeDelIter( // sstable's range keys. This function is for table-cache internal use only, and // callers should use newIters instead. func (c *tableCacheShard) newRangeKeyIter( - v *tableCacheValue, cr sstable.CommonReader, opts keyspan.SpanIterOptions, + v *tableCacheValue, file *fileMetadata, cr sstable.CommonReader, opts keyspan.SpanIterOptions, ) (keyspan.FragmentIterator, error) { + transforms := file.IterTransforms() // Don't filter a table's range keys if the file contains RANGEKEYDELs. // The RANGEKEYDELs may delete range keys in other levels. Skipping the // file's range key blocks may surface deleted range keys below. 
This is // done here, rather than deferring to the block-property collector in order // to maintain parity with point keys and the treatment of RANGEDELs. if v.reader.Properties.NumRangeKeyDels == 0 && len(opts.RangeKeyFilters) > 0 { - ok, _, err := c.checkAndIntersectFilters(v, nil, opts.RangeKeyFilters, nil, nil) + ok, _, err := c.checkAndIntersectFilters(v, nil, opts.RangeKeyFilters, nil, transforms.SyntheticSuffix) if err != nil { return nil, err } else if !ok { return nil, nil } } - return cr.NewRawRangeKeyIter() + // TODO(radu): wrap in an AssertBounds. + return cr.NewRawRangeKeyIter(transforms) } type tableCacheShardReaderProvider struct { @@ -838,25 +842,8 @@ func (c *tableCacheShard) findNode(meta *fileMetadata, dbOpts *tableCacheOpts) * info := loadInfo{ backingFileNum: meta.FileBacking.DiskFileNum, } - // All virtual tables sharing an ingested backing will have the same - // SmallestSeqNum=LargestSeqNum value. We assert that below. - if meta.SmallestSeqNum == meta.LargestSeqNum { - info.globalSeqNum = meta.SmallestSeqNum - } - - v := c.findNodeInternal(info, dbOpts) - // Loading a file before its global sequence number is known (eg, during - // ingest before entering the commit pipeline) can pollute the cache with - // incorrect state. In invariant builds, verify that the global sequence - // number of the returned reader matches. - if invariants.Enabled { - if v.reader != nil && v.reader.Properties.GlobalSeqNum != info.globalSeqNum { - panic(errors.AssertionFailedf("file %s loaded from table cache with the wrong global sequence number %d", - meta, v.reader.Properties.GlobalSeqNum)) - } - } - return v + return c.findNodeInternal(info, dbOpts) } func (c *tableCacheShard) findNodeInternal( @@ -1173,8 +1160,6 @@ type tableCacheValue struct { // loadInfo contains the information needed to populate a new cache entry. type loadInfo struct { backingFileNum base.DiskFileNum - // See sstable.Properties.GlobalSeqNum. 
- globalSeqNum uint64 } func (v *tableCacheValue) load(loadInfo loadInfo, c *tableCacheShard, dbOpts *tableCacheOpts) { @@ -1197,9 +1182,6 @@ func (v *tableCacheValue) load(loadInfo loadInfo, c *tableCacheShard, dbOpts *ta v.err = errors.Wrapf( err, "pebble: backing file %s error", loadInfo.backingFileNum) } - if v.err == nil { - v.reader.Properties.GlobalSeqNum = loadInfo.globalSeqNum - } if v.err != nil { c.mu.Lock() defer c.mu.Unlock() diff --git a/table_stats.go b/table_stats.go index cbc9f1de19..63b5e77f28 100644 --- a/table_stats.go +++ b/table_stats.go @@ -906,7 +906,7 @@ func newCombinedDeletionKeyspanIter( }) mIter.Init(comparer.Compare, transform, new(keyspanimpl.MergingBuffers)) - iter, err := cr.NewRawRangeDelIter() + iter, err := cr.NewRawRangeDelIter(m.IterTransforms()) if err != nil { return nil, err } @@ -957,7 +957,7 @@ func newCombinedDeletionKeyspanIter( mIter.AddLevel(iter) } - iter, err = cr.NewRawRangeKeyIter() + iter, err = cr.NewRawRangeKeyIter(m.IterTransforms()) if err != nil { return nil, err } diff --git a/testdata/checkpoint b/testdata/checkpoint index 086c95a48b..4ee3f11d27 100644 --- a/testdata/checkpoint +++ b/testdata/checkpoint @@ -473,8 +473,8 @@ h 11 i i k k open: db/000014.sst -read-at(636, 53): db/000014.sst -read-at(599, 37): db/000014.sst +read-at(613, 53): db/000014.sst +read-at(576, 37): db/000014.sst z z . 
@@ -545,18 +545,18 @@ e 8 f 9 g 10 open: checkpoints/checkpoint4/000011.sst -read-at(653, 53): checkpoints/checkpoint4/000011.sst -read-at(616, 37): checkpoints/checkpoint4/000011.sst -read-at(70, 546): checkpoints/checkpoint4/000011.sst +read-at(630, 53): checkpoints/checkpoint4/000011.sst +read-at(593, 37): checkpoints/checkpoint4/000011.sst +read-at(70, 523): checkpoints/checkpoint4/000011.sst read-at(43, 27): checkpoints/checkpoint4/000011.sst read-at(0, 43): checkpoints/checkpoint4/000011.sst h 11 i i k k open: checkpoints/checkpoint4/000014.sst -read-at(636, 53): checkpoints/checkpoint4/000014.sst -read-at(599, 37): checkpoints/checkpoint4/000014.sst -read-at(53, 546): checkpoints/checkpoint4/000014.sst +read-at(613, 53): checkpoints/checkpoint4/000014.sst +read-at(576, 37): checkpoints/checkpoint4/000014.sst +read-at(53, 523): checkpoints/checkpoint4/000014.sst read-at(26, 27): checkpoints/checkpoint4/000014.sst read-at(0, 26): checkpoints/checkpoint4/000014.sst z z diff --git a/testdata/compaction_delete_only_hints b/testdata/compaction_delete_only_hints index 5147ba6dea..efb96610b8 100644 --- a/testdata/compaction_delete_only_hints +++ b/testdata/compaction_delete_only_hints @@ -414,4 +414,4 @@ maybe-compact Deletion hints: (none) Compactions: - [JOB 100] compacted(delete-only) L6 [000006 000007 000008 000009 000011] (3.9KB) Score=0.00 -> L6 [] (0B), in 1.0s (2.0s total), output rate 0B/s + [JOB 100] compacted(delete-only) L6 [000006 000007 000008 000009 000011] (3.8KB) Score=0.00 -> L6 [] (0B), in 1.0s (2.0s total), output rate 0B/s diff --git a/testdata/compaction_picker_scores b/testdata/compaction_picker_scores index 314ee6a4f8..1139cf5595 100644 --- a/testdata/compaction_picker_scores +++ b/testdata/compaction_picker_scores @@ -145,7 +145,7 @@ wait-pending-table-stats num-entries: 5 num-deletions: 5 num-range-key-sets: 0 -point-deletions-bytes-estimate: 164616 +point-deletions-bytes-estimate: 164605 range-deletions-bytes-estimate: 0 maybe-compact diff 
--git a/testdata/event_listener b/testdata/event_listener index 0277df7227..aa135deac7 100644 --- a/testdata/event_listener +++ b/testdata/event_listener @@ -177,9 +177,9 @@ remove: db/MANIFEST-000009 ingest ---- open: ext/0 -read-at(664, 53): ext/0 -read-at(627, 37): ext/0 -read-at(53, 574): ext/0 +read-at(641, 53): ext/0 +read-at(604, 37): ext/0 +read-at(53, 551): ext/0 read-at(26, 27): ext/0 read-at(0, 26): ext/0 close: ext/0 @@ -203,21 +203,21 @@ sync: db remove: db/MANIFEST-000011 [JOB 10] MANIFEST deleted 000011 remove: ext/0 -[JOB 10] ingested L0:000015 (717B) +[JOB 10] ingested L0:000015 (694B) metrics ---- | | | | ingested | moved | written | | amp level | tables size val-bl vtables | score | in | tables size | tables size | tables size | read | r w ------+-----------------------------+-------+-------+--------------+--------------+--------------+-------+--------- - 0 | 2 1.3KB 0B 0 | 0.40 | 81B | 1 717B | 0 0B | 3 1.9KB | 0B | 2 24.5 + 0 | 2 1.3KB 0B 0 | 0.40 | 81B | 1 694B | 0 0B | 3 1.9KB | 0B | 2 24.5 1 | 0 0B 0B 0 | 0.00 | 0B | 0 0B | 0 0B | 0 0B | 0B | 0 0.0 2 | 0 0B 0B 0 | 0.00 | 0B | 0 0B | 0 0B | 0 0B | 0B | 0 0.0 3 | 0 0B 0B 0 | 0.00 | 0B | 0 0B | 0 0B | 0 0B | 0B | 0 0.0 4 | 0 0B 0B 0 | 0.00 | 0B | 0 0B | 0 0B | 0 0B | 0B | 0 0.0 5 | 0 0B 0B 0 | 0.00 | 0B | 0 0B | 0 0B | 0 0B | 0B | 0 0.0 6 | 1 662B 0B 0 | - | 1.3KB | 0 0B | 0 0B | 1 662B | 1.3KB | 1 0.5 -total | 3 2.0KB 0B 0 | - | 825B | 1 717B | 0 0B | 4 3.4KB | 1.3KB | 3 4.2 +total | 3 2.0KB 0B 0 | - | 802B | 1 694B | 0 0B | 4 3.4KB | 1.3KB | 3 4.3 ------------------------------------------------------------------------------------------------------------------- WAL: 1 files (27B) in: 48B written: 108B (125% overhead) Flushes: 3 @@ -228,7 +228,7 @@ Zombie tables: 0 (0B) Backing tables: 0 (0B) Virtual tables: 0 (0B) Block cache: 6 entries (1.1KB) hit rate: 0.0% -Table cache: 1 entries (808B) hit rate: 40.0% +Table cache: 1 entries (800B) hit rate: 40.0% Secondary cache: 0 entries (0B) hit rate: 
0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 @@ -243,16 +243,16 @@ ingest-flushable ---- sync-data: wal/000012.log open: ext/a -read-at(664, 53): ext/a -read-at(627, 37): ext/a -read-at(53, 574): ext/a +read-at(641, 53): ext/a +read-at(604, 37): ext/a +read-at(53, 551): ext/a read-at(26, 27): ext/a read-at(0, 26): ext/a close: ext/a open: ext/b -read-at(664, 53): ext/b -read-at(627, 37): ext/b -read-at(53, 574): ext/b +read-at(641, 53): ext/b +read-at(604, 37): ext/b +read-at(53, 551): ext/b read-at(26, 27): ext/b read-at(0, 26): ext/b close: ext/b @@ -274,7 +274,7 @@ sync: wal [JOB 13] WAL created 000020 remove: ext/a remove: ext/b -[JOB 11] ingested as flushable 000017 (717B), 000018 (717B) +[JOB 11] ingested as flushable 000017 (694B), 000018 (694B) sync-data: wal/000020.log close: wal/000020.log create: wal/000021.log @@ -297,7 +297,7 @@ close: db/marker.manifest.000007.MANIFEST-000023 remove: db/marker.manifest.000006.MANIFEST-000016 sync: db [JOB 16] MANIFEST created 000023 -[JOB 16] flushed 2 ingested flushables L0:000017 (717B) + L6:000018 (717B) in 1.0s (2.0s total), output rate 1.4KB/s +[JOB 16] flushed 2 ingested flushables L0:000017 (694B) + L6:000018 (694B) in 1.0s (2.0s total), output rate 1.4KB/s remove: db/MANIFEST-000014 [JOB 16] MANIFEST deleted 000014 [JOB 17] flushing 1 memtable (100B) to L0 @@ -309,14 +309,14 @@ metrics | | | | ingested | moved | written | | amp level | tables size val-bl vtables | score | in | tables size | tables size | tables size | read | r w ------+-----------------------------+-------+-------+--------------+--------------+--------------+-------+--------- - 0 | 4 2.7KB 0B 0 | 0.80 | 81B | 2 1.4KB | 0 0B | 4 2.6KB | 0B | 4 32.7 + 0 | 4 2.6KB 0B 0 | 0.80 | 81B | 2 1.4KB | 0 0B | 4 2.6KB | 0B | 4 32.7 1 | 0 0B 0B 0 | 0.00 | 0B | 0 0B | 0 0B | 0 0B | 0B | 0 0.0 2 | 0 0B 0B 0 | 0.00 | 0B | 0 0B | 0 0B | 0 0B | 0B | 0 0.0 3 | 0 0B 0B 0 | 0.00 | 0B | 0 0B | 0 0B | 0 0B | 0B | 0 0.0 4 | 0 0B 0B 0 | 0.00 | 0B | 0 0B | 0 0B 
| 0 0B | 0B | 0 0.0 5 | 0 0B 0B 0 | 0.00 | 0B | 0 0B | 0 0B | 0 0B | 0B | 0 0.0 - 6 | 2 1.3KB 0B 0 | - | 1.3KB | 1 717B | 0 0B | 1 662B | 1.3KB | 1 0.5 -total | 6 4.0KB 0B 0 | - | 2.2KB | 3 2.1KB | 0 0B | 5 5.4KB | 1.3KB | 5 2.5 + 6 | 2 1.3KB 0B 0 | - | 1.3KB | 1 694B | 0 0B | 1 662B | 1.3KB | 1 0.5 +total | 6 4.0KB 0B 0 | - | 2.1KB | 3 2.0KB | 0 0B | 5 5.4KB | 1.3KB | 5 2.5 ------------------------------------------------------------------------------------------------------------------- WAL: 1 files (29B) in: 82B written: 110B (34% overhead) Flushes: 6 @@ -327,7 +327,7 @@ Zombie tables: 0 (0B) Backing tables: 0 (0B) Virtual tables: 0 (0B) Block cache: 12 entries (2.3KB) hit rate: 7.7% -Table cache: 1 entries (808B) hit rate: 50.0% +Table cache: 1 entries (800B) hit rate: 50.0% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 diff --git a/testdata/ingest b/testdata/ingest index 987cb6bdde..ff6ed9278e 100644 --- a/testdata/ingest +++ b/testdata/ingest @@ -40,8 +40,8 @@ level | tables size val-bl vtables | score | in | tables size | tables siz 3 | 0 0B 0B 0 | 0.00 | 0B | 0 0B | 0 0B | 0 0B | 0B | 0 0.0 4 | 0 0B 0B 0 | 0.00 | 0B | 0 0B | 0 0B | 0 0B | 0B | 0 0.0 5 | 0 0B 0B 0 | 0.00 | 0B | 0 0B | 0 0B | 0 0B | 0B | 0 0.0 - 6 | 1 696B 0B 0 | - | 0B | 1 696B | 0 0B | 0 0B | 0B | 1 0.0 -total | 1 696B 0B 0 | - | 696B | 1 696B | 0 0B | 0 696B | 0B | 1 1.0 + 6 | 1 673B 0B 0 | - | 0B | 1 673B | 0 0B | 0 0B | 0B | 1 0.0 +total | 1 673B 0B 0 | - | 673B | 1 673B | 0 0B | 0 673B | 0B | 1 1.0 ------------------------------------------------------------------------------------------------------------------- WAL: 1 files (0B) in: 0B written: 0B (0% overhead) Flushes: 0 @@ -51,8 +51,8 @@ MemTables: 1 (256KB) zombie: 0 (0B) Zombie tables: 0 (0B) Backing tables: 0 (0B) Virtual tables: 0 (0B) -Block cache: 6 entries (1.2KB) hit rate: 35.7% -Table cache: 1 entries (808B) hit rate: 50.0% +Block cache: 6 entries (1.1KB) hit rate: 35.7% 
+Table cache: 1 entries (800B) hit rate: 50.0% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 @@ -356,7 +356,7 @@ num-entries: 2 num-deletions: 2 num-range-key-sets: 0 point-deletions-bytes-estimate: 0 -range-deletions-bytes-estimate: 1420 +range-deletions-bytes-estimate: 1374 # A set operation takes precedence over a range deletion at the same # sequence number as can occur during ingestion. diff --git a/testdata/metrics b/testdata/metrics index 5727ef0ca9..a376b00478 100644 --- a/testdata/metrics +++ b/testdata/metrics @@ -68,7 +68,7 @@ Zombie tables: 0 (0B) Backing tables: 0 (0B) Virtual tables: 0 (0B) Block cache: 3 entries (556B) hit rate: 0.0% -Table cache: 1 entries (808B) hit rate: 0.0% +Table cache: 1 entries (800B) hit rate: 0.0% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 1 @@ -201,7 +201,7 @@ Zombie tables: 1 (661B) Backing tables: 0 (0B) Virtual tables: 0 (0B) Block cache: 3 entries (556B) hit rate: 33.3% -Table cache: 1 entries (808B) hit rate: 66.7% +Table cache: 1 entries (800B) hit rate: 66.7% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 1 @@ -449,30 +449,30 @@ metrics | | | | ingested | moved | written | | amp level | tables size val-bl vtables | score | in | tables size | tables size | tables size | read | r w ------+-----------------------------+-------+-------+--------------+--------------+--------------+-------+--------- - 0 | 4 2.8KB 0B 0 | 0.50 | 149B | 3 2.1KB | 0 0B | 6 4.2KB | 0B | 2 28.8 + 0 | 4 2.7KB 0B 0 | 0.50 | 149B | 3 2.0KB | 0 0B | 6 4.2KB | 0B | 2 28.8 1 | 0 0B 0B 0 | 0.00 | 0B | 0 0B | 0 0B | 0 0B | 0B | 0 0.0 2 | 0 0B 0B 0 | 0.00 | 0B | 0 0B | 0 0B | 0 0B | 0B | 0 0.0 3 | 0 0B 0B 0 | 0.00 | 0B | 0 0B | 0 0B | 0 0B | 0B | 0 0.0 4 | 0 0B 0B 0 | 0.00 | 0B | 0 0B | 0 0B | 0 0B | 0B | 0 0.0 5 | 0 0B 0B 0 | 0.00 | 0B | 0 0B | 0 0B | 0 0B | 0B | 0 0.0 6 | 3 2.2KB 41B 0 | - | 3.5KB | 0 0B | 0 
0B | 3 2.2KB | 3.5KB | 1 0.6 -total | 7 5.0KB 41B 0 | - | 2.3KB | 3 2.1KB | 0 0B | 9 8.7KB | 3.5KB | 3 3.8 +total | 7 4.9KB 41B 0 | - | 2.2KB | 3 2.0KB | 0 0B | 9 8.6KB | 3.5KB | 3 3.9 ------------------------------------------------------------------------------------------------------------------- WAL: 1 files (26B) in: 176B written: 175B (-1% overhead) Flushes: 8 -Compactions: 2 estimated debt: 5.0KB in progress: 0 (0B) +Compactions: 2 estimated debt: 4.9KB in progress: 0 (0B) default: 2 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 multi-level: 0 MemTables: 1 (1.0MB) zombie: 1 (1.0MB) Zombie tables: 0 (0B) Backing tables: 0 (0B) Virtual tables: 0 (0B) -Block cache: 12 entries (2.4KB) hit rate: 16.7% -Table cache: 1 entries (808B) hit rate: 60.0% +Block cache: 12 entries (2.3KB) hit rate: 16.7% +Table cache: 1 entries (800B) hit rate: 60.0% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 Filter utility: 0.0% -Ingestions: 0 as flushable: 2 (2.1KB in 3 tables) +Ingestions: 0 as flushable: 2 (2.0KB in 3 tables) Iter category stats: b, latency: {BlockBytes:44 BlockBytesInCache:0 BlockReadDuration:10ms} c, non-latency: {BlockBytes:44 BlockBytesInCache:44 BlockReadDuration:0s} @@ -510,30 +510,30 @@ metrics | | | | ingested | moved | written | | amp level | tables size val-bl vtables | score | in | tables size | tables size | tables size | read | r w ------+-----------------------------+-------+-------+--------------+--------------+--------------+-------+--------- - 0 | 7 4.7KB 0B 0 | 0.50 | 207B | 3 2.1KB | 0 0B | 9 6.2KB | 0B | 2 30.5 + 0 | 7 4.7KB 0B 0 | 0.50 | 207B | 3 2.0KB | 0 0B | 9 6.2KB | 0B | 2 30.5 1 | 0 0B 0B 0 | 0.00 | 0B | 0 0B | 0 0B | 0 0B | 0B | 0 0.0 2 | 0 0B 0B 0 | 0.00 | 0B | 0 0B | 0 0B | 0 0B | 0B | 0 0.0 3 | 0 0B 0B 0 | 0.00 | 0B | 0 0B | 0 0B | 0 0B | 0B | 0 0.0 4 | 0 0B 0B 0 | 0.00 | 0B | 0 0B | 0 0B | 0 0B | 0B | 0 0.0 5 | 0 0B 0B 0 | 0.00 | 0B | 0 0B | 0 0B | 0 0B | 0B | 0 0.0 6 | 3 2.2KB 41B 0 
| - | 3.5KB | 0 0B | 0 0B | 3 2.2KB | 3.5KB | 1 0.6 -total | 10 7.0KB 41B 0 | - | 2.4KB | 3 2.1KB | 0 0B | 12 11KB | 3.5KB | 3 4.6 +total | 10 6.9KB 41B 0 | - | 2.3KB | 3 2.0KB | 0 0B | 12 11KB | 3.5KB | 3 4.7 ------------------------------------------------------------------------------------------------------------------- WAL: 1 files (58B) in: 223B written: 265B (19% overhead) Flushes: 9 -Compactions: 2 estimated debt: 7.0KB in progress: 0 (0B) +Compactions: 2 estimated debt: 6.9KB in progress: 0 (0B) default: 2 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 multi-level: 0 MemTables: 1 (1.0MB) zombie: 1 (1.0MB) Zombie tables: 0 (0B) Backing tables: 0 (0B) Virtual tables: 0 (0B) -Block cache: 12 entries (2.4KB) hit rate: 16.7% -Table cache: 1 entries (808B) hit rate: 60.0% +Block cache: 12 entries (2.3KB) hit rate: 16.7% +Table cache: 1 entries (800B) hit rate: 60.0% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 Filter utility: 0.0% -Ingestions: 0 as flushable: 2 (2.1KB in 3 tables) +Ingestions: 0 as flushable: 2 (2.0KB in 3 tables) Iter category stats: b, latency: {BlockBytes:44 BlockBytesInCache:0 BlockReadDuration:10ms} c, non-latency: {BlockBytes:44 BlockBytesInCache:44 BlockReadDuration:0s} @@ -585,30 +585,30 @@ metrics zero-cache-hits-misses | | | | ingested | moved | written | | amp level | tables size val-bl vtables | score | in | tables size | tables size | tables size | read | r w ------+-----------------------------+-------+-------+--------------+--------------+--------------+-------+--------- - 0 | 7 3.5KB 0B 2 | 0.50 | 207B | 3 2.1KB | 0 0B | 9 6.2KB | 0B | 2 30.5 + 0 | 7 3.4KB 0B 2 | 0.50 | 207B | 3 2.0KB | 0 0B | 9 6.2KB | 0B | 2 30.5 1 | 0 0B 0B 0 | 0.00 | 0B | 0 0B | 0 0B | 0 0B | 0B | 0 0.0 2 | 0 0B 0B 0 | 0.00 | 0B | 0 0B | 0 0B | 0 0B | 0B | 0 0.0 3 | 0 0B 0B 0 | 0.00 | 0B | 0 0B | 0 0B | 0 0B | 0B | 0 0.0 4 | 0 0B 0B 0 | 0.00 | 0B | 0 0B | 0 0B | 0 0B | 0B | 0 0.0 5 | 0 0B 0B 0 | 0.00 | 0B | 0 
0B | 0 0B | 0 0B | 0B | 0 0.0 - 6 | 4 2.9KB 41B 0 | - | 3.5KB | 1 716B | 0 0B | 3 2.2KB | 3.5KB | 1 0.6 -total | 11 6.4KB 41B 2 | - | 3.1KB | 4 2.8KB | 0 0B | 12 12KB | 3.5KB | 3 3.7 + 6 | 4 2.9KB 41B 0 | - | 3.5KB | 1 693B | 0 0B | 3 2.2KB | 3.5KB | 1 0.6 +total | 11 6.3KB 41B 2 | - | 3.0KB | 4 2.7KB | 0 0B | 12 11KB | 3.5KB | 3 3.8 ------------------------------------------------------------------------------------------------------------------- WAL: 1 files (58B) in: 223B written: 265B (19% overhead) Flushes: 9 -Compactions: 2 estimated debt: 6.4KB in progress: 0 (0B) +Compactions: 2 estimated debt: 6.3KB in progress: 0 (0B) default: 2 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 multi-level: 0 MemTables: 1 (1.0MB) zombie: 1 (1.0MB) Zombie tables: 0 (0B) Backing tables: 2 (1.3KB) Virtual tables: 2 (102B) -Block cache: 21 entries (4.1KB) hit rate: 0.0% -Table cache: 3 entries (2.4KB) hit rate: 0.0% +Block cache: 21 entries (4.0KB) hit rate: 0.0% +Table cache: 3 entries (2.3KB) hit rate: 0.0% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 Filter utility: 0.0% -Ingestions: 1 as flushable: 2 (2.1KB in 3 tables) +Ingestions: 1 as flushable: 2 (2.0KB in 3 tables) Iter category stats: b, latency: {BlockBytes:44 BlockBytesInCache:0 BlockReadDuration:10ms} c, non-latency: {BlockBytes:44 BlockBytesInCache:44 BlockReadDuration:0s} @@ -685,14 +685,14 @@ metrics zero-cache-hits-misses | | | | ingested | moved | written | | amp level | tables size val-bl vtables | score | in | tables size | tables size | tables size | read | r w ------+-----------------------------+-------+-------+--------------+--------------+--------------+-------+--------- - 0 | 0 0B 0B 0 | 0.00 | 207B | 3 2.1KB | 0 0B | 9 6.2KB | 0B | 0 30.5 + 0 | 0 0B 0B 0 | 0.00 | 207B | 3 2.0KB | 0 0B | 9 6.2KB | 0B | 0 30.5 1 | 0 0B 0B 0 | 0.00 | 0B | 0 0B | 0 0B | 0 0B | 0B | 0 0.0 2 | 0 0B 0B 0 | 0.00 | 0B | 0 0B | 0 0B | 0 0B | 0B | 0 0.0 3 | 0 0B 0B 0 | 0.00 | 0B 
| 0 0B | 0 0B | 0 0B | 0B | 0 0.0 4 | 0 0B 0B 0 | 0.00 | 0B | 0 0B | 0 0B | 0 0B | 0B | 0 0.0 5 | 0 0B 0B 0 | 0.00 | 0B | 0 0B | 0 0B | 0 0B | 0B | 0 0.0 6 | 6 4.3KB 41B 0 | - | 7.0KB | 2 1.4KB | 0 0B | 4 2.9KB | 7.0KB | 1 0.4 -total | 6 4.3KB 41B 0 | - | 3.8KB | 5 3.5KB | 0 0B | 13 13KB | 7.0KB | 1 3.4 +total | 6 4.3KB 41B 0 | - | 3.6KB | 5 3.4KB | 0 0B | 13 13KB | 7.0KB | 1 3.5 ------------------------------------------------------------------------------------------------------------------- WAL: 1 files (58B) in: 223B written: 265B (19% overhead) Flushes: 9 @@ -708,7 +708,7 @@ Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 Filter utility: 0.0% -Ingestions: 2 as flushable: 2 (2.1KB in 3 tables) +Ingestions: 2 as flushable: 2 (2.0KB in 3 tables) Iter category stats: b, latency: {BlockBytes:44 BlockBytesInCache:0 BlockReadDuration:10ms} c, non-latency: {BlockBytes:44 BlockBytesInCache:44 BlockReadDuration:0s} diff --git a/tool/find.go b/tool/find.go index 3dfd70abda..ac042646b3 100644 --- a/tool/find.go +++ b/tool/find.go @@ -457,11 +457,12 @@ func (f *findT) searchTables(stdout io.Writer, searchKey []byte, refs []findRef) } defer r.Close() - if m != nil && m.SmallestSeqNum == m.LargestSeqNum { - r.Properties.GlobalSeqNum = m.LargestSeqNum + var transforms sstable.IterTransforms + if m != nil { + transforms = m.IterTransforms() } - iter, err := r.NewIter(nil, nil) + iter, err := r.NewIter(transforms, nil, nil) if err != nil { return err } @@ -472,7 +473,7 @@ func (f *findT) searchTables(stdout io.Writer, searchKey []byte, refs []findRef) // bit more work here to put them in a form that can be iterated in // parallel with the point records. 
rangeDelIter, err := func() (keyspan.FragmentIterator, error) { - iter, err := r.NewRawRangeDelIter() + iter, err := r.NewRawRangeDelIter(transforms) if err != nil { return nil, err } diff --git a/tool/sstable.go b/tool/sstable.go index 69263872eb..fba651c9b2 100644 --- a/tool/sstable.go +++ b/tool/sstable.go @@ -178,7 +178,7 @@ func (s *sstableT) runCheck(cmd *cobra.Command, args []string) { s.fmtKey.setForComparer(r.Properties.ComparerName, s.comparers) s.fmtValue.setForComparer(r.Properties.ComparerName, s.comparers) - iter, err := r.NewIter(nil, nil) + iter, err := r.NewIter(sstable.NoTransforms, nil, nil) if err != nil { fmt.Fprintf(stderr, "%s\n", err) return @@ -189,7 +189,7 @@ func (s *sstableT) runCheck(cmd *cobra.Command, args []string) { var prefixIter sstable.Iterator if r.Split != nil { var err error - prefixIter, err = r.NewIter(nil, nil) + prefixIter, err = r.NewIter(sstable.NoTransforms, nil, nil) if err != nil { fmt.Fprintf(stderr, "%s\n", err) return @@ -331,7 +331,6 @@ func (s *sstableT) runProperties(cmd *cobra.Command, args []string) { fmt.Fprintf(tw, " range-key-unset\t%d\n", r.Properties.NumRangeKeyUnsets) fmt.Fprintf(tw, " range-key-delete\t%d\n", r.Properties.NumRangeKeyDels) fmt.Fprintf(tw, " merge\t%d\n", r.Properties.NumMergeOperands) - fmt.Fprintf(tw, " global-seq-num\t%d\n", r.Properties.GlobalSeqNum) fmt.Fprintf(tw, " pinned\t%d\n", r.Properties.SnapshotPinnedKeys) fmt.Fprintf(tw, "index\t\n") fmt.Fprintf(tw, " key\t") @@ -386,7 +385,7 @@ func (s *sstableT) runScan(cmd *cobra.Command, args []string) { s.fmtKey.setForComparer(r.Properties.ComparerName, s.comparers) s.fmtValue.setForComparer(r.Properties.ComparerName, s.comparers) - iter, err := r.NewIter(nil, s.end) + iter, err := r.NewIter(sstable.NoTransforms, nil, s.end) if err != nil { fmt.Fprintf(stderr, "%s%s\n", prefix, err) return @@ -399,7 +398,7 @@ func (s *sstableT) runScan(cmd *cobra.Command, args []string) { // bit more work here to put them in a form that can be iterated 
in // parallel with the point records. rangeDelIter, err := func() (keyspan.FragmentIterator, error) { - iter, err := r.NewRawRangeDelIter() + iter, err := r.NewRawRangeDelIter(sstable.NoTransforms) if err != nil { return nil, err } @@ -501,7 +500,7 @@ func (s *sstableT) runScan(cmd *cobra.Command, args []string) { } // Handle range keys. - rkIter, err := r.NewRawRangeKeyIter() + rkIter, err := r.NewRawRangeKeyIter(sstable.NoTransforms) if err != nil { fmt.Fprintf(stdout, "%s\n", err) os.Exit(1) diff --git a/tool/testdata/sstable_layout b/tool/testdata/sstable_layout index 58da2056e2..90373aea63 100644 --- a/tool/testdata/sstable_layout +++ b/tool/testdata/sstable_layout @@ -22,10 +22,10 @@ h.sst 13752 data (156) 13913 index (245) 14163 range-del (421) - 14589 properties (536) - 15130 meta-index (61) - 15196 footer (53) - 15249 EOF + 14589 properties (513) + 15107 meta-index (61) + 15173 footer (53) + 15226 EOF sstable layout ../sstable/testdata/h.table-bloom.no-compression.sst @@ -48,10 +48,10 @@ h.table-bloom.no-compression.sst 26799 filter (2245) 29049 index (325) 29379 range-del (421) - 29805 properties (580) - 30390 meta-index (112) - 30507 footer (53) - 30560 EOF + 29805 properties (557) + 30367 meta-index (112) + 30484 footer (53) + 30537 EOF sstable layout ../sstable/testdata/h.no-compression.two_level_index.sst @@ -76,10 +76,10 @@ h.no-compression.two_level_index.sst 27047 index (95) 27147 top-index (70) 27222 range-del (421) - 27648 properties (582) - 28235 meta-index (63) - 28303 footer (53) - 28356 EOF + 27648 properties (559) + 28212 meta-index (63) + 28280 footer (53) + 28333 EOF sstable layout -v @@ -3744,7 +3744,7 @@ h.no-compression.two_level_index.sst 27631 [restart 27523] 27635 [restart 27546] 27643 [trailer compression=none checksum=0xb93b31c5] - 27648 properties (582) + 27648 properties (559) 27648 rocksdb.block.based.table.index.type (43) [restart] 27691 rocksdb.block.based.table.prefix.filtering (20) 27711 
rocksdb.block.based.table.whole.key.filtering (23) @@ -3753,37 +3753,36 @@ h.no-compression.two_level_index.sst 27796 rocksdb.compression_options (106) 27902 rocksdb.data.size (15) 27917 rocksdb.deleted.keys (15) - 27932 rocksdb.external_sst_file.global_seqno (41) - 27973 rocksdb.external_sst_file.version (14) - 27987 rocksdb.filter.size (15) - 28002 rocksdb.index.partitions (20) - 28022 rocksdb.index.size (9) - 28031 rocksdb.merge.operands (18) - 28049 rocksdb.merge.operator (13) - 28062 rocksdb.num.data.blocks (19) - 28081 rocksdb.num.entries (12) - 28093 rocksdb.num.range-deletions (19) - 28112 rocksdb.prefix.extractor.name (31) - 28143 rocksdb.property.collectors (22) - 28165 rocksdb.raw.key.size (18) - 28183 rocksdb.raw.value.size (15) - 28198 rocksdb.top-level.index.size (24) - 28222 [restart 27648] - 28230 [trailer compression=none checksum=0x9414996] - 28235 meta-index (63) - 28235 rocksdb.properties block:27648/582 [restart] - 28261 rocksdb.range_del block:27222/421 [restart] - 28286 [restart 28235] - 28290 [restart 28261] - 28298 [trailer compression=none checksum=0x9e135f62] - 28303 footer (53) - 28303 checksum type: crc32c - 28304 meta: offset=28235, length=63 - 28308 index: offset=27147, length=70 - 28312 [padding] - 28344 version: 1 - 28348 magic number: 0xf09faab3f09faab3 - 28356 EOF + 27932 rocksdb.external_sst_file.version (32) + 27964 rocksdb.filter.size (15) + 27979 rocksdb.index.partitions (20) + 27999 rocksdb.index.size (9) + 28008 rocksdb.merge.operands (18) + 28026 rocksdb.merge.operator (13) + 28039 rocksdb.num.data.blocks (19) + 28058 rocksdb.num.entries (12) + 28070 rocksdb.num.range-deletions (19) + 28089 rocksdb.prefix.extractor.name (31) + 28120 rocksdb.property.collectors (22) + 28142 rocksdb.raw.key.size (18) + 28160 rocksdb.raw.value.size (15) + 28175 rocksdb.top-level.index.size (24) + 28199 [restart 27648] + 28207 [trailer compression=none checksum=0x2d96f92a] + 28212 meta-index (63) + 28212 rocksdb.properties block:27648/559 
[restart] + 28238 rocksdb.range_del block:27222/421 [restart] + 28263 [restart 28212] + 28267 [restart 28238] + 28275 [trailer compression=none checksum=0xf7259c47] + 28280 footer (53) + 28280 checksum type: crc32c + 28281 meta: offset=28212, length=63 + 28285 index: offset=27147, length=70 + 28289 [padding] + 28321 version: 1 + 28325 magic number: 0xf09faab3f09faab3 + 28333 EOF sstable layout -v diff --git a/tool/testdata/sstable_properties b/tool/testdata/sstable_properties index e0735ae54b..810db1311e 100644 Binary files a/tool/testdata/sstable_properties and b/tool/testdata/sstable_properties differ