diff --git a/compaction.go b/compaction.go index 5cee48d9ee..8e81724bd9 100644 --- a/compaction.go +++ b/compaction.go @@ -1415,6 +1415,10 @@ func (c *compaction) newInputIter( iterOpts := IterOptions{logger: c.logger} // TODO(bananabrick): Get rid of the extra manifest.Level parameter and fold it into // compactionLevel. + // + // TODO(bilal): when we start using strict obsolete sstables for L5 and L6 + // in disaggregated storage, and rely on the obsolete bit, we will also need + // to configure the levelIter at these levels to hide the obsolete points. addItersForLevel := func(level *compactionLevel, l manifest.Level) error { iters = append(iters, newLevelIter(iterOpts, c.cmp, nil /* split */, newIters, level.files.Iter(), l, &c.bytesIterated)) @@ -3235,7 +3239,10 @@ func (d *DB) runCompaction( return nil, pendingOutputs, stats, err } } - if err := tw.Add(*key, val); err != nil { + // iter.snapshotPinned is broader than whether the point was covered by + // a RANGEDEL, but it is harmless to pass true when the callee will also + // independently discover that the point is obsolete. + if err := tw.AddWithForceObsolete(*key, val, iter.snapshotPinned); err != nil { return nil, pendingOutputs, stats, err } if iter.snapshotPinned { diff --git a/compaction_iter.go b/compaction_iter.go index b013036b6b..6623e9343e 100644 --- a/compaction_iter.go +++ b/compaction_iter.go @@ -383,6 +383,13 @@ func (i *compactionIter) Next() (*InternalKey, []byte) { } else if cover == keyspan.CoversInvisibly { // i.iterKey would be deleted by a range deletion if there weren't // any open snapshots. Mark it as pinned. + // + // TODO(sumeer): there are multiple places in this file where we call + // i.rangeDelFrag.Covers and this is the only one where we are fiddling + // with i.snapshotPinned. i.snapshotPinned was previously being used + // only for stats, where a mistake does not lead to corruption. But it + // is also now being used for the forceObsolete bit in + // Writer.AddWithForceObsolete(). Give this more scrutiny. i.snapshotPinned = true } diff --git a/db.go b/db.go index b4173ffe01..cd59a4cf82 100644 --- a/db.go +++ b/db.go @@ -1362,6 +1362,7 @@ func (i *Iterator) constructPointIter( levelsIndex := len(levels) mlevels = mlevels[:numMergingLevels] levels = levels[:numLevelIters] + i.opts.snapshotForHideObsoletePoints = buf.dbi.seqNum addLevelIterForFiles := func(files manifest.LevelIterator, level manifest.Level) { li := &levels[levelsIndex] diff --git a/external_iterator.go b/external_iterator.go index 3c45c2cd48..8fbef76e72 100644 --- a/external_iterator.go +++ b/external_iterator.go @@ -209,15 +209,15 @@ func createExternalPointIter(ctx context.Context, it *Iterator) (internalIterato pointIter internalIterator err error ) - pointIter, err = r.NewIterWithBlockPropertyFiltersAndContext( - ctx, - it.opts.LowerBound, - it.opts.UpperBound, - nil, /* BlockPropertiesFilterer */ - false, /* useFilterBlock */ - &it.stats.InternalStats, - sstable.TrivialReaderProvider{Reader: r}, - ) + // We could set hideObsoletePoints=true, since we are reading at + // InternalKeySeqNumMax, but we don't bother since these sstables should + // not have obsolete points (so the performance optimization is + // unnecessary), and we don't want to bother constructing a + // BlockPropertiesFilterer that includes obsoleteKeyBlockPropertyFilter. + pointIter, err = r.NewIterWithBlockPropertyFiltersAndContextEtc( + ctx, it.opts.LowerBound, it.opts.UpperBound, nil, /* BlockPropertiesFilterer */ + false /* hideObsoletePoints */, false, /* useFilterBlock */ + &it.stats.InternalStats, sstable.TrivialReaderProvider{Reader: r}) if err != nil { return nil, err } diff --git a/get_iter.go b/get_iter.go index f1fc25caed..0ac0484e50 100644 --- a/get_iter.go +++ b/get_iter.go @@ -158,7 +158,7 @@ func (g *getIter) Next() (*InternalKey, base.LazyValue) { if n := len(g.l0); n > 0 { files := g.l0[n-1].Iter() g.l0 = g.l0[:n-1] - iterOpts := IterOptions{logger: g.logger} + iterOpts := IterOptions{logger: g.logger, snapshotForHideObsoletePoints: g.snapshot} g.levelIter.init(context.Background(), iterOpts, g.cmp, nil /* split */, g.newIters, files, manifest.L0Sublevel(n), internalIterOpts{}) g.levelIter.initRangeDel(&g.rangeDelIter) @@ -177,7 +177,7 @@ func (g *getIter) Next() (*InternalKey, base.LazyValue) { continue } - iterOpts := IterOptions{logger: g.logger} + iterOpts := IterOptions{logger: g.logger, snapshotForHideObsoletePoints: g.snapshot} g.levelIter.init(context.Background(), iterOpts, g.cmp, nil /* split */, g.newIters, g.version.Levels[g.level].Iter(), manifest.Level(g.level), internalIterOpts{}) g.levelIter.initRangeDel(&g.rangeDelIter) diff --git a/internal/base/internal.go b/internal/base/internal.go index 6f954bb851..082307a35b 100644 --- a/internal/base/internal.go +++ b/internal/base/internal.go @@ -24,6 +24,14 @@ const ( //InternalKeyKindColumnFamilyDeletion InternalKeyKind = 4 //InternalKeyKindColumnFamilyValue InternalKeyKind = 5 //InternalKeyKindColumnFamilyMerge InternalKeyKind = 6 + + // InternalKeyKindSingleDelete (SINGLEDEL) is a performance optimization + // solely for compactions (to reduce write amp and space amp). Readers other + // than compactions should treat SINGLEDEL as equivalent to a DEL. + // Historically, it was simpler for readers other than compactions to treat + // SINGLEDEL as equivalent to DEL, but as of the introduction of + // InternalKeyKindSSTableInternalObsoleteBit, this is also necessary for + // correctness. InternalKeyKindSingleDelete InternalKeyKind = 7 //InternalKeyKindColumnFamilySingleDelete InternalKeyKind = 8 //InternalKeyKindBeginPrepareXID InternalKeyKind = 9 @@ -71,7 +79,7 @@ const ( // value indicating the (len(key)+len(value)) of the shadowed entry the // tombstone is expected to delete. This value is used to inform compaction // heuristics, but is not required to be accurate for correctness. - InternalKeyKindDeleteSized = 23 + InternalKeyKindDeleteSized InternalKeyKind = 23 // This maximum value isn't part of the file format. Future extensions may // increase this value. @@ -84,12 +92,17 @@ const ( // seqNum. InternalKeyKindMax InternalKeyKind = 23 + // Internal to the sstable format. Not exposed by any sstable iterator. + // Declared here to prevent definition of valid key kinds that set this bit. + InternalKeyKindSSTableInternalObsoleteBit InternalKeyKind = 64 + InternalKeyKindSSTableInternalObsoleteMask InternalKeyKind = 191 + // InternalKeyZeroSeqnumMaxTrailer is the largest trailer with a // zero sequence number. - InternalKeyZeroSeqnumMaxTrailer = uint64(InternalKeyKindInvalid) + InternalKeyZeroSeqnumMaxTrailer = uint64(255) // A marker for an invalid key. - InternalKeyKindInvalid InternalKeyKind = 255 + InternalKeyKindInvalid InternalKeyKind = InternalKeyKindSSTableInternalObsoleteMask // InternalKeySeqNumBatch is a bit that is set on batch sequence numbers // which prevents those entries from being excluded from iteration. @@ -112,6 +125,9 @@ const ( InternalKeyBoundaryRangeKey = (InternalKeySeqNumMax << 8) | uint64(InternalKeyKindRangeKeySet) ) +// Assert InternalKeyKindSSTableInternalObsoleteBit > InternalKeyKindMax +const _ = uint(InternalKeyKindSSTableInternalObsoleteBit - InternalKeyKindMax - 1) + var internalKeyKindNames = []string{ InternalKeyKindDelete: "DEL", InternalKeyKindSet: "SET", diff --git a/iterator.go b/iterator.go index 996817c78b..9c1ff3eea9 100644 --- a/iterator.go +++ b/iterator.go @@ -569,6 +569,9 @@ func (i *Iterator) findNextEntry(limit []byte) { return case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized: + // NB: treating InternalKeyKindSingleDelete as equivalent to DEL is not + // only simpler, but is also necessary for correctness due to + // InternalKeyKindSSTableInternalObsoleteBit. i.nextUserKey() continue @@ -632,6 +635,9 @@ func (i *Iterator) nextPointCurrentUserKey() bool { return false case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized: + // NB: treating InternalKeyKindSingleDelete as equivalent to DEL is not + // only simpler, but is also necessary for correctness due to + // InternalKeyKindSSTableInternalObsoleteBit. return false case InternalKeyKindSet, InternalKeyKindSetWithDelete: @@ -1095,6 +1101,10 @@ func (i *Iterator) mergeNext(key InternalKey, valueMerger ValueMerger) { case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized: // We've hit a deletion tombstone. Return everything up to this // point. + // + // NB: treating InternalKeyKindSingleDelete as equivalent to DEL is not + // only simpler, but is also necessary for correctness due to + // InternalKeyKindSSTableInternalObsoleteBit. return case InternalKeyKindSet, InternalKeyKindSetWithDelete: diff --git a/level_iter.go b/level_iter.go index 10ee903338..68e2e338a1 100644 --- a/level_iter.go +++ b/level_iter.go @@ -200,6 +200,11 @@ type levelIter struct { // cache when constructing new table iterators. internalOpts internalIterOpts + // Scratch space for the obsolete keys filter, when there are no other block + // property filters specified. See the performance note where + // IterOptions.PointKeyFilters is declared. + filtersBuf [1]BlockPropertyFilter + // Disable invariant checks even if they are otherwise enabled. Used by tests // which construct "impossible" situations (e.g. seeking to a key before the // lower bound). @@ -267,8 +272,12 @@ func (l *levelIter) init( l.upper = opts.UpperBound l.tableOpts.TableFilter = opts.TableFilter l.tableOpts.PointKeyFilters = opts.PointKeyFilters + if len(opts.PointKeyFilters) == 0 { + l.tableOpts.PointKeyFilters = l.filtersBuf[:0:1] + } l.tableOpts.UseL6Filters = opts.UseL6Filters l.tableOpts.level = l.level + l.tableOpts.snapshotForHideObsoletePoints = opts.snapshotForHideObsoletePoints l.cmp = cmp l.split = split l.iterFile = nil diff --git a/level_iter_test.go b/level_iter_test.go index af65b943b4..d2ec77b80b 100644 --- a/level_iter_test.go +++ b/level_iter_test.go @@ -163,8 +163,8 @@ func (lt *levelIterTest) newIters( ctx context.Context, file *manifest.FileMetadata, opts *IterOptions, iio internalIterOpts, ) (internalIterator, keyspan.FragmentIterator, error) { lt.itersCreated++ - iter, err := lt.readers[file.FileNum].NewIterWithBlockPropertyFiltersAndContext( - ctx, opts.LowerBound, opts.UpperBound, nil, true, iio.stats, + iter, err := lt.readers[file.FileNum].NewIterWithBlockPropertyFiltersAndContextEtc( + ctx, opts.LowerBound, opts.UpperBound, nil, false, true, iio.stats, sstable.TrivialReaderProvider{Reader: lt.readers[file.FileNum]}) if err != nil { return nil, nil, err diff --git a/options.go b/options.go index c415f77fa3..b30c19cb12 100644 --- a/options.go +++ b/options.go @@ -119,10 +119,13 @@ type IterOptions struct { // function can be used by multiple iterators, if the iterator is cloned. TableFilter func(userProps map[string]string) bool // PointKeyFilters can be used to avoid scanning tables and blocks in tables - // when iterating over point keys. It is requires that this slice is sorted in - // increasing order of the BlockPropertyFilter.ShortID. This slice represents - // an intersection across all filters, i.e., all filters must indicate that the - // block is relevant. + // when iterating over point keys. This slice represents an intersection + // across all filters, i.e., all filters must indicate that the block is + // relevant. + // + // Performance note: When len(PointKeyFilters) > 0, the caller should ensure + // that cap(PointKeyFilters) is at least len(PointKeyFilters)+1. This helps + // avoid allocations in Pebble internal code that mutates the slice. PointKeyFilters []BlockPropertyFilter // RangeKeyFilters can be usefd to avoid scanning tables and blocks in tables // when iterating over range keys. The same requirements that apply to @@ -181,6 +184,10 @@ type IterOptions struct { level manifest.Level // disableLazyCombinedIteration is an internal testing option. disableLazyCombinedIteration bool + // snapshotForHideObsoletePoints is specified for/by levelIter when opening + // files and is used to decide whether to hide obsolete points. A value of 0 + // implies obsolete points should not be hidden. + snapshotForHideObsoletePoints uint64 // NB: If adding new Options, you must account for them in iterator // construction and Iterator.SetOptions. @@ -632,18 +639,24 @@ type Options struct { ShortAttributeExtractor ShortAttributeExtractor // RequiredInPlaceValueBound specifies an optional span of user key - // prefixes for which the values must be stored with the key. This is - // useful for statically known exclusions to value separation. In - // CockroachDB, this will be used for the lock table key space that has - // non-empty suffixes, but those locks don't represent actual MVCC - // versions (the suffix ordering is arbitrary). We will also need to add - // support for dynamically configured exclusions (we want the default to - // be to allow Pebble to decide whether to separate the value or not, - // hence this is structured as exclusions), for example, for users of - // CockroachDB to dynamically exclude certain tables. + // prefixes that are not-MVCC, but have a suffix. For these the values + // must be stored with the key, since the concept of "older versions" is + // not defined. It is also useful for statically known exclusions to value + // separation. In CockroachDB, this will be used for the lock table key + // space that has non-empty suffixes, but those locks don't represent + // actual MVCC versions (the suffix ordering is arbitrary). We will also + // need to add support for dynamically configured exclusions (we want the + // default to be to allow Pebble to decide whether to separate the value + // or not, hence this is structured as exclusions), for example, for users + // of CockroachDB to dynamically exclude certain tables. // // Any change in exclusion behavior takes effect only on future written // sstables, and does not start rewriting existing sstables. + // + // Even ignoring changes in this setting, exclusions are interpreted as a + // guidance by Pebble, and not necessarily honored. Specifically, user + // keys with multiple Pebble-versions *may* have the older versions stored + // in value blocks. RequiredInPlaceValueBound UserKeyPrefixBound // DisableIngestAsFlushable disables lazy ingestion of sstables through @@ -1642,6 +1655,9 @@ func (o *Options) MakeWriterOptions(level int, format sstable.TableFormat) sstab if format >= sstable.TableFormatPebblev3 { writerOpts.ShortAttributeExtractor = o.Experimental.ShortAttributeExtractor writerOpts.RequiredInPlaceValueBound = o.Experimental.RequiredInPlaceValueBound + if format >= sstable.TableFormatPebblev4 && level == numLevels-1 { + writerOpts.WritingToLowestLevel = true + } } levelOpts := o.Level(level) writerOpts.BlockRestartInterval = levelOpts.BlockRestartInterval diff --git a/scan_internal.go b/scan_internal.go index beee42004f..7f7a2378c5 100644 --- a/scan_internal.go +++ b/scan_internal.go @@ -977,6 +977,7 @@ func (i *scanInternalIterator) constructPointIter(memtables flushableList, buf * mlevels = mlevels[:numMergingLevels] levels = levels[:numLevelIters] rangeDelLevels = rangeDelLevels[:numLevelIters] + i.opts.IterOptions.snapshotForHideObsoletePoints = i.seqNum addLevelIterForFiles := func(files manifest.LevelIterator, level manifest.Level) { li := &levels[levelsIndex] rli := &rangeDelLevels[levelsIndex] diff --git a/sstable/block.go b/sstable/block.go index 40ad4ec1bc..f7fb628b83 100644 --- a/sstable/block.go +++ b/sstable/block.go @@ -47,7 +47,10 @@ type blockWriter struct { // ensuring that any of the keys with the same prefix can be used to // assemble the full key when the prefix does change. restarts []uint32 - curKey []byte + // Do not read curKey directly from outside blockWriter since it can have + // the InternalKeyKindSSTableInternalObsoleteBit set. Use getCurKey() or + // getCurUserKey() instead. + curKey []byte // curValue excludes the optional prefix provided to // storeWithOptionalValuePrefix. curValue []byte @@ -79,6 +82,20 @@ const setHasSameKeyPrefixRestartMask uint32 = 1 << 31 const restartMaskLittleEndianHighByteWithoutSetHasSamePrefix byte = 0b0111_1111 const restartMaskLittleEndianHighByteOnlySetHasSamePrefix byte = 0b1000_0000 +func (w *blockWriter) getCurKey() InternalKey { + k := base.DecodeInternalKey(w.curKey) + k.Trailer = k.Trailer & trailerObsoleteMask + return k +} + +func (w *blockWriter) getCurUserKey() []byte { + n := len(w.curKey) - base.InternalTrailerLen + if n < 0 { + panic("corrupt key in blockWriter buffer") + } + return w.curKey[:n:n] +} + // If !addValuePrefix, the valuePrefix is ignored. func (w *blockWriter) storeWithOptionalValuePrefix( keySize int, @@ -191,12 +208,21 @@ func (w *blockWriter) storeWithOptionalValuePrefix( func (w *blockWriter) add(key InternalKey, value []byte) { w.addWithOptionalValuePrefix( - key, value, len(key.UserKey), false, 0, false) + key, false, value, len(key.UserKey), false, 0, false) } // Callers that always set addValuePrefix to false should use add() instead. +// +// isObsolete indicates whether this key-value pair is obsolete in this +// sstable (only applicable when writing data blocks) -- see the comment in +// table.go and the longer one in format.go. addValuePrefix adds a 1 byte +// prefix to the value, specified in valuePrefix -- this is used for data +// blocks in TableFormatPebblev3 onwards for SETs (see the comment in +// format.go, with more details in value_block.go). setHasSameKeyPrefix is +// also used in TableFormatPebblev3 onwards for SETs. func (w *blockWriter) addWithOptionalValuePrefix( key InternalKey, + isObsolete bool, value []byte, maxSharedKeyLen int, addValuePrefix bool, @@ -210,6 +236,9 @@ func (w *blockWriter) addWithOptionalValuePrefix( w.curKey = make([]byte, 0, size*2) } w.curKey = w.curKey[:size] + if isObsolete { + key.Trailer = key.Trailer | trailerObsoleteBit + } key.Encode(w.curKey) w.storeWithOptionalValuePrefix( @@ -381,6 +410,7 @@ type blockIter struct { vbr *valueBlockReader hasValuePrefix bool } + hideObsoletePoints bool } // blockIter implements the base.InternalIterator interface. @@ -388,14 +418,16 @@ var _ base.InternalIterator = (*blockIter)(nil) func newBlockIter(cmp Compare, block block) (*blockIter, error) { i := &blockIter{} - return i, i.init(cmp, block, 0) + return i, i.init(cmp, block, 0, false) } func (i *blockIter) String() string { return "block" } -func (i *blockIter) init(cmp Compare, block block, globalSeqNum uint64) error { +func (i *blockIter) init( + cmp Compare, block block, globalSeqNum uint64, hideObsoletePoints bool, +) error { numRestarts := int32(binary.LittleEndian.Uint32(block[len(block)-4:])) if numRestarts == 0 { return base.CorruptionErrorf("pebble/table: invalid table (block has no restart points)") @@ -408,6 +440,7 @@ func (i *blockIter) init(cmp Compare, block block, globalSeqNum uint64) error { i.data = block i.fullKey = i.fullKey[:0] i.val = nil + i.hideObsoletePoints = hideObsoletePoints i.clearCache() if i.restarts > 0 { if err := i.readFirstKey(); err != nil { @@ -420,10 +453,16 @@ func (i *blockIter) init(cmp Compare, block block, globalSeqNum uint64) error { return nil } -func (i *blockIter) initHandle(cmp Compare, block cache.Handle, globalSeqNum uint64) error { +// NB: two cases of hideObsoletePoints: +// - Local sstable iteration: globalSeqNum will be set iff the sstable was +// ingested. +// - Foreign sstable iteration: globalSeqNum is always set. +func (i *blockIter) initHandle( + cmp Compare, block cache.Handle, globalSeqNum uint64, hideObsoletePoints bool, +) error { i.cacheHandle.Release() i.cacheHandle = block - return i.init(cmp, block.Get(), globalSeqNum) + return i.init(cmp, block.Get(), globalSeqNum, hideObsoletePoints) } func (i *blockIter) invalidate() { @@ -590,7 +629,9 @@ func (i *blockIter) readFirstKey() error { // Manually inlining base.DecodeInternalKey provides a 5-10% speedup on // BlockIter benchmarks. if n := len(firstKey) - 8; n >= 0 { - i.firstKey.Trailer = binary.LittleEndian.Uint64(firstKey[n:]) + // NB: we do not track whether the firstKey is obsolete since the trailer + // of the firstKey is not used. + i.firstKey.Trailer = binary.LittleEndian.Uint64(firstKey[n:]) & trailerObsoleteMask i.firstKey.UserKey = firstKey[:n:n] if i.globalSeqNum != 0 { i.firstKey.SetSeqNum(i.globalSeqNum) @@ -603,11 +644,19 @@ func (i *blockIter) readFirstKey() error { return nil } -func (i *blockIter) decodeInternalKey(key []byte) { +// The sstable internal obsolete bit is set when writing a block and unset by +// blockIter, so no code outside block writing/reading code ever sees it. +const trailerObsoleteBit = uint64(base.InternalKeyKindSSTableInternalObsoleteBit) +const trailerObsoleteMask = (InternalKeySeqNumMax << 8) | uint64(base.InternalKeyKindSSTableInternalObsoleteMask) + +func (i *blockIter) decodeInternalKey(key []byte) (hiddenPoint bool) { // Manually inlining base.DecodeInternalKey provides a 5-10% speedup on // BlockIter benchmarks. if n := len(key) - 8; n >= 0 { - i.ikey.Trailer = binary.LittleEndian.Uint64(key[n:]) + trailer := binary.LittleEndian.Uint64(key[n:]) + hiddenPoint = i.hideObsoletePoints && + (trailer&trailerObsoleteBit != 0) + i.ikey.Trailer = trailer & trailerObsoleteMask i.ikey.UserKey = key[:n:n] if i.globalSeqNum != 0 { i.ikey.SetSeqNum(i.globalSeqNum) @@ -616,6 +665,7 @@ func (i *blockIter) decodeInternalKey(key []byte) { i.ikey.Trailer = uint64(InternalKeyKindInvalid) i.ikey.UserKey = nil } + return hiddenPoint } func (i *blockIter) clearCache() { @@ -640,13 +690,15 @@ func (i *blockIter) cacheEntry() { i.cachedBuf = append(i.cachedBuf, i.key...) } +func (i *blockIter) firstUserKey() []byte { + return i.firstKey.UserKey +} + // SeekGE implements internalIterator.SeekGE, as documented in the pebble // package. func (i *blockIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, base.LazyValue) { i.clearCache() - ikey := base.MakeSearchKey(key) - // Find the index of the smallest restart point whose key is > the key // sought; index will be numRestarts if there is no such restart point. i.offset = 0 @@ -700,23 +752,23 @@ func (i *blockIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, ba ptr = unsafe.Pointer(uintptr(ptr) + 5) } - // Manually inlining base.DecodeInternalKey provides a 5-10% speedup on - // BlockIter benchmarks. + // Manually inlining part of base.DecodeInternalKey provides a 5-10% + // speedup on BlockIter benchmarks. s := getBytes(ptr, int(v1)) - var k InternalKey + var k []byte if n := len(s) - 8; n >= 0 { - k.Trailer = binary.LittleEndian.Uint64(s[n:]) - k.UserKey = s[:n:n] - // NB: We can't have duplicate keys if the globalSeqNum != 0, so we - // leave the seqnum on this key as 0 as it won't affect our search - // since ikey has the maximum seqnum. - } else { - k.Trailer = uint64(InternalKeyKindInvalid) + k = s[:n:n] } + // Else k is invalid, and left as nil - if base.InternalCompare(i.cmp, ikey, k) >= 0 { + if i.cmp(key, k) > 0 { + // The search key is greater than the user key at this restart point. + // Search beyond this restart point, since we are trying to find the + // first restart point with a user key >= the search key. index = h + 1 // preserves f(i-1) == false } else { + // k >= search key, so prune everything after index (since index + // satisfies the property we are looking for). upper = h // preserves f(j) == true } } @@ -724,21 +776,25 @@ func (i *blockIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, ba // => answer is index. } - // Since keys are strictly increasing, if index > 0 then the restart point at - // index-1 will be the largest whose key is <= the key sought. If index == - // 0, then all keys in this block are larger than the key sought, and offset - // remains at zero. + // index is the first restart point with key >= search key. Define the keys + // between a restart point and the next restart point as belonging to that + // restart point. + // + // Since keys are strictly increasing, if index > 0 then the restart point + // at index-1 will be the first one that has some keys belonging to it that + // could be equal to the search key. If index == 0, then all keys in this + // block are larger than the key sought, and offset remains at zero. if index > 0 { i.offset = decodeRestart(i.data[i.restarts+4*(index-1):]) } i.readEntry() - i.decodeInternalKey(i.key) + hiddenPoint := i.decodeInternalKey(i.key) // Iterate from that restart point to somewhere >= the key sought. if !i.valid() { return nil, base.LazyValue{} } - if base.InternalCompare(i.cmp, i.ikey, ikey) >= 0 { + if !hiddenPoint && i.cmp(i.ikey.UserKey, key) >= 0 { // Initialize i.lazyValue if !i.lazyValueHandling.hasValuePrefix || base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet { @@ -751,7 +807,7 @@ func (i *blockIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, ba return &i.ikey, i.lazyValue } for i.Next(); i.valid(); i.Next() { - if base.InternalCompare(i.cmp, i.ikey, ikey) >= 0 { + if i.cmp(i.ikey.UserKey, key) >= 0 { // i.Next() has already initialized i.lazyValue. return &i.ikey, i.lazyValue } @@ -773,8 +829,6 @@ func (i *blockIter) SeekPrefixGE( func (i *blockIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, base.LazyValue) { i.clearCache() - ikey := base.MakeSearchKey(key) - // Find the index of the smallest restart point whose key is >= the key // sought; index will be numRestarts if there is no such restart point. i.offset = 0 @@ -828,23 +882,23 @@ func (i *blockIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, ba ptr = unsafe.Pointer(uintptr(ptr) + 5) } - // Manually inlining base.DecodeInternalKey provides a 5-10% speedup on - // BlockIter benchmarks. + // Manually inlining part of base.DecodeInternalKey provides a 5-10% + // speedup on BlockIter benchmarks. s := getBytes(ptr, int(v1)) - var k InternalKey + var k []byte if n := len(s) - 8; n >= 0 { - k.Trailer = binary.LittleEndian.Uint64(s[n:]) - k.UserKey = s[:n:n] - // NB: We can't have duplicate keys if the globalSeqNum != 0, so we - // leave the seqnum on this key as 0 as it won't affect our search - // since ikey has the maximum seqnum. - } else { - k.Trailer = uint64(InternalKeyKindInvalid) + k = s[:n:n] } + // Else k is invalid, and left as nil - if base.InternalCompare(i.cmp, ikey, k) > 0 { + if i.cmp(key, k) > 0 { + // The search key is greater than the user key at this restart point. + // Search beyond this restart point, since we are trying to find the + // first restart point with a user key >= the search key. index = h + 1 // preserves f(i-1) == false } else { + // k >= search key, so prune everything after index (since index + // satisfies the property we are looking for). upper = h // preserves f(j) == true } } @@ -852,8 +906,15 @@ func (i *blockIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, ba // => answer is index. } - // Since keys are strictly increasing, if index > 0 then the restart point at - // index-1 will be the largest whose key is < the key sought. + // index is the first restart point with key >= search key. Define the keys + // between a restart point and the next restart point as belonging to that + // restart point. Note that index could be equal to i.numRestarts, i.e., we + // are past the last restart. + // + // Since keys are strictly increasing, if index > 0 then the restart point + // at index-1 will be the first one that has some keys belonging to it that + // are less than the search key. If index == 0, then all keys in this block + // are larger than the search key, so there is no match. targetOffset := i.restarts if index > 0 { i.offset = decodeRestart(i.data[i.restarts+4*(index-1):]) @@ -876,24 +937,41 @@ func (i *blockIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, ba for { i.offset = i.nextOffset i.readEntry() - i.decodeInternalKey(i.key) - - if i.cmp(i.ikey.UserKey, ikey.UserKey) >= 0 { + // When hidden keys are common, there is additional optimization possible + // by not caching entries that are hidden (note that some calls to + // cacheEntry don't decode the internal key before caching, but checking + // whether a key is hidden does not require full decoding). However, we do + // need to use the blockEntry.offset in the cache for the first entry at + // the reset point to do the binary search when the cache is empty -- so + // we would need to cache that first entry (though not the key) even if + // was hidden. Our current assumption is that if there are large numbers + // of hidden keys we will be able to skip whole blocks (using block + // property filters) so we don't bother optimizing. + hiddenPoint := i.decodeInternalKey(i.key) + + // NB: we don't use the hiddenPoint return value of decodeInternalKey + // since we want to stop as soon as we reach a key >= ikey.UserKey, so + // that we can reverse. + if i.cmp(i.ikey.UserKey, key) >= 0 { // The current key is greater than or equal to our search key. Back up to // the previous key which was less than our search key. Note that this for // loop will execute at least once with this if-block not being true, so // the key we are backing up to is the last one this loop cached. - i.Prev() - // i.Prev() has already initialized i.lazyValue. - return &i.ikey, i.lazyValue + return i.Prev() } if i.nextOffset >= targetOffset { - // We've reached the end of the current restart block. Return the current - // key. When the restart interval is 1, the first iteration of the for - // loop will bring us here. In that case ikey is backed by the block so - // we get the desired key stability guarantee for the lifetime of the - // blockIter. + // We've reached the end of the current restart block. Return the + // current key if not hidden, else call Prev(). + // + // When the restart interval is 1, the first iteration of the for loop + // will bring us here. In that case ikey is backed by the block so we + // get the desired key stability guarantee for the lifetime of the + // blockIter. That is, we never cache anything and therefore never + // return a key backed by cachedBuf. + if hiddenPoint { + return i.Prev() + } break } @@ -923,7 +1001,10 @@ func (i *blockIter) First() (*InternalKey, base.LazyValue) { } i.clearCache() i.readEntry() - i.decodeInternalKey(i.key) + hiddenPoint := i.decodeInternalKey(i.key) + if hiddenPoint { + return i.Next() + } if !i.lazyValueHandling.hasValuePrefix || base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet { i.lazyValue = base.MakeInPlaceValue(i.val) @@ -958,7 +1039,10 @@ func (i *blockIter) Last() (*InternalKey, base.LazyValue) { i.readEntry() } - i.decodeInternalKey(i.key) + hiddenPoint := i.decodeInternalKey(i.key) + if hiddenPoint { + return i.Prev() + } if !i.lazyValueHandling.hasValuePrefix || base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet { i.lazyValue = base.MakeInPlaceValue(i.val) @@ -988,6 +1072,7 @@ func (i *blockIter) Next() (*InternalKey, base.LazyValue) { i.clearCache() } +start: i.offset = i.nextOffset if !i.valid() { return nil, base.LazyValue{} @@ -995,11 +1080,17 @@ func (i *blockIter) Next() (*InternalKey, base.LazyValue) { i.readEntry() // Manually inlined version of i.decodeInternalKey(i.key). if n := len(i.key) - 8; n >= 0 { - i.ikey.Trailer = binary.LittleEndian.Uint64(i.key[n:]) + trailer := binary.LittleEndian.Uint64(i.key[n:]) + hiddenPoint := i.hideObsoletePoints && + (trailer&trailerObsoleteBit != 0) + i.ikey.Trailer = trailer & trailerObsoleteMask i.ikey.UserKey = i.key[:n:n] if i.globalSeqNum != 0 { i.ikey.SetSeqNum(i.globalSeqNum) } + if hiddenPoint { + goto start + } } else { i.ikey.Trailer = uint64(InternalKeyKindInvalid) i.ikey.UserKey = nil @@ -1015,8 +1106,8 @@ func (i *blockIter) Next() (*InternalKey, base.LazyValue) { return &i.ikey, i.lazyValue } -// nextPrefix is used for implementing NPrefix. -func (i *blockIter) nextPrefix(succKey []byte) (*InternalKey, base.LazyValue) { +// NextPrefix implements (base.InternalIterator).NextPrefix. +func (i *blockIter) NextPrefix(succKey []byte) (*InternalKey, base.LazyValue) { if i.lazyValueHandling.hasValuePrefix { return i.nextPrefixV3(succKey) } @@ -1141,6 +1232,7 @@ func (i *blockIter) nextPrefixV3(succKey []byte) (*InternalKey, base.LazyValue) // The trailer is written in little endian, so the key kind is the first // byte in the trailer that is encoded in the slice [unshared-8:unshared]. keyKind := InternalKeyKind((*[manual.MaxArrayLen]byte)(ptr)[unshared-8]) + keyKind = keyKind & base.InternalKeyKindSSTableInternalObsoleteMask prefixChanged := false if keyKind == InternalKeyKindSet { if invariants.Enabled && value == 0 { @@ -1256,8 +1348,12 @@ func (i *blockIter) nextPrefixV3(succKey []byte) (*InternalKey, base.LazyValue) i.key = i.fullKey } // Manually inlined version of i.decodeInternalKey(i.key). + hiddenPoint := false if n := len(i.key) - 8; n >= 0 { - i.ikey.Trailer = binary.LittleEndian.Uint64(i.key[n:]) + trailer := binary.LittleEndian.Uint64(i.key[n:]) + hiddenPoint = i.hideObsoletePoints && + (trailer&trailerObsoleteBit != 0) + i.ikey.Trailer = trailer & trailerObsoleteMask i.ikey.UserKey = i.key[:n:n] if i.globalSeqNum != 0 { i.ikey.SetSeqNum(i.globalSeqNum) @@ -1273,6 +1369,9 @@ func (i *blockIter) nextPrefixV3(succKey []byte) (*InternalKey, base.LazyValue) } if prefixChanged || i.cmp(i.ikey.UserKey, succKey) >= 0 { // Prefix has changed. + if hiddenPoint { + return i.Next() + } if invariants.Enabled && !i.lazyValueHandling.hasValuePrefix { panic(errors.AssertionFailedf("nextPrefixV3 being run for non-v3 sstable")) } @@ -1294,23 +1393,11 @@ func (i *blockIter) nextPrefixV3(succKey []byte) (*InternalKey, base.LazyValue) return i.SeekGE(succKey, base.SeekGEFlagsNone) } -// NextPrefix implements (base.InternalIterator).NextPrefix -func (i *blockIter) NextPrefix(succKey []byte) (*InternalKey, base.LazyValue) { - const nextsBeforeSeek = 3 - k, v := i.Next() - for j := 1; k != nil && i.cmp(k.UserKey, succKey) < 0; j++ { - if j >= nextsBeforeSeek { - return i.SeekGE(succKey, base.SeekGEFlagsNone) - } - k, v = i.Next() - } - return k, v -} - // Prev implements internalIterator.Prev, as documented in the pebble // package. func (i *blockIter) Prev() (*InternalKey, base.LazyValue) { - if n := len(i.cached) - 1; n >= 0 { +start: + for n := len(i.cached) - 1; n >= 0; n-- { i.nextOffset = i.offset e := &i.cached[n] i.offset = e.offset @@ -1318,7 +1405,13 @@ func (i *blockIter) Prev() (*InternalKey, base.LazyValue) { // Manually inlined version of i.decodeInternalKey(i.key). i.key = i.cachedBuf[e.keyStart:e.keyEnd] if n := len(i.key) - 8; n >= 0 { - i.ikey.Trailer = binary.LittleEndian.Uint64(i.key[n:]) + trailer := binary.LittleEndian.Uint64(i.key[n:]) + hiddenPoint := i.hideObsoletePoints && + (trailer&trailerObsoleteBit != 0) + if hiddenPoint { + continue + } + i.ikey.Trailer = trailer & trailerObsoleteMask i.ikey.UserKey = i.key[:n:n] if i.globalSeqNum != 0 { i.ikey.SetSeqNum(i.globalSeqNum) @@ -1360,6 +1453,8 @@ func (i *blockIter) Prev() (*InternalKey, base.LazyValue) { // index ≤ h < upper offset := decodeRestart(i.data[i.restarts+4*h:]) if offset < targetOffset { + // Looking for the first restart that has offset >= targetOffset, so + // ignore h and earlier. index = h + 1 // preserves f(i-1) == false } else { upper = h // preserves f(j) == true @@ -1369,20 +1464,35 @@ func (i *blockIter) Prev() (*InternalKey, base.LazyValue) { // => answer is index. } + // index is first restart with offset >= targetOffset. Note that + // targetOffset may not be at a restart point since one can call Prev() + // after Next() (so the cache was not populated) and targetOffset refers to + // the current entry. index-1 must have an offset < targetOffset (it can't + // be equal to targetOffset since the binary search would have selected that + // as the index). i.offset = 0 if index > 0 { i.offset = decodeRestart(i.data[i.restarts+4*(index-1):]) } + // TODO(sumeer): why is the else case not an error given targetOffset is a + // valid offset. i.readEntry() + // We stop when i.nextOffset == targetOffset since the targetOffset is the + // entry we are stepping back from, and we don't need to cache the entry + // before it, since it is the candidate to return. for i.nextOffset < targetOffset { i.cacheEntry() i.offset = i.nextOffset i.readEntry() } - i.decodeInternalKey(i.key) + hiddenPoint := i.decodeInternalKey(i.key) + if hiddenPoint { + // Use the cache. + goto start + } if !i.lazyValueHandling.hasValuePrefix || base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet { i.lazyValue = base.MakeInPlaceValue(i.val) diff --git a/sstable/block_property.go b/sstable/block_property.go index 1e986fbc3e..9bec5294dc 100644 --- a/sstable/block_property.go +++ b/sstable/block_property.go @@ -117,7 +117,7 @@ type BlockPropertyCollector interface { } // SuffixReplaceableBlockCollector is an extension to the BlockPropertyCollector -// interface that allows a block property collector to indicate the it supports +// interface that allows a block property collector to indicate that it supports // being *updated* during suffix replacement, i.e. when an existing SST in which // all keys have the same key suffix is updated to have a new suffix. // diff --git a/sstable/block_property_test.go b/sstable/block_property_test.go index ece6884a52..e7298eccb7 100644 --- a/sstable/block_property_test.go +++ b/sstable/block_property_test.go @@ -1320,7 +1320,8 @@ func runBlockPropsCmd(r *Reader, td *datadriven.TestData) string { if err != nil { return err.Error() } - if err := subiter.init(r.Compare, subIndex.Get(), 0 /* globalSeqNum */); err != nil { + if err := subiter.init( + r.Compare, subIndex.Get(), 0 /* globalSeqNum */, false); err != nil { return err.Error() } for key, value := subiter.First(); key != nil; key, value = subiter.Next() { diff --git a/sstable/block_test.go b/sstable/block_test.go index d191247640..7d1aed6e9f 100644 --- a/sstable/block_test.go +++ b/sstable/block_test.go @@ -56,7 +56,7 @@ func TestBlockWriterWithPrefix(t *testing.T) { valuePrefix valuePrefix, setHasSameKeyPrefix bool) { w.addWithOptionalValuePrefix( - key, value, len(key.UserKey), addValuePrefix, valuePrefix, setHasSameKeyPrefix) + key, false, value, len(key.UserKey), addValuePrefix, valuePrefix, setHasSameKeyPrefix) } addAdapter( ikey("apple"), []byte("red"), false, 0, true) diff --git a/sstable/format.go b/sstable/format.go index e8c620028f..7b55cfeb3f 100644 --- a/sstable/format.go +++ b/sstable/format.go @@ -29,6 +29,161 @@ const ( TableFormatMax = TableFormatPebblev4 ) +// TableFormatPebblev4, in addition to DELSIZED, introduces the use of +// InternalKeyKindSSTableInternalObsoleteBit. +// +// 1. Motivation +// +// We have various related problems caused by Pebble snapshots: +// +// - P1: RANGEDELs that delete points in the same sstable, but the points +// happen to not get deleted during compactions because of an open snapshot. +// This causes very expensive iteration, that has been observed in +// production deployments +// +// - P2: When iterating over a foreign sstable (in disaggregated storage), we +// need to do (a) point collapsing to expose at most one point per user key, +// (b) apply RANGEDELs in the sstable to hide deleted points in the same +// sstable. This per-sstable point collapsing iteration needs to be very +// efficient (ideally as efficient from a CPU perspective as iteration over +// regular sstables) since foreign sstables can be very long-lived -- one of +// the goals of disaggregated storage is to scale compute and disk bandwidth +// resources as a function of the hot (from a write perspective) data and +// not the whole data, so we don't want to have to rewrite foreign sstables +// solely to improve read performance. +// +// The ideal solution for P2 would allow user-facing reads to utilize the +// existing SST iterators (with slight modifications) and with no loss of +// efficiency. And for P1 and P2 we would like to skip whole blocks of +// overwritten/deleted points. Even when we can't skip whole blocks, avoiding +// key comparisons at iteration time to discover what points are deleted is +// very desirable, since keys can be long. +// +// We observe that: +// +// - Reads: +// - All user-facing reads in CockroachDB use iterators over the DB, hence +// have a higher read seqnum than all sstables (there are some rare cases +// that can violate this, but those are not important from a performance +// optimization perspective). +// +// - Certain internal-facing reads in CockroachDB use snapshots, but the +// snapshots are shortlived enough that most L5 and L6 sstables will have +// all seqnums lower than the snapshot seqnum. +// +// - Writes: +// - We already do key comparisons between points when writing the sstable +// to ensure that the sstable invariant (monotonically increasing internal +// keys) is not violated. So we know which points share the same userkey, +// and thereby which points are obsolete because there is a more recent +// point in the same sstable. +// +// - The compactionIter knows which point id deleted by a RANGEDEL even if +// the point does need to be written because of a snapshot. +// +// So this known information can be encoded in the sstable at write time and +// utilized for optimized reading. +// +// 2. Solution +// +// We scope the solution to the following point kinds: SET, SETWITHDEL, DEL, +// DELSIZED, SINGLEDEL. These are the only ones marked locally obsolete, i.e., +// obsolete within the sstable. We never mark MERGE keys as obsolete since +// there is additional complexity as the values need to be merged, and there +// isn't a real requirement to support marking them as obsolete. Specifically, +// +// - For regular sst iteration, the obsolete marking is a performance +// optimization, and MERGE keys can be handled by higher layers in the +// iterator tree (specifically pebble.Iterator). +// +// - For foreign sst iteration, we disallow MERGEs to be written to such +// shared ssts (details below). +// +// The supported key kinds are marked with an obsolete bit +// (InternalKeyKindSSTableInternalObsoleteBit) when the key-value pair is +// obsolete. This marking is done within blockWriter, based on information +// passed to it by Writer. In turn, Writer uses a combination of key +// comparisons, and information provided by compactionIter to decide whether a +// key-value pair is obsolete. Additionally, a Pebble-internal BlockPropertyCollector +// (obsoleteKeyBlockPropertyCollector) is used to mark blocks where all key-value +// pairs are obsolete. Since the common case is non-obsolete blocks, this block +// property collector uses the empty byte slice to represent a non-obsolete block, +// which consumes no space in BlockHandleWithProperties.Props. +// +// At read time, the obsolete bit is only visible to the blockIter, which can +// be optionally configured to hide obsolete points. This hiding is only +// configured for data block iterators for sstables being read by user-facing +// iterators at a seqnum greater than the max seqnum in the sstable. +// Additionally, when this hiding is configured, a Pebble-internal block +// property filter (obsoleteKeyBlockPropertyFilter), is used to skip whole +// blocks that are obsolete. +// +// 2.1 Correctness +// +// Due to the level invariant, the sequence of seqnums for a user key in a +// sstable represents a contiguous subsequence of the seqnums for the userkey +// across the whole LSM, and is more recent than the seqnums in a sstable in a +// lower level. So exposing exactly one point from a sstable for a userkey +// will also mask the points for the userkey in lower levels. If we expose no +// point, because of RANGEDELs, that RANGEDEL will also mask the points in +// lower levels. +// +// Note that we do not need to do anything special at write time for +// SETWITHDEL and SINGLEDEL. This is because these key kinds are treated +// specially only by compactions, which do not hide obsolete points. For +// regular reads, SETWITHDEL behaves the same as SET and SINGLEDEL behaves the +// same as DEL. +// +// 2.2 Strictness and MERGE +// +// Setting the obsolete bit on point keys is advanced usage, so we support two +// modes, both of which must be truthful when setting the obsolete bit, but +// vary in when they don't set the obsolete bit. +// +// - Non-strict: In this mode, the bit does not need to be set for keys that +// are obsolete. Additionally, any sstable containing MERGE keys can only +// use this mode. An iterator over such an sstable, when configured to +// hideObsoletePoints, can expose multiple internal keys per user key, and +// can expose keys that are deleted by rangedels in the same sstable. This +// is the mode that non-advanced users should use. Pebble without +// disaggregated storage will also use this mode and will best-effort set +// the obsolete bit, to optimize iteration when snapshots have retained many +// obsolete keys. +// +// - Strict: In this mode, every obsolete key must have the obsolete bit set, +// and no MERGE keys are permitted. An iterator over such an sstable, when +// configured to hideObsoletePoints satisfies two properties: +// - S1: will expose at most one internal key per user key, which is the +// most recent one. +// - S2: will never expose keys that are deleted by rangedels in the same +// sstable. +// +// This is the mode for two use cases in disaggregated storage (which will +// exclude parts of the key space that has MERGEs), for levels that contain +// sstables that can become foreign sstables: +// - Pebble compaction output to these levels that can become foreign +// sstables. +// +// - CockroachDB ingest operations that can ingest into the levels that can +// become foreign sstables. Note, these are not sstables corresponding to +// copied data for CockroachDB range snapshots. This case occurs for +// operations like index backfills: these trivially satisfy the strictness +// criteria since they only write one key per userkey. +// +// TODO(sumeer): this latter case is not currently supported, since only +// Writer.AddWithForceObsolete calls are permitted for writing strict +// obsolete sstables. This is done to reduce the likelihood of bugs. One +// simple way to lift this limitation would be to disallow adding any +// RANGEDELs when a Pebble-external writer is trying to construct a strict +// obsolete sstable. +// +// Since non-strict mode permits MERGE keys, we need rules to handle the +// interaction of MERGE and other key kinds for the same user key. We have +// already mentioned that MERGE keys never have the obsolete bit set. +// Additionally, a SET or SETWITHDEL is not marked obsolete if the immediately +// more recent internal key is a MERGE with the same userkey (see +// Writer.makeAddPointDecisionV3 for details). + // ParseTableFormat parses the given magic bytes and version into its // corresponding internal TableFormat. func ParseTableFormat(magic []byte, version uint32) (TableFormat, error) { diff --git a/sstable/internal.go b/sstable/internal.go index 039da91074..0fe7c9937d 100644 --- a/sstable/internal.go +++ b/sstable/internal.go @@ -23,6 +23,7 @@ const ( InternalKeyKindLogData = base.InternalKeyKindLogData InternalKeyKindSingleDelete = base.InternalKeyKindSingleDelete InternalKeyKindRangeDelete = base.InternalKeyKindRangeDelete + InternalKeyKindSetWithDelete = base.InternalKeyKindSetWithDelete InternalKeyKindDeleteSized = base.InternalKeyKindDeleteSized InternalKeyKindMax = base.InternalKeyKindMax InternalKeyKindInvalid = base.InternalKeyKindInvalid diff --git a/sstable/options.go b/sstable/options.go index 869e69b17c..a2d5e3e858 100644 --- a/sstable/options.go +++ b/sstable/options.go @@ -216,6 +216,17 @@ type WriterOptions struct { // by a wider range of tools and libraries. TableFormat TableFormat + // IsStrictObsolete is only relevant for >= TableFormatPebblev4. See comment + // in format.go. Must be false if format < TableFormatPebblev4. + // + // TODO(bilal): set this when writing shared ssts. + IsStrictObsolete bool + + // WritingToLowestLevel is only relevant for >= TableFormatPebblev4. It is + // used to set the obsolete bit on DEL/DELSIZED/SINGLEDEL if they are the + // youngest for a userkey. + WritingToLowestLevel bool + // TablePropertyCollectors is a list of TablePropertyCollector creation // functions. A new TablePropertyCollector is created for each sstable built // and lives for the lifetime of the table. diff --git a/sstable/properties.go b/sstable/properties.go index c85aa1db60..00881cb1ae 100644 --- a/sstable/properties.go +++ b/sstable/properties.go @@ -101,6 +101,9 @@ type Properties struct { IndexType uint32 `prop:"rocksdb.block.based.table.index.type"` // Whether delta encoding is used to encode the index values. IndexValueIsDeltaEncoded uint64 `prop:"rocksdb.index.value.is.delta.encoded"` + // For formats >= TableFormatPebblev4, this is set to true if the obsolete + // bit is strict for all the point keys. + IsStrictObsolete bool `prop:"pebble.obsolete.is_strict"` // The name of the merger used in this table. Empty if no merger is used. MergerName string `prop:"rocksdb.merge.operator"` // The number of blocks in this table. @@ -349,6 +352,9 @@ func (p *Properties) save(tblFormat TableFormat, w *rawBlockWriter) { p.saveUvarint(m, unsafe.Offsetof(p.IndexSize), p.IndexSize) p.saveUint32(m, unsafe.Offsetof(p.IndexType), p.IndexType) p.saveUvarint(m, unsafe.Offsetof(p.IndexValueIsDeltaEncoded), p.IndexValueIsDeltaEncoded) + if p.IsStrictObsolete { + p.saveBool(m, unsafe.Offsetof(p.IsStrictObsolete), p.IsStrictObsolete) + } if p.MergerName != "" { p.saveString(m, unsafe.Offsetof(p.MergerName), p.MergerName) } diff --git a/sstable/properties_test.go b/sstable/properties_test.go index 25d1e1ba1d..49deacf7c5 100644 --- a/sstable/properties_test.go +++ b/sstable/properties_test.go @@ -79,6 +79,7 @@ func TestPropertiesSave(t *testing.T) { IndexSize: 11, IndexType: 12, IndexValueIsDeltaEncoded: 13, + IsStrictObsolete: true, MergerName: "merge operator name", NumDataBlocks: 14, NumDeletions: 15, diff --git a/sstable/reader.go b/sstable/reader.go index b37e6f0186..8511ee1de7 100644 --- a/sstable/reader.go +++ b/sstable/reader.go @@ -323,6 +323,8 @@ type singleLevelIterator struct { // is high). useFilter bool lastBloomFilterMatched bool + + hideObsoletePoints bool } // singleLevelIterator implements the base.InternalIterator interface. @@ -404,7 +406,7 @@ func (i *singleLevelIterator) init( v *virtualState, lower, upper []byte, filterer *BlockPropertiesFilterer, - useFilter bool, + useFilter, hideObsoletePoints bool, stats *base.InternalIteratorStats, rp ReaderProvider, ) error { @@ -428,7 +430,8 @@ func (i *singleLevelIterator) init( i.reader = r i.cmp = r.Compare i.stats = stats - err = i.index.initHandle(i.cmp, indexH, r.Properties.GlobalSeqNum) + i.hideObsoletePoints = hideObsoletePoints + err = i.index.initHandle(i.cmp, indexH, r.Properties.GlobalSeqNum, false) if err != nil { // blockIter.Close releases indexH and always returns a nil error _ = i.index.Close() @@ -566,7 +569,7 @@ func (i *singleLevelIterator) loadBlock(dir int8) loadBlockResult { i.err = err return loadBlockFailed } - i.err = i.data.initHandle(i.cmp, block, i.reader.Properties.GlobalSeqNum) + i.err = i.data.initHandle(i.cmp, block, i.reader.Properties.GlobalSeqNum, i.hideObsoletePoints) if i.err != nil { // The block is partially loaded, and we don't want it to appear valid. i.data.invalidate() @@ -653,12 +656,13 @@ func (i *singleLevelIterator) resolveMaybeExcluded(dir int8) intersectsResult { } func (i *singleLevelIterator) initBoundsForAlreadyLoadedBlock() { - if i.data.firstKey.UserKey == nil { + if i.data.firstUserKey() == nil { panic("initBoundsForAlreadyLoadedBlock must not be called on empty or corrupted block") } i.blockLower = i.lower if i.blockLower != nil { - if i.data.firstKey.UserKey != nil && i.cmp(i.blockLower, i.data.firstKey.UserKey) < 0 { + firstUserKey := i.data.firstUserKey() + if firstUserKey != nil && i.cmp(i.blockLower, firstUserKey) < 0 { // The lower-bound is less than the first key in the block. No need // to check the lower-bound again for this block. i.blockLower = nil @@ -1066,7 +1070,7 @@ func (i *singleLevelIterator) SeekLT( var dontSeekWithinBlock bool if !i.data.isDataInvalidated() && !i.index.isDataInvalidated() && i.data.valid() && i.index.valid() && - boundsCmp < 0 && i.cmp(i.data.firstKey.UserKey, key) < 0 { + boundsCmp < 0 && i.cmp(i.data.firstUserKey(), key) < 0 { // Fast-path: The bounds have moved backward, and this SeekLT is // respecting the upper bound (guaranteed by Iterator). We know that // the iterator must already be positioned within or just outside the @@ -1314,7 +1318,7 @@ func (i *singleLevelIterator) NextPrefix(succKey []byte) (*InternalKey, base.Laz if i.err != nil { return nil, base.LazyValue{} } - if key, val := i.data.nextPrefix(succKey); key != nil { + if key, val := i.data.NextPrefix(succKey); key != nil { if i.blockUpper != nil { cmp := i.cmp(key.UserKey, i.blockUpper) if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 { @@ -1763,8 +1767,7 @@ func (i *twoLevelIterator) loadIndex(dir int8) loadBlockResult { i.err = err return loadBlockFailed } - if i.err = i.index.initHandle( - i.cmp, indexBlock, i.reader.Properties.GlobalSeqNum); i.err == nil { + if i.err = i.index.initHandle(i.cmp, indexBlock, i.reader.Properties.GlobalSeqNum, false); i.err == nil { return loadBlockOK } return loadBlockFailed @@ -1842,7 +1845,7 @@ func (i *twoLevelIterator) init( v *virtualState, lower, upper []byte, filterer *BlockPropertiesFilterer, - useFilter bool, + useFilter, hideObsoletePoints bool, stats *base.InternalIteratorStats, rp ReaderProvider, ) error { @@ -1867,7 +1870,8 @@ func (i *twoLevelIterator) init( i.reader = r i.cmp = r.Compare i.stats = stats - err = i.topLevelIndex.initHandle(i.cmp, topLevelIndexH, r.Properties.GlobalSeqNum) + i.hideObsoletePoints = hideObsoletePoints + err = i.topLevelIndex.initHandle(i.cmp, topLevelIndexH, r.Properties.GlobalSeqNum, false) if err != nil { // blockIter.Close releases topLevelIndexH and always returns a nil error _ = i.topLevelIndex.Close() @@ -2907,21 +2911,20 @@ func (v *VirtualReader) NewCompactionIter( return v.reader.newCompactionIter(bytesIterated, rp, &v.vState) } -// NewIterWithBlockPropertyFiltersAndContext wraps +// NewIterWithBlockPropertyFiltersAndContextEtc wraps // Reader.NewIterWithBlockPropertyFiltersAndContext. We assume that the passed // in [lower, upper) bounds will have at least some overlap with the virtual // sstable bounds. No overlap is not currently supported in the iterator. -func (v *VirtualReader) NewIterWithBlockPropertyFiltersAndContext( +func (v *VirtualReader) NewIterWithBlockPropertyFiltersAndContextEtc( ctx context.Context, lower, upper []byte, filterer *BlockPropertiesFilterer, - useFilterBlock bool, + hideObsoletePoints, useFilterBlock bool, stats *base.InternalIteratorStats, rp ReaderProvider, ) (Iterator, error) { return v.reader.newIterWithBlockPropertyFiltersAndContext( - ctx, - lower, upper, filterer, useFilterBlock, stats, rp, &v.vState, + ctx, lower, upper, filterer, hideObsoletePoints, useFilterBlock, stats, rp, &v.vState, ) } @@ -3080,31 +3083,51 @@ func (r *Reader) NewIterWithBlockPropertyFilters( ) (Iterator, error) { return r.newIterWithBlockPropertyFiltersAndContext( context.Background(), - lower, upper, filterer, useFilterBlock, stats, rp, nil, + lower, upper, filterer, false, useFilterBlock, stats, rp, nil, ) } -// NewIterWithBlockPropertyFiltersAndContext is similar to +// NewIterWithBlockPropertyFiltersAndContextEtc is similar to // NewIterWithBlockPropertyFilters and additionally accepts a context for // tracing. -func (r *Reader) NewIterWithBlockPropertyFiltersAndContext( +// +// If hideObsoletePoints, the callee assumes that filterer already includes +// obsoleteKeyBlockPropertyFilter. The caller can satisfy this contract by +// first calling TryAddBlockPropertyFilterForHideObsoletePoints. +func (r *Reader) NewIterWithBlockPropertyFiltersAndContextEtc( ctx context.Context, lower, upper []byte, filterer *BlockPropertiesFilterer, - useFilterBlock bool, + hideObsoletePoints, useFilterBlock bool, stats *base.InternalIteratorStats, rp ReaderProvider, ) (Iterator, error) { return r.newIterWithBlockPropertyFiltersAndContext( - ctx, - lower, upper, filterer, useFilterBlock, stats, rp, nil, + ctx, lower, upper, filterer, hideObsoletePoints, useFilterBlock, stats, rp, nil, ) } +// TryAddBlockPropertyFilterForHideObsoletePoints is expected to be called +// before the call to NewIterWithBlockPropertyFiltersAndContextEtc, to get the +// value of hideObsoletePoints and potentially add a block property filter. +func (r *Reader) TryAddBlockPropertyFilterForHideObsoletePoints( + snapshotForHideObsoletePoints uint64, + fileLargestSeqNum uint64, + pointKeyFilters []BlockPropertyFilter, +) (hideObsoletePoints bool, filters []BlockPropertyFilter) { + hideObsoletePoints = r.tableFormat >= TableFormatPebblev4 && + snapshotForHideObsoletePoints > fileLargestSeqNum + if hideObsoletePoints { + pointKeyFilters = append(pointKeyFilters, obsoleteKeyBlockPropertyFilter{}) + } + return hideObsoletePoints, pointKeyFilters +} + func (r *Reader) newIterWithBlockPropertyFiltersAndContext( ctx context.Context, lower, upper []byte, filterer *BlockPropertiesFilterer, + hideObsoletePoints bool, useFilterBlock bool, stats *base.InternalIteratorStats, rp ReaderProvider, @@ -3115,7 +3138,7 @@ func (r *Reader) newIterWithBlockPropertyFiltersAndContext( // until the final iterator closes. if r.Properties.IndexType == twoLevelIndex { i := twoLevelIterPool.Get().(*twoLevelIterator) - err := i.init(ctx, r, v, lower, upper, filterer, useFilterBlock, stats, rp) + err := i.init(ctx, r, v, lower, upper, filterer, useFilterBlock, hideObsoletePoints, stats, rp) if err != nil { return nil, err } @@ -3123,7 +3146,7 @@ func (r *Reader) newIterWithBlockPropertyFiltersAndContext( } i := singleLevelIterPool.Get().(*singleLevelIterator) - err := i.init(ctx, r, v, lower, upper, filterer, useFilterBlock, stats, rp) + err := i.init(ctx, r, v, lower, upper, filterer, useFilterBlock, hideObsoletePoints, stats, rp) if err != nil { return nil, err } @@ -3155,7 +3178,8 @@ func (r *Reader) newCompactionIter( err := i.init( context.Background(), r, v, nil /* lower */, nil /* upper */, nil, - false /* useFilter */, nil /* stats */, rp, + false /* useFilter */, false, /* hideObsoletePoints */ + nil /* stats */, rp, ) if err != nil { return nil, err @@ -3169,7 +3193,8 @@ func (r *Reader) newCompactionIter( i := singleLevelIterPool.Get().(*singleLevelIterator) err := i.init( context.Background(), r, v, nil /* lower */, nil, /* upper */ - nil, false /* useFilter */, nil /* stats */, rp, + nil, false /* useFilter */, false, /* hideObsoletePoints */ + nil /* stats */, rp, ) if err != nil { return nil, err @@ -3207,7 +3232,7 @@ func (r *Reader) NewFixedSeqnumRangeDelIter(seqNum uint64) (keyspan.FragmentIter return nil, err } i := &fragmentBlockIter{elideSameSeqnum: true} - if err := i.blockIter.initHandle(r.Compare, h, seqNum); err != nil { + if err := i.blockIter.initHandle(r.Compare, h, seqNum, false); err != nil { return nil, err } return i, nil @@ -3228,7 +3253,7 @@ func (r *Reader) NewRawRangeKeyIter() (keyspan.FragmentIterator, error) { return nil, err } i := rangeKeyFragmentBlockIterPool.Get().(*rangeKeyFragmentBlockIter) - if err := i.blockIter.initHandle(r.Compare, h, r.Properties.GlobalSeqNum); err != nil { + if err := i.blockIter.initHandle(r.Compare, h, r.Properties.GlobalSeqNum, false); err != nil { return nil, err } return i, nil @@ -3388,7 +3413,7 @@ func (r *Reader) transformRangeDelV1(b []byte) ([]byte, error) { // tombstones. We need properly fragmented and sorted range tombstones in // order to serve from them directly. iter := &blockIter{} - if err := iter.init(r.Compare, b, r.Properties.GlobalSeqNum); err != nil { + if err := iter.init(r.Compare, b, r.Properties.GlobalSeqNum, false); err != nil { return nil, err } var tombstones []keyspan.Span @@ -3580,7 +3605,8 @@ func (r *Reader) Layout() (*Layout, error) { if err != nil { return nil, err } - if err := iter.init(r.Compare, subIndex.Get(), 0 /* globalSeqNum */); err != nil { + if err := iter.init(r.Compare, subIndex.Get(), 0, /* globalSeqNum */ + false /* hideObsoletePoints */); err != nil { return nil, err } for key, value := iter.First(); key != nil; key, value = iter.Next() { diff --git a/sstable/reader_test.go b/sstable/reader_test.go index 4d432a9533..fdf377ab0d 100644 --- a/sstable/reader_test.go +++ b/sstable/reader_test.go @@ -414,15 +414,9 @@ func TestVirtualReader(t *testing.T) { } var stats base.InternalIteratorStats - iter, err := v.NewIterWithBlockPropertyFiltersAndContext( - context.Background(), - lower, - upper, - nil, - false, - &stats, - TrivialReaderProvider{Reader: r}, - ) + iter, err := v.NewIterWithBlockPropertyFiltersAndContextEtc( + context.Background(), lower, upper, nil, false, false, + &stats, TrivialReaderProvider{Reader: r}) if err != nil { return err.Error() } @@ -473,7 +467,7 @@ func TestReader(t *testing.T) { "prefixFilter": "testdata/prefixreader", } - for _, format := range []TableFormat{TableFormatPebblev2, TableFormatPebblev3} { + for _, format := range []TableFormat{TableFormatPebblev2, TableFormatPebblev3, TableFormatPebblev4} { for dName, blockSize := range blockSizes { for iName, indexBlockSize := range blockSizes { for lName, tableOpt := range writerOpts { @@ -497,6 +491,29 @@ func TestReader(t *testing.T) { } } +func TestReaderHideObsolete(t *testing.T) { + blockSizes := map[string]int{ + "1bytes": 1, + "5bytes": 5, + "10bytes": 10, + "25bytes": 25, + "Maxbytes": math.MaxInt32, + } + for dName, blockSize := range blockSizes { + opts := WriterOptions{ + TableFormat: TableFormatPebblev4, + BlockSize: blockSize, + IndexBlockSize: blockSize, + Comparer: testkeys.Comparer, + } + t.Run(fmt.Sprintf("blockSize=%s", dName), func(t *testing.T) { + runTestReader( + t, opts, "testdata/reader_hide_obsolete", + nil /* Reader */, 0, false, true) + }) + } +} + func TestHamletReader(t *testing.T) { prebuiltSSTs := []string{ "testdata/h.ldb", @@ -722,12 +739,23 @@ func runTestReader( } var stats base.InternalIteratorStats r.Properties.GlobalSeqNum = seqNum - var filterer *BlockPropertiesFilterer + hideObsoletePoints := false + var bpfs []BlockPropertyFilter + if d.HasArg("hide-obsolete-points") { + d.ScanArgs(t, "hide-obsolete-points", &hideObsoletePoints) + if hideObsoletePoints { + bpfs = append(bpfs, obsoleteKeyBlockPropertyFilter{}) + } + } if d.HasArg("block-property-filter") { var filterMin, filterMax uint64 d.ScanArgs(t, "block-property-filter", &filterMin, &filterMax) bpf := NewTestKeysBlockPropertyFilter(filterMin, filterMax) - filterer = newBlockPropertiesFilterer([]BlockPropertyFilter{bpf}, nil) + bpfs = append(bpfs, bpf) + } + var filterer *BlockPropertiesFilterer + if len(bpfs) > 0 { + filterer = newBlockPropertiesFilterer(bpfs, nil) intersects, err := filterer.intersectsUserPropsAndFinishInit(r.Properties.UserProperties) if err != nil { @@ -737,10 +765,12 @@ func runTestReader( return "table does not intersect BlockPropertyFilter" } } - iter, err := r.NewIterWithBlockPropertyFilters( + iter, err := r.NewIterWithBlockPropertyFiltersAndContextEtc( + context.Background(), nil, /* lower */ nil, /* upper */ filterer, + hideObsoletePoints, true, /* use filter block */ &stats, TrivialReaderProvider{Reader: r}, @@ -1897,6 +1927,114 @@ func BenchmarkIteratorScanNextPrefix(b *testing.B) { } } +func BenchmarkIteratorScanObsolete(b *testing.B) { + options := WriterOptions{ + BlockSize: 32 << 10, + BlockRestartInterval: 16, + FilterPolicy: nil, + Compression: SnappyCompression, + Comparer: testkeys.Comparer, + } + const keyCount = 1 << 20 + const keyLen = 10 + + // Take the very large keyspace consisting of alphabetic characters of + // lengths up to unsharedPrefixLen and reduce it down to keyCount keys by + // picking every 1 key every keyCount keys. + keys := testkeys.Alpha(keyLen) + keys = keys.EveryN(keys.Count() / keyCount) + if keys.Count() < keyCount { + b.Fatalf("expected %d keys, found %d", keyCount, keys.Count()) + } + expectedKeyCount := keys.Count() + keyBuf := make([]byte, keyLen) + setupBench := func(b *testing.B, tableFormat TableFormat, cacheSize int64) *Reader { + mem := vfs.NewMem() + f0, err := mem.Create("bench") + require.NoError(b, err) + options.TableFormat = tableFormat + w := NewWriter(objstorageprovider.NewFileWritable(f0), options) + val := make([]byte, 100) + rng := rand.New(rand.NewSource(100)) + for i := 0; i < keys.Count(); i++ { + n := testkeys.WriteKey(keyBuf, keys, i) + key := keyBuf[:n] + rng.Read(val) + forceObsolete := true + if i == 0 { + forceObsolete = false + } + require.NoError(b, w.AddWithForceObsolete( + base.MakeInternalKey(key, 0, InternalKeyKindSet), val, forceObsolete)) + } + require.NoError(b, w.Close()) + c := cache.New(cacheSize) + defer c.Unref() + // Re-open the filename for reading. + f0, err = mem.Open("bench") + require.NoError(b, err) + r, err := newReader(f0, ReaderOptions{ + Cache: c, + Comparer: testkeys.Comparer, + }) + require.NoError(b, err) + return r + } + for _, format := range []TableFormat{TableFormatPebblev3, TableFormatPebblev4} { + b.Run(fmt.Sprintf("format=%s", format.String()), func(b *testing.B) { + // 150MiB results in a high cache hit rate for both formats. + for _, cacheSize := range []int64{1, 150 << 20} { + b.Run(fmt.Sprintf("cache-size=%s", humanize.IEC.Int64(cacheSize)), + func(b *testing.B) { + r := setupBench(b, format, cacheSize) + defer func() { + require.NoError(b, r.Close()) + }() + for _, hideObsoletePoints := range []bool{false, true} { + b.Run(fmt.Sprintf("hide-obsolete=%t", hideObsoletePoints), func(b *testing.B) { + var filterer *BlockPropertiesFilterer + if format == TableFormatPebblev4 && hideObsoletePoints { + filterer = newBlockPropertiesFilterer( + []BlockPropertyFilter{obsoleteKeyBlockPropertyFilter{}}, nil) + intersects, err := + filterer.intersectsUserPropsAndFinishInit(r.Properties.UserProperties) + if err != nil { + b.Fatalf("%s", err.Error()) + } + if !intersects { + b.Fatalf("sstable does not intersect") + } + } + iter, err := r.NewIterWithBlockPropertyFiltersAndContextEtc( + context.Background(), nil, nil, filterer, hideObsoletePoints, + true, nil, TrivialReaderProvider{Reader: r}) + require.NoError(b, err) + b.ResetTimer() + for i := 0; i < b.N; i++ { + count := 0 + k, _ := iter.First() + for k != nil { + count++ + k, _ = iter.Next() + } + if format == TableFormatPebblev4 && hideObsoletePoints { + if count != 1 { + b.Fatalf("found %d points", count) + } + } else { + if count != expectedKeyCount { + b.Fatalf("found %d points", count) + } + } + } + }) + } + }) + } + }) + } +} + func newReader(r ReadableFile, o ReaderOptions, extraOpts ...ReaderOption) (*Reader, error) { readable, err := NewSimpleReadable(r) if err != nil { diff --git a/sstable/suffix_rewriter.go b/sstable/suffix_rewriter.go index b1423fddb8..fe44cba479 100644 --- a/sstable/suffix_rewriter.go +++ b/sstable/suffix_rewriter.go @@ -50,6 +50,24 @@ func RewriteKeySuffixes( // same TableFormat as the input, which is returned in case the caller wants // to do some error checking. Suffix rewriting is meant to be efficient, and // allowing changes in the TableFormat detracts from that efficiency. +// +// Any obsolete bits that key-value pairs may be annotated with are ignored +// and lost during the rewrite. Additionally, the output sstable has the +// pebble.obsolete.is_strict property set to false. These limitations could be +// removed if needed. The current use case for +// RewriteKeySuffixesAndReturnFormat in CockroachDB is for MVCC-compliant file +// ingestion, where these files do not contain RANGEDELs and have one +// key-value pair per userkey -- so they trivially satisfy the strict +// criteria, and we don't need the obsolete bit as a performance optimization. +// For disaggregated storage, strict obsolete sstables are needed for L5 and +// L6, but at the time of writing, we expect such MVCC-compliant file +// ingestion to only ingest into levels L4 and higher. If this changes, we can +// do one of two things to get rid of this limitation: +// - Validate that there are no duplicate userkeys and no RANGEDELs in the +// sstable to be rewritten. Validating the former condition is non-trivial +// when rewriting blocks in parallel, so we could encode the pre-existing +// condition in a NumObsoletePoints property. +// - Preserve the obsolete bit (with changes to the blockIter). func RewriteKeySuffixesAndReturnFormat( sst []byte, rOpts ReaderOptions, @@ -184,7 +202,7 @@ func rewriteBlocks( if err != nil { return err } - if err := iter.init(r.Compare, inputBlock, r.Properties.GlobalSeqNum); err != nil { + if err := iter.init(r.Compare, inputBlock, r.Properties.GlobalSeqNum, false); err != nil { return err } @@ -434,7 +452,12 @@ func (c copyFilterWriter) policyName() string { return c.origPolicyName } // single loop over the Reader that writes each key to the Writer with the new // suffix. The is significantly slower than the parallelized rewriter, and does // more work to rederive filters, props, etc, however re-doing that work makes -// it less restrictive -- props no longer need to +// it less restrictive -- props no longer need to ?? +// +// Any obsolete bits that key-value pairs may be annotated with are ignored +// and lost during the rewrite. Additionally, the output sstable has the +// pebble.obsolete.is_strict property set to false. See the longer comment at +// RewriteKeySuffixesAndReturnFormat. func RewriteKeySuffixesViaWriter( r *Reader, out objstorage.Writable, o WriterOptions, from, to []byte, ) (*WriterMetadata, error) { @@ -472,7 +495,7 @@ func RewriteKeySuffixesViaWriter( if err != nil { return nil, err } - if w.addPoint(scratch, val); err != nil { + if w.addPoint(scratch, val, false); err != nil { return nil, err } k, v = i.Next() diff --git a/sstable/suffix_rewriter_test.go b/sstable/suffix_rewriter_test.go index 0d9865c775..c3132d9b70 100644 --- a/sstable/suffix_rewriter_test.go +++ b/sstable/suffix_rewriter_test.go @@ -76,6 +76,9 @@ func TestRewriteSuffixProps(t *testing.T) { t.Run(fmt.Sprintf("byBlocks=%v", byBlocks), func(t *testing.T) { rewrittenSST := &memFile{} if byBlocks { + if format == TableFormatPebblev4 { + expectedProps["obsolete-key"] = string([]byte{3}) + } _, rewriteFormat, err := rewriteKeySuffixesInBlocks( r, rewrittenSST, rwOpts, from, to, 8) // rewriteFormat is equal to the original format, since @@ -83,6 +86,9 @@ func TestRewriteSuffixProps(t *testing.T) { require.Equal(t, wOpts.TableFormat, rewriteFormat) require.NoError(t, err) } else { + if rwOpts.TableFormat != TableFormatPebblev4 { + delete(expectedProps, "obsolete-key") + } _, err := RewriteKeySuffixesViaWriter(r, rewrittenSST, rwOpts, from, to) require.NoError(t, err) } @@ -119,7 +125,9 @@ func TestRewriteSuffixProps(t *testing.T) { for !newDecoder.done() { id, val, err := newDecoder.next() require.NoError(t, err) - newProps[id] = val + if int(id) < len(newProps) { + newProps[id] = val + } } require.Equal(t, oldProps[0], newProps[1]) ival.decode(newProps[0]) diff --git a/sstable/table.go b/sstable/table.go index 5267123029..040efafd62 100644 --- a/sstable/table.go +++ b/sstable/table.go @@ -160,8 +160,27 @@ The metaindex block also contains block handles as values, with keys being the names of the meta blocks. For a description of value blocks and the meta value index block, see -value_block.go - +value_block.go. + +Data blocks have some additional features: +- For TableFormatPebblev3 onwards: + - For SETs, the value has a 1 byte value prefix, which indicates whether the + value is inline, or in a separate value block, and indicates whether the + prefix of the userkey (as defined by split) has changed or not. See + value_block.go for details. + - The most significant bit of the restart points is used to indicate whether + userkey prefix has changed since the last restart point. See the detailed + comment in blockWriter. + - The maximum length of the "shared prefix" when encoding the key, is the + length of the prefix of the userkey (as defined by split) of the previous + key. + +- For TableFormatPebblev4 onwards: + - The key kinds may be altered to set the + InternalKeyKindSSTableInternalObsoleteBit if the key-value pair is obsolete + in the context of that sstable (for a reader that reads at a higher seqnum + than the highest seqnum in the sstable). For details, see the comment in + format.go. */ const ( diff --git a/sstable/testdata/reader_hide_obsolete/iter b/sstable/testdata/reader_hide_obsolete/iter new file mode 100644 index 0000000000..173a0f6ad8 --- /dev/null +++ b/sstable/testdata/reader_hide_obsolete/iter @@ -0,0 +1,154 @@ +build +a.SET.1:A +b.SINGLEDEL.4: +b.SET.2:B +c.DEL.5: +c.SET.3:C +d.SET.4:D4 +d.SET.2:D2 +---- + +iter +first +next +next +next +next +next +next +next +prev +prev +prev +prev +prev +prev +prev +prev +---- +:A +: +:B +: +:C +:D4 +:D2 +. +:D2 +:D4 +:C +: +:B +: +:A +. + +iter hide-obsolete-points=true +first +next +next +next +next +prev +prev +prev +prev +prev +---- +:A +: +: +:D4 +. +:D4 +: +: +:A +. + +iter hide-obsolete-points=true +seek-ge c +prev +prev +next +next +next +seek-lt c +next +prev +prev +---- +: +: +:A +: +: +:D4 +: +: +: +:A + +build +a.SET.3:A +a.MERGE.2:A2 +b.MERGE.20:B20 +b.MERGE.18:B18 +b.SET.16:B16 +b.SET.14:B14 +c.MERGE.30:C30 +c.MERGE.28:C28 +c.DEL.26: +---- + +iter +first +next +next +next +next +next +next +next +---- +:A +:A2 +:B20 +:B18 +:B16 +:B14 +:C30 +:C28 + +iter hide-obsolete-points=true +first +next +next +next +next +next +next +last +prev +prev +prev +prev +prev +prev +---- +:A +:B20 +:B18 +:B16 +:C30 +:C28 +. +:C28 +:C30 +:B16 +:B18 +:B20 +:A +. + +# TODO(sumeer): writing to lowest level. diff --git a/sstable/testdata/virtual_reader b/sstable/testdata/virtual_reader index b5b507d245..c0765e31a9 100644 --- a/sstable/testdata/virtual_reader +++ b/sstable/testdata/virtual_reader @@ -159,7 +159,7 @@ virtualize dd.SET.5-ddd.SET.6 ---- bounds: [dd#5,1-ddd#6,1] filenum: 000008 -props: 8,1 +props: 7,1 # Check lower bound enforcement during SeekPrefixGE. iter @@ -566,7 +566,7 @@ virtualize f.SET.6-h.SET.9 ---- bounds: [f#6,1-h#9,1] filenum: 000020 -props: 6,0 +props: 5,0 iter seek-lt z diff --git a/sstable/testdata/writer_value_blocks b/sstable/testdata/writer_value_blocks index 869b849afb..8590e42925 100644 --- a/sstable/testdata/writer_value_blocks +++ b/sstable/testdata/writer_value_blocks @@ -132,7 +132,7 @@ layout 72 record (21 = 3 [0] + 14 + 4) [restart] blue@8#16,1:value handle {valueLen:6 blockNum:0 offsetInBlock:5} 93 [restart 72] - 101 [trailer compression=none checksum=0xdc74261] + 101 [trailer compression=none checksum=0x4e65b9b6] 106 data (29) 106 record (21 = 3 [0] + 14 + 4) [restart] blue@6#16,1:value handle {valueLen:15 blockNum:1 offsetInBlock:0} @@ -146,77 +146,78 @@ layout 173 block:38/29 [restart] 192 [restart 173] 200 [trailer compression=none checksum=0x21d27815] - 205 index (27) + 205 index (30) 205 block:72/29 [restart] - 224 [restart 205] - 232 [trailer compression=none checksum=0xbae26eb3] - 237 index (22) - 237 block:106/29 [restart] - 251 [restart 237] - 259 [trailer compression=none checksum=0x802be702] - 264 top-index (77) - 264 block:140/28 [restart] - 285 block:173/27 [restart] - 305 block:205/27 [restart] - 325 block:237/22 [restart] - 340 [restart 264] - 344 [restart 285] - 348 [restart 305] - 352 [restart 325] - 341 [trailer compression=snappy checksum=0x6b2d79b] - 346 value-block (11) - 362 value-block (15) - 382 value-index (8) - 395 properties (785) - 395 pebble.num.value-blocks (27) [restart] - 422 pebble.num.values.in.value-blocks (21) - 443 pebble.value-blocks.size (21) - 464 rocksdb.block.based.table.index.type (43) - 507 rocksdb.block.based.table.prefix.filtering (20) - 527 rocksdb.block.based.table.whole.key.filtering (23) - 550 rocksdb.column.family.id (24) - 574 rocksdb.comparator (35) - 609 rocksdb.compression (16) - 625 rocksdb.compression_options (106) - 731 rocksdb.creation.time (16) - 747 rocksdb.data.size (14) - 761 rocksdb.deleted.keys (15) - 776 rocksdb.external_sst_file.global_seqno (41) - 817 rocksdb.external_sst_file.version (14) - 831 rocksdb.filter.size (15) - 846 rocksdb.fixed.key.length (18) - 864 rocksdb.format.version (17) - 881 rocksdb.index.key.is.user.key (25) - 906 rocksdb.index.partitions (14) - 920 rocksdb.index.size (9) - 929 rocksdb.index.value.is.delta.encoded (26) - 955 rocksdb.merge.operands (18) - 973 rocksdb.merge.operator (24) - 997 rocksdb.num.data.blocks (19) - 1016 rocksdb.num.entries (11) - 1027 rocksdb.num.range-deletions (19) - 1046 rocksdb.oldest.key.time (19) - 1065 rocksdb.prefix.extractor.name (31) - 1096 rocksdb.property.collectors (22) - 1118 rocksdb.raw.key.size (16) - 1134 rocksdb.raw.value.size (14) - 1148 rocksdb.top-level.index.size (24) - 1172 [restart 395] - 1180 [trailer compression=none checksum=0x4352b7fd] - 1185 meta-index (64) - 1185 pebble.value_index block:382/8 value-blocks-index-lengths: 1(num), 2(offset), 1(length) [restart] - 1212 rocksdb.properties block:395/785 [restart] - 1237 [restart 1185] - 1241 [restart 1212] - 1249 [trailer compression=none checksum=0xe7aed935] - 1254 footer (53) - 1254 checksum type: crc32c - 1255 meta: offset=1185, length=64 - 1258 index: offset=264, length=77 - 1261 [padding] - 1295 version: 4 - 1299 magic number: 0xf09faab3f09faab3 - 1307 EOF + 227 [restart 205] + 235 [trailer compression=none checksum=0xba0b26fe] + 240 index (22) + 240 block:106/29 [restart] + 254 [restart 240] + 262 [trailer compression=none checksum=0x802be702] + 267 top-index (85) + 267 block:140/28 [restart] + 288 block:173/27 [restart] + 308 block:205/30 [restart] + 331 block:240/22 [restart] + 346 [restart 267] + 350 [restart 288] + 354 [restart 308] + 358 [restart 331] + 352 [trailer compression=snappy checksum=0x8bd0d63a] + 357 value-block (11) + 373 value-block (15) + 393 value-index (8) + 406 properties (813) + 406 obsolete-key (16) [restart] + 422 pebble.num.value-blocks (27) + 449 pebble.num.values.in.value-blocks (21) + 470 pebble.value-blocks.size (21) + 491 rocksdb.block.based.table.index.type (43) + 534 rocksdb.block.based.table.prefix.filtering (20) + 554 rocksdb.block.based.table.whole.key.filtering (23) + 577 rocksdb.column.family.id (24) + 601 rocksdb.comparator (35) + 636 rocksdb.compression (16) + 652 rocksdb.compression_options (106) + 758 rocksdb.creation.time (16) + 774 rocksdb.data.size (14) + 788 rocksdb.deleted.keys (15) + 803 rocksdb.external_sst_file.global_seqno (41) + 844 rocksdb.external_sst_file.version (14) + 858 rocksdb.filter.size (15) + 873 rocksdb.fixed.key.length (18) + 891 rocksdb.format.version (17) + 908 rocksdb.index.key.is.user.key (25) + 933 rocksdb.index.partitions (14) + 947 rocksdb.index.size (9) + 956 rocksdb.index.value.is.delta.encoded (26) + 982 rocksdb.merge.operands (18) + 1000 rocksdb.merge.operator (24) + 1024 rocksdb.num.data.blocks (19) + 1043 rocksdb.num.entries (11) + 1054 rocksdb.num.range-deletions (19) + 1073 rocksdb.oldest.key.time (19) + 1092 rocksdb.prefix.extractor.name (31) + 1123 rocksdb.property.collectors (34) + 1157 rocksdb.raw.key.size (16) + 1173 rocksdb.raw.value.size (14) + 1187 rocksdb.top-level.index.size (24) + 1211 [restart 406] + 1219 [trailer compression=none checksum=0xa07c3fa5] + 1224 meta-index (64) + 1224 pebble.value_index block:393/8 value-blocks-index-lengths: 1(num), 2(offset), 1(length) [restart] + 1251 rocksdb.properties block:406/813 [restart] + 1276 [restart 1224] + 1280 [restart 1251] + 1288 [trailer compression=none checksum=0xfe295720] + 1293 footer (53) + 1293 checksum type: crc32c + 1294 meta: offset=1224, length=64 + 1297 index: offset=267, length=85 + 1300 [padding] + 1334 version: 4 + 1338 magic number: 0xf09faab3f09faab3 + 1346 EOF # Require that [c,e) must be in-place. build in-place-bound=(c,e) @@ -296,47 +297,48 @@ layout 71 block:0/66 [restart] 85 [restart 71] 93 [trailer compression=none checksum=0xf80f5bcf] - 98 properties (715) - 98 pebble.raw.point-tombstone.key.size (39) [restart] - 137 rocksdb.block.based.table.index.type (43) - 180 rocksdb.block.based.table.prefix.filtering (20) - 200 rocksdb.block.based.table.whole.key.filtering (23) - 223 rocksdb.column.family.id (24) - 247 rocksdb.comparator (35) - 282 rocksdb.compression (16) - 298 rocksdb.compression_options (106) - 404 rocksdb.creation.time (16) - 420 rocksdb.data.size (13) - 433 rocksdb.deleted.keys (15) - 448 rocksdb.external_sst_file.global_seqno (41) - 489 rocksdb.external_sst_file.version (14) - 503 rocksdb.filter.size (15) - 518 rocksdb.fixed.key.length (18) - 536 rocksdb.format.version (17) - 553 rocksdb.index.key.is.user.key (25) - 578 rocksdb.index.size (8) - 586 rocksdb.index.value.is.delta.encoded (26) - 612 rocksdb.merge.operands (18) - 630 rocksdb.merge.operator (24) - 654 rocksdb.num.data.blocks (19) - 673 rocksdb.num.entries (11) - 684 rocksdb.num.range-deletions (19) - 703 rocksdb.oldest.key.time (19) - 722 rocksdb.prefix.extractor.name (31) - 753 rocksdb.property.collectors (22) - 775 rocksdb.raw.key.size (16) - 791 rocksdb.raw.value.size (14) - 805 [restart 98] - 813 [trailer compression=none checksum=0xfb9d6722] - 818 meta-index (32) - 818 rocksdb.properties block:98/715 [restart] - 842 [restart 818] - 850 [trailer compression=none checksum=0x8ca405dc] - 855 footer (53) - 855 checksum type: crc32c - 856 meta: offset=818, length=32 - 859 index: offset=71, length=22 - 861 [padding] - 896 version: 4 - 900 magic number: 0xf09faab3f09faab3 - 908 EOF + 98 properties (743) + 98 obsolete-key (16) [restart] + 114 pebble.raw.point-tombstone.key.size (39) + 153 rocksdb.block.based.table.index.type (43) + 196 rocksdb.block.based.table.prefix.filtering (20) + 216 rocksdb.block.based.table.whole.key.filtering (23) + 239 rocksdb.column.family.id (24) + 263 rocksdb.comparator (35) + 298 rocksdb.compression (16) + 314 rocksdb.compression_options (106) + 420 rocksdb.creation.time (16) + 436 rocksdb.data.size (13) + 449 rocksdb.deleted.keys (15) + 464 rocksdb.external_sst_file.global_seqno (41) + 505 rocksdb.external_sst_file.version (14) + 519 rocksdb.filter.size (15) + 534 rocksdb.fixed.key.length (18) + 552 rocksdb.format.version (17) + 569 rocksdb.index.key.is.user.key (25) + 594 rocksdb.index.size (8) + 602 rocksdb.index.value.is.delta.encoded (26) + 628 rocksdb.merge.operands (18) + 646 rocksdb.merge.operator (24) + 670 rocksdb.num.data.blocks (19) + 689 rocksdb.num.entries (11) + 700 rocksdb.num.range-deletions (19) + 719 rocksdb.oldest.key.time (19) + 738 rocksdb.prefix.extractor.name (31) + 769 rocksdb.property.collectors (34) + 803 rocksdb.raw.key.size (16) + 819 rocksdb.raw.value.size (14) + 833 [restart 98] + 841 [trailer compression=none checksum=0x71c150c9] + 846 meta-index (32) + 846 rocksdb.properties block:98/743 [restart] + 870 [restart 846] + 878 [trailer compression=none checksum=0x545483dc] + 883 footer (53) + 883 checksum type: crc32c + 884 meta: offset=846, length=32 + 887 index: offset=71, length=22 + 889 [padding] + 924 version: 4 + 928 magic number: 0xf09faab3f09faab3 + 936 EOF diff --git a/sstable/writer.go b/sstable/writer.go index 83ebad4ca0..00e276b0f9 100644 --- a/sstable/writer.go +++ b/sstable/writer.go @@ -132,6 +132,8 @@ type Writer struct { separator Separator successor Successor tableFormat TableFormat + isStrictObsolete bool + writingToLowestLevel bool cache *cache.Cache restartInterval int checksumType ChecksumType @@ -166,6 +168,7 @@ type Writer struct { props Properties propCollectors []TablePropertyCollector blockPropCollectors []BlockPropertyCollector + obsoleteCollector obsoleteKeyBlockPropertyCollector blockPropsEncoder blockPropertiesEncoder // filter accumulates the filter block. If populated, the filter ingests // either the output of w.split (i.e. a prefix extractor) if w.split is not @@ -673,7 +676,12 @@ func (w *Writer) Set(key, value []byte) error { if w.err != nil { return w.err } - return w.addPoint(base.MakeInternalKey(key, 0, InternalKeyKindSet), value) + if w.isStrictObsolete { + return errors.Errorf("use AddWithForceObsolete") + } + // forceObsolete is false based on the assumption that no RANGEDELs in the + // sstable delete the added points. + return w.addPoint(base.MakeInternalKey(key, 0, InternalKeyKindSet), value, false) } // Delete deletes the value for the given key. The sequence number is set to @@ -685,7 +693,12 @@ func (w *Writer) Delete(key []byte) error { if w.err != nil { return w.err } - return w.addPoint(base.MakeInternalKey(key, 0, InternalKeyKindDelete), nil) + if w.isStrictObsolete { + return errors.Errorf("use AddWithForceObsolete") + } + // forceObsolete is false based on the assumption that no RANGEDELs in the + // sstable delete the added points. + return w.addPoint(base.MakeInternalKey(key, 0, InternalKeyKindDelete), nil, false) } // DeleteRange deletes all of the keys (and values) in the range [start,end) @@ -698,6 +711,9 @@ func (w *Writer) DeleteRange(start, end []byte) error { if w.err != nil { return w.err } + if w.isStrictObsolete { + return errors.Errorf("use AddWithForceObsolete") + } return w.addTombstone(base.MakeInternalKey(start, 0, InternalKeyKindRangeDelete), end) } @@ -711,7 +727,13 @@ func (w *Writer) Merge(key, value []byte) error { if w.err != nil { return w.err } - return w.addPoint(base.MakeInternalKey(key, 0, InternalKeyKindMerge), value) + if w.isStrictObsolete { + return errors.Errorf("use AddWithForceObsolete") + } + // forceObsolete is false based on the assumption that no RANGEDELs in the + // sstable that delete the added points. If the user configured this writer + // to be strict-obsolete, addPoint will reject the addition of this MERGE. + return w.addPoint(base.MakeInternalKey(key, 0, InternalKeyKindMerge), value, false) } // Add adds a key/value pair to the table being written. For a given Writer, @@ -721,6 +743,24 @@ func (w *Writer) Merge(key, value []byte) error { // point entries. Additionally, range deletion tombstones must be fragmented // (i.e. by keyspan.Fragmenter). func (w *Writer) Add(key InternalKey, value []byte) error { + if w.isStrictObsolete { + return errors.Errorf("use AddWithForceObsolete") + } + return w.AddWithForceObsolete(key, value, false) +} + +// AddWithForceObsolete must be used when writing a strict-obsolete sstable. +// +// forceObsolete indicates whether the caller has determined that this key is +// obsolete even though it may be the latest point key for this userkey. This +// should be set to true for keys obsoleted by RANGEDELs, and is required for +// strict-obsolete sstables. +// +// Note that there are two properties, S1 and S2 (see comment in format.go) +// that strict-obsolete ssts must satisfy. S2, due to RANGEDELs, is solely the +// responsibility of the caller. S1 is solely the responsibility of the +// callee. +func (w *Writer) AddWithForceObsolete(key InternalKey, value []byte, forceObsolete bool) error { if w.err != nil { return w.err } @@ -735,7 +775,7 @@ func (w *Writer) Add(key InternalKey, value []byte) error { "pebble: range keys must be added via one of the RangeKey* functions") return w.err } - return w.addPoint(key, value) + return w.addPoint(key, value, forceObsolete) } func (w *Writer) makeAddPointDecisionV2(key InternalKey) error { @@ -745,12 +785,7 @@ func (w *Writer) makeAddPointDecisionV2(key InternalKey) error { return nil } if !w.disableKeyOrderChecks { - prevPointUserKey := w.dataBlockBuf.dataBlock.curKey - n := len(prevPointUserKey) - base.InternalTrailerLen - if n < 0 { - panic("corrupt key in blockWriter buffer") - } - prevPointUserKey = prevPointUserKey[:n:n] + prevPointUserKey := w.dataBlockBuf.dataBlock.getCurUserKey() cmpUser := w.compare(prevPointUserKey, key.UserKey) if cmpUser > 0 || (cmpUser == 0 && prevTrailer <= key.Trailer) { return errors.Errorf( @@ -762,9 +797,17 @@ func (w *Writer) makeAddPointDecisionV2(key InternalKey) error { return nil } +// REQUIRES: at least one point has been written to the Writer. +func (w *Writer) getLastPointUserKey() []byte { + if w.dataBlockBuf.dataBlock.nEntries == 0 { + panic(errors.AssertionFailedf("no point keys added to writer")) + } + return w.dataBlockBuf.dataBlock.getCurUserKey() +} + func (w *Writer) makeAddPointDecisionV3( key InternalKey, valueLen int, -) (setHasSamePrefix bool, writeToValueBlock bool, err error) { +) (setHasSamePrefix bool, writeToValueBlock bool, isObsolete bool, err error) { prevPointKeyInfo := w.lastPointKeyInfo w.lastPointKeyInfo.userKeyLen = len(key.UserKey) w.lastPointKeyInfo.prefixLen = w.lastPointKeyInfo.userKeyLen @@ -772,18 +815,15 @@ func (w *Writer) makeAddPointDecisionV3( w.lastPointKeyInfo.prefixLen = w.split(key.UserKey) } w.lastPointKeyInfo.trailer = key.Trailer - if w.dataBlockBuf.dataBlock.nEntries == 0 { - return false, false, nil - } - prevPointUserKey := w.dataBlockBuf.dataBlock.curKey - n := len(prevPointUserKey) - base.InternalTrailerLen - if n < 0 { - panic("corrupt key in blockWriter buffer") + if !w.meta.HasPointKeys { + return false, false, false, nil } - prevPointUserKey = prevPointUserKey[:n:n] + keyKind := base.TrailerKind(key.Trailer) + prevPointUserKey := w.getLastPointUserKey() prevPointKey := InternalKey{UserKey: prevPointUserKey, Trailer: prevPointKeyInfo.trailer} - considerWriteToValueBlock := base.TrailerKind(prevPointKeyInfo.trailer) == InternalKeyKindSet && - base.TrailerKind(key.Trailer) == InternalKeyKindSet + prevKeyKind := base.TrailerKind(prevPointKeyInfo.trailer) + considerWriteToValueBlock := prevKeyKind == InternalKeyKindSet && + keyKind == InternalKeyKindSet if considerWriteToValueBlock && !w.requiredInPlaceValueBound.IsEmpty() { keyPrefix := key.UserKey[:w.lastPointKeyInfo.prefixLen] cmpUpper := w.compare( @@ -812,14 +852,39 @@ func (w *Writer) makeAddPointDecisionV3( } else { cmpUser = w.compare(prevPointUserKey, key.UserKey) } + keyKindIsSetOrMerge := keyKind == InternalKeyKindSet || keyKind == InternalKeyKindSetWithDelete || + keyKind == InternalKeyKindMerge + keyKindIsPointDelete := keyKind == InternalKeyKindDelete || + keyKind == InternalKeyKindSingleDelete || keyKind == InternalKeyKindDeleteSized + if !keyKindIsSetOrMerge && !keyKindIsPointDelete { + panic(errors.AssertionFailedf("unexpected key kind %s", keyKind.String())) + } + // If same user key, then the current key is obsolete if: + // - The prev key was not a MERGE. + // - If the prev key was a MERGE, then as long as the current key is not a + // SET or SETWITHDELETE or MERGE, then it is obsolete. SET, SETWITHDELETE, + // MERGE are not obsolete since they will be merged. + // + // If not the same user key, but it is some kind of point delete, and we are + // writing to the lowest level, then it is also obsolete. + isObsolete = (cmpUser == 0 && (prevKeyKind != InternalKeyKindMerge || !keyKindIsSetOrMerge)) || + (w.writingToLowestLevel && + (keyKind == InternalKeyKindDelete || keyKind == InternalKeyKindSingleDelete || + keyKind == InternalKeyKindDeleteSized)) + // TODO(sumeer): storing isObsolete SET and SETWITHDEL in value blocks is + // possible, but requires some care in documenting and checking invariants. + // There is code that assumes nothing in value blocks because of single MVCC + // version (those should be ok). We have to ensure setHasSamePrefix is + // correctly initialized here etc. + if !w.disableKeyOrderChecks && (cmpUser > 0 || (cmpUser == 0 && prevPointKeyInfo.trailer <= key.Trailer)) { - return false, false, errors.Errorf( + return false, false, false, errors.Errorf( "pebble: keys must be added in strictly increasing order: %s, %s", prevPointKey.Pretty(w.formatKey), key.Pretty(w.formatKey)) } if !considerWriteToValueBlock { - return false, false, nil + return false, false, isObsolete, nil } // NB: it is possible that cmpUser == 0, i.e., these two SETs have identical // user keys (because of an open snapshot). This should be the rare case. @@ -832,19 +897,24 @@ func (w *Writer) makeAddPointDecisionV3( if considerWriteToValueBlock && valueLen <= tinyValueThreshold { considerWriteToValueBlock = false } - return setHasSamePrefix, considerWriteToValueBlock, nil + return setHasSamePrefix, considerWriteToValueBlock, isObsolete, nil } -func (w *Writer) addPoint(key InternalKey, value []byte) error { +func (w *Writer) addPoint(key InternalKey, value []byte, forceObsolete bool) error { + if w.isStrictObsolete && key.Kind() == InternalKeyKindMerge { + return errors.Errorf("MERGE not supported in a strict-obsolete sstable") + } var err error var setHasSameKeyPrefix, writeToValueBlock, addPrefixToValueStoredWithKey bool + var isObsolete bool maxSharedKeyLen := len(key.UserKey) if w.valueBlockWriter != nil { // maxSharedKeyLen is limited to the prefix of the preceding key. If the // preceding key was in a different block, then the blockWriter will // ignore this maxSharedKeyLen. maxSharedKeyLen = w.lastPointKeyInfo.prefixLen - setHasSameKeyPrefix, writeToValueBlock, err = w.makeAddPointDecisionV3(key, len(value)) + setHasSameKeyPrefix, writeToValueBlock, isObsolete, err = + w.makeAddPointDecisionV3(key, len(value)) addPrefixToValueStoredWithKey = base.TrailerKind(key.Trailer) == InternalKeyKindSet } else { err = w.makeAddPointDecisionV2(key) @@ -852,6 +922,7 @@ func (w *Writer) addPoint(key InternalKey, value []byte) error { if err != nil { return err } + isObsolete = w.tableFormat >= TableFormatPebblev4 && (isObsolete || forceObsolete) var valueStoredWithKey []byte var prefix valuePrefix var valueStoredWithKeyLen int @@ -907,16 +978,19 @@ func (w *Writer) addPoint(key InternalKey, value []byte) error { return err } } + if w.tableFormat >= TableFormatPebblev4 { + w.obsoleteCollector.AddPoint(isObsolete) + } w.maybeAddToFilter(key.UserKey) w.dataBlockBuf.dataBlock.addWithOptionalValuePrefix( - key, valueStoredWithKey, maxSharedKeyLen, addPrefixToValueStoredWithKey, prefix, + key, isObsolete, valueStoredWithKey, maxSharedKeyLen, addPrefixToValueStoredWithKey, prefix, setHasSameKeyPrefix) w.meta.updateSeqNum(key.SeqNum()) if !w.meta.HasPointKeys { - k := base.DecodeInternalKey(w.dataBlockBuf.dataBlock.curKey) + k := w.dataBlockBuf.dataBlock.getCurKey() // NB: We need to ensure that SmallestPoint.UserKey is set, so we create // an InternalKey which is semantically identical to the key, but won't // have a nil UserKey. We do this, because key.UserKey could be nil, and @@ -968,7 +1042,7 @@ func (w *Writer) addTombstone(key InternalKey, value []byte) error { if !w.disableKeyOrderChecks && !w.rangeDelV1Format && w.rangeDelBlock.nEntries > 0 { // Check that tombstones are being added in fragmented order. If the two // tombstones overlap, their start and end keys must be identical. - prevKey := base.DecodeInternalKey(w.rangeDelBlock.curKey) + prevKey := w.rangeDelBlock.getCurKey() switch c := w.compare(prevKey.UserKey, key.UserKey); { case c > 0: w.err = errors.Errorf("pebble: keys must be added in order: %s, %s", @@ -1164,7 +1238,7 @@ func (w *Writer) encodeRangeKeySpan(span keyspan.Span) { func (w *Writer) addRangeKey(key InternalKey, value []byte) error { if !w.disableKeyOrderChecks && w.rangeKeyBlock.nEntries > 0 { - prevStartKey := base.DecodeInternalKey(w.rangeKeyBlock.curKey) + prevStartKey := w.rangeKeyBlock.getCurKey() prevEndKey, _, ok := rangekey.DecodeEndKey(prevStartKey.Kind(), w.rangeKeyBlock.curValue) if !ok { // We panic here as we should have previously decoded and validated this @@ -1310,7 +1384,7 @@ func (w *Writer) flush(key InternalKey) error { // dataBlockBuf is added back to a sync.Pool. In this particular case, the // byte slice which supports "sep" will eventually be copied when "sep" is // added to the index block. - prevKey := base.DecodeInternalKey(w.dataBlockBuf.dataBlock.curKey) + prevKey := w.dataBlockBuf.dataBlock.getCurKey() sep := w.indexEntrySep(prevKey, key, w.dataBlockBuf) // We determine that we should flush an index block from the Writer client // goroutine, but we actually finish the index block from the writeQueue. @@ -1600,7 +1674,7 @@ func (w *Writer) finishIndexBlock(indexBuf *indexBlockBuf, props []byte) error { nEntries: indexBuf.block.nEntries, properties: props, } w.indexSepAlloc, part.sep = cloneKeyWithBuf( - base.DecodeInternalKey(indexBuf.block.curKey), w.indexSepAlloc, + indexBuf.block.getCurKey(), w.indexSepAlloc, ) bk := indexBuf.finish() if len(w.indexBlockAlloc) < len(bk) { @@ -1804,7 +1878,7 @@ func (w *Writer) Close() (err error) { // however, if a dataBlock is flushed, then we add a key to the new w.dataBlockBuf in the // addPoint function after the flush occurs. if w.dataBlockBuf.dataBlock.nEntries >= 1 { - w.meta.SetLargestPointKey(base.DecodeInternalKey(w.dataBlockBuf.dataBlock.curKey).Clone()) + w.meta.SetLargestPointKey(w.dataBlockBuf.dataBlock.getCurKey().Clone()) } // Finish the last data block, or force an empty data block if there @@ -1818,7 +1892,7 @@ func (w *Writer) Close() (err error) { if err != nil { return err } - prevKey := base.DecodeInternalKey(w.dataBlockBuf.dataBlock.curKey) + prevKey := w.dataBlockBuf.dataBlock.getCurKey() if err := w.addIndexEntrySync(prevKey, InternalKey{}, bhp, w.dataBlockBuf.tmp[:]); err != nil { return err } @@ -1897,7 +1971,7 @@ func (w *Writer) Close() (err error) { var rangeKeyBH BlockHandle if w.props.NumRangeKeys() > 0 { - key := base.DecodeInternalKey(w.rangeKeyBlock.curKey) + key := w.rangeKeyBlock.getCurKey() kind := key.Kind() endKey, _, ok := rangekey.DecodeEndKey(kind, w.rangeKeyBlock.curValue) if !ok { @@ -2086,7 +2160,7 @@ func (o PreviousPointKeyOpt) UnsafeKey() base.InternalKey { if o.w.dataBlockBuf.dataBlock.nEntries >= 1 { // o.w.dataBlockBuf.dataBlock.curKey is guaranteed to point to the last point key // which was added to the Writer. - return base.DecodeInternalKey(o.w.dataBlockBuf.dataBlock.curKey) + return o.w.dataBlockBuf.dataBlock.getCurKey() } return base.InternalKey{} } @@ -2115,6 +2189,8 @@ func NewWriter(writable objstorage.Writable, o WriterOptions, extraOpts ...Write separator: o.Comparer.Separator, successor: o.Comparer.Successor, tableFormat: o.TableFormat, + isStrictObsolete: o.IsStrictObsolete, + writingToLowestLevel: o.WritingToLowestLevel, cache: o.Cache, restartInterval: o.BlockRestartInterval, checksumType: o.Checksum, @@ -2188,7 +2264,8 @@ func NewWriter(writable objstorage.Writable, o WriterOptions, extraOpts ...Write w.props.PropertyCollectorNames = "[]" w.props.ExternalFormatVersion = rocksDBExternalFormatVersion - if len(o.TablePropertyCollectors) > 0 || len(o.BlockPropertyCollectors) > 0 { + if len(o.TablePropertyCollectors) > 0 || len(o.BlockPropertyCollectors) > 0 || + w.tableFormat >= TableFormatPebblev4 { var buf bytes.Buffer buf.WriteString("[") if len(o.TablePropertyCollectors) > 0 { @@ -2201,16 +2278,22 @@ func NewWriter(writable objstorage.Writable, o WriterOptions, extraOpts ...Write buf.WriteString(w.propCollectors[i].Name()) } } + numBlockPropertyCollectors := len(o.BlockPropertyCollectors) + if w.tableFormat >= TableFormatPebblev4 { + numBlockPropertyCollectors++ + } + // shortID is a uint8, so we cannot exceed that number of block + // property collectors. + if numBlockPropertyCollectors > math.MaxUint8 { + w.err = errors.New("pebble: too many block property collectors") + return w + } + if numBlockPropertyCollectors > 0 { + w.blockPropCollectors = make([]BlockPropertyCollector, numBlockPropertyCollectors) + } if len(o.BlockPropertyCollectors) > 0 { - // shortID is a uint8, so we cannot exceed that number of block - // property collectors. - if len(o.BlockPropertyCollectors) > math.MaxUint8 { - w.err = errors.New("pebble: too many block property collectors") - return w - } // The shortID assigned to a collector is the same as its index in // this slice. - w.blockPropCollectors = make([]BlockPropertyCollector, len(o.BlockPropertyCollectors)) for i := range o.BlockPropertyCollectors { w.blockPropCollectors[i] = o.BlockPropertyCollectors[i]() if i > 0 || len(o.TablePropertyCollectors) > 0 { @@ -2219,6 +2302,13 @@ func NewWriter(writable objstorage.Writable, o WriterOptions, extraOpts ...Write buf.WriteString(w.blockPropCollectors[i].Name()) } } + if w.tableFormat >= TableFormatPebblev4 { + if numBlockPropertyCollectors > 1 || len(o.TablePropertyCollectors) > 0 { + buf.WriteString(",") + } + w.blockPropCollectors[numBlockPropertyCollectors-1] = &w.obsoleteCollector + buf.WriteString(w.obsoleteCollector.Name()) + } buf.WriteString("]") w.props.PropertyCollectorNames = buf.String() } @@ -2252,3 +2342,84 @@ func init() { } private.SSTableInternalProperties = internalGetProperties } + +type obsoleteKeyBlockPropertyCollector struct { + blockIsNonObsolete bool + indexIsNonObsolete bool +} + +func (o *obsoleteKeyBlockPropertyCollector) Name() string { + return "obsolete-key" +} + +func (o *obsoleteKeyBlockPropertyCollector) Add(key InternalKey, value []byte) error { + // Ignore. + return nil +} + +func (o *obsoleteKeyBlockPropertyCollector) AddPoint(isObsolete bool) { + o.blockIsNonObsolete = o.blockIsNonObsolete || !isObsolete +} + +func (o *obsoleteKeyBlockPropertyCollector) FinishDataBlock(buf []byte) ([]byte, error) { + if o.blockIsNonObsolete { + return buf, nil + } + buf = append(buf, 't') + return buf, nil +} + +func (o *obsoleteKeyBlockPropertyCollector) AddPrevDataBlockToIndexBlock() { + o.indexIsNonObsolete = o.indexIsNonObsolete || o.blockIsNonObsolete + o.blockIsNonObsolete = false +} + +func (o *obsoleteKeyBlockPropertyCollector) FinishIndexBlock(buf []byte) ([]byte, error) { + indexIsNonObsolete := o.indexIsNonObsolete + o.indexIsNonObsolete = false + if indexIsNonObsolete { + return buf, nil + } + buf = append(buf, 't') + return buf, nil +} + +func (o *obsoleteKeyBlockPropertyCollector) FinishTable(buf []byte) ([]byte, error) { + // Table as a whole is never obsolete. + return buf, nil +} + +func (o *obsoleteKeyBlockPropertyCollector) UpdateKeySuffixes( + oldProp []byte, oldSuffix, newSuffix []byte, +) error { + isObsolete, err := propToIsObsolete(oldProp) + if err != nil { + return err + } + o.blockIsNonObsolete = !isObsolete + return nil +} + +// NB: obsoleteKeyBlockPropertyFilter is stateless. This aspect of the filter +// is used in table_cache.go for in-place modification of a filters slice. +type obsoleteKeyBlockPropertyFilter struct{} + +func (o obsoleteKeyBlockPropertyFilter) Name() string { + return "obsolete-key" +} + +// Intersects returns true if the set represented by prop intersects with +// the set in the filter. +func (o obsoleteKeyBlockPropertyFilter) Intersects(prop []byte) (bool, error) { + return propToIsObsolete(prop) +} + +func propToIsObsolete(prop []byte) (bool, error) { + if len(prop) == 0 { + return true, nil + } + if len(prop) > 1 || prop[0] != 't' { + return false, errors.Errorf("unexpected property %x", prop) + } + return false, nil +} diff --git a/sstable/writer_test.go b/sstable/writer_test.go index 4d504a8778..bb5a47ddfe 100644 --- a/sstable/writer_test.go +++ b/sstable/writer_test.go @@ -883,7 +883,7 @@ func TestWriterRace(t *testing.T) { w.Add(base.MakeInternalKey(keys[ki], uint64(ki), InternalKeyKindSet), val), ) require.Equal( - t, base.DecodeInternalKey(w.dataBlockBuf.dataBlock.curKey).UserKey, keys[ki], + t, w.dataBlockBuf.dataBlock.getCurKey().UserKey, keys[ki], ) } require.NoError(t, w.Close()) diff --git a/table_cache.go b/table_cache.go index 7c416a737d..9c3b9d7fc9 100644 --- a/table_cache.go +++ b/table_cache.go @@ -375,6 +375,26 @@ func (c *tableCacheShard) newIters( return nil, nil, v.err } + hideObsoletePoints := false + if opts != nil { + // This code is appending (at most one filter) in-place to + // opts.PointKeyFilters even though the slice is shared for iterators in + // the same iterator tree. This is acceptable since all the following + // properties are true: + // - The iterator tree is single threaded, so the shared backing for the + // slice is being mutated in a single threaded manner. + // - Each shallow copy of the slice has its own notion of length. + // - The appended element is always the obsoleteKeyBlockPropertyFilter + // struct, which is stateless, so overwriting that struct when creating + // one sstable iterator is harmless to other sstable iterators that are + // relying on that struct. + // + // An alternative would be to have different slices for different sstable + // iterators, but that requires more work to avoid allocations. + hideObsoletePoints, opts.PointKeyFilters = + v.reader.TryAddBlockPropertyFilterForHideObsoletePoints( + opts.snapshotForHideObsoletePoints, file.LargestSeqNum, opts.PointKeyFilters) + } ok := true var filterer *sstable.BlockPropertiesFilterer var err error @@ -390,14 +410,7 @@ func (c *tableCacheShard) newIters( type iterCreator interface { NewFixedSeqnumRangeDelIter(seqNum uint64) (keyspan.FragmentIterator, error) NewRawRangeDelIter() (keyspan.FragmentIterator, error) - NewIterWithBlockPropertyFiltersAndContext( - ctx context.Context, - lower, upper []byte, - filterer *sstable.BlockPropertiesFilterer, - useFilterBlock bool, - stats *base.InternalIteratorStats, - rp sstable.ReaderProvider, - ) (sstable.Iterator, error) + NewIterWithBlockPropertyFiltersAndContextEtc(ctx context.Context, lower, upper []byte, filterer *sstable.BlockPropertiesFilterer, hideObsoletePoints, useFilterBlock bool, stats *base.InternalIteratorStats, rp sstable.ReaderProvider) (sstable.Iterator, error) NewCompactionIter( bytesIterated *uint64, rp sstable.ReaderProvider, @@ -484,11 +497,9 @@ func (c *tableCacheShard) newIters( if internalOpts.bytesIterated != nil { iter, err = ic.NewCompactionIter(internalOpts.bytesIterated, rp) } else { - iter, err = ic.NewIterWithBlockPropertyFiltersAndContext( - ctx, - opts.GetLowerBound(), opts.GetUpperBound(), - filterer, useFilter, internalOpts.stats, rp, - ) + iter, err = ic.NewIterWithBlockPropertyFiltersAndContextEtc( + ctx, opts.GetLowerBound(), opts.GetUpperBound(), filterer, hideObsoletePoints, useFilter, + internalOpts.stats, rp) } if err != nil { if rangeDelIter != nil { diff --git a/testdata/checkpoint b/testdata/checkpoint index 986d23b111..9c38a7c932 100644 --- a/testdata/checkpoint +++ b/testdata/checkpoint @@ -256,23 +256,23 @@ close: db/000009.sst sync: db sync: db/MANIFEST-000001 open: db/000005.sst -read-at(739, 53): db/000005.sst -read-at(702, 37): db/000005.sst -read-at(74, 628): db/000005.sst +read-at(767, 53): db/000005.sst +read-at(730, 37): db/000005.sst +read-at(74, 656): db/000005.sst read-at(47, 27): db/000005.sst open: db/000005.sst close: db/000005.sst open: db/000009.sst -read-at(734, 53): db/000009.sst -read-at(697, 37): db/000009.sst -read-at(69, 628): db/000009.sst +read-at(762, 53): db/000009.sst +read-at(725, 37): db/000009.sst +read-at(69, 656): db/000009.sst read-at(42, 27): db/000009.sst open: db/000009.sst close: db/000009.sst open: db/000007.sst -read-at(739, 53): db/000007.sst -read-at(702, 37): db/000007.sst -read-at(74, 628): db/000007.sst +read-at(767, 53): db/000007.sst +read-at(730, 37): db/000007.sst +read-at(74, 656): db/000007.sst read-at(47, 27): db/000007.sst open: db/000007.sst close: db/000007.sst @@ -341,15 +341,15 @@ close: checkpoints/checkpoint1/000006.log scan checkpoints/checkpoint1 ---- open: checkpoints/checkpoint1/000007.sst -read-at(739, 53): checkpoints/checkpoint1/000007.sst -read-at(702, 37): checkpoints/checkpoint1/000007.sst -read-at(74, 628): checkpoints/checkpoint1/000007.sst +read-at(767, 53): checkpoints/checkpoint1/000007.sst +read-at(730, 37): checkpoints/checkpoint1/000007.sst +read-at(74, 656): checkpoints/checkpoint1/000007.sst read-at(47, 27): checkpoints/checkpoint1/000007.sst read-at(0, 47): checkpoints/checkpoint1/000007.sst open: checkpoints/checkpoint1/000005.sst -read-at(739, 53): checkpoints/checkpoint1/000005.sst -read-at(702, 37): checkpoints/checkpoint1/000005.sst -read-at(74, 628): checkpoints/checkpoint1/000005.sst +read-at(767, 53): checkpoints/checkpoint1/000005.sst +read-at(730, 37): checkpoints/checkpoint1/000005.sst +read-at(74, 656): checkpoints/checkpoint1/000005.sst read-at(47, 27): checkpoints/checkpoint1/000005.sst read-at(0, 47): checkpoints/checkpoint1/000005.sst a 1 @@ -364,9 +364,9 @@ g 10 scan db ---- open: db/000010.sst -read-at(766, 53): db/000010.sst -read-at(729, 37): db/000010.sst -read-at(101, 628): db/000010.sst +read-at(794, 53): db/000010.sst +read-at(757, 37): db/000010.sst +read-at(101, 656): db/000010.sst read-at(74, 27): db/000010.sst read-at(0, 74): db/000010.sst a 1 @@ -406,9 +406,9 @@ close: checkpoints/checkpoint2/000006.log scan checkpoints/checkpoint2 ---- open: checkpoints/checkpoint2/000007.sst -read-at(739, 53): checkpoints/checkpoint2/000007.sst -read-at(702, 37): checkpoints/checkpoint2/000007.sst -read-at(74, 628): checkpoints/checkpoint2/000007.sst +read-at(767, 53): checkpoints/checkpoint2/000007.sst +read-at(730, 37): checkpoints/checkpoint2/000007.sst +read-at(74, 656): checkpoints/checkpoint2/000007.sst read-at(47, 27): checkpoints/checkpoint2/000007.sst read-at(0, 47): checkpoints/checkpoint2/000007.sst b 5 @@ -446,15 +446,15 @@ close: checkpoints/checkpoint3/000006.log scan checkpoints/checkpoint3 ---- open: checkpoints/checkpoint3/000007.sst -read-at(739, 53): checkpoints/checkpoint3/000007.sst -read-at(702, 37): checkpoints/checkpoint3/000007.sst -read-at(74, 628): checkpoints/checkpoint3/000007.sst +read-at(767, 53): checkpoints/checkpoint3/000007.sst +read-at(730, 37): checkpoints/checkpoint3/000007.sst +read-at(74, 656): checkpoints/checkpoint3/000007.sst read-at(47, 27): checkpoints/checkpoint3/000007.sst read-at(0, 47): checkpoints/checkpoint3/000007.sst open: checkpoints/checkpoint3/000005.sst -read-at(739, 53): checkpoints/checkpoint3/000005.sst -read-at(702, 37): checkpoints/checkpoint3/000005.sst -read-at(74, 628): checkpoints/checkpoint3/000005.sst +read-at(767, 53): checkpoints/checkpoint3/000005.sst +read-at(730, 37): checkpoints/checkpoint3/000005.sst +read-at(74, 656): checkpoints/checkpoint3/000005.sst read-at(47, 27): checkpoints/checkpoint3/000005.sst read-at(0, 47): checkpoints/checkpoint3/000005.sst a 1 diff --git a/testdata/compaction_delete_only_hints b/testdata/compaction_delete_only_hints index 6bee9d15e6..83606100dd 100644 --- a/testdata/compaction_delete_only_hints +++ b/testdata/compaction_delete_only_hints @@ -88,7 +88,7 @@ maybe-compact Deletion hints: (none) Compactions: - [JOB 100] compacted(delete-only) L2 [000005] (786 B) + L3 [000006] (786 B) -> L6 [] (0 B), in 1.0s (2.0s total), output rate 0 B/s + [JOB 100] compacted(delete-only) L2 [000005] (814 B) + L3 [000006] (814 B) -> L6 [] (0 B), in 1.0s (2.0s total), output rate 0 B/s # Verify that compaction correctly handles the presence of multiple # overlapping hints which might delete a file multiple times. All of the @@ -127,7 +127,7 @@ maybe-compact Deletion hints: (none) Compactions: - [JOB 100] compacted(delete-only) L2 [000006] (786 B) + L3 [000007] (786 B) -> L6 [] (0 B), in 1.0s (2.0s total), output rate 0 B/s + [JOB 100] compacted(delete-only) L2 [000006] (814 B) + L3 [000007] (814 B) -> L6 [] (0 B), in 1.0s (2.0s total), output rate 0 B/s # Test a range tombstone that is already compacted into L6. @@ -206,7 +206,7 @@ maybe-compact Deletion hints: (none) Compactions: - [JOB 100] compacted(delete-only) L2 [000005] (786 B) + L3 [000006] (786 B) -> L6 [] (0 B), in 1.0s (2.0s total), output rate 0 B/s + [JOB 100] compacted(delete-only) L2 [000005] (814 B) + L3 [000006] (814 B) -> L6 [] (0 B), in 1.0s (2.0s total), output rate 0 B/s # A deletion hint present on an sstable in a higher level should NOT result in a # deletion-only compaction incorrectly removing an sstable in L6 following an @@ -255,7 +255,7 @@ L0.000001 a-z seqnums(tombstone=5-27, file-smallest=0, type=point-key-only) close-snapshot 10 ---- -[JOB 100] compacted(elision-only) L6 [000004] (850 B) + L6 [] (0 B) -> L6 [000005] (771 B), in 1.0s (2.0s total), output rate 771 B/s +[JOB 100] compacted(elision-only) L6 [000004] (878 B) + L6 [] (0 B) -> L6 [000005] (799 B), in 1.0s (2.0s total), output rate 799 B/s # The deletion hint was removed by the elision-only compaction. get-hints @@ -414,4 +414,4 @@ maybe-compact Deletion hints: (none) Compactions: - [JOB 100] compacted(delete-only) L6 [000006 000007 000008 000009 000011] (4.5 K) -> L6 [] (0 B), in 1.0s (2.0s total), output rate 0 B/s + [JOB 100] compacted(delete-only) L6 [000006 000007 000008 000009 000011] (4.6 K) -> L6 [] (0 B), in 1.0s (2.0s total), output rate 0 B/s diff --git a/testdata/compaction_iter b/testdata/compaction_iter index 016cdbd5bf..a4e73827d5 100644 --- a/testdata/compaction_iter +++ b/testdata/compaction_iter @@ -238,7 +238,7 @@ a.SET.1:c iter first ---- -err=invalid internal key kind: 255 +err=invalid internal key kind: 191 define a.SET.2:b @@ -250,7 +250,7 @@ first next ---- a#2,1:b -err=invalid internal key kind: 255 +err=invalid internal key kind: 191 define a.MERGE.2:b @@ -262,7 +262,7 @@ first next ---- a#2,2:b -err=invalid internal key kind: 255 +err=invalid internal key kind: 191 define a.INVALID.2:c @@ -273,7 +273,7 @@ iter first tombstones ---- -err=invalid internal key kind: 255 +err=invalid internal key kind: 191 . define diff --git a/testdata/compaction_iter_delete_sized b/testdata/compaction_iter_delete_sized index c1663fcbfe..0483fd530a 100644 --- a/testdata/compaction_iter_delete_sized +++ b/testdata/compaction_iter_delete_sized @@ -238,7 +238,7 @@ a.SET.1:c iter first ---- -err=invalid internal key kind: 255 +err=invalid internal key kind: 191 define a.SET.2:b @@ -250,7 +250,7 @@ first next ---- a#2,18:b -err=invalid internal key kind: 255 +err=invalid internal key kind: 191 define a.MERGE.2:b @@ -262,7 +262,7 @@ first next ---- a#2,2:b -err=invalid internal key kind: 255 +err=invalid internal key kind: 191 define a.INVALID.2:c @@ -273,7 +273,7 @@ iter first tombstones ---- -err=invalid internal key kind: 255 +err=invalid internal key kind: 191 . define @@ -1338,7 +1338,7 @@ first next ---- a#2,18:b -err=invalid internal key kind: 255 +err=invalid internal key kind: 191 define a.SET.3:c @@ -1351,7 +1351,7 @@ first next ---- a#3,18:c -err=invalid internal key kind: 255 +err=invalid internal key kind: 191 # SINGLEDEL that meets a SETWITHDEL is transformed into a DEL. diff --git a/testdata/compaction_iter_set_with_del b/testdata/compaction_iter_set_with_del index 796984e0c7..0cb35f6b29 100644 --- a/testdata/compaction_iter_set_with_del +++ b/testdata/compaction_iter_set_with_del @@ -238,7 +238,7 @@ a.SET.1:c iter first ---- -err=invalid internal key kind: 255 +err=invalid internal key kind: 191 define a.SET.2:b @@ -250,7 +250,7 @@ first next ---- a#2,18:b -err=invalid internal key kind: 255 +err=invalid internal key kind: 191 define a.MERGE.2:b @@ -262,7 +262,7 @@ first next ---- a#2,2:b -err=invalid internal key kind: 255 +err=invalid internal key kind: 191 define a.INVALID.2:c @@ -273,7 +273,7 @@ iter first tombstones ---- -err=invalid internal key kind: 255 +err=invalid internal key kind: 191 . define @@ -1338,7 +1338,7 @@ first next ---- a#2,18:b -err=invalid internal key kind: 255 +err=invalid internal key kind: 191 define a.SET.3:c @@ -1351,7 +1351,7 @@ first next ---- a#3,18:c -err=invalid internal key kind: 255 +err=invalid internal key kind: 191 # SINGLEDEL that meets a SETWITHDEL is transformed into a DEL. diff --git a/testdata/compaction_tombstones b/testdata/compaction_tombstones index cfc249cef6..48dd8b2693 100644 --- a/testdata/compaction_tombstones +++ b/testdata/compaction_tombstones @@ -41,7 +41,7 @@ range-deletions-bytes-estimate: 0 maybe-compact ---- -[JOB 100] compacted(elision-only) L6 [000004] (853 B) + L6 [] (0 B) -> L6 [] (0 B), in 1.0s (2.0s total), output rate 0 B/s +[JOB 100] compacted(elision-only) L6 [000004] (884 B) + L6 [] (0 B) -> L6 [] (0 B), in 1.0s (2.0s total), output rate 0 B/s # Test a table that straddles a snapshot. It should not be compacted. define snapshots=(50) auto-compactions=off @@ -80,12 +80,12 @@ wait-pending-table-stats num-entries: 2 num-deletions: 1 num-range-key-sets: 0 -point-deletions-bytes-estimate: 108 +point-deletions-bytes-estimate: 111 range-deletions-bytes-estimate: 0 maybe-compact ---- -[JOB 100] compacted(elision-only) L6 [000004] (823 B) + L6 [] (0 B) -> L6 [000005] (772 B), in 1.0s (2.0s total), output rate 772 B/s +[JOB 100] compacted(elision-only) L6 [000004] (851 B) + L6 [] (0 B) -> L6 [000005] (800 B), in 1.0s (2.0s total), output rate 800 B/s version ---- @@ -119,7 +119,7 @@ wait-pending-table-stats num-entries: 6 num-deletions: 2 num-range-key-sets: 0 -point-deletions-bytes-estimate: 48 +point-deletions-bytes-estimate: 49 range-deletions-bytes-estimate: 66 maybe-compact @@ -134,7 +134,7 @@ close-snapshot close-snapshot 103 ---- -[JOB 100] compacted(elision-only) L6 [000004] (1001 B) + L6 [] (0 B) -> L6 [] (0 B), in 1.0s (2.0s total), output rate 0 B/s +[JOB 100] compacted(elision-only) L6 [000004] (1.0 K) + L6 [] (0 B) -> L6 [] (0 B), in 1.0s (2.0s total), output rate 0 B/s # Test a table that contains both deletions and non-deletions, but whose # non-deletions well outnumber its deletions. The table should not be @@ -152,7 +152,7 @@ wait-pending-table-stats num-entries: 11 num-deletions: 1 num-range-key-sets: 0 -point-deletions-bytes-estimate: 26 +point-deletions-bytes-estimate: 27 range-deletions-bytes-estimate: 0 close-snapshot @@ -233,7 +233,7 @@ wait-pending-table-stats num-entries: 3 num-deletions: 3 num-range-key-sets: 0 -point-deletions-bytes-estimate: 6966 +point-deletions-bytes-estimate: 6994 range-deletions-bytes-estimate: 0 # By plain file size, 000005 should be picked because it is larger and @@ -243,7 +243,7 @@ range-deletions-bytes-estimate: 0 maybe-compact ---- -[JOB 100] compacted(default) L5 [000004] (833 B) + L6 [000006] (13 K) -> L6 [] (0 B), in 1.0s (2.0s total), output rate 0 B/s +[JOB 100] compacted(default) L5 [000004] (861 B) + L6 [000006] (13 K) -> L6 [] (0 B), in 1.0s (2.0s total), output rate 0 B/s # A table containing only range keys is not eligible for elision. # RANGEKEYDEL or RANGEKEYUNSET. @@ -323,7 +323,7 @@ range-deletions-bytes-estimate: 41 maybe-compact ---- -[JOB 100] compacted(elision-only) L6 [000004] (1003 B) + L6 [] (0 B) -> L6 [000005] (778 B), in 1.0s (2.0s total), output rate 778 B/s +[JOB 100] compacted(elision-only) L6 [000004] (1.0 K) + L6 [] (0 B) -> L6 [000005] (806 B), in 1.0s (2.0s total), output rate 806 B/s # Close the DB, asserting that the reference counts balance. close @@ -359,7 +359,7 @@ wait-pending-table-stats num-entries: 2 num-deletions: 1 num-range-key-sets: 0 -point-deletions-bytes-estimate: 2786 +point-deletions-bytes-estimate: 2797 range-deletions-bytes-estimate: 0 wait-pending-table-stats @@ -373,7 +373,7 @@ range-deletions-bytes-estimate: 8246 maybe-compact ---- -[JOB 100] compacted(default) L5 [000005] (850 B) + L6 [000007] (13 K) -> L6 [000008] (4.8 K), in 1.0s (2.0s total), output rate 4.8 K/s +[JOB 100] compacted(default) L5 [000005] (878 B) + L6 [000007] (13 K) -> L6 [000008] (4.8 K), in 1.0s (2.0s total), output rate 4.8 K/s # The same LSM as above. However, this time, with point tombstone weighting at # 2x, the table with the point tombstone (000004) will be selected as the @@ -402,7 +402,7 @@ wait-pending-table-stats num-entries: 2 num-deletions: 1 num-range-key-sets: 0 -point-deletions-bytes-estimate: 2786 +point-deletions-bytes-estimate: 2797 range-deletions-bytes-estimate: 0 wait-pending-table-stats @@ -416,4 +416,4 @@ range-deletions-bytes-estimate: 8246 maybe-compact ---- -[JOB 100] compacted(default) L5 [000005] (850 B) + L6 [000007] (13 K) -> L6 [000008] (4.8 K), in 1.0s (2.0s total), output rate 4.8 K/s +[JOB 100] compacted(default) L5 [000005] (878 B) + L6 [000007] (13 K) -> L6 [000008] (4.8 K), in 1.0s (2.0s total), output rate 4.8 K/s diff --git a/testdata/event_listener b/testdata/event_listener index 9d5fb953f6..172ee775d8 100644 --- a/testdata/event_listener +++ b/testdata/event_listener @@ -127,7 +127,7 @@ close: db/marker.manifest.000002.MANIFEST-000006 remove: db/marker.manifest.000001.MANIFEST-000001 sync: db [JOB 5] MANIFEST created 000006 -[JOB 5] flushed 1 memtable to L0 [000005] (771 B), in 1.0s (2.0s total), output rate 771 B/s +[JOB 5] flushed 1 memtable to L0 [000005] (799 B), in 1.0s (2.0s total), output rate 799 B/s compact ---- @@ -151,21 +151,21 @@ close: db/marker.manifest.000003.MANIFEST-000009 remove: db/marker.manifest.000002.MANIFEST-000006 sync: db [JOB 7] MANIFEST created 000009 -[JOB 7] flushed 1 memtable to L0 [000008] (771 B), in 1.0s (2.0s total), output rate 771 B/s +[JOB 7] flushed 1 memtable to L0 [000008] (799 B), in 1.0s (2.0s total), output rate 799 B/s remove: db/MANIFEST-000001 [JOB 7] MANIFEST deleted 000001 -[JOB 8] compacting(default) L0 [000005 000008] (1.5 K) + L6 [] (0 B) +[JOB 8] compacting(default) L0 [000005 000008] (1.6 K) + L6 [] (0 B) open: db/000005.sst -read-at(718, 53): db/000005.sst -read-at(681, 37): db/000005.sst -read-at(53, 628): db/000005.sst +read-at(746, 53): db/000005.sst +read-at(709, 37): db/000005.sst +read-at(53, 656): db/000005.sst read-at(26, 27): db/000005.sst open: db/000005.sst close: db/000005.sst open: db/000008.sst -read-at(718, 53): db/000008.sst -read-at(681, 37): db/000008.sst -read-at(53, 628): db/000008.sst +read-at(746, 53): db/000008.sst +read-at(709, 37): db/000008.sst +read-at(53, 656): db/000008.sst read-at(26, 27): db/000008.sst open: db/000008.sst close: db/000008.sst @@ -188,7 +188,7 @@ close: db/marker.manifest.000004.MANIFEST-000011 remove: db/marker.manifest.000003.MANIFEST-000009 sync: db [JOB 8] MANIFEST created 000011 -[JOB 8] compacted(default) L0 [000005 000008] (1.5 K) + L6 [] (0 B) -> L6 [000010] (771 B), in 1.0s (3.0s total), output rate 771 B/s +[JOB 8] compacted(default) L0 [000005 000008] (1.6 K) + L6 [] (0 B) -> L6 [000010] (799 B), in 1.0s (3.0s total), output rate 799 B/s close: db/000005.sst close: db/000008.sst remove: db/000005.sst @@ -223,7 +223,7 @@ close: db/marker.manifest.000005.MANIFEST-000014 remove: db/marker.manifest.000004.MANIFEST-000011 sync: db [JOB 10] MANIFEST created 000014 -[JOB 10] flushed 1 memtable to L0 [000013] (771 B), in 1.0s (2.0s total), output rate 771 B/s +[JOB 10] flushed 1 memtable to L0 [000013] (799 B), in 1.0s (2.0s total), output rate 799 B/s enable-file-deletions ---- @@ -233,9 +233,9 @@ remove: db/MANIFEST-000009 ingest ---- open: ext/0 -read-at(773, 53): ext/0 -read-at(736, 37): ext/0 -read-at(53, 683): ext/0 +read-at(801, 53): ext/0 +read-at(764, 37): ext/0 +read-at(53, 711): ext/0 read-at(26, 27): ext/0 read-at(0, 26): ext/0 close: ext/0 @@ -243,9 +243,9 @@ link: ext/0 -> db/000015.sst [JOB 12] ingesting: sstable created 000015 sync: db open: db/000013.sst -read-at(718, 53): db/000013.sst -read-at(681, 37): db/000013.sst -read-at(53, 628): db/000013.sst +read-at(746, 53): db/000013.sst +read-at(709, 37): db/000013.sst +read-at(53, 656): db/000013.sst read-at(26, 27): db/000013.sst read-at(0, 26): db/000013.sst create: db/MANIFEST-000016 @@ -257,28 +257,28 @@ remove: db/marker.manifest.000005.MANIFEST-000014 sync: db [JOB 12] MANIFEST created 000016 remove: ext/0 -[JOB 12] ingested L0:000015 (826 B) +[JOB 12] ingested L0:000015 (854 B) metrics ---- __level_____count____size___score______in__ingest(sz_cnt)____move(sz_cnt)___write(sz_cnt)____read___r-amp___w-amp WAL 1 27 B - 48 B - - - - 108 B - - - 2.2 - 0 2 1.6 K 0.40 81 B 826 B 1 0 B 0 2.3 K 3 0 B 2 28.6 + 0 2 1.6 K 0.40 81 B 854 B 1 0 B 0 2.3 K 3 0 B 2 29.6 1 0 0 B 0.00 0 B 0 B 0 0 B 0 0 B 0 0 B 0 0.0 2 0 0 B 0.00 0 B 0 B 0 0 B 0 0 B 0 0 B 0 0.0 3 0 0 B 0.00 0 B 0 B 0 0 B 0 0 B 0 0 B 0 0.0 4 0 0 B 0.00 0 B 0 B 0 0 B 0 0 B 0 0 B 0 0.0 5 0 0 B 0.00 0 B 0 B 0 0 B 0 0 B 0 0 B 0 0.0 - 6 1 771 B - 1.5 K 0 B 0 0 B 0 771 B 1 1.5 K 1 0.5 - total 3 2.3 K - 934 B 826 B 1 0 B 0 3.9 K 4 1.5 K 3 4.3 + 6 1 799 B - 1.6 K 0 B 0 0 B 0 799 B 1 1.6 K 1 0.5 + total 3 2.4 K - 962 B 854 B 1 0 B 0 4.1 K 4 1.6 K 3 4.3 flush 3 0 B 0 0 (ingest = tables-ingested, move = ingested-as-flushable) -compact 1 2.3 K 0 B 0 (size == estimated-debt, score = in-progress-bytes, in = num-in-progress) +compact 1 2.4 K 0 B 0 (size == estimated-debt, score = in-progress-bytes, in = num-in-progress) ctype 1 0 0 0 0 0 0 (default, delete, elision, move, read, rewrite, multi-level) memtbl 1 256 K zmemtbl 0 0 B ztbl 0 0 B - bcache 8 1.4 K 11.1% (score == hit-rate) - tcache 1 752 B 40.0% (score == hit-rate) + bcache 8 1.5 K 11.1% (score == hit-rate) + tcache 1 760 B 40.0% (score == hit-rate) snaps 0 - 0 (score == earliest seq num) titers 0 filter - - 0.0% (score == utility) @@ -292,16 +292,16 @@ ingest-flushable ---- sync-data: wal/000012.log open: ext/a -read-at(773, 53): ext/a -read-at(736, 37): ext/a -read-at(53, 683): ext/a +read-at(801, 53): ext/a +read-at(764, 37): ext/a +read-at(53, 711): ext/a read-at(26, 27): ext/a read-at(0, 26): ext/a close: ext/a open: ext/b -read-at(773, 53): ext/b -read-at(736, 37): ext/b -read-at(53, 683): ext/b +read-at(801, 53): ext/b +read-at(764, 37): ext/b +read-at(53, 711): ext/b read-at(26, 27): ext/b read-at(0, 26): ext/b close: ext/b @@ -323,7 +323,7 @@ sync: wal [JOB 15] WAL created 000020 remove: ext/a remove: ext/b -[JOB 13] ingested as flushable 000017 (826 B), 000018 (826 B) +[JOB 13] ingested as flushable 000017 (854 B), 000018 (854 B) sync-data: wal/000020.log close: wal/000020.log create: wal/000021.log @@ -336,7 +336,7 @@ sync-data: db/000022.sst close: db/000022.sst sync: db sync: db/MANIFEST-000016 -[JOB 17] flushed 1 memtable to L0 [000022] (771 B), in 1.0s (2.0s total), output rate 771 B/s +[JOB 17] flushed 1 memtable to L0 [000022] (799 B), in 1.0s (2.0s total), output rate 799 B/s remove: db/MANIFEST-000011 [JOB 17] MANIFEST deleted 000011 [JOB 18] flushing 2 ingested tables @@ -348,7 +348,7 @@ close: db/marker.manifest.000007.MANIFEST-000023 remove: db/marker.manifest.000006.MANIFEST-000016 sync: db [JOB 18] MANIFEST created 000023 -[JOB 18] flushed 2 ingested flushables L0:000017 (826 B) + L6:000018 (826 B) in 1.0s (2.0s total), output rate 1.6 K/s +[JOB 18] flushed 2 ingested flushables L0:000017 (854 B) + L6:000018 (854 B) in 1.0s (2.0s total), output rate 1.7 K/s remove: db/MANIFEST-000014 [JOB 18] MANIFEST deleted 000014 [JOB 19] flushing 1 memtable to L0 @@ -359,22 +359,22 @@ metrics ---- __level_____count____size___score______in__ingest(sz_cnt)____move(sz_cnt)___write(sz_cnt)____read___r-amp___w-amp WAL 1 29 B - 82 B - - - - 110 B - - - 1.3 - 0 4 3.1 K 0.80 81 B 1.6 K 2 0 B 0 3.0 K 4 0 B 4 38.1 + 0 4 3.2 K 0.80 81 B 1.7 K 2 0 B 0 3.1 K 4 0 B 4 39.5 1 0 0 B 0.00 0 B 0 B 0 0 B 0 0 B 0 0 B 0 0.0 2 0 0 B 0.00 0 B 0 B 0 0 B 0 0 B 0 0 B 0 0.0 3 0 0 B 0.00 0 B 0 B 0 0 B 0 0 B 0 0 B 0 0.0 4 0 0 B 0.00 0 B 0 B 0 0 B 0 0 B 0 0 B 0 0.0 5 0 0 B 0.00 0 B 0 B 0 0 B 0 0 B 0 0 B 0 0.0 - 6 2 1.6 K - 1.5 K 826 B 1 0 B 0 771 B 1 1.5 K 1 0.5 - total 6 4.7 K - 2.5 K 2.4 K 3 0 B 0 6.3 K 5 1.5 K 5 2.5 - flush 6 1.6 K 2 1 (ingest = tables-ingested, move = ingested-as-flushable) -compact 1 4.7 K 0 B 0 (size == estimated-debt, score = in-progress-bytes, in = num-in-progress) + 6 2 1.6 K - 1.6 K 854 B 1 0 B 0 799 B 1 1.6 K 1 0.5 + total 6 4.8 K - 2.6 K 2.5 K 3 0 B 0 6.5 K 5 1.6 K 5 2.5 + flush 6 1.7 K 2 1 (ingest = tables-ingested, move = ingested-as-flushable) +compact 1 4.8 K 0 B 0 (size == estimated-debt, score = in-progress-bytes, in = num-in-progress) ctype 1 0 0 0 0 0 0 (default, delete, elision, move, read, rewrite, multi-level) memtbl 1 512 K zmemtbl 0 0 B ztbl 0 0 B - bcache 16 2.9 K 14.3% (score == hit-rate) - tcache 1 752 B 50.0% (score == hit-rate) + bcache 16 3.0 K 14.3% (score == hit-rate) + tcache 1 760 B 50.0% (score == hit-rate) snaps 0 - 0 (score == earliest seq num) titers 0 filter - - 0.0% (score == utility) diff --git a/testdata/ingest b/testdata/ingest index 3334e58254..346ee52946 100644 --- a/testdata/ingest +++ b/testdata/ingest @@ -48,12 +48,12 @@ compact 0 0 B 0 B 0 (size == esti zmemtbl 0 0 B ztbl 0 0 B bcache 8 1.5 K 42.9% (score == hit-rate) - tcache 1 752 B 50.0% (score == hit-rate) + tcache 1 760 B 50.0% (score == hit-rate) snaps 0 - 0 (score == earliest seq num) titers 0 filter - - 0.0% (score == utility) ingest 1 - + iter seek-ge a @@ -351,7 +351,7 @@ num-entries: 2 num-deletions: 2 num-range-key-sets: 0 point-deletions-bytes-estimate: 0 -range-deletions-bytes-estimate: 1666 +range-deletions-bytes-estimate: 1694 # A set operation takes precedence over a range deletion at the same # sequence number as can occur during ingestion. diff --git a/testdata/manual_compaction_file_boundaries_delsized b/testdata/manual_compaction_file_boundaries_delsized index b83e57e265..9ec939d662 100644 --- a/testdata/manual_compaction_file_boundaries_delsized +++ b/testdata/manual_compaction_file_boundaries_delsized @@ -290,60 +290,60 @@ compact a-zz L1 file-sizes ---- L2: - 000095:[a#101,1-az@1#129,1]: 7637 bytes (7.5 K) - 000096:[b@1#130,1-bz@1#156,1]: 6629 bytes (6.5 K) - 000097:[c@1#157,1-cz@1#183,1]: 6629 bytes (6.5 K) - 000098:[d@1#184,1-dz@1#210,1]: 6629 bytes (6.5 K) - 000099:[e@1#211,1-ez@1#237,1]: 6629 bytes (6.5 K) - 000100:[f@1#238,1-fz@1#264,1]: 6629 bytes (6.5 K) - 000101:[g@1#265,1-gz@1#291,1]: 6629 bytes (6.5 K) - 000102:[h@1#292,1-hz@1#318,1]: 6629 bytes (6.5 K) - 000103:[i@1#319,1-iz@1#345,1]: 6629 bytes (6.5 K) - 000104:[j@1#346,1-jz@1#372,1]: 6629 bytes (6.5 K) - 000105:[k@1#373,1-kz@1#399,1]: 6629 bytes (6.5 K) - 000106:[l@1#400,1-lz@1#426,1]: 6629 bytes (6.5 K) - 000107:[m@1#427,1-mz@1#453,1]: 6629 bytes (6.5 K) - 000108:[n@1#454,1-nz@1#480,1]: 6629 bytes (6.5 K) - 000109:[o@1#481,1-oz@1#507,1]: 6629 bytes (6.5 K) - 000110:[p@1#508,1-pz@1#534,1]: 6629 bytes (6.5 K) - 000111:[q@1#535,1-qz@1#561,1]: 6628 bytes (6.5 K) - 000112:[r@1#562,1-rz@1#588,1]: 6629 bytes (6.5 K) - 000113:[s@1#589,1-sz@1#615,1]: 6629 bytes (6.5 K) - 000114:[t@1#616,1-tz@1#642,1]: 6629 bytes (6.5 K) - 000115:[u@1#643,1-uz@1#669,1]: 6629 bytes (6.5 K) - 000116:[v@1#670,1-vz@1#696,1]: 6629 bytes (6.5 K) - 000117:[w@1#697,1-wz@1#723,1]: 6629 bytes (6.5 K) - 000118:[x@1#724,1-xz@1#750,1]: 6629 bytes (6.5 K) - 000119:[y@1#751,1-yz@1#777,1]: 6629 bytes (6.5 K) - 000120:[z#102,1-zr@1#796,1]: 5909 bytes (5.8 K) - 000121:[zs@1#797,1-zz@1#804,1]: 2491 bytes (2.4 K) + 000095:[a#101,1-az@1#129,1]: 7665 bytes (7.5 K) + 000096:[b@1#130,1-bz@1#156,1]: 6657 bytes (6.5 K) + 000097:[c@1#157,1-cz@1#183,1]: 6657 bytes (6.5 K) + 000098:[d@1#184,1-dz@1#210,1]: 6657 bytes (6.5 K) + 000099:[e@1#211,1-ez@1#237,1]: 6657 bytes (6.5 K) + 000100:[f@1#238,1-fz@1#264,1]: 6657 bytes (6.5 K) + 000101:[g@1#265,1-gz@1#291,1]: 6657 bytes (6.5 K) + 000102:[h@1#292,1-hz@1#318,1]: 6657 bytes (6.5 K) + 000103:[i@1#319,1-iz@1#345,1]: 6657 bytes (6.5 K) + 000104:[j@1#346,1-jz@1#372,1]: 6657 bytes (6.5 K) + 000105:[k@1#373,1-kz@1#399,1]: 6657 bytes (6.5 K) + 000106:[l@1#400,1-lz@1#426,1]: 6657 bytes (6.5 K) + 000107:[m@1#427,1-mz@1#453,1]: 6657 bytes (6.5 K) + 000108:[n@1#454,1-nz@1#480,1]: 6657 bytes (6.5 K) + 000109:[o@1#481,1-oz@1#507,1]: 6657 bytes (6.5 K) + 000110:[p@1#508,1-pz@1#534,1]: 6657 bytes (6.5 K) + 000111:[q@1#535,1-qz@1#561,1]: 6656 bytes (6.5 K) + 000112:[r@1#562,1-rz@1#588,1]: 6657 bytes (6.5 K) + 000113:[s@1#589,1-sz@1#615,1]: 6657 bytes (6.5 K) + 000114:[t@1#616,1-tz@1#642,1]: 6657 bytes (6.5 K) + 000115:[u@1#643,1-uz@1#669,1]: 6657 bytes (6.5 K) + 000116:[v@1#670,1-vz@1#696,1]: 6657 bytes (6.5 K) + 000117:[w@1#697,1-wz@1#723,1]: 6657 bytes (6.5 K) + 000118:[x@1#724,1-xz@1#750,1]: 6657 bytes (6.5 K) + 000119:[y@1#751,1-yz@1#777,1]: 6657 bytes (6.5 K) + 000120:[z#102,1-zr@1#796,1]: 5937 bytes (5.8 K) + 000121:[zs@1#797,1-zz@1#804,1]: 2519 bytes (2.5 K) L3: - 000005:[a#1,1-a#1,1]: 10776 bytes (10 K) - 000006:[b#2,1-b#2,1]: 10776 bytes (10 K) - 000007:[c#3,1-c#3,1]: 10776 bytes (10 K) - 000008:[d#4,1-d#4,1]: 10776 bytes (10 K) - 000009:[e#5,1-e#5,1]: 10776 bytes (10 K) - 000010:[f#6,1-f#6,1]: 10776 bytes (10 K) - 000011:[g#7,1-g#7,1]: 10776 bytes (10 K) - 000012:[h#8,1-h#8,1]: 10776 bytes (10 K) - 000013:[i#9,1-i#9,1]: 10776 bytes (10 K) - 000014:[j#10,1-j#10,1]: 10776 bytes (10 K) - 000015:[k#11,1-k#11,1]: 10776 bytes (10 K) - 000016:[l#12,1-l#12,1]: 10776 bytes (10 K) - 000017:[m#13,1-m#13,1]: 10776 bytes (10 K) - 000018:[n#14,1-n#14,1]: 10776 bytes (10 K) - 000019:[o#15,1-o#15,1]: 10776 bytes (10 K) - 000020:[p#16,1-p#16,1]: 10776 bytes (10 K) - 000021:[q#17,1-q#17,1]: 10776 bytes (10 K) - 000022:[r#18,1-r#18,1]: 10776 bytes (10 K) - 000023:[s#19,1-s#19,1]: 10776 bytes (10 K) - 000024:[t#20,1-t#20,1]: 10776 bytes (10 K) - 000025:[u#21,1-u#21,1]: 10776 bytes (10 K) - 000026:[v#22,1-v#22,1]: 10776 bytes (10 K) - 000027:[w#23,1-w#23,1]: 10776 bytes (10 K) - 000028:[x#24,1-x#24,1]: 10776 bytes (10 K) - 000029:[y#25,1-y#25,1]: 10776 bytes (10 K) - 000030:[z#26,1-z#26,1]: 10776 bytes (10 K) + 000005:[a#1,1-a#1,1]: 10804 bytes (11 K) + 000006:[b#2,1-b#2,1]: 10804 bytes (11 K) + 000007:[c#3,1-c#3,1]: 10804 bytes (11 K) + 000008:[d#4,1-d#4,1]: 10804 bytes (11 K) + 000009:[e#5,1-e#5,1]: 10804 bytes (11 K) + 000010:[f#6,1-f#6,1]: 10804 bytes (11 K) + 000011:[g#7,1-g#7,1]: 10804 bytes (11 K) + 000012:[h#8,1-h#8,1]: 10804 bytes (11 K) + 000013:[i#9,1-i#9,1]: 10804 bytes (11 K) + 000014:[j#10,1-j#10,1]: 10804 bytes (11 K) + 000015:[k#11,1-k#11,1]: 10804 bytes (11 K) + 000016:[l#12,1-l#12,1]: 10804 bytes (11 K) + 000017:[m#13,1-m#13,1]: 10804 bytes (11 K) + 000018:[n#14,1-n#14,1]: 10804 bytes (11 K) + 000019:[o#15,1-o#15,1]: 10804 bytes (11 K) + 000020:[p#16,1-p#16,1]: 10804 bytes (11 K) + 000021:[q#17,1-q#17,1]: 10804 bytes (11 K) + 000022:[r#18,1-r#18,1]: 10804 bytes (11 K) + 000023:[s#19,1-s#19,1]: 10804 bytes (11 K) + 000024:[t#20,1-t#20,1]: 10804 bytes (11 K) + 000025:[u#21,1-u#21,1]: 10804 bytes (11 K) + 000026:[v#22,1-v#22,1]: 10804 bytes (11 K) + 000027:[w#23,1-w#23,1]: 10804 bytes (11 K) + 000028:[x#24,1-x#24,1]: 10804 bytes (11 K) + 000029:[y#25,1-y#25,1]: 10804 bytes (11 K) + 000030:[z#26,1-z#26,1]: 10804 bytes (11 K) # Test a scenario where there exists a grandparent file (in L3), but the L1->L2 # compaction doesn't reach it until late in the compaction. The output file @@ -399,11 +399,11 @@ compact a-zz L1 file-sizes ---- L2: - 000007:[a#201,1-j#210,1]: 10958 bytes (11 K) - 000008:[k#211,1-o#215,1]: 5865 bytes (5.7 K) - 000009:[z#102,1-z#102,1]: 781 bytes (781 B) + 000007:[a#201,1-j#210,1]: 10986 bytes (11 K) + 000008:[k#211,1-o#215,1]: 5893 bytes (5.8 K) + 000009:[z#102,1-z#102,1]: 809 bytes (809 B) L3: - 000006:[m#1,1-m#1,1]: 10776 bytes (10 K) + 000006:[m#1,1-m#1,1]: 10804 bytes (11 K) # Test the file-size splitter's adaptive tolerance for early-splitting at a # grandparent boundary. The L1->L2 compaction has many opportunities to split at @@ -501,20 +501,20 @@ compact a-zz L1 file-sizes ---- L2: - 000019:[a#201,1-e#205,1]: 5865 bytes (5.7 K) - 000020:[f#206,1-l#212,1]: 7893 bytes (7.7 K) - 000021:[m#213,1-z#102,1]: 3827 bytes (3.7 K) + 000019:[a#201,1-e#205,1]: 5893 bytes (5.8 K) + 000020:[f#206,1-l#212,1]: 7921 bytes (7.7 K) + 000021:[m#213,1-z#102,1]: 3855 bytes (3.8 K) L3: - 000006:[a#1,1-a#1,1]: 1776 bytes (1.7 K) - 000007:[ab#2,1-ab#2,1]: 1777 bytes (1.7 K) - 000008:[ac#3,1-ac#3,1]: 1777 bytes (1.7 K) - 000013:[ad#8,1-ad#8,1]: 1777 bytes (1.7 K) - 000012:[ad#7,1-ad#7,1]: 1777 bytes (1.7 K) - 000011:[ad#6,1-ad#6,1]: 1777 bytes (1.7 K) - 000010:[ad#5,1-ad#5,1]: 1777 bytes (1.7 K) - 000009:[ad#4,1-ad#4,1]: 1777 bytes (1.7 K) - 000014:[c#9,1-c#9,1]: 1776 bytes (1.7 K) - 000015:[d#10,1-d#10,1]: 1776 bytes (1.7 K) - 000016:[e#11,1-e#11,1]: 1776 bytes (1.7 K) - 000017:[f#12,1-f#12,1]: 1776 bytes (1.7 K) - 000018:[m#13,1-m#13,1]: 1776 bytes (1.7 K) + 000006:[a#1,1-a#1,1]: 1804 bytes (1.8 K) + 000007:[ab#2,1-ab#2,1]: 1805 bytes (1.8 K) + 000008:[ac#3,1-ac#3,1]: 1805 bytes (1.8 K) + 000013:[ad#8,1-ad#8,1]: 1805 bytes (1.8 K) + 000012:[ad#7,1-ad#7,1]: 1805 bytes (1.8 K) + 000011:[ad#6,1-ad#6,1]: 1805 bytes (1.8 K) + 000010:[ad#5,1-ad#5,1]: 1805 bytes (1.8 K) + 000009:[ad#4,1-ad#4,1]: 1805 bytes (1.8 K) + 000014:[c#9,1-c#9,1]: 1804 bytes (1.8 K) + 000015:[d#10,1-d#10,1]: 1804 bytes (1.8 K) + 000016:[e#11,1-e#11,1]: 1804 bytes (1.8 K) + 000017:[f#12,1-f#12,1]: 1804 bytes (1.8 K) + 000018:[m#13,1-m#13,1]: 1804 bytes (1.8 K) diff --git a/testdata/marked_for_compaction b/testdata/marked_for_compaction index 6d17017e96..fa99de8b58 100644 --- a/testdata/marked_for_compaction +++ b/testdata/marked_for_compaction @@ -20,8 +20,8 @@ marked L0.000004 maybe-compact ---- -[JOB 100] compacted(rewrite) L1 [000005] (779 B) + L1 [] (0 B) -> L1 [000006] (779 B), in 1.0s (2.0s total), output rate 779 B/s -[JOB 100] compacted(rewrite) L0 [000004] (774 B) + L0 [] (0 B) -> L0 [000007] (774 B), in 1.0s (2.0s total), output rate 774 B/s +[JOB 100] compacted(rewrite) L1 [000005] (807 B) + L1 [] (0 B) -> L1 [000006] (807 B), in 1.0s (2.0s total), output rate 807 B/s +[JOB 100] compacted(rewrite) L0 [000004] (802 B) + L0 [] (0 B) -> L0 [000007] (802 B), in 1.0s (2.0s total), output rate 802 B/s 0.0: 000007:[c#11,SET-c#11,SET] points:[c#11,SET-c#11,SET] 1: diff --git a/testdata/metrics b/testdata/metrics index 22c015068a..b30f480dfd 100644 --- a/testdata/metrics +++ b/testdata/metrics @@ -34,7 +34,7 @@ compact 0 0 B 0 B 0 (size == esti zmemtbl 1 256 K ztbl 0 0 B bcache 4 697 B 0.0% (score == hit-rate) - tcache 1 752 B 0.0% (score == hit-rate) + tcache 1 760 B 0.0% (score == hit-rate) snaps 0 - 0 (score == earliest seq num) titers 1 filter - - 0.0% (score == utility) @@ -148,7 +148,7 @@ compact 1 0 B 0 B 0 (size == esti zmemtbl 1 256 K ztbl 1 770 B bcache 4 697 B 42.9% (score == hit-rate) - tcache 1 752 B 66.7% (score == hit-rate) + tcache 1 760 B 66.7% (score == hit-rate) snaps 0 - 0 (score == earliest seq num) titers 1 filter - - 0.0% (score == utility) diff --git a/testdata/table_stats b/testdata/table_stats index d370ee88f2..0768426a95 100644 --- a/testdata/table_stats +++ b/testdata/table_stats @@ -542,5 +542,5 @@ wait-pending-table-stats num-entries: 5 num-deletions: 2 num-range-key-sets: 0 -point-deletions-bytes-estimate: 112974 +point-deletions-bytes-estimate: 113036 range-deletions-bytes-estimate: 0