diff --git a/compaction.go b/compaction.go index 1cb04c2a38..53d237b2d2 100644 --- a/compaction.go +++ b/compaction.go @@ -23,6 +23,7 @@ import ( "github.com/cockroachdb/pebble/internal/manifest" "github.com/cockroachdb/pebble/internal/private" "github.com/cockroachdb/pebble/internal/rangedel" + "github.com/cockroachdb/pebble/internal/rangekey" "github.com/cockroachdb/pebble/sstable" "github.com/cockroachdb/pebble/vfs" ) @@ -309,6 +310,69 @@ func (k compactionKind) String() string { return "?" } +// rangeKeyCompactionTransform is used to transform range key spans as part of the +// keyspan.MergingIter. As part of this transformation step, we can elide range +// keys in the last snapshot stripe, as well as coalesce range keys within +// snapshot stripes. +func rangeKeyCompactionTransform( + snapshots []uint64, elideRangeKey func(start, end []byte) bool, +) keyspan.Transformer { + return keyspan.TransformerFunc(func(cmp base.Compare, s keyspan.Span, dst *keyspan.Span) error { + elideInLastStripe := func(keys []keyspan.Key) []keyspan.Key { + // Unsets and deletes in the last snapshot stripe can be elided. + k := 0 + for j := range keys { + if elideRangeKey(s.Start, s.End) && + (keys[j].Kind() == InternalKeyKindRangeKeyUnset || keys[j].Kind() == InternalKeyKindRangeKeyDelete) { + continue + } + keys[k] = keys[j] + k++ + } + keys = keys[:k] + return keys + } + // snapshots are in ascending order, while s.keys are in descending seqnum + // order. Partition s.keys by snapshot stripes, and call rangekey.Coalesce + // on each partition. + dst.Start = s.Start + dst.End = s.End + dst.Keys = dst.Keys[:0] + i, j := len(snapshots)-1, 0 + usedLen := 0 + for i >= 0 { + start := j + for j < len(s.Keys) && !base.Visible(s.Keys[j].SeqNum(), snapshots[i]) { + // Include j in current partition. + j++ + } + if j > start { + keysDst := dst.Keys[usedLen:cap(dst.Keys)] + if err := rangekey.Coalesce(cmp, s.Keys[start:j], &keysDst); err != nil { + return err + } + if j == len(s.Keys) { + // This is the last snapshot stripe. Unsets and deletes can be elided. + keysDst = elideInLastStripe(keysDst) + } + usedLen += len(keysDst) + dst.Keys = append(dst.Keys, keysDst...) + } + i-- + } + if j < len(s.Keys) { + keysDst := dst.Keys[usedLen:cap(dst.Keys)] + if err := rangekey.Coalesce(cmp, s.Keys[j:], &keysDst); err != nil { + return err + } + keysDst = elideInLastStripe(keysDst) + usedLen += len(keysDst) + dst.Keys = append(dst.Keys, keysDst...) + } + return nil + }) +} + // compaction is a table compaction from one level to the next, starting from a // given version. type compaction struct { @@ -344,10 +408,10 @@ type compaction struct { // maxOverlapBytes is the maximum number of bytes of overlap allowed for a // single output table with the tables in the grandparent level. maxOverlapBytes uint64 - // disableRangeTombstoneElision disables elision of range tombstones. Used by - // tests to allow range tombstones to be added to tables where they would - // otherwise be elided. - disableRangeTombstoneElision bool + // disableSpanElision disables elision of range tombstones and range keys. Used + // by tests to allow range tombstones or range keys to be added to tables where + // they would otherwise be elided. + disableSpanElision bool // flushing contains the flushables (aka memtables) that are being flushed. flushing flushableList @@ -370,12 +434,17 @@ type compaction struct { // returned from `compactionIter` and fragments them for output to files. // Referenced by `compactionIter` which uses it to check whether keys are deleted. rangeDelFrag keyspan.Fragmenter + // The range key fragmenter. Similar to rangeDelFrag in that it gets range + // keys from the compaction iter and fragments them for output to files. + rangeKeyFrag keyspan.Fragmenter // The range deletion tombstone iterator, that merges and fragments // tombstones across levels. This iterator is included within the compaction // input iterator as a single level. // TODO(jackson): Remove this when the refactor of FragmentIterator, // InterleavingIterator, etc is complete. rangeDelIter keyspan.InternalIteratorShim + // rangeKeyInterleaving is the interleaving iter for range keys. + rangeKeyInterleaving keyspan.InterleavingIter // A list of objects to close when the compaction finishes. Used by input // iteration to keep rangeDelIters open for the lifetime of the compaction, @@ -615,6 +684,9 @@ func newFlush( } updateRangeBounds := func(iter keyspan.FragmentIterator) { + // File bounds require s != nil && !s.Empty(). We only need to check for + // s != nil here, as the memtable's FragmentIterator would never surface + // empty spans. if s := iter.First(); s != nil { if key := s.SmallestKey(); !smallestSet || base.InternalCompare(c.cmp, c.smallest, key) > 0 { @@ -638,6 +710,9 @@ func newFlush( if rangeDelIter := f.newRangeDelIter(nil); rangeDelIter != nil { updateRangeBounds(rangeDelIter) } + if rangeKeyIter := f.newRangeKeyIter(nil); rangeKeyIter != nil { + updateRangeBounds(rangeKeyIter) + } flushingBytes += f.inuseBytes() } @@ -908,7 +983,7 @@ func (c *compaction) elideRangeTombstone(start, end []byte) bool { // code doesn't know that L0 contains files and zeroing of seqnums should // be disabled. That is fixable, but it seems safer to just match the // RocksDB behavior for now. - if c.disableRangeTombstoneElision || len(c.flushing) != 0 { + if c.disableSpanElision || len(c.flushing) != 0 { return false } @@ -921,9 +996,23 @@ func (c *compaction) elideRangeTombstone(start, end []byte) bool { return lower >= upper } +// elideRangeKey returns true if it is ok to elide the specified range key. A +// return value of true guarantees that there are no key/value pairs at +// c.outputLevel.level+1 or higher that possibly overlap the specified range key. +func (c *compaction) elideRangeKey(start, end []byte) bool { + // TODO(bilal): Track inuseKeyRanges separately for the range keyspace as + // opposed to the point keyspace. Once that is done, elideRangeTombstone + // can just check in the point keyspace, and this function can check for + // inuseKeyRanges in the range keyspace. + return c.elideRangeTombstone(start, end) +} + // newInputIter returns an iterator over all the input tables in a compaction. -func (c *compaction) newInputIter(newIters tableNewIters) (_ internalIterator, retErr error) { +func (c *compaction) newInputIter( + newIters tableNewIters, newSpanIter keyspan.TableNewSpanIter, snapshots []uint64, +) (_ internalIterator, retErr error) { var rangeDelIters []keyspan.FragmentIterator + var rangeKeyIters []keyspan.FragmentIterator if len(c.flushing) != 0 { if len(c.flushing) == 1 { @@ -931,12 +1020,19 @@ func (c *compaction) newInputIter(newIters tableNewIters) (_ internalIterator, r iter := f.newFlushIter(nil, &c.bytesIterated) if rangeDelIter := f.newRangeDelIter(nil); rangeDelIter != nil { c.rangeDelIter.Init(c.cmp, rangeDelIter) - return newMergingIter(c.logger, c.cmp, nil, iter, &c.rangeDelIter), nil + iter = newMergingIter(c.logger, c.cmp, nil, iter, &c.rangeDelIter) + } + if rangeKeyIter := f.newRangeKeyIter(nil); rangeKeyIter != nil { + mi := &keyspan.MergingIter{} + mi.Init(c.cmp, rangeKeyCompactionTransform(snapshots, c.elideRangeKey), rangeKeyIter) + c.rangeKeyInterleaving.Init(c.cmp, base.WrapIterWithStats(iter), mi, nil /* hooks */, nil /* lowerBound */, nil /* upperBound */) + iter = &c.rangeKeyInterleaving } return iter, nil } iters := make([]internalIterator, 0, len(c.flushing)+1) rangeDelIters = make([]keyspan.FragmentIterator, 0, len(c.flushing)) + rangeKeyIters = make([]keyspan.FragmentIterator, 0, len(c.flushing)) for i := range c.flushing { f := c.flushing[i] iters = append(iters, f.newFlushIter(nil, &c.bytesIterated)) @@ -944,12 +1040,22 @@ func (c *compaction) newInputIter(newIters tableNewIters) (_ internalIterator, r if rangeDelIter != nil { rangeDelIters = append(rangeDelIters, rangeDelIter) } + if rangeKeyIter := f.newRangeKeyIter(nil); rangeKeyIter != nil { + rangeKeyIters = append(rangeKeyIters, rangeKeyIter) + } } if len(rangeDelIters) > 0 { c.rangeDelIter.Init(c.cmp, rangeDelIters...) iters = append(iters, &c.rangeDelIter) } - return newMergingIter(c.logger, c.cmp, nil, iters...), nil + var iter base.InternalIteratorWithStats = newMergingIter(c.logger, c.cmp, nil, iters...) + if len(rangeKeyIters) > 0 { + mi := &keyspan.MergingIter{} + mi.Init(c.cmp, rangeKeyCompactionTransform(snapshots, c.elideRangeKey), rangeKeyIters...) + c.rangeKeyInterleaving.Init(c.cmp, base.WrapIterWithStats(iter), mi, nil /* hooks */, nil /* lowerBound */, nil /* upperBound */) + iter = &c.rangeKeyInterleaving + } + return iter, nil } if c.startLevel.level >= 0 { @@ -1081,6 +1187,20 @@ func (c *compaction) newInputIter(newIters tableNewIters) (_ internalIterator, r li := &keyspan.LevelIter{} li.Init(keyspan.SpanIterOptions{}, c.cmp, wrapper, level.files.Iter(), l, c.logger, manifest.KeyTypePoint) rangeDelIters = append(rangeDelIters, li) + // Check if this level has any range keys. + hasRangeKeys := false + iter := level.files.Iter() + for f := iter.First(); f != nil; f = iter.Next() { + if f.HasRangeKeys { + hasRangeKeys = true + break + } + } + if hasRangeKeys { + li := &keyspan.LevelIter{} + li.Init(keyspan.SpanIterOptions{}, c.cmp, newSpanIter, level.files.Iter(), l, c.logger, manifest.KeyTypeRange) + rangeKeyIters = append(rangeKeyIters, li) + } return nil } @@ -1114,7 +1234,17 @@ func (c *compaction) newInputIter(newIters tableNewIters) (_ internalIterator, r c.rangeDelIter.Init(c.cmp, rangeDelIters...) iters = append(iters, &c.rangeDelIter) } - return newMergingIter(c.logger, c.cmp, nil, iters...), nil + pointKeyIter := newMergingIter(c.logger, c.cmp, nil, iters...) + if len(rangeKeyIters) > 0 { + mi := &keyspan.MergingIter{} + mi.Init(c.cmp, rangeKeyCompactionTransform(snapshots, c.elideRangeKey), rangeKeyIters...) + di := &keyspan.DefragmentingIter{} + di.Init(c.cmp, mi, keyspan.DefragmentInternal, keyspan.StaticDefragmentReducer) + c.rangeKeyInterleaving.Init(c.cmp, pointKeyIter, di, nil /* hooks */, nil /* lowerBound */, nil /* upperBound */) + return &c.rangeKeyInterleaving, nil + } + + return pointKeyIter, nil } func (c *compaction) String() string { @@ -2048,13 +2178,13 @@ func (d *DB) runCompaction( d.mu.Unlock() defer d.mu.Lock() - iiter, err := c.newInputIter(d.newIters) + iiter, err := c.newInputIter(d.newIters, d.tableNewRangeKeyIter, snapshots) if err != nil { return nil, pendingOutputs, err } c.allowedZeroSeqNum = c.allowZeroSeqNum() iter := newCompactionIter(c.cmp, c.equal, c.formatKey, d.merge, iiter, snapshots, - &c.rangeDelFrag, c.allowedZeroSeqNum, c.elideTombstone, + &c.rangeDelFrag, &c.rangeKeyFrag, c.allowedZeroSeqNum, c.elideTombstone, c.elideRangeTombstone, d.FormatMajorVersion()) var ( @@ -2182,18 +2312,18 @@ func (d *DB) runCompaction( // should be flushed. Typically, this is the first key of the next // sstable or an empty key if this output is the final sstable. finishOutput := func(splitKey []byte) error { - // If we haven't output any point records to the sstable (tw == nil) - // then the sstable will only contain range tombstones. The smallest - // key in the sstable will be the start key of the first range - // tombstone added. We need to ensure that this start key is distinct - // from the splitKey passed to finishOutput (if set), otherwise we - // would generate an sstable where the largest key is smaller than the - // smallest key due to how the largest key boundary is set below. - // NB: It is permissible for the range tombstone start key to be the - // empty string. - // TODO: It is unfortunate that we have to do this check here rather - // than when we decide to finish the sstable in the runCompaction - // loop. A better structure currently eludes us. + // If we haven't output any point records to the sstable (tw == nil) then the + // sstable will only contain range tombstones and/or range keys. The smallest + // key in the sstable will be the start key of the first range tombstone or + // range key added. We need to ensure that this start key is distinct from + // the splitKey passed to finishOutput (if set), otherwise we would generate + // an sstable where the largest key is smaller than the smallest key due to + // how the largest key boundary is set below. NB: It is permissible for the + // range tombstone / range key start key to be the empty string. + // + // TODO: It is unfortunate that we have to do this check here rather than + // when we decide to finish the sstable in the runCompaction loop. A better + // structure currently eludes us. if tw == nil { startKey := c.rangeDelFrag.Start() if len(iter.tombstones) > 0 { @@ -2238,7 +2368,18 @@ func (d *DB) runCompaction( // added to the writer, eliding out-of-file range tombstones based // on sequence number at this stage is difficult, and necessitates // read-time logic to ignore range tombstones outside file bounds. - if rangedel.Encode(&v, tw.Add); err != nil { + if err := rangedel.Encode(&v, tw.Add); err != nil { + return err + } + } + for _, v := range iter.RangeKeys(splitKey) { + // Same logic as for range tombstones, except added using tw.AddRangeKey. + if tw == nil { + if err := newOutput(); err != nil { + return err + } + } + if err := rangekey.Encode(&v, tw.AddRangeKey); err != nil { return err } } @@ -2410,7 +2551,7 @@ func (d *DB) runCompaction( // to a grandparent file largest key, or nil. Taken together, these // progress guarantees ensure that eventually the input iterator will be // exhausted and the range tombstone fragments will all be flushed. - for key, val := iter.First(); key != nil || !c.rangeDelFrag.Empty(); { + for key, val := iter.First(); key != nil || !c.rangeDelFrag.Empty() || !c.rangeKeyFrag.Empty(); { splitterSuggestion := splitter.onNewOutput(key) // Each inner loop iteration processes one key from the input iterator. @@ -2425,7 +2566,8 @@ func (d *DB) runCompaction( return nil, pendingOutputs, err } } - if key.Kind() == InternalKeyKindRangeDelete { + switch key.Kind() { + case InternalKeyKindRangeDelete: // Range tombstones are handled specially. They are fragmented, // and they're not written until later during `finishOutput()`. // We add them to the `Fragmenter` now to make them visible to @@ -2462,6 +2604,22 @@ func (d *DB) runCompaction( c.rangeDelFrag.Add(clone) } continue + case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete: + // Range keys are handled in the same way as range tombstones, except + // with a dedicated fragmenter. + if s := c.rangeKeyInterleaving.Span(); !s.Empty() { + clone := keyspan.Span{ + Start: iter.cloneKey(s.Start), + End: iter.cloneKey(s.End), + Keys: make([]keyspan.Key, len(s.Keys)), + } + // Since the keys' Suffix and Value fields are not deep cloned, the + // underlying blockIter must be kept open for the lifetime of the + // compaction. + copy(clone.Keys, s.Keys) + c.rangeKeyFrag.Add(clone) + } + continue } if tw == nil { if err := newOutput(); err != nil { diff --git a/compaction_iter.go b/compaction_iter.go index 8f7e2e1977..3a92b4bb04 100644 --- a/compaction_iter.go +++ b/compaction_iter.go @@ -5,7 +5,6 @@ package pebble import ( - "fmt" "io" "sort" "strconv" @@ -14,6 +13,7 @@ import ( "github.com/cockroachdb/pebble/internal/base" "github.com/cockroachdb/pebble/internal/bytealloc" "github.com/cockroachdb/pebble/internal/keyspan" + "github.com/cockroachdb/pebble/internal/rangekey" ) // compactionIter provides a forward-only iterator that encapsulates the logic @@ -198,8 +198,11 @@ type compactionIter struct { // Reference to the range deletion tombstone fragmenter (e.g., // `compaction.rangeDelFrag`). rangeDelFrag *keyspan.Fragmenter + rangeKeyFrag *keyspan.Fragmenter // The fragmented tombstones. tombstones []keyspan.Span + // The fragmented range keys. + rangeKeys []keyspan.Span // Byte allocator for the tombstone keys. alloc bytealloc.A allowZeroSeqNum bool @@ -218,6 +221,7 @@ func newCompactionIter( iter internalIterator, snapshots []uint64, rangeDelFrag *keyspan.Fragmenter, + rangeKeyFrag *keyspan.Fragmenter, allowZeroSeqNum bool, elideTombstone func(key []byte) bool, elideRangeTombstone func(start, end []byte) bool, @@ -229,6 +233,7 @@ func newCompactionIter( iter: iter, snapshots: snapshots, rangeDelFrag: rangeDelFrag, + rangeKeyFrag: rangeKeyFrag, allowZeroSeqNum: allowZeroSeqNum, elideTombstone: elideTombstone, elideRangeTombstone: elideRangeTombstone, @@ -237,6 +242,9 @@ func newCompactionIter( i.rangeDelFrag.Cmp = cmp i.rangeDelFrag.Format = formatKey i.rangeDelFrag.Emit = i.emitRangeDelChunk + i.rangeKeyFrag.Cmp = cmp + i.rangeKeyFrag.Format = formatKey + i.rangeKeyFrag.Emit = i.emitRangeKeyChunk return i } @@ -282,13 +290,13 @@ func (i *compactionIter) Next() (*InternalKey, []byte) { i.pos = iterPosCurForward i.valid = false for i.iterKey != nil { - if i.iterKey.Kind() == InternalKeyKindRangeDelete { - // Return the range tombstone so the compaction can use it for - // file truncation and add it to the fragmenter. We do not set `skip` - // to true before returning as there may be a forthcoming point key - // with the same user key and sequence number. Such a point key must be - // visible (i.e., not skipped over) since we promise point keys are - // not deleted by range tombstones at the same sequence number. + if i.iterKey.Kind() == InternalKeyKindRangeDelete || rangekey.IsRangeKey(i.iterKey.Kind()) { + // Return the span so the compaction can use it for file truncation and add + // it to the relevant fragmenter. We do not set `skip` to true before + // returning as there may be a forthcoming point key with the same user key + // and sequence number. Such a point key must be visible (i.e., not skipped + // over) since we promise point keys are not deleted by range tombstones at + // the same sequence number. // // Although, note that `skip` may already be true before reaching here // due to an earlier key in the stripe. Then it is fine to leave it set @@ -457,10 +465,6 @@ func (i *compactionIter) skipInStripe() { func (i *compactionIter) iterNext() bool { i.iterKey, i.iterValue = i.iter.Next() - // We should never see an exclusive sentinel in the compaction input. - if i.iterKey != nil && i.iterKey.IsExclusiveSentinel() { - panic(fmt.Sprintf("pebble: unexpected exclusive sentinel in compaction input, trailer = %x", i.iterKey.Trailer)) - } return i.iterKey != nil } @@ -766,9 +770,6 @@ func (i *compactionIter) Close() error { // Tombstones returns a list of pending range tombstones in the fragmenter // up to the specified key, or all pending range tombstones if key = nil. -// exclude specifies if the specified key is exclusive or inclusive. -// When exclude = true, all returned range tombstones are truncated to the -// specified key. func (i *compactionIter) Tombstones(key []byte) []keyspan.Span { if key == nil { i.rangeDelFrag.Finish() @@ -783,6 +784,22 @@ func (i *compactionIter) Tombstones(key []byte) []keyspan.Span { return tombstones } +// RangeKeys returns a list of pending fragmented range keys up to the specified +// key, or all pending range keys if key = nil. +func (i *compactionIter) RangeKeys(key []byte) []keyspan.Span { + if key == nil { + i.rangeKeyFrag.Finish() + } else { + // The specified end key is exclusive; no versions of the specified + // user key (including range tombstones covering that key) should + // be flushed yet. + i.rangeKeyFrag.TruncateAndFlushTo(key) + } + rangeKeys := i.rangeKeys + i.rangeKeys = nil + return rangeKeys +} + func (i *compactionIter) emitRangeDelChunk(fragmented keyspan.Span) { // Apply the snapshot stripe rules, keeping only the latest tombstone for // each snapshot stripe. @@ -815,6 +832,14 @@ func (i *compactionIter) emitRangeDelChunk(fragmented keyspan.Span) { } } +func (i *compactionIter) emitRangeKeyChunk(fragmented keyspan.Span) { + // Elision of snapshot stripes happens in rangeKeyCompactionTransform, so no need to + // do that here. + if len(fragmented.Keys) > 0 { + i.rangeKeys = append(i.rangeKeys, fragmented) + } +} + // maybeZeroSeqnum attempts to set the seqnum for the current key to 0. Doing // so improves compression and enables an optimization during forward iteration // to skip some key comparisons. The seqnum for an entry can be zeroed if the diff --git a/compaction_iter_test.go b/compaction_iter_test.go index 5dcd2bd986..4fd1586afb 100644 --- a/compaction_iter_test.go +++ b/compaction_iter_test.go @@ -16,6 +16,7 @@ import ( "github.com/cockroachdb/pebble/internal/base" "github.com/cockroachdb/pebble/internal/datadriven" "github.com/cockroachdb/pebble/internal/keyspan" + "github.com/cockroachdb/pebble/internal/rangekey" ) func TestSnapshotIndex(t *testing.T) { @@ -75,10 +76,12 @@ func (m *debugMerger) Finish(includesBase bool) ([]byte, io.Closer, error) { func TestCompactionIter(t *testing.T) { var merge Merge var keys []InternalKey + var rangeKeys []keyspan.Span var vals [][]byte var snapshots []uint64 var elideTombstones bool var allowZeroSeqnum bool + var interleavingIter *keyspan.InterleavingIter // The input to the data-driven test is dependent on the format major // version we are testing against. @@ -94,7 +97,14 @@ func TestCompactionIter(t *testing.T) { // SSTables are not released while iterating, and therefore not // susceptible to use-after-free bugs, we skip the zeroing of // RangeDelete keys. - iter := newInvalidatingIter(&fakeIter{keys: keys, vals: vals}) + fi := &fakeIter{keys: keys, vals: vals} + interleavingIter = &keyspan.InterleavingIter{} + interleavingIter.Init( + base.DefaultComparer.Compare, + base.WrapIterWithStats(fi), + keyspan.NewIter(base.DefaultComparer.Compare, rangeKeys), + nil, nil, nil) + iter := newInvalidatingIter(interleavingIter) iter.ignoreKind(InternalKeyKindRangeDelete) if merge == nil { merge = func(key, value []byte) (base.ValueMerger, error) { @@ -112,6 +122,7 @@ func TestCompactionIter(t *testing.T) { iter, snapshots, &keyspan.Fragmenter{}, + &keyspan.Fragmenter{}, allowZeroSeqnum, func([]byte) bool { return elideTombstones @@ -134,6 +145,7 @@ func TestCompactionIter(t *testing.T) { } keys = keys[:0] vals = vals[:0] + rangeKeys = rangeKeys[:0] for _, key := range strings.Split(d.Input, "\n") { j := strings.Index(key, ":") keys = append(keys, base.ParseInternalKey(key[:j])) @@ -141,6 +153,13 @@ func TestCompactionIter(t *testing.T) { } return "" + case "define-range-keys": + for _, key := range strings.Split(d.Input, "\n") { + s := keyspan.ParseSpan(strings.TrimSpace(key)) + rangeKeys = append(rangeKeys, s) + } + return "" + case "iter": snapshots = snapshots[:0] elideTombstones = false @@ -199,6 +218,16 @@ func TestCompactionIter(t *testing.T) { } fmt.Fprintf(&b, ".\n") continue + case "range-keys": + var key []byte + if len(parts) == 2 { + key = []byte(parts[1]) + } + for _, v := range iter.RangeKeys(key) { + fmt.Fprintf(&b, "%s\n", v) + } + fmt.Fprintf(&b, ".\n") + continue default: return fmt.Sprintf("unknown op: %s", parts[0]) } @@ -213,6 +242,9 @@ func TestCompactionIter(t *testing.T) { }, }) } + if rangekey.IsRangeKey(iter.Key().Kind()) { + iter.rangeKeyFrag.Add(*interleavingIter.Span()) + } } else if err := iter.Error(); err != nil { fmt.Fprintf(&b, "err=%v\n", err) } else { diff --git a/compaction_test.go b/compaction_test.go index 1c96415b8b..6c73463303 100644 --- a/compaction_test.go +++ b/compaction_test.go @@ -829,6 +829,62 @@ func TestElideRangeTombstone(t *testing.T) { } } +func TestCompactionTransform(t *testing.T) { + datadriven.RunTest(t, "testdata/compaction_transform", func(td *datadriven.TestData) string { + switch td.Cmd { + case "transform": + var snapshots []uint64 + var keyRanges []manifest.UserKeyRange + disableElision := false + for i := range td.CmdArgs { + switch td.CmdArgs[i].Key { + case "snapshots": + for _, snapshot := range td.CmdArgs[i].Vals { + s, err := strconv.ParseUint(snapshot, 10, 64) + if err != nil { + return err.Error() + } + snapshots = append(snapshots, s) + } + case "in-use-key-ranges": + for _, keyRange := range td.CmdArgs[i].Vals { + parts := strings.SplitN(keyRange, "-", 2) + start := []byte(strings.TrimSpace(parts[0])) + end := []byte(strings.TrimSpace(parts[1])) + keyRanges = append(keyRanges, manifest.UserKeyRange{ + Start: start, + End: end, + }) + } + case "disable-elision": + disableElision = true + } + } + span := keyspan.ParseSpan(td.Input) + for i := range span.Keys { + if i > 0 { + if span.Keys[i-1].Trailer < span.Keys[i].Trailer { + return "span keys not sorted" + } + } + } + var outSpan keyspan.Span + c := compaction{ + cmp: base.DefaultComparer.Compare, + disableSpanElision: disableElision, + inuseKeyRanges: keyRanges, + } + transformer := rangeKeyCompactionTransform(snapshots, c.elideRangeTombstone) + if err := transformer.Transform(base.DefaultComparer.Compare, span, &outSpan); err != nil { + return fmt.Sprintf("error: %s", err) + } + return outSpan.String() + default: + return fmt.Sprintf("unknown command: %s", td.Cmd) + } + }) +} + type cpuPermissionGranter struct { granted int used bool @@ -1243,7 +1299,7 @@ func TestManualCompaction(t *testing.T) { d.mu.compact.compactingCount-- } - runTest := func(t *testing.T, testData string, minVersion, maxVersion FormatMajorVersion) { + runTest := func(t *testing.T, testData string, minVersion, maxVersion FormatMajorVersion, verbose bool) { reset(minVersion, maxVersion) var ongoingCompaction *compaction datadriven.RunTest(t, testData, func(td *datadriven.TestData) string { @@ -1270,7 +1326,12 @@ func TestManualCompaction(t *testing.T) { if err := runCompactCmd(td, d); err != nil { return err.Error() } - s := runLSMCmd(td, d) + d.mu.Lock() + s := d.mu.versions.currentVersion().String() + if verbose { + s = d.mu.versions.currentVersion().DebugString(base.DefaultFormatter) + } + d.mu.Unlock() if td.HasArg("hide-file-num") { re := regexp.MustCompile(`([0-9]*):\[`) s = re.ReplaceAllString(s, "[") @@ -1291,22 +1352,32 @@ func TestManualCompaction(t *testing.T) { FormatMajorVersion: randVersion(minVersion, maxVersion), } opts.DisableAutomaticCompactions = true + if opts.FormatMajorVersion >= FormatRangeKeys { + opts.Experimental.RangeKeys = new(RangeKeysArena) + } var err error if d, err = runDBDefineCmd(td, opts); err != nil { return err.Error() } - d.mu.Lock() s := d.mu.versions.currentVersion().String() - d.mu.Unlock() + if verbose { + s = d.mu.versions.currentVersion().DebugString(base.DefaultFormatter) + } return s case "ingest": if err := runIngestCmd(td, d, mem); err != nil { return err.Error() } - return runLSMCmd(td, d) + d.mu.Lock() + s := d.mu.versions.currentVersion().String() + if verbose { + s = d.mu.versions.currentVersion().DebugString(base.DefaultFormatter) + } + d.mu.Unlock() + return s case "iter": // TODO(peter): runDBDefineCmd doesn't properly update the visible @@ -1429,6 +1500,7 @@ func TestManualCompaction(t *testing.T) { testData string minVersion FormatMajorVersion maxVersion FormatMajorVersion // inclusive + verbose bool }{ { testData: "testdata/manual_compaction", @@ -1450,11 +1522,17 @@ func TestManualCompaction(t *testing.T) { minVersion: FormatSetWithDelete, maxVersion: FormatNewest, }, + { + testData: "testdata/manual_compaction_range_keys", + minVersion: FormatRangeKeys, + maxVersion: FormatNewest, + verbose: true, + }, } for _, tc := range testCases { t.Run(tc.testData, func(t *testing.T) { - runTest(t, tc.testData, tc.minVersion, tc.maxVersion) + runTest(t, tc.testData, tc.minVersion, tc.maxVersion, tc.verbose) }) } } @@ -2834,7 +2912,7 @@ func TestCompactionCheckOrdering(t *testing.T) { return &errorIter{}, nil, nil } result := "OK" - _, err := c.newInputIter(newIters) + _, err := c.newInputIter(newIters, nil, nil) if err != nil { result = fmt.Sprint(err) } diff --git a/data_test.go b/data_test.go index 702be2b572..cab4f9808d 100644 --- a/data_test.go +++ b/data_test.go @@ -17,7 +17,9 @@ import ( "github.com/cockroachdb/errors" "github.com/cockroachdb/pebble/internal/base" "github.com/cockroachdb/pebble/internal/datadriven" + "github.com/cockroachdb/pebble/internal/keyspan" "github.com/cockroachdb/pebble/internal/rangedel" + "github.com/cockroachdb/pebble/internal/rangekey" "github.com/cockroachdb/pebble/internal/testkeys" "github.com/cockroachdb/pebble/sstable" "github.com/cockroachdb/pebble/vfs" @@ -680,7 +682,7 @@ func runDBDefineCmd(td *datadriven.TestData, opts *Options) (*DB, error) { }} c := newFlush(d.opts, d.mu.versions.currentVersion(), d.mu.versions.picker.getBaseLevel(), toFlush, &d.atomic.bytesFlushed) - c.disableRangeTombstoneElision = true + c.disableSpanElision = true // NB: define allows the test to exactly specify which keys go // into which sstables. If the test has a small target file // size to test grandparent limits, etc, the maxOutputFileSize @@ -804,6 +806,16 @@ func runDBDefineCmd(td *datadriven.TestData, opts *Options) (*DB, error) { d.mu.compact.inProgress[c] = struct{}{} continue } + if data[:i] == "rangekey" { + span := keyspan.ParseSpan(data[i:]) + err := rangekey.Encode(&span, func(k base.InternalKey, v []byte) error { + return mem.set(k, v) + }) + if err != nil { + return nil, err + } + continue + } key := base.ParseInternalKey(data[:i]) valueStr := data[i+1:] value := []byte(valueStr) diff --git a/internal/rangekey/coalesce.go b/internal/rangekey/coalesce.go index b9e88a14f5..e350589866 100644 --- a/internal/rangekey/coalesce.go +++ b/internal/rangekey/coalesce.go @@ -60,7 +60,9 @@ func (ui *UserIteratorConfig) AddLevel(iter keyspan.FragmentIterator) { // sequence number. func (ui *UserIteratorConfig) Transform(cmp base.Compare, s keyspan.Span, dst *keyspan.Span) error { // Apply shadowing of keys. - if err := Coalesce(cmp, s.Visible(ui.snapshot), dst); err != nil { + dst.Start = s.Start + dst.End = s.End + if err := Coalesce(cmp, s.Visible(ui.snapshot).Keys, &dst.Keys); err != nil { return err } @@ -169,18 +171,18 @@ func (ui *UserIteratorConfig) ShouldDefragment(cmp base.Compare, a, b *keyspan.S // keys do not affect one another. Ingested sstables are expected to be // consistent with respect to the set/unset suffixes: A given suffix should be // set or unset but not both. -func Coalesce(cmp base.Compare, span keyspan.Span, dst *keyspan.Span) error { +func Coalesce(cmp base.Compare, keys []keyspan.Key, dst *[]keyspan.Key) error { // TODO(jackson): Currently, Coalesce doesn't actually perform the sequence // number promotion described in the comment above. keysBySuffix := keysBySuffix{ cmp: cmp, - keys: dst.Keys[:0], + keys: (*dst)[:0], } var deleted bool - for i := 0; i < len(span.Keys) && !deleted; i++ { - k := span.Keys[i] - if invariants.Enabled && i > 0 && k.Trailer > span.Keys[i-1].Trailer { + for i := 0; i < len(keys) && !deleted; i++ { + k := keys[i] + if invariants.Enabled && i > 0 && k.Trailer > keys[i-1].Trailer { panic("pebble: invariant violation: span keys unordered") } @@ -224,12 +226,8 @@ func Coalesce(cmp base.Compare, span keyspan.Span, dst *keyspan.Span) error { // Update the span with the (potentially reduced) keys slice, and re-sort it // by Trailer. - *dst = keyspan.Span{ - Start: span.Start, - End: span.End, - Keys: keysBySuffix.keys, - } - keyspan.SortKeys(dst.Keys) + *dst = keysBySuffix.keys + keyspan.SortKeys(*dst) return nil } diff --git a/internal/rangekey/coalesce_test.go b/internal/rangekey/coalesce_test.go index e6958d15a8..2833a615b0 100644 --- a/internal/rangekey/coalesce_test.go +++ b/internal/rangekey/coalesce_test.go @@ -31,8 +31,11 @@ func TestCoalesce(t *testing.T) { case "coalesce": buf.Reset() span := keyspan.ParseSpan(td.Input) - var coalesced keyspan.Span - if err := Coalesce(cmp, span, &coalesced); err != nil { + coalesced := keyspan.Span{ + Start: span.Start, + End: span.End, + } + if err := Coalesce(cmp, span.Keys, &coalesced.Keys); err != nil { return err.Error() } fmt.Fprintln(&buf, coalesced) @@ -68,7 +71,9 @@ func TestIter(t *testing.T) { } transform := keyspan.TransformerFunc(func(cmp base.Compare, s keyspan.Span, dst *keyspan.Span) error { s = s.Visible(visibleSeqNum) - return Coalesce(cmp, s, dst) + dst.Start = s.Start + dst.End = s.End + return Coalesce(cmp, s.Keys, &dst.Keys) }) iter.Init(cmp, transform, keyspan.NewIter(cmp, spans)) return "OK" diff --git a/iterator.go b/iterator.go index 6f18d6aa94..a98ed58742 100644 --- a/iterator.go +++ b/iterator.go @@ -1728,7 +1728,7 @@ func (i *Iterator) Close() error { if i.pointIter != nil { i.err = firstError(i.err, i.pointIter.Close()) } - if i.rangeKey != nil { + if i.rangeKey != nil && i.rangeKey.rangeKeyIter != nil { i.err = firstError(i.err, i.rangeKey.rangeKeyIter.Close()) } } diff --git a/mem_table_test.go b/mem_table_test.go index 38ff2106eb..942b592dbc 100644 --- a/mem_table_test.go +++ b/mem_table_test.go @@ -18,6 +18,7 @@ import ( "github.com/cockroachdb/pebble/internal/arenaskl" "github.com/cockroachdb/pebble/internal/base" "github.com/cockroachdb/pebble/internal/datadriven" + "github.com/cockroachdb/pebble/internal/rangekey" "github.com/stretchr/testify/require" "golang.org/x/exp/rand" "golang.org/x/sync/errgroup" @@ -53,6 +54,13 @@ func (m *memTable) set(key InternalKey, value []byte) error { m.tombstones.invalidate(1) return nil } + if rangekey.IsRangeKey(key.Kind()) { + if err := m.rangeKeySkl.Add(key, value); err != nil { + return err + } + m.rangeKeys.invalidate(1) + return nil + } return m.skl.Add(key, value) } diff --git a/sstable/writer.go b/sstable/writer.go index 1ca4468236..5de49d1b98 100644 --- a/sstable/writer.go +++ b/sstable/writer.go @@ -938,7 +938,9 @@ func (w *Writer) addRangeKeySpan(span keyspan.Span) error { func (w *Writer) coalesceSpans(span keyspan.Span) { // This method is the emit function of the Fragmenter, so span.Keys is only // owned by this span and it's safe to mutate. - err := rangekey.Coalesce(w.compare, span, &w.rangeKeyCoalesced) + w.rangeKeyCoalesced.Start = span.Start + w.rangeKeyCoalesced.End = span.End + err := rangekey.Coalesce(w.compare, span.Keys, &w.rangeKeyCoalesced.Keys) if err != nil { w.err = errors.Newf("sstable: could not coalesce span: %s", err) return diff --git a/testdata/compaction_iter b/testdata/compaction_iter index 2f17e2bcff..9ea96ab009 100644 --- a/testdata/compaction_iter +++ b/testdata/compaction_iter @@ -1181,3 +1181,35 @@ next . . . + +# Test that range keys are interleaved, and exposed to the fragmenter. + +define +a.SINGLEDEL.4: +a.SET.3:val +a.DEL.2: +a.SET.1:val +c.SET.3:val +---- + +define-range-keys +a-b:{(#3,RANGEKEYSET,@2,foo)} +d-e:{(#3,RANGEKEYSET,@2,foo)} +---- + +iter +first +next +next +next +next +range-keys +---- +a#72057594037927935,21: +a#2,0: +c#3,1:val +d#72057594037927935,21: +. +a-b:{(#3,RANGEKEYSET,@2,foo)} +d-e:{(#3,RANGEKEYSET,@2,foo)} +. diff --git a/testdata/compaction_transform b/testdata/compaction_transform new file mode 100644 index 0000000000..4fd9fd039e --- /dev/null +++ b/testdata/compaction_transform @@ -0,0 +1,116 @@ + +# Test snapshot striping and coalescing. + +transform snapshots=(5,10,15) disable-elision +a-c:{(#9,RANGEKEYSET,@3,foo5) (#4,RANGEKEYSET,@3,foo3) (#3,RANGEKEYSET,@3,foo2)} +---- +a-c:{(#9,RANGEKEYSET,@3,foo5) (#4,RANGEKEYSET,@3,foo3)} + +transform snapshots=(5,10,15) disable-elision +a-c:{(#9,RANGEKEYUNSET,@3) (#8,RANGEKEYSET,@3,foo5) (#4,RANGEKEYSET,@3,foo3) (#3,RANGEKEYSET,@3,foo2)} +---- +a-c:{(#9,RANGEKEYUNSET,@3) (#4,RANGEKEYSET,@3,foo3)} + +transform snapshots=(5,10,15) disable-elision +a-c:{(#9,RANGEKEYDEL) (#8,RANGEKEYSET,@3,foo5) (#4,RANGEKEYSET,@3,foo3) (#3,RANGEKEYSET,@3,foo2)} +---- +a-c:{(#9,RANGEKEYDEL) (#4,RANGEKEYSET,@3,foo3)} + +transform snapshots=(5,10,15) disable-elision +a-c:{(#11,RANGEKEYDEL) (#8,RANGEKEYSET,@3,foo5) (#4,RANGEKEYSET,@3,foo3) (#3,RANGEKEYSET,@3,foo2)} +---- +a-c:{(#11,RANGEKEYDEL) (#8,RANGEKEYSET,@3,foo5) (#4,RANGEKEYSET,@3,foo3)} + +transform disable-elision +a-c:{(#11,RANGEKEYDEL) (#8,RANGEKEYSET,@3,foo5) (#4,RANGEKEYSET,@3,foo3) (#3,RANGEKEYSET,@3,foo2)} +---- +a-c:{(#11,RANGEKEYDEL)} + +# Test that elision works on the last snapshot stripe. + +transform snapshots=(5,10,15) +a-c:{(#11,RANGEKEYDEL) (#8,RANGEKEYSET,@3,foo5) (#4,RANGEKEYSET,@3,foo3) (#3,RANGEKEYSET,@3,foo2)} +---- +a-c:{(#11,RANGEKEYDEL) (#8,RANGEKEYSET,@3,foo5) (#4,RANGEKEYSET,@3,foo3)} + +transform snapshots=(3,10,15) +a-c:{(#11,RANGEKEYDEL) (#8,RANGEKEYSET,@3,foo5) (#4,RANGEKEYSET,@3,foo3) (#3,RANGEKEYUNSET,@4) (#2,RANGEKEYSET,@3,foo2)} +---- +a-c:{(#11,RANGEKEYDEL) (#8,RANGEKEYSET,@3,foo5) (#3,RANGEKEYUNSET,@4) (#2,RANGEKEYSET,@3,foo2)} + +transform snapshots=(2,10,15) +a-c:{(#11,RANGEKEYDEL) (#8,RANGEKEYSET,@3,foo5) (#4,RANGEKEYSET,@3,foo3) (#3,RANGEKEYUNSET,@4) (#2,RANGEKEYSET,@3,foo2)} +---- +a-c:{(#11,RANGEKEYDEL) (#8,RANGEKEYSET,@3,foo5)} + +# The RANGEKEYDEL deletes all underlying keys and there are no snapshots or +# in-use key ranges at play, so all keys should empty out. + +transform +a-c:{(#11,RANGEKEYDEL) (#8,RANGEKEYSET,@3,foo5) (#4,RANGEKEYSET,@3,foo3) (#3,RANGEKEYSET,@3,foo2)} +---- +a-c:{} + +# Test RANGEKEYDELs are preserved over in-use key ranges in the last snapshot stripe. +# in-use key ranges cover keys that exist in lower levels of the LSM, so dropping +# range keys in that space could cause correctness issues. + +transform in-use-key-ranges=(b-d) +a-c:{(#11,RANGEKEYDEL) (#8,RANGEKEYSET,@3,foo5) (#4,RANGEKEYSET,@3,foo3) (#3,RANGEKEYSET,@3,foo2)} +---- +a-c:{(#11,RANGEKEYDEL)} + +# Test RANGEKEYSETs are preserved in the non-last snapshot stripe. + +transform in-use-key-ranges=(b-d) snapshots=(8) +a-c:{(#11,RANGEKEYDEL) (#8,RANGEKEYSET,@3,foo5) (#4,RANGEKEYSET,@3,foo3) (#3,RANGEKEYSET,@3,foo2)} +---- +a-c:{(#11,RANGEKEYDEL) (#4,RANGEKEYSET,@3,foo3)} + +transform +a-c:{(#13,RANGEKEYSET,@3,bar1) (#12,RANGEKEYSET,@2,bar2) (#11,RANGEKEYDEL) (#8,RANGEKEYSET,@3,foo5) (#4,RANGEKEYSET,@3,foo3) (#3,RANGEKEYSET,@3,foo2)} +---- +a-c:{(#13,RANGEKEYSET,@3,bar1) (#12,RANGEKEYSET,@2,bar2)} + +# Test RANGEKEYUNSETs are preserved over in-use key ranges. + +transform +a-c:{(#11,RANGEKEYUNSET,@3) (#8,RANGEKEYSET,@3,foo5) (#4,RANGEKEYSET,@3,foo3) (#3,RANGEKEYSET,@3,foo2)} +---- +a-c:{} + +transform in-use-key-ranges=(b-d) +a-c:{(#11,RANGEKEYUNSET,@3) (#8,RANGEKEYSET,@3,foo5) (#4,RANGEKEYSET,@3,foo3) (#3,RANGEKEYSET,@3,foo2)} +---- +a-c:{(#11,RANGEKEYUNSET,@3)} + +# Test cases where multiple keys have the same sequence number. + +transform +a-c:{(#11,RANGEKEYSET,@3,foo5) (#11,RANGEKEYUNSET,@4) (#11,RANGEKEYDEL)} +---- +a-c:{(#11,RANGEKEYSET,@3,foo5)} + +transform +a-c:{(#11,RANGEKEYSET,@3,foo5) (#11,RANGEKEYUNSET,@3) (#11,RANGEKEYDEL)} +---- +a-c:{(#11,RANGEKEYSET,@3,foo5)} + +# Test that UNSETs and DELs are retained over in-use key ranges. + +transform in-use-key-ranges=(b-d) +a-c:{(#11,RANGEKEYSET,@3,foo5) (#11,RANGEKEYUNSET,@4) (#11,RANGEKEYDEL)} +---- +a-c:{(#11,RANGEKEYSET,@3,foo5) (#11,RANGEKEYUNSET,@4) (#11,RANGEKEYDEL)} + +# Test that sets shadow unset at the same prefix, even if elision is disabled. + +transform in-use-key-ranges=(b-d) +a-c:{(#11,RANGEKEYSET,@3,foo5) (#11,RANGEKEYUNSET,@3) (#11,RANGEKEYDEL)} +---- +a-c:{(#11,RANGEKEYSET,@3,foo5) (#11,RANGEKEYDEL)} + +transform disable-elision +a-c:{(#11,RANGEKEYSET,@3,foo5) (#11,RANGEKEYUNSET,@3) (#11,RANGEKEYDEL) +---- +a-c:{(#11,RANGEKEYSET,@3,foo5) (#11,RANGEKEYDEL)} diff --git a/testdata/manual_compaction_range_keys b/testdata/manual_compaction_range_keys new file mode 100644 index 0000000000..b88be723f8 --- /dev/null +++ b/testdata/manual_compaction_range_keys @@ -0,0 +1,48 @@ + +# Test compaction of range keys. + +define target-file-sizes=(1, 1, 1, 1) +L0 + rangekey:a-c:{(#4,RANGEKEYSET,@2,foo)} + a.SET.3:b +L2 + a.SET.2:v +L3 + a.SET.0:v + b.SET.0:v + rangekey:b-c:{(#1,RANGEKEYSET,@2,bar)} +L3 + c.SET.0:v +---- +0.0: + 000004:[a#4,RANGEKEYSET-c#72057594037927935,RANGEKEYSET] points:[a#3,SET-a#3,SET] ranges:[a#4,RANGEKEYSET-c#72057594037927935,RANGEKEYSET] +2: + 000005:[a#2,SET-a#2,SET] points:[a#2,SET-a#2,SET] +3: + 000006:[a#0,SET-c#72057594037927935,RANGEKEYSET] points:[a#0,SET-b#0,SET] ranges:[b#1,RANGEKEYSET-c#72057594037927935,RANGEKEYSET] + 000007:[c#0,SET-c#0,SET] points:[c#0,SET-c#0,SET] + +compact a-d L0 +---- +1: + 000008:[a#4,RANGEKEYSET-c#72057594037927935,RANGEKEYSET] points:[a#3,SET-a#3,SET] ranges:[a#4,RANGEKEYSET-c#72057594037927935,RANGEKEYSET] +2: + 000005:[a#2,SET-a#2,SET] points:[a#2,SET-a#2,SET] +3: + 000006:[a#0,SET-c#72057594037927935,RANGEKEYSET] points:[a#0,SET-b#0,SET] ranges:[b#1,RANGEKEYSET-c#72057594037927935,RANGEKEYSET] + 000007:[c#0,SET-c#0,SET] points:[c#0,SET-c#0,SET] + +compact a-d L1 +---- +2: + 000009:[a#4,RANGEKEYSET-c#72057594037927935,RANGEKEYSET] points:[a#3,SET-a#3,SET] ranges:[a#4,RANGEKEYSET-c#72057594037927935,RANGEKEYSET] +3: + 000006:[a#0,SET-c#72057594037927935,RANGEKEYSET] points:[a#0,SET-b#0,SET] ranges:[b#1,RANGEKEYSET-c#72057594037927935,RANGEKEYSET] + 000007:[c#0,SET-c#0,SET] points:[c#0,SET-c#0,SET] + +compact a-d L2 +---- +3: + 000010:[a#4,RANGEKEYSET-b#72057594037927935,RANGEKEYSET] points:[a#0,SET-a#0,SET] ranges:[a#4,RANGEKEYSET-b#72057594037927935,RANGEKEYSET] + 000011:[b#4,RANGEKEYSET-c#72057594037927935,RANGEKEYSET] points:[b#0,SET-b#0,SET] ranges:[b#4,RANGEKEYSET-c#72057594037927935,RANGEKEYSET] + 000007:[c#0,SET-c#0,SET] points:[c#0,SET-c#0,SET]