diff --git a/internal/rangekey/rangekey.go b/internal/rangekey/rangekey.go index c692ff6495..5007b9406b 100644 --- a/internal/rangekey/rangekey.go +++ b/internal/rangekey/rangekey.go @@ -323,6 +323,30 @@ func Parse(s string) (key base.InternalKey, value []byte) { } } +// RecombinedValueLen returns the length of the byte slice that results from +// re-encoding the end key and the user-value as a physical range key value. +func RecombinedValueLen(kind base.InternalKeyKind, endKey, userValue []byte) int { + n := len(endKey) + if kind == base.InternalKeyKindRangeKeyDelete { + // RANGEKEYDELs are not varint encoded. + return n + } + return lenVarint(len(endKey)) + len(endKey) + len(userValue) +} + +// RecombineValue re-encodes the end key and user-value as a physical range key +// value into the destination byte slice. +func RecombineValue(kind base.InternalKeyKind, dst, endKey, userValue []byte) int { + if kind == base.InternalKeyKindRangeKeyDelete { + // RANGEKEYDELs are not varint encoded. + return copy(dst, endKey) + } + n := binary.PutUvarint(dst, uint64(len(endKey))) + n += copy(dst[n:], endKey) + n += copy(dst[n:], userValue) + return n +} + func lenVarint(v int) (n int) { x := uint32(v) n++ diff --git a/internal/rangekey/rangekey_test.go b/internal/rangekey/rangekey_test.go index 5c1b6b9c3a..68d073a928 100644 --- a/internal/rangekey/rangekey_test.go +++ b/internal/rangekey/rangekey_test.go @@ -2,6 +2,7 @@ package rangekey import ( "fmt" + "strconv" "testing" "github.com/cockroachdb/pebble/internal/base" @@ -146,3 +147,27 @@ func TestParseFormatRoundtrip(t *testing.T) { } } } + +func TestRecombinedValueLen_RoundTrip(t *testing.T) { + testCases := []string{ + "a.RANGEKEYSET.1: [(@t22=foo),(@t1=bar)]", + "a.RANGEKEYSET.1: [(@t1=bar)]", + "a.RANGEKEYUNSET.1: [@t9,@t8,@t7,@t6,@t5]", + "a.RANGEKEYDEL.5: foo", + } + for i, in := range testCases { + t.Run(strconv.Itoa(i), func(t *testing.T) { + k, v := Parse(in) + + // Split the value into an end key and a user-value. + endKey, restValue, ok := DecodeEndKey(k.Kind(), v) + require.True(t, ok) + + // Re-encode the end key and user-value. + dst := make([]byte, RecombinedValueLen(k.Kind(), endKey, restValue)) + RecombineValue(k.Kind(), dst, endKey, restValue) + + require.Equal(t, v, dst) + }) + } +} diff --git a/sstable/data_test.go b/sstable/data_test.go index c13aa3f4ce..e277c7d40a 100644 --- a/sstable/data_test.go +++ b/sstable/data_test.go @@ -15,6 +15,7 @@ import ( "github.com/cockroachdb/pebble/internal/base" "github.com/cockroachdb/pebble/internal/datadriven" "github.com/cockroachdb/pebble/internal/keyspan" + "github.com/cockroachdb/pebble/internal/rangekey" "github.com/cockroachdb/pebble/vfs" ) @@ -68,12 +69,20 @@ func runBuildCmd( } w := NewWriter(f0, *writerOpts) - var tombstones []keyspan.Span - f := keyspan.Fragmenter{ + var rangeDels []keyspan.Span + rangeDelFrag := keyspan.Fragmenter{ Cmp: DefaultComparer.Compare, Format: DefaultComparer.FormatKey, Emit: func(fragmented []keyspan.Span) { - tombstones = append(tombstones, fragmented...) + rangeDels = append(rangeDels, fragmented...) + }, + } + var rangeKeys []keyspan.Span + rangeKeyFrag := keyspan.Fragmenter{ + Cmp: DefaultComparer.Compare, + Format: DefaultComparer.FormatKey, + Emit: func(fragmented []keyspan.Span) { + rangeKeys = append(rangeKeys, fragmented...) }, } for _, data := range strings.Split(td.Input, "\n") { @@ -89,7 +98,28 @@ func runBuildCmd( err = errors.Errorf("%v", r) } }() - f.Add(keyspan.Span{Start: key, End: value}) + rangeDelFrag.Add(keyspan.Span{Start: key, End: value}) + }() + if err != nil { + return nil, nil, err + } + case base.InternalKeyKindRangeKeyDelete, + base.InternalKeyKindRangeKeyUnset, + base.InternalKeyKindRangeKeySet: + var err error + func() { + defer func() { + if r := recover(); r != nil { + err = errors.Errorf("%v", r) + } + }() + key, value := rangekey.Parse(data) + endKey, value, ok := rangekey.DecodeEndKey(key.Kind(), value) + if !ok { + err = errors.New("could not decode end key") + return + } + rangeKeyFrag.Add(keyspan.Span{Start: key, End: endKey, Value: value}) }() if err != nil { return nil, nil, err @@ -98,15 +128,24 @@ func runBuildCmd( if err := w.Add(key, value); err != nil { return nil, nil, err } - } } - f.Finish() - for _, v := range tombstones { + rangeDelFrag.Finish() + for _, v := range rangeDels { if err := w.Add(v.Start, v.End); err != nil { return nil, nil, err } } + rangeKeyFrag.Finish() + for _, v := range rangeKeys { + // Reconstitute the value from the end key and the user value. + n := rangekey.RecombinedValueLen(v.Start.Kind(), v.End, v.Value) + b := make([]byte, n) + _ = rangekey.RecombineValue(v.Start.Kind(), b, v.End, v.Value) + if err := w.Add(v.Start, b); err != nil { + return nil, nil, err + } + } if err := w.Close(); err != nil { return nil, nil, err } @@ -151,7 +190,19 @@ func runBuildRawCmd(td *datadriven.TestData) (*WriterMetadata, *Reader, error) { for _, data := range strings.Split(td.Input, "\n") { j := strings.Index(data, ":") key := base.ParseInternalKey(data[:j]) - value := []byte(data[j+1:]) + + var value []byte + switch key.Kind() { + case base.InternalKeyKindRangeKeyDelete, + base.InternalKeyKindRangeKeyUnset, + base.InternalKeyKindRangeKeySet: + // Values for range keys must be converted into their "packed" form before + // being added to the Writer. + _, value = rangekey.Parse(data) + default: + value = []byte(data[j+1:]) + } + if err := w.Add(key, value); err != nil { return nil, nil, err } diff --git a/sstable/properties.go b/sstable/properties.go index c8ff10be8e..7426222fa9 100644 --- a/sstable/properties.go +++ b/sstable/properties.go @@ -114,6 +114,8 @@ type Properties struct { NumMergeOperands uint64 `prop:"rocksdb.merge.operands"` // The number of range deletions in this table. NumRangeDeletions uint64 `prop:"rocksdb.num.range-deletions"` + // The number of range keys in this table. + NumRangeKeys uint64 `prop:"pebble.num.range-keys"` // Timestamp of the earliest key. 0 if unknown. OldestKeyTime uint64 `prop:"rocksdb.oldest.key.time"` // The name of the prefix extractor used in this table. Empty if no prefix @@ -318,6 +320,9 @@ func (p *Properties) save(w *rawBlockWriter) { p.saveUvarint(m, unsafe.Offsetof(p.NumDeletions), p.NumDeletions) p.saveUvarint(m, unsafe.Offsetof(p.NumMergeOperands), p.NumMergeOperands) p.saveUvarint(m, unsafe.Offsetof(p.NumRangeDeletions), p.NumRangeDeletions) + if p.NumRangeKeys > 0 { + p.saveUvarint(m, unsafe.Offsetof(p.NumRangeKeys), p.NumRangeKeys) + } p.saveUvarint(m, unsafe.Offsetof(p.OldestKeyTime), p.OldestKeyTime) if p.PrefixExtractorName != "" { p.saveString(m, unsafe.Offsetof(p.PrefixExtractorName), p.PrefixExtractorName) diff --git a/sstable/properties_test.go b/sstable/properties_test.go index 9173a99d86..2ac9d3b094 100644 --- a/sstable/properties_test.go +++ b/sstable/properties_test.go @@ -85,13 +85,14 @@ func TestPropertiesSave(t *testing.T) { NumEntries: 16, NumMergeOperands: 17, NumRangeDeletions: 18, - OldestKeyTime: 19, + NumRangeKeys: 19, + OldestKeyTime: 20, PrefixExtractorName: "prefix extractor name", PrefixFiltering: true, PropertyCollectorNames: "prefix collector names", - RawKeySize: 20, - RawValueSize: 21, - TopLevelIndexSize: 22, + RawKeySize: 21, + RawValueSize: 22, + TopLevelIndexSize: 23, WholeKeyFiltering: true, UserProperties: map[string]string{ "user-prop-a": "1", diff --git a/sstable/reader.go b/sstable/reader.go index 50362ca4d8..4fefb768b2 100644 --- a/sstable/reader.go +++ b/sstable/reader.go @@ -2018,6 +2018,7 @@ type Reader struct { indexBH BlockHandle filterBH BlockHandle rangeDelBH BlockHandle + rangeKeyBH BlockHandle rangeDelTransform blockTransform propertiesBH BlockHandle metaIndexBH BlockHandle @@ -2133,6 +2134,24 @@ func (r *Reader) NewRawRangeDelIter() (base.InternalIterator, error) { return i, nil } +// NewRawRangeKeyIter returns an internal iterator for the contents of the +// range-key block for the table. Returns nil if the table does not contain any +// range keys. +func (r *Reader) NewRawRangeKeyIter() (base.InternalIterator, error) { + if r.rangeKeyBH.Length == 0 { + return nil, nil + } + h, err := r.readRangeKey() + if err != nil { + return nil, err + } + i := &blockIter{} + if err := i.initHandle(r.Compare, h, r.Properties.GlobalSeqNum); err != nil { + return nil, err + } + return i, nil +} + func (r *Reader) readIndex() (cache.Handle, error) { return r.readBlock(r.indexBH, nil /* transform */, nil /* readaheadState */) } @@ -2145,6 +2164,10 @@ func (r *Reader) readRangeDel() (cache.Handle, error) { return r.readBlock(r.rangeDelBH, r.rangeDelTransform, nil /* readaheadState */) } +func (r *Reader) readRangeKey() (cache.Handle, error) { + return r.readBlock(r.rangeKeyBH, nil /* transform */, nil /* readaheadState */) +} + // readBlock reads and decompresses a block from disk into memory. func (r *Reader) readBlock( bh BlockHandle, transform blockTransform, raState *readaheadState, @@ -2347,6 +2370,10 @@ func (r *Reader) readMetaindex(metaindexBH BlockHandle) error { } } + if bh, ok := meta[metaRangeKeyName]; ok { + r.rangeKeyBH = bh + } + for name, fp := range r.opts.Filters { types := []struct { ftype FilterType diff --git a/sstable/table.go b/sstable/table.go index 94da13d59f..a20d423832 100644 --- a/sstable/table.go +++ b/sstable/table.go @@ -159,6 +159,7 @@ const ( levelDBFormatVersion = 0 rocksDBFormatVersion2 = 2 + metaRangeKeyName = "pebble.range_key" metaPropertiesName = "rocksdb.properties" metaRangeDelName = "rocksdb.range_del" metaRangeDelV2Name = "rocksdb.range_del2" diff --git a/sstable/testdata/rewriter b/sstable/testdata/rewriter index a25dc2f095..42dd23826c 100644 --- a/sstable/testdata/rewriter +++ b/sstable/testdata/rewriter @@ -5,6 +5,7 @@ c_xyz.SET.1:c ---- point: [a_xyz#1,1,c_xyz#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] # rewrite from=xyz to=123 block-size=1 index-block-size=1 filter @@ -22,6 +23,7 @@ ca_xyz.SET.1:c ---- point: [aa_xyz#1,1,ca_xyz#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] rewrite from=yz to=23 block-size=1 index-block-size=1 filter comparer-split-4b-suffix @@ -39,6 +41,7 @@ c_xyz.SET.1:c ---- point: [a_xyz#1,1,c_xyz#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] layout @@ -75,6 +78,7 @@ rewrite from=_xyz to=_123 block-size=1 index-block-size=1 filter comparer-split- ---- point: [a_123#1,1,c_123#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] layout @@ -111,6 +115,7 @@ rewrite from=_123 to=_456 block-size=1 index-block-size=1 filter comparer-split- ---- point: [a_456#1,1,c_456#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] layout @@ -147,6 +152,7 @@ rewrite from=_456 to=_xyz block-size=1 index-block-size=1 filter comparer-split- ---- point: [a_xyz#1,1,c_xyz#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] layout @@ -184,6 +190,7 @@ rewrite from=_xyz to=_123 block-size=1 index-block-size=1 filter comparer-split- ---- point: [a_123#1,1,c_123#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] layout diff --git a/sstable/testdata/writer b/sstable/testdata/writer index 904e35109e..9eece4f283 100644 --- a/sstable/testdata/writer +++ b/sstable/testdata/writer @@ -3,6 +3,7 @@ a.SET.1:a ---- point: [a#1,1,a#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] scan @@ -12,6 +13,9 @@ a#1,1:a scan-range-del ---- +scan-range-key +---- + build a.SET.1:a b.DEL.2: @@ -21,10 +25,14 @@ f.SET.5:f g.DEL.6: h.MERGE.7:h i.RANGEDEL.8:j +j.RANGEKEYDEL.9:k +k.RANGEKEYUNSET.10:l [@t5] +l.RANGEKEYSET.11:m [(@t10=foo)] ---- point: [a#1,1,h#7,2] rangedel: [d#4,15,j#72057594037927935,15] -seqnums: [1,8] +rangekey: [j#9,19,m#72057594037927935,21] +seqnums: [1,11] build a.SET.1:a @@ -38,6 +46,7 @@ i.RANGEDEL.8:j ---- point: [a#1,1,h#7,2] rangedel: [d#4,15,j#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [1,8] scan @@ -65,6 +74,7 @@ j.RANGEDEL.1:z ---- point: [#0,0,#0,0] rangedel: [a#3,15,z#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [1,3] scan @@ -82,6 +92,9 @@ m#2,15:s m#1,15:s s#1,15:z +scan-range-key +---- + # The range tombstone upper bound is exclusive, so a point operation # on that same key will be the actual boundary. @@ -91,6 +104,7 @@ b.SET.4:c ---- point: [b#4,1,b#4,1] rangedel: [a#3,15,b#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [3,4] build @@ -99,6 +113,7 @@ b.SET.2:c ---- point: [b#2,1,b#2,1] rangedel: [a#3,15,b#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [2,3] build @@ -107,6 +122,7 @@ b.SET.2:c ---- point: [b#2,1,b#2,1] rangedel: [a#3,15,c#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [2,3] # Keys must be added in order. @@ -134,6 +150,7 @@ build-raw ---- point: [#0,0,#0,0] rangedel: [#1,15,b#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [1,1] build-raw @@ -160,8 +177,64 @@ c.RANGEDEL.2:d ---- point: [#0,0,#0,0] rangedel: [a#1,15,d#72057594037927935,15] +rangekey: [#0,0,#0,0] +seqnums: [1,2] + +build-raw +a.RANGEKEYSET.1:b [(@t10=foo)] +a.RANGEKEYSET.2:b [(@t10=foo)] +---- +pebble: range keys must be added in order: a#1,RANGEKEYSET, a#2,RANGEKEYSET + +build-raw +b.RANGEKEYSET.2:c [(@t10=foo)] +a.RANGEKEYSET.1:b [(@t10=foo)] +---- +pebble: range keys must be added in order: b#2,RANGEKEYSET, a#1,RANGEKEYSET + +build-raw +a.RANGEKEYSET.1:c [(@t10=foo)] +b.RANGEKEYSET.2:d [(@t10=foo)] +---- +pebble: overlapping range keys must be fragmented: a-c#1, b-d#2 + +build-raw +a.RANGEKEYSET.2:c [(@t10=foo)] +a.RANGEKEYSET.1:d [(@t10=foo)] +---- +pebble: overlapping range keys must be fragmented: a-c#2, a-d#1 + +build-raw +a.RANGEKEYSET.1:c [(@t10=foo)] +c.RANGEKEYSET.2:d [(@t10=foo)] +---- +point: [#0,0,#0,0] +rangedel: [#0,0,#0,0] +rangekey: [a#1,21,d#72057594037927935,21] seqnums: [1,2] +# Range keys may have perfectly aligned spans (including sequence numbers), +# though the key kinds must be ordered (descending). + +build-raw +a.RANGEKEYUNSET.1:b [@t10] +a.RANGEKEYSET.1:b [(@t10=foo)] +---- +pebble: range keys must be added in order: a#1,RANGEKEYUNSET, a#1,RANGEKEYSET + +build-raw +a.RANGEKEYSET.1:b [(@t10=foo)] +a.RANGEKEYUNSET.1:b [@t10] +a.RANGEKEYDEL.1:b +---- +point: [#0,0,#0,0] +rangedel: [#0,0,#0,0] +rangekey: [a#1,21,b#72057594037927935,19] +seqnums: [1,1] + +# FIXME(travers): Additional test cases that cover the ordering of suffix-value +# tuples? + # The range-del-v1 format supports unfragmented and unsorted range # tombstones. @@ -171,6 +244,7 @@ a.RANGEDEL.2:c ---- point: [#0,0,#0,0] rangedel: [a#2,15,c#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [1,2] scan-range-del @@ -184,6 +258,7 @@ b.RANGEDEL.2:d ---- point: [#0,0,#0,0] rangedel: [a#1,15,d#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [1,2] scan-range-del @@ -199,6 +274,7 @@ a.RANGEDEL.1:d ---- point: [#0,0,#0,0] rangedel: [a#2,15,d#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [1,2] scan-range-del @@ -218,6 +294,7 @@ a.RANGEDEL.3:m ---- point: [#0,0,#0,0] rangedel: [a#3,15,z#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [1,3] scan-range-del @@ -241,6 +318,7 @@ c.SET.1:c ---- point: [a#1,1,c#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] layout @@ -273,6 +351,7 @@ c.SET.1:c ---- point: [a#1,1,c#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] layout diff --git a/sstable/writer.go b/sstable/writer.go index 8648a96546..8f97941554 100644 --- a/sstable/writer.go +++ b/sstable/writer.go @@ -19,6 +19,7 @@ import ( "github.com/cockroachdb/pebble/internal/crc" "github.com/cockroachdb/pebble/internal/keyspan" "github.com/cockroachdb/pebble/internal/private" + "github.com/cockroachdb/pebble/internal/rangekey" ) var errWriterClosed = errors.New("pebble: writer is closed") @@ -28,8 +29,10 @@ type WriterMetadata struct { Size uint64 SmallestPoint InternalKey SmallestRangeDel InternalKey + SmallestRangeKey InternalKey LargestPoint InternalKey LargestRangeDel InternalKey + LargestRangeKey InternalKey SmallestSeqNum uint64 LargestSeqNum uint64 Properties Properties @@ -134,6 +137,7 @@ type Writer struct { block blockWriter indexBlock blockWriter rangeDelBlock blockWriter + rangeKeyBlock blockWriter props Properties propCollectors []TablePropertyCollector blockPropCollectors []BlockPropertyCollector @@ -212,19 +216,30 @@ func (w *Writer) Merge(key, value []byte) error { return w.addPoint(base.MakeInternalKey(key, 0, InternalKeyKindMerge), value) } -// Add adds a key/value pair to the table being written. For a given Writer, -// the keys passed to Add must be in increasing order. The exception to this -// rule is range deletion tombstones. Range deletion tombstones need to be -// added ordered by their start key, but they can be added out of order from -// point entries. Additionally, range deletion tombstones must be fragmented -// (i.e. by keyspan.Fragmenter). +// Add adds a key/value pair to the table being written. For a given Writer, the +// keys passed to Add must be in increasing order. The exceptions to this rule +// are range deletion tombstones, and range key sets, unsets and deletes. +// +// Range deletion tombstones need to be added ordered by their start key, but +// they can be added out of order from point entries. Additionally, range +// deletion tombstones must be fragmented (i.e. by keyspan.Fragmenter). +// +// Range keys must be added ordered by their start key, and in their fragmented +// span order. Other than spans that are perfectly aligned (same start and end +// keys), spans may not overlap. Range keys can be added out of order with point +// keys and range deletions. func (w *Writer) Add(key InternalKey, value []byte) error { if w.err != nil { return w.err } - if key.Kind() == InternalKeyKindRangeDelete { + switch key.Kind() { + case InternalKeyKindRangeDelete: return w.addTombstone(key, value) + case base.InternalKeyKindRangeKeyDelete, + base.InternalKeyKindRangeKeyUnset, + base.InternalKeyKindRangeKeySet: + return w.addRangeKey(key, value) } return w.addPoint(key, value) } @@ -370,6 +385,90 @@ func (w *Writer) addTombstone(key InternalKey, value []byte) error { return nil } +func (w *Writer) addRangeKey(key InternalKey, value []byte) error { + if !w.disableKeyOrderChecks && w.rangeKeyBlock.nEntries > 0 { + prevStartKey := base.DecodeInternalKey(w.rangeKeyBlock.curKey) + prevEndKey, prevVal, ok := rangekey.DecodeEndKey(prevStartKey.Kind(), w.rangeKeyBlock.curValue) + if !ok { + // We panic here as we should have previously decoded and validated this + // key and value when it was first added to the range key block. + panic(errors.Errorf("pebble: invalid end key for span: %s", + (keyspan.Span{Start: prevStartKey, End: prevEndKey}).Pretty(w.formatKey))) + } + + curStartKey := key + curEndKey, curVal, ok := rangekey.DecodeEndKey(curStartKey.Kind(), value) + if !ok { + w.err = errors.Errorf("pebble: invalid end key for span: %s", + (keyspan.Span{Start: curStartKey, End: curEndKey}).Pretty(w.formatKey)) + return w.err + } + + switch c := base.InternalCompare(w.compare, prevStartKey, curStartKey); { + case c > 0: + // Previous start key is greater than the current start key. + w.err = errors.Errorf("pebble: range keys must be added in order: %s, %s", + prevStartKey.Pretty(w.formatKey), key.Pretty(w.formatKey)) + return w.err + case c == 0: + // Start keys are equal. End keys must also be equal. + if w.compare(prevVal, curVal) != 0 { + w.err = errors.Errorf("pebble: overlapping range keys must be fragmented: %s, %s", + (keyspan.Span{Start: prevStartKey, End: prevEndKey}).Pretty(w.formatKey), + (keyspan.Span{Start: curStartKey, End: curEndKey}).Pretty(w.formatKey)) + return w.err + } + default: + // Previous start key sorts before the current start key. There are two + // sub-cases: + // 1. If the start user keys are equal, the end keys must be equal (i.e. + // aligned spans). + if w.compare(prevStartKey.UserKey, curStartKey.UserKey) == 0 { + if w.compare(prevEndKey, curEndKey) != 0 { + w.err = errors.Errorf("pebble: overlapping range keys must be fragmented: %s, %s", + (keyspan.Span{Start: prevStartKey, End: prevEndKey}).Pretty(w.formatKey), + (keyspan.Span{Start: curStartKey, End: curEndKey}).Pretty(w.formatKey)) + return w.err + } + } else if w.compare(prevEndKey, curStartKey.UserKey) > 0 { + // 2. If the start user keys are NOT equal, the spans must be disjoint + // (i.e. no overlap). + // NOTE: the inequality excludes zero, as we allow the end key of the + // lower span be the same as the start key of the upper span, because + // the range end key is considered an exclusive bound. + w.err = errors.Errorf("pebble: overlapping range keys must be fragmented: %s, %s", + (keyspan.Span{Start: prevStartKey, End: prevEndKey}).Pretty(w.formatKey), + (keyspan.Span{Start: curStartKey, End: curEndKey}).Pretty(w.formatKey)) + return w.err + } + } + } + + // FIXME(travers): Does there need to be sorting of the suffix-values for + // RANGEKEYSETs and RANGEKEYUNSETs? + + // FIXME(travers): Should this seqnum exist in a "parallel keyspace", or can + // it coexist with the point key seqnums? + w.meta.updateSeqNum(key.SeqNum()) + + // Range tombstones are fragmented, so the start key of the first range key + // added will be the smallest. The largest range key is determined in + // Writer.Close() as the end key of the last range key added to the block. + if w.props.NumRangeKeys == 0 { + w.meta.SmallestRangeKey = key.Clone() + } + + // FIXME(travers): More granular properties for RANGEKEYSET, *UNSET, *DEL + // counts? + // FIXME(travers): Update existing point-key properties to count range keys? + // Probably not? + // - raw key / value sizes + // - num entries + w.props.NumRangeKeys++ + w.rangeKeyBlock.add(key, value) + return nil +} + func (w *Writer) maybeAddToFilter(key []byte) { if w.filter != nil { if w.split != nil { @@ -710,6 +809,34 @@ func (w *Writer) Close() (err error) { } } + // Write the range-key block. + var rangeKeyBH BlockHandle + if w.props.NumRangeKeys > 0 { + key := base.DecodeInternalKey(w.rangeKeyBlock.curKey) + kind := key.Kind() + endKey, _, ok := rangekey.DecodeEndKey(kind, w.rangeKeyBlock.curValue) + if !ok { + w.err = errors.Newf("invalid end key: %s", w.rangeKeyBlock.curValue) + return w.err + } + w.meta.LargestRangeKey = base.MakeInternalKey(endKey, base.InternalKeySeqNumMax, key.Kind()).Clone() + rangeKeyBH, err = w.writeBlock(w.rangeKeyBlock.finish(), NoCompression) + if err != nil { + w.err = err + return w.err + } + } + + // Add the range key block handle to the metaindex block. Note that we add the + // block handle to the metaindex block before the other meta blocks as the + // metaindex block entries must be sorted, and the range key block name sorts + // before the other block names. + // FIXME(travers): update if we use a name other than "pebble.range_key". + if w.props.NumRangeKeys > 0 { + n := encodeBlockHandle(w.tmp[:], rangeKeyBH) + metaindex.add(InternalKey{UserKey: []byte(metaRangeKeyName)}, w.tmp[:n]) + } + { userProps := make(map[string]string) for i := range w.propCollectors { diff --git a/sstable/writer_test.go b/sstable/writer_test.go index 3b3a0ab84c..cdcf408018 100644 --- a/sstable/writer_test.go +++ b/sstable/writer_test.go @@ -49,9 +49,10 @@ func runDataDriven(t *testing.T, file string) { if err != nil { return err.Error() } - return fmt.Sprintf("point: [%s,%s]\nrangedel: [%s,%s]\nseqnums: [%d,%d]\n", + return fmt.Sprintf("point: [%s,%s]\nrangedel: [%s,%s]\nrangekey: [%s,%s]\nseqnums: [%d,%d]\n", meta.SmallestPoint, meta.LargestPoint, meta.SmallestRangeDel, meta.LargestRangeDel, + meta.SmallestRangeKey, meta.LargestRangeKey, meta.SmallestSeqNum, meta.LargestSeqNum) case "build-raw": @@ -65,9 +66,10 @@ func runDataDriven(t *testing.T, file string) { if err != nil { return err.Error() } - return fmt.Sprintf("point: [%s,%s]\nrangedel: [%s,%s]\nseqnums: [%d,%d]\n", + return fmt.Sprintf("point: [%s,%s]\nrangedel: [%s,%s]\nrangekey: [%s,%s]\nseqnums: [%d,%d]\n", meta.SmallestPoint, meta.LargestPoint, meta.SmallestRangeDel, meta.LargestRangeDel, + meta.SmallestRangeKey, meta.LargestRangeKey, meta.SmallestSeqNum, meta.LargestSeqNum) case "scan": @@ -112,6 +114,22 @@ func runDataDriven(t *testing.T, file string) { } return buf.String() + case "scan-range-key": + iter, err := r.NewRawRangeKeyIter() + if err != nil { + return err.Error() + } + if iter == nil { + return "" + } + defer iter.Close() + + var buf bytes.Buffer + for key, val := iter.First(); key != nil; key, val = iter.Next() { + fmt.Fprintf(&buf, "%s:%s\n", key, val) + } + return buf.String() + case "layout": l, err := r.Layout() if err != nil { @@ -139,9 +157,10 @@ func runDataDriven(t *testing.T, file string) { if err != nil { return err.Error() } - return fmt.Sprintf("point: [%s,%s]\nrangedel: [%s,%s]\nseqnums: [%d,%d]\n", + return fmt.Sprintf("point: [%s,%s]\nrangedel: [%s,%s]\nrangekey: [%s,%s]\nseqnums: [%d,%d]\n", meta.SmallestPoint, meta.LargestPoint, meta.SmallestRangeDel, meta.LargestRangeDel, + meta.SmallestRangeKey, meta.LargestRangeKey, meta.SmallestSeqNum, meta.LargestSeqNum) default: diff --git a/testdata/event_listener b/testdata/event_listener index 862b8223ac..d19dadb8a6 100644 --- a/testdata/event_listener +++ b/testdata/event_listener @@ -183,7 +183,7 @@ compact 1 2.3 K 0 B 0 (size == estimated-debt, scor zmemtbl 0 0 B ztbl 0 0 B bcache 8 1.4 K 5.9% (score == hit-rate) - tcache 1 616 B 0.0% (score == hit-rate) + tcache 1 640 B 0.0% (score == hit-rate) titers 0 filter - - 0.0% (score == utility) diff --git a/testdata/ingest b/testdata/ingest index 6277b35876..d18206047e 100644 --- a/testdata/ingest +++ b/testdata/ingest @@ -48,7 +48,7 @@ compact 0 0 B 0 B 0 (size == estimated-debt, scor zmemtbl 0 0 B ztbl 0 0 B bcache 8 1.5 K 46.7% (score == hit-rate) - tcache 1 616 B 50.0% (score == hit-rate) + tcache 1 640 B 50.0% (score == hit-rate) titers 0 filter - - 0.0% (score == utility) diff --git a/testdata/metrics b/testdata/metrics index b5a06a445c..3b874a1f4e 100644 --- a/testdata/metrics +++ b/testdata/metrics @@ -34,7 +34,7 @@ compact 0 0 B 0 B 0 (size == estimated-debt, scor zmemtbl 1 256 K ztbl 0 0 B bcache 4 698 B 0.0% (score == hit-rate) - tcache 1 616 B 0.0% (score == hit-rate) + tcache 1 640 B 0.0% (score == hit-rate) titers 1 filter - - 0.0% (score == utility) @@ -81,7 +81,7 @@ compact 1 0 B 0 B 0 (size == estimated-debt, scor zmemtbl 2 512 K ztbl 2 1.5 K bcache 8 1.4 K 33.3% (score == hit-rate) - tcache 2 1.2 K 50.0% (score == hit-rate) + tcache 2 1.3 K 50.0% (score == hit-rate) titers 2 filter - - 0.0% (score == utility) @@ -113,7 +113,7 @@ compact 1 0 B 0 B 0 (size == estimated-debt, scor zmemtbl 1 256 K ztbl 2 1.5 K bcache 8 1.4 K 33.3% (score == hit-rate) - tcache 2 1.2 K 50.0% (score == hit-rate) + tcache 2 1.3 K 50.0% (score == hit-rate) titers 2 filter - - 0.0% (score == utility) @@ -142,7 +142,7 @@ compact 1 0 B 0 B 0 (size == estimated-debt, scor zmemtbl 1 256 K ztbl 1 771 B bcache 4 698 B 33.3% (score == hit-rate) - tcache 1 616 B 50.0% (score == hit-rate) + tcache 1 640 B 50.0% (score == hit-rate) titers 1 filter - - 0.0% (score == utility)