From 156a588bcc221ace58075d4dad342a6da54de97d Mon Sep 17 00:00:00 2001 From: Nick Travers Date: Tue, 7 Dec 2021 11:51:19 -0800 Subject: [PATCH] sstable: add meta block for range keys; support writing range keys Range keys (see #1341) will be stored in their own, single block of an sstable. Add a new, optional meta block, indexed as "pebble.range_key" in the metablock index, to the sstable structure. This block is only present when at least one range key has been written to the sstable. Add the ability to add range keys to an sstable via `(*sstable.Writer).Write`. Update existing data-driven tests to support printing of the range key summary. Add additional test coverage demonstrating writing of range keys with an `sstable.Writer`. Add minimal functionality to `sstable.Reader` to support writing the data-driven test cases for the writer. Additional read-oriented functionality will be added in a subsequent patch. Related to #1339. --- internal/rangekey/rangekey.go | 24 +++++ internal/rangekey/rangekey_test.go | 25 +++++ sstable/data_test.go | 67 ++++++++++++-- sstable/properties.go | 5 + sstable/properties_test.go | 9 +- sstable/reader.go | 27 ++++++ sstable/table.go | 2 + sstable/testdata/rewriter | 7 ++ sstable/testdata/writer | 81 +++++++++++++++- sstable/writer.go | 143 +++++++++++++++++++++++++++-- sstable/writer_test.go | 25 ++++- testdata/event_listener | 2 +- testdata/ingest | 2 +- testdata/metrics | 8 +- 14 files changed, 398 insertions(+), 29 deletions(-) diff --git a/internal/rangekey/rangekey.go b/internal/rangekey/rangekey.go index c692ff6495..5007b9406b 100644 --- a/internal/rangekey/rangekey.go +++ b/internal/rangekey/rangekey.go @@ -323,6 +323,30 @@ func Parse(s string) (key base.InternalKey, value []byte) { } } +// RecombinedValueLen returns the length of the byte slice that results from +// re-encoding the end key and the user-value as a physical range key value. +func RecombinedValueLen(kind base.InternalKeyKind, endKey, userValue []byte) int { + n := len(endKey) + if kind == base.InternalKeyKindRangeKeyDelete { + // RANGEKEYDELs are not varint encoded. + return n + } + return lenVarint(len(endKey)) + len(endKey) + len(userValue) +} + +// RecombineValue re-encodes the end key and user-value as a physical range key +// value into the destination byte slice. +func RecombineValue(kind base.InternalKeyKind, dst, endKey, userValue []byte) int { + if kind == base.InternalKeyKindRangeKeyDelete { + // RANGEKEYDELs are not varint encoded. + return copy(dst, endKey) + } + n := binary.PutUvarint(dst, uint64(len(endKey))) + n += copy(dst[n:], endKey) + n += copy(dst[n:], userValue) + return n +} + func lenVarint(v int) (n int) { x := uint32(v) n++ diff --git a/internal/rangekey/rangekey_test.go b/internal/rangekey/rangekey_test.go index 5c1b6b9c3a..68d073a928 100644 --- a/internal/rangekey/rangekey_test.go +++ b/internal/rangekey/rangekey_test.go @@ -2,6 +2,7 @@ package rangekey import ( "fmt" + "strconv" "testing" "github.com/cockroachdb/pebble/internal/base" @@ -146,3 +147,27 @@ func TestParseFormatRoundtrip(t *testing.T) { } } } + +func TestRecombinedValueLen_RoundTrip(t *testing.T) { + testCases := []string{ + "a.RANGEKEYSET.1: [(@t22=foo),(@t1=bar)]", + "a.RANGEKEYSET.1: [(@t1=bar)]", + "a.RANGEKEYUNSET.1: [@t9,@t8,@t7,@t6,@t5]", + "a.RANGEKEYDEL.5: foo", + } + for i, in := range testCases { + t.Run(strconv.Itoa(i), func(t *testing.T) { + k, v := Parse(in) + + // Split the value into an end key and a user-value. + endKey, restValue, ok := DecodeEndKey(k.Kind(), v) + require.True(t, ok) + + // Re-encode the end key and user-value. + dst := make([]byte, RecombinedValueLen(k.Kind(), endKey, restValue)) + RecombineValue(k.Kind(), dst, endKey, restValue) + + require.Equal(t, v, dst) + }) + } +} diff --git a/sstable/data_test.go b/sstable/data_test.go index c13aa3f4ce..e277c7d40a 100644 --- a/sstable/data_test.go +++ b/sstable/data_test.go @@ -15,6 +15,7 @@ import ( "github.com/cockroachdb/pebble/internal/base" "github.com/cockroachdb/pebble/internal/datadriven" "github.com/cockroachdb/pebble/internal/keyspan" + "github.com/cockroachdb/pebble/internal/rangekey" "github.com/cockroachdb/pebble/vfs" ) @@ -68,12 +69,20 @@ func runBuildCmd( } w := NewWriter(f0, *writerOpts) - var tombstones []keyspan.Span - f := keyspan.Fragmenter{ + var rangeDels []keyspan.Span + rangeDelFrag := keyspan.Fragmenter{ Cmp: DefaultComparer.Compare, Format: DefaultComparer.FormatKey, Emit: func(fragmented []keyspan.Span) { - tombstones = append(tombstones, fragmented...) + rangeDels = append(rangeDels, fragmented...) + }, + } + var rangeKeys []keyspan.Span + rangeKeyFrag := keyspan.Fragmenter{ + Cmp: DefaultComparer.Compare, + Format: DefaultComparer.FormatKey, + Emit: func(fragmented []keyspan.Span) { + rangeKeys = append(rangeKeys, fragmented...) }, } for _, data := range strings.Split(td.Input, "\n") { @@ -89,7 +98,28 @@ func runBuildCmd( err = errors.Errorf("%v", r) } }() - f.Add(keyspan.Span{Start: key, End: value}) + rangeDelFrag.Add(keyspan.Span{Start: key, End: value}) + }() + if err != nil { + return nil, nil, err + } + case base.InternalKeyKindRangeKeyDelete, + base.InternalKeyKindRangeKeyUnset, + base.InternalKeyKindRangeKeySet: + var err error + func() { + defer func() { + if r := recover(); r != nil { + err = errors.Errorf("%v", r) + } + }() + key, value := rangekey.Parse(data) + endKey, value, ok := rangekey.DecodeEndKey(key.Kind(), value) + if !ok { + err = errors.New("could not decode end key") + return + } + rangeKeyFrag.Add(keyspan.Span{Start: key, End: endKey, Value: value}) }() if err != nil { return nil, nil, err @@ -98,15 +128,24 @@ func runBuildCmd( if err := w.Add(key, value); err != nil { return nil, nil, err } - } } - f.Finish() - for _, v := range tombstones { + rangeDelFrag.Finish() + for _, v := range rangeDels { if err := w.Add(v.Start, v.End); err != nil { return nil, nil, err } } + rangeKeyFrag.Finish() + for _, v := range rangeKeys { + // Reconstitute the value from the end key and the user value. + n := rangekey.RecombinedValueLen(v.Start.Kind(), v.End, v.Value) + b := make([]byte, n) + _ = rangekey.RecombineValue(v.Start.Kind(), b, v.End, v.Value) + if err := w.Add(v.Start, b); err != nil { + return nil, nil, err + } + } if err := w.Close(); err != nil { return nil, nil, err } @@ -151,7 +190,19 @@ func runBuildRawCmd(td *datadriven.TestData) (*WriterMetadata, *Reader, error) { for _, data := range strings.Split(td.Input, "\n") { j := strings.Index(data, ":") key := base.ParseInternalKey(data[:j]) - value := []byte(data[j+1:]) + + var value []byte + switch key.Kind() { + case base.InternalKeyKindRangeKeyDelete, + base.InternalKeyKindRangeKeyUnset, + base.InternalKeyKindRangeKeySet: + // Values for range keys must be converted into their "packed" form before + // being added to the Writer. + _, value = rangekey.Parse(data) + default: + value = []byte(data[j+1:]) + } + if err := w.Add(key, value); err != nil { return nil, nil, err } diff --git a/sstable/properties.go b/sstable/properties.go index c8ff10be8e..7426222fa9 100644 --- a/sstable/properties.go +++ b/sstable/properties.go @@ -114,6 +114,8 @@ type Properties struct { NumMergeOperands uint64 `prop:"rocksdb.merge.operands"` // The number of range deletions in this table. NumRangeDeletions uint64 `prop:"rocksdb.num.range-deletions"` + // The number of range keys in this table. + NumRangeKeys uint64 `prop:"pebble.num.range-keys"` // Timestamp of the earliest key. 0 if unknown. OldestKeyTime uint64 `prop:"rocksdb.oldest.key.time"` // The name of the prefix extractor used in this table. Empty if no prefix @@ -318,6 +320,9 @@ func (p *Properties) save(w *rawBlockWriter) { p.saveUvarint(m, unsafe.Offsetof(p.NumDeletions), p.NumDeletions) p.saveUvarint(m, unsafe.Offsetof(p.NumMergeOperands), p.NumMergeOperands) p.saveUvarint(m, unsafe.Offsetof(p.NumRangeDeletions), p.NumRangeDeletions) + if p.NumRangeKeys > 0 { + p.saveUvarint(m, unsafe.Offsetof(p.NumRangeKeys), p.NumRangeKeys) + } p.saveUvarint(m, unsafe.Offsetof(p.OldestKeyTime), p.OldestKeyTime) if p.PrefixExtractorName != "" { p.saveString(m, unsafe.Offsetof(p.PrefixExtractorName), p.PrefixExtractorName) diff --git a/sstable/properties_test.go b/sstable/properties_test.go index 9173a99d86..2ac9d3b094 100644 --- a/sstable/properties_test.go +++ b/sstable/properties_test.go @@ -85,13 +85,14 @@ func TestPropertiesSave(t *testing.T) { NumEntries: 16, NumMergeOperands: 17, NumRangeDeletions: 18, - OldestKeyTime: 19, + NumRangeKeys: 19, + OldestKeyTime: 20, PrefixExtractorName: "prefix extractor name", PrefixFiltering: true, PropertyCollectorNames: "prefix collector names", - RawKeySize: 20, - RawValueSize: 21, - TopLevelIndexSize: 22, + RawKeySize: 21, + RawValueSize: 22, + TopLevelIndexSize: 23, WholeKeyFiltering: true, UserProperties: map[string]string{ "user-prop-a": "1", diff --git a/sstable/reader.go b/sstable/reader.go index 50362ca4d8..4fefb768b2 100644 --- a/sstable/reader.go +++ b/sstable/reader.go @@ -2018,6 +2018,7 @@ type Reader struct { indexBH BlockHandle filterBH BlockHandle rangeDelBH BlockHandle + rangeKeyBH BlockHandle rangeDelTransform blockTransform propertiesBH BlockHandle metaIndexBH BlockHandle @@ -2133,6 +2134,24 @@ func (r *Reader) NewRawRangeDelIter() (base.InternalIterator, error) { return i, nil } +// NewRawRangeKeyIter returns an internal iterator for the contents of the +// range-key block for the table. Returns nil if the table does not contain any +// range keys. +func (r *Reader) NewRawRangeKeyIter() (base.InternalIterator, error) { + if r.rangeKeyBH.Length == 0 { + return nil, nil + } + h, err := r.readRangeKey() + if err != nil { + return nil, err + } + i := &blockIter{} + if err := i.initHandle(r.Compare, h, r.Properties.GlobalSeqNum); err != nil { + return nil, err + } + return i, nil +} + func (r *Reader) readIndex() (cache.Handle, error) { return r.readBlock(r.indexBH, nil /* transform */, nil /* readaheadState */) } @@ -2145,6 +2164,10 @@ func (r *Reader) readRangeDel() (cache.Handle, error) { return r.readBlock(r.rangeDelBH, r.rangeDelTransform, nil /* readaheadState */) } +func (r *Reader) readRangeKey() (cache.Handle, error) { + return r.readBlock(r.rangeKeyBH, nil /* transform */, nil /* readaheadState */) +} + // readBlock reads and decompresses a block from disk into memory. func (r *Reader) readBlock( bh BlockHandle, transform blockTransform, raState *readaheadState, @@ -2347,6 +2370,10 @@ func (r *Reader) readMetaindex(metaindexBH BlockHandle) error { } } + if bh, ok := meta[metaRangeKeyName]; ok { + r.rangeKeyBH = bh + } + for name, fp := range r.opts.Filters { types := []struct { ftype FilterType diff --git a/sstable/table.go b/sstable/table.go index 94da13d59f..24e571672f 100644 --- a/sstable/table.go +++ b/sstable/table.go @@ -159,6 +159,8 @@ const ( levelDBFormatVersion = 0 rocksDBFormatVersion2 = 2 + // FIXME(travers): Are we ok with this naming? + metaRangeKeyName = "pebble.range_key" metaPropertiesName = "rocksdb.properties" metaRangeDelName = "rocksdb.range_del" metaRangeDelV2Name = "rocksdb.range_del2" diff --git a/sstable/testdata/rewriter b/sstable/testdata/rewriter index a25dc2f095..42dd23826c 100644 --- a/sstable/testdata/rewriter +++ b/sstable/testdata/rewriter @@ -5,6 +5,7 @@ c_xyz.SET.1:c ---- point: [a_xyz#1,1,c_xyz#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] # rewrite from=xyz to=123 block-size=1 index-block-size=1 filter @@ -22,6 +23,7 @@ ca_xyz.SET.1:c ---- point: [aa_xyz#1,1,ca_xyz#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] rewrite from=yz to=23 block-size=1 index-block-size=1 filter comparer-split-4b-suffix @@ -39,6 +41,7 @@ c_xyz.SET.1:c ---- point: [a_xyz#1,1,c_xyz#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] layout @@ -75,6 +78,7 @@ rewrite from=_xyz to=_123 block-size=1 index-block-size=1 filter comparer-split- ---- point: [a_123#1,1,c_123#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] layout @@ -111,6 +115,7 @@ rewrite from=_123 to=_456 block-size=1 index-block-size=1 filter comparer-split- ---- point: [a_456#1,1,c_456#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] layout @@ -147,6 +152,7 @@ rewrite from=_456 to=_xyz block-size=1 index-block-size=1 filter comparer-split- ---- point: [a_xyz#1,1,c_xyz#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] layout @@ -184,6 +190,7 @@ rewrite from=_xyz to=_123 block-size=1 index-block-size=1 filter comparer-split- ---- point: [a_123#1,1,c_123#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] layout diff --git a/sstable/testdata/writer b/sstable/testdata/writer index 904e35109e..bdc6da7126 100644 --- a/sstable/testdata/writer +++ b/sstable/testdata/writer @@ -3,6 +3,7 @@ a.SET.1:a ---- point: [a#1,1,a#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] scan @@ -12,6 +13,9 @@ a#1,1:a scan-range-del ---- +scan-range-key +---- + build a.SET.1:a b.DEL.2: @@ -21,10 +25,14 @@ f.SET.5:f g.DEL.6: h.MERGE.7:h i.RANGEDEL.8:j +j.RANGEKEYDEL.9:k +k.RANGEKEYUNSET.10:l [@t5] +l.RANGEKEYSET.11:m [(@t10=foo)] ---- point: [a#1,1,h#7,2] rangedel: [d#4,15,j#72057594037927935,15] -seqnums: [1,8] +rangekey: [j#9,19,m#72057594037927935,21] +seqnums: [1,11] build a.SET.1:a @@ -38,6 +46,7 @@ i.RANGEDEL.8:j ---- point: [a#1,1,h#7,2] rangedel: [d#4,15,j#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [1,8] scan @@ -65,6 +74,7 @@ j.RANGEDEL.1:z ---- point: [#0,0,#0,0] rangedel: [a#3,15,z#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [1,3] scan @@ -82,6 +92,9 @@ m#2,15:s m#1,15:s s#1,15:z +scan-range-key +---- + # The range tombstone upper bound is exclusive, so a point operation # on that same key will be the actual boundary. @@ -91,6 +104,7 @@ b.SET.4:c ---- point: [b#4,1,b#4,1] rangedel: [a#3,15,b#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [3,4] build @@ -99,6 +113,7 @@ b.SET.2:c ---- point: [b#2,1,b#2,1] rangedel: [a#3,15,b#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [2,3] build @@ -107,6 +122,7 @@ b.SET.2:c ---- point: [b#2,1,b#2,1] rangedel: [a#3,15,c#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [2,3] # Keys must be added in order. @@ -134,6 +150,7 @@ build-raw ---- point: [#0,0,#0,0] rangedel: [#1,15,b#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [1,1] build-raw @@ -160,8 +177,64 @@ c.RANGEDEL.2:d ---- point: [#0,0,#0,0] rangedel: [a#1,15,d#72057594037927935,15] +rangekey: [#0,0,#0,0] +seqnums: [1,2] + +build-raw +a.RANGEKEYSET.1:b [(@t10=foo)] +a.RANGEKEYSET.2:b [(@t10=foo)] +---- +pebble: range keys must be added in order: a#1,RANGEKEYSET, a#2,RANGEKEYSET + +build-raw +b.RANGEKEYSET.2:c [(@t10=foo)] +a.RANGEKEYSET.1:b [(@t10=foo)] +---- +pebble: range keys must be added in order: b#2,RANGEKEYSET, a#1,RANGEKEYSET + +build-raw +a.RANGEKEYSET.1:c [(@t10=foo)] +b.RANGEKEYSET.2:d [(@t10=foo)] +---- +pebble: overlapping range keys must be fragmented: a-c#1, b-d#2 + +build-raw +a.RANGEKEYSET.2:c [(@t10=foo)] +a.RANGEKEYSET.1:d [(@t10=foo)] +---- +pebble: overlapping range keys must be fragmented: a-c#2, a-d#1 + +build-raw +a.RANGEKEYSET.1:c [(@t10=foo)] +c.RANGEKEYSET.2:d [(@t10=foo)] +---- +point: [#0,0,#0,0] +rangedel: [#0,0,#0,0] +rangekey: [a#1,21,d#72057594037927935,21] seqnums: [1,2] +# Range keys may have perfectly aligned spans (including sequence numbers), +# though the key kinds must be ordered (descending). + +build-raw +a.RANGEKEYUNSET.1:b [@t10] +a.RANGEKEYSET.1:b [(@t10=foo)] +---- +pebble: range keys must be added in order: a#1,RANGEKEYUNSET, a#1,RANGEKEYSET + +build-raw +a.RANGEKEYSET.1:b [(@t10=foo)] +a.RANGEKEYUNSET.1:b [@t10] +a.RANGEKEYDEL.1:b +---- +point: [#0,0,#0,0] +rangedel: [#0,0,#0,0] +rangekey: [a#1,21,b#72057594037927935,19] +seqnums: [1,1] + +# FIXME(travers): Additional test cases that cover the ordering of suffix-value +# tuples? + # The range-del-v1 format supports unfragmented and unsorted range # tombstones. @@ -171,6 +244,7 @@ a.RANGEDEL.2:c ---- point: [#0,0,#0,0] rangedel: [a#2,15,c#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [1,2] scan-range-del @@ -184,6 +258,7 @@ b.RANGEDEL.2:d ---- point: [#0,0,#0,0] rangedel: [a#1,15,d#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [1,2] scan-range-del @@ -199,6 +274,7 @@ a.RANGEDEL.1:d ---- point: [#0,0,#0,0] rangedel: [a#2,15,d#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [1,2] scan-range-del @@ -218,6 +294,7 @@ a.RANGEDEL.3:m ---- point: [#0,0,#0,0] rangedel: [a#3,15,z#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [1,3] scan-range-del @@ -241,6 +318,7 @@ c.SET.1:c ---- point: [a#1,1,c#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] layout @@ -273,6 +351,7 @@ c.SET.1:c ---- point: [a#1,1,c#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] layout diff --git a/sstable/writer.go b/sstable/writer.go index 8648a96546..7a19bbf05d 100644 --- a/sstable/writer.go +++ b/sstable/writer.go @@ -19,6 +19,7 @@ import ( "github.com/cockroachdb/pebble/internal/crc" "github.com/cockroachdb/pebble/internal/keyspan" "github.com/cockroachdb/pebble/internal/private" + "github.com/cockroachdb/pebble/internal/rangekey" ) var errWriterClosed = errors.New("pebble: writer is closed") @@ -28,8 +29,10 @@ type WriterMetadata struct { Size uint64 SmallestPoint InternalKey SmallestRangeDel InternalKey + SmallestRangeKey InternalKey LargestPoint InternalKey LargestRangeDel InternalKey + LargestRangeKey InternalKey SmallestSeqNum uint64 LargestSeqNum uint64 Properties Properties @@ -134,6 +137,7 @@ type Writer struct { block blockWriter indexBlock blockWriter rangeDelBlock blockWriter + rangeKeyBlock blockWriter props Properties propCollectors []TablePropertyCollector blockPropCollectors []BlockPropertyCollector @@ -212,19 +216,30 @@ func (w *Writer) Merge(key, value []byte) error { return w.addPoint(base.MakeInternalKey(key, 0, InternalKeyKindMerge), value) } -// Add adds a key/value pair to the table being written. For a given Writer, -// the keys passed to Add must be in increasing order. The exception to this -// rule is range deletion tombstones. Range deletion tombstones need to be -// added ordered by their start key, but they can be added out of order from -// point entries. Additionally, range deletion tombstones must be fragmented -// (i.e. by keyspan.Fragmenter). +// Add adds a key/value pair to the table being written. For a given Writer, the +// keys passed to Add must be in increasing order. The exceptions to this rule +// are range deletion tombstones, and range key sets, unsets and deletes. +// +// Range deletion tombstones need to be added ordered by their start key, but +// they can be added out of order from point entries. Additionally, range +// deletion tombstones must be fragmented (i.e. by keyspan.Fragmenter). +// +// Range keys must be added ordered by their start key, and in their fragmented +// span order. Other than spans that are perfectly aligned (same start and end +// keys), spans may not overlap. Range keys can be added out of order with point +// keys and range deletions. func (w *Writer) Add(key InternalKey, value []byte) error { if w.err != nil { return w.err } - if key.Kind() == InternalKeyKindRangeDelete { + switch key.Kind() { + case InternalKeyKindRangeDelete: return w.addTombstone(key, value) + case base.InternalKeyKindRangeKeyDelete, + base.InternalKeyKindRangeKeyUnset, + base.InternalKeyKindRangeKeySet: + return w.addRangeKey(key, value) } return w.addPoint(key, value) } @@ -370,6 +385,92 @@ func (w *Writer) addTombstone(key InternalKey, value []byte) error { return nil } +func (w *Writer) addRangeKey(key InternalKey, value []byte) error { + if !w.disableKeyOrderChecks && w.rangeKeyBlock.nEntries > 0 { + // Convert both the previous and current keys into spans so that we can + // compare them. + prevStartKey := base.DecodeInternalKey(w.rangeKeyBlock.curKey) + prevEndKey, prevVal, ok := rangekey.DecodeEndKey(prevStartKey.Kind(), w.rangeKeyBlock.curValue) + if !ok { + // We panic here as we should have previously decoded and validated this + // key and value when it was first added to teh range key block. + panic(errors.Errorf("pebble: invalid end key for span: %s", + (keyspan.Span{Start: prevStartKey, End: prevEndKey}).Pretty(w.formatKey))) + } + + curStartKey := key + curEndKey, curVal, ok := rangekey.DecodeEndKey(curStartKey.Kind(), value) + if !ok { + w.err = errors.Errorf("pebble: invalid end key for span: %s", + (keyspan.Span{Start: prevStartKey, End: prevEndKey}).Pretty(w.formatKey)) + return w.err + } + + switch c := base.InternalCompare(w.compare, prevStartKey, curStartKey); { + case c > 0: + // Previous start key is greater than the current start key. + w.err = errors.Errorf("pebble: range keys must be added in order: %s, %s", + prevStartKey.Pretty(w.formatKey), key.Pretty(w.formatKey)) + return w.err + case c == 0: + // Start keys are equal. End keys must also be equal. + if w.compare(prevVal, curVal) != 0 { + w.err = errors.Errorf("pebble: overlapping range keys must be fragmented: %s, %s", + (keyspan.Span{Start: prevStartKey, End: prevEndKey}).Pretty(w.formatKey), + (keyspan.Span{Start: curStartKey, End: curEndKey}).Pretty(w.formatKey)) + return w.err + } + default: + // Previous start key sorts before the current start key. There are two + // sub-cases: + // 1. If the start user keys are equal, the end keys must be equal (i.e. + // aligned spans). + if w.compare(prevStartKey.UserKey, curStartKey.UserKey) == 0 { + if w.compare(prevEndKey, curEndKey) != 0 { + w.err = errors.Errorf("pebble: overlapping range keys must be fragmented: %s, %s", + (keyspan.Span{Start: prevStartKey, End: prevEndKey}).Pretty(w.formatKey), + (keyspan.Span{Start: curStartKey, End: curEndKey}).Pretty(w.formatKey)) + return w.err + } + } else if w.compare(prevEndKey, curStartKey.UserKey) > 0 { + // 2. If the start user keys are NOT equal, the spans must be disjoint + // (i.e. no overlap). + // NOTE: the inequality excludes zero, as we allow the end key of the + // lower span be the same as the start key of the upper span, even + // though the range end key is considered an exclusive bound. + w.err = errors.Errorf("pebble: overlapping range keys must be fragmented: %s, %s", + (keyspan.Span{Start: prevStartKey, End: prevEndKey}).Pretty(w.formatKey), + (keyspan.Span{Start: curStartKey, End: curEndKey}).Pretty(w.formatKey)) + return w.err + } + } + } + + // FIXME(travers): Does there need to be sorting of the suffix-values for + // RANGEKEYSETs and RANGEKEYUNSETs? + + // FIXME(travers): Should this seqnum exist in a "parallel keyspace", or can + // it coexist with the point key seqnums? + w.meta.updateSeqNum(key.SeqNum()) + + // Range tombstones are fragmented, so the start key of the first range key + // added will be the smallest. The largest range key is determined in + // Writer.Close() as the end key of the last range key added to the block. + if w.props.NumRangeKeys == 0 { + w.meta.SmallestRangeKey = key.Clone() + } + + // FIXME(travers): More granular properties for RANGEKEYSET, *UNSET, *DEL + // counts? + // FIXME(travers): Update existing point-key properties to count range keys? + // Probably not? + // - raw key / value sizes + // - num entries + w.props.NumRangeKeys++ + w.rangeKeyBlock.add(key, value) + return nil +} + func (w *Writer) maybeAddToFilter(key []byte) { if w.filter != nil { if w.split != nil { @@ -710,6 +811,34 @@ func (w *Writer) Close() (err error) { } } + // Write the range-key block. + var rangeKeyBH BlockHandle + if w.props.NumRangeKeys > 0 { + key := base.DecodeInternalKey(w.rangeKeyBlock.curKey) + kind := key.Kind() + endKey, _, ok := rangekey.DecodeEndKey(kind, w.rangeKeyBlock.curValue) + if !ok { + w.err = errors.Newf("invalid end key: %s", w.rangeKeyBlock.curValue) + return w.err + } + w.meta.LargestRangeKey = base.MakeInternalKey(endKey, base.InternalKeySeqNumMax, key.Kind()).Clone() + rangeKeyBH, err = w.writeBlock(w.rangeKeyBlock.finish(), NoCompression) + if err != nil { + w.err = err + return w.err + } + } + + // Add the range key block handle to the metaindex block. Note that we add the + // block handle to the metaindex block before the other meta blocks as the + // metaindex block entries must be sorted, and the range key block name sorts + // before the other block names. + // FIXME(travers): update if we use a name other than "pebble.range_key". + if w.props.NumRangeKeys > 0 { + n := encodeBlockHandle(w.tmp[:], rangeKeyBH) + metaindex.add(InternalKey{UserKey: []byte(metaRangeKeyName)}, w.tmp[:n]) + } + { userProps := make(map[string]string) for i := range w.propCollectors { diff --git a/sstable/writer_test.go b/sstable/writer_test.go index 3b3a0ab84c..cdcf408018 100644 --- a/sstable/writer_test.go +++ b/sstable/writer_test.go @@ -49,9 +49,10 @@ func runDataDriven(t *testing.T, file string) { if err != nil { return err.Error() } - return fmt.Sprintf("point: [%s,%s]\nrangedel: [%s,%s]\nseqnums: [%d,%d]\n", + return fmt.Sprintf("point: [%s,%s]\nrangedel: [%s,%s]\nrangekey: [%s,%s]\nseqnums: [%d,%d]\n", meta.SmallestPoint, meta.LargestPoint, meta.SmallestRangeDel, meta.LargestRangeDel, + meta.SmallestRangeKey, meta.LargestRangeKey, meta.SmallestSeqNum, meta.LargestSeqNum) case "build-raw": @@ -65,9 +66,10 @@ func runDataDriven(t *testing.T, file string) { if err != nil { return err.Error() } - return fmt.Sprintf("point: [%s,%s]\nrangedel: [%s,%s]\nseqnums: [%d,%d]\n", + return fmt.Sprintf("point: [%s,%s]\nrangedel: [%s,%s]\nrangekey: [%s,%s]\nseqnums: [%d,%d]\n", meta.SmallestPoint, meta.LargestPoint, meta.SmallestRangeDel, meta.LargestRangeDel, + meta.SmallestRangeKey, meta.LargestRangeKey, meta.SmallestSeqNum, meta.LargestSeqNum) case "scan": @@ -112,6 +114,22 @@ func runDataDriven(t *testing.T, file string) { } return buf.String() + case "scan-range-key": + iter, err := r.NewRawRangeKeyIter() + if err != nil { + return err.Error() + } + if iter == nil { + return "" + } + defer iter.Close() + + var buf bytes.Buffer + for key, val := iter.First(); key != nil; key, val = iter.Next() { + fmt.Fprintf(&buf, "%s:%s\n", key, val) + } + return buf.String() + case "layout": l, err := r.Layout() if err != nil { @@ -139,9 +157,10 @@ func runDataDriven(t *testing.T, file string) { if err != nil { return err.Error() } - return fmt.Sprintf("point: [%s,%s]\nrangedel: [%s,%s]\nseqnums: [%d,%d]\n", + return fmt.Sprintf("point: [%s,%s]\nrangedel: [%s,%s]\nrangekey: [%s,%s]\nseqnums: [%d,%d]\n", meta.SmallestPoint, meta.LargestPoint, meta.SmallestRangeDel, meta.LargestRangeDel, + meta.SmallestRangeKey, meta.LargestRangeKey, meta.SmallestSeqNum, meta.LargestSeqNum) default: diff --git a/testdata/event_listener b/testdata/event_listener index 862b8223ac..d19dadb8a6 100644 --- a/testdata/event_listener +++ b/testdata/event_listener @@ -183,7 +183,7 @@ compact 1 2.3 K 0 B 0 (size == estimated-debt, scor zmemtbl 0 0 B ztbl 0 0 B bcache 8 1.4 K 5.9% (score == hit-rate) - tcache 1 616 B 0.0% (score == hit-rate) + tcache 1 640 B 0.0% (score == hit-rate) titers 0 filter - - 0.0% (score == utility) diff --git a/testdata/ingest b/testdata/ingest index 6277b35876..d18206047e 100644 --- a/testdata/ingest +++ b/testdata/ingest @@ -48,7 +48,7 @@ compact 0 0 B 0 B 0 (size == estimated-debt, scor zmemtbl 0 0 B ztbl 0 0 B bcache 8 1.5 K 46.7% (score == hit-rate) - tcache 1 616 B 50.0% (score == hit-rate) + tcache 1 640 B 50.0% (score == hit-rate) titers 0 filter - - 0.0% (score == utility) diff --git a/testdata/metrics b/testdata/metrics index b5a06a445c..3b874a1f4e 100644 --- a/testdata/metrics +++ b/testdata/metrics @@ -34,7 +34,7 @@ compact 0 0 B 0 B 0 (size == estimated-debt, scor zmemtbl 1 256 K ztbl 0 0 B bcache 4 698 B 0.0% (score == hit-rate) - tcache 1 616 B 0.0% (score == hit-rate) + tcache 1 640 B 0.0% (score == hit-rate) titers 1 filter - - 0.0% (score == utility) @@ -81,7 +81,7 @@ compact 1 0 B 0 B 0 (size == estimated-debt, scor zmemtbl 2 512 K ztbl 2 1.5 K bcache 8 1.4 K 33.3% (score == hit-rate) - tcache 2 1.2 K 50.0% (score == hit-rate) + tcache 2 1.3 K 50.0% (score == hit-rate) titers 2 filter - - 0.0% (score == utility) @@ -113,7 +113,7 @@ compact 1 0 B 0 B 0 (size == estimated-debt, scor zmemtbl 1 256 K ztbl 2 1.5 K bcache 8 1.4 K 33.3% (score == hit-rate) - tcache 2 1.2 K 50.0% (score == hit-rate) + tcache 2 1.3 K 50.0% (score == hit-rate) titers 2 filter - - 0.0% (score == utility) @@ -142,7 +142,7 @@ compact 1 0 B 0 B 0 (size == estimated-debt, scor zmemtbl 1 256 K ztbl 1 771 B bcache 4 698 B 33.3% (score == hit-rate) - tcache 1 616 B 50.0% (score == hit-rate) + tcache 1 640 B 50.0% (score == hit-rate) titers 1 filter - - 0.0% (score == utility)