From 989fa6f0fce07a92d222d977422c78db17cfa622 Mon Sep 17 00:00:00 2001 From: Nick Travers Date: Tue, 7 Dec 2021 11:51:19 -0800 Subject: [PATCH] sstable: add meta block for range keys; support writing range keys Range keys (see #1341) will be stored in their own, single block of an sstable. Add a new, optional meta block, indexed as "pebble.range_key" in the metablock index, to the sstable structure. This block is only present when at least one range key has been written to the sstable. Add the ability to add range keys to an sstable via `(*sstable.Writer).Write`. Update existing data-driven tests to support printing of the range key summary. Add additional test coverage demonstrating writing of range keys with an `sstable.Writer`. Add minimal functionality to `sstable.Reader` to support writing the data-driven test cases for the writer. Additional read-oriented functionality will be added in a subsequent patch. Related to #1339. --- internal/rangekey/rangekey.go | 24 +++++ internal/rangekey/rangekey_test.go | 25 +++++ sstable/block.go | 15 +-- sstable/data_test.go | 72 ++++++++++++-- sstable/properties.go | 22 +++++ sstable/properties_test.go | 11 ++- sstable/reader.go | 27 ++++++ sstable/table.go | 4 +- sstable/testdata/rewriter | 7 ++ sstable/testdata/writer | 78 ++++++++++++++- sstable/writer.go | 146 ++++++++++++++++++++++++++++- sstable/writer_test.go | 25 ++++- testdata/event_listener | 2 +- testdata/ingest | 2 +- testdata/metrics | 8 +- 15 files changed, 435 insertions(+), 33 deletions(-) diff --git a/internal/rangekey/rangekey.go b/internal/rangekey/rangekey.go index c692ff6495..5007b9406b 100644 --- a/internal/rangekey/rangekey.go +++ b/internal/rangekey/rangekey.go @@ -323,6 +323,30 @@ func Parse(s string) (key base.InternalKey, value []byte) { } } +// RecombinedValueLen returns the length of the byte slice that results from +// re-encoding the end key and the user-value as a physical range key value. +func RecombinedValueLen(kind base.InternalKeyKind, endKey, userValue []byte) int { + n := len(endKey) + if kind == base.InternalKeyKindRangeKeyDelete { + // RANGEKEYDELs are not varint encoded. + return n + } + return lenVarint(len(endKey)) + len(endKey) + len(userValue) +} + +// RecombineValue re-encodes the end key and user-value as a physical range key +// value into the destination byte slice. +func RecombineValue(kind base.InternalKeyKind, dst, endKey, userValue []byte) int { + if kind == base.InternalKeyKindRangeKeyDelete { + // RANGEKEYDELs are not varint encoded. + return copy(dst, endKey) + } + n := binary.PutUvarint(dst, uint64(len(endKey))) + n += copy(dst[n:], endKey) + n += copy(dst[n:], userValue) + return n +} + func lenVarint(v int) (n int) { x := uint32(v) n++ diff --git a/internal/rangekey/rangekey_test.go b/internal/rangekey/rangekey_test.go index 5c1b6b9c3a..68d073a928 100644 --- a/internal/rangekey/rangekey_test.go +++ b/internal/rangekey/rangekey_test.go @@ -2,6 +2,7 @@ package rangekey import ( "fmt" + "strconv" "testing" "github.com/cockroachdb/pebble/internal/base" @@ -146,3 +147,27 @@ func TestParseFormatRoundtrip(t *testing.T) { } } } + +func TestRecombinedValueLen_RoundTrip(t *testing.T) { + testCases := []string{ + "a.RANGEKEYSET.1: [(@t22=foo),(@t1=bar)]", + "a.RANGEKEYSET.1: [(@t1=bar)]", + "a.RANGEKEYUNSET.1: [@t9,@t8,@t7,@t6,@t5]", + "a.RANGEKEYDEL.5: foo", + } + for i, in := range testCases { + t.Run(strconv.Itoa(i), func(t *testing.T) { + k, v := Parse(in) + + // Split the value into an end key and a user-value. + endKey, restValue, ok := DecodeEndKey(k.Kind(), v) + require.True(t, ok) + + // Re-encode the end key and user-value. + dst := make([]byte, RecombinedValueLen(k.Kind(), endKey, restValue)) + RecombineValue(k.Kind(), dst, endKey, restValue) + + require.Equal(t, v, dst) + }) + } +} diff --git a/sstable/block.go b/sstable/block.go index 8ef57da2d3..3e2437892b 100644 --- a/sstable/block.go +++ b/sstable/block.go @@ -183,14 +183,15 @@ type blockEntry struct { // interval of 1 (no prefix compression), blockIter guarantees that // InternalKey.UserKey will point to the key as stored in the block itself // which will remain valid until the blockIter is closed. The key stability -// guarantee is used by the range tombstone code which knows that range -// tombstones are always encoded with a restart interval of 1. This per-block -// key stability guarantee is sufficient for range tombstones as they are -// always encoded in a single block. +// guarantee is used by the range tombstone and range key code, which knows that +// the respective blocks are always encoded with a restart interval of 1. This +// per-block key stability guarantee is sufficient for range tombstones and +// range deletes as they are always encoded in a single block. // -// A blockIter also provides a value stability guarantee for range deletions -// since there is only a single range deletion block per sstable and the -// blockIter will not release the bytes for the block until it is closed. +// A blockIter also provides a value stability guarantee for range deletions and +// range keys since there is only a single range deletion and range key block +// per sstable and the blockIter will not release the bytes for the block until +// it is closed. type blockIter struct { cmp Compare // offset is the byte index that marks where the current key/value is diff --git a/sstable/data_test.go b/sstable/data_test.go index c13aa3f4ce..2e748cc3e3 100644 --- a/sstable/data_test.go +++ b/sstable/data_test.go @@ -15,6 +15,7 @@ import ( "github.com/cockroachdb/pebble/internal/base" "github.com/cockroachdb/pebble/internal/datadriven" "github.com/cockroachdb/pebble/internal/keyspan" + "github.com/cockroachdb/pebble/internal/rangekey" "github.com/cockroachdb/pebble/vfs" ) @@ -68,12 +69,20 @@ func runBuildCmd( } w := NewWriter(f0, *writerOpts) - var tombstones []keyspan.Span - f := keyspan.Fragmenter{ + var rangeDels []keyspan.Span + rangeDelFrag := keyspan.Fragmenter{ Cmp: DefaultComparer.Compare, Format: DefaultComparer.FormatKey, Emit: func(fragmented []keyspan.Span) { - tombstones = append(tombstones, fragmented...) + rangeDels = append(rangeDels, fragmented...) + }, + } + var rangeKeys []keyspan.Span + rangeKeyFrag := keyspan.Fragmenter{ + Cmp: DefaultComparer.Compare, + Format: DefaultComparer.FormatKey, + Emit: func(fragmented []keyspan.Span) { + rangeKeys = append(rangeKeys, fragmented...) }, } for _, data := range strings.Split(td.Input, "\n") { @@ -89,7 +98,28 @@ func runBuildCmd( err = errors.Errorf("%v", r) } }() - f.Add(keyspan.Span{Start: key, End: value}) + rangeDelFrag.Add(keyspan.Span{Start: key, End: value}) + }() + if err != nil { + return nil, nil, err + } + case base.InternalKeyKindRangeKeyDelete, + base.InternalKeyKindRangeKeyUnset, + base.InternalKeyKindRangeKeySet: + var err error + func() { + defer func() { + if r := recover(); r != nil { + err = errors.Errorf("%v", r) + } + }() + key, value := rangekey.Parse(data) + endKey, value, ok := rangekey.DecodeEndKey(key.Kind(), value) + if !ok { + err = errors.New("could not decode end key") + return + } + rangeKeyFrag.Add(keyspan.Span{Start: key, End: endKey, Value: value}) }() if err != nil { return nil, nil, err @@ -98,15 +128,24 @@ func runBuildCmd( if err := w.Add(key, value); err != nil { return nil, nil, err } - } } - f.Finish() - for _, v := range tombstones { + rangeDelFrag.Finish() + for _, v := range rangeDels { if err := w.Add(v.Start, v.End); err != nil { return nil, nil, err } } + rangeKeyFrag.Finish() + for _, v := range rangeKeys { + // Reconstitute the value from the end key and the user value. + n := rangekey.RecombinedValueLen(v.Start.Kind(), v.End, v.Value) + b := make([]byte, n) + _ = rangekey.RecombineValue(v.Start.Kind(), b, v.End, v.Value) + if err := w.AddInternalRangeKey(v.Start, b); err != nil { + return nil, nil, err + } + } if err := w.Close(); err != nil { return nil, nil, err } @@ -151,9 +190,22 @@ func runBuildRawCmd(td *datadriven.TestData) (*WriterMetadata, *Reader, error) { for _, data := range strings.Split(td.Input, "\n") { j := strings.Index(data, ":") key := base.ParseInternalKey(data[:j]) - value := []byte(data[j+1:]) - if err := w.Add(key, value); err != nil { - return nil, nil, err + + switch key.Kind() { + case base.InternalKeyKindRangeKeyDelete, + base.InternalKeyKindRangeKeyUnset, + base.InternalKeyKindRangeKeySet: + // Values for range keys must be converted into their "packed" form before + // being added to the Writer. + _, value := rangekey.Parse(data) + if err := w.AddInternalRangeKey(key, value); err != nil { + return nil, nil, err + } + default: + value := []byte(data[j+1:]) + if err := w.Add(key, value); err != nil { + return nil, nil, err + } } } if err := w.Close(); err != nil { diff --git a/sstable/properties.go b/sstable/properties.go index c8ff10be8e..7ebb9835b9 100644 --- a/sstable/properties.go +++ b/sstable/properties.go @@ -114,6 +114,12 @@ type Properties struct { NumMergeOperands uint64 `prop:"rocksdb.merge.operands"` // The number of range deletions in this table. NumRangeDeletions uint64 `prop:"rocksdb.num.range-deletions"` + // The number of RANGEKEYDELs in this table. + NumRangeKeyDels uint64 `prop:"pebble.num.range-key-dels"` + // The number of RANGEKEYSETs in this table. + NumRangeKeySets uint64 `prop:"pebble.num.range-key-sets"` + // The number of RANGEKEYUNSETs in this table. + NumRangeKeyUnsets uint64 `prop:"pebble.num.range-key-unsets"` // Timestamp of the earliest key. 0 if unknown. OldestKeyTime uint64 `prop:"rocksdb.oldest.key.time"` // The name of the prefix extractor used in this table. Empty if no prefix @@ -126,6 +132,10 @@ type Properties struct { PropertyCollectorNames string `prop:"rocksdb.property.collectors"` // Total raw key size. RawKeySize uint64 `prop:"rocksdb.raw.key.size"` + // Total raw rangekey key size. + RawRangeKeyKeySize uint64 `prop:"pebble.raw.rangekey.key.size"` + // Total raw rangekey value size. + RawRangeKeyValueSize uint64 `prop:"pebble.raw.rangekey.value.size"` // Total raw value size. RawValueSize uint64 `prop:"rocksdb.raw.value.size"` // Size of the top-level index if kTwoLevelIndexSearch is used. @@ -147,6 +157,11 @@ func (p *Properties) NumPointDeletions() uint64 { return p.NumDeletions - p.NumRangeDeletions } +// NumRangeKeys returns a count of the number of range keys in this table. +func (p *Properties) NumRangeKeys() uint64 { + return p.NumRangeKeyDels + p.NumRangeKeySets + p.NumRangeKeyUnsets +} + func (p *Properties) String() string { var buf bytes.Buffer v := reflect.ValueOf(*p) @@ -318,6 +333,13 @@ func (p *Properties) save(w *rawBlockWriter) { p.saveUvarint(m, unsafe.Offsetof(p.NumDeletions), p.NumDeletions) p.saveUvarint(m, unsafe.Offsetof(p.NumMergeOperands), p.NumMergeOperands) p.saveUvarint(m, unsafe.Offsetof(p.NumRangeDeletions), p.NumRangeDeletions) + if p.NumRangeKeys() > 0 { + p.saveUvarint(m, unsafe.Offsetof(p.NumRangeKeyDels), p.NumRangeKeyDels) + p.saveUvarint(m, unsafe.Offsetof(p.NumRangeKeySets), p.NumRangeKeySets) + p.saveUvarint(m, unsafe.Offsetof(p.NumRangeKeyUnsets), p.NumRangeKeyUnsets) + p.saveUvarint(m, unsafe.Offsetof(p.RawRangeKeyKeySize), p.RawRangeKeyKeySize) + p.saveUvarint(m, unsafe.Offsetof(p.RawRangeKeyValueSize), p.RawRangeKeyValueSize) + } p.saveUvarint(m, unsafe.Offsetof(p.OldestKeyTime), p.OldestKeyTime) if p.PrefixExtractorName != "" { p.saveString(m, unsafe.Offsetof(p.PrefixExtractorName), p.PrefixExtractorName) diff --git a/sstable/properties_test.go b/sstable/properties_test.go index 9173a99d86..a044e0e3c3 100644 --- a/sstable/properties_test.go +++ b/sstable/properties_test.go @@ -85,13 +85,16 @@ func TestPropertiesSave(t *testing.T) { NumEntries: 16, NumMergeOperands: 17, NumRangeDeletions: 18, - OldestKeyTime: 19, + NumRangeKeyDels: 19, + NumRangeKeySets: 20, + NumRangeKeyUnsets: 21, + OldestKeyTime: 22, PrefixExtractorName: "prefix extractor name", PrefixFiltering: true, PropertyCollectorNames: "prefix collector names", - RawKeySize: 20, - RawValueSize: 21, - TopLevelIndexSize: 22, + RawKeySize: 23, + RawValueSize: 24, + TopLevelIndexSize: 25, WholeKeyFiltering: true, UserProperties: map[string]string{ "user-prop-a": "1", diff --git a/sstable/reader.go b/sstable/reader.go index 50362ca4d8..4fefb768b2 100644 --- a/sstable/reader.go +++ b/sstable/reader.go @@ -2018,6 +2018,7 @@ type Reader struct { indexBH BlockHandle filterBH BlockHandle rangeDelBH BlockHandle + rangeKeyBH BlockHandle rangeDelTransform blockTransform propertiesBH BlockHandle metaIndexBH BlockHandle @@ -2133,6 +2134,24 @@ func (r *Reader) NewRawRangeDelIter() (base.InternalIterator, error) { return i, nil } +// NewRawRangeKeyIter returns an internal iterator for the contents of the +// range-key block for the table. Returns nil if the table does not contain any +// range keys. +func (r *Reader) NewRawRangeKeyIter() (base.InternalIterator, error) { + if r.rangeKeyBH.Length == 0 { + return nil, nil + } + h, err := r.readRangeKey() + if err != nil { + return nil, err + } + i := &blockIter{} + if err := i.initHandle(r.Compare, h, r.Properties.GlobalSeqNum); err != nil { + return nil, err + } + return i, nil +} + func (r *Reader) readIndex() (cache.Handle, error) { return r.readBlock(r.indexBH, nil /* transform */, nil /* readaheadState */) } @@ -2145,6 +2164,10 @@ func (r *Reader) readRangeDel() (cache.Handle, error) { return r.readBlock(r.rangeDelBH, r.rangeDelTransform, nil /* readaheadState */) } +func (r *Reader) readRangeKey() (cache.Handle, error) { + return r.readBlock(r.rangeKeyBH, nil /* transform */, nil /* readaheadState */) +} + // readBlock reads and decompresses a block from disk into memory. func (r *Reader) readBlock( bh BlockHandle, transform blockTransform, raState *readaheadState, @@ -2347,6 +2370,10 @@ func (r *Reader) readMetaindex(metaindexBH BlockHandle) error { } } + if bh, ok := meta[metaRangeKeyName]; ok { + r.rangeKeyBH = bh + } + for name, fp := range r.opts.Filters { types := []struct { ftype FilterType diff --git a/sstable/table.go b/sstable/table.go index 0227283060..da36c50d33 100644 --- a/sstable/table.go +++ b/sstable/table.go @@ -85,8 +85,9 @@ The table file format looks like: [data block N-1] [meta filter block] (optional) [index block] (for single level index) -[meta rangedel block] (optional) +[meta range key block] (optional) [meta properties block] +[meta rangedel block] (optional) [metaindex block] [footer] @@ -179,6 +180,7 @@ const ( levelDBFormatVersion = 0 rocksDBFormatVersion2 = 2 + metaRangeKeyName = "pebble.range_key" metaPropertiesName = "rocksdb.properties" metaRangeDelName = "rocksdb.range_del" metaRangeDelV2Name = "rocksdb.range_del2" diff --git a/sstable/testdata/rewriter b/sstable/testdata/rewriter index a25dc2f095..42dd23826c 100644 --- a/sstable/testdata/rewriter +++ b/sstable/testdata/rewriter @@ -5,6 +5,7 @@ c_xyz.SET.1:c ---- point: [a_xyz#1,1,c_xyz#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] # rewrite from=xyz to=123 block-size=1 index-block-size=1 filter @@ -22,6 +23,7 @@ ca_xyz.SET.1:c ---- point: [aa_xyz#1,1,ca_xyz#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] rewrite from=yz to=23 block-size=1 index-block-size=1 filter comparer-split-4b-suffix @@ -39,6 +41,7 @@ c_xyz.SET.1:c ---- point: [a_xyz#1,1,c_xyz#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] layout @@ -75,6 +78,7 @@ rewrite from=_xyz to=_123 block-size=1 index-block-size=1 filter comparer-split- ---- point: [a_123#1,1,c_123#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] layout @@ -111,6 +115,7 @@ rewrite from=_123 to=_456 block-size=1 index-block-size=1 filter comparer-split- ---- point: [a_456#1,1,c_456#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] layout @@ -147,6 +152,7 @@ rewrite from=_456 to=_xyz block-size=1 index-block-size=1 filter comparer-split- ---- point: [a_xyz#1,1,c_xyz#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] layout @@ -184,6 +190,7 @@ rewrite from=_xyz to=_123 block-size=1 index-block-size=1 filter comparer-split- ---- point: [a_123#1,1,c_123#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] layout diff --git a/sstable/testdata/writer b/sstable/testdata/writer index 904e35109e..3742e8a5cb 100644 --- a/sstable/testdata/writer +++ b/sstable/testdata/writer @@ -3,6 +3,7 @@ a.SET.1:a ---- point: [a#1,1,a#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] scan @@ -12,6 +13,9 @@ a#1,1:a scan-range-del ---- +scan-range-key +---- + build a.SET.1:a b.DEL.2: @@ -21,10 +25,14 @@ f.SET.5:f g.DEL.6: h.MERGE.7:h i.RANGEDEL.8:j +j.RANGEKEYDEL.9:k +k.RANGEKEYUNSET.10:l [@t5] +l.RANGEKEYSET.11:m [(@t10=foo)] ---- point: [a#1,1,h#7,2] rangedel: [d#4,15,j#72057594037927935,15] -seqnums: [1,8] +rangekey: [j#9,19,m#72057594037927935,21] +seqnums: [1,11] build a.SET.1:a @@ -38,6 +46,7 @@ i.RANGEDEL.8:j ---- point: [a#1,1,h#7,2] rangedel: [d#4,15,j#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [1,8] scan @@ -65,6 +74,7 @@ j.RANGEDEL.1:z ---- point: [#0,0,#0,0] rangedel: [a#3,15,z#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [1,3] scan @@ -82,6 +92,9 @@ m#2,15:s m#1,15:s s#1,15:z +scan-range-key +---- + # The range tombstone upper bound is exclusive, so a point operation # on that same key will be the actual boundary. @@ -91,6 +104,7 @@ b.SET.4:c ---- point: [b#4,1,b#4,1] rangedel: [a#3,15,b#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [3,4] build @@ -99,6 +113,7 @@ b.SET.2:c ---- point: [b#2,1,b#2,1] rangedel: [a#3,15,b#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [2,3] build @@ -107,6 +122,7 @@ b.SET.2:c ---- point: [b#2,1,b#2,1] rangedel: [a#3,15,c#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [2,3] # Keys must be added in order. @@ -134,6 +150,7 @@ build-raw ---- point: [#0,0,#0,0] rangedel: [#1,15,b#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [1,1] build-raw @@ -160,8 +177,61 @@ c.RANGEDEL.2:d ---- point: [#0,0,#0,0] rangedel: [a#1,15,d#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [1,2] +build-raw +a.RANGEKEYSET.1:b [(@t10=foo)] +a.RANGEKEYSET.2:b [(@t10=foo)] +---- +pebble: range keys starts must be added in strictly increasing order: a#1,RANGEKEYSET, a#2,RANGEKEYSET + +build-raw +b.RANGEKEYSET.2:c [(@t10=foo)] +a.RANGEKEYSET.1:b [(@t10=foo)] +---- +pebble: range keys starts must be added in strictly increasing order: b#2,RANGEKEYSET, a#1,RANGEKEYSET + +build-raw +a.RANGEKEYSET.1:c [(@t10=foo)] +b.RANGEKEYSET.2:d [(@t10=foo)] +---- +pebble: overlapping range keys must be fragmented: a-c#1, b-d#2 + +build-raw +a.RANGEKEYSET.2:c [(@t10=foo)] +a.RANGEKEYSET.1:d [(@t10=foo)] +---- +pebble: overlapping range keys must be fragmented: a-c#2, a-d#1 + +build-raw +a.RANGEKEYSET.1:c [(@t10=foo)] +c.RANGEKEYSET.2:d [(@t10=foo)] +---- +point: [#0,0,#0,0] +rangedel: [#0,0,#0,0] +rangekey: [a#1,21,d#72057594037927935,21] +seqnums: [1,2] + +# Range keys may have perfectly aligned spans (including sequence numbers), +# though the key kinds must be ordered (descending). + +build-raw +a.RANGEKEYUNSET.1:b [@t10] +a.RANGEKEYSET.1:b [(@t10=foo)] +---- +pebble: range keys starts must be added in strictly increasing order: a#1,RANGEKEYUNSET, a#1,RANGEKEYSET + +build-raw +a.RANGEKEYSET.1:b [(@t10=foo)] +a.RANGEKEYUNSET.1:b [@t10] +a.RANGEKEYDEL.1:b +---- +point: [#0,0,#0,0] +rangedel: [#0,0,#0,0] +rangekey: [a#1,21,b#72057594037927935,19] +seqnums: [1,1] + # The range-del-v1 format supports unfragmented and unsorted range # tombstones. @@ -171,6 +241,7 @@ a.RANGEDEL.2:c ---- point: [#0,0,#0,0] rangedel: [a#2,15,c#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [1,2] scan-range-del @@ -184,6 +255,7 @@ b.RANGEDEL.2:d ---- point: [#0,0,#0,0] rangedel: [a#1,15,d#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [1,2] scan-range-del @@ -199,6 +271,7 @@ a.RANGEDEL.1:d ---- point: [#0,0,#0,0] rangedel: [a#2,15,d#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [1,2] scan-range-del @@ -218,6 +291,7 @@ a.RANGEDEL.3:m ---- point: [#0,0,#0,0] rangedel: [a#3,15,z#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [1,3] scan-range-del @@ -241,6 +315,7 @@ c.SET.1:c ---- point: [a#1,1,c#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] layout @@ -273,6 +348,7 @@ c.SET.1:c ---- point: [a#1,1,c#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] layout diff --git a/sstable/writer.go b/sstable/writer.go index 8648a96546..304eba9826 100644 --- a/sstable/writer.go +++ b/sstable/writer.go @@ -19,6 +19,7 @@ import ( "github.com/cockroachdb/pebble/internal/crc" "github.com/cockroachdb/pebble/internal/keyspan" "github.com/cockroachdb/pebble/internal/private" + "github.com/cockroachdb/pebble/internal/rangekey" ) var errWriterClosed = errors.New("pebble: writer is closed") @@ -28,8 +29,10 @@ type WriterMetadata struct { Size uint64 SmallestPoint InternalKey SmallestRangeDel InternalKey + SmallestRangeKey InternalKey LargestPoint InternalKey LargestRangeDel InternalKey + LargestRangeKey InternalKey SmallestSeqNum uint64 LargestSeqNum uint64 Properties Properties @@ -134,6 +137,7 @@ type Writer struct { block blockWriter indexBlock blockWriter rangeDelBlock blockWriter + rangeKeyBlock blockWriter props Properties propCollectors []TablePropertyCollector blockPropCollectors []BlockPropertyCollector @@ -223,8 +227,15 @@ func (w *Writer) Add(key InternalKey, value []byte) error { return w.err } - if key.Kind() == InternalKeyKindRangeDelete { + switch key.Kind() { + case InternalKeyKindRangeDelete: return w.addTombstone(key, value) + case base.InternalKeyKindRangeKeyDelete, + base.InternalKeyKindRangeKeySet, + base.InternalKeyKindRangeKeyUnset: + w.err = errors.Errorf( + "pebble: range keys must be added via one of the AddRangeKey functions") + return w.err } return w.addPoint(key, value) } @@ -370,6 +381,106 @@ func (w *Writer) addTombstone(key InternalKey, value []byte) error { return nil } +// AddInternalRangeKey adds a range key set, unset, or delete key/value pair to +// the table being written. +// +// Range keys must be supplied in strictly ascending order of start key (i.e. +// user key ascending, sequence number descending, and key type descending). +// Ranges added must also be supplied in fragmented span order - i.e. other than +// spans that are perfectly aligned (same start and end keys), spans may not +// overlap. Range keys may be added out of order relative to point keys and +// range deletions. +func (w *Writer) AddInternalRangeKey(key InternalKey, value []byte) error { + if w.err != nil { + return w.err + } + return w.addRangeKey(key, value) +} + +func (w *Writer) addRangeKey(key InternalKey, value []byte) error { + if !w.disableKeyOrderChecks && w.rangeKeyBlock.nEntries > 0 { + prevStartKey := base.DecodeInternalKey(w.rangeKeyBlock.curKey) + prevEndKey, _, ok := rangekey.DecodeEndKey(prevStartKey.Kind(), w.rangeKeyBlock.curValue) + if !ok { + // We panic here as we should have previously decoded and validated this + // key and value when it was first added to the range key block. + panic(errors.Errorf("pebble: invalid end key for span: %s", + (keyspan.Span{Start: prevStartKey, End: prevEndKey}).Pretty(w.formatKey))) + } + + curStartKey := key + curEndKey, _, ok := rangekey.DecodeEndKey(curStartKey.Kind(), value) + if !ok { + w.err = errors.Errorf("pebble: invalid end key for span: %s", + (keyspan.Span{Start: curStartKey, End: curEndKey}).Pretty(w.formatKey)) + return w.err + } + + // Start keys must be strictly increasing. + if base.InternalCompare(w.compare, prevStartKey, curStartKey) >= 0 { + w.err = errors.Errorf( + "pebble: range keys starts must be added in strictly increasing order: %s, %s", + prevStartKey.Pretty(w.formatKey), key.Pretty(w.formatKey)) + return w.err + } + + // Start keys are strictly increasing. If the start user keys are equal, the + // end keys must be equal (i.e. aligned spans). + if w.compare(prevStartKey.UserKey, curStartKey.UserKey) == 0 { + if w.compare(prevEndKey, curEndKey) != 0 { + w.err = errors.Errorf("pebble: overlapping range keys must be fragmented: %s, %s", + (keyspan.Span{Start: prevStartKey, End: prevEndKey}).Pretty(w.formatKey), + (keyspan.Span{Start: curStartKey, End: curEndKey}).Pretty(w.formatKey)) + return w.err + } + } else if w.compare(prevEndKey, curStartKey.UserKey) > 0 { + // If the start user keys are NOT equal, the spans must be disjoint (i.e. + // no overlap). + // NOTE: the inequality excludes zero, as we allow the end key of the + // lower span be the same as the start key of the upper span, because + // the range end key is considered an exclusive bound. + w.err = errors.Errorf("pebble: overlapping range keys must be fragmented: %s, %s", + (keyspan.Span{Start: prevStartKey, End: prevEndKey}).Pretty(w.formatKey), + (keyspan.Span{Start: curStartKey, End: curEndKey}).Pretty(w.formatKey)) + return w.err + } + } + + // TODO(travers): Add an invariant-gated check to ensure that suffix-values + // are sorted within coalesced spans. + + // Range-keys and point-keys are intended to live in "parallel" keyspaces. + // However, we track a single seqnum in the table metadata that spans both of + // these keyspaces. + // TODO(travers): Consider tracking range key seqnums separately. + w.meta.updateSeqNum(key.SeqNum()) + + // Range tombstones are fragmented, so the start key of the first range key + // added will be the smallest. The largest range key is determined in + // Writer.Close() as the end key of the last range key added to the block. + if w.props.NumRangeKeys() == 0 { + w.meta.SmallestRangeKey = key.Clone() + } + + // Update block properties. + w.props.RawRangeKeyKeySize += uint64(key.Size()) + w.props.RawRangeKeyValueSize += uint64(len(value)) + switch key.Kind() { + case base.InternalKeyKindRangeKeyDelete: + w.props.NumRangeKeyDels++ + case base.InternalKeyKindRangeKeySet: + w.props.NumRangeKeySets++ + case base.InternalKeyKindRangeKeyUnset: + w.props.NumRangeKeyUnsets++ + default: + panic(errors.Errorf("pebble: invalid range key type: %s", key.Kind())) + } + + // Add the key to the block. + w.rangeKeyBlock.add(key, value) + return nil +} + func (w *Writer) maybeAddToFilter(key []byte) { if w.filter != nil { if w.split != nil { @@ -710,6 +821,36 @@ func (w *Writer) Close() (err error) { } } + // Write the range-key block. + var rangeKeyBH BlockHandle + if w.props.NumRangeKeys() > 0 { + key := base.DecodeInternalKey(w.rangeKeyBlock.curKey) + kind := key.Kind() + endKey, _, ok := rangekey.DecodeEndKey(kind, w.rangeKeyBlock.curValue) + if !ok { + w.err = errors.Newf("invalid end key: %s", w.rangeKeyBlock.curValue) + return w.err + } + w.meta.LargestRangeKey = base.MakeInternalKey(endKey, base.InternalKeySeqNumMax, key.Kind()).Clone() + // TODO(travers): The lack of compression on the range key block matches the + // lack of compression on the range-del block. Revisit whether we want to + // enable compression on this block. + rangeKeyBH, err = w.writeBlock(w.rangeKeyBlock.finish(), NoCompression) + if err != nil { + w.err = err + return w.err + } + } + + // Add the range key block handle to the metaindex block. Note that we add the + // block handle to the metaindex block before the other meta blocks as the + // metaindex block entries must be sorted, and the range key block name sorts + // before the other block names. + if w.props.NumRangeKeys() > 0 { + n := encodeBlockHandle(w.tmp[:], rangeKeyBH) + metaindex.add(InternalKey{UserKey: []byte(metaRangeKeyName)}, w.tmp[:n]) + } + { userProps := make(map[string]string) for i := range w.propCollectors { @@ -899,6 +1040,9 @@ func NewWriter(f writeCloseSyncer, o WriterOptions, extraOpts ...WriterOption) * rangeDelBlock: blockWriter{ restartInterval: 1, }, + rangeKeyBlock: blockWriter{ + restartInterval: 1, + }, topLevelIndexBlock: blockWriter{ restartInterval: 1, }, diff --git a/sstable/writer_test.go b/sstable/writer_test.go index 3b3a0ab84c..cdcf408018 100644 --- a/sstable/writer_test.go +++ b/sstable/writer_test.go @@ -49,9 +49,10 @@ func runDataDriven(t *testing.T, file string) { if err != nil { return err.Error() } - return fmt.Sprintf("point: [%s,%s]\nrangedel: [%s,%s]\nseqnums: [%d,%d]\n", + return fmt.Sprintf("point: [%s,%s]\nrangedel: [%s,%s]\nrangekey: [%s,%s]\nseqnums: [%d,%d]\n", meta.SmallestPoint, meta.LargestPoint, meta.SmallestRangeDel, meta.LargestRangeDel, + meta.SmallestRangeKey, meta.LargestRangeKey, meta.SmallestSeqNum, meta.LargestSeqNum) case "build-raw": @@ -65,9 +66,10 @@ func runDataDriven(t *testing.T, file string) { if err != nil { return err.Error() } - return fmt.Sprintf("point: [%s,%s]\nrangedel: [%s,%s]\nseqnums: [%d,%d]\n", + return fmt.Sprintf("point: [%s,%s]\nrangedel: [%s,%s]\nrangekey: [%s,%s]\nseqnums: [%d,%d]\n", meta.SmallestPoint, meta.LargestPoint, meta.SmallestRangeDel, meta.LargestRangeDel, + meta.SmallestRangeKey, meta.LargestRangeKey, meta.SmallestSeqNum, meta.LargestSeqNum) case "scan": @@ -112,6 +114,22 @@ func runDataDriven(t *testing.T, file string) { } return buf.String() + case "scan-range-key": + iter, err := r.NewRawRangeKeyIter() + if err != nil { + return err.Error() + } + if iter == nil { + return "" + } + defer iter.Close() + + var buf bytes.Buffer + for key, val := iter.First(); key != nil; key, val = iter.Next() { + fmt.Fprintf(&buf, "%s:%s\n", key, val) + } + return buf.String() + case "layout": l, err := r.Layout() if err != nil { @@ -139,9 +157,10 @@ func runDataDriven(t *testing.T, file string) { if err != nil { return err.Error() } - return fmt.Sprintf("point: [%s,%s]\nrangedel: [%s,%s]\nseqnums: [%d,%d]\n", + return fmt.Sprintf("point: [%s,%s]\nrangedel: [%s,%s]\nrangekey: [%s,%s]\nseqnums: [%d,%d]\n", meta.SmallestPoint, meta.LargestPoint, meta.SmallestRangeDel, meta.LargestRangeDel, + meta.SmallestRangeKey, meta.LargestRangeKey, meta.SmallestSeqNum, meta.LargestSeqNum) default: diff --git a/testdata/event_listener b/testdata/event_listener index 040e329c14..0821a74870 100644 --- a/testdata/event_listener +++ b/testdata/event_listener @@ -183,7 +183,7 @@ compact 1 2.3 K 0 B 0 (size == estimated-debt, scor zmemtbl 0 0 B ztbl 0 0 B bcache 8 1.4 K 5.9% (score == hit-rate) - tcache 1 616 B 0.0% (score == hit-rate) + tcache 1 672 B 0.0% (score == hit-rate) titers 0 filter - - 0.0% (score == utility) diff --git a/testdata/ingest b/testdata/ingest index 6277b35876..5cf09c7a3a 100644 --- a/testdata/ingest +++ b/testdata/ingest @@ -48,7 +48,7 @@ compact 0 0 B 0 B 0 (size == estimated-debt, scor zmemtbl 0 0 B ztbl 0 0 B bcache 8 1.5 K 46.7% (score == hit-rate) - tcache 1 616 B 50.0% (score == hit-rate) + tcache 1 672 B 50.0% (score == hit-rate) titers 0 filter - - 0.0% (score == utility) diff --git a/testdata/metrics b/testdata/metrics index b5a06a445c..0f74a6de77 100644 --- a/testdata/metrics +++ b/testdata/metrics @@ -34,7 +34,7 @@ compact 0 0 B 0 B 0 (size == estimated-debt, scor zmemtbl 1 256 K ztbl 0 0 B bcache 4 698 B 0.0% (score == hit-rate) - tcache 1 616 B 0.0% (score == hit-rate) + tcache 1 672 B 0.0% (score == hit-rate) titers 1 filter - - 0.0% (score == utility) @@ -81,7 +81,7 @@ compact 1 0 B 0 B 0 (size == estimated-debt, scor zmemtbl 2 512 K ztbl 2 1.5 K bcache 8 1.4 K 33.3% (score == hit-rate) - tcache 2 1.2 K 50.0% (score == hit-rate) + tcache 2 1.3 K 50.0% (score == hit-rate) titers 2 filter - - 0.0% (score == utility) @@ -113,7 +113,7 @@ compact 1 0 B 0 B 0 (size == estimated-debt, scor zmemtbl 1 256 K ztbl 2 1.5 K bcache 8 1.4 K 33.3% (score == hit-rate) - tcache 2 1.2 K 50.0% (score == hit-rate) + tcache 2 1.3 K 50.0% (score == hit-rate) titers 2 filter - - 0.0% (score == utility) @@ -142,7 +142,7 @@ compact 1 0 B 0 B 0 (size == estimated-debt, scor zmemtbl 1 256 K ztbl 1 771 B bcache 4 698 B 33.3% (score == hit-rate) - tcache 1 616 B 50.0% (score == hit-rate) + tcache 1 672 B 50.0% (score == hit-rate) titers 1 filter - - 0.0% (score == utility)