diff --git a/internal/rangekey/rangekey.go b/internal/rangekey/rangekey.go index c692ff6495..5007b9406b 100644 --- a/internal/rangekey/rangekey.go +++ b/internal/rangekey/rangekey.go @@ -323,6 +323,30 @@ func Parse(s string) (key base.InternalKey, value []byte) { } } +// RecombinedValueLen returns the length of the byte slice that results from +// re-encoding the end key and the user-value as a physical range key value. +func RecombinedValueLen(kind base.InternalKeyKind, endKey, userValue []byte) int { + n := len(endKey) + if kind == base.InternalKeyKindRangeKeyDelete { + // RANGEKEYDELs are not varint encoded. + return n + } + return lenVarint(len(endKey)) + len(endKey) + len(userValue) +} + +// RecombineValue re-encodes the end key and user-value as a physical range key +// value into the destination byte slice. +func RecombineValue(kind base.InternalKeyKind, dst, endKey, userValue []byte) int { + if kind == base.InternalKeyKindRangeKeyDelete { + // RANGEKEYDELs are not varint encoded. + return copy(dst, endKey) + } + n := binary.PutUvarint(dst, uint64(len(endKey))) + n += copy(dst[n:], endKey) + n += copy(dst[n:], userValue) + return n +} + func lenVarint(v int) (n int) { x := uint32(v) n++ diff --git a/internal/rangekey/rangekey_test.go b/internal/rangekey/rangekey_test.go index 5c1b6b9c3a..68d073a928 100644 --- a/internal/rangekey/rangekey_test.go +++ b/internal/rangekey/rangekey_test.go @@ -2,6 +2,7 @@ package rangekey import ( "fmt" + "strconv" "testing" "github.com/cockroachdb/pebble/internal/base" @@ -146,3 +147,27 @@ func TestParseFormatRoundtrip(t *testing.T) { } } } + +func TestRecombinedValueLen_RoundTrip(t *testing.T) { + testCases := []string{ + "a.RANGEKEYSET.1: [(@t22=foo),(@t1=bar)]", + "a.RANGEKEYSET.1: [(@t1=bar)]", + "a.RANGEKEYUNSET.1: [@t9,@t8,@t7,@t6,@t5]", + "a.RANGEKEYDEL.5: foo", + } + for i, in := range testCases { + t.Run(strconv.Itoa(i), func(t *testing.T) { + k, v := Parse(in) + + // Split the value into an end key and a user-value. + endKey, restValue, ok := DecodeEndKey(k.Kind(), v) + require.True(t, ok) + + // Re-encode the end key and user-value. + dst := make([]byte, RecombinedValueLen(k.Kind(), endKey, restValue)) + RecombineValue(k.Kind(), dst, endKey, restValue) + + require.Equal(t, v, dst) + }) + } +} diff --git a/sstable/block.go b/sstable/block.go index 8ef57da2d3..3e2437892b 100644 --- a/sstable/block.go +++ b/sstable/block.go @@ -183,14 +183,15 @@ type blockEntry struct { // interval of 1 (no prefix compression), blockIter guarantees that // InternalKey.UserKey will point to the key as stored in the block itself // which will remain valid until the blockIter is closed. The key stability -// guarantee is used by the range tombstone code which knows that range -// tombstones are always encoded with a restart interval of 1. This per-block -// key stability guarantee is sufficient for range tombstones as they are -// always encoded in a single block. +// guarantee is used by the range tombstone and range key code, which knows that +// the respective blocks are always encoded with a restart interval of 1. This +// per-block key stability guarantee is sufficient for range tombstones and +// range deletes as they are always encoded in a single block. // -// A blockIter also provides a value stability guarantee for range deletions -// since there is only a single range deletion block per sstable and the -// blockIter will not release the bytes for the block until it is closed. +// A blockIter also provides a value stability guarantee for range deletions and +// range keys since there is only a single range deletion and range key block +// per sstable and the blockIter will not release the bytes for the block until +// it is closed. type blockIter struct { cmp Compare // offset is the byte index that marks where the current key/value is diff --git a/sstable/data_test.go b/sstable/data_test.go index c13aa3f4ce..2e748cc3e3 100644 --- a/sstable/data_test.go +++ b/sstable/data_test.go @@ -15,6 +15,7 @@ import ( "github.com/cockroachdb/pebble/internal/base" "github.com/cockroachdb/pebble/internal/datadriven" "github.com/cockroachdb/pebble/internal/keyspan" + "github.com/cockroachdb/pebble/internal/rangekey" "github.com/cockroachdb/pebble/vfs" ) @@ -68,12 +69,20 @@ func runBuildCmd( } w := NewWriter(f0, *writerOpts) - var tombstones []keyspan.Span - f := keyspan.Fragmenter{ + var rangeDels []keyspan.Span + rangeDelFrag := keyspan.Fragmenter{ Cmp: DefaultComparer.Compare, Format: DefaultComparer.FormatKey, Emit: func(fragmented []keyspan.Span) { - tombstones = append(tombstones, fragmented...) + rangeDels = append(rangeDels, fragmented...) + }, + } + var rangeKeys []keyspan.Span + rangeKeyFrag := keyspan.Fragmenter{ + Cmp: DefaultComparer.Compare, + Format: DefaultComparer.FormatKey, + Emit: func(fragmented []keyspan.Span) { + rangeKeys = append(rangeKeys, fragmented...) }, } for _, data := range strings.Split(td.Input, "\n") { @@ -89,7 +98,28 @@ func runBuildCmd( err = errors.Errorf("%v", r) } }() - f.Add(keyspan.Span{Start: key, End: value}) + rangeDelFrag.Add(keyspan.Span{Start: key, End: value}) + }() + if err != nil { + return nil, nil, err + } + case base.InternalKeyKindRangeKeyDelete, + base.InternalKeyKindRangeKeyUnset, + base.InternalKeyKindRangeKeySet: + var err error + func() { + defer func() { + if r := recover(); r != nil { + err = errors.Errorf("%v", r) + } + }() + key, value := rangekey.Parse(data) + endKey, value, ok := rangekey.DecodeEndKey(key.Kind(), value) + if !ok { + err = errors.New("could not decode end key") + return + } + rangeKeyFrag.Add(keyspan.Span{Start: key, End: endKey, Value: value}) }() if err != nil { return nil, nil, err @@ -98,15 +128,24 @@ func runBuildCmd( if err := w.Add(key, value); err != nil { return nil, nil, err } - } } - f.Finish() - for _, v := range tombstones { + rangeDelFrag.Finish() + for _, v := range rangeDels { if err := w.Add(v.Start, v.End); err != nil { return nil, nil, err } } + rangeKeyFrag.Finish() + for _, v := range rangeKeys { + // Reconstitute the value from the end key and the user value. + n := rangekey.RecombinedValueLen(v.Start.Kind(), v.End, v.Value) + b := make([]byte, n) + _ = rangekey.RecombineValue(v.Start.Kind(), b, v.End, v.Value) + if err := w.AddInternalRangeKey(v.Start, b); err != nil { + return nil, nil, err + } + } if err := w.Close(); err != nil { return nil, nil, err } @@ -151,9 +190,22 @@ func runBuildRawCmd(td *datadriven.TestData) (*WriterMetadata, *Reader, error) { for _, data := range strings.Split(td.Input, "\n") { j := strings.Index(data, ":") key := base.ParseInternalKey(data[:j]) - value := []byte(data[j+1:]) - if err := w.Add(key, value); err != nil { - return nil, nil, err + + switch key.Kind() { + case base.InternalKeyKindRangeKeyDelete, + base.InternalKeyKindRangeKeyUnset, + base.InternalKeyKindRangeKeySet: + // Values for range keys must be converted into their "packed" form before + // being added to the Writer. + _, value := rangekey.Parse(data) + if err := w.AddInternalRangeKey(key, value); err != nil { + return nil, nil, err + } + default: + value := []byte(data[j+1:]) + if err := w.Add(key, value); err != nil { + return nil, nil, err + } } } if err := w.Close(); err != nil { diff --git a/sstable/properties.go b/sstable/properties.go index c8ff10be8e..7ebb9835b9 100644 --- a/sstable/properties.go +++ b/sstable/properties.go @@ -114,6 +114,12 @@ type Properties struct { NumMergeOperands uint64 `prop:"rocksdb.merge.operands"` // The number of range deletions in this table. NumRangeDeletions uint64 `prop:"rocksdb.num.range-deletions"` + // The number of RANGEKEYDELs in this table. + NumRangeKeyDels uint64 `prop:"pebble.num.range-key-dels"` + // The number of RANGEKEYSETs in this table. + NumRangeKeySets uint64 `prop:"pebble.num.range-key-sets"` + // The number of RANGEKEYUNSETs in this table. + NumRangeKeyUnsets uint64 `prop:"pebble.num.range-key-unsets"` // Timestamp of the earliest key. 0 if unknown. OldestKeyTime uint64 `prop:"rocksdb.oldest.key.time"` // The name of the prefix extractor used in this table. Empty if no prefix @@ -126,6 +132,10 @@ type Properties struct { PropertyCollectorNames string `prop:"rocksdb.property.collectors"` // Total raw key size. RawKeySize uint64 `prop:"rocksdb.raw.key.size"` + // Total raw rangekey key size. + RawRangeKeyKeySize uint64 `prop:"pebble.raw.rangekey.key.size"` + // Total raw rangekey value size. + RawRangeKeyValueSize uint64 `prop:"pebble.raw.rangekey.value.size"` // Total raw value size. RawValueSize uint64 `prop:"rocksdb.raw.value.size"` // Size of the top-level index if kTwoLevelIndexSearch is used. @@ -147,6 +157,11 @@ func (p *Properties) NumPointDeletions() uint64 { return p.NumDeletions - p.NumRangeDeletions } +// NumRangeKeys returns a count of the number of range keys in this table. +func (p *Properties) NumRangeKeys() uint64 { + return p.NumRangeKeyDels + p.NumRangeKeySets + p.NumRangeKeyUnsets +} + func (p *Properties) String() string { var buf bytes.Buffer v := reflect.ValueOf(*p) @@ -318,6 +333,13 @@ func (p *Properties) save(w *rawBlockWriter) { p.saveUvarint(m, unsafe.Offsetof(p.NumDeletions), p.NumDeletions) p.saveUvarint(m, unsafe.Offsetof(p.NumMergeOperands), p.NumMergeOperands) p.saveUvarint(m, unsafe.Offsetof(p.NumRangeDeletions), p.NumRangeDeletions) + if p.NumRangeKeys() > 0 { + p.saveUvarint(m, unsafe.Offsetof(p.NumRangeKeyDels), p.NumRangeKeyDels) + p.saveUvarint(m, unsafe.Offsetof(p.NumRangeKeySets), p.NumRangeKeySets) + p.saveUvarint(m, unsafe.Offsetof(p.NumRangeKeyUnsets), p.NumRangeKeyUnsets) + p.saveUvarint(m, unsafe.Offsetof(p.RawRangeKeyKeySize), p.RawRangeKeyKeySize) + p.saveUvarint(m, unsafe.Offsetof(p.RawRangeKeyValueSize), p.RawRangeKeyValueSize) + } p.saveUvarint(m, unsafe.Offsetof(p.OldestKeyTime), p.OldestKeyTime) if p.PrefixExtractorName != "" { p.saveString(m, unsafe.Offsetof(p.PrefixExtractorName), p.PrefixExtractorName) diff --git a/sstable/properties_test.go b/sstable/properties_test.go index 9173a99d86..a044e0e3c3 100644 --- a/sstable/properties_test.go +++ b/sstable/properties_test.go @@ -85,13 +85,16 @@ func TestPropertiesSave(t *testing.T) { NumEntries: 16, NumMergeOperands: 17, NumRangeDeletions: 18, - OldestKeyTime: 19, + NumRangeKeyDels: 19, + NumRangeKeySets: 20, + NumRangeKeyUnsets: 21, + OldestKeyTime: 22, PrefixExtractorName: "prefix extractor name", PrefixFiltering: true, PropertyCollectorNames: "prefix collector names", - RawKeySize: 20, - RawValueSize: 21, - TopLevelIndexSize: 22, + RawKeySize: 23, + RawValueSize: 24, + TopLevelIndexSize: 25, WholeKeyFiltering: true, UserProperties: map[string]string{ "user-prop-a": "1", diff --git a/sstable/reader.go b/sstable/reader.go index 50362ca4d8..4fefb768b2 100644 --- a/sstable/reader.go +++ b/sstable/reader.go @@ -2018,6 +2018,7 @@ type Reader struct { indexBH BlockHandle filterBH BlockHandle rangeDelBH BlockHandle + rangeKeyBH BlockHandle rangeDelTransform blockTransform propertiesBH BlockHandle metaIndexBH BlockHandle @@ -2133,6 +2134,24 @@ func (r *Reader) NewRawRangeDelIter() (base.InternalIterator, error) { return i, nil } +// NewRawRangeKeyIter returns an internal iterator for the contents of the +// range-key block for the table. Returns nil if the table does not contain any +// range keys. +func (r *Reader) NewRawRangeKeyIter() (base.InternalIterator, error) { + if r.rangeKeyBH.Length == 0 { + return nil, nil + } + h, err := r.readRangeKey() + if err != nil { + return nil, err + } + i := &blockIter{} + if err := i.initHandle(r.Compare, h, r.Properties.GlobalSeqNum); err != nil { + return nil, err + } + return i, nil +} + func (r *Reader) readIndex() (cache.Handle, error) { return r.readBlock(r.indexBH, nil /* transform */, nil /* readaheadState */) } @@ -2145,6 +2164,10 @@ func (r *Reader) readRangeDel() (cache.Handle, error) { return r.readBlock(r.rangeDelBH, r.rangeDelTransform, nil /* readaheadState */) } +func (r *Reader) readRangeKey() (cache.Handle, error) { + return r.readBlock(r.rangeKeyBH, nil /* transform */, nil /* readaheadState */) +} + // readBlock reads and decompresses a block from disk into memory. func (r *Reader) readBlock( bh BlockHandle, transform blockTransform, raState *readaheadState, @@ -2347,6 +2370,10 @@ func (r *Reader) readMetaindex(metaindexBH BlockHandle) error { } } + if bh, ok := meta[metaRangeKeyName]; ok { + r.rangeKeyBH = bh + } + for name, fp := range r.opts.Filters { types := []struct { ftype FilterType diff --git a/sstable/table.go b/sstable/table.go index 0227283060..da36c50d33 100644 --- a/sstable/table.go +++ b/sstable/table.go @@ -85,8 +85,9 @@ The table file format looks like: [data block N-1] [meta filter block] (optional) [index block] (for single level index) -[meta rangedel block] (optional) +[meta range key block] (optional) [meta properties block] +[meta rangedel block] (optional) [metaindex block] [footer] @@ -179,6 +180,7 @@ const ( levelDBFormatVersion = 0 rocksDBFormatVersion2 = 2 + metaRangeKeyName = "pebble.range_key" metaPropertiesName = "rocksdb.properties" metaRangeDelName = "rocksdb.range_del" metaRangeDelV2Name = "rocksdb.range_del2" diff --git a/sstable/testdata/rewriter b/sstable/testdata/rewriter index a25dc2f095..42dd23826c 100644 --- a/sstable/testdata/rewriter +++ b/sstable/testdata/rewriter @@ -5,6 +5,7 @@ c_xyz.SET.1:c ---- point: [a_xyz#1,1,c_xyz#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] # rewrite from=xyz to=123 block-size=1 index-block-size=1 filter @@ -22,6 +23,7 @@ ca_xyz.SET.1:c ---- point: [aa_xyz#1,1,ca_xyz#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] rewrite from=yz to=23 block-size=1 index-block-size=1 filter comparer-split-4b-suffix @@ -39,6 +41,7 @@ c_xyz.SET.1:c ---- point: [a_xyz#1,1,c_xyz#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] layout @@ -75,6 +78,7 @@ rewrite from=_xyz to=_123 block-size=1 index-block-size=1 filter comparer-split- ---- point: [a_123#1,1,c_123#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] layout @@ -111,6 +115,7 @@ rewrite from=_123 to=_456 block-size=1 index-block-size=1 filter comparer-split- ---- point: [a_456#1,1,c_456#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] layout @@ -147,6 +152,7 @@ rewrite from=_456 to=_xyz block-size=1 index-block-size=1 filter comparer-split- ---- point: [a_xyz#1,1,c_xyz#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] layout @@ -184,6 +190,7 @@ rewrite from=_xyz to=_123 block-size=1 index-block-size=1 filter comparer-split- ---- point: [a_123#1,1,c_123#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] layout diff --git a/sstable/testdata/writer b/sstable/testdata/writer index 904e35109e..3742e8a5cb 100644 --- a/sstable/testdata/writer +++ b/sstable/testdata/writer @@ -3,6 +3,7 @@ a.SET.1:a ---- point: [a#1,1,a#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] scan @@ -12,6 +13,9 @@ a#1,1:a scan-range-del ---- +scan-range-key +---- + build a.SET.1:a b.DEL.2: @@ -21,10 +25,14 @@ f.SET.5:f g.DEL.6: h.MERGE.7:h i.RANGEDEL.8:j +j.RANGEKEYDEL.9:k +k.RANGEKEYUNSET.10:l [@t5] +l.RANGEKEYSET.11:m [(@t10=foo)] ---- point: [a#1,1,h#7,2] rangedel: [d#4,15,j#72057594037927935,15] -seqnums: [1,8] +rangekey: [j#9,19,m#72057594037927935,21] +seqnums: [1,11] build a.SET.1:a @@ -38,6 +46,7 @@ i.RANGEDEL.8:j ---- point: [a#1,1,h#7,2] rangedel: [d#4,15,j#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [1,8] scan @@ -65,6 +74,7 @@ j.RANGEDEL.1:z ---- point: [#0,0,#0,0] rangedel: [a#3,15,z#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [1,3] scan @@ -82,6 +92,9 @@ m#2,15:s m#1,15:s s#1,15:z +scan-range-key +---- + # The range tombstone upper bound is exclusive, so a point operation # on that same key will be the actual boundary. @@ -91,6 +104,7 @@ b.SET.4:c ---- point: [b#4,1,b#4,1] rangedel: [a#3,15,b#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [3,4] build @@ -99,6 +113,7 @@ b.SET.2:c ---- point: [b#2,1,b#2,1] rangedel: [a#3,15,b#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [2,3] build @@ -107,6 +122,7 @@ b.SET.2:c ---- point: [b#2,1,b#2,1] rangedel: [a#3,15,c#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [2,3] # Keys must be added in order. @@ -134,6 +150,7 @@ build-raw ---- point: [#0,0,#0,0] rangedel: [#1,15,b#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [1,1] build-raw @@ -160,8 +177,61 @@ c.RANGEDEL.2:d ---- point: [#0,0,#0,0] rangedel: [a#1,15,d#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [1,2] +build-raw +a.RANGEKEYSET.1:b [(@t10=foo)] +a.RANGEKEYSET.2:b [(@t10=foo)] +---- +pebble: range keys starts must be added in strictly increasing order: a#1,RANGEKEYSET, a#2,RANGEKEYSET + +build-raw +b.RANGEKEYSET.2:c [(@t10=foo)] +a.RANGEKEYSET.1:b [(@t10=foo)] +---- +pebble: range keys starts must be added in strictly increasing order: b#2,RANGEKEYSET, a#1,RANGEKEYSET + +build-raw +a.RANGEKEYSET.1:c [(@t10=foo)] +b.RANGEKEYSET.2:d [(@t10=foo)] +---- +pebble: overlapping range keys must be fragmented: a-c#1, b-d#2 + +build-raw +a.RANGEKEYSET.2:c [(@t10=foo)] +a.RANGEKEYSET.1:d [(@t10=foo)] +---- +pebble: overlapping range keys must be fragmented: a-c#2, a-d#1 + +build-raw +a.RANGEKEYSET.1:c [(@t10=foo)] +c.RANGEKEYSET.2:d [(@t10=foo)] +---- +point: [#0,0,#0,0] +rangedel: [#0,0,#0,0] +rangekey: [a#1,21,d#72057594037927935,21] +seqnums: [1,2] + +# Range keys may have perfectly aligned spans (including sequence numbers), +# though the key kinds must be ordered (descending). + +build-raw +a.RANGEKEYUNSET.1:b [@t10] +a.RANGEKEYSET.1:b [(@t10=foo)] +---- +pebble: range keys starts must be added in strictly increasing order: a#1,RANGEKEYUNSET, a#1,RANGEKEYSET + +build-raw +a.RANGEKEYSET.1:b [(@t10=foo)] +a.RANGEKEYUNSET.1:b [@t10] +a.RANGEKEYDEL.1:b +---- +point: [#0,0,#0,0] +rangedel: [#0,0,#0,0] +rangekey: [a#1,21,b#72057594037927935,19] +seqnums: [1,1] + # The range-del-v1 format supports unfragmented and unsorted range # tombstones. @@ -171,6 +241,7 @@ a.RANGEDEL.2:c ---- point: [#0,0,#0,0] rangedel: [a#2,15,c#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [1,2] scan-range-del @@ -184,6 +255,7 @@ b.RANGEDEL.2:d ---- point: [#0,0,#0,0] rangedel: [a#1,15,d#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [1,2] scan-range-del @@ -199,6 +271,7 @@ a.RANGEDEL.1:d ---- point: [#0,0,#0,0] rangedel: [a#2,15,d#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [1,2] scan-range-del @@ -218,6 +291,7 @@ a.RANGEDEL.3:m ---- point: [#0,0,#0,0] rangedel: [a#3,15,z#72057594037927935,15] +rangekey: [#0,0,#0,0] seqnums: [1,3] scan-range-del @@ -241,6 +315,7 @@ c.SET.1:c ---- point: [a#1,1,c#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] layout @@ -273,6 +348,7 @@ c.SET.1:c ---- point: [a#1,1,c#1,1] rangedel: [#0,0,#0,0] +rangekey: [#0,0,#0,0] seqnums: [1,1] layout diff --git a/sstable/writer.go b/sstable/writer.go index 8648a96546..304eba9826 100644 --- a/sstable/writer.go +++ b/sstable/writer.go @@ -19,6 +19,7 @@ import ( "github.com/cockroachdb/pebble/internal/crc" "github.com/cockroachdb/pebble/internal/keyspan" "github.com/cockroachdb/pebble/internal/private" + "github.com/cockroachdb/pebble/internal/rangekey" ) var errWriterClosed = errors.New("pebble: writer is closed") @@ -28,8 +29,10 @@ type WriterMetadata struct { Size uint64 SmallestPoint InternalKey SmallestRangeDel InternalKey + SmallestRangeKey InternalKey LargestPoint InternalKey LargestRangeDel InternalKey + LargestRangeKey InternalKey SmallestSeqNum uint64 LargestSeqNum uint64 Properties Properties @@ -134,6 +137,7 @@ type Writer struct { block blockWriter indexBlock blockWriter rangeDelBlock blockWriter + rangeKeyBlock blockWriter props Properties propCollectors []TablePropertyCollector blockPropCollectors []BlockPropertyCollector @@ -223,8 +227,15 @@ func (w *Writer) Add(key InternalKey, value []byte) error { return w.err } - if key.Kind() == InternalKeyKindRangeDelete { + switch key.Kind() { + case InternalKeyKindRangeDelete: return w.addTombstone(key, value) + case base.InternalKeyKindRangeKeyDelete, + base.InternalKeyKindRangeKeySet, + base.InternalKeyKindRangeKeyUnset: + w.err = errors.Errorf( + "pebble: range keys must be added via one of the AddRangeKey functions") + return w.err } return w.addPoint(key, value) } @@ -370,6 +381,106 @@ func (w *Writer) addTombstone(key InternalKey, value []byte) error { return nil } +// AddInternalRangeKey adds a range key set, unset, or delete key/value pair to +// the table being written. +// +// Range keys must be supplied in strictly ascending order of start key (i.e. +// user key ascending, sequence number descending, and key type descending). +// Ranges added must also be supplied in fragmented span order - i.e. other than +// spans that are perfectly aligned (same start and end keys), spans may not +// overlap. Range keys may be added out of order relative to point keys and +// range deletions. +func (w *Writer) AddInternalRangeKey(key InternalKey, value []byte) error { + if w.err != nil { + return w.err + } + return w.addRangeKey(key, value) +} + +func (w *Writer) addRangeKey(key InternalKey, value []byte) error { + if !w.disableKeyOrderChecks && w.rangeKeyBlock.nEntries > 0 { + prevStartKey := base.DecodeInternalKey(w.rangeKeyBlock.curKey) + prevEndKey, _, ok := rangekey.DecodeEndKey(prevStartKey.Kind(), w.rangeKeyBlock.curValue) + if !ok { + // We panic here as we should have previously decoded and validated this + // key and value when it was first added to the range key block. + panic(errors.Errorf("pebble: invalid end key for span: %s", + (keyspan.Span{Start: prevStartKey, End: prevEndKey}).Pretty(w.formatKey))) + } + + curStartKey := key + curEndKey, _, ok := rangekey.DecodeEndKey(curStartKey.Kind(), value) + if !ok { + w.err = errors.Errorf("pebble: invalid end key for span: %s", + (keyspan.Span{Start: curStartKey, End: curEndKey}).Pretty(w.formatKey)) + return w.err + } + + // Start keys must be strictly increasing. + if base.InternalCompare(w.compare, prevStartKey, curStartKey) >= 0 { + w.err = errors.Errorf( + "pebble: range keys starts must be added in strictly increasing order: %s, %s", + prevStartKey.Pretty(w.formatKey), key.Pretty(w.formatKey)) + return w.err + } + + // Start keys are strictly increasing. If the start user keys are equal, the + // end keys must be equal (i.e. aligned spans). + if w.compare(prevStartKey.UserKey, curStartKey.UserKey) == 0 { + if w.compare(prevEndKey, curEndKey) != 0 { + w.err = errors.Errorf("pebble: overlapping range keys must be fragmented: %s, %s", + (keyspan.Span{Start: prevStartKey, End: prevEndKey}).Pretty(w.formatKey), + (keyspan.Span{Start: curStartKey, End: curEndKey}).Pretty(w.formatKey)) + return w.err + } + } else if w.compare(prevEndKey, curStartKey.UserKey) > 0 { + // If the start user keys are NOT equal, the spans must be disjoint (i.e. + // no overlap). + // NOTE: the inequality excludes zero, as we allow the end key of the + // lower span be the same as the start key of the upper span, because + // the range end key is considered an exclusive bound. + w.err = errors.Errorf("pebble: overlapping range keys must be fragmented: %s, %s", + (keyspan.Span{Start: prevStartKey, End: prevEndKey}).Pretty(w.formatKey), + (keyspan.Span{Start: curStartKey, End: curEndKey}).Pretty(w.formatKey)) + return w.err + } + } + + // TODO(travers): Add an invariant-gated check to ensure that suffix-values + // are sorted within coalesced spans. + + // Range-keys and point-keys are intended to live in "parallel" keyspaces. + // However, we track a single seqnum in the table metadata that spans both of + // these keyspaces. + // TODO(travers): Consider tracking range key seqnums separately. + w.meta.updateSeqNum(key.SeqNum()) + + // Range tombstones are fragmented, so the start key of the first range key + // added will be the smallest. The largest range key is determined in + // Writer.Close() as the end key of the last range key added to the block. + if w.props.NumRangeKeys() == 0 { + w.meta.SmallestRangeKey = key.Clone() + } + + // Update block properties. + w.props.RawRangeKeyKeySize += uint64(key.Size()) + w.props.RawRangeKeyValueSize += uint64(len(value)) + switch key.Kind() { + case base.InternalKeyKindRangeKeyDelete: + w.props.NumRangeKeyDels++ + case base.InternalKeyKindRangeKeySet: + w.props.NumRangeKeySets++ + case base.InternalKeyKindRangeKeyUnset: + w.props.NumRangeKeyUnsets++ + default: + panic(errors.Errorf("pebble: invalid range key type: %s", key.Kind())) + } + + // Add the key to the block. + w.rangeKeyBlock.add(key, value) + return nil +} + func (w *Writer) maybeAddToFilter(key []byte) { if w.filter != nil { if w.split != nil { @@ -710,6 +821,36 @@ func (w *Writer) Close() (err error) { } } + // Write the range-key block. + var rangeKeyBH BlockHandle + if w.props.NumRangeKeys() > 0 { + key := base.DecodeInternalKey(w.rangeKeyBlock.curKey) + kind := key.Kind() + endKey, _, ok := rangekey.DecodeEndKey(kind, w.rangeKeyBlock.curValue) + if !ok { + w.err = errors.Newf("invalid end key: %s", w.rangeKeyBlock.curValue) + return w.err + } + w.meta.LargestRangeKey = base.MakeInternalKey(endKey, base.InternalKeySeqNumMax, key.Kind()).Clone() + // TODO(travers): The lack of compression on the range key block matches the + // lack of compression on the range-del block. Revisit whether we want to + // enable compression on this block. + rangeKeyBH, err = w.writeBlock(w.rangeKeyBlock.finish(), NoCompression) + if err != nil { + w.err = err + return w.err + } + } + + // Add the range key block handle to the metaindex block. Note that we add the + // block handle to the metaindex block before the other meta blocks as the + // metaindex block entries must be sorted, and the range key block name sorts + // before the other block names. + if w.props.NumRangeKeys() > 0 { + n := encodeBlockHandle(w.tmp[:], rangeKeyBH) + metaindex.add(InternalKey{UserKey: []byte(metaRangeKeyName)}, w.tmp[:n]) + } + { userProps := make(map[string]string) for i := range w.propCollectors { @@ -899,6 +1040,9 @@ func NewWriter(f writeCloseSyncer, o WriterOptions, extraOpts ...WriterOption) * rangeDelBlock: blockWriter{ restartInterval: 1, }, + rangeKeyBlock: blockWriter{ + restartInterval: 1, + }, topLevelIndexBlock: blockWriter{ restartInterval: 1, }, diff --git a/sstable/writer_test.go b/sstable/writer_test.go index 3b3a0ab84c..cdcf408018 100644 --- a/sstable/writer_test.go +++ b/sstable/writer_test.go @@ -49,9 +49,10 @@ func runDataDriven(t *testing.T, file string) { if err != nil { return err.Error() } - return fmt.Sprintf("point: [%s,%s]\nrangedel: [%s,%s]\nseqnums: [%d,%d]\n", + return fmt.Sprintf("point: [%s,%s]\nrangedel: [%s,%s]\nrangekey: [%s,%s]\nseqnums: [%d,%d]\n", meta.SmallestPoint, meta.LargestPoint, meta.SmallestRangeDel, meta.LargestRangeDel, + meta.SmallestRangeKey, meta.LargestRangeKey, meta.SmallestSeqNum, meta.LargestSeqNum) case "build-raw": @@ -65,9 +66,10 @@ func runDataDriven(t *testing.T, file string) { if err != nil { return err.Error() } - return fmt.Sprintf("point: [%s,%s]\nrangedel: [%s,%s]\nseqnums: [%d,%d]\n", + return fmt.Sprintf("point: [%s,%s]\nrangedel: [%s,%s]\nrangekey: [%s,%s]\nseqnums: [%d,%d]\n", meta.SmallestPoint, meta.LargestPoint, meta.SmallestRangeDel, meta.LargestRangeDel, + meta.SmallestRangeKey, meta.LargestRangeKey, meta.SmallestSeqNum, meta.LargestSeqNum) case "scan": @@ -112,6 +114,22 @@ func runDataDriven(t *testing.T, file string) { } return buf.String() + case "scan-range-key": + iter, err := r.NewRawRangeKeyIter() + if err != nil { + return err.Error() + } + if iter == nil { + return "" + } + defer iter.Close() + + var buf bytes.Buffer + for key, val := iter.First(); key != nil; key, val = iter.Next() { + fmt.Fprintf(&buf, "%s:%s\n", key, val) + } + return buf.String() + case "layout": l, err := r.Layout() if err != nil { @@ -139,9 +157,10 @@ func runDataDriven(t *testing.T, file string) { if err != nil { return err.Error() } - return fmt.Sprintf("point: [%s,%s]\nrangedel: [%s,%s]\nseqnums: [%d,%d]\n", + return fmt.Sprintf("point: [%s,%s]\nrangedel: [%s,%s]\nrangekey: [%s,%s]\nseqnums: [%d,%d]\n", meta.SmallestPoint, meta.LargestPoint, meta.SmallestRangeDel, meta.LargestRangeDel, + meta.SmallestRangeKey, meta.LargestRangeKey, meta.SmallestSeqNum, meta.LargestSeqNum) default: diff --git a/testdata/event_listener b/testdata/event_listener index 862b8223ac..16a5a10ea1 100644 --- a/testdata/event_listener +++ b/testdata/event_listener @@ -183,7 +183,7 @@ compact 1 2.3 K 0 B 0 (size == estimated-debt, scor zmemtbl 0 0 B ztbl 0 0 B bcache 8 1.4 K 5.9% (score == hit-rate) - tcache 1 616 B 0.0% (score == hit-rate) + tcache 1 672 B 0.0% (score == hit-rate) titers 0 filter - - 0.0% (score == utility) diff --git a/testdata/ingest b/testdata/ingest index 6277b35876..5cf09c7a3a 100644 --- a/testdata/ingest +++ b/testdata/ingest @@ -48,7 +48,7 @@ compact 0 0 B 0 B 0 (size == estimated-debt, scor zmemtbl 0 0 B ztbl 0 0 B bcache 8 1.5 K 46.7% (score == hit-rate) - tcache 1 616 B 50.0% (score == hit-rate) + tcache 1 672 B 50.0% (score == hit-rate) titers 0 filter - - 0.0% (score == utility) diff --git a/testdata/metrics b/testdata/metrics index b5a06a445c..0f74a6de77 100644 --- a/testdata/metrics +++ b/testdata/metrics @@ -34,7 +34,7 @@ compact 0 0 B 0 B 0 (size == estimated-debt, scor zmemtbl 1 256 K ztbl 0 0 B bcache 4 698 B 0.0% (score == hit-rate) - tcache 1 616 B 0.0% (score == hit-rate) + tcache 1 672 B 0.0% (score == hit-rate) titers 1 filter - - 0.0% (score == utility) @@ -81,7 +81,7 @@ compact 1 0 B 0 B 0 (size == estimated-debt, scor zmemtbl 2 512 K ztbl 2 1.5 K bcache 8 1.4 K 33.3% (score == hit-rate) - tcache 2 1.2 K 50.0% (score == hit-rate) + tcache 2 1.3 K 50.0% (score == hit-rate) titers 2 filter - - 0.0% (score == utility) @@ -113,7 +113,7 @@ compact 1 0 B 0 B 0 (size == estimated-debt, scor zmemtbl 1 256 K ztbl 2 1.5 K bcache 8 1.4 K 33.3% (score == hit-rate) - tcache 2 1.2 K 50.0% (score == hit-rate) + tcache 2 1.3 K 50.0% (score == hit-rate) titers 2 filter - - 0.0% (score == utility) @@ -142,7 +142,7 @@ compact 1 0 B 0 B 0 (size == estimated-debt, scor zmemtbl 1 256 K ztbl 1 771 B bcache 4 698 B 33.3% (score == hit-rate) - tcache 1 616 B 50.0% (score == hit-rate) + tcache 1 672 B 50.0% (score == hit-rate) titers 1 filter - - 0.0% (score == utility)