Skip to content

Commit

Permalink
sstable: order range keys consistently in colblk and rowblk encodings
Browse files Browse the repository at this point in the history
Previously the ordering of range keys within a range key block was a bit
muddled and undefined. Before encoding a Span of range keys to the underlying
RawWriter, the sstable.Writer type previously sorted the span's keys by suffix.
However, the row-based sstable writer always serializes RANGEKEYSETs of a Span
first, followed by RANGEKEYUNSETs and then RANGEKEYDELs.

Confusingly, the rowblk fragment iterator also sorted keys by trailer when
iterating backwards, but not forwards. The columnar RawWriter preserved the
order of keys in the span passed to EncodeSpan, and this order was preserved
during iteration.

This commit adapts the sstable.Writer type to sort a range key Span's keys by
trailer and then suffix before encoding. This provides determinism and matches
the ordering of keys produced by compactions which sort the keys by trailer.
Additionally, the rowblk fragment iterator is updated to always sort the
returned keys by trailer.
  • Loading branch information
jbowens committed Nov 12, 2024
1 parent 048f275 commit 8e01ef3
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 45 deletions.
9 changes: 7 additions & 2 deletions internal/keyspan/span.go
Original file line number Diff line number Diff line change
Expand Up @@ -470,9 +470,14 @@ func SortKeysByTrailer(keys []Key) {
})
}

// SortKeysBySuffix sorts a Keys slice by suffix.
func SortKeysBySuffix(suffixCmp base.CompareRangeSuffixes, keys []Key) {
// SortKeysByTrailerAndSuffix sorts a Keys slice by trailer, and among keys with
// equal trailers, by suffix.
func SortKeysByTrailerAndSuffix(suffixCmp base.CompareRangeSuffixes, keys []Key) {
slices.SortFunc(keys, func(a, b Key) int {
// Trailer are ordered in decreasing number order.
if v := cmp.Compare(b.Trailer, a.Trailer); v != 0 {
return v
}
return suffixCmp(a.Suffix, b.Suffix)
})
}
Expand Down
6 changes: 5 additions & 1 deletion sstable/rowblk/rowblk_fragment_iter.go
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,10 @@ func (i *fragmentIter) gatherForward(kv *base.InternalKV) (*keyspan.Span, error)
if err := i.applySpanTransforms(); err != nil {
return nil, err
}

// Apply a consistent ordering.
keyspan.SortKeysByTrailer(i.span.Keys)

// i.blockIter is positioned over the first internal key for the next span.
return &i.span, nil
}
Expand Down Expand Up @@ -264,7 +268,7 @@ func (i *fragmentIter) gatherBackward(kv *base.InternalKV) (*keyspan.Span, error
// i.blockIter is positioned over the last internal key for the previous
// span.

// Backwards iteration encounters internal keys in the wrong order.
// Apply a consistent ordering.
keyspan.SortKeysByTrailer(i.span.Keys)

i.applySpanTransforms()
Expand Down
16 changes: 12 additions & 4 deletions sstable/writer.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,18 @@ func (w *Writer) encodeFragmentedRangeKeySpan(s keyspan.Span) {
// NB: The span should only contain range keys and be internally consistent
// (eg, no duplicate suffixes, no additional keys after a RANGEKEYDEL).
//
// Sort the keys by suffix. Iteration doesn't *currently* depend on it, but
// we may want to in the future.
keyspan.SortKeysBySuffix(w.comparer.CompareRangeSuffixes, s.Keys)

// Sort the keys by trailer (descending).
//
// Note that sstables written in TableFormatPebblev4 and earlier (rowblk
// encoding) will always re-order the Keys in order to encode RANGEKEYSETs
// first, then RANGEKEYUNSETs and then RANGEKEYDELs. See rangekey.Encoder.
// SSTables written in TableFormatPebblev5 or later (colblk encoding) will
// encode the keys in this order.
//
// Iteration doesn't depend on this ordering, in particular because it's
// inconsistent between the two encodings, but we may want eventually begin
// to depend on this ordering for colblk-encoded sstables.
keyspan.SortKeysByTrailerAndSuffix(w.comparer.CompareRangeSuffixes, s.Keys)
if w.Error() == nil {
w.rw.EncodeSpan(s)
}
Expand Down
85 changes: 47 additions & 38 deletions sstable/writer_rangekey_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"github.com/cockroachdb/errors"
"github.com/cockroachdb/pebble/internal/testkeys"
"github.com/cockroachdb/pebble/objstorage/objstorageprovider"
"github.com/cockroachdb/pebble/sstable/colblk"
"github.com/cockroachdb/pebble/vfs"
"github.com/stretchr/testify/require"
)
Expand All @@ -26,7 +27,9 @@ func TestWriter_RangeKeys(t *testing.T) {
}
}()

buildFn := func(td *datadriven.TestData) (*Reader, error) {
cmp := testkeys.Comparer
keySchema := colblk.DefaultKeySchema(cmp, 16)
buildFn := func(td *datadriven.TestData, format TableFormat) (*Reader, error) {
mem := vfs.NewMem()
f, err := mem.Create("test", vfs.WriteCategoryUnspecified)
if err != nil {
Expand All @@ -35,10 +38,10 @@ func TestWriter_RangeKeys(t *testing.T) {

// Use a "suffix-aware" Comparer, that will sort suffix-values in
// descending order of timestamp, rather than in lexical order.
cmp := testkeys.Comparer
w := NewWriter(objstorageprovider.NewFileWritable(f), WriterOptions{
Comparer: cmp,
TableFormat: TableFormatPebblev2,
TableFormat: format,
KeySchema: &keySchema,
})
defer func() {
if w != nil {
Expand Down Expand Up @@ -92,46 +95,52 @@ func TestWriter_RangeKeys(t *testing.T) {
return nil, err
}

r, err = newReader(f, ReaderOptions{Comparer: cmp})
r, err = newReader(f, ReaderOptions{
Comparer: cmp,
KeySchemas: MakeKeySchemas(&keySchema),
})
if err != nil {
return nil, err
}

return r, nil
}

datadriven.RunTest(t, "testdata/writer_range_keys", func(t *testing.T, td *datadriven.TestData) string {
switch td.Cmd {
case "build":
if r != nil {
_ = r.Close()
r = nil
}

var err error
r, err = buildFn(td)
if err != nil {
return err.Error()
}

iter, err := r.NewRawRangeKeyIter(context.Background(), NoFragmentTransforms)
if err != nil {
return err.Error()
}
defer iter.Close()

var buf bytes.Buffer
s, err := iter.First()
for ; s != nil; s, err = iter.Next() {
_, _ = fmt.Fprintf(&buf, "%s\n", s)
}
if err != nil {
return err.Error()
}
return buf.String()

default:
return fmt.Sprintf("unknown command: %s", td.Cmd)
}
})
for format := TableFormatPebblev2; format <= TableFormatMax; format++ {
t.Run(format.String(), func(t *testing.T) {
datadriven.RunTest(t, "testdata/writer_range_keys", func(t *testing.T, td *datadriven.TestData) string {
switch td.Cmd {
case "build":
if r != nil {
_ = r.Close()
r = nil
}
var err error
r, err = buildFn(td, format)
if err != nil {
return err.Error()
}

iter, err := r.NewRawRangeKeyIter(context.Background(), NoFragmentTransforms)
if err != nil {
return err.Error()
}
defer iter.Close()

var buf bytes.Buffer
s, err := iter.First()
for ; s != nil; s, err = iter.Next() {
_, _ = fmt.Fprintf(&buf, "%s\n", s)
}
if err != nil {
return err.Error()
}
return buf.String()

default:
return fmt.Sprintf("unknown command: %s", td.Cmd)
}
})
})
}
}

0 comments on commit 8e01ef3

Please sign in to comment.