*: track smallest / largest keys separately in manifest
Currently, only point keys are tracked in the manifest. With the
addition of range keys, the bounds of an SSTable should be computed by
considering the bounds of both the point keys and the range keys, and
taking the smallest or largest across both types of key, respectively.

Add four additional fields, `{Smallest,Largest}{Point,Range}Key`, to
`manifest.FileMetadata` to separately track the point and range key
bounds. The existing `Smallest` and `Largest` fields are used to track
the bounds across both point and range keys.

Update the existing call sites that set the smallest and largest keys
to set all three types of bounds: point keys, range keys, and combined.
nicktrav committed Feb 17, 2022
1 parent 51cf6fb commit e352d8e
Showing 10 changed files with 279 additions and 62 deletions.
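
Below is a minimal standalone sketch (simplified byte-slice keys and bytes.Compare in place of Pebble's InternalKey and InternalCompare) of how the combined bounds can be derived from the separately tracked point and range key bounds:

package main

import (
    "bytes"
    "fmt"
)

// fileBounds is a stand-in for the new FileMetadata fields; empty slices mean
// "no keys of that type".
type fileBounds struct {
    smallestPoint, largestPoint []byte
    smallestRange, largestRange []byte
}

// combined returns the overall smallest and largest keys across both key
// types, mirroring the switch added to ingestLoad1.
func (b fileBounds) combined() (smallest, largest []byte) {
    hasPoints := len(b.smallestPoint) > 0
    hasRanges := len(b.smallestRange) > 0
    switch {
    case !hasRanges: // point keys only
        return b.smallestPoint, b.largestPoint
    case !hasPoints: // range keys only
        return b.smallestRange, b.largestRange
    default: // both: take the smaller lower bound and the larger upper bound
        smallest, largest = b.smallestPoint, b.largestPoint
        if bytes.Compare(b.smallestRange, smallest) < 0 {
            smallest = b.smallestRange
        }
        if bytes.Compare(b.largestRange, largest) > 0 {
            largest = b.largestRange
        }
        return smallest, largest
    }
}

func main() {
    b := fileBounds{
        smallestPoint: []byte("b"), largestPoint: []byte("d"),
        smallestRange: []byte("a"), largestRange: []byte("c"),
    }
    s, l := b.combined()
    fmt.Printf("%s-%s\n", s, l) // a-d
}
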
4 changes: 4 additions & 0 deletions compaction.go
@@ -2259,6 +2259,10 @@ func (d *DB) runCompaction(
)
}

meta.SmallestPointKey = writerMeta.SmallestPointKey(d.cmp)
meta.LargestPointKey = writerMeta.LargestPointKey(d.cmp)
meta.SmallestRangeKey = writerMeta.SmallestRangeKey
meta.LargestRangeKey = writerMeta.LargestRangeKey
meta.Smallest = writerMeta.Smallest(d.cmp)
meta.Largest = writerMeta.Largest(d.cmp)

18 changes: 11 additions & 7 deletions flush_external.go
@@ -30,13 +30,17 @@ func flushExternalTable(untypedDB interface{}, path string, originalMeta *fileMe
d.mu.Unlock()

m := &fileMetadata{
FileNum: fileNum,
Size: originalMeta.Size,
CreationTime: time.Now().Unix(),
Smallest: originalMeta.Smallest,
Largest: originalMeta.Largest,
SmallestSeqNum: originalMeta.SmallestSeqNum,
LargestSeqNum: originalMeta.LargestSeqNum,
FileNum: fileNum,
Size: originalMeta.Size,
CreationTime: time.Now().Unix(),
SmallestPointKey: originalMeta.SmallestPointKey,
LargestPointKey: originalMeta.LargestPointKey,
SmallestRangeKey: originalMeta.SmallestRangeKey,
LargestRangeKey: originalMeta.LargestRangeKey,
Smallest: originalMeta.Smallest,
Largest: originalMeta.Largest,
SmallestSeqNum: originalMeta.SmallestSeqNum,
LargestSeqNum: originalMeta.LargestSeqNum,
}

// Hard link the sstable into the DB directory.
122 changes: 96 additions & 26 deletions ingest.go
@@ -13,6 +13,7 @@ import (
"github.com/cockroachdb/pebble/internal/keyspan"
"github.com/cockroachdb/pebble/internal/manifest"
"github.com/cockroachdb/pebble/internal/private"
"github.com/cockroachdb/pebble/internal/rangekey"
"github.com/cockroachdb/pebble/sstable"
"github.com/cockroachdb/pebble/vfs"
)
@@ -84,8 +85,6 @@ func ingestLoad1(
meta.FileNum = fileNum
meta.Size = uint64(stat.Size())
meta.CreationTime = time.Now().Unix()
meta.Smallest = InternalKey{}
meta.Largest = InternalKey{}

// Avoid loading into the table cache for collecting stats if we
// don't need to. If there are no range deletions, we have all the
@@ -98,9 +97,7 @@
// calculating stats before we can remove the original link.
maybeSetStatsFromProperties(meta, &r.Properties)

smallestSet, largestSet := false, false
empty := true

hasPoints := false
{
iter, err := r.NewIter(nil /* lower */, nil /* upper */)
if err != nil {
@@ -111,9 +108,7 @@
if err := ingestValidateKey(opts, key); err != nil {
return nil, err
}
empty = false
meta.Smallest = key.Clone()
smallestSet = true
meta.SmallestPointKey = key.Clone()
}
if err := iter.Error(); err != nil {
return nil, err
@@ -122,9 +117,8 @@
if err := ingestValidateKey(opts, key); err != nil {
return nil, err
}
empty = false
meta.Largest = key.Clone()
largestSet = true
meta.LargestPointKey = key.Clone()
hasPoints = true // Implies smallest point key was also set.
}
if err := iter.Error(); err != nil {
return nil, err
@@ -141,10 +135,9 @@
if err := ingestValidateKey(opts, key); err != nil {
return nil, err
}
empty = false
if !smallestSet ||
base.InternalCompare(opts.Comparer.Compare, meta.Smallest, *key) > 0 {
meta.Smallest = key.Clone()
if !hasPoints ||
base.InternalCompare(opts.Comparer.Compare, meta.SmallestPointKey, *key) > 0 {
meta.SmallestPointKey = key.Clone()
}
}
if err := iter.Error(); err != nil {
@@ -154,18 +147,82 @@
if err := ingestValidateKey(opts, key); err != nil {
return nil, err
}
empty = false
end := base.MakeRangeDeleteSentinelKey(val)
if !largestSet ||
base.InternalCompare(opts.Comparer.Compare, meta.Largest, end) < 0 {
meta.Largest = end.Clone()
if !hasPoints ||
base.InternalCompare(opts.Comparer.Compare, meta.LargestPointKey, end) < 0 {
meta.LargestPointKey = end.Clone()
hasPoints = true // Implies smallest point key was also set.
}
}
}

if empty {
// Update the range-key bounds for the table.
var hasRanges bool
{
iter, err := r.NewRawRangeKeyIter()
if err != nil {
return nil, err
}
if iter != nil {
defer iter.Close()
if key, _ := iter.First(); key != nil {
if err := ingestValidateKey(opts, key); err != nil {
return nil, err
}
meta.SmallestRangeKey = key.Clone()
}
if err := iter.Error(); err != nil {
return nil, err
}
if key, value := iter.Last(); key != nil {
if err := ingestValidateKey(opts, key); err != nil {
return nil, err
}
// As range keys are fragmented, the end key of the last range key in
// the table provides the upper bound for the table.
end, _, ok := rangekey.DecodeEndKey(key.Kind(), value)
if !ok {
return nil, errors.Newf("pebble: could not decode range end key")
}
meta.LargestRangeKey = base.MakeRangeKeySentinelKey(end).Clone()
hasRanges = true // Implies smallest range key was also set.
}
if err := iter.Error(); err != nil {
return nil, err
}
}
}

if !hasPoints && !hasRanges {
return nil, nil
}

// Compute the overall smallest / largest fields from the point and range
// key bounds.
switch {
case !hasRanges:
// Table has only point keys. Use the point key bounds.
meta.Smallest = meta.SmallestPointKey.Clone()
meta.Largest = meta.LargestPointKey.Clone()
case !hasPoints:
// Table has only range keys. Use the range key bounds.
meta.Smallest = meta.SmallestRangeKey.Clone()
meta.Largest = meta.LargestRangeKey.Clone()
default:
// Table has both points and ranges. Compute the bounds by considering both
// the point and range key bounds.
if base.InternalCompare(opts.Comparer.Compare, meta.SmallestPointKey, meta.SmallestRangeKey) < 0 {
meta.Smallest = meta.SmallestPointKey.Clone()
} else {
meta.Smallest = meta.SmallestRangeKey.Clone()
}
if base.InternalCompare(opts.Comparer.Compare, meta.LargestPointKey, meta.LargestRangeKey) > 0 {
meta.Largest = meta.LargestPointKey.Clone()
} else {
meta.Largest = meta.LargestRangeKey.Clone()
}
}

return meta, nil
}

@@ -313,14 +370,27 @@ func ingestMemtableOverlaps(cmp Compare, mem flushable, meta []*fileMetadata) bo
return false
}

// TODO(travers): additional test coverage for altering the sequence number.
func ingestUpdateSeqNum(opts *Options, dirname string, seqNum uint64, meta []*fileMetadata) error {
setSeqFn := func(k base.InternalKey) base.InternalKey {
return base.MakeInternalKey(k.UserKey, seqNum, k.Kind())
}
for _, m := range meta {
m.Smallest = base.MakeInternalKey(m.Smallest.UserKey, seqNum, m.Smallest.Kind())
// Don't update the seqnum for the largest key if that key is a range
// deletion sentinel key as doing so unintentionally extends the bounds of
// the table.
if m.Largest.Trailer != InternalKeyRangeDeleteSentinel {
m.Largest = base.MakeInternalKey(m.Largest.UserKey, seqNum, m.Largest.Kind())
m.SmallestPointKey = setSeqFn(m.SmallestPointKey)
m.SmallestRangeKey = setSeqFn(m.SmallestRangeKey)
m.Smallest = setSeqFn(m.Smallest)
// Only update the seqnum for the largest key if that key is not an
// "exclusive sentinel" (i.e. a range deletion sentinel or a range key
// boundary), as doing so effectively drops the exclusive sentinel (by
// lowering the seqnum from the max value), and extends the bounds of the
// table.
// NB: as the largest range key is always an exclusive sentinel, it is never
// updated.
if !m.LargestPointKey.IsExclusiveSentinel() {
m.LargestPointKey = setSeqFn(m.LargestPointKey)
}
if !m.Largest.IsExclusiveSentinel() {
m.Largest = setSeqFn(m.Largest)
}
// Setting smallestSeqNum == largestSeqNum triggers the setting of
// Properties.GlobalSeqNum when an sstable is loaded.
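The ingest path also has to be careful when stamping the ingest sequence number onto the new bound keys, as explained in the ingestUpdateSeqNum hunk above. A simplified standalone sketch of that rule, using a toy key type rather than Pebble's InternalKey: exclusive sentinels keep their trailer, so applying the sequence number cannot widen the table's bounds.

package main

import "fmt"

// toyKey is a stand-in for an internal key: a user key plus a sequence number,
// with exclusive sentinels modeled as a flag.
type toyKey struct {
    userKey           string
    seqNum            uint64
    exclusiveSentinel bool // e.g. range-deletion sentinel or range-key boundary
}

// applyIngestSeqNum mirrors the rule in ingestUpdateSeqNum: rewrite the
// sequence number of ordinary bound keys, but leave exclusive sentinels
// untouched so the bound stays exclusive.
func applyIngestSeqNum(k toyKey, seqNum uint64) toyKey {
    if k.exclusiveSentinel {
        return k
    }
    k.seqNum = seqNum
    return k
}

func main() {
    largest := toyKey{userKey: "z", exclusiveSentinel: true}
    fmt.Println(applyIngestSeqNum(largest, 42)) // sentinel left unchanged
    smallest := toyKey{userKey: "a", seqNum: 0}
    fmt.Println(applyIngestSeqNum(smallest, 42)) // seqNum becomes 42
}
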
19 changes: 15 additions & 4 deletions ingest_test.go
@@ -22,6 +22,7 @@ import (
"github.com/cockroachdb/pebble/internal/datadriven"
"github.com/cockroachdb/pebble/internal/errorfs"
"github.com/cockroachdb/pebble/internal/manifest"
"github.com/cockroachdb/pebble/internal/rangekey"
"github.com/cockroachdb/pebble/sstable"
"github.com/cockroachdb/pebble/vfs"
"github.com/kr/pretty"
@@ -63,8 +64,14 @@ func TestIngestLoad(t *testing.T) {
return fmt.Sprintf("malformed input: %s\n", data)
}
key := base.ParseInternalKey(data[:j])
value := []byte(data[j+1:])
if err := w.Add(key, value); err != nil {
if k := key.Kind(); rangekey.IsRangeKey(k) {
value := rangekey.ParseValue(k, data[j+1:])
err = w.AddRangeKey(key, value)
} else {
value := []byte(data[j+1:])
err = w.Add(key, value)
}
if err != nil {
return err.Error()
}
}
@@ -81,6 +88,8 @@
var buf bytes.Buffer
for _, m := range meta {
fmt.Fprintf(&buf, "%d: %s-%s\n", m.FileNum, m.Smallest, m.Largest)
fmt.Fprintf(&buf, " points: %s-%s\n", m.SmallestPointKey, m.LargestPointKey)
fmt.Fprintf(&buf, " ranges: %s-%s\n", m.SmallestRangeKey, m.LargestRangeKey)
}
return buf.String()

@@ -131,8 +140,10 @@ func TestIngestLoadRand(t *testing.T) {
return base.InternalCompare(cmp, keys[i], keys[j]) < 0
})

expected[i].Smallest = keys[0]
expected[i].Largest = keys[len(keys)-1]
expected[i].SmallestPointKey = keys[0]
expected[i].LargestPointKey = keys[len(keys)-1]
expected[i].Smallest = expected[i].SmallestPointKey
expected[i].Largest = expected[i].LargestPointKey

w := sstable.NewWriter(f, sstable.WriterOptions{})
var count uint64
10 changes: 10 additions & 0 deletions internal/base/internal.go
@@ -163,6 +163,16 @@ func MakeRangeDeleteSentinelKey(userKey []byte) InternalKey {
}
}

// MakeRangeKeySentinelKey constructs an internal key that is a range key
// sentinel key, used as the upper boundary for an sstable when a range key is
// the largest key in an sstable.
func MakeRangeKeySentinelKey(userKey []byte) InternalKey {
return InternalKey{
UserKey: userKey,
Trailer: InternalKeyBoundaryRangeKey,
}
}

var kindsMap = map[string]InternalKeyKind{
"DEL": InternalKeyKindDelete,
"SINGLEDEL": InternalKeyKindSingleDelete,
51 changes: 45 additions & 6 deletions internal/manifest/version.go
@@ -96,11 +96,23 @@ type FileMetadata struct {
// UTC). For ingested sstables, this corresponds to the time the file was
// ingested.
CreationTime int64
// Smallest and Largest are the inclusive bounds for the internal keys
// stored in the table.
// SmallestPointKey and LargestPointKey are the inclusive bounds for the
// internal point keys stored in the table. This includes RANGEDELs, which
// alter point keys.
SmallestPointKey InternalKey
LargestPointKey InternalKey
// SmallestRangeKey and LargestRangeKey are the inclusive bounds for the
// internal range keys stored in the table.
SmallestRangeKey InternalKey
LargestRangeKey InternalKey
// Smallest and Largest are the inclusive bounds for the internal keys stored
// in the table, across both point and range keys. These values can be
// reconstructed from the respective point and range key fields.
Smallest InternalKey
Largest InternalKey
// Smallest and largest sequence numbers in the table.
// Smallest and largest sequence numbers in the table, across both point and
// range keys. These values can be reconstructed from the respective point and
// range key fields.
SmallestSeqNum uint64
LargestSeqNum uint64
// True if the file is actively being compacted. Protected by DB.mu.
@@ -133,6 +145,24 @@ func (m *FileMetadata) String() string {
// Validate validates the metadata for consistency with itself, returning an
// error if inconsistent.
func (m *FileMetadata) Validate(cmp Compare, formatKey base.FormatKey) error {
// Point key validation.

if base.InternalCompare(cmp, m.SmallestPointKey, m.LargestPointKey) > 0 {
return base.CorruptionErrorf("file %s has inconsistent point key bounds: %s vs %s",
errors.Safe(m.FileNum), m.SmallestPointKey.Pretty(formatKey),
m.LargestPointKey.Pretty(formatKey))
}

// Range key validation.

if base.InternalCompare(cmp, m.SmallestRangeKey, m.LargestRangeKey) > 0 {
return base.CorruptionErrorf("file %s has inconsistent range key bounds: %s vs %s",
errors.Safe(m.FileNum), m.SmallestRangeKey.Pretty(formatKey),
m.LargestRangeKey.Pretty(formatKey))
}

// Combined range and point key validation.

if base.InternalCompare(cmp, m.Smallest, m.Largest) > 0 {
return base.CorruptionErrorf("file %s has inconsistent bounds: %s vs %s",
errors.Safe(m.FileNum), m.Smallest.Pretty(formatKey),
@@ -142,6 +172,11 @@ func (m *FileMetadata) Validate(cmp Compare, formatKey base.FormatKey) error {
return base.CorruptionErrorf("file %s has inconsistent seqnum bounds: %d vs %d",
errors.Safe(m.FileNum), m.SmallestSeqNum, m.LargestSeqNum)
}

// TODO(travers): add consistency checks to ensure that the point / range key
// smallest / largest are within the bounds of the combined smallest /
// largest.

return nil
}
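
A possible shape for the containment check described in the TODO above (hypothetical helper, again using simplified byte-slice keys rather than InternalCompare):

package manifestcheck

import (
    "bytes"
    "fmt"
)

// checkContained illustrates the TODO above: a typed (point or range key)
// bound, when set, must fall within the file's combined bounds. Empty slices
// model "no keys of this type".
func checkContained(smallest, largest, typedSmallest, typedLargest []byte) error {
    if len(typedSmallest) == 0 && len(typedLargest) == 0 {
        return nil // table has no keys of this type
    }
    if bytes.Compare(typedSmallest, smallest) < 0 || bytes.Compare(typedLargest, largest) > 0 {
        return fmt.Errorf("typed bounds [%s, %s] fall outside file bounds [%s, %s]",
            typedSmallest, typedLargest, smallest, largest)
    }
    return nil
}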

@@ -466,10 +501,14 @@ func ParseVersionDebug(
if err != nil {
return nil, err
}
smallest := base.ParsePrettyInternalKey(fields[1])
largest := base.ParsePrettyInternalKey(fields[2])
files[level] = append(files[level], &FileMetadata{
FileNum: base.FileNum(fileNum),
Smallest: base.ParsePrettyInternalKey(fields[1]),
Largest: base.ParsePrettyInternalKey(fields[2]),
FileNum: base.FileNum(fileNum),
SmallestPointKey: smallest,
LargestPointKey: largest,
Smallest: smallest,
Largest: largest,
})
}
}