Skip to content

Commit

Permalink
compaction: don't split outputs within a user key
Browse files Browse the repository at this point in the history
During a compaction, if the current sstable hits the file size limit, defer
finishing the sstable if the next sstable would share a user key. This is the
current behavior of flushes, and this change brings parity between the two.

This change is motivated by introduction of range keys (see cockroachdb#1339). This
ensures we can always cleanly truncate range keys that span range-key boundaries.

This commit also removes (keyspan.Fragmenter).FlushTo. Now that we prohibit
splitting sstables in the middle of a user key, the Fragmenter's FlushTo
function is unnecessary. Compactions and flushes always use the
TruncateAndFlushTo variant.

This change required a tweak to the way grandparent limits are applied, in
order to switch the grandparent splitter's comparison into a >= comparsion.
This was necessary due to the shift in interpreting `splitterSuggestion`s as
exclusive boundaries.

Close cockroachdb#734.
  • Loading branch information
jbowens committed Jan 31, 2022
1 parent ca9b452 commit b55361d
Show file tree
Hide file tree
Showing 15 changed files with 180 additions and 624 deletions.
330 changes: 69 additions & 261 deletions compaction.go

Large diffs are not rendered by default.

9 changes: 3 additions & 6 deletions compaction_iter.go
Original file line number Diff line number Diff line change
Expand Up @@ -769,17 +769,14 @@ func (i *compactionIter) Close() error {
// exclude specifies if the specified key is exclusive or inclusive.
// When exclude = true, all returned range tombstones are truncated to the
// specified key.
func (i *compactionIter) Tombstones(key []byte, exclude bool) []keyspan.Span {
switch {
case key == nil:
func (i *compactionIter) Tombstones(key []byte) []keyspan.Span {
if key == nil {
i.rangeDelFrag.Finish()
case exclude:
} else {
// The specified end key is exclusive; no versions of the specified
// user key (including range tombstones covering that key) should
// be flushed yet.
i.rangeDelFrag.TruncateAndFlushTo(key)
default:
i.rangeDelFrag.FlushTo(key)
}
tombstones := i.tombstones
i.tombstones = nil
Expand Down
2 changes: 1 addition & 1 deletion compaction_iter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ func TestCompactionIter(t *testing.T) {
if len(parts) == 2 {
key = []byte(parts[1])
}
for _, v := range iter.Tombstones(key, false) {
for _, v := range iter.Tombstones(key) {
fmt.Fprintf(&b, "%s-%s#%d\n",
v.Start.UserKey, v.End, v.Start.SeqNum())
}
Expand Down
5 changes: 5 additions & 0 deletions compaction_picker.go
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,11 @@ func (pc *pickedCompaction) grow(sm, la InternalKey, maxExpandedBytes uint64) bo
// disableIsCompacting is true, isCompacting always returns false. This helps
// avoid spurious races from being detected when this method is used outside
// of compaction picking code.
//
// TODO(jackson): Compactions and flushes no longer split a user key between two
// sstables. We could perform a migration, re-compacting any sstables with split
// user keys, which would allow us to remove atomic compaction unit expansion
// code.
func expandToAtomicUnit(
cmp Compare, inputs manifest.LevelSlice, disableIsCompacting bool,
) (slice manifest.LevelSlice, isCompacting bool) {
Expand Down
4 changes: 4 additions & 0 deletions docs/range_deletions.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# Range Deletions

TODO: The following explanation of range deletions does not take into account
the recent change to prohibit splitting of a user key between sstables. This
change simplifies the logic, removing 'improperly truncated range tombstones.'

TODO: The following explanation of range deletions ignores the
kind/trailer that appears at the end of keys after the sequence
number. This should be harmless but need to add a justification on why
Expand Down
85 changes: 6 additions & 79 deletions internal/keyspan/fragmenter.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,8 @@ type Fragmenter struct {
// flushed. All pending spans have the same Start.UserKey.
pending []Span
// doneBuf is used to buffer completed span fragments when flushing to a
// specific key (e.g. FlushTo). It is cached in the Fragmenter to allow
// reuse.
// specific key (e.g. TruncateAndFlushTo). It is cached in the Fragmenter to
// allow reuse.
doneBuf []Span
// sortBuf is used to sort fragments by end key when flushing.
sortBuf spansByEndKey
Expand Down Expand Up @@ -266,88 +266,15 @@ func (f *Fragmenter) Empty() bool {
return f.finished || len(f.pending) == 0
}

// FlushTo flushes all of the fragments with a start key <= key. Used during
// compaction to force emitting of spans which straddle an sstable boundary.
// Note that the emitted spans are not truncated to the specified key. Consider
// TruncateAndFlushTo flushes all of the fragments with a start key <= key,
// truncating spans to the specified end key. Used during compaction to force
// emitting of spans which straddle an sstable boundary. Consider
// the scenario:
//
// a---------k#10
// f#8
// f#7
//
// If the compaction logic splits f#8 and f#7 into different sstables, we can't
// truncate the span [a,k) at f. Doing so could produce an sstable with the
// records:
//
// a----f#10
// f#8
//
// The span [a,f) does not cover the key f.
func (f *Fragmenter) FlushTo(key []byte) {
if f.finished {
panic("pebble: span fragmenter already finished")
}

if f.flushedKey != nil {
switch c := f.Cmp(key, f.flushedKey); {
case c < 0:
panic(fmt.Sprintf("pebble: flush-to key (%s) < flushed key (%s)",
f.Format(key), f.Format(f.flushedKey)))
}
}
f.flushedKey = append(f.flushedKey[:0], key...)

if len(f.pending) > 0 {
// Since all of the pending spans have the same start key, we only need
// to compare against the first one.
switch c := f.Cmp(f.pending[0].Start.UserKey, key); {
case c > 0:
panic(fmt.Sprintf("pebble: keys must be in order: %s > %s",
f.Format(f.pending[0].Start.UserKey), f.Format(key)))
}
// Note that we explicitly do not return early here if Start.UserKey ==
// key. Similar to the scenario described above, consider:
//
// f----k#10
// f#8
// f#7
//
// If the compaction logic splits f#8 and f#7 into different sstables, we
// have to emit the span [f,k) in both sstables.
}

// At this point we know that the new start key is greater than the pending
// spans start keys. We flush all span fragments with a start key
// <= key.
f.flush(f.pending, key)

// Truncate the pending spans to start with key, filtering any which
// would become empty.
pending := f.pending
f.pending = f.pending[:0]
for _, s := range pending {
if f.Cmp(key, s.End) < 0 {
// s: a--+--e
// new: c------
f.pending = append(f.pending, Span{
Start: base.MakeInternalKey(key, s.Start.SeqNum(), s.Start.Kind()),
End: s.End,
Value: s.Value,
})
}
}
}

// TruncateAndFlushTo is similar to FlushTo, except it also truncates spans to
// the specified end key by calling truncateAndFlush. Only called in compactions
// where we can guarantee that all versions of UserKeys < key have been written,
// or in other words, where we can ensure we don't split a user key across two
// sstables. Going back to the scenario from above:
//
// a---------k#10
// f#8
// f#7
//
///
// Let's say the next user key after f is g. Calling TruncateAndFlushTo(g) will
// flush this span:
//
Expand Down
33 changes: 1 addition & 32 deletions internal/keyspan/fragmenter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,14 +45,7 @@ func buildSpans(
},
}
for _, line := range strings.Split(s, "\n") {
if strings.HasPrefix(line, "flush-to ") {
parts := strings.Split(line, " ")
if len(parts) != 2 {
t.Fatalf("expected 2 components, but found %d: %s", len(parts), line)
}
f.FlushTo([]byte(parts[1]))
continue
} else if strings.HasPrefix(line, "truncate-and-flush-to ") {
if strings.HasPrefix(line, "truncate-and-flush-to ") {
parts := strings.Split(line, " ")
if len(parts) != 2 {
t.Fatalf("expected 2 components, but found %d: %s", len(parts), line)
Expand Down Expand Up @@ -204,30 +197,6 @@ func TestFragmenterDeleted(t *testing.T) {
})
}

func TestFragmenterFlushTo(t *testing.T) {
cmp := base.DefaultComparer.Compare
fmtKey := base.DefaultComparer.FormatKey

datadriven.RunTest(t, "testdata/fragmenter_flush_to", func(d *datadriven.TestData) string {
switch d.Cmd {
case "build":
return func() (result string) {
defer func() {
if r := recover(); r != nil {
result = fmt.Sprint(r)
}
}()

spans := buildSpans(t, cmp, fmtKey, d.Input, base.InternalKeyKindRangeDelete)
return formatSpans(spans)
}()

default:
return fmt.Sprintf("unknown command: %s", d.Cmd)
}
})
}

func TestFragmenterTruncateAndFlushTo(t *testing.T) {
cmp := base.DefaultComparer.Compare
fmtKey := base.DefaultComparer.FormatKey
Expand Down
107 changes: 0 additions & 107 deletions internal/keyspan/testdata/fragmenter_flush_to

This file was deleted.

23 changes: 0 additions & 23 deletions internal/keyspan/testdata/fragmenter_values
Original file line number Diff line number Diff line change
Expand Up @@ -63,26 +63,3 @@ truncate-and-flush-to d
3: e-g orange
1: e-g coconut
3: g-i orange

# NB: Unlike the above truncate-and-flush-to calls, a flush-to does not truncate
# the end boundary. In this case, the fragments beginning at `c` are not
# truncated to `d`, they're flushed with the bounadries formed by fragmentation
# (`e`)
build
3: a-c apple
2: a---e banana
1: a-----g coconut
flush-to d
3: d----i orange
----
3: a-c apple
2: a-c banana
1: a-c coconut
2: c-e banana
1: c-e coconut
3: de orange
2: de banana
1: de coconut
3: e-g orange
1: e-g coconut
3: g-i orange
Loading

0 comments on commit b55361d

Please sign in to comment.