-
Notifications
You must be signed in to change notification settings - Fork 476
/
Copy pathcoalesce.go
371 lines (353 loc) · 14.9 KB
/
coalesce.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
// Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.
package rangekey
import (
"bytes"
"math"
"sort"
"github.com/cockroachdb/pebble/internal/base"
"github.com/cockroachdb/pebble/internal/invariants"
"github.com/cockroachdb/pebble/internal/keyspan"
"github.com/cockroachdb/pebble/internal/manifest"
)
// UserIteratorConfig holds state for constructing the range key iterator stack
// for user iteration. The range key iterator must merge range key spans across
// the levels of the LSM. This merging is performed by a keyspan.MergingIter
// on-the-fly. The UserIteratorConfig implements keyspan.Transformer, evaluating
// range-key semantics and shadowing, so the spans returned by a MergingIter are
// fully resolved.
//
// The MergingIter is wrapped by a BoundedIter, which elides spans that are
// outside the iterator bounds (or the current prefix's bounds, during prefix
// iteration mode).
//
// To provide determinisim during iteration, the BoundedIter is wrapped by a
// DefragmentingIter that defragments abutting spans with identical
// user-observable state.
//
// At the top-level an InterleavingIter interleaves range keys with point keys
// and performs truncation to iterator bounds.
//
// Below is an abbreviated diagram illustrating the mechanics of a SeekGE.
//
// InterleavingIter.SeekGE
// │
// DefragmentingIter.SeekGE
// │
// BoundedIter.SeekGE
// │
// ╭────────────────┴───────────────╮
// │ ├── defragmentBwd*
// MergingIter.SeekGE │
// │ ╰── defragmentFwd
// ╰─╶╶ per level╶╶ ─╮
// │
// │
// ├── <?>.SeekLT
// │
// ╰── <?>.Next
type UserIteratorConfig struct {
snapshot uint64
comparer *base.Comparer
miter keyspan.MergingIter
biter keyspan.BoundedIter
diter keyspan.DefragmentingIter
liters [manifest.NumLevels]keyspan.LevelIter
litersUsed int
onlySets bool
bufs *Buffers
}
// Buffers holds various buffers used for range key iteration. They're exposed
// so that they may be pooled and reused between iterators.
type Buffers struct {
merging keyspan.MergingBuffers
defragmenting keyspan.DefragmentingBuffers
sortBuf keyspan.KeysBySuffix
}
// PrepareForReuse discards any excessively large buffers.
func (bufs *Buffers) PrepareForReuse() {
bufs.merging.PrepareForReuse()
bufs.defragmenting.PrepareForReuse()
}
// Init initializes the range key iterator stack for user iteration. The
// resulting fragment iterator applies range key semantics, defragments spans
// according to their user-observable state and, if onlySets = true, removes all
// Keys other than RangeKeySets describing the current state of range keys. The
// resulting spans contain Keys sorted by Suffix.
//
// The snapshot sequence number parameter determines which keys are visible. Any
// keys not visible at the provided snapshot are ignored.
func (ui *UserIteratorConfig) Init(
comparer *base.Comparer,
snapshot uint64,
lower, upper []byte,
hasPrefix *bool,
prefix *[]byte,
onlySets bool,
bufs *Buffers,
iters ...keyspan.FragmentIterator,
) keyspan.FragmentIterator {
ui.snapshot = snapshot
ui.comparer = comparer
ui.onlySets = onlySets
ui.miter.Init(comparer.Compare, ui, &bufs.merging, iters...)
ui.biter.Init(comparer.Compare, comparer.Split, &ui.miter, lower, upper, hasPrefix, prefix)
ui.diter.Init(comparer, &ui.biter, ui, keyspan.StaticDefragmentReducer, &bufs.defragmenting)
ui.litersUsed = 0
ui.bufs = bufs
return &ui.diter
}
// AddLevel adds a new level to the bottom of the iterator stack. AddLevel
// must be called after Init and before any other method on the iterator.
func (ui *UserIteratorConfig) AddLevel(iter keyspan.FragmentIterator) {
ui.miter.AddLevel(iter)
}
// NewLevelIter returns a pointer to a newly allocated or reused
// keyspan.LevelIter. The caller is responsible for calling Init() on this
// instance.
func (ui *UserIteratorConfig) NewLevelIter() *keyspan.LevelIter {
if ui.litersUsed >= len(ui.liters) {
return &keyspan.LevelIter{}
}
ui.litersUsed++
return &ui.liters[ui.litersUsed-1]
}
// SetBounds propagates bounds to the iterator stack. The fragment iterator
// interface ordinarily doesn't enforce bounds, so this is exposed as an
// explicit method on the user iterator config.
func (ui *UserIteratorConfig) SetBounds(lower, upper []byte) {
ui.biter.SetBounds(lower, upper)
}
// Transform implements the keyspan.Transformer interface for use with a
// keyspan.MergingIter. It transforms spans by resolving range keys at the
// provided snapshot sequence number. Shadowing of keys is resolved (eg, removal
// of unset keys, removal of keys overwritten by a set at the same suffix, etc)
// and then non-RangeKeySet keys are removed. The resulting transformed spans
// only contain RangeKeySets describing the state visible at the provided
// sequence number, and hold their Keys sorted by Suffix.
func (ui *UserIteratorConfig) Transform(cmp base.Compare, s keyspan.Span, dst *keyspan.Span) error {
// Apply shadowing of keys.
dst.Start = s.Start
dst.End = s.End
ui.bufs.sortBuf = keyspan.KeysBySuffix{
Cmp: cmp,
Keys: ui.bufs.sortBuf.Keys[:0],
}
if err := coalesce(ui.comparer.Equal, &ui.bufs.sortBuf, ui.snapshot, s.Keys); err != nil {
return err
}
// During user iteration over range keys, unsets and deletes don't matter.
// Remove them if onlySets = true. This step helps logical defragmentation
// during iteration.
keys := ui.bufs.sortBuf.Keys
dst.Keys = dst.Keys[:0]
for i := range keys {
switch keys[i].Kind() {
case base.InternalKeyKindRangeKeySet:
if invariants.Enabled && len(dst.Keys) > 0 && cmp(dst.Keys[len(dst.Keys)-1].Suffix, keys[i].Suffix) > 0 {
panic("pebble: keys unexpectedly not in ascending suffix order")
}
dst.Keys = append(dst.Keys, keys[i])
case base.InternalKeyKindRangeKeyUnset:
if invariants.Enabled && len(dst.Keys) > 0 && cmp(dst.Keys[len(dst.Keys)-1].Suffix, keys[i].Suffix) > 0 {
panic("pebble: keys unexpectedly not in ascending suffix order")
}
if ui.onlySets {
// Skip.
continue
}
dst.Keys = append(dst.Keys, keys[i])
case base.InternalKeyKindRangeKeyDelete:
if ui.onlySets {
// Skip.
continue
}
dst.Keys = append(dst.Keys, keys[i])
default:
return base.CorruptionErrorf("pebble: unrecognized range key kind %s", keys[i].Kind())
}
}
// coalesce results in dst.Keys being sorted by Suffix.
dst.KeysOrder = keyspan.BySuffixAsc
return nil
}
// ShouldDefragment implements the DefragmentMethod interface and configures a
// DefragmentingIter to defragment spans of range keys if their user-visible
// state is identical. This defragmenting method assumes the provided spans have
// already been transformed through (UserIterationConfig).Transform, so all
// RangeKeySets are user-visible sets and are already in Suffix order. This
// defragmenter checks for equality between set suffixes and values (ignoring
// sequence numbers). It's intended for use during user iteration, when the
// wrapped keyspan iterator is merging spans across all levels of the LSM.
func (ui *UserIteratorConfig) ShouldDefragment(equal base.Equal, a, b *keyspan.Span) bool {
// This implementation must only be used on spans that have transformed by
// ui.Transform. The transform applies shadowing, removes all keys besides
// the resulting Sets and sorts the keys by suffix. Since shadowing has been
// applied, each Set must set a unique suffix. If the two spans are
// equivalent, they must have the same number of range key sets.
if len(a.Keys) != len(b.Keys) || len(a.Keys) == 0 {
return false
}
if a.KeysOrder != keyspan.BySuffixAsc || b.KeysOrder != keyspan.BySuffixAsc {
panic("pebble: range key span's keys unexpectedly not in ascending suffix order")
}
ret := true
for i := range a.Keys {
if invariants.Enabled {
if ui.onlySets && (a.Keys[i].Kind() != base.InternalKeyKindRangeKeySet ||
b.Keys[i].Kind() != base.InternalKeyKindRangeKeySet) {
panic("pebble: unexpected non-RangeKeySet during defragmentation")
}
// Don't do a comparison with RangeKeyDeletes, as those will always sort
// to the end and have no prefix.
if i > 0 && ((a.Keys[i].Kind() != base.InternalKeyKindRangeKeyDelete && ui.comparer.Compare(a.Keys[i].Suffix, a.Keys[i-1].Suffix) < 0) ||
(b.Keys[i].Kind() != base.InternalKeyKindRangeKeyDelete && ui.comparer.Compare(b.Keys[i].Suffix, b.Keys[i-1].Suffix) < 0)) {
panic("pebble: range keys not ordered by suffix during defragmentation")
}
}
if !equal(a.Keys[i].Suffix, b.Keys[i].Suffix) {
ret = false
break
}
if !bytes.Equal(a.Keys[i].Value, b.Keys[i].Value) {
ret = false
break
}
}
return ret
}
// Coalesce imposes range key semantics and coalesces range keys with the same
// bounds. Coalesce drops any keys shadowed by more recent sets, unsets or
// deletes. Coalesce modifies the provided span's Keys slice, reslicing the
// slice to remove dropped keys.
//
// Coalescence has subtle behavior with respect to sequence numbers. Coalesce
// depends on a keyspan.Span's Keys being sorted in sequence number descending
// order. The first key has the largest sequence number. The returned coalesced
// span includes only the largest sequence number. All other sequence numbers
// are forgotten. When a compaction constructs output range keys from a
// coalesced span, it produces at most one RANGEKEYSET, one RANGEKEYUNSET and
// one RANGEKEYDEL. Each one of these keys adopt the largest sequence number.
//
// This has the potentially surprising effect of 'promoting' a key to a higher
// sequence number. This is okay, because:
// - There are no other overlapping keys within the coalesced span of
// sequence numbers (otherwise they would be in the compaction, due to
// the LSM invariant).
// - Range key sequence numbers are never compared to point key sequence
// numbers. Range keys and point keys have parallel existences.
// - Compactions only coalesce within snapshot stripes.
//
// Additionally, internal range keys at the same sequence number have subtle
// mechanics:
// - RANGEKEYSETs shadow RANGEKEYUNSETs of the same suffix.
// - RANGEKEYDELs only apply to keys at lower sequence numbers.
//
// This is required for ingestion. Ingested sstables are assigned a single
// sequence number for the file, at which all of the file's keys are visible.
// The RANGEKEYSET, RANGEKEYUNSET and RANGEKEYDEL key kinds are ordered such
// that among keys with equal sequence numbers (thus ordered by their kinds) the
// keys do not affect one another. Ingested sstables are expected to be
// consistent with respect to the set/unset suffixes: A given suffix should be
// set or unset but not both.
//
// The resulting dst Keys slice is sorted by Trailer.
func Coalesce(cmp base.Compare, eq base.Equal, keys []keyspan.Key, dst *[]keyspan.Key) error {
// TODO(jackson): Currently, Coalesce doesn't actually perform the sequence
// number promotion described in the comment above.
keysBySuffix := keyspan.KeysBySuffix{
Cmp: cmp,
Keys: (*dst)[:0],
}
if err := coalesce(eq, &keysBySuffix, math.MaxUint64, keys); err != nil {
return err
}
// Update the span with the (potentially reduced) keys slice. coalesce left
// the keys in *dst sorted by suffix. Re-sort them by trailer.
*dst = keysBySuffix.Keys
keyspan.SortKeysByTrailer(dst)
return nil
}
func coalesce(
equal base.Equal, keysBySuffix *keyspan.KeysBySuffix, snapshot uint64, keys []keyspan.Key,
) error {
// First, enforce visibility and RangeKeyDelete mechanics. We only need to
// consider the prefix of keys before and including the first
// RangeKeyDelete. We also must skip any keys that aren't visible at the
// provided snapshot sequence number.
//
// NB: Within a given sequence number, keys are ordered as:
// RangeKeySet > RangeKeyUnset > RangeKeyDelete
// This is significant, because this ensures that a Set or Unset sharing a
// sequence number with a Delete do not shadow each other.
deleteIdx := -1
for i := range keys {
if invariants.Enabled && i > 0 && keys[i].Trailer > keys[i-1].Trailer {
panic("pebble: invariant violation: span keys unordered")
}
if !keys[i].VisibleAt(snapshot) {
continue
}
// Once a RangeKeyDelete is observed, we know it shadows all subsequent
// keys and we can break early. We don't add the RangeKeyDelete key to
// keysBySuffix.keys yet, because we don't want a suffix-less key
// that appeared earlier in the slice to elide it. It'll be added back
// in at the end.
if keys[i].Kind() == base.InternalKeyKindRangeKeyDelete {
deleteIdx = i
break
}
keysBySuffix.Keys = append(keysBySuffix.Keys, keys[i])
}
// Sort the accumulated keys by suffix. There may be duplicates within a
// suffix, in which case the one with a larger trailer survives.
//
// We use a stable sort so that the first key with a given suffix is the one
// that with the highest Trailer (because the input `keys` was sorted by
// trailer descending).
sort.Stable(keysBySuffix)
// Grab a handle of the full sorted slice, before reslicing
// keysBySuffix.keys to accumulate the final coalesced keys.
sorted := keysBySuffix.Keys
keysBySuffix.Keys = keysBySuffix.Keys[:0]
var (
// prevSuffix is updated on each iteration of the below loop, and
// compared by the subsequent iteration to determine whether adjacent
// keys are defined at the same suffix.
prevSuffix []byte
// shadowing is set to true once any Key is shadowed by another key.
// When it's set to true—or after the loop if no keys are shadowed—the
// keysBySuffix.keys slice is resliced to contain the prefix of
// unshadowed keys. This avoids copying them incrementally in the common
// case of no shadowing.
shadowing bool
)
for i := range sorted {
if i > 0 && equal(prevSuffix, sorted[i].Suffix) {
// Skip; this key is shadowed by the predecessor that had a larger
// Trailer. If this is the first shadowed key, set shadowing=true
// and reslice keysBySuffix.keys to hold the entire unshadowed
// prefix.
if !shadowing {
keysBySuffix.Keys = keysBySuffix.Keys[:i]
shadowing = true
}
continue
}
prevSuffix = sorted[i].Suffix
if shadowing {
keysBySuffix.Keys = append(keysBySuffix.Keys, sorted[i])
}
}
// If there was no shadowing, keysBySuffix.keys is untouched. We can simply
// set it to the existing `sorted` slice (also backed by keysBySuffix.keys).
if !shadowing {
keysBySuffix.Keys = sorted
}
// If the original input `keys` slice contained a RangeKeyDelete, add it.
if deleteIdx >= 0 {
keysBySuffix.Keys = append(keysBySuffix.Keys, keys[deleteIdx])
}
return nil
}