-
Notifications
You must be signed in to change notification settings - Fork 472
/
Copy pathformat_major_version.go
432 lines (401 loc) · 16.8 KB
/
format_major_version.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
// Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.
package pebble
import (
"fmt"
"strconv"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/pebble/internal/base"
"github.com/cockroachdb/pebble/internal/manifest"
"github.com/cockroachdb/pebble/sstable"
"github.com/cockroachdb/pebble/vfs"
"github.com/cockroachdb/pebble/vfs/atomicfs"
)
// FormatMajorVersion identifies the format of the data a database persists.
// Any backwards-incompatible change to a durable format is gated behind a
// new format major version.
//
// A database's format major version may be raised at any point. Once it has
// been raised, earlier versions of Pebble refuse to open the database.
//
// The zero value is the FormatDefault constant. The concrete version that
// FormatDefault corresponds to may change over time.
type FormatMajorVersion uint64

// String implements fmt.Stringer. The result is the decimal rendering of v,
// zero-padded to at least three digits.
func (v FormatMajorVersion) String() string {
	// NB: This output must not change; it is used as the value of the
	// on-disk version marker file.
	//
	// Specifically, the result must always parse as a base 10 integer that
	// fits in a uint64. Today it is a zero-padded, 3-digit number, but the
	// amount of padding may change.
	const markerLayout = "%03d"
	return fmt.Sprintf(markerLayout, uint64(v))
}
// The known format major versions, declared in ascending order. The values
// are assigned by iota, so a constant's position within this block determines
// its numeric version.
const (
// FormatDefault leaves the format version unspecified. The
// FormatDefault constant may be ratcheted upwards over time.
FormatDefault FormatMajorVersion = iota
// FormatMostCompatible maintains the most backwards compatibility,
// maintaining bi-directional compatibility with RocksDB 6.2.1 in
// the particular configuration described in the Pebble README.
FormatMostCompatible
// formatVersionedManifestMarker is the first
// backwards-incompatible change made to Pebble, introducing the
// format-version marker file for handling backwards-incompatible
// changes more broadly, and replacing the `CURRENT` file with a
// marker file.
//
// This format version is intended as an intermediary version state.
// It is deliberately unexported to discourage direct use of this
// format major version. Clients should use FormatVersioned which
// also ensures earlier versions of Pebble fail to open a database
// written in a future format major version.
formatVersionedManifestMarker
// FormatVersioned is a new format major version that replaces the
// old `CURRENT` file with a new 'marker' file scheme. Previous
// Pebble versions will be unable to open the database unless
// they're aware of format versions.
FormatVersioned
// FormatSetWithDelete is a format major version that introduces a new key
// kind, base.InternalKeyKindSetWithDelete. Previous Pebble versions will be
// unable to open this database.
FormatSetWithDelete
// FormatBlockPropertyCollector is a format major version that introduces
// BlockPropertyCollectors.
FormatBlockPropertyCollector
// FormatSplitUserKeysMarked is a format major version that guarantees that
// all files that share user keys with neighbors are marked for compaction in
// the manifest. Ratcheting to FormatSplitUserKeysMarked will block until
// the scan of the LSM is complete and the manifest has been rotated.
FormatSplitUserKeysMarked
// FormatMarkedCompacted is a format major version that guarantees that all
// files explicitly marked for compaction in the manifest have been
// compacted. Combined with the FormatSplitUserKeysMarked format major
// version, this version guarantees that there are no user keys split across
// multiple files within a level L1+. Ratcheting to this format version will
// block until all necessary compactions for files marked for compaction are
// complete.
FormatMarkedCompacted
// FormatRangeKeys is a format major version that introduces range keys.
FormatRangeKeys
// FormatNewest always contains the most recent format major version.
// Versions greater than FormatNewest are rejected both when reading the
// on-disk marker and when ratcheting.
// NB: When adding new versions, the MaxTableFormat method should also be
// updated to return the maximum allowable version for the new
// FormatMajorVersion.
FormatNewest FormatMajorVersion = FormatRangeKeys
)
// MaxTableFormat reports the newest sstable.TableFormat that may be used
// while the database is at format major version v.
//
// The format major versions are numbered contiguously, so each table format
// covers a contiguous range of versions:
//
//	FormatDefault .. FormatSetWithDelete                   -> TableFormatRocksDBv2
//	FormatBlockPropertyCollector .. FormatMarkedCompacted  -> TableFormatPebblev1
//	FormatRangeKeys                                        -> TableFormatPebblev2
//
// It panics if v is not one of the known format major versions.
func (v FormatMajorVersion) MaxTableFormat() sstable.TableFormat {
	switch {
	case v <= FormatSetWithDelete:
		return sstable.TableFormatRocksDBv2
	case v <= FormatMarkedCompacted:
		return sstable.TableFormatPebblev1
	case v == FormatRangeKeys:
		return sstable.TableFormatPebblev2
	}
	// Any other value is a version this build does not know about.
	panic(fmt.Sprintf("pebble: unsupported format major version: %s", v))
}
// formatMajorVersionMigrations defines the migrations from one format
// major version to the next. The map is keyed by the version being
// migrated *to*. Each migration is defined as a closure
// which will be invoked on the database before the new format major
// version is committed. Migrations must be idempotent. Migrations are
// invoked with d.mu locked.
//
// Each migration is responsible for invoking finalizeFormatVersUpgrade
// to set the new format major version. ratchetFormatMajorVersionLocked
// reports a fatal error through the configured Logger if a migration
// returns a nil error but fails to finalize the new format major version.
var formatMajorVersionMigrations = map[FormatMajorVersion]func(*DB) error{
// No-op: this entry does not finalize a version.
// NOTE(review): presumably never invoked, since lookupFormatMajorVersion
// never yields a version below FormatMostCompatible — confirm against Open.
FormatMostCompatible: func(d *DB) error { return nil },
formatVersionedManifestMarker: func(d *DB) error {
// formatVersionedManifestMarker introduces the use of a marker
// file for pointing to the current MANIFEST file.
// Lock the manifest.
d.mu.versions.logLock()
defer d.mu.versions.logUnlock()
// Construct the filename of the currently active manifest and
// move the manifest marker to that filename. The marker is
// guaranteed to exist, because we unconditionally locate it
// during Open.
manifestFileNum := d.mu.versions.manifestFileNum
filename := base.MakeFilename(fileTypeManifest, manifestFileNum)
if err := d.mu.versions.manifestMarker.Move(filename); err != nil {
return errors.Wrap(err, "moving manifest marker")
}
// Now that we have a manifest marker file in place and pointing
// to the current MANIFEST, finalize the upgrade. If we fail for
// some reason, a retry of this migration is guaranteed to again
// move the manifest marker file to the latest manifest. If
// we're unable to finalize the upgrade, a subsequent call to
// Open will ignore the manifest marker.
if err := d.finalizeFormatVersUpgrade(formatVersionedManifestMarker); err != nil {
return err
}
// We've finalized the upgrade. All subsequent Open calls will
// ignore the CURRENT file and instead read the manifest marker.
// Before we unlock the manifest, we need to update versionSet
// to use the manifest marker on future rotations.
d.mu.versions.setCurrent = setCurrentFuncMarker(
d.mu.versions.manifestMarker,
d.mu.versions.fs,
d.mu.versions.dirname)
return nil
},
// The FormatVersioned version is split into two, each with their
// own migration to ensure the post-migration cleanup happens even
// if there's a crash immediately after finalizing the version. Once
// a new format major version is finalized, its migration will never
// run again. Post-migration cleanup like the one in the migration
// below must be performed in a separate migration or every time the
// database opens.
FormatVersioned: func(d *DB) error {
// Replace the `CURRENT` file with one that points to the
// nonexistent `MANIFEST-000000` file. If an earlier Pebble
// version that does not know about format major versions
// attempts to open the database, it will error, avoiding
// accidental corruption.
if err := setCurrentFile(d.mu.versions.dirname, d.mu.versions.fs, 0); err != nil {
return err
}
return d.finalizeFormatVersUpgrade(FormatVersioned)
},
// As SetWithDelete is a new key kind, there is nothing to migrate. We can
// simply finalize the format version and we're done.
FormatSetWithDelete: func(d *DB) error {
return d.finalizeFormatVersUpgrade(FormatSetWithDelete)
},
// This migration performs no work beyond finalizing the format version.
FormatBlockPropertyCollector: func(d *DB) error {
return d.finalizeFormatVersUpgrade(FormatBlockPropertyCollector)
},
FormatSplitUserKeysMarked: func(d *DB) error {
// Mark any unmarked files with split-user keys. Note all format major
// versions migrations are invoked with DB.mu locked.
if err := d.markFilesWithSplitUserKeysLocked(); err != nil {
return err
}
return d.finalizeFormatVersUpgrade(FormatSplitUserKeysMarked)
},
FormatMarkedCompacted: func(d *DB) error {
// Before finalizing the format major version, rewrite any sstables
// still marked for compaction. Note all format major versions
// migrations are invoked with DB.mu locked.
if err := d.compactMarkedFilesLocked(); err != nil {
return err
}
return d.finalizeFormatVersUpgrade(FormatMarkedCompacted)
},
// This migration performs no work beyond finalizing the format version.
FormatRangeKeys: func(d *DB) error {
return d.finalizeFormatVersUpgrade(FormatRangeKeys)
},
}
// formatVersionMarkerName is the name of the atomic marker that records the
// database's format major version.
const formatVersionMarkerName = `format-version`

// lookupFormatMajorVersion locates the format-version marker in dirname and
// decodes the format major version it records.
//
// If the marker holds no value, the database predates format major versions
// and FormatMostCompatible is returned. On success the located marker is
// returned alongside the version.
//
// An error (with a nil marker) is returned if the marker cannot be located,
// if its value does not parse as a base 10 uint64, if it records
// FormatDefault (which must never be persisted), or if it records a version
// newer than FormatNewest.
//
// NOTE(review): on the error paths after LocateMarker succeeds, the located
// marker is dropped without being closed — confirm whether it holds
// resources that should be released.
func lookupFormatMajorVersion(
	fs vfs.FS, dirname string,
) (FormatMajorVersion, *atomicfs.Marker, error) {
	m, versString, err := atomicfs.LocateMarker(fs, dirname, formatVersionMarkerName)
	if err != nil {
		return 0, nil, err
	}
	if versString == "" {
		return FormatMostCompatible, m, nil
	}
	v, err := strconv.ParseUint(versString, 10, 64)
	if err != nil {
		return 0, nil, errors.Wrap(err, "parsing format major version")
	}
	vers := FormatMajorVersion(v)
	if vers == FormatDefault {
		// The message carries no formatting verbs, so no arguments are
		// passed; a stray argument would be rendered as a "%!(EXTRA ...)"
		// suffix.
		return 0, nil, errors.New("pebble: default format major version should not be persisted")
	}
	if vers > FormatNewest {
		return 0, nil, errors.Newf("pebble: database %q written in format major version %d", dirname, vers)
	}
	return vers, m, nil
}
// FormatMajorVersion reports the format major version the database is
// currently operating at. This may be higher than the version supplied in
// Options when the database was opened, if the existing database had already
// been written with a higher format version.
func (d *DB) FormatMajorVersion() FormatMajorVersion {
	// The active version is read while holding d.mu.
	d.mu.Lock()
	active := d.mu.formatVers.vers
	d.mu.Unlock()
	return active
}
// RatchetFormatMajorVersion raises the open database's format major version
// to fmv. It returns an error if fmv is below the version the database is
// already at. Once a database's format major version has been upgraded,
// earlier Pebble versions that do not know of that format version will be
// unable to open the database.
//
// If d.closed holds a non-nil value, RatchetFormatMajorVersion panics with
// that value.
func (d *DB) RatchetFormatMajorVersion(fmv FormatMajorVersion) error {
	if closedErr := d.closed.Load(); closedErr != nil {
		panic(closedErr)
	}

	// All of the ratcheting work happens with d.mu held.
	d.mu.Lock()
	defer d.mu.Unlock()
	return d.ratchetFormatMajorVersionLocked(fmv)
}
// ratchetFormatMajorVersionLocked steps the database's format major version
// up to formatVers, running the migration for each intermediate version in
// ascending order. It is called with d.mu held.
//
// It returns ErrReadOnly for a read-only database, and an error if formatVers
// is greater than FormatNewest or lower than the database's current version.
// If a migration fails, its error is returned wrapped with the version being
// migrated to, and no later migrations are run.
func (d *DB) ratchetFormatMajorVersionLocked(formatVers FormatMajorVersion) error {
	current := d.mu.formatVers.vers
	switch {
	case d.opts.ReadOnly:
		return ErrReadOnly
	case formatVers > FormatNewest:
		// Guard against accidentally forgetting to update FormatNewest.
		return errors.Errorf("pebble: unknown format version %d", formatVers)
	case current > formatVers:
		return errors.Newf("pebble: database already at format major version %d; cannot reduce to %d",
			current, formatVers)
	}

	for target := current + 1; target <= formatVers; target++ {
		migrate := formatMajorVersionMigrations[target]
		if err := migrate(d); err != nil {
			return errors.Wrapf(err, "migrating to version %d", target)
		}
		// NB: Finalizing the upgrade (via finalizeFormatVersUpgrade) is the
		// migration's job rather than ours. Some migrations need to update
		// in-memory state, without ever dropping locks, after the upgrade
		// has been finalized. All that is left to do here is assert that
		// the upgrade really did take place.
		if d.mu.formatVers.vers != target {
			d.opts.Logger.Fatalf("pebble: successful migration to format version %d never finalized the upgrade", target)
		}
	}
	return nil
}
// finalizeFormatVersUpgrade commits formatVers as the database's format major
// version: it moves the format-version marker, updates the in-memory version
// and notifies the event listener. If moving the marker fails, the error is
// returned and neither the in-memory version nor the listener is touched.
//
// It is typically only called from within a format major version migration.
//
// See formatMajorVersionMigrations.
func (d *DB) finalizeFormatVersUpgrade(formatVers FormatMajorVersion) error {
	// The active format version is encoded in the marker's filename. Unlike
	// other uses of the atomic marker, no file named `formatVers.String()`
	// exists on the filesystem.
	markerValue := formatVers.String()
	err := d.mu.formatVers.marker.Move(markerValue)
	if err != nil {
		return err
	}

	d.mu.formatVers.vers = formatVers
	d.opts.EventListener.FormatUpgrade(formatVers)
	return nil
}
// compactMarkedFilesLocked is the migration run while ratcheting the
// database's format major version to FormatMarkedCompacted. It repeatedly
// tries to schedule rewrite compactions until the current version reports no
// files marked for compaction.
//
// It returns nil once no marked files remain, or the value stored in d.closed
// if that becomes non-nil while the migration is still in progress.
func (d *DB) compactMarkedFilesLocked() error {
	// Picker callback that asks for a rewrite compaction of a marked file.
	pickRewrite := func(picker compactionPicker, env compactionEnv) *pickedCompaction {
		return picker.pickRewriteCompaction(env)
	}

	// The current version is re-read on every pass, since each wakeup below
	// may follow a change to the set of marked files.
	for d.mu.versions.currentVersion().Stats.MarkedForCompaction > 0 {
		// Try to schedule a compaction that rewrites a marked file. This may
		// succeed; or there may be no compaction concurrency available; or
		// the file may already be compacting. Whichever it is, the next step
		// is to wait for the set of active compactions to change.
		d.maybeScheduleCompactionPicker(pickRewrite)

		// Check for closure before blocking: the scheduling attempt may have
		// dropped d.mu while waiting for a manifest write to complete, and
		// the database may have been closed in that window.
		if closedErr := d.closed.Load(); closedErr != nil {
			return closedErr.(error)
		}

		// Woken when some flush or compaction is scheduled or completes. The
		// next pass may pick the same file again, which is fine: it will
		// eventually be scheduled, and its completion will wake us.
		d.mu.compact.cond.Wait()
	}
	return nil
}
// markFilesWithSplitUserKeysLocked scans the current version's levels 1
// through numLevels-1 for adjacent files that contain the same user key. Such
// arrangements of files were permitted in RocksDB and in Pebble up to SHA
// a860bbad. Any such files are marked for compaction.
//
// If at least one file was newly marked, the MANIFEST is forcibly rotated so
// that the marks are persisted, and the result of that manifest write is
// returned. If nothing was marked, it returns nil without touching the
// manifest.
func (d *DB) markFilesWithSplitUserKeysLocked() error {
	// Take a job ID before scanning.
	jobID := d.mu.nextJobID
	d.mu.nextJobID++

	if !d.markFilesWithSplitUserKeysVersionLocked(d.mu.versions.currentVersion()) {
		// Nothing needed marking, so the manifest does not need rotating.
		return nil
	}

	// One or more files were marked for compaction. Rotating to a fresh
	// MANIFEST file ensures that the now marked-for-compaction file metadata
	// is persisted as marked.
	// NOTE(review): no logUnlock follows here; presumably logAndApply
	// releases the log lock — confirm.
	d.mu.versions.logLock()
	inProgress := func() []compactionInfo { return d.getInProgressCompactionInfoLocked(nil) }
	return d.mu.versions.logAndApply(
		jobID,
		&manifest.VersionEdit{},
		map[int]*LevelMetrics{},
		true, /* forceRotation */
		inProgress)
}
// markFilesWithSplitUserKeysVersionLocked walks levels numLevels-1 down to 1
// of vers and marks for compaction every pair of adjacent files where the
// previous file's largest user key equals the next file's smallest user key.
// It reports whether any file was newly marked.
//
// It must be called with d.mu held. The mutex is released for the duration of
// the scan, re-acquired briefly around each mutation, and held again on
// return.
func (d *DB) markFilesWithSplitUserKeysVersionLocked(vers *version) (marked bool) {
// Files with split user keys are expected to be rare and performing key
// comparisons for every file within the LSM is expensive, so drop the
// database lock while we're scanning the file metadata.
//
// If we find a file to mark, we'll re-acquire the mutex before marking it,
// since MarkedForCompaction is protected by d.mu.
//
// Note the unusual locking: unlock, defer Lock().
d.mu.Unlock()
defer d.mu.Lock()
equal := d.opts.equal()
// Level 0 is deliberately excluded by the l > 0 bound.
for l := numLevels - 1; l > 0; l-- {
iter := vers.Levels[l].Iter()
var markedWithinLevel bool
// prevFile and prevUserKey describe the preceding file in the level, or are
// nil if there is nothing to compare the current file against.
var prevFile *fileMetadata
var prevUserKey []byte
for f := iter.First(); f != nil; f = iter.Next() {
if prevUserKey != nil && equal(prevUserKey, f.Smallest.UserKey) {
// Acquire the database lock before examining or setting
// MarkedForCompaction.
d.mu.Lock()
if !f.MarkedForCompaction {
f.MarkedForCompaction = true
vers.Stats.MarkedForCompaction++
marked = true
markedWithinLevel = true
}
// Mark the previous file, if we didn't just mark it.
if !prevFile.MarkedForCompaction {
prevFile.MarkedForCompaction = true
vers.Stats.MarkedForCompaction++
marked = true
markedWithinLevel = true
}
d.mu.Unlock()
}
// A file whose largest key is an exclusive sentinel is not compared against
// its successor — presumably because the file does not actually contain that
// user key; confirm against IsExclusiveSentinel.
if f.Largest.IsExclusiveSentinel() {
prevUserKey = nil
prevFile = nil
} else {
prevUserKey = f.Largest.UserKey
prevFile = f
}
}
// If we marked any files for compaction, clear the compaction-picking
// annotation that caches files marked-for-compaction, as it's now
// out-of-date.
if markedWithinLevel {
d.mu.Lock()
vers.Levels[l].InvalidateAnnotation(markedForCompactionAnnotator{})
d.mu.Unlock()
}
}
return marked
}