-
Notifications
You must be signed in to change notification settings - Fork 472
/
Copy pathformat_major_version.go
339 lines (314 loc) · 13.6 KB
/
format_major_version.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
// Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.
package pebble
import (
"fmt"
"strconv"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/pebble/internal/base"
"github.com/cockroachdb/pebble/sstable"
"github.com/cockroachdb/pebble/vfs"
"github.com/cockroachdb/pebble/vfs/atomicfs"
)
// FormatMajorVersion identifies the format of a database's persisted
// data. Any backwards-incompatible change to a durable format is gated
// behind a new format major version.
//
// A database's format major version may be bumped at any time. Once it
// has been increased, previous versions of Pebble will refuse to open
// the database.
//
// The zero value is the FormatDefault constant. The concrete version
// that the default corresponds to may change over time.
type FormatMajorVersion uint64

// String implements fmt.Stringer.
func (v FormatMajorVersion) String() string {
	// NB: This output must not change. It is used as the value of the
	// on-disk version marker file.
	//
	// Specifically, the result must always parse as a base 10 integer
	// that fits in a uint64. Today it is rendered as a zero-padded,
	// 3-digit number, but the padding may change.
	const markerFormat = "%03d"
	return fmt.Sprintf(markerFormat, uint64(v))
}
// The format major versions known to this version of Pebble.
//
// NB: The numeric values are assigned by iota and are written to disk as the
// value of the format-version marker (see FormatMajorVersion.String and
// finalizeFormatVersUpgrade). Existing constants must therefore never be
// reordered or removed.
const (
// FormatDefault leaves the format version unspecified. The
// FormatDefault constant may be ratcheted upwards over time.
FormatDefault FormatMajorVersion = iota
// FormatMostCompatible maintains the most backwards compatibility,
// maintaining bi-directional compatibility with RocksDB 6.2.1 in
// the particular configuration described in the Pebble README.
FormatMostCompatible
// formatVersionedManifestMarker is the first
// backwards-incompatible change made to Pebble, introducing the
// format-version marker file for handling backwards-incompatible
// changes more broadly, and replacing the `CURRENT` file with a
// marker file.
//
// This format version is intended as an intermediary version state.
// It is deliberately unexported to discourage direct use of this
// format major version. Clients should use FormatVersioned, which
// also ensures earlier versions of Pebble fail to open a database
// written in a future format major version.
formatVersionedManifestMarker
// FormatVersioned is a new format major version that replaces the
// old `CURRENT` file with a new 'marker' file scheme. Previous
// Pebble versions will be unable to open the database unless
// they're aware of format versions.
FormatVersioned
// FormatSetWithDelete is a format major version that introduces a new key
// kind, base.InternalKeyKindSetWithDelete. Previous Pebble versions will be
// unable to open this database.
FormatSetWithDelete
// FormatBlockPropertyCollector is a format major version that introduces
// BlockPropertyCollectors.
FormatBlockPropertyCollector
// FormatSplitUserKeys is a format major version that guarantees that the
// versions of a single user key are not split across multiple files within
// a level. Ratcheting to the FormatSplitUserKeys version will block until
// all necessary compactions are complete.
FormatSplitUserKeys
// FormatRangeKeys is a format major version that introduces range keys.
FormatRangeKeys
// FormatNewest always contains the most recent format major version.
// NB: When adding new versions, the MaxTableFormat method should also be
// updated to return the maximum allowable version for the new
// FormatMajorVersion.
FormatNewest FormatMajorVersion = FormatRangeKeys
)
// MaxTableFormat reports the newest sstable.TableFormat that may be used
// while the database is at format major version v. It panics if v is not a
// format major version known to this method.
func (v FormatMajorVersion) MaxTableFormat() sstable.TableFormat {
	var maxFormat sstable.TableFormat
	switch v {
	case FormatRangeKeys:
		maxFormat = sstable.TableFormatPebblev2
	case FormatSplitUserKeys, FormatBlockPropertyCollector:
		maxFormat = sstable.TableFormatPebblev1
	case FormatSetWithDelete, FormatVersioned, formatVersionedManifestMarker,
		FormatMostCompatible, FormatDefault:
		maxFormat = sstable.TableFormatRocksDBv2
	default:
		// Every known version is listed explicitly above so that a newly
		// added version fails loudly here until a table format is chosen
		// for it.
		panic(fmt.Sprintf("pebble: unsupported format major version: %s", v))
	}
	return maxFormat
}
// formatMajorVersionMigrations defines the migrations from one format
// major version to the next. Each migration is defined as a closure
// which will be invoked on the database before the new format major
// version is committed. Migrations must be idempotent. Migrations are
// invoked with d.mu locked.
//
// The map is keyed by the version being migrated *to*;
// ratchetFormatMajorVersionLocked looks up and runs the entry for each
// successive version in increasing order.
//
// Each migration is responsible for invoking finalizeFormatVersUpgrade
// to set the new format major version. RatchetFormatMajorVersion will
// panic if a migration returns a nil error but fails to finalize the
// new format major version.
var formatMajorVersionMigrations = map[FormatMajorVersion]func(*DB) error{
// FormatMostCompatible has nothing to migrate.
// NOTE(review): unlike every other entry, this one does not call
// finalizeFormatVersUpgrade, so it would trip the "never finalized"
// check in ratchetFormatMajorVersionLocked if it ever ran. Presumably
// it is unreachable because an open database is always at
// FormatMostCompatible or later — confirm against Open.
FormatMostCompatible: func(d *DB) error { return nil },
formatVersionedManifestMarker: func(d *DB) error {
// formatVersionedManifestMarker introduces the use of a marker
// file for pointing to the current MANIFEST file.
// Lock the manifest.
d.mu.versions.logLock()
defer d.mu.versions.logUnlock()
// Construct the filename of the currently active manifest and
// move the manifest marker to that filename. The marker is
// guaranteed to exist, because we unconditionally locate it
// during Open.
manifestFileNum := d.mu.versions.manifestFileNum
filename := base.MakeFilename(fileTypeManifest, manifestFileNum)
if err := d.mu.versions.manifestMarker.Move(filename); err != nil {
return errors.Wrap(err, "moving manifest marker")
}
// Now that we have a manifest marker file in place and pointing
// to the current MANIFEST, finalize the upgrade. If we fail for
// some reason, a retry of this migration is guaranteed to again
// move the manifest marker file to the latest manifest. If
// we're unable to finalize the upgrade, a subsequent call to
// Open will ignore the manifest marker.
if err := d.finalizeFormatVersUpgrade(formatVersionedManifestMarker); err != nil {
return err
}
// We've finalized the upgrade. All subsequent Open calls will
// ignore the CURRENT file and instead read the manifest marker.
// Before we unlock the manifest, we need to update versionSet
// to use the manifest marker on future rotations.
d.mu.versions.setCurrent = setCurrentFuncMarker(
d.mu.versions.manifestMarker,
d.mu.versions.fs,
d.mu.versions.dirname)
return nil
},
// The FormatVersioned version is split into two, each with their
// own migration to ensure the post-migration cleanup happens even
// if there's a crash immediately after finalizing the version. Once
// a new format major version is finalized, its migration will never
// run again. Post-migration cleanup like the one in the migration
// below must be performed in a separate migration or every time the
// database opens.
FormatVersioned: func(d *DB) error {
// Replace the `CURRENT` file with one that points to the
// nonexistent `MANIFEST-000000` file. If an earlier Pebble
// version that does not know about format major versions
// attempts to open the database, it will error, avoiding
// accidental corruption.
if err := setCurrentFile(d.mu.versions.dirname, d.mu.versions.fs, 0); err != nil {
return err
}
return d.finalizeFormatVersUpgrade(FormatVersioned)
},
// As SetWithDelete is a new key kind, there is nothing to migrate. We can
// simply finalize the format version and we're done.
FormatSetWithDelete: func(d *DB) error {
return d.finalizeFormatVersUpgrade(FormatSetWithDelete)
},
// This migration changes no existing data; it only finalizes the new
// format version.
FormatBlockPropertyCollector: func(d *DB) error {
return d.finalizeFormatVersUpgrade(FormatBlockPropertyCollector)
},
FormatSplitUserKeys: func(d *DB) error {
// Before finalizing the format major version, rewrite any sstables that
// form multi-file atomic compaction units.
if err := d.rewriteSplitUserKeysLocked(); err != nil {
return err
}
return d.finalizeFormatVersUpgrade(FormatSplitUserKeys)
},
// This migration changes no existing data; it only finalizes the new
// format version.
FormatRangeKeys: func(d *DB) error {
return d.finalizeFormatVersUpgrade(FormatRangeKeys)
},
}
// formatVersionMarkerName is the marker name passed to atomicfs.LocateMarker
// to find the marker whose value encodes the database's format major version.
const formatVersionMarkerName = `format-version`

// lookupFormatMajorVersion locates the format-version marker in dirname and
// decodes the format major version it records.
//
// It returns the decoded version together with the located marker. If the
// marker holds no value, the database is treated as FormatMostCompatible.
//
// An error is returned if the marker cannot be located, if its value does not
// parse as a base 10 uint64, if the value is FormatDefault (which must never
// be persisted), or if the value is newer than FormatNewest, i.e. the
// database was written by a version of Pebble newer than this one. On error
// the returned version is zero and the returned marker is nil.
func lookupFormatMajorVersion(
	fs vfs.FS, dirname string,
) (FormatMajorVersion, *atomicfs.Marker, error) {
	m, versString, err := atomicfs.LocateMarker(fs, dirname, formatVersionMarkerName)
	if err != nil {
		return 0, nil, err
	}
	if versString == "" {
		// No version has been recorded yet.
		return FormatMostCompatible, m, nil
	}
	// NOTE(review): the error returns below drop m without releasing it.
	// If atomicfs.Marker holds resources that need closing, they should be
	// released here — confirm against the atomicfs package.
	v, err := strconv.ParseUint(versString, 10, 64)
	if err != nil {
		return 0, nil, errors.Wrap(err, "parsing format major version")
	}
	vers := FormatMajorVersion(v)
	if vers == FormatDefault {
		// The message has no format verbs, so use New rather than Newf with
		// a stray argument (which would append "%!(EXTRA ...)" to the text).
		return 0, nil, errors.New("pebble: default format major version should not be persisted")
	}
	if vers > FormatNewest {
		return 0, nil, errors.Newf("pebble: database %q written in format major version %d", dirname, vers)
	}
	return vers, m, nil
}
// FormatMajorVersion reports the format major version the database is
// currently operating at. This may be higher than the version supplied
// in Options when the database was opened, if the existing database had
// already been written with a higher format version.
func (d *DB) FormatMajorVersion() FormatMajorVersion {
	// The active version is guarded by d.mu; take a snapshot under the
	// lock and return it.
	d.mu.Lock()
	active := d.mu.formatVers.vers
	d.mu.Unlock()
	return active
}
// RatchetFormatMajorVersion raises the opened database's format major
// version to fmv. It returns an error if fmv is below the database's
// current version. Once a database's format major version has been
// upgraded, previous Pebble versions that do not know of that format
// version will be unable to open the database.
//
// It panics if the database has already been closed.
func (d *DB) RatchetFormatMajorVersion(fmv FormatMajorVersion) error {
	closedErr := d.closed.Load()
	if closedErr != nil {
		panic(closedErr)
	}

	// The ratchet itself runs entirely under d.mu.
	d.mu.Lock()
	defer d.mu.Unlock()
	return d.ratchetFormatMajorVersionLocked(fmv)
}
// ratchetFormatMajorVersionLocked steps the database's format major version
// up to formatVers, one version at a time, running the migration registered
// in formatMajorVersionMigrations for each intermediate version.
//
// It returns ErrReadOnly for a read-only database, and an error if
// formatVers is newer than FormatNewest or older than the database's current
// version. Requesting the current version is a no-op. As the name indicates,
// it is called with d.mu held.
func (d *DB) ratchetFormatMajorVersionLocked(formatVers FormatMajorVersion) error {
	switch {
	case d.opts.ReadOnly:
		return ErrReadOnly
	case formatVers > FormatNewest:
		// Guard against accidentally forgetting to update FormatNewest.
		return errors.Errorf("pebble: unknown format version %d", formatVers)
	case d.mu.formatVers.vers > formatVers:
		return errors.Newf("pebble: database already at format major version %d; cannot reduce to %d",
			d.mu.formatVers.vers, formatVers)
	}

	for step := d.mu.formatVers.vers + 1; step <= formatVers; step++ {
		migrate := formatMajorVersionMigrations[step]
		if err := migrate(d); err != nil {
			return errors.Wrapf(err, "migrating to version %d", step)
		}

		// NB: Calling finalizeFormatVersUpgrade is the migration's job, not
		// ours. Some migrations need to update in-memory state (without
		// ever dropping locks) after the upgrade is finalized, so the
		// finalization cannot be hoisted out here. All we do is assert
		// that it actually happened.
		if d.mu.formatVers.vers != step {
			d.opts.Logger.Fatalf("pebble: successful migration to format version %d never finalized the upgrade", step)
		}
	}
	return nil
}
// finalizeFormatVersUpgrade commits formatVers as the database's format
// major version: it moves the format-version marker to the new version,
// records the version in memory and notifies the event listener. If the
// marker cannot be moved, the error is returned and nothing else changes.
//
// It is typically only called from within a format major version
// migration.
//
// See formatMajorVersionMigrations.
func (d *DB) finalizeFormatVersUpgrade(formatVers FormatMajorVersion) error {
	// The active format version is encoded in the marker's filename.
	// Unlike other uses of the atomic marker, no file named
	// `formatVers.String()` exists on the filesystem.
	markerValue := formatVers.String()
	err := d.mu.formatVers.marker.Move(markerValue)
	if err != nil {
		return err
	}

	d.mu.formatVers.vers = formatVers
	d.opts.EventListener.FormatUpgrade(formatVers)
	return nil
}
// rewriteSplitUserKeysLocked is the migration that rewrites adjacent
// sstables containing the same user key within levels 1-6. Current Pebble
// code does not construct such sstables, but RocksDB and earlier versions of
// Pebble may have. Files sharing a split user key must be compacted together
// for correctness; the code refers to such a set as an "atomic compaction
// unit".
//
// Completing this migration allows future versions of Pebble to simplify
// code and drop the concept of 'atomic compaction units'.
//
// It runs while ratcheting the database's format major version to
// FormatSplitUserKeys, and loops until no split user keys remain or the
// database is closed.
func (d *DB) rewriteSplitUserKeysLocked() error {
	for {
		// Search the current version for a file that still needs rewriting,
		// and publish the count reported by the search.
		nonatomic, level, file, found := findSplitUserKey(d.opts, d.mu.versions.currentVersion())
		d.mu.compact.nonatomicFileCount = nonatomic
		if !found {
			// The database holds no multi-file atomic compaction units.
			return nil
		}

		// Try to schedule a compaction that rewrites the split user key.
		d.maybeScheduleCompactionPicker(
			func(picker compactionPicker, env compactionEnv) *pickedCompaction {
				return picker.pickRewriteCompaction(env, level, file)
			})

		// That attempt may have scheduled a rewrite compaction. It may also
		// have found no spare compaction concurrency, or found the file
		// already being compacted. Whichever it was, the next step is to
		// wait for the set of active compactions to change.

		// First make sure the database is still open: scheduling the
		// compaction may have released d.mu while waiting on a manifest
		// write, and the database may have been closed in that window.
		if closedErr := d.closed.Load(); closedErr != nil {
			return closedErr.(error)
		}
		d.mu.compact.cond.Wait()

		// A flush or compaction was scheduled or completed. Go around again
		// and re-scan. Finding the same file again is fine: the compaction
		// will eventually be scheduled, and its completion will wake us.
	}
}