Skip to content

Commit

Permalink
db: add format major version for flushable ingests
Browse files Browse the repository at this point in the history
Introduce a new format major version that gates uses of flushable ingests and
the corresponding backwards-incompatible changes to the write-ahead log. Also
invert the IngestSSTablesAsFlushable Option into DisableIngestAsFlushable,
enabling ingest-as-flushable by default on supported format major versions.

Close cockroachdb#2292.
  • Loading branch information
jbowens committed Feb 28, 2023
1 parent 63a8366 commit e2ebcd7
Show file tree
Hide file tree
Showing 10 changed files with 66 additions and 43 deletions.
24 changes: 18 additions & 6 deletions format_major_version.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,11 +131,19 @@ const (
// TableFormatPebblev2.
FormatSSTableValueBlocks

// FormatFlushableIngest is a format major version that enables lazy
// addition of ingested sstables into the LSM structure. When an ingest
// overlaps with a memtable, a record of the ingest is written to the WAL
// without waiting for a flush. Subsequent reads treat the ingested files as
// a level above the overlapping memtable. Once the memtable is flushed, the
// ingested files are moved into the lowest possible levels.
//
// This feature is behind a format major version because it required
// breaking changes to the WAL format.
FormatFlushableIngest

// FormatNewest always contains the most recent format major version.
// NB: When adding new versions, the MaxTableFormat method should also be
// updated to return the maximum allowable version for the new
// FormatMajorVersion.
FormatNewest FormatMajorVersion = FormatSSTableValueBlocks
FormatNewest FormatMajorVersion = iota - 1
)

// MaxTableFormat returns the maximum sstable.TableFormat that can be used at
Expand All @@ -151,7 +159,7 @@ func (v FormatMajorVersion) MaxTableFormat() sstable.TableFormat {
case FormatRangeKeys, FormatMinTableFormatPebblev1, FormatPrePebblev1Marked,
FormatPrePebblev1MarkedCompacted:
return sstable.TableFormatPebblev2
case FormatSSTableValueBlocks:
case FormatSSTableValueBlocks, FormatFlushableIngest:
return sstable.TableFormatPebblev3
default:
panic(fmt.Sprintf("pebble: unsupported format major version: %s", v))
Expand All @@ -168,7 +176,8 @@ func (v FormatMajorVersion) MinTableFormat() sstable.TableFormat {
FormatRangeKeys:
return sstable.TableFormatLevelDB
case FormatMinTableFormatPebblev1, FormatPrePebblev1Marked,
FormatPrePebblev1MarkedCompacted, FormatSSTableValueBlocks:
FormatPrePebblev1MarkedCompacted, FormatSSTableValueBlocks,
FormatFlushableIngest:
return sstable.TableFormatPebblev1
default:
panic(fmt.Sprintf("pebble: unsupported format major version: %s", v))
Expand Down Expand Up @@ -294,6 +303,9 @@ var formatMajorVersionMigrations = map[FormatMajorVersion]func(*DB) error{
FormatSSTableValueBlocks: func(d *DB) error {
return d.finalizeFormatVersUpgrade(FormatSSTableValueBlocks)
},
FormatFlushableIngest: func(d *DB) error {
return d.finalizeFormatVersUpgrade(FormatFlushableIngest)
},
}

const formatVersionMarkerName = `format-version`
Expand Down
3 changes: 3 additions & 0 deletions format_major_version_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ func TestRatchetFormat(t *testing.T) {
require.Equal(t, FormatPrePebblev1MarkedCompacted, d.FormatMajorVersion())
require.NoError(t, d.RatchetFormatMajorVersion(FormatSSTableValueBlocks))
require.Equal(t, FormatSSTableValueBlocks, d.FormatMajorVersion())
require.NoError(t, d.RatchetFormatMajorVersion(FormatFlushableIngest))
require.Equal(t, FormatFlushableIngest, d.FormatMajorVersion())

require.NoError(t, d.Close())

Expand Down Expand Up @@ -219,6 +221,7 @@ func TestFormatMajorVersions_TableFormat(t *testing.T) {
FormatPrePebblev1Marked: {sstable.TableFormatPebblev1, sstable.TableFormatPebblev2},
FormatPrePebblev1MarkedCompacted: {sstable.TableFormatPebblev1, sstable.TableFormatPebblev2},
FormatSSTableValueBlocks: {sstable.TableFormatPebblev1, sstable.TableFormatPebblev3},
FormatFlushableIngest: {sstable.TableFormatPebblev1, sstable.TableFormatPebblev3},
}

// Valid versions.
Expand Down
3 changes: 2 additions & 1 deletion ingest.go
Original file line number Diff line number Diff line change
Expand Up @@ -845,7 +845,8 @@ func (d *DB) ingest(
m := d.mu.mem.queue[i]
if ingestMemtableOverlaps(d.cmp, m, meta) {
if (len(d.mu.mem.queue) > d.opts.MemTableStopWritesThreshold-1) ||
!d.opts.Experimental.IngestSSTablesAsFlushable {
d.mu.formatVers.vers < FormatFlushableIngest ||
d.opts.Experimental.DisableIngestAsFlushable {
mem = m
if mem.flushable == d.mu.mem.mutable {
err = d.makeRoomForWrite(nil)
Expand Down
2 changes: 0 additions & 2 deletions ingest_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -437,7 +437,6 @@ func TestOverlappingIngestedSSTs(t *testing.T) {
// Disable automatic compactions because otherwise we'll race with
// delete-only compactions triggered by ingesting range tombstones.
opts.DisableAutomaticCompactions = true
opts.Experimental.IngestSSTablesAsFlushable = true

var err error
d, err = Open(dir, opts)
Expand Down Expand Up @@ -824,7 +823,6 @@ func TestIngest(t *testing.T) {
// Disable automatic compactions because otherwise we'll race with
// delete-only compactions triggered by ingesting range tombstones.
opts.DisableAutomaticCompactions = true
opts.Experimental.IngestSSTablesAsFlushable = true

var err error
d, err = Open("", opts)
Expand Down
3 changes: 1 addition & 2 deletions internal/metamorphic/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,6 @@ func defaultOptions() *pebble.Options {
FilterPolicy: bloom.FilterPolicy(10),
}},
}
opts.Experimental.IngestSSTablesAsFlushable = true
opts.EnsureDefaults()
return opts
}
Expand Down Expand Up @@ -348,7 +347,7 @@ func randomOptions(rng *rand.Rand) *testOptions {
opts.Experimental.MaxWriterConcurrency = 2
opts.Experimental.ForceWriterParallelism = true
}
opts.Experimental.IngestSSTablesAsFlushable = rng.Intn(2) == 0
opts.Experimental.DisableIngestAsFlushable = rng.Intn(2) == 0
var lopts pebble.LevelOptions
lopts.BlockRestartInterval = 1 + rng.Intn(64) // 1 - 64
lopts.BlockSize = 1 << uint(rng.Intn(24)) // 1 - 16MB
Expand Down
2 changes: 1 addition & 1 deletion open_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ func TestNewDBFilenames(t *testing.T) {
"LOCK",
"MANIFEST-000001",
"OPTIONS-000003",
"marker.format-version.000011.012",
"marker.format-version.000012.013",
"marker.manifest.000001.MANIFEST-000001",
},
}
Expand Down
17 changes: 9 additions & 8 deletions options.go
Original file line number Diff line number Diff line change
Expand Up @@ -627,14 +627,10 @@ type Options struct {
// sstables, and does not start rewriting existing sstables.
RequiredInPlaceValueBound UserKeyPrefixBound

// IngestSSTablesAsFlushable is used to determine if ingested sstables
// should be ingested as a flushable. By default this is false, but it
// is true for all metamorphic tests.
//
// TODO(bananabrick): Remove this field and enable by default once
// https://github.com/cockroachdb/pebble/issues/2292 and
// https://github.com/cockroachdb/pebble/issues/2266 are closed.
IngestSSTablesAsFlushable bool
// DisableIngestAsFlushable disables lazy ingestion of sstables through
// a WAL write and memtable rotation. Only effectual if the the format
// major version is at least `FormatFlushableIngest`.
DisableIngestAsFlushable bool

// SharedStorage is a second FS-like storage medium that can be shared
// between multiple Pebble instances. It is used to store sstables only, and
Expand Down Expand Up @@ -1123,6 +1119,9 @@ func (o *Options) String() string {
fmt.Fprintf(&buf, " compaction_debt_concurrency=%d\n", o.Experimental.CompactionDebtConcurrency)
fmt.Fprintf(&buf, " comparer=%s\n", o.Comparer.Name)
fmt.Fprintf(&buf, " disable_wal=%t\n", o.DisableWAL)
if o.Experimental.DisableIngestAsFlushable {
fmt.Fprintf(&buf, " disable_ingest_as_flushable=%t\n", o.Experimental.DisableIngestAsFlushable)
}
fmt.Fprintf(&buf, " flush_delay_delete_range=%s\n", o.FlushDelayDeleteRange)
fmt.Fprintf(&buf, " flush_delay_range_key=%s\n", o.FlushDelayRangeKey)
fmt.Fprintf(&buf, " flush_split_bytes=%d\n", o.FlushSplitBytes)
Expand Down Expand Up @@ -1323,6 +1322,8 @@ func (o *Options) Parse(s string, hooks *ParseHooks) error {
o.private.disableDeleteOnlyCompactions, err = strconv.ParseBool(value)
case "disable_elision_only_compactions":
o.private.disableElisionOnlyCompactions, err = strconv.ParseBool(value)
case "disable_ingest_as_flushable":
o.Experimental.DisableIngestAsFlushable, err = strconv.ParseBool(value)
case "disable_lazy_combined_iteration":
o.private.disableLazyCombinedIteration, err = strconv.ParseBool(value)
case "disable_wal":
Expand Down
30 changes: 17 additions & 13 deletions testdata/checkpoint
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,10 @@ create: db/marker.format-version.000011.012
close: db/marker.format-version.000011.012
remove: db/marker.format-version.000010.011
sync: db
create: db/marker.format-version.000012.013
close: db/marker.format-version.000012.013
remove: db/marker.format-version.000011.012
sync: db
create: db/temporary.000003.dbtmp
sync: db/temporary.000003.dbtmp
close: db/temporary.000003.dbtmp
Expand Down Expand Up @@ -130,9 +134,9 @@ close:
open-dir: checkpoints/checkpoint1
link: db/OPTIONS-000003 -> checkpoints/checkpoint1/OPTIONS-000003
open-dir: checkpoints/checkpoint1
create: checkpoints/checkpoint1/marker.format-version.000001.012
sync-data: checkpoints/checkpoint1/marker.format-version.000001.012
close: checkpoints/checkpoint1/marker.format-version.000001.012
create: checkpoints/checkpoint1/marker.format-version.000001.013
sync-data: checkpoints/checkpoint1/marker.format-version.000001.013
close: checkpoints/checkpoint1/marker.format-version.000001.013
sync: checkpoints/checkpoint1
close: checkpoints/checkpoint1
link: db/000005.sst -> checkpoints/checkpoint1/000005.sst
Expand Down Expand Up @@ -166,9 +170,9 @@ close: checkpoints
open-dir: checkpoints/checkpoint2
link: db/OPTIONS-000003 -> checkpoints/checkpoint2/OPTIONS-000003
open-dir: checkpoints/checkpoint2
create: checkpoints/checkpoint2/marker.format-version.000001.012
sync-data: checkpoints/checkpoint2/marker.format-version.000001.012
close: checkpoints/checkpoint2/marker.format-version.000001.012
create: checkpoints/checkpoint2/marker.format-version.000001.013
sync-data: checkpoints/checkpoint2/marker.format-version.000001.013
close: checkpoints/checkpoint2/marker.format-version.000001.013
sync: checkpoints/checkpoint2
close: checkpoints/checkpoint2
link: db/000007.sst -> checkpoints/checkpoint2/000007.sst
Expand Down Expand Up @@ -197,9 +201,9 @@ close: checkpoints
open-dir: checkpoints/checkpoint3
link: db/OPTIONS-000003 -> checkpoints/checkpoint3/OPTIONS-000003
open-dir: checkpoints/checkpoint3
create: checkpoints/checkpoint3/marker.format-version.000001.012
sync-data: checkpoints/checkpoint3/marker.format-version.000001.012
close: checkpoints/checkpoint3/marker.format-version.000001.012
create: checkpoints/checkpoint3/marker.format-version.000001.013
sync-data: checkpoints/checkpoint3/marker.format-version.000001.013
close: checkpoints/checkpoint3/marker.format-version.000001.013
sync: checkpoints/checkpoint3
close: checkpoints/checkpoint3
link: db/000005.sst -> checkpoints/checkpoint3/000005.sst
Expand Down Expand Up @@ -253,7 +257,7 @@ CURRENT
LOCK
MANIFEST-000001
OPTIONS-000003
marker.format-version.000011.012
marker.format-version.000012.013
marker.manifest.000001.MANIFEST-000001

list checkpoints/checkpoint1
Expand All @@ -263,7 +267,7 @@ list checkpoints/checkpoint1
000007.sst
MANIFEST-000001
OPTIONS-000003
marker.format-version.000001.012
marker.format-version.000001.013
marker.manifest.000001.MANIFEST-000001

open checkpoints/checkpoint1 readonly
Expand Down Expand Up @@ -304,7 +308,7 @@ list checkpoints/checkpoint2
000007.sst
MANIFEST-000001
OPTIONS-000003
marker.format-version.000001.012
marker.format-version.000001.013
marker.manifest.000001.MANIFEST-000001

open checkpoints/checkpoint2 readonly
Expand Down Expand Up @@ -332,7 +336,7 @@ list checkpoints/checkpoint3
000007.sst
MANIFEST-000001
OPTIONS-000003
marker.format-version.000001.012
marker.format-version.000001.013
marker.manifest.000001.MANIFEST-000001

open checkpoints/checkpoint3 readonly
Expand Down
11 changes: 8 additions & 3 deletions testdata/event_listener
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,11 @@ close: db/marker.format-version.000011.012
remove: db/marker.format-version.000010.011
sync: db
upgraded to format version: 012
create: db/marker.format-version.000012.013
close: db/marker.format-version.000012.013
remove: db/marker.format-version.000011.012
sync: db
upgraded to format version: 013
create: db/temporary.000003.dbtmp
sync: db/temporary.000003.dbtmp
close: db/temporary.000003.dbtmp
Expand Down Expand Up @@ -251,9 +256,9 @@ close:
open-dir: checkpoint
link: db/OPTIONS-000003 -> checkpoint/OPTIONS-000003
open-dir: checkpoint
create: checkpoint/marker.format-version.000001.012
sync-data: checkpoint/marker.format-version.000001.012
close: checkpoint/marker.format-version.000001.012
create: checkpoint/marker.format-version.000001.013
sync-data: checkpoint/marker.format-version.000001.013
close: checkpoint/marker.format-version.000001.013
sync: checkpoint
close: checkpoint
link: db/000013.sst -> checkpoint/000013.sst
Expand Down
14 changes: 7 additions & 7 deletions testdata/flushable_ingest
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ LOCK
MANIFEST-000001
OPTIONS-000003
ext
marker.format-version.000011.012
marker.format-version.000012.013
marker.manifest.000001.MANIFEST-000001

# Test basic WAL replay
Expand All @@ -83,7 +83,7 @@ LOCK
MANIFEST-000001
OPTIONS-000003
ext
marker.format-version.000011.012
marker.format-version.000012.013
marker.manifest.000001.MANIFEST-000001

open
Expand Down Expand Up @@ -392,7 +392,7 @@ LOCK
MANIFEST-000001
OPTIONS-000003
ext
marker.format-version.000011.012
marker.format-version.000012.013
marker.manifest.000001.MANIFEST-000001

close
Expand All @@ -413,7 +413,7 @@ LOCK
MANIFEST-000001
OPTIONS-000003
ext
marker.format-version.000011.012
marker.format-version.000012.013
marker.manifest.000001.MANIFEST-000001

open
Expand Down Expand Up @@ -446,7 +446,7 @@ MANIFEST-000001
MANIFEST-000012
OPTIONS-000013
ext
marker.format-version.000011.012
marker.format-version.000012.013
marker.manifest.000002.MANIFEST-000012

# Make sure that the new mutable memtable can accept writes.
Expand Down Expand Up @@ -590,7 +590,7 @@ LOCK
MANIFEST-000001
OPTIONS-000003
ext
marker.format-version.000011.012
marker.format-version.000012.013
marker.manifest.000001.MANIFEST-000001

close
Expand All @@ -610,7 +610,7 @@ MANIFEST-000001
OPTIONS-000003
ext
ext1
marker.format-version.000011.012
marker.format-version.000012.013
marker.manifest.000001.MANIFEST-000001

ignoreSyncs false
Expand Down

0 comments on commit e2ebcd7

Please sign in to comment.