diff --git a/format_major_version.go b/format_major_version.go index 844b1205ef..8fcb5d68fa 100644 --- a/format_major_version.go +++ b/format_major_version.go @@ -131,11 +131,19 @@ const ( // TableFormatPebblev2. FormatSSTableValueBlocks + // FormatFlushableIngest is a format major version that enables lazy + // addition of ingested sstables into the LSM structure. When an ingest + // overlaps with a memtable, a record of the ingest is written to the WAL + // without waiting for a flush. Subsequent reads treat the ingested files as + // a level above the overlapping memtable. Once the memtable is flushed, the + // ingested files are moved into the lowest possible levels. + // + // This feature is behind a format major version because it required + // breaking changes to the WAL format. + FormatFlushableIngest + // FormatNewest always contains the most recent format major version. - // NB: When adding new versions, the MaxTableFormat method should also be - // updated to return the maximum allowable version for the new - // FormatMajorVersion. - FormatNewest FormatMajorVersion = FormatSSTableValueBlocks + FormatNewest FormatMajorVersion = iota - 1 ) // MaxTableFormat returns the maximum sstable.TableFormat that can be used at @@ -151,7 +159,7 @@ func (v FormatMajorVersion) MaxTableFormat() sstable.TableFormat { case FormatRangeKeys, FormatMinTableFormatPebblev1, FormatPrePebblev1Marked, FormatPrePebblev1MarkedCompacted: return sstable.TableFormatPebblev2 - case FormatSSTableValueBlocks: + case FormatSSTableValueBlocks, FormatFlushableIngest: return sstable.TableFormatPebblev3 default: panic(fmt.Sprintf("pebble: unsupported format major version: %s", v)) @@ -168,7 +176,8 @@ func (v FormatMajorVersion) MinTableFormat() sstable.TableFormat { FormatRangeKeys: return sstable.TableFormatLevelDB case FormatMinTableFormatPebblev1, FormatPrePebblev1Marked, - FormatPrePebblev1MarkedCompacted, FormatSSTableValueBlocks: + FormatPrePebblev1MarkedCompacted, FormatSSTableValueBlocks, + FormatFlushableIngest: return sstable.TableFormatPebblev1 default: panic(fmt.Sprintf("pebble: unsupported format major version: %s", v)) @@ -294,6 +303,9 @@ var formatMajorVersionMigrations = map[FormatMajorVersion]func(*DB) error{ FormatSSTableValueBlocks: func(d *DB) error { return d.finalizeFormatVersUpgrade(FormatSSTableValueBlocks) }, + FormatFlushableIngest: func(d *DB) error { + return d.finalizeFormatVersUpgrade(FormatFlushableIngest) + }, } const formatVersionMarkerName = `format-version` diff --git a/format_major_version_test.go b/format_major_version_test.go index ce24c0aebe..b862b984bf 100644 --- a/format_major_version_test.go +++ b/format_major_version_test.go @@ -58,6 +58,8 @@ func TestRatchetFormat(t *testing.T) { require.Equal(t, FormatPrePebblev1MarkedCompacted, d.FormatMajorVersion()) require.NoError(t, d.RatchetFormatMajorVersion(FormatSSTableValueBlocks)) require.Equal(t, FormatSSTableValueBlocks, d.FormatMajorVersion()) + require.NoError(t, d.RatchetFormatMajorVersion(FormatFlushableIngest)) + require.Equal(t, FormatFlushableIngest, d.FormatMajorVersion()) require.NoError(t, d.Close()) @@ -219,6 +221,7 @@ func TestFormatMajorVersions_TableFormat(t *testing.T) { FormatPrePebblev1Marked: {sstable.TableFormatPebblev1, sstable.TableFormatPebblev2}, FormatPrePebblev1MarkedCompacted: {sstable.TableFormatPebblev1, sstable.TableFormatPebblev2}, FormatSSTableValueBlocks: {sstable.TableFormatPebblev1, sstable.TableFormatPebblev3}, + FormatFlushableIngest: {sstable.TableFormatPebblev1, sstable.TableFormatPebblev3}, } // Valid versions. diff --git a/ingest.go b/ingest.go index de1d6aeecf..2ce5b2d146 100644 --- a/ingest.go +++ b/ingest.go @@ -845,7 +845,8 @@ func (d *DB) ingest( m := d.mu.mem.queue[i] if ingestMemtableOverlaps(d.cmp, m, meta) { if (len(d.mu.mem.queue) > d.opts.MemTableStopWritesThreshold-1) || - !d.opts.Experimental.IngestSSTablesAsFlushable { + d.mu.formatVers.vers < FormatFlushableIngest || + d.opts.Experimental.DisableIngestAsFlushable { mem = m if mem.flushable == d.mu.mem.mutable { err = d.makeRoomForWrite(nil) diff --git a/ingest_test.go b/ingest_test.go index b3ac2e471d..53ca9b3c71 100644 --- a/ingest_test.go +++ b/ingest_test.go @@ -437,7 +437,6 @@ func TestOverlappingIngestedSSTs(t *testing.T) { // Disable automatic compactions because otherwise we'll race with // delete-only compactions triggered by ingesting range tombstones. opts.DisableAutomaticCompactions = true - opts.Experimental.IngestSSTablesAsFlushable = true var err error d, err = Open(dir, opts) @@ -824,7 +823,6 @@ func TestIngest(t *testing.T) { // Disable automatic compactions because otherwise we'll race with // delete-only compactions triggered by ingesting range tombstones. opts.DisableAutomaticCompactions = true - opts.Experimental.IngestSSTablesAsFlushable = true var err error d, err = Open("", opts) diff --git a/internal/metamorphic/options.go b/internal/metamorphic/options.go index d8b8bc082a..6ade191d95 100644 --- a/internal/metamorphic/options.go +++ b/internal/metamorphic/options.go @@ -133,7 +133,6 @@ func defaultOptions() *pebble.Options { FilterPolicy: bloom.FilterPolicy(10), }}, } - opts.Experimental.IngestSSTablesAsFlushable = true opts.EnsureDefaults() return opts } @@ -348,7 +347,7 @@ func randomOptions(rng *rand.Rand) *testOptions { opts.Experimental.MaxWriterConcurrency = 2 opts.Experimental.ForceWriterParallelism = true } - opts.Experimental.IngestSSTablesAsFlushable = rng.Intn(2) == 0 + opts.Experimental.DisableIngestAsFlushable = rng.Intn(2) == 0 var lopts pebble.LevelOptions lopts.BlockRestartInterval = 1 + rng.Intn(64) // 1 - 64 lopts.BlockSize = 1 << uint(rng.Intn(24)) // 1 - 16MB diff --git a/open_test.go b/open_test.go index d3ee7bad01..64cb46adba 100644 --- a/open_test.go +++ b/open_test.go @@ -161,7 +161,7 @@ func TestNewDBFilenames(t *testing.T) { "LOCK", "MANIFEST-000001", "OPTIONS-000003", - "marker.format-version.000011.012", + "marker.format-version.000012.013", "marker.manifest.000001.MANIFEST-000001", }, } diff --git a/options.go b/options.go index 0d2bbbc84d..3478107436 100644 --- a/options.go +++ b/options.go @@ -627,14 +627,10 @@ type Options struct { // sstables, and does not start rewriting existing sstables. RequiredInPlaceValueBound UserKeyPrefixBound - // IngestSSTablesAsFlushable is used to determine if ingested sstables - // should be ingested as a flushable. By default this is false, but it - // is true for all metamorphic tests. - // - // TODO(bananabrick): Remove this field and enable by default once - // https://github.com/cockroachdb/pebble/issues/2292 and - // https://github.com/cockroachdb/pebble/issues/2266 are closed. - IngestSSTablesAsFlushable bool + // DisableIngestAsFlushable disables lazy ingestion of sstables through + // a WAL write and memtable rotation. Only effectual if the the format + // major version is at least `FormatFlushableIngest`. + DisableIngestAsFlushable bool // SharedStorage is a second FS-like storage medium that can be shared // between multiple Pebble instances. It is used to store sstables only, and @@ -1123,6 +1119,9 @@ func (o *Options) String() string { fmt.Fprintf(&buf, " compaction_debt_concurrency=%d\n", o.Experimental.CompactionDebtConcurrency) fmt.Fprintf(&buf, " comparer=%s\n", o.Comparer.Name) fmt.Fprintf(&buf, " disable_wal=%t\n", o.DisableWAL) + if o.Experimental.DisableIngestAsFlushable { + fmt.Fprintf(&buf, " disable_ingest_as_flushable=%t\n", o.Experimental.DisableIngestAsFlushable) + } fmt.Fprintf(&buf, " flush_delay_delete_range=%s\n", o.FlushDelayDeleteRange) fmt.Fprintf(&buf, " flush_delay_range_key=%s\n", o.FlushDelayRangeKey) fmt.Fprintf(&buf, " flush_split_bytes=%d\n", o.FlushSplitBytes) @@ -1323,6 +1322,8 @@ func (o *Options) Parse(s string, hooks *ParseHooks) error { o.private.disableDeleteOnlyCompactions, err = strconv.ParseBool(value) case "disable_elision_only_compactions": o.private.disableElisionOnlyCompactions, err = strconv.ParseBool(value) + case "disable_ingest_as_flushable": + o.Experimental.DisableIngestAsFlushable, err = strconv.ParseBool(value) case "disable_lazy_combined_iteration": o.private.disableLazyCombinedIteration, err = strconv.ParseBool(value) case "disable_wal": diff --git a/testdata/checkpoint b/testdata/checkpoint index 099318356a..2872d98f8a 100644 --- a/testdata/checkpoint +++ b/testdata/checkpoint @@ -68,6 +68,10 @@ create: db/marker.format-version.000011.012 close: db/marker.format-version.000011.012 remove: db/marker.format-version.000010.011 sync: db +create: db/marker.format-version.000012.013 +close: db/marker.format-version.000012.013 +remove: db/marker.format-version.000011.012 +sync: db create: db/temporary.000003.dbtmp sync: db/temporary.000003.dbtmp close: db/temporary.000003.dbtmp @@ -130,9 +134,9 @@ close: open-dir: checkpoints/checkpoint1 link: db/OPTIONS-000003 -> checkpoints/checkpoint1/OPTIONS-000003 open-dir: checkpoints/checkpoint1 -create: checkpoints/checkpoint1/marker.format-version.000001.012 -sync-data: checkpoints/checkpoint1/marker.format-version.000001.012 -close: checkpoints/checkpoint1/marker.format-version.000001.012 +create: checkpoints/checkpoint1/marker.format-version.000001.013 +sync-data: checkpoints/checkpoint1/marker.format-version.000001.013 +close: checkpoints/checkpoint1/marker.format-version.000001.013 sync: checkpoints/checkpoint1 close: checkpoints/checkpoint1 link: db/000005.sst -> checkpoints/checkpoint1/000005.sst @@ -166,9 +170,9 @@ close: checkpoints open-dir: checkpoints/checkpoint2 link: db/OPTIONS-000003 -> checkpoints/checkpoint2/OPTIONS-000003 open-dir: checkpoints/checkpoint2 -create: checkpoints/checkpoint2/marker.format-version.000001.012 -sync-data: checkpoints/checkpoint2/marker.format-version.000001.012 -close: checkpoints/checkpoint2/marker.format-version.000001.012 +create: checkpoints/checkpoint2/marker.format-version.000001.013 +sync-data: checkpoints/checkpoint2/marker.format-version.000001.013 +close: checkpoints/checkpoint2/marker.format-version.000001.013 sync: checkpoints/checkpoint2 close: checkpoints/checkpoint2 link: db/000007.sst -> checkpoints/checkpoint2/000007.sst @@ -197,9 +201,9 @@ close: checkpoints open-dir: checkpoints/checkpoint3 link: db/OPTIONS-000003 -> checkpoints/checkpoint3/OPTIONS-000003 open-dir: checkpoints/checkpoint3 -create: checkpoints/checkpoint3/marker.format-version.000001.012 -sync-data: checkpoints/checkpoint3/marker.format-version.000001.012 -close: checkpoints/checkpoint3/marker.format-version.000001.012 +create: checkpoints/checkpoint3/marker.format-version.000001.013 +sync-data: checkpoints/checkpoint3/marker.format-version.000001.013 +close: checkpoints/checkpoint3/marker.format-version.000001.013 sync: checkpoints/checkpoint3 close: checkpoints/checkpoint3 link: db/000005.sst -> checkpoints/checkpoint3/000005.sst @@ -253,7 +257,7 @@ CURRENT LOCK MANIFEST-000001 OPTIONS-000003 -marker.format-version.000011.012 +marker.format-version.000012.013 marker.manifest.000001.MANIFEST-000001 list checkpoints/checkpoint1 @@ -263,7 +267,7 @@ list checkpoints/checkpoint1 000007.sst MANIFEST-000001 OPTIONS-000003 -marker.format-version.000001.012 +marker.format-version.000001.013 marker.manifest.000001.MANIFEST-000001 open checkpoints/checkpoint1 readonly @@ -304,7 +308,7 @@ list checkpoints/checkpoint2 000007.sst MANIFEST-000001 OPTIONS-000003 -marker.format-version.000001.012 +marker.format-version.000001.013 marker.manifest.000001.MANIFEST-000001 open checkpoints/checkpoint2 readonly @@ -332,7 +336,7 @@ list checkpoints/checkpoint3 000007.sst MANIFEST-000001 OPTIONS-000003 -marker.format-version.000001.012 +marker.format-version.000001.013 marker.manifest.000001.MANIFEST-000001 open checkpoints/checkpoint3 readonly diff --git a/testdata/event_listener b/testdata/event_listener index 8c167923a1..7ed11522e4 100644 --- a/testdata/event_listener +++ b/testdata/event_listener @@ -83,6 +83,11 @@ close: db/marker.format-version.000011.012 remove: db/marker.format-version.000010.011 sync: db upgraded to format version: 012 +create: db/marker.format-version.000012.013 +close: db/marker.format-version.000012.013 +remove: db/marker.format-version.000011.012 +sync: db +upgraded to format version: 013 create: db/temporary.000003.dbtmp sync: db/temporary.000003.dbtmp close: db/temporary.000003.dbtmp @@ -251,9 +256,9 @@ close: open-dir: checkpoint link: db/OPTIONS-000003 -> checkpoint/OPTIONS-000003 open-dir: checkpoint -create: checkpoint/marker.format-version.000001.012 -sync-data: checkpoint/marker.format-version.000001.012 -close: checkpoint/marker.format-version.000001.012 +create: checkpoint/marker.format-version.000001.013 +sync-data: checkpoint/marker.format-version.000001.013 +close: checkpoint/marker.format-version.000001.013 sync: checkpoint close: checkpoint link: db/000013.sst -> checkpoint/000013.sst diff --git a/testdata/flushable_ingest b/testdata/flushable_ingest index ff51856d32..41ce3eef0c 100644 --- a/testdata/flushable_ingest +++ b/testdata/flushable_ingest @@ -61,7 +61,7 @@ LOCK MANIFEST-000001 OPTIONS-000003 ext -marker.format-version.000011.012 +marker.format-version.000012.013 marker.manifest.000001.MANIFEST-000001 # Test basic WAL replay @@ -83,7 +83,7 @@ LOCK MANIFEST-000001 OPTIONS-000003 ext -marker.format-version.000011.012 +marker.format-version.000012.013 marker.manifest.000001.MANIFEST-000001 open @@ -392,7 +392,7 @@ LOCK MANIFEST-000001 OPTIONS-000003 ext -marker.format-version.000011.012 +marker.format-version.000012.013 marker.manifest.000001.MANIFEST-000001 close @@ -413,7 +413,7 @@ LOCK MANIFEST-000001 OPTIONS-000003 ext -marker.format-version.000011.012 +marker.format-version.000012.013 marker.manifest.000001.MANIFEST-000001 open @@ -446,7 +446,7 @@ MANIFEST-000001 MANIFEST-000012 OPTIONS-000013 ext -marker.format-version.000011.012 +marker.format-version.000012.013 marker.manifest.000002.MANIFEST-000012 # Make sure that the new mutable memtable can accept writes. @@ -590,7 +590,7 @@ LOCK MANIFEST-000001 OPTIONS-000003 ext -marker.format-version.000011.012 +marker.format-version.000012.013 marker.manifest.000001.MANIFEST-000001 close @@ -610,7 +610,7 @@ MANIFEST-000001 OPTIONS-000003 ext ext1 -marker.format-version.000011.012 +marker.format-version.000012.013 marker.manifest.000001.MANIFEST-000001 ignoreSyncs false