From 74f40ed1593ee567bf1af8e7bb6c807a8c4b9f48 Mon Sep 17 00:00:00 2001 From: Jackson Owens Date: Mon, 27 Feb 2023 11:51:56 -0500 Subject: [PATCH] db: add format major version for flushable ingests Introduce a new format major version that gates uses of flushable ingests and the corresponding backwards-incompatible changes to the write-ahead log. Close #2292. --- format_major_version.go | 21 ++++++++++++++++++--- format_major_version_test.go | 3 +++ ingest.go | 3 ++- open_test.go | 2 +- options.go | 8 ++++---- testdata/checkpoint | 30 +++++++++++++++++------------- testdata/event_listener | 11 ++++++++--- testdata/flushable_ingest | 14 +++++++------- 8 files changed, 60 insertions(+), 32 deletions(-) diff --git a/format_major_version.go b/format_major_version.go index 844b1205ef..02f995bc54 100644 --- a/format_major_version.go +++ b/format_major_version.go @@ -131,11 +131,22 @@ const ( // TableFormatPebblev2. FormatSSTableValueBlocks + // FormatFlushableIngest is a format major version that enables lazy + // addition of ingested sstables into the LSM structure. When an ingest + // overlaps with a memtable, a record of the ingest is written to the WAL + // without waiting for a flush. Subsequent reads treat the ingested files as + // a level above the overlapping memtable. Once the memtable is flushed, the + // ingested files are moved into the lowest possible levels. + // + // This feature is behind a format major version because it required + // breaking changes to the WAL format. + FormatFlushableIngest + // FormatNewest always contains the most recent format major version. // NB: When adding new versions, the MaxTableFormat method should also be // updated to return the maximum allowable version for the new // FormatMajorVersion. - FormatNewest FormatMajorVersion = FormatSSTableValueBlocks + FormatNewest FormatMajorVersion = FormatFlushableIngest ) // MaxTableFormat returns the maximum sstable.TableFormat that can be used at @@ -151,7 +162,7 @@ func (v FormatMajorVersion) MaxTableFormat() sstable.TableFormat { case FormatRangeKeys, FormatMinTableFormatPebblev1, FormatPrePebblev1Marked, FormatPrePebblev1MarkedCompacted: return sstable.TableFormatPebblev2 - case FormatSSTableValueBlocks: + case FormatSSTableValueBlocks, FormatFlushableIngest: return sstable.TableFormatPebblev3 default: panic(fmt.Sprintf("pebble: unsupported format major version: %s", v)) @@ -168,7 +179,8 @@ func (v FormatMajorVersion) MinTableFormat() sstable.TableFormat { FormatRangeKeys: return sstable.TableFormatLevelDB case FormatMinTableFormatPebblev1, FormatPrePebblev1Marked, - FormatPrePebblev1MarkedCompacted, FormatSSTableValueBlocks: + FormatPrePebblev1MarkedCompacted, FormatSSTableValueBlocks, + FormatFlushableIngest: return sstable.TableFormatPebblev1 default: panic(fmt.Sprintf("pebble: unsupported format major version: %s", v)) @@ -294,6 +306,9 @@ var formatMajorVersionMigrations = map[FormatMajorVersion]func(*DB) error{ FormatSSTableValueBlocks: func(d *DB) error { return d.finalizeFormatVersUpgrade(FormatSSTableValueBlocks) }, + FormatFlushableIngest: func(d *DB) error { + return d.finalizeFormatVersUpgrade(FormatFlushableIngest) + }, } const formatVersionMarkerName = `format-version` diff --git a/format_major_version_test.go b/format_major_version_test.go index ce24c0aebe..b862b984bf 100644 --- a/format_major_version_test.go +++ b/format_major_version_test.go @@ -58,6 +58,8 @@ func TestRatchetFormat(t *testing.T) { require.Equal(t, FormatPrePebblev1MarkedCompacted, d.FormatMajorVersion()) require.NoError(t, d.RatchetFormatMajorVersion(FormatSSTableValueBlocks)) require.Equal(t, FormatSSTableValueBlocks, d.FormatMajorVersion()) + require.NoError(t, d.RatchetFormatMajorVersion(FormatFlushableIngest)) + require.Equal(t, FormatFlushableIngest, d.FormatMajorVersion()) require.NoError(t, d.Close()) @@ -219,6 +221,7 @@ func TestFormatMajorVersions_TableFormat(t *testing.T) { FormatPrePebblev1Marked: {sstable.TableFormatPebblev1, sstable.TableFormatPebblev2}, FormatPrePebblev1MarkedCompacted: {sstable.TableFormatPebblev1, sstable.TableFormatPebblev2}, FormatSSTableValueBlocks: {sstable.TableFormatPebblev1, sstable.TableFormatPebblev3}, + FormatFlushableIngest: {sstable.TableFormatPebblev1, sstable.TableFormatPebblev3}, } // Valid versions. diff --git a/ingest.go b/ingest.go index de1d6aeecf..ecb721f880 100644 --- a/ingest.go +++ b/ingest.go @@ -845,7 +845,8 @@ func (d *DB) ingest( m := d.mu.mem.queue[i] if ingestMemtableOverlaps(d.cmp, m, meta) { if (len(d.mu.mem.queue) > d.opts.MemTableStopWritesThreshold-1) || - !d.opts.Experimental.IngestSSTablesAsFlushable { + !d.opts.Experimental.IngestSSTablesAsFlushable || + d.mu.formatVers.vers < FormatFlushableIngest { mem = m if mem.flushable == d.mu.mem.mutable { err = d.makeRoomForWrite(nil) diff --git a/open_test.go b/open_test.go index d3ee7bad01..64cb46adba 100644 --- a/open_test.go +++ b/open_test.go @@ -161,7 +161,7 @@ func TestNewDBFilenames(t *testing.T) { "LOCK", "MANIFEST-000001", "OPTIONS-000003", - "marker.format-version.000011.012", + "marker.format-version.000012.013", "marker.manifest.000001.MANIFEST-000001", }, } diff --git a/options.go b/options.go index 0d2bbbc84d..8c494f41dc 100644 --- a/options.go +++ b/options.go @@ -629,11 +629,11 @@ type Options struct { // IngestSSTablesAsFlushable is used to determine if ingested sstables // should be ingested as a flushable. By default this is false, but it - // is true for all metamorphic tests. + // is true for all metamorphic tests. Only effective if the format major + // version is also at least `FormatFlushableIngest`. // - // TODO(bananabrick): Remove this field and enable by default once - // https://github.com/cockroachdb/pebble/issues/2292 and - // https://github.com/cockroachdb/pebble/issues/2266 are closed. + // TODO(bananabrick,jackson): Remove this field and unconditionally + // ingest as flushable. IngestSSTablesAsFlushable bool // SharedStorage is a second FS-like storage medium that can be shared diff --git a/testdata/checkpoint b/testdata/checkpoint index 099318356a..2872d98f8a 100644 --- a/testdata/checkpoint +++ b/testdata/checkpoint @@ -68,6 +68,10 @@ create: db/marker.format-version.000011.012 close: db/marker.format-version.000011.012 remove: db/marker.format-version.000010.011 sync: db +create: db/marker.format-version.000012.013 +close: db/marker.format-version.000012.013 +remove: db/marker.format-version.000011.012 +sync: db create: db/temporary.000003.dbtmp sync: db/temporary.000003.dbtmp close: db/temporary.000003.dbtmp @@ -130,9 +134,9 @@ close: open-dir: checkpoints/checkpoint1 link: db/OPTIONS-000003 -> checkpoints/checkpoint1/OPTIONS-000003 open-dir: checkpoints/checkpoint1 -create: checkpoints/checkpoint1/marker.format-version.000001.012 -sync-data: checkpoints/checkpoint1/marker.format-version.000001.012 -close: checkpoints/checkpoint1/marker.format-version.000001.012 +create: checkpoints/checkpoint1/marker.format-version.000001.013 +sync-data: checkpoints/checkpoint1/marker.format-version.000001.013 +close: checkpoints/checkpoint1/marker.format-version.000001.013 sync: checkpoints/checkpoint1 close: checkpoints/checkpoint1 link: db/000005.sst -> checkpoints/checkpoint1/000005.sst @@ -166,9 +170,9 @@ close: checkpoints open-dir: checkpoints/checkpoint2 link: db/OPTIONS-000003 -> checkpoints/checkpoint2/OPTIONS-000003 open-dir: checkpoints/checkpoint2 -create: checkpoints/checkpoint2/marker.format-version.000001.012 -sync-data: checkpoints/checkpoint2/marker.format-version.000001.012 -close: checkpoints/checkpoint2/marker.format-version.000001.012 +create: checkpoints/checkpoint2/marker.format-version.000001.013 +sync-data: checkpoints/checkpoint2/marker.format-version.000001.013 +close: checkpoints/checkpoint2/marker.format-version.000001.013 sync: checkpoints/checkpoint2 close: checkpoints/checkpoint2 link: db/000007.sst -> checkpoints/checkpoint2/000007.sst @@ -197,9 +201,9 @@ close: checkpoints open-dir: checkpoints/checkpoint3 link: db/OPTIONS-000003 -> checkpoints/checkpoint3/OPTIONS-000003 open-dir: checkpoints/checkpoint3 -create: checkpoints/checkpoint3/marker.format-version.000001.012 -sync-data: checkpoints/checkpoint3/marker.format-version.000001.012 -close: checkpoints/checkpoint3/marker.format-version.000001.012 +create: checkpoints/checkpoint3/marker.format-version.000001.013 +sync-data: checkpoints/checkpoint3/marker.format-version.000001.013 +close: checkpoints/checkpoint3/marker.format-version.000001.013 sync: checkpoints/checkpoint3 close: checkpoints/checkpoint3 link: db/000005.sst -> checkpoints/checkpoint3/000005.sst @@ -253,7 +257,7 @@ CURRENT LOCK MANIFEST-000001 OPTIONS-000003 -marker.format-version.000011.012 +marker.format-version.000012.013 marker.manifest.000001.MANIFEST-000001 list checkpoints/checkpoint1 @@ -263,7 +267,7 @@ list checkpoints/checkpoint1 000007.sst MANIFEST-000001 OPTIONS-000003 -marker.format-version.000001.012 +marker.format-version.000001.013 marker.manifest.000001.MANIFEST-000001 open checkpoints/checkpoint1 readonly @@ -304,7 +308,7 @@ list checkpoints/checkpoint2 000007.sst MANIFEST-000001 OPTIONS-000003 -marker.format-version.000001.012 +marker.format-version.000001.013 marker.manifest.000001.MANIFEST-000001 open checkpoints/checkpoint2 readonly @@ -332,7 +336,7 @@ list checkpoints/checkpoint3 000007.sst MANIFEST-000001 OPTIONS-000003 -marker.format-version.000001.012 +marker.format-version.000001.013 marker.manifest.000001.MANIFEST-000001 open checkpoints/checkpoint3 readonly diff --git a/testdata/event_listener b/testdata/event_listener index 8c167923a1..7ed11522e4 100644 --- a/testdata/event_listener +++ b/testdata/event_listener @@ -83,6 +83,11 @@ close: db/marker.format-version.000011.012 remove: db/marker.format-version.000010.011 sync: db upgraded to format version: 012 +create: db/marker.format-version.000012.013 +close: db/marker.format-version.000012.013 +remove: db/marker.format-version.000011.012 +sync: db +upgraded to format version: 013 create: db/temporary.000003.dbtmp sync: db/temporary.000003.dbtmp close: db/temporary.000003.dbtmp @@ -251,9 +256,9 @@ close: open-dir: checkpoint link: db/OPTIONS-000003 -> checkpoint/OPTIONS-000003 open-dir: checkpoint -create: checkpoint/marker.format-version.000001.012 -sync-data: checkpoint/marker.format-version.000001.012 -close: checkpoint/marker.format-version.000001.012 +create: checkpoint/marker.format-version.000001.013 +sync-data: checkpoint/marker.format-version.000001.013 +close: checkpoint/marker.format-version.000001.013 sync: checkpoint close: checkpoint link: db/000013.sst -> checkpoint/000013.sst diff --git a/testdata/flushable_ingest b/testdata/flushable_ingest index ff51856d32..41ce3eef0c 100644 --- a/testdata/flushable_ingest +++ b/testdata/flushable_ingest @@ -61,7 +61,7 @@ LOCK MANIFEST-000001 OPTIONS-000003 ext -marker.format-version.000011.012 +marker.format-version.000012.013 marker.manifest.000001.MANIFEST-000001 # Test basic WAL replay @@ -83,7 +83,7 @@ LOCK MANIFEST-000001 OPTIONS-000003 ext -marker.format-version.000011.012 +marker.format-version.000012.013 marker.manifest.000001.MANIFEST-000001 open @@ -392,7 +392,7 @@ LOCK MANIFEST-000001 OPTIONS-000003 ext -marker.format-version.000011.012 +marker.format-version.000012.013 marker.manifest.000001.MANIFEST-000001 close @@ -413,7 +413,7 @@ LOCK MANIFEST-000001 OPTIONS-000003 ext -marker.format-version.000011.012 +marker.format-version.000012.013 marker.manifest.000001.MANIFEST-000001 open @@ -446,7 +446,7 @@ MANIFEST-000001 MANIFEST-000012 OPTIONS-000013 ext -marker.format-version.000011.012 +marker.format-version.000012.013 marker.manifest.000002.MANIFEST-000012 # Make sure that the new mutable memtable can accept writes. @@ -590,7 +590,7 @@ LOCK MANIFEST-000001 OPTIONS-000003 ext -marker.format-version.000011.012 +marker.format-version.000012.013 marker.manifest.000001.MANIFEST-000001 close @@ -610,7 +610,7 @@ MANIFEST-000001 OPTIONS-000003 ext ext1 -marker.format-version.000011.012 +marker.format-version.000012.013 marker.manifest.000001.MANIFEST-000001 ignoreSyncs false