diff --git a/internal/manifest/testdata/virtual_backings b/internal/manifest/testdata/virtual_backings index c67d98fac2..d4fe55384d 100644 --- a/internal/manifest/testdata/virtual_backings +++ b/internal/manifest/testdata/virtual_backings @@ -2,14 +2,14 @@ add n=1 size=100 ---- 1 virtual backings, total size 100: - 000001: size=100 useCount=0 virtualizedSize=0 + 000001: size=100 useCount=0 protectionCount=0 virtualizedSize=0 unused virtual backings: 000001 add n=2 size=200 ---- 2 virtual backings, total size 300: - 000001: size=100 useCount=0 virtualizedSize=0 - 000002: size=200 useCount=0 virtualizedSize=0 + 000001: size=100 useCount=0 protectionCount=0 virtualizedSize=0 + 000002: size=200 useCount=0 protectionCount=0 virtualizedSize=0 unused virtual backings: 000001 000002 add n=2 size=200 @@ -19,51 +19,107 @@ pebble: trying to add an existing file backing add-table n=1 size=10 ---- 2 virtual backings, total size 300: - 000001: size=100 useCount=1 virtualizedSize=10 - 000002: size=200 useCount=0 virtualizedSize=0 + 000001: size=100 useCount=1 protectionCount=0 virtualizedSize=10 + 000002: size=200 useCount=0 protectionCount=0 virtualizedSize=0 unused virtual backings: 000002 add-table n=1 size=10 ---- 2 virtual backings, total size 300: - 000001: size=100 useCount=2 virtualizedSize=20 - 000002: size=200 useCount=0 virtualizedSize=0 + 000001: size=100 useCount=2 protectionCount=0 virtualizedSize=20 + 000002: size=200 useCount=0 protectionCount=0 virtualizedSize=0 unused virtual backings: 000002 add-table n=1 size=10 ---- 2 virtual backings, total size 300: - 000001: size=100 useCount=3 virtualizedSize=30 - 000002: size=200 useCount=0 virtualizedSize=0 + 000001: size=100 useCount=3 protectionCount=0 virtualizedSize=30 + 000002: size=200 useCount=0 protectionCount=0 virtualizedSize=0 unused virtual backings: 000002 remove n=1 ---- -backing 000001 still in use (useCount=3) +backing 000001 still in use (useCount=3 protectionCount=0) remove-table n=1 size=10 ---- 2 
virtual backings, total size 300: - 000001: size=100 useCount=2 virtualizedSize=20 - 000002: size=200 useCount=0 virtualizedSize=0 + 000001: size=100 useCount=2 protectionCount=0 virtualizedSize=20 + 000002: size=200 useCount=0 protectionCount=0 virtualizedSize=0 unused virtual backings: 000002 remove-table n=1 size=10 ---- 2 virtual backings, total size 300: - 000001: size=100 useCount=1 virtualizedSize=10 - 000002: size=200 useCount=0 virtualizedSize=0 + 000001: size=100 useCount=1 protectionCount=0 virtualizedSize=10 + 000002: size=200 useCount=0 protectionCount=0 virtualizedSize=0 unused virtual backings: 000002 remove-table n=1 size=10 ---- 2 virtual backings, total size 300: - 000001: size=100 useCount=0 virtualizedSize=0 - 000002: size=200 useCount=0 virtualizedSize=0 + 000001: size=100 useCount=0 protectionCount=0 virtualizedSize=0 + 000002: size=200 useCount=0 protectionCount=0 virtualizedSize=0 unused virtual backings: 000001 000002 remove n=1 ---- 1 virtual backings, total size 200: - 000002: size=200 useCount=0 virtualizedSize=0 + 000002: size=200 useCount=0 protectionCount=0 virtualizedSize=0 +unused virtual backings: 000002 + +protect n=2 +---- +1 virtual backings, total size 200: + 000002: size=200 useCount=0 protectionCount=1 virtualizedSize=0 + +protect n=2 +---- +1 virtual backings, total size 200: + 000002: size=200 useCount=0 protectionCount=2 virtualizedSize=0 + +unprotect n=2 +---- +1 virtual backings, total size 200: + 000002: size=200 useCount=0 protectionCount=1 virtualizedSize=0 + +remove n=2 +---- +backing 000002 still in use (useCount=0 protectionCount=1) + +unprotect n=2 +---- +1 virtual backings, total size 200: + 000002: size=200 useCount=0 protectionCount=0 virtualizedSize=0 +unused virtual backings: 000002 + +add-table n=2 size=10 +---- +1 virtual backings, total size 200: + 000002: size=200 useCount=1 protectionCount=0 virtualizedSize=10 + +add-table n=2 size=10 +---- +1 virtual backings, total size 200: + 000002: size=200 
useCount=2 protectionCount=0 virtualizedSize=20 + +protect n=2 +---- +1 virtual backings, total size 200: + 000002: size=200 useCount=2 protectionCount=1 virtualizedSize=20 + +remove-table n=2 size=10 +---- +1 virtual backings, total size 200: + 000002: size=200 useCount=1 protectionCount=1 virtualizedSize=10 + +remove-table n=2 size=10 +---- +1 virtual backings, total size 200: + 000002: size=200 useCount=0 protectionCount=1 virtualizedSize=0 + +unprotect n=2 +---- +1 virtual backings, total size 200: + 000002: size=200 useCount=0 protectionCount=0 virtualizedSize=0 unused virtual backings: 000002 diff --git a/internal/manifest/version.go b/internal/manifest/version.go index 4fbb4873b8..8aa87a5cc8 100644 --- a/internal/manifest/version.go +++ b/internal/manifest/version.go @@ -396,6 +396,9 @@ type FileBacking struct { // backing across all versions that have a non-zero reference count. The tables // in each version are maintained in a copy-on-write B-tree and each B-tree node // keeps a reference on the respective backings. + // + // In addition, a reference count is taken for every backing in the latest + // version's VirtualBackings (necessary to support Protect/Unprotect). refs atomic.Int32 } diff --git a/internal/manifest/version_edit.go b/internal/manifest/version_edit.go index 95d0bfa2f5..6430deaec4 100644 --- a/internal/manifest/version_edit.go +++ b/internal/manifest/version_edit.go @@ -924,9 +924,15 @@ func (b *BulkVersionEdit) Accumulate(ve *VersionEdit) error { } } - // Since a file can be removed from backing files in exactly one version - // edit it is safe to just append without any de-duplication. - b.RemovedFileBacking = append(b.RemovedFileBacking, ve.RemovedBackingTables...) + for _, n := range ve.RemovedBackingTables { + if _, ok := b.AddedFileBacking[n]; ok { + delete(b.AddedFileBacking, n) + } else { + // Since a file can be removed from backing files in exactly one version + // edit it is safe to just append without any de-duplication. 
+ b.RemovedFileBacking = append(b.RemovedFileBacking, n) + } + } return nil } diff --git a/internal/manifest/virtual_backings.go b/internal/manifest/virtual_backings.go index 50b8b30590..110b9f314c 100644 --- a/internal/manifest/virtual_backings.go +++ b/internal/manifest/virtual_backings.go @@ -15,7 +15,7 @@ import ( ) // VirtualBackings maintains information about the set of backings that support -// virtual tables in a given version. +// virtual tables in the latest version. // // The VirtualBackings set internally maintains for each backing the number of // virtual tables that use that backing and the sum of their virtual sizes. When @@ -23,6 +23,48 @@ import ( // tables. AddTable/RemoveTable are used to maintain the set of tables that are // associated with a backing. Finally, a backing can only be removed from the // set when it is no longer in use. +// +// -- Protection API -- +// +// VirtualBackings exposes a Protect/Unprotect API. This is used to allow +// external file ingestions to reuse existing virtual backings. Because +// ingestions can run in parallel with other operations like compactions, it is +// possible for a backing to "go away" in-between the time the ingestion decides +// to use it and the time the ingestion installs a new version. The protection +// API solves this problem by keeping backings alive, even if they become +// otherwise unused by any tables. +// +// Backing protection achieves two goals: +// - it must prevent the removal of the backing from the latest version, where +// removal means becoming part of a VersionEdit.RemovedBackingTables. This +// is achieved by treating the backing as "in use", preventing Unused() from +// reporting it. +// - it must prevent the backing from becoming obsolete (i.e. reaching a ref +// count of 0). To achieve this, VirtualBackings takes a ref on each backing +// when it is added; this ref must be released after the backing is removed +// (when it is ok for the backing to be reported as obsolete). 
+// +// For example, say we have virtual table T1 with backing B1 and an ingestion tries +// to reuse the file. This is what will usually happen (the happy case): +// - latest version is V1 and it contains T1(B1). +// - ingestion request comes for another virtual portion of B1. Ingestion process +// finds B1 and calls Protect(B1). +// - ingestion completes, installs version V2 which has T1(B1) and a new +// T2(B1), and calls Unprotect(B1). +// +// In this path, the Protect/Unprotect calls do nothing. But here is what could +// happen (the corner case): +// - latest version is V1 and it contains T1(B1). +// - ingestion request comes for another virtual portion of B1. Ingestion process +// finds B1 and calls Protect(B1). +// - compaction completes and installs version V2 which no longer has T1. +// But because B1 is protected, V2 still has B1. +// - ingestion completes, installs version V3 which has a new T2(B1) and calls +// Unprotect(B1). +// +// If instead the ingestion fails to complete, the last step becomes: +// - ingestion fails, calls Unprotect(B1). B1 is now Unused() and the next +// version (applied by whatever the next operation is) will remove B1. type VirtualBackings struct { m map[base.DiskFileNum]backingWithMetadata @@ -46,16 +88,25 @@ type backingWithMetadata struct { // A backing initially has a useCount of 0. The useCount is increased by // AddTable and decreased by RemoveTable. Backings that have useCount=0 are // reported by Unused(). - useCount int + useCount int32 + // protectionCount is used by Protect to temporarily prevent a backing from + // being reported as unused. + protectionCount int32 + // virtualizedSize is the sum of the sizes of the useCount virtual tables + // associated with this backing. virtualizedSize uint64 } -// Add adds a new backing that will be used by virtual tables. Another +// AddAndRef adds a new backing to the set and takes a reference on it. Another // backing for the same DiskFilNum must not exist.
// -// The added backing is unused until it is associated with a table via AddTable. -func (bv *VirtualBackings) Add(backing *FileBacking) { +// The added backing is unused until it is associated with a table via AddTable +// or protected via Protect. +func (bv *VirtualBackings) AddAndRef(backing *FileBacking) { + // We take a reference on the backing because in case of protected backings + // (see Protect), we might be the only ones holding on to a backing. + backing.Ref() bv.mustAdd(backingWithMetadata{ backing: backing, }) @@ -63,12 +114,17 @@ func (bv *VirtualBackings) Add(backing *FileBacking) { bv.totalSize += backing.Size } -// Remove a backing; the backing must not be in use. Normally backings are -// removed once they are reported by Unused(). +// Remove removes a backing. The backing must not be in use; normally backings +// are removed once they are reported by Unused(). +// +// It is up to the caller to release the reference taken by AddAndRef. func (bv *VirtualBackings) Remove(n base.DiskFileNum) { v := bv.mustGet(n) if v.inUse() { - panic(errors.AssertionFailedf("backing %s still in use (useCount=%d)", v.backing.DiskFileNum, v.useCount)) + panic(errors.AssertionFailedf( + "backing %s still in use (useCount=%d protectionCount=%d)", + v.backing.DiskFileNum, v.useCount, v.protectionCount, + )) } delete(bv.m, n) delete(bv.unused, v.backing) @@ -98,6 +154,9 @@ func (bv *VirtualBackings) RemoveTable(m *FileMetadata) { } v := bv.mustGet(m.FileBacking.DiskFileNum) + if v.useCount <= 0 { + panic(errors.AssertionFailedf("invalid useCount")) + } v.useCount-- v.virtualizedSize -= m.Size bv.m[m.FileBacking.DiskFileNum] = v @@ -106,6 +165,34 @@ func (bv *VirtualBackings) RemoveTable(m *FileMetadata) { } } +// Protect prevents a backing from being reported as unused until a +// corresponding Unprotect call is made. The backing must be in the set.
+// +// Multiple Protect calls can be made for the same backing; each must have a +// corresponding Unprotect call before the backing can become unused. +func (bv *VirtualBackings) Protect(n base.DiskFileNum) { + v := bv.mustGet(n) + if !v.inUse() { + delete(bv.unused, v.backing) + } + v.protectionCount++ + bv.m[n] = v +} + +// Unprotect reverses a Protect call. +func (bv *VirtualBackings) Unprotect(n base.DiskFileNum) { + v := bv.mustGet(n) + + if v.protectionCount <= 0 { + panic(errors.AssertionFailedf("invalid protectionCount")) + } + v.protectionCount-- + bv.m[n] = v + if !v.inUse() { + bv.unused[v.backing] = struct{}{} + } +} + // Stats returns the number and total size of all the virtual backings. func (bv *VirtualBackings) Stats() (count int, totalSize uint64) { return len(bv.m), bv.totalSize @@ -123,10 +210,11 @@ func (bv *VirtualBackings) Stats() (count int, totalSize uint64) { // virtual sstable with a higher priority. func (bv *VirtualBackings) Usage(n base.DiskFileNum) (useCount int, virtualizedSize uint64) { v := bv.mustGet(n) - return v.useCount, v.virtualizedSize + return int(v.useCount), v.virtualizedSize } -// Unused returns all backings that are not in use, in DiskFileNum order. +// Unused returns all backings that are no longer used by the latest version +// and are not protected, in DiskFileNum order. func (bv *VirtualBackings) Unused() []*FileBacking { res := make([]*FileBacking, 0, len(bv.unused)) for b := range bv.unused { @@ -138,7 +226,16 @@ func (bv *VirtualBackings) Unused() []*FileBacking { return res } -// ForEach calls fn on each backing, in an unspecified order. +// Get returns the backing with the given DiskFileNum, if it is in the set. +func (bv *VirtualBackings) Get(n base.DiskFileNum) (_ *FileBacking, ok bool) { + v, ok := bv.m[n] + if ok { + return v.backing, true + } + return nil, false +} + +// ForEach calls fn on each backing, in unspecified order.
func (bv *VirtualBackings) ForEach(fn func(backing *FileBacking)) { for _, v := range bv.m { fn(v.backing) @@ -156,6 +253,15 @@ func (bv *VirtualBackings) DiskFileNums() []base.DiskFileNum { return res } +// Backings returns all backings in the set, in unspecified order. +func (bv *VirtualBackings) Backings() []*FileBacking { + res := make([]*FileBacking, 0, len(bv.m)) + for _, v := range bv.m { + res = append(res, v.backing) + } + return res +} + func (bv *VirtualBackings) String() string { nums := bv.DiskFileNums() @@ -167,7 +273,8 @@ func (bv *VirtualBackings) String() string { fmt.Fprintf(&buf, "%d virtual backings, total size %d:\n", count, totalSize) for _, n := range nums { v := bv.m[n] - fmt.Fprintf(&buf, " %s: size=%d useCount=%d virtualizedSize=%d\n", n, v.backing.Size, v.useCount, v.virtualizedSize) + fmt.Fprintf(&buf, " %s: size=%d useCount=%d protectionCount=%d virtualizedSize=%d\n", + n, v.backing.Size, v.useCount, v.protectionCount, v.virtualizedSize) } } unused := bv.Unused() @@ -199,5 +306,5 @@ func (bv *VirtualBackings) mustGet(n base.DiskFileNum) backingWithMetadata { // inUse returns true if b is used to back at least one virtual table. 
func (v *backingWithMetadata) inUse() bool { - return v.useCount > 0 + return v.useCount > 0 || v.protectionCount > 0 } diff --git a/internal/manifest/virtual_backings_test.go b/internal/manifest/virtual_backings_test.go index 260005ce86..9252e17b55 100644 --- a/internal/manifest/virtual_backings_test.go +++ b/internal/manifest/virtual_backings_test.go @@ -28,7 +28,7 @@ func TestVirtualBackings(t *testing.T) { switch d.Cmd { case "add": - bv.Add(&FileBacking{ + bv.AddAndRef(&FileBacking{ DiskFileNum: n, Size: size, }) @@ -50,6 +50,12 @@ func TestVirtualBackings(t *testing.T) { Size: size, }) + case "protect": + bv.Protect(n) + + case "unprotect": + bv.Unprotect(n) + default: d.Fatalf(t, "unknown command %q", d.Cmd) } diff --git a/testdata/version_set b/testdata/version_set index 43aedf3424..4ca657a8ac 100644 --- a/testdata/version_set +++ b/testdata/version_set @@ -2,18 +2,24 @@ apply add-table: L2 000001:[a#1,SET-c#1,SET] add-table: L2 000002:[e#1,SET-h#1,SET] ---- -L2: - 000001:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:100 - 000002:[e#1,SET-h#1,SET] seqnums:[0-0] points:[e#1,SET-h#1,SET] size:200 +applied: + last-seq-num: 99 + add-table: L2 000001:[a#1,SET-c#1,SET] + add-table: L2 000002:[e#1,SET-h#1,SET] +current version: + L2: + 000001:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:100 + 000002:[e#1,SET-h#1,SET] seqnums:[0-0] points:[e#1,SET-h#1,SET] size:200 no virtual backings no zombie tables no obsolete tables reopen ---- -L2: - 000001:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:100 - 000002:[e#1,SET-h#1,SET] seqnums:[0-0] points:[e#1,SET-h#1,SET] size:200 +current version: + L2: + 000001:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:100 + 000002:[e#1,SET-h#1,SET] seqnums:[0-0] points:[e#1,SET-h#1,SET] size:200 no virtual backings no zombie tables no obsolete tables @@ -24,11 +30,17 @@ apply add-table: L2 000003(000002):[e#1,SET-h#1,SET] add-backing: 000002 ---- -L2: - 
000001:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:100 - 000003(000002):[e#1,SET-h#1,SET] seqnums:[0-0] points:[e#1,SET-h#1,SET] size:300 +applied: + last-seq-num: 99 + del-table: L2 000002 + add-table: L2 000003(000002):[e#1,SET-h#1,SET] + add-backing: 000002 +current version: + L2: + 000001:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:100 + 000003(000002):[e#1,SET-h#1,SET] seqnums:[0-0] points:[e#1,SET-h#1,SET] size:300 1 virtual backings, total size 2000: - 000002: size=2000 useCount=1 virtualizedSize=300 + 000002: size=2000 useCount=1 protectionCount=0 virtualizedSize=300 no zombie tables no obsolete tables @@ -36,12 +48,16 @@ no obsolete tables apply add-table: L2 000004(000002):[i#1,SET-k#1,SET] ---- -L2: - 000001:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:100 - 000003(000002):[e#1,SET-h#1,SET] seqnums:[0-0] points:[e#1,SET-h#1,SET] size:300 - 000004(000002):[i#1,SET-k#1,SET] seqnums:[0-0] points:[i#1,SET-k#1,SET] size:400 +applied: + last-seq-num: 99 + add-table: L2 000004(000002):[i#1,SET-k#1,SET] +current version: + L2: + 000001:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:100 + 000003(000002):[e#1,SET-h#1,SET] seqnums:[0-0] points:[e#1,SET-h#1,SET] size:300 + 000004(000002):[i#1,SET-k#1,SET] seqnums:[0-0] points:[i#1,SET-k#1,SET] size:400 1 virtual backings, total size 2000: - 000002: size=2000 useCount=2 virtualizedSize=700 + 000002: size=2000 useCount=2 protectionCount=0 virtualizedSize=700 no zombie tables no obsolete tables @@ -50,13 +66,18 @@ apply del-table: L2 000003 add-table: L3 000003(000002):[e#1,SET-h#1,SET] ---- -L2: - 000001:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:100 - 000004(000002):[i#1,SET-k#1,SET] seqnums:[0-0] points:[i#1,SET-k#1,SET] size:400 -L3: - 000003(000002):[e#1,SET-h#1,SET] seqnums:[0-0] points:[e#1,SET-h#1,SET] size:300 +applied: + last-seq-num: 99 + del-table: L2 000003 + add-table: L3 000003(000002):[e#1,SET-h#1,SET] +current 
version: + L2: + 000001:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:100 + 000004(000002):[i#1,SET-k#1,SET] seqnums:[0-0] points:[i#1,SET-k#1,SET] size:400 + L3: + 000003(000002):[e#1,SET-h#1,SET] seqnums:[0-0] points:[e#1,SET-h#1,SET] size:300 1 virtual backings, total size 2000: - 000002: size=2000 useCount=2 virtualizedSize=700 + 000002: size=2000 useCount=2 protectionCount=0 virtualizedSize=700 no zombie tables no obsolete tables @@ -64,21 +85,26 @@ no obsolete tables apply del-table: L3 000003 ---- -L2: - 000001:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:100 - 000004(000002):[i#1,SET-k#1,SET] seqnums:[0-0] points:[i#1,SET-k#1,SET] size:400 +applied: + last-seq-num: 99 + del-table: L3 000003 +current version: + L2: + 000001:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:100 + 000004(000002):[i#1,SET-k#1,SET] seqnums:[0-0] points:[i#1,SET-k#1,SET] size:400 1 virtual backings, total size 2000: - 000002: size=2000 useCount=1 virtualizedSize=400 + 000002: size=2000 useCount=1 protectionCount=0 virtualizedSize=400 no zombie tables no obsolete tables reopen ---- -L2: - 000001:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:100 - 000004(000002):[i#1,SET-k#1,SET] seqnums:[0-0] points:[i#1,SET-k#1,SET] size:400 +current version: + L2: + 000001:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:100 + 000004(000002):[i#1,SET-k#1,SET] seqnums:[0-0] points:[i#1,SET-k#1,SET] size:400 1 virtual backings, total size 2000: - 000002: size=2000 useCount=1 virtualizedSize=400 + 000002: size=2000 useCount=1 protectionCount=0 virtualizedSize=400 no zombie tables no obsolete tables @@ -86,34 +112,45 @@ no obsolete tables apply del-table: L2 000004 ---- -L2: - 000001:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:100 +applied: + last-seq-num: 99 + del-table: L2 000004 + del-backing: 000002 +current version: + L2: + 000001:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:100 no 
virtual backings zombie tables: 000002 obsolete tables: 000002 # Add a virtual table with a new backing (like an ingestion would). apply - add-table: L1 000005(000010):[u#1,SET-v#1,SET] - add-backing: 000010 ----- -L1: - 000005(000010):[u#1,SET-v#1,SET] seqnums:[0-0] points:[u#1,SET-v#1,SET] size:500 -L2: - 000001:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:100 -1 virtual backings, total size 10000: - 000010: size=10000 useCount=1 virtualizedSize=500 + add-table: L1 000005(000100):[u#1,SET-v#1,SET] + add-backing: 000100 +---- +applied: + last-seq-num: 99 + add-table: L1 000005(000100):[u#1,SET-v#1,SET] + add-backing: 000100 +current version: + L1: + 000005(000100):[u#1,SET-v#1,SET] seqnums:[0-0] points:[u#1,SET-v#1,SET] size:500 + L2: + 000001:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:100 +1 virtual backings, total size 100000: + 000100: size=100000 useCount=1 protectionCount=0 virtualizedSize=500 zombie tables: 000002 obsolete tables: 000002 -ref r1 +ref-version r1 ---- -L1: - 000005(000010):[u#1,SET-v#1,SET] seqnums:[0-0] points:[u#1,SET-v#1,SET] size:500 -L2: - 000001:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:100 -1 virtual backings, total size 10000: - 000010: size=10000 useCount=1 virtualizedSize=500 +current version: + L1: + 000005(000100):[u#1,SET-v#1,SET] seqnums:[0-0] points:[u#1,SET-v#1,SET] size:500 + L2: + 000001:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:100 +1 virtual backings, total size 100000: + 000100: size=100000 useCount=1 protectionCount=0 virtualizedSize=500 zombie tables: 000002 obsolete tables: 000002 @@ -122,17 +159,188 @@ obsolete tables: 000002 apply del-table: L1 000005 ---- -L2: - 000001:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:100 +applied: + last-seq-num: 99 + del-table: L1 000005 + del-backing: 000100 +current version: + L2: + 000001:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:100 no virtual backings -zombie tables: 
000002 000010 +zombie tables: 000002 000100 obsolete tables: 000002 # The backing is now obsolete. -unref r1 +unref-version r1 +---- +current version: + L2: + 000001:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:100 +no virtual backings +zombie tables: 000002 000100 +obsolete tables: 000002 000100 + +# Test backing protection mechanism. + +apply + add-table: L1 000006(000101):[u#1,SET-v#1,SET] + add-table: L1 000007(000101):[w#1,SET-x#1,SET] + add-backing: 000101 +---- +applied: + last-seq-num: 99 + add-table: L1 000006(000101):[u#1,SET-v#1,SET] + add-table: L1 000007(000101):[w#1,SET-x#1,SET] + add-backing: 000101 +current version: + L1: + 000006(000101):[u#1,SET-v#1,SET] seqnums:[0-0] points:[u#1,SET-v#1,SET] size:600 + 000007(000101):[w#1,SET-x#1,SET] seqnums:[0-0] points:[w#1,SET-x#1,SET] size:700 + L2: + 000001:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:100 +1 virtual backings, total size 101000: + 000101: size=101000 useCount=2 protectionCount=0 virtualizedSize=1300 +zombie tables: 000002 000100 +obsolete tables: 000002 000100 + +protect-backing 101 +---- +current version: + L1: + 000006(000101):[u#1,SET-v#1,SET] seqnums:[0-0] points:[u#1,SET-v#1,SET] size:600 + 000007(000101):[w#1,SET-x#1,SET] seqnums:[0-0] points:[w#1,SET-x#1,SET] size:700 + L2: + 000001:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:100 +1 virtual backings, total size 101000: + 000101: size=101000 useCount=2 protectionCount=1 virtualizedSize=1300 +zombie tables: 000002 000100 +obsolete tables: 000002 000100 + +# We should not see a "del-backing" field here. 
+apply + del-table: L1 000006 + del-table: L1 000007 +---- +applied: + last-seq-num: 99 + del-table: L1 000006 + del-table: L1 000007 +current version: + L2: + 000001:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:100 +1 virtual backings, total size 101000: + 000101: size=101000 useCount=0 protectionCount=1 virtualizedSize=0 +zombie tables: 000002 000100 +obsolete tables: 000002 000100 + +unprotect-backing 101 +---- +current version: + L2: + 000001:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:100 +1 virtual backings, total size 101000: + 000101: size=101000 useCount=0 protectionCount=0 virtualizedSize=0 +unused virtual backings: 000101 +zombie tables: 000002 000100 +obsolete tables: 000002 000100 + +# Whatever this next apply is, it should remove the unused backing. +apply + add-table: L3 000008:[a#1,SET-c#1,SET] +---- +applied: + last-seq-num: 99 + add-table: L3 000008:[a#1,SET-c#1,SET] + del-backing: 000101 +current version: + L2: + 000001:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:100 + L3: + 000008:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:800 +no virtual backings +zombie tables: 000002 000100 000101 +obsolete tables: 000002 000100 000101 + +# Test handling of leaked protected backings. 
+ +apply + add-table: L1 000009(000102):[u#1,SET-v#1,SET] + add-backing: 000102 +---- +applied: + last-seq-num: 99 + add-table: L1 000009(000102):[u#1,SET-v#1,SET] + add-backing: 000102 +current version: + L1: + 000009(000102):[u#1,SET-v#1,SET] seqnums:[0-0] points:[u#1,SET-v#1,SET] size:900 + L2: + 000001:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:100 + L3: + 000008:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:800 +1 virtual backings, total size 102000: + 000102: size=102000 useCount=1 protectionCount=0 virtualizedSize=900 +zombie tables: 000002 000100 000101 +obsolete tables: 000002 000100 000101 + +protect-backing 102 +---- +current version: + L1: + 000009(000102):[u#1,SET-v#1,SET] seqnums:[0-0] points:[u#1,SET-v#1,SET] size:900 + L2: + 000001:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:100 + L3: + 000008:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:800 +1 virtual backings, total size 102000: + 000102: size=102000 useCount=1 protectionCount=1 virtualizedSize=900 +zombie tables: 000002 000100 000101 +obsolete tables: 000002 000100 000101 + +apply + del-table: L1 000009 +---- +applied: + last-seq-num: 99 + del-table: L1 000009 +current version: + L2: + 000001:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:100 + L3: + 000008:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:800 +1 virtual backings, total size 102000: + 000102: size=102000 useCount=0 protectionCount=1 virtualizedSize=0 +zombie tables: 000002 000100 000101 +obsolete tables: 000002 000100 000101 + +# Upon reopen, we still have a record of backing 102. 
+reopen +---- +current version: + L2: + 000001:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:100 + L3: + 000008:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:800 +1 virtual backings, total size 102000: + 000102: size=102000 useCount=0 protectionCount=0 virtualizedSize=0 +unused virtual backings: 000102 +no zombie tables +no obsolete tables + +# Whatever this next apply is, it should remove the leaked backing. +apply + add-table: L3 000010:[d#1,SET-e#1,SET] ---- -L2: - 000001:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:100 +applied: + last-seq-num: 99 + add-table: L3 000010:[d#1,SET-e#1,SET] + del-backing: 000102 +current version: + L2: + 000001:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:100 + L3: + 000008:[a#1,SET-c#1,SET] seqnums:[0-0] points:[a#1,SET-c#1,SET] size:800 + 000010:[d#1,SET-e#1,SET] seqnums:[0-0] points:[d#1,SET-e#1,SET] size:1000 no virtual backings -zombie tables: 000002 000010 -obsolete tables: 000002 000010 +zombie tables: 000102 +obsolete tables: 000102 diff --git a/version_set.go b/version_set.go index d99c84847d..d56aab1a88 100644 --- a/version_set.go +++ b/version_set.go @@ -168,7 +168,7 @@ func (vs *versionSet) create( // Note that a "snapshot" version edit is written to the manifest when it is // created. vs.manifestFileNum = vs.getNextDiskFileNum() - err = vs.createManifest(vs.dirname, vs.manifestFileNum, vs.minUnflushedLogNum, vs.nextFileNum) + err = vs.createManifest(vs.dirname, vs.manifestFileNum, vs.minUnflushedLogNum, vs.nextFileNum, nil /* virtualBackings */) if err == nil { if err = vs.manifest.Flush(); err != nil { vs.opts.Logger.Fatalf("MANIFEST flush failed: %v", err) @@ -295,12 +295,8 @@ func (vs *versionSet) load( // Populate the fileBackingMap and the FileBacking for virtual sstables since // we have finished version edit accumulation. 
- for _, s := range bve.AddedFileBacking { - vs.virtualBackings.Add(s) - } - - for _, diskFileNum := range bve.RemovedFileBacking { - vs.virtualBackings.Remove(diskFileNum) + for _, b := range bve.AddedFileBacking { + vs.virtualBackings.AddAndRef(b) } for _, addedLevel := range bve.Added { @@ -311,13 +307,17 @@ func (vs *versionSet) load( } } - // There should be no deleted files, since we're starting with an empty state. if invariants.Enabled { + // There should be no deleted tables or backings, since we're starting from + // an empty state. for _, deletedLevel := range bve.Deleted { if len(deletedLevel) != 0 { - panic("deleted files during manifest replay") + panic("deleted files after manifest replay") } } + if len(bve.RemovedFileBacking) > 0 { + panic("deleted backings after manifest replay") + } } newVersion, err := bve.Apply(nil, opts.Comparer, opts.FlushSplitBytes, opts.Experimental.ReadCompactionRate) @@ -500,9 +500,15 @@ func (vs *versionSet) logAndApply( } var newManifestFileNum base.DiskFileNum var prevManifestFileSize uint64 + var newManifestVirtualBackings []*fileBacking if requireRotation { newManifestFileNum = vs.getNextDiskFileNum() prevManifestFileSize = uint64(vs.manifest.Size()) + + // We want the virtual backings *before* applying the version edit, because + // the new manifest will contain the pre-apply version plus the last version + // edit. + newManifestVirtualBackings = vs.virtualBackings.Backings() } // Grab certain values before releasing vs.mu, in case createManifest() needs @@ -510,8 +516,8 @@ func (vs *versionSet) logAndApply( minUnflushedLogNum := vs.minUnflushedLogNum nextFileNum := vs.nextFileNum - // Update backing metadata and populate RemovedBackingTables. - zombies := getZombiesAndUpdateVirtualBackings(ve, &vs.virtualBackings) + // Note: this call populates ve.RemovedBackingTables. 
+ zombieBackings, removedVirtualBackings := getZombiesAndUpdateVirtualBackings(ve, &vs.virtualBackings) if err := func() error { vs.mu.Unlock() @@ -533,7 +539,7 @@ func (vs *versionSet) logAndApply( } if newManifestFileNum != 0 { - if err := vs.createManifest(vs.dirname, newManifestFileNum, minUnflushedLogNum, nextFileNum); err != nil { + if err := vs.createManifest(vs.dirname, newManifestFileNum, minUnflushedLogNum, nextFileNum, newManifestVirtualBackings); err != nil { vs.opts.EventListener.ManifestCreated(ManifestCreateInfo{ JobID: jobID, Path: base.MakeFilepath(vs.fs, vs.dirname, fileTypeManifest, newManifestFileNum), @@ -599,12 +605,25 @@ func (vs *versionSet) logAndApply( // Update the zombie tables set first, as installation of the new version // will unref the previous version which could result in addObsoleteLocked // being called. - for fileNum, size := range zombies { - vs.zombieTables[fileNum] = size + for _, b := range zombieBackings { + vs.zombieTables[b.DiskFileNum] = b.Size + } + + // Unref the removed backings and report those that already became obsolete. + // Note that the only case where we report obsolete tables here is when + // VirtualBackings.Protect/Unprotect was used to keep a backing alive without + // it being used in the current version. + var obsoleteVirtualBackings []*fileBacking + for _, b := range removedVirtualBackings { + if b.Unref() == 0 { + obsoleteVirtualBackings = append(obsoleteVirtualBackings, b) + } } + vs.addObsoleteLocked(obsoleteVirtualBackings) // Install the new version. vs.append(newVersion) + if ve.MinUnflushedLogNum != 0 { vs.minUnflushedLogNum = ve.MinUnflushedLogNum } @@ -661,40 +680,47 @@ func (vs *versionSet) logAndApply( } // getZombiesAndUpdateVirtualBackings updates the virtual backings with the -// changes in the versionEdit, populates ve.RemovedBackingTables, and returns -// all backings (physical and virtual) that will no longer be needed when we -// apply ve. 
+// changes in the versionEdit and populates ve.RemovedBackingTables. +// Returns: +// - zombieBackings: all backings (physical and virtual) that will no longer be +// needed when we apply ve. +// - removedVirtualBackings: the virtual backings that will be removed by the +// VersionEdit and which must be Unref()ed by the caller. These backings +// match ve.RemovedBackingTables. func getZombiesAndUpdateVirtualBackings( ve *versionEdit, virtualBackings *manifest.VirtualBackings, -) map[base.DiskFileNum]uint64 { - var zombies map[base.DiskFileNum]uint64 - - // We deal with physical and virtual tables separately. - - // Physical tables are the sole users of their backing. - for _, m := range ve.DeletedFiles { - if !m.Virtual { - if zombies == nil { - zombies = make(map[base.DiskFileNum]uint64) - } - zombies[m.FileBacking.DiskFileNum] = m.FileBacking.Size - } - } - // Tables can move between levels, in which case they appear in both - // DeletedFiles and NewFiles. +) (zombieBackings, removedVirtualBackings []*fileBacking) { + // First, deal with the physical tables. + // + // A physical backing has become unused if it is in DeletedFiles but not in + // NewFiles or CreatedBackingTables. + // + // Note that for the common case where there are very few elements, the map + // will stay on the stack. + stillUsed := make(map[base.DiskFileNum]struct{}) for _, nf := range ve.NewFiles { if !nf.Meta.Virtual { - delete(zombies, nf.Meta.FileBacking.DiskFileNum) + stillUsed[nf.Meta.FileBacking.DiskFileNum] = struct{}{} } } - for _, b := range ve.CreatedBackingTables { - virtualBackings.Add(b) - // Physical backings can become virtual. - delete(zombies, b.DiskFileNum) + stillUsed[b.DiskFileNum] = struct{}{} + } + for _, m := range ve.DeletedFiles { + if !m.Virtual { + if _, ok := stillUsed[m.FileBacking.DiskFileNum]; !ok { + zombieBackings = append(zombieBackings, m.FileBacking) + } + } } + + // Now deal with virtual tables. 
+ // // When a virtual table moves between levels we AddTable() then RemoveTable(), // which works out. + for _, b := range ve.CreatedBackingTables { + virtualBackings.AddAndRef(b) + } for _, nf := range ve.NewFiles { if nf.Meta.Virtual { virtualBackings.AddTable(nf.Meta) @@ -709,17 +735,15 @@ func getZombiesAndUpdateVirtualBackings( if unused := virtualBackings.Unused(); len(unused) > 0 { // Virtual backings that are no longer used are zombies and are also added // to RemovedBackingTables (before the version edit is written to disk). - if zombies == nil { - zombies = make(map[base.DiskFileNum]uint64, len(ve.RemovedBackingTables)) - } ve.RemovedBackingTables = make([]base.DiskFileNum, len(unused)) for i, b := range unused { - zombies[b.DiskFileNum] = b.Size - virtualBackings.Remove(b.DiskFileNum) ve.RemovedBackingTables[i] = b.DiskFileNum + zombieBackings = append(zombieBackings, b) + virtualBackings.Remove(b.DiskFileNum) } + removedVirtualBackings = zombieBackings[len(zombieBackings)-len(unused):] } - return zombies + return zombieBackings, removedVirtualBackings } func (vs *versionSet) incrementCompactions( @@ -764,7 +788,10 @@ func (vs *versionSet) incrementCompactionBytes(numBytes int64) { // createManifest creates a manifest file that contains a snapshot of vs. 
func (vs *versionSet) createManifest( - dirname string, fileNum, minUnflushedLogNum base.DiskFileNum, nextFileNum uint64, + dirname string, + fileNum, minUnflushedLogNum base.DiskFileNum, + nextFileNum uint64, + virtualBackings []*fileBacking, ) (err error) { var ( filename = base.MakeFilepath(vs.fs, dirname, fileTypeManifest, fileNum) @@ -791,7 +818,7 @@ func (vs *versionSet) createManifest( snapshot := versionEdit{ ComparerName: vs.cmp.Name, } - dedup := make(map[base.DiskFileNum]struct{}) + for level, levelMetadata := range vs.currentVersion().Levels { iter := levelMetadata.Iter() for meta := iter.First(); meta != nil; meta = iter.Next() { @@ -799,16 +826,11 @@ func (vs *versionSet) createManifest( Level: level, Meta: meta, }) - if _, ok := dedup[meta.FileBacking.DiskFileNum]; meta.Virtual && !ok { - dedup[meta.FileBacking.DiskFileNum] = struct{}{} - snapshot.CreatedBackingTables = append( - snapshot.CreatedBackingTables, - meta.FileBacking, - ) - } } } + snapshot.CreatedBackingTables = virtualBackings + // When creating a version snapshot for an existing DB, this snapshot VersionEdit will be // immediately followed by another VersionEdit (being written in logAndApply()). That // VersionEdit always contains a LastSeqNum, so we don't need to include that in the snapshot. @@ -871,25 +893,18 @@ func (vs *versionSet) append(v *version) { v.Ref() vs.versions.PushBack(v) if invariants.Enabled { - // Verify that the virtualBackings map is correct. - m := make(map[base.DiskFileNum]struct{}) + // Verify that the virtualBackings contains all the backings referenced by + // the version. 
for _, l := range v.Levels { iter := l.Iter() for f := iter.First(); f != nil; f = iter.Next() { if f.Virtual { - m[f.FileBacking.DiskFileNum] = struct{}{} + if _, ok := vs.virtualBackings.Get(f.FileBacking.DiskFileNum); !ok { + panic(fmt.Sprintf("%s is not in virtualBackings", f.FileBacking.DiskFileNum)) + } } } } - vs.virtualBackings.ForEach(func(b *fileBacking) { - if _, ok := m[b.DiskFileNum]; !ok { - panic(fmt.Sprintf("%s should not be in virtualBackings", b.DiskFileNum)) - } - delete(m, b.DiskFileNum) - }) - for n := range m { - panic(fmt.Sprintf("%s is not in virtualBackings", n)) - } } } @@ -910,6 +925,14 @@ func (vs *versionSet) addLiveFileNums(m map[base.DiskFileNum]struct{}) { break } } + // virtualBackings contains backings that are referenced by some virtual + // tables in the latest version (which are handled above), and backings that + // are not but are still alive because of the protection mechanism (see + // manifest.VirtualBackings). This loop ensures the latter get added to the + // map. + vs.virtualBackings.ForEach(func(b *fileBacking) { + m[b.DiskFileNum] = struct{}{} + }) } // addObsoleteLocked will add the fileInfo associated with obsolete backing diff --git a/version_set_test.go b/version_set_test.go index 92308989bc..dfa467c48c 100644 --- a/version_set_test.go +++ b/version_set_test.go @@ -9,6 +9,7 @@ import ( "io" "math/rand" "slices" + "strconv" "strings" "sync" "testing" @@ -116,13 +117,26 @@ func TestVersionSet(t *testing.T) { if err != nil { td.Fatalf(t, "%v", err) } + // Show the edit, so that we can see the fields populated by Apply. We + // zero out the next file number because it is not deterministic (because + // of the randomized forceRotation).
+ ve.NextFileNum = 0 + fmt.Fprintf(&buf, "applied:\n%s", ve.String()) - case "ref": + case "protect-backing": + n, _ := strconv.Atoi(td.CmdArgs[0].String()) + vs.virtualBackings.Protect(base.DiskFileNum(n)) + + case "unprotect-backing": + n, _ := strconv.Atoi(td.CmdArgs[0].String()) + vs.virtualBackings.Unprotect(base.DiskFileNum(n)) + + case "ref-version": name := td.CmdArgs[0].String() refs[name] = vs.currentVersion() refs[name].Ref() - case "unref": + case "unref-version": name := td.CmdArgs[0].String() refs[name].Unref() @@ -162,7 +176,12 @@ func TestVersionSet(t *testing.T) { td.Fatalf(t, "unknown command: %s", td.Cmd) } - buf.WriteString(vs.currentVersion().DebugString()) + fmt.Fprintf(&buf, "current version:\n") + for _, l := range strings.Split(vs.currentVersion().DebugString(), "\n") { + if l != "" { + fmt.Fprintf(&buf, " %s\n", l) + } + } buf.WriteString(vs.virtualBackings.String()) if len(vs.zombieTables) == 0 { buf.WriteString("no zombie tables\n")