diff --git a/compaction_picker.go b/compaction_picker.go index 578328c284..999f84d299 100644 --- a/compaction_picker.go +++ b/compaction_picker.go @@ -1030,22 +1030,22 @@ func pickCompactionSeedFile( vers *version, opts *Options, level, outputLevel int, earliestSnapshotSeqNum uint64, ) (manifest.LevelFile, bool) { // Select the file within the level to compact. We want to minimize write - // amplification, but also ensure that deletes are propagated to the - // bottom level in a timely fashion so as to reclaim disk space. A table's - // smallest sequence number provides a measure of its age. The ratio of - // overlapping-bytes / table-size gives an indication of write - // amplification (a smaller ratio is preferrable). + // amplification, but also ensure that (a) deletes are propagated to the + // bottom level in a timely fashion, and (b) virtual sstables that are + // pinning backing sstables where most of the data is garbage are compacted + // away. Doing (a) and (b) reclaims disk space. A table's smallest sequence + // number provides a measure of its age. The ratio of overlapping-bytes / + // table-size gives an indication of write amplification (a smaller ratio is + // preferable). // // The current heuristic is based off the the RocksDB kMinOverlappingRatio // heuristic. It chooses the file with the minimum overlapping ratio with // the target level, which minimizes write amplification. // - // It uses a "compensated size" for the denominator, which is the file - // size but artificially inflated by an estimate of the space that may be - // reclaimed through compaction. Currently, we only compensate for range - // deletions and only with a rough estimate of the reclaimable bytes. This - // differs from RocksDB which only compensates for point tombstones and - // only if they exceed the number of non-deletion entries in table. 
+ // The heuristic uses a "compensated size" for the denominator, which is the + // file size inflated by (a) an estimate of the space that may be reclaimed + // through compaction, and (b) a fraction of the amount of garbage in the + // backing sstable pinned by this (virtual) sstable. // // TODO(peter): For concurrent compactions, we may want to try harder to // pick a seed file whose resulting compaction bounds do not overlap with @@ -1132,7 +1132,7 @@ func pickCompactionSeedFile( continue } - compSz := compensatedSize(f) + compSz := compensatedSize(f) + f.ResponsibleForGarbageBytes() scaledRatio := overlappingBytes * 1024 / compSz if scaledRatio < smallestRatio { smallestRatio = scaledRatio diff --git a/compaction_picker_test.go b/compaction_picker_test.go index d6940fe2d5..add86c3787 100644 --- a/compaction_picker_test.go +++ b/compaction_picker_test.go @@ -1366,6 +1366,24 @@ func TestCompactionPickerPickFile(t *testing.T) { case "file-sizes": return runTableFileSizesCmd(td, d) + case "build": + if err := runBuildCmd(td, d, d.opts.FS); err != nil { + return err.Error() + } + return "" + + case "ingest-and-excise": + if err := runIngestAndExciseCmd(td, d, d.opts.FS); err != nil { + return err.Error() + } + return "" + + case "lsm": + d.mu.Lock() + s := d.mu.versions.currentVersion().String() + d.mu.Unlock() + return s + case "pick-file": s := strings.TrimPrefix(td.CmdArgs[0].String(), "L") level, err := strconv.Atoi(s) diff --git a/internal/manifest/version.go b/internal/manifest/version.go index 0111c93484..17acf01495 100644 --- a/internal/manifest/version.go +++ b/internal/manifest/version.go @@ -410,15 +410,12 @@ type FileBacking struct { // virtual sstable sizes of all of the virtual sstables in the latest // version which are backed by the physical sstable. When a virtual // sstable is removed from the latest version, we will decrement the - // VirtualizedSize. During compaction picking, we'll compensate a + // VirtualizedSize. 
During compaction picking, we compensate a // virtual sstable file size by // (FileBacking.Size - FileBacking.VirtualizedSize) / latestVersionRefs. // The intuition is that if FileBacking.Size - FileBacking.VirtualizedSize // is high, then the space amplification due to virtual sstables is // high, and we should pick the virtual sstable with a higher priority. - // - // TODO(bananabrick): Compensate the virtual sstable file size using - // the VirtualizedSize during compaction picking and test. VirtualizedSize atomic.Uint64 DiskFileNum base.DiskFileNum Size uint64 @@ -527,6 +524,39 @@ func (m *FileMetadata) LatestRefs() int32 { return m.FileBacking.latestVersionRefs.Load() } +// ResponsibleForGarbageBytes returns the amount of garbage in the backing +// sstable that we consider the responsibility of this virtual sstable. For +// non-virtual sstables, this is of course 0. For virtual sstables, we equally +// distribute the responsibility of the garbage across all the virtual +// sstables that are referencing the same backing sstable. One could +// alternatively distribute this in proportion to the virtual sst sizes, but +// it isn't clear that more sophisticated heuristics are worth it, given that +// the garbage cannot be reclaimed until all the referencing virtual sstables +// are compacted. +// +// REQUIRES: m exists in the latest version. +func (m *FileMetadata) ResponsibleForGarbageBytes() uint64 { + if !m.Virtual { + return 0 + } + virtualizedSize := m.FileBacking.VirtualizedSize.Load() + // Since virtualizedSize is the sum of the estimated size of all virtual + // ssts, we allow for the possibility that virtualizedSize could exceed + // m.FileBacking.Size. + totalGarbage := int64(m.FileBacking.Size) - int64(virtualizedSize) + if totalGarbage <= 0 { + return 0 + } + latestRefs := m.LatestRefs() + if latestRefs == 0 { + // This cannot happen if m exists in the latest version. 
The call to + // ResponsibleForGarbageBytes during compaction picking ensures that m + // exists in the latest version by holding versionSet.logLock. + panic(errors.AssertionFailedf("%s has no LatestRefs", m.String())) + } + return uint64(totalGarbage) / uint64(latestRefs) +} + // SetCompactionState transitions this file's compaction state to the given // state. Protected by DB.mu. func (m *FileMetadata) SetCompactionState(to CompactionState) { diff --git a/testdata/compaction_picker_pick_file b/testdata/compaction_picker_pick_file index 38db709798..395403f3b2 100644 --- a/testdata/compaction_picker_pick_file +++ b/testdata/compaction_picker_pick_file @@ -104,3 +104,119 @@ L6 pick-file L5 ---- 000004:[c#11,SET-d#11,SET] + +# Test with virtual ssts. +define +L5 + c.SET.11: + d.SET.11: + e.SET.11: +L5 + f.SET.11: +L6 + c.SET.0: +L6 + e.SET.0: +L6 + f.SET.0: +---- +5: + 000004:[c#11,SET-e#11,SET] + 000005:[f#11,SET-f#11,SET] +6: + 000006:[c#0,SET-c#0,SET] + 000007:[e#0,SET-e#0,SET] + 000008:[f#0,SET-f#0,SET] + +file-sizes +---- +L5: + 000004:[c#11,SET-e#11,SET]: 99086 bytes (97KB) + 000005:[f#11,SET-f#11,SET]: 57942 bytes (57KB) +L6: + 000006:[c#0,SET-c#0,SET]: 66134 bytes (65KB) + 000007:[e#0,SET-e#0,SET]: 66134 bytes (65KB) + 000008:[f#0,SET-f#0,SET]: 66134 bytes (65KB) + +# Sst 5 is picked since 65KB/57KB is less than 130KB/97KB. +pick-file L5 +---- +000005:[f#11,SET-f#11,SET] + +build ext1 +set d d +---- + +# Sst 4 is split into two virtual ssts, where the 64KB of key d is excised. 
+ingest-and-excise ext1 excise=d-e +---- + +lsm +---- +5: + 000010(000004):[c#11,SET-c#11,SET] + 000011(000004):[e#11,SET-e#11,SET] + 000005:[f#11,SET-f#11,SET] +6: + 000006:[c#0,SET-c#0,SET] + 000009:[d#13,SET-d#13,SET] + 000007:[e#0,SET-e#0,SET] + 000008:[f#0,SET-f#0,SET] + +file-sizes +---- +L5: + 000010:[c#11,SET-c#11,SET]: 32796 bytes (32KB) + 000011:[e#11,SET-e#11,SET]: 126 bytes (126B) + 000005:[f#11,SET-f#11,SET]: 57942 bytes (57KB) +L6: + 000006:[c#0,SET-c#0,SET]: 66134 bytes (65KB) + 000009:[d#13,SET-d#13,SET]: 621 bytes (621B) + 000007:[e#0,SET-e#0,SET]: 66134 bytes (65KB) + 000008:[f#0,SET-f#0,SET]: 66134 bytes (65KB) + +# Superficially, sst 10 causes write amp of 65KB/32KB which is worse than sst +# 5. But the garbage of ~64KB in the backing sst 4 is equally distributed +# between sst 10 and sst 11, which results in sst 10 causing a write amp of +# 65KB/(32KB + 32KB), which is the lowest. +pick-file L5 +---- +000010:[c#11,SET-c#11,SET] + +build ext2 +set c c +---- + +# Remove sst 10, so the backing sst 4 is mostly garbage, and is only +# referenced by sst 11. +ingest-and-excise ext2 excise=b-d +---- + +lsm +---- +5: + 000011(000004):[e#11,SET-e#11,SET] + 000005:[f#11,SET-f#11,SET] +6: + 000012:[c#15,SET-c#15,SET] + 000009:[d#13,SET-d#13,SET] + 000007:[e#0,SET-e#0,SET] + 000008:[f#0,SET-f#0,SET] + +file-sizes +---- +L5: + 000011:[e#11,SET-e#11,SET]: 126 bytes (126B) + 000005:[f#11,SET-f#11,SET]: 57942 bytes (57KB) +L6: + 000012:[c#15,SET-c#15,SET]: 621 bytes (621B) + 000009:[d#13,SET-d#13,SET]: 621 bytes (621B) + 000007:[e#0,SET-e#0,SET]: 66134 bytes (65KB) + 000008:[f#0,SET-f#0,SET]: 66134 bytes (65KB) + +# Even though picking sst 11 seems to cause poor write amp of 65KB/126B, it is +# picked because it is blamed for all the garbage in backing sst 4 (~96KB), +# and so the actual write amp is 65KB/(126B + 96KB), which is the lowest. +pick-file L5 +---- +000011:[e#11,SET-e#11,SET]