db: account for garbage in backing files when picking a compaction
When we consider a virtual sst as a candidate for the seed file for a
compaction, the size of that sst is adjusted by a fraction of the
garbage in the backing sst, where the fraction is the reciprocal of
the number of virtual ssts that are referencing this backing sst.

The objective is to reduce space amplification by making compaction
picking aware of garbage accumulation.
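
For illustration, a minimal sketch of the adjustment described above, using a hypothetical helper rather than Pebble's actual API:

package main

import "fmt"

// garbageShare sketches the commit's formula: the garbage in a backing
// sstable (its physical size minus the summed sizes of the virtual sstables
// carved out of it) is split evenly across the virtual sstables that still
// reference it. Each virtual sstable's size is inflated by this share when
// it is considered as a compaction seed. Hypothetical sketch, not Pebble code.
func garbageShare(backingSize, virtualizedSize, virtualRefs uint64) uint64 {
	if virtualRefs == 0 || virtualizedSize >= backingSize {
		return 0 // no references, or nothing to attribute
	}
	return (backingSize - virtualizedSize) / virtualRefs
}

func main() {
	// e.g. a 100 KB backing file with 40 KB still live across 2 virtual sstables:
	fmt.Println(garbageShare(100<<10, 40<<10, 2)) // 30720 (30 KB each)
}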

Fixes #2323
sumeerbhola committed Mar 5, 2024
1 parent 6566b6b commit 1ff6ed8
Showing 4 changed files with 180 additions and 16 deletions.
24 changes: 12 additions & 12 deletions compaction_picker.go
@@ -1030,22 +1030,22 @@ func pickCompactionSeedFile(
vers *version, opts *Options, level, outputLevel int, earliestSnapshotSeqNum uint64,
) (manifest.LevelFile, bool) {
// Select the file within the level to compact. We want to minimize write
// amplification, but also ensure that deletes are propagated to the
// bottom level in a timely fashion so as to reclaim disk space. A table's
// smallest sequence number provides a measure of its age. The ratio of
// overlapping-bytes / table-size gives an indication of write
// amplification (a smaller ratio is preferrable).
// amplification, but also ensure that (a) deletes are propagated to the
// bottom level in a timely fashion, and (b) virtual sstables that are
// pinning backing sstables where most of the data is garbage are compacted
// away. Doing (a) and (b) reclaims disk space. A table's smallest sequence
// number provides a measure of its age. The ratio of overlapping-bytes /
// table-size gives an indication of write amplification (a smaller ratio is
// preferable).
//
// The current heuristic is based off the RocksDB kMinOverlappingRatio
// heuristic. It chooses the file with the minimum overlapping ratio with
// the target level, which minimizes write amplification.
//
// It uses a "compensated size" for the denominator, which is the file
// size but artificially inflated by an estimate of the space that may be
// reclaimed through compaction. Currently, we only compensate for range
// deletions and only with a rough estimate of the reclaimable bytes. This
// differs from RocksDB which only compensates for point tombstones and
// only if they exceed the number of non-deletion entries in table.
// The heuristic uses a "compensated size" for the denominator, which is the
// file size inflated by (a) an estimate of the space that may be reclaimed
// through compaction, and (b) a fraction of the amount of garbage in the
// backing sstable pinned by this (virtual) sstable.
//
// TODO(peter): For concurrent compactions, we may want to try harder to
// pick a seed file whose resulting compaction bounds do not overlap with
@@ -1132,7 +1132,7 @@ func pickCompactionSeedFile(
continue
}

compSz := compensatedSize(f)
compSz := compensatedSize(f) + f.ResponsibleForGarbageBytes()
scaledRatio := overlappingBytes * 1024 / compSz
if scaledRatio < smallestRatio {
smallestRatio = scaledRatio
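To see how the new term shifts the choice, here is a minimal, self-contained sketch of the min-overlapping-ratio pick with simplified candidate structs (not Pebble's manifest types); the sizes mirror the virtual-sstable testdata added further below:

package main

import "fmt"

// candidate is a simplified stand-in for a level file considered as a
// compaction seed file; it is not Pebble's manifest.FileMetadata.
type candidate struct {
	name             string
	overlappingBytes uint64 // bytes overlapped in the output level
	compensatedSize  uint64 // file size plus reclaimable-bytes estimate
	garbageShare     uint64 // share of backing-sstable garbage (virtual ssts only)
}

// pickSeed sketches the heuristic: minimize overlapping-bytes over the
// compensated size, where the denominator now also includes the garbage share.
func pickSeed(cands []candidate) candidate {
	best, bestRatio := cands[0], ^uint64(0)
	for _, c := range cands {
		ratio := c.overlappingBytes * 1024 / (c.compensatedSize + c.garbageShare)
		if ratio < bestRatio {
			best, bestRatio = c, ratio
		}
	}
	return best
}

func main() {
	// Without the garbage share, 000010 looks worse than 000005
	// (66134/32796 > 66134/57942); with the share added, 000010 wins.
	fmt.Println(pickSeed([]candidate{
		{"000005", 66134, 57942, 0},
		{"000010", 66134, 32796, 33082},
	}).name) // prints 000010
}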
18 changes: 18 additions & 0 deletions compaction_picker_test.go
@@ -1366,6 +1366,24 @@ func TestCompactionPickerPickFile(t *testing.T) {
case "file-sizes":
return runTableFileSizesCmd(td, d)

case "build":
if err := runBuildCmd(td, d, d.opts.FS); err != nil {
return err.Error()
}
return ""

case "ingest-and-excise":
if err := runIngestAndExciseCmd(td, d, d.opts.FS); err != nil {
return err.Error()
}
return ""

case "lsm":
d.mu.Lock()
s := d.mu.versions.currentVersion().String()
d.mu.Unlock()
return s

case "pick-file":
s := strings.TrimPrefix(td.CmdArgs[0].String(), "L")
level, err := strconv.Atoi(s)
38 changes: 34 additions & 4 deletions internal/manifest/version.go
@@ -410,15 +410,12 @@ type FileBacking struct {
// virtual sstable sizes of all of the virtual sstables in the latest
// version which are backed by the physical sstable. When a virtual
// sstable is removed from the latest version, we will decrement the
// VirtualizedSize. During compaction picking, we'll compensate a
// VirtualizedSize. During compaction picking, we compensate a
// virtual sstable file size by
// (FileBacking.Size - FileBacking.VirtualizedSize) / latestVersionRefs.
// The intuition is that if FileBacking.Size - FileBacking.VirtualizedSize
// is high, then the space amplification due to virtual sstables is
// high, and we should pick the virtual sstable with a higher priority.
//
// TODO(bananabrick): Compensate the virtual sstable file size using
// the VirtualizedSize during compaction picking and test.
VirtualizedSize atomic.Uint64
DiskFileNum base.DiskFileNum
Size uint64
@@ -527,6 +524,39 @@ func (m *FileMetadata) LatestRefs() int32 {
return m.FileBacking.latestVersionRefs.Load()
}

// ResponsibleForGarbageBytes returns the amount of garbage in the backing
// sstable that we consider the responsibility of this virtual sstable. For
// non-virtual sstables, this is of course 0. For virtual sstables, we equally
// distribute the responsibility of the garbage across all the virtual
// sstables that are referencing the same backing sstable. One could
// alternatively distribute this in proportion to the virtual sst sizes, but
// it isn't clear that more sophisticated heuristics are worth it, given that
// the garbage cannot be reclaimed until all the referencing virtual sstables
// are compacted.
//
// REQUIRES: m exists in the latest version.
func (m *FileMetadata) ResponsibleForGarbageBytes() uint64 {
if !m.Virtual {
return 0
}
virtualizedSize := m.FileBacking.VirtualizedSize.Load()
// Since virtualizedSize is the sum of the estimated size of all virtual
// ssts, we allow for the possibility that virtualizedSize could exceed
// m.FileBacking.Size.
totalGarbage := int64(m.FileBacking.Size) - int64(virtualizedSize)
if totalGarbage <= 0 {
return 0
}
latestRefs := m.LatestRefs()
if latestRefs == 0 {
// This cannot happen if m exists in the latest version. The call to
// ResponsibleForGarbageBytes during compaction picking ensures that m
// exists in the latest version by holding versionSet.logLock.
panic(errors.AssertionFailedf("%s has no LatestRefs", m.String()))
}
return uint64(totalGarbage) / uint64(latestRefs)
}

// SetCompactionState transitions this file's compaction state to the given
// state. Protected by DB.mu.
func (m *FileMetadata) SetCompactionState(to CompactionState) {
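As a worked example using the sizes from the testdata added below: a 99086-byte backing sstable referenced by two virtual sstables of 32796 and 126 bytes has VirtualizedSize 32922, so totalGarbage is 99086 - 32922 = 66164 bytes, and each of the two referencing virtual sstables is held responsible for 66164 / 2 = 33082 bytes.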
116 changes: 116 additions & 0 deletions testdata/compaction_picker_pick_file
@@ -104,3 +104,119 @@ L6
pick-file L5
----
000004:[c#11,SET-d#11,SET]

# Test with virtual ssts.
define
L5
c.SET.11:<rand-bytes=32768>
d.SET.11:<rand-bytes=65536>
e.SET.11:<rand-bytes=100>
L5
f.SET.11:<rand-bytes=57344>
L6
c.SET.0:<rand-bytes=65536>
L6
e.SET.0:<rand-bytes=65536>
L6
f.SET.0:<rand-bytes=65536>
----
5:
000004:[c#11,SET-e#11,SET]
000005:[f#11,SET-f#11,SET]
6:
000006:[c#0,SET-c#0,SET]
000007:[e#0,SET-e#0,SET]
000008:[f#0,SET-f#0,SET]

file-sizes
----
L5:
000004:[c#11,SET-e#11,SET]: 99086 bytes (97KB)
000005:[f#11,SET-f#11,SET]: 57942 bytes (57KB)
L6:
000006:[c#0,SET-c#0,SET]: 66134 bytes (65KB)
000007:[e#0,SET-e#0,SET]: 66134 bytes (65KB)
000008:[f#0,SET-f#0,SET]: 66134 bytes (65KB)

# Sst 5 is picked since 65KB/57KB is less than 130KB/97KB.
pick-file L5
----
000005:[f#11,SET-f#11,SET]

build ext1
set d d
----

# Sst 4 is split into two virtual ssts, where the 64KB of key d is excised.
ingest-and-excise ext1 excise=d-e
----

lsm
----
5:
000010(000004):[c#11,SET-c#11,SET]
000011(000004):[e#11,SET-e#11,SET]
000005:[f#11,SET-f#11,SET]
6:
000006:[c#0,SET-c#0,SET]
000009:[d#13,SET-d#13,SET]
000007:[e#0,SET-e#0,SET]
000008:[f#0,SET-f#0,SET]

file-sizes
----
L5:
000010:[c#11,SET-c#11,SET]: 32796 bytes (32KB)
000011:[e#11,SET-e#11,SET]: 126 bytes (126B)
000005:[f#11,SET-f#11,SET]: 57942 bytes (57KB)
L6:
000006:[c#0,SET-c#0,SET]: 66134 bytes (65KB)
000009:[d#13,SET-d#13,SET]: 621 bytes (621B)
000007:[e#0,SET-e#0,SET]: 66134 bytes (65KB)
000008:[f#0,SET-f#0,SET]: 66134 bytes (65KB)

# Superficially, sst 10 causes write amp of 65KB/32KB which is worse than sst
# 5. But the garbage of ~64KB in the backing sst 4 is equally distributed
# between sst 10 and sst 11, which results in sst 10 causing a write amp of
# 65KB/(32KB + 32KB), which is the lowest.
pick-file L5
----
000010:[c#11,SET-c#11,SET]

build ext2
set c c
----

# Remove sst 10, so the backing sst 4 is mostly garbage, and is only
# referenced by sst 11.
ingest-and-excise ext2 excise=b-d
----

lsm
----
5:
000011(000004):[e#11,SET-e#11,SET]
000005:[f#11,SET-f#11,SET]
6:
000012:[c#15,SET-c#15,SET]
000009:[d#13,SET-d#13,SET]
000007:[e#0,SET-e#0,SET]
000008:[f#0,SET-f#0,SET]

file-sizes
----
L5:
000011:[e#11,SET-e#11,SET]: 126 bytes (126B)
000005:[f#11,SET-f#11,SET]: 57942 bytes (57KB)
L6:
000012:[c#15,SET-c#15,SET]: 621 bytes (621B)
000009:[d#13,SET-d#13,SET]: 621 bytes (621B)
000007:[e#0,SET-e#0,SET]: 66134 bytes (65KB)
000008:[f#0,SET-f#0,SET]: 66134 bytes (65KB)

# Even though picking sst 11 seems to cause poor write amp of 65KB/126B, it is
# picked because it is blamed for all the garbage in backing sst 4 (~96KB),
# and so the actual write amp is 65KB/(126B + 96KB), which is the lowest.
pick-file L5
----
000011:[e#11,SET-e#11,SET]
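
Worked out with the numbers above: after the second excise, 000011 is the only remaining reference to backing sstable 000004, so it is blamed for all 99086 - 126 = 98960 bytes of garbage. Its compensated size becomes roughly 126 + 98960 ≈ 99086 bytes, giving a ratio of about 66134/99086 ≈ 0.67 versus 66134/57942 ≈ 1.14 for 000005, so 000011 is picked despite its tiny logical size.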
