From afe5e62546375f8700f1c4f18275091ac76dcde5 Mon Sep 17 00:00:00 2001 From: Cheran Mahalingam Date: Tue, 9 Apr 2024 18:46:18 -0400 Subject: [PATCH] sstable: reduce block cache fragmentation Previously, the sstable writer contained heuristics to flush sstable blocks when the size reached a certain threshold. In CRDB this is defined as 32KiB. However, when these blocks are loaded into memory additional metadata is allocated with the block causing the allocation to go beyond this threshold. Since CRDB uses jemalloc, these allocations use a 40KiB size class which leads to internal fragmentation and higher memory usage. This commit decrements the target block size by the size of the cache's per-entry metadata to reduce internal memory fragmentation. Informs: #999. --- internal/cache/value_invariants.go | 6 ++++++ internal/cache/value_normal.go | 14 +++++++++----- sstable/options.go | 9 +++++++-- 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/internal/cache/value_invariants.go b/internal/cache/value_invariants.go index 1e30d2714bd..a01a780b09c 100644 --- a/internal/cache/value_invariants.go +++ b/internal/cache/value_invariants.go @@ -15,6 +15,12 @@ import ( "github.com/cockroachdb/pebble/internal/manual" ) +// NewValueMetadataSize returns the number of bytes of metadata allocated for +// a cache entry. +func NewValueMetadataSize() int { + return 0 +} + // newValue creates a Value with a manually managed buffer of size n. // // This definition of newValue is used when either the "invariants" or diff --git a/internal/cache/value_normal.go b/internal/cache/value_normal.go index e03379d53f8..e73c475e63a 100644 --- a/internal/cache/value_normal.go +++ b/internal/cache/value_normal.go @@ -15,6 +15,15 @@ import ( const valueSize = int(unsafe.Sizeof(Value{})) +// NewValueMetadataSize returns the number of bytes of metadata allocated for +// a cache entry. 
+func NewValueMetadataSize() int { + if cgoEnabled { + return valueSize + } + return 0 +} + func newValue(n int) *Value { if n == 0 { return nil @@ -31,11 +40,6 @@ func newValue(n int) *Value { // When we're not performing leak detection, the lifetime of the returned // Value is exactly the lifetime of the backing buffer and we can manually // allocate both. - // - // TODO(peter): It may be better to separate the allocation of the value and - // the buffer in order to reduce internal fragmentation in malloc. If the - // buffer is right at a power of 2, adding valueSize might push the - // allocation over into the next larger size. b := manual.New(valueSize + n) v := (*Value)(unsafe.Pointer(&b[0])) v.buf = b[valueSize:] diff --git a/sstable/options.go b/sstable/options.go index 8d88a32debd..abafea506d3 100644 --- a/sstable/options.go +++ b/sstable/options.go @@ -236,9 +236,12 @@ func (o WriterOptions) ensureDefaults() WriterOptions { if o.BlockRestartInterval <= 0 { o.BlockRestartInterval = base.DefaultBlockRestartInterval } - if o.BlockSize <= 0 { + // The target block size is decremented to reduce internal fragmentation when + // blocks are loaded into the block cache. + if o.BlockSize <= cache.NewValueMetadataSize() { o.BlockSize = base.DefaultBlockSize } + o.BlockSize -= cache.NewValueMetadataSize() if o.BlockSizeThreshold <= 0 { o.BlockSizeThreshold = base.DefaultBlockSizeThreshold } @@ -248,8 +251,10 @@ func (o WriterOptions) ensureDefaults() WriterOptions { if o.Compression <= DefaultCompression || o.Compression >= NCompression { o.Compression = SnappyCompression } - if o.IndexBlockSize <= 0 { + if o.IndexBlockSize <= cache.NewValueMetadataSize() { o.IndexBlockSize = o.BlockSize + } else { + o.IndexBlockSize -= cache.NewValueMetadataSize() } if o.MergerName == "" { o.MergerName = base.DefaultMerger.Name