From 99f8a254775c5c5bd51d650cdadac0f74141680d Mon Sep 17 00:00:00 2001 From: Jackson Owens Date: Tue, 19 Mar 2024 16:50:36 -0400 Subject: [PATCH] storage: add storage.sstable.compression_algorithm cluster setting Introduce a new cluster setting that allows the operator to configure the compression algorithm used when compressing sstable blocks. This allows operators to opt into use of zstd (as opposed to the previous setting of snappy). zstd typically achieves better compression ratios than snappy, and operators may find that they can achieve higher node densities through enabling zstd. Future releases may change the default compression algorithm. In a side-by-side comparison of a 10000-warehouse tpcc import, the zstd cluster achieved a higher import speed of 146 MiB/s versus snappy's 135 MiB/s. The zstd cluster's physical database size was significantly smaller. Epic: none Release note (ops change): Add `storage.sstable.compression_algorithm` cluster setting that configures the compression algorithm to use when compressing sstable blocks. 
--- .../settings/settings-for-tenants.txt | 1 + docs/generated/settings/settings.html | 1 + pkg/ccl/backupccl/file_sst_sink_test.go | 2 +- pkg/storage/pebble.go | 49 +++++++++++++++++++ pkg/storage/sst_writer.go | 2 + 5 files changed, 54 insertions(+), 1 deletion(-) diff --git a/docs/generated/settings/settings-for-tenants.txt b/docs/generated/settings/settings-for-tenants.txt index 06431a02c183..57bd9d765e10 100644 --- a/docs/generated/settings/settings-for-tenants.txt +++ b/docs/generated/settings/settings-for-tenants.txt @@ -328,6 +328,7 @@ sql.txn.read_committed_isolation.enabled boolean true set to true to allow trans sql.txn_fingerprint_id_cache.capacity integer 100 the maximum number of txn fingerprint IDs stored application storage.max_sync_duration duration 20s maximum duration for disk operations; any operations that take longer than this setting trigger a warning log entry or process crash system-visible storage.max_sync_duration.fatal.enabled boolean true if true, fatal the process when a disk operation exceeds storage.max_sync_duration application +storage.sstable.compression_algorithm enumeration snappy "determines the compression algorithm to use when compressing sstable data blocks; supported values: ""snappy"", ""zstd"" [snappy = 1, zstd = 2]" system-visible storage.value_blocks.enabled boolean true set to true to enable writing of value blocks in sstables application timeseries.storage.resolution_10s.ttl duration 240h0m0s the maximum age of time series data stored at the 10 second resolution. Data older than this is subject to rollup and deletion. system-visible timeseries.storage.resolution_30m.ttl duration 2160h0m0s the maximum age of time series data stored at the 30 minute resolution. Data older than this is subject to deletion. 
system-visible diff --git a/docs/generated/settings/settings.html b/docs/generated/settings/settings.html index ceead693639f..7bc45615beaa 100644 --- a/docs/generated/settings/settings.html +++ b/docs/generated/settings/settings.html @@ -281,6 +281,7 @@
storage.ingest_split.enabled
booleantrueset to false to disable ingest-time splitting that lowers write-amplificationDedicated/Self-Hosted
storage.max_sync_duration
duration20smaximum duration for disk operations; any operations that take longer than this setting trigger a warning log entry or process crashServerless/Dedicated/Self-Hosted (read-only)
storage.max_sync_duration.fatal.enabled
booleantrueif true, fatal the process when a disk operation exceeds storage.max_sync_durationServerless/Dedicated/Self-Hosted +
storage.sstable.compression_algorithm
enumerationsnappydetermines the compression algorithm to use when compressing sstable data blocks; supported values: "snappy", "zstd" [snappy = 1, zstd = 2]Serverless/Dedicated/Self-Hosted (read-only)
storage.value_blocks.enabled
booleantrueset to true to enable writing of value blocks in sstablesServerless/Dedicated/Self-Hosted
storage.wal_failover.unhealthy_op_threshold
duration100msthe latency of a WAL write considered unhealthy and triggers a failover to a secondary WAL locationDedicated/Self-Hosted
timeseries.storage.enabled
booleantrueif set, periodic timeseries data is stored within the cluster; disabling is not recommended unless you are storing the data elsewhereDedicated/Self-Hosted diff --git a/pkg/ccl/backupccl/file_sst_sink_test.go b/pkg/ccl/backupccl/file_sst_sink_test.go index cc24aa78ed51..c6d8b75628b2 100644 --- a/pkg/ccl/backupccl/file_sst_sink_test.go +++ b/pkg/ccl/backupccl/file_sst_sink_test.go @@ -46,7 +46,7 @@ func TestFileSSTSinkExtendOneFile(t *testing.T) { getKeys := func(prefix string, n int) []byte { var b bytes.Buffer - sst := storage.MakeBackupSSTWriter(ctx, nil, &b) + sst := storage.MakeBackupSSTWriter(ctx, cluster.MakeTestingClusterSettings(), &b) for i := 0; i < n; i++ { require.NoError(t, sst.PutUnversioned([]byte(fmt.Sprintf("%s%08d", prefix, i)), nil)) } diff --git a/pkg/storage/pebble.go b/pkg/storage/pebble.go index 9230e433574f..6189fd262279 100644 --- a/pkg/storage/pebble.go +++ b/pkg/storage/pebble.go @@ -132,6 +132,50 @@ var IngestAsFlushable = settings.RegisterBoolSetting( util.ConstantWithMetamorphicTestBool( "storage.ingest_as_flushable.enabled", true)) +const ( + compressionAlgorithmSnappy int64 = 1 + compressionAlgorithmZstd int64 = 2 +) + +// compressionAlgorithm determines the compression algorithm used to compress +// data blocks when writing sstables. Users should call getCompressionAlgorithm +// rather than calling compressionAlgorithm.Get directly. +var compressionAlgorithm = settings.RegisterEnumSetting( + // NB: We can't use settings.SystemOnly today because we may need to read the + // value from within a tenant building an sstable for AddSSTable. + settings.SystemVisible, + "storage.sstable.compression_algorithm", + `determines the compression algorithm to use when compressing sstable data blocks;`+ + ` supported values: "snappy", "zstd"`, + // TODO(jackson): Consider using a metamorphic constant here, but many tests + // will need to override it because they depend on a deterministic sstable + // size. 
+ "snappy", + map[int64]string{ + compressionAlgorithmSnappy: "snappy", + compressionAlgorithmZstd: "zstd", + }, + settings.WithPublic, +) + +func getCompressionAlgorithm(ctx context.Context, settings *cluster.Settings) pebble.Compression { + switch compressionAlgorithm.Get(&settings.SV) { + case compressionAlgorithmSnappy: + return pebble.SnappyCompression + case compressionAlgorithmZstd: + // Pre-24.1 Pebble's implementation of zstd had bugs that could cause + // in-memory corruption. We require that the cluster version is 24.1 which + // implies that all nodes are running 24.1 code and will never run code + // < 24.1 again. + if settings.Version.ActiveVersionOrEmpty(ctx).IsActive(clusterversion.V24_1) { + return pebble.ZstdCompression + } + return pebble.DefaultCompression + default: + return pebble.DefaultCompression + } +} + // DO NOT set storage.single_delete.crash_on_invariant_violation.enabled or // storage.single_delete.crash_on_ineffectual.enabled to true. // @@ -1025,6 +1069,11 @@ func newPebble(ctx context.Context, cfg PebbleConfig) (p *Pebble, err error) { } opts.FS = cfg.Env opts.Lock = cfg.Env.DirectoryLock + for i := range opts.Levels { + opts.Levels[i].Compression = func() sstable.Compression { + return getCompressionAlgorithm(ctx, cfg.Settings) + } + } opts.EnsureDefaults() // The context dance here is done so that we have a clean context without diff --git a/pkg/storage/sst_writer.go b/pkg/storage/sst_writer.go index 7825ce1ac673..45488a4cf0d4 100644 --- a/pkg/storage/sst_writer.go +++ b/pkg/storage/sst_writer.go @@ -82,6 +82,7 @@ func MakeIngestionWriterOptions(ctx context.Context, cs *cluster.Settings) sstab format = sstable.TableFormatPebblev4 } opts := DefaultPebbleOptions().MakeWriterOptions(0, format) + opts.Compression = getCompressionAlgorithm(ctx, cs) opts.MergerName = "nullptr" return opts } @@ -117,6 +118,7 @@ func MakeBackupSSTWriter(ctx context.Context, cs *cluster.Settings, f io.Writer) // block checksums and more index entries are just 
overhead and smaller blocks // reduce compression ratio. opts.BlockSize = 128 << 10 + opts.Compression = getCompressionAlgorithm(ctx, cs) opts.MergerName = "nullptr" return SSTWriter{ fw: sstable.NewWriter(&noopFinishAbort{f}, opts),