diff --git a/.changelog/4561.cfg.md b/.changelog/4561.cfg.md new file mode 100644 index 00000000000..0d788121c92 --- /dev/null +++ b/.changelog/4561.cfg.md @@ -0,0 +1,5 @@ +go/worker/storage: Storage checkpoints are now disabled by default + +- `worker.storage.checkpointer.disabled` flag is removed. + +- use the `worker.storage.checkpointer.enabled` flag to enable checkpoints. diff --git a/.changelog/4561.feature.md b/.changelog/4561.feature.md new file mode 100644 index 00000000000..478a0dbc35b --- /dev/null +++ b/.changelog/4561.feature.md @@ -0,0 +1 @@ +Randomize storage checkpoints wall-clock interval diff --git a/go/common/random/random.go b/go/common/random/random.go index 33dd2238370..bbf0ecd1cb3 100644 --- a/go/common/random/random.go +++ b/go/common/random/random.go @@ -5,6 +5,7 @@ package random import ( "math/rand" "sync" + "time" ) // NewRand is a convenience function to generate a new @@ -49,3 +50,18 @@ func (c *concurrenySafeSource) Seed(seed int64) { c.src.Seed(seed) c.mut.Unlock() } + +// Borrowed from https://github.com/cenkalti/backoff. + +// Returns a random value from the following interval: +// [currentInterval - randomizationFactor * currentInterval, currentInterval + randomizationFactor * currentInterval]. +func GetRandomValueFromInterval(randomizationFactor, random float64, currentInterval time.Duration) time.Duration { + delta := randomizationFactor * float64(currentInterval) + minInterval := float64(currentInterval) - delta + maxInterval := float64(currentInterval) + delta + + // Get a random value from the range [minInterval, maxInterval]. + // The formula used below has a +1 because if the minInterval is 1 and the maxInterval is 3 then + // we want a 33% chance for selecting either 1, 2 or 3. + return time.Duration(minInterval + (random * (maxInterval - minInterval + 1))) +} diff --git a/go/consensus/tendermint/full/full.go b/go/consensus/tendermint/full/full.go index e31ed6acef3..0aa109968a4 100644 --- a/go/consensus/tendermint/full/full.go +++ b/go/consensus/tendermint/full/full.go @@ -40,6 +40,7 @@ import ( "github.com/oasisprotocol/oasis-core/go/common/logging" "github.com/oasisprotocol/oasis-core/go/common/node" "github.com/oasisprotocol/oasis-core/go/common/pubsub" + "github.com/oasisprotocol/oasis-core/go/common/random" cmservice "github.com/oasisprotocol/oasis-core/go/common/service" "github.com/oasisprotocol/oasis-core/go/common/version" consensusAPI "github.com/oasisprotocol/oasis-core/go/consensus/api" @@ -1387,7 +1388,7 @@ func (t *fullService) lazyInit() error { // when all the other services start shutting down. // // Randomize the period so that not all nodes shut down at the same time. - delay := getRandomValueFromInterval(0.5, rand.Float64(), viper.GetDuration(CfgUpgradeStopDelay)) + delay := random.GetRandomValueFromInterval(0.5, rand.Float64(), viper.GetDuration(CfgUpgradeStopDelay)) time.Sleep(delay) t.Logger.Info("stopping the node for upgrade") diff --git a/go/consensus/tendermint/full/helpers.go b/go/consensus/tendermint/full/helpers.go deleted file mode 100644 index 9b93cf1a654..00000000000 --- a/go/consensus/tendermint/full/helpers.go +++ /dev/null @@ -1,18 +0,0 @@ -package full - -import "time" - -// Borrowed from https://github.com/cenkalti/backoff. - -// Returns a random value from the following interval: -// [currentInterval - randomizationFactor * currentInterval, currentInterval + randomizationFactor * currentInterval]. -func getRandomValueFromInterval(randomizationFactor, random float64, currentInterval time.Duration) time.Duration { - delta := randomizationFactor * float64(currentInterval) - minInterval := float64(currentInterval) - delta - maxInterval := float64(currentInterval) + delta - - // Get a random value from the range [minInterval, maxInterval]. - // The formula used below has a +1 because if the minInterval is 1 and the maxInterval is 3 then - // we want a 33% chance for selecting either 1, 2 or 3. - return time.Duration(minInterval + (random * (maxInterval - minInterval + 1))) -} diff --git a/go/oasis-test-runner/oasis/args.go b/go/oasis-test-runner/oasis/args.go index d1b51ce3646..db1f8cf7ac8 100644 --- a/go/oasis-test-runner/oasis/args.go +++ b/go/oasis-test-runner/oasis/args.go @@ -466,6 +466,13 @@ func (args *argBuilder) workerStorageDebugDisableCheckpointSync(disable bool) *a return args } +func (args *argBuilder) workerStorageCheckpointerEnabled(enable bool) *argBuilder { + if enable { + args.vec = append(args.vec, Argument{Name: workerStorage.CfgWorkerCheckpointerEnabled}) + } + return args +} + func (args *argBuilder) workerStorageCheckpointCheckInterval(interval time.Duration) *argBuilder { if interval > 0 { args.vec = append(args.vec, Argument{ diff --git a/go/oasis-test-runner/oasis/compute.go b/go/oasis-test-runner/oasis/compute.go index 96bdfb42857..eceee626fb9 100644 --- a/go/oasis-test-runner/oasis/compute.go +++ b/go/oasis-test-runner/oasis/compute.go @@ -167,6 +167,7 @@ func (worker *Compute) AddArgs(args *argBuilder) error { storageBackend(worker.storageBackend). workerStoragePublicRPCEnabled(!worker.disablePublicRPC). workerStorageDebugDisableCheckpointSync(worker.checkpointSyncDisabled). + workerStorageCheckpointerEnabled(true). workerStorageCheckpointCheckInterval(worker.checkpointCheckInterval). configureDebugCrashPoints(worker.crashPointsProbability). tendermintSupplementarySanity(worker.supplementarySanityInterval). diff --git a/go/storage/mkvs/checkpoint/checkpointer.go b/go/storage/mkvs/checkpoint/checkpointer.go index d517d0c072f..f3c4c2406f7 100644 --- a/go/storage/mkvs/checkpoint/checkpointer.go +++ b/go/storage/mkvs/checkpoint/checkpointer.go @@ -3,6 +3,7 @@ package checkpoint import ( "context" "fmt" + "math/rand" "sort" "time" @@ -11,10 +12,15 @@ import ( "github.com/oasisprotocol/oasis-core/go/common" "github.com/oasisprotocol/oasis-core/go/common/logging" "github.com/oasisprotocol/oasis-core/go/common/pubsub" + "github.com/oasisprotocol/oasis-core/go/common/random" db "github.com/oasisprotocol/oasis-core/go/storage/mkvs/db/api" "github.com/oasisprotocol/oasis-core/go/storage/mkvs/node" ) +// Specifies the factor by which the checkpoint interval (in wall clock time) +// is randomized. +const checkpointIntervalRandomizationFactor = 0.1 + // CheckpointerConfig is a checkpointer configuration. type CheckpointerConfig struct { // Name identifying this checkpointer in logs. @@ -285,17 +291,14 @@ func (c *checkpointer) worker(ctx context.Context) { c.logger.Debug("storage checkpointer terminating") }() - // Use a ticker to avoid checking for checkpoints too often. - ticker := time.NewTicker(c.cfg.CheckInterval) - defer ticker.Stop() - paused := false for { + interval := random.GetRandomValueFromInterval(checkpointIntervalRandomizationFactor, rand.Float64(), c.cfg.CheckInterval) select { case <-ctx.Done(): return - case <-ticker.C: + case <-time.After(interval): case <-c.flushCh.Out(): case paused = <-c.pausedCh: continue diff --git a/go/worker/storage/init.go b/go/worker/storage/init.go index 3c1257187c4..1c352ae66f0 100644 --- a/go/worker/storage/init.go +++ b/go/worker/storage/init.go @@ -25,8 +25,8 @@ const ( // storage committee members. CfgWorkerPublicRPCEnabled = "worker.storage.public_rpc.enabled" - // CfgWorkerCheckpointerDisabled disables the storage checkpointer. - CfgWorkerCheckpointerDisabled = "worker.storage.checkpointer.disabled" + // CfgWorkerCheckpointerEnabled enables the storage checkpointer. + CfgWorkerCheckpointerEnabled = "worker.storage.checkpointer.enabled" // CfgWorkerCheckpointCheckInterval configures the checkpointer check interval. CfgWorkerCheckpointCheckInterval = "worker.storage.checkpointer.check_interval" @@ -89,7 +89,7 @@ func NewLocalBackend( func init() { Flags.Uint(cfgWorkerFetcherCount, 4, "Number of concurrent storage diff fetchers") Flags.Bool(CfgWorkerPublicRPCEnabled, false, "Enable storage RPC access for all nodes") - Flags.Bool(CfgWorkerCheckpointerDisabled, false, "Disable the storage checkpointer") + Flags.Bool(CfgWorkerCheckpointerEnabled, false, "Enable the storage checkpointer") Flags.Duration(CfgWorkerCheckpointCheckInterval, 1*time.Minute, "Storage checkpointer check interval") Flags.Bool(CfgWorkerCheckpointSyncDisabled, false, "Disable initial storage sync from checkpoints") diff --git a/go/worker/storage/worker.go b/go/worker/storage/worker.go index c0ea92ce21f..d0ac58c5621 100644 --- a/go/worker/storage/worker.go +++ b/go/worker/storage/worker.go @@ -81,7 +81,7 @@ func New( } var checkpointerCfg *checkpoint.CheckpointerConfig - if !viper.GetBool(CfgWorkerCheckpointerDisabled) { + if viper.GetBool(CfgWorkerCheckpointerEnabled) { checkpointerCfg = &checkpoint.CheckpointerConfig{ CheckInterval: viper.GetDuration(CfgWorkerCheckpointCheckInterval), }