From 9d39e886433d11df5d889394ebc5a9cc3f779f21 Mon Sep 17 00:00:00 2001 From: Nick Travers Date: Thu, 5 May 2022 21:33:30 +0000 Subject: [PATCH] storage,log: reduce max sync duration default timeouts Currently, Pebble will emit a fatal (or error, if configured) log event in a situation where a single write or sync operation exceeds the `MaxSyncDuration`. By default, this value is set to `60s`, but can be configured with the `storage.max_sync_duration` setting. Recent incidents have demonstrated that the current default value is most likely too high. For example, stalled disk operations that prevent a node heartbeating within 4.5 seconds will result in the node shedding all leases. Failing faster in this case is desirable. There also exist situations in which stalled disk operations on a single node can adversely affect throughput for an entire cluster (see cockroachlabs/support#1571 and cockroachlabs/support#1564). Lowering the timeout improves the recovery time. Lower the default value to `20s`, to strike a balance between being able to crash the process earlier in the event of a hardware failure (hard or soft), while also allowing ample time for a slow disk operation to clear in the transient case. Update the corresponding value in the logging package. Release note (ops change): The default value for `storage.max_sync_duration` has been lowered from `60s` to `20s`. Cockroach will exit sooner with a fatal error if a single slow disk operation exceeds this value. Touches #80942, #74712. --- pkg/storage/pebble.go | 2 +- pkg/util/log/log_flush.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/storage/pebble.go b/pkg/storage/pebble.go index 2457a6798697..276523b168e6 100644 --- a/pkg/storage/pebble.go +++ b/pkg/storage/pebble.go @@ -58,7 +58,7 @@ import ( const maxSyncDurationFatalOnExceededDefault = true // Default for MaxSyncDuration below. 
-var maxSyncDurationDefault = envutil.EnvOrDefaultDuration("COCKROACH_ENGINE_MAX_SYNC_DURATION_DEFAULT", 60*time.Second) +var maxSyncDurationDefault = envutil.EnvOrDefaultDuration("COCKROACH_ENGINE_MAX_SYNC_DURATION_DEFAULT", 20*time.Second) // MaxSyncDuration is the threshold above which an observed engine sync duration // triggers either a warning or a fatal error. diff --git a/pkg/util/log/log_flush.go b/pkg/util/log/log_flush.go index de0dba375ca0..3906afd970fc 100644 --- a/pkg/util/log/log_flush.go +++ b/pkg/util/log/log_flush.go @@ -52,7 +52,7 @@ const syncInterval = 30 // In practice, even a fraction of that would indicate a problem. This metric's // default should ideally match its sister metric in the storage engine, set by // COCKROACH_ENGINE_MAX_SYNC_DURATION. -var maxSyncDuration = envutil.EnvOrDefaultDuration("COCKROACH_LOG_MAX_SYNC_DURATION", 60*time.Second) +var maxSyncDuration = envutil.EnvOrDefaultDuration("COCKROACH_LOG_MAX_SYNC_DURATION", 20*time.Second) // syncWarnDuration is the threshold after which a slow disk warning is written // to the log and to stderr.