From 65193e03293976e5b6c3b836ecf9f05585ba43fd Mon Sep 17 00:00:00 2001
From: Jackson Owens
Date: Thu, 14 Mar 2024 13:58:35 -0400
Subject: [PATCH] wal: allow runtime toggling of WAL failover

This commit adds a new bool return value to the option func
UnhealthyOperationLatencyThreshold that allows the client to dynamically
toggle whether failover is permitted. This will be used by Cockroach to
prevent failover until cluster version finalization begins.

Informs #3230.
---
 db_test.go                    |  2 +-
 metamorphic/options.go        |  4 ++--
 options.go                    |  7 +++---
 wal/failover_manager.go       |  6 ++++--
 wal/failover_manager_test.go  |  8 ++++---
 wal/testdata/manager_failover | 40 +++++++++++++++++++++++++++++++++++
 wal/wal.go                    | 10 +++++----
 7 files changed, 62 insertions(+), 15 deletions(-)

diff --git a/db_test.go b/db_test.go
index 7a3b0d1b69..e363d6016a 100644
--- a/db_test.go
+++ b/db_test.go
@@ -2058,7 +2058,7 @@ func TestWALFailoverAvoidsWriteStall(t *testing.T) {
 	secondary := wal.Dir{FS: mem, Dirname: "secondary"}
 	walFailover := &WALFailoverOptions{Secondary: secondary, FailoverOptions: wal.FailoverOptions{
 		UnhealthySamplingInterval:          100 * time.Millisecond,
-		UnhealthyOperationLatencyThreshold: func() time.Duration { return time.Second },
+		UnhealthyOperationLatencyThreshold: func() (time.Duration, bool) { return time.Second, true },
 	}}
 	o := &Options{
 		FS: primaryFS,
diff --git a/metamorphic/options.go b/metamorphic/options.go
index 24a21c7a0f..46544928c4 100644
--- a/metamorphic/options.go
+++ b/metamorphic/options.go
@@ -629,8 +629,8 @@ func RandomOptions(
 				HealthyProbeLatencyThreshold: healthyThreshold,
 				HealthyInterval:              healthyInterval,
 				UnhealthySamplingInterval:    scaleDuration(unhealthyThreshold, 0.10, 0.50), // Between 10-50% of the unhealthy threshold
-				UnhealthyOperationLatencyThreshold: func() time.Duration {
-					return unhealthyThreshold
+				UnhealthyOperationLatencyThreshold: func() (time.Duration, bool) {
+					return unhealthyThreshold, true
 				},
 				ElevatedWriteStallThresholdLag: expRandDuration(rng, 5*referenceDur, 2*time.Second),
 			},
diff --git a/options.go b/options.go
index f370255ddc..d617fb3b17 100644
--- a/options.go
+++ b/options.go
@@ -1380,6 +1380,7 @@ func (o *Options) String() string {
 	}
 
 	if o.WALFailover != nil {
+		unhealthyThreshold, _ := o.WALFailover.FailoverOptions.UnhealthyOperationLatencyThreshold()
 		fmt.Fprintf(&buf, "\n")
 		fmt.Fprintf(&buf, "[WAL Failover]\n")
 		fmt.Fprintf(&buf, "  secondary_dir=%s\n", o.WALFailover.Secondary.Dirname)
@@ -1387,7 +1388,7 @@ func (o *Options) String() string {
 		fmt.Fprintf(&buf, "  healthy_probe_latency_threshold=%s\n", o.WALFailover.FailoverOptions.HealthyProbeLatencyThreshold)
 		fmt.Fprintf(&buf, "  healthy_interval=%s\n", o.WALFailover.FailoverOptions.HealthyInterval)
 		fmt.Fprintf(&buf, "  unhealthy_sampling_interval=%s\n", o.WALFailover.FailoverOptions.UnhealthySamplingInterval)
-		fmt.Fprintf(&buf, "  unhealthy_operation_latency_threshold=%s\n", o.WALFailover.FailoverOptions.UnhealthyOperationLatencyThreshold())
+		fmt.Fprintf(&buf, "  unhealthy_operation_latency_threshold=%s\n", unhealthyThreshold)
 		fmt.Fprintf(&buf, "  elevated_write_stall_threshold_lag=%s\n", o.WALFailover.FailoverOptions.ElevatedWriteStallThresholdLag)
 	}
 
@@ -1707,8 +1708,8 @@ func (o *Options) Parse(s string, hooks *ParseHooks) error {
 			case "unhealthy_operation_latency_threshold":
 				var threshold time.Duration
 				threshold, err = time.ParseDuration(value)
-				o.WALFailover.UnhealthyOperationLatencyThreshold = func() time.Duration {
-					return threshold
+				o.WALFailover.UnhealthyOperationLatencyThreshold = func() (time.Duration, bool) {
+					return threshold, true
 				}
 			case "elevated_write_stall_threshold_lag":
 				o.WALFailover.ElevatedWriteStallThresholdLag, err = time.ParseDuration(value)
diff --git a/wal/failover_manager.go b/wal/failover_manager.go
index d6466ca5b0..e452098145 100644
--- a/wal/failover_manager.go
+++ b/wal/failover_manager.go
@@ -353,15 +353,17 @@ func (m *failoverMonitor) monitorLoop(shouldQuiesce <-chan struct{}) {
 		// has misconfigured a secondary e.g. wrong permissions or not enough
 		// disk space. We only remember the error history in the context of the
 		// lastWriter since an operator can fix the underlying misconfiguration.
+		unhealthyThreshold, failoverEnabled := m.opts.UnhealthyOperationLatencyThreshold()
+
 		if !(lastWriter.errorCounts[secondaryDirIndex] >= highSecondaryErrorCountThreshold &&
-			dirIndex == primaryDirIndex) {
+			dirIndex == primaryDirIndex) && failoverEnabled {
 			// Switching heuristics. Subject to change based on real world experience.
 			if writerErr != nil {
 				// An error causes an immediate switch, since a LogWriter with an
 				// error is useless.
 				lastWriter.errorCounts[dirIndex]++
 				switchDir = true
-			} else if writerOngoingLatency > m.opts.UnhealthyOperationLatencyThreshold() {
+			} else if writerOngoingLatency > unhealthyThreshold {
 				// Arbitrary value.
 				const switchImmediatelyCountThreshold = 2
 				// High latency. Switch immediately if the number of switches that
diff --git a/wal/failover_manager_test.go b/wal/failover_manager_test.go
index 0d15fc6bb7..784febd4c4 100644
--- a/wal/failover_manager_test.go
+++ b/wal/failover_manager_test.go
@@ -294,6 +294,7 @@ func TestManagerFailover(t *testing.T) {
 	var fs *blockingFS
 	var fm *failoverManager
 	var fw *failoverWriter
+	var allowFailover bool
 	dirs := [numDirIndices]string{"pri", "sec"}
 	datadriven.RunTest(t, "testdata/manager_failover",
 		func(t *testing.T, td *datadriven.TestData) string {
@@ -303,6 +304,7 @@ func TestManagerFailover(t *testing.T) {
 				if !td.HasArg("reuse-fs") {
 					memFS = vfs.NewMem()
 				}
+				allowFailover = !td.HasArg("disable-failover")
 				proberIterationForTesting = make(chan struct{}, 50000)
 				monitorIterationForTesting = make(chan struct{}, 50000)
 				monitorStateBuf.Reset()
@@ -322,7 +324,7 @@ func TestManagerFailover(t *testing.T) {
 						}
 						injs[i] = inj
 					}
-				case "reuse-fs":
+				case "reuse-fs", "disable-failover":
 					// Ignore, already handled above.
 				default:
 					return fmt.Sprintf("unknown arg %s", cmdArg.Key)
@@ -359,7 +361,7 @@ func TestManagerFailover(t *testing.T) {
 				// Use 75ms to not align with PrimaryDirProbeInterval, to avoid
 				// races.
 				UnhealthySamplingInterval:          75 * time.Millisecond,
-				UnhealthyOperationLatencyThreshold: func() time.Duration { return 50 * time.Millisecond },
+				UnhealthyOperationLatencyThreshold: func() (time.Duration, bool) { return 50 * time.Millisecond, allowFailover },
 				ElevatedWriteStallThresholdLag:     10 * time.Second,
 				timeSource:                         ts,
 				monitorIterationForTesting:         monitorIterationForTesting,
@@ -566,7 +568,7 @@ func TestFailoverManager_Quiesce(t *testing.T) {
 			HealthyProbeLatencyThreshold: time.Millisecond,
 			HealthyInterval:              3 * time.Millisecond,
 			UnhealthySamplingInterval:    250 * time.Microsecond,
-			UnhealthyOperationLatencyThreshold: func() time.Duration { return time.Millisecond },
+			UnhealthyOperationLatencyThreshold: func() (time.Duration, bool) { return time.Millisecond, true },
 		},
 	}, nil /* initial logs */))
 	for i := 0; i < 3; i++ {
diff --git a/wal/testdata/manager_failover b/wal/testdata/manager_failover
index 688e270ede..cd0b324a95 100644
--- a/wal/testdata/manager_failover
+++ b/wal/testdata/manager_failover
@@ -664,3 +664,43 @@ list-fs
 pri/000001-002.log
 pri/probe-file
 sec/000001-001.log
+
+# Test that if UnhealthyOperationLatencyThreshold says not to allow failovers
+# yet, failover doesn't occur even if the primary errors.
+
+init-manager disable-failover inject-errors=((ErrInjected (And Writes (PathMatch "*/000001.log"))))
+----
+ok
+recycler min-log-num: 0
+
+# Wait for monitor ticker to start.
+advance-time dur=0ms wait-monitor
+----
+monitor state: dir index: 0
+now: 0s
+
+create-writer wal-num=1
+----
+ok
+
+# Wait until create error has been noticed.
+advance-time dur=1ms wait-ongoing-io wait-for-log-writer
+----
+now: 1ms
+
+# Wait until monitor sees the error.
+advance-time dur=74ms wait-monitor
+----
+monitor state: dir index: 0 num-switches: 0, ongoing-latency-at-switch: 0s
+now: 75ms
+
+close-writer
+----
+injected error
+
+close-manager
+----
+ok
+
+list-fs
+----
diff --git a/wal/wal.go b/wal/wal.go
index 4304e8205a..214a8abbc5 100644
--- a/wal/wal.go
+++ b/wal/wal.go
@@ -184,8 +184,10 @@ type FailoverOptions struct {
 	// errors in the latest LogWriter.
 	UnhealthySamplingInterval time.Duration
 	// UnhealthyOperationLatencyThreshold is the latency threshold that is
-	// considered unhealthy, for operations done by a LogWriter.
-	UnhealthyOperationLatencyThreshold func() time.Duration
+	// considered unhealthy, for operations done by a LogWriter. The second return
+	// value indicates whether we should consider failover at all. If the second
+	// return value is false, failover is disabled.
+	UnhealthyOperationLatencyThreshold func() (time.Duration, bool)
 
 	// ElevatedWriteStallThresholdLag is the duration for which an elevated
 	// threshold should continue after a switch back to the primary dir. This is
@@ -218,8 +220,8 @@ func (o *FailoverOptions) EnsureDefaults() {
 		o.UnhealthySamplingInterval = 100 * time.Millisecond
 	}
 	if o.UnhealthyOperationLatencyThreshold == nil {
-		o.UnhealthyOperationLatencyThreshold = func() time.Duration {
-			return 200 * time.Millisecond
+		o.UnhealthyOperationLatencyThreshold = func() (time.Duration, bool) {
+			return 200 * time.Millisecond, true
 		}
 	}
 }
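
For context, below is a minimal sketch (not part of the patch) of how a client such as CockroachDB might use the new func() (time.Duration, bool) signature to toggle failover at runtime, e.g. flipping it on once cluster version finalization begins. The names failoverAllowed and makeFailoverOptions, the 100ms threshold, and the import path for the wal package are illustrative assumptions; only the FailoverOptions field and its signature come from this patch.

package main

import (
	"fmt"
	"sync/atomic"
	"time"

	"github.com/cockroachdb/pebble/wal"
)

// failoverAllowed is a hypothetical process-wide toggle. The client flips it
// to true once failover becomes safe; until then the option func still
// reports the latency threshold but disallows failover.
var failoverAllowed atomic.Bool

func makeFailoverOptions() wal.FailoverOptions {
	return wal.FailoverOptions{
		UnhealthyOperationLatencyThreshold: func() (time.Duration, bool) {
			// The threshold itself is fixed; the bool is re-read by the
			// failover monitor on each sampling tick, so flipping the flag
			// takes effect without reopening the DB.
			return 100 * time.Millisecond, failoverAllowed.Load()
		},
	}
}

func main() {
	opts := makeFailoverOptions()
	if d, ok := opts.UnhealthyOperationLatencyThreshold(); !ok {
		fmt.Printf("failover currently disabled; threshold=%s\n", d)
	}
	// Later, e.g. when cluster version finalization begins:
	failoverAllowed.Store(true)
	if _, ok := opts.UnhealthyOperationLatencyThreshold(); ok {
		fmt.Println("failover now permitted")
	}
}

Backing the toggle with an atomic keeps the option func cheap and safe to call from the monitor goroutine on every sampling interval.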