diff --git a/db_test.go b/db_test.go
index 7a3b0d1b69..e363d6016a 100644
--- a/db_test.go
+++ b/db_test.go
@@ -2058,7 +2058,7 @@ func TestWALFailoverAvoidsWriteStall(t *testing.T) {
 	secondary := wal.Dir{FS: mem, Dirname: "secondary"}
 	walFailover := &WALFailoverOptions{Secondary: secondary, FailoverOptions: wal.FailoverOptions{
 		UnhealthySamplingInterval: 100 * time.Millisecond,
-		UnhealthyOperationLatencyThreshold: func() time.Duration { return time.Second },
+		UnhealthyOperationLatencyThreshold: func() (time.Duration, bool) { return time.Second, true },
 	}}
 	o := &Options{
 		FS: primaryFS,
diff --git a/metamorphic/options.go b/metamorphic/options.go
index 24a21c7a0f..46544928c4 100644
--- a/metamorphic/options.go
+++ b/metamorphic/options.go
@@ -629,8 +629,8 @@ func RandomOptions(
 			HealthyProbeLatencyThreshold: healthyThreshold,
 			HealthyInterval: healthyInterval,
 			UnhealthySamplingInterval: scaleDuration(unhealthyThreshold, 0.10, 0.50), // Between 10-50% of the unhealthy threshold
-			UnhealthyOperationLatencyThreshold: func() time.Duration {
-				return unhealthyThreshold
+			UnhealthyOperationLatencyThreshold: func() (time.Duration, bool) {
+				return unhealthyThreshold, true
 			},
 			ElevatedWriteStallThresholdLag: expRandDuration(rng, 5*referenceDur, 2*time.Second),
 		},
diff --git a/options.go b/options.go
index f370255ddc..d617fb3b17 100644
--- a/options.go
+++ b/options.go
@@ -1380,6 +1380,7 @@ func (o *Options) String() string {
 	}
 
 	if o.WALFailover != nil {
+		unhealthyThreshold, _ := o.WALFailover.FailoverOptions.UnhealthyOperationLatencyThreshold()
 		fmt.Fprintf(&buf, "\n")
 		fmt.Fprintf(&buf, "[WAL Failover]\n")
 		fmt.Fprintf(&buf, "  secondary_dir=%s\n", o.WALFailover.Secondary.Dirname)
@@ -1387,7 +1388,7 @@
 		fmt.Fprintf(&buf, "  healthy_probe_latency_threshold=%s\n", o.WALFailover.FailoverOptions.HealthyProbeLatencyThreshold)
 		fmt.Fprintf(&buf, "  healthy_interval=%s\n", o.WALFailover.FailoverOptions.HealthyInterval)
 		fmt.Fprintf(&buf, "  unhealthy_sampling_interval=%s\n", o.WALFailover.FailoverOptions.UnhealthySamplingInterval)
-		fmt.Fprintf(&buf, "  unhealthy_operation_latency_threshold=%s\n", o.WALFailover.FailoverOptions.UnhealthyOperationLatencyThreshold())
+		fmt.Fprintf(&buf, "  unhealthy_operation_latency_threshold=%s\n", unhealthyThreshold)
 		fmt.Fprintf(&buf, "  elevated_write_stall_threshold_lag=%s\n", o.WALFailover.FailoverOptions.ElevatedWriteStallThresholdLag)
 	}
 
@@ -1707,8 +1708,8 @@ func (o *Options) Parse(s string, hooks *ParseHooks) error {
 			case "unhealthy_operation_latency_threshold":
 				var threshold time.Duration
 				threshold, err = time.ParseDuration(value)
-				o.WALFailover.UnhealthyOperationLatencyThreshold = func() time.Duration {
-					return threshold
+				o.WALFailover.UnhealthyOperationLatencyThreshold = func() (time.Duration, bool) {
+					return threshold, true
 				}
 			case "elevated_write_stall_threshold_lag":
 				o.WALFailover.ElevatedWriteStallThresholdLag, err = time.ParseDuration(value)
diff --git a/wal/failover_manager.go b/wal/failover_manager.go
index d6466ca5b0..e452098145 100644
--- a/wal/failover_manager.go
+++ b/wal/failover_manager.go
@@ -353,15 +353,17 @@ func (m *failoverMonitor) monitorLoop(shouldQuiesce <-chan struct{}) {
 		// has misconfigured a secondary e.g. wrong permissions or not enough
 		// disk space. We only remember the error history in the context of the
 		// lastWriter since an operator can fix the underlying misconfiguration.
+		unhealthyThreshold, failoverEnabled := m.opts.UnhealthyOperationLatencyThreshold()
+
 		if !(lastWriter.errorCounts[secondaryDirIndex] >= highSecondaryErrorCountThreshold &&
-			dirIndex == primaryDirIndex) {
+			dirIndex == primaryDirIndex) && failoverEnabled {
 			// Switching heuristics. Subject to change based on real world experience.
 			if writerErr != nil {
 				// An error causes an immediate switch, since a LogWriter with an
 				// error is useless.
 				lastWriter.errorCounts[dirIndex]++
 				switchDir = true
-			} else if writerOngoingLatency > m.opts.UnhealthyOperationLatencyThreshold() {
+			} else if writerOngoingLatency > unhealthyThreshold {
 				// Arbitrary value.
 				const switchImmediatelyCountThreshold = 2
 				// High latency. Switch immediately if the number of switches that
diff --git a/wal/failover_manager_test.go b/wal/failover_manager_test.go
index 0d15fc6bb7..784febd4c4 100644
--- a/wal/failover_manager_test.go
+++ b/wal/failover_manager_test.go
@@ -294,6 +294,7 @@ func TestManagerFailover(t *testing.T) {
 	var fs *blockingFS
 	var fm *failoverManager
 	var fw *failoverWriter
+	var allowFailover bool
 	dirs := [numDirIndices]string{"pri", "sec"}
 	datadriven.RunTest(t, "testdata/manager_failover",
 		func(t *testing.T, td *datadriven.TestData) string {
@@ -303,6 +304,7 @@
 			if !td.HasArg("reuse-fs") {
 				memFS = vfs.NewMem()
 			}
+			allowFailover = !td.HasArg("disable-failover")
 			proberIterationForTesting = make(chan struct{}, 50000)
 			monitorIterationForTesting = make(chan struct{}, 50000)
 			monitorStateBuf.Reset()
@@ -322,7 +324,7 @@
 					}
 					injs[i] = inj
 				}
-			case "reuse-fs":
+			case "reuse-fs", "disable-failover":
 				// Ignore, already handled above.
 			default:
 				return fmt.Sprintf("unknown arg %s", cmdArg.Key)
@@ -359,7 +361,7 @@
 				// Use 75ms to not align with PrimaryDirProbeInterval, to avoid
 				// races.
 				UnhealthySamplingInterval: 75 * time.Millisecond,
-				UnhealthyOperationLatencyThreshold: func() time.Duration { return 50 * time.Millisecond },
+				UnhealthyOperationLatencyThreshold: func() (time.Duration, bool) { return 50 * time.Millisecond, allowFailover },
 				ElevatedWriteStallThresholdLag: 10 * time.Second,
 				timeSource: ts,
 				monitorIterationForTesting: monitorIterationForTesting,
@@ -566,7 +568,7 @@ func TestFailoverManager_Quiesce(t *testing.T) {
 			HealthyProbeLatencyThreshold: time.Millisecond,
 			HealthyInterval: 3 * time.Millisecond,
 			UnhealthySamplingInterval: 250 * time.Microsecond,
-			UnhealthyOperationLatencyThreshold: func() time.Duration { return time.Millisecond },
+			UnhealthyOperationLatencyThreshold: func() (time.Duration, bool) { return time.Millisecond, true },
 		},
 	}, nil /* initial logs */))
 	for i := 0; i < 3; i++ {
diff --git a/wal/testdata/manager_failover b/wal/testdata/manager_failover
index 688e270ede..cd0b324a95 100644
--- a/wal/testdata/manager_failover
+++ b/wal/testdata/manager_failover
@@ -664,3 +664,43 @@ list-fs
 pri/000001-002.log
 pri/probe-file
 sec/000001-001.log
+
+# Test that if UnhealthyOperationLatencyThreshold says not to allow failovers
+# yet, failover doesn't occur even if the primary errors.
+
+init-manager disable-failover inject-errors=((ErrInjected (And Writes (PathMatch "*/000001.log"))))
+----
+ok
+recycler min-log-num: 0
+
+# Wait for monitor ticker to start.
+advance-time dur=0ms wait-monitor
+----
+monitor state: dir index: 0
+now: 0s
+
+create-writer wal-num=1
+----
+ok
+
+# Wait until create error has been noticed.
+advance-time dur=1ms wait-ongoing-io wait-for-log-writer
+----
+now: 1ms
+
+# Wait until monitor sees the error.
+advance-time dur=74ms wait-monitor
+----
+monitor state: dir index: 0 num-switches: 0, ongoing-latency-at-switch: 0s
+now: 75ms
+
+close-writer
+----
+injected error
+
+close-manager
+----
+ok
+
+list-fs
+----
diff --git a/wal/wal.go b/wal/wal.go
index 4304e8205a..214a8abbc5 100644
--- a/wal/wal.go
+++ b/wal/wal.go
@@ -184,8 +184,10 @@ type FailoverOptions struct {
 	// errors in the latest LogWriter.
 	UnhealthySamplingInterval time.Duration
 	// UnhealthyOperationLatencyThreshold is the latency threshold that is
-	// considered unhealthy, for operations done by a LogWriter.
-	UnhealthyOperationLatencyThreshold func() time.Duration
+	// considered unhealthy, for operations done by a LogWriter. The second
+	// return value indicates whether failover should be considered at all; if
+	// it is false, failover is disabled.
+	UnhealthyOperationLatencyThreshold func() (time.Duration, bool)
 
 	// ElevatedWriteStallThresholdLag is the duration for which an elevated
 	// threshold should continue after a switch back to the primary dir. This is
@@ -218,8 +220,8 @@ func (o *FailoverOptions) EnsureDefaults() {
 		o.UnhealthySamplingInterval = 100 * time.Millisecond
 	}
 	if o.UnhealthyOperationLatencyThreshold == nil {
-		o.UnhealthyOperationLatencyThreshold = func() time.Duration {
-			return 200 * time.Millisecond
+		o.UnhealthyOperationLatencyThreshold = func() (time.Duration, bool) {
+			return 200 * time.Millisecond, true
 		}
 	}
 }
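A minimal sketch of how a caller might wire up the new two-value signature to gate WAL failover behind a runtime toggle. The import paths and the atomic.Bool toggle are assumptions for illustration; the option fields mirror the diff above. Because monitorLoop re-reads the closure on each unhealthy-sampling tick, flipping the toggle takes effect without reopening the DB.

```go
package walconfig

import (
	"sync/atomic"
	"time"

	"github.com/cockroachdb/pebble"
	"github.com/cockroachdb/pebble/vfs"
	"github.com/cockroachdb/pebble/wal"
)

// walFailoverOptions returns WAL failover options whose
// UnhealthyOperationLatencyThreshold closure reports both the latency
// threshold and whether failover is currently permitted; returning false
// in the second position disables failover entirely.
func walFailoverOptions(enabled *atomic.Bool) *pebble.WALFailoverOptions {
	return &pebble.WALFailoverOptions{
		// Hypothetical secondary WAL directory.
		Secondary: wal.Dir{FS: vfs.Default, Dirname: "/mnt/wal-failover"},
		FailoverOptions: wal.FailoverOptions{
			UnhealthySamplingInterval: 100 * time.Millisecond,
			UnhealthyOperationLatencyThreshold: func() (time.Duration, bool) {
				// Re-evaluated on every sampling tick by the failover monitor.
				return 200 * time.Millisecond, enabled.Load()
			},
		},
	}
}
```

The result would then be assigned to Options.WALFailover (e.g. o.WALFailover = walFailoverOptions(&toggle)) before opening the DB.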