From b6e563f67d90a66be7ec65d412a230109cd65b9e Mon Sep 17 00:00:00 2001
From: Jackson Owens
Date: Wed, 27 Mar 2024 15:21:27 -0400
Subject: [PATCH] wal: synchronously verify secondary is writable

When initializing the WAL failover manager, synchronously verify that we
can write to the secondary directory by writing some human-readable
metadata about the Pebble instance using it as a secondary.

Informs #3230.
---
 wal/failover_manager.go       | 22 ++++++++++++++++++++++
 wal/failover_manager_test.go  |  9 +++++++++
 wal/testdata/manager_failover |  7 +++++++
 3 files changed, 38 insertions(+)

diff --git a/wal/failover_manager.go b/wal/failover_manager.go
index e452098145..6463795f71 100644
--- a/wal/failover_manager.go
+++ b/wal/failover_manager.go
@@ -5,10 +5,13 @@
 package wal
 
 import (
+	"fmt"
+	"io"
 	"os"
 	"sync"
 	"time"
 
+	"github.com/cockroachdb/errors"
 	"github.com/cockroachdb/pebble/internal/base"
 	"github.com/cockroachdb/pebble/vfs"
 	"golang.org/x/exp/rand"
@@ -472,6 +475,25 @@ func (wm *failoverManager) init(o Options, initial Logs) error {
 		o.timeSource = defaultTime{}
 	}
 	o.FailoverOptions.EnsureDefaults()
+
+	// Synchronously ensure that we're able to write to the secondary before we
+	// proceed. An operator doesn't want to encounter an issue writing to the
+	// secondary the first time there's a need to failover. We write a bit of
+	// metadata to a file in the secondary's directory.
+	f, err := o.Secondary.FS.Create(o.Secondary.FS.PathJoin(o.Secondary.Dirname, "failover_source"))
+	if err != nil {
+		return errors.Newf("failed to write to WAL secondary dir: %v", err)
+	}
+	if _, err := io.WriteString(f, fmt.Sprintf("primary: %s\nprocess start: %s\n",
+		o.Primary.Dirname,
+		time.Now(),
+	)); err != nil {
+		return errors.Newf("failed to write metadata to WAL secondary dir: %v", err)
+	}
+	if err := errors.CombineErrors(f.Sync(), f.Close()); err != nil {
+		return err
+	}
+
 	stopper := newStopper()
 	var dirs [numDirIndices]dirAndFileHandle
 	for i, dir := range []Dir{o.Primary, o.Secondary} {
diff --git a/wal/failover_manager_test.go b/wal/failover_manager_test.go
index 784febd4c4..bf174add9d 100644
--- a/wal/failover_manager_test.go
+++ b/wal/failover_manager_test.go
@@ -582,6 +582,15 @@ func TestFailoverManager_Quiesce(t *testing.T) {
 	require.NoError(t, m.Close())
 }
 
+func TestFailoverManager_SecondaryIsWritable(t *testing.T) {
+	var m failoverManager
+	require.EqualError(t, m.init(Options{
+		Primary:         Dir{FS: vfs.NewMem(), Dirname: "primary"},
+		Secondary:       Dir{FS: errorfs.Wrap(vfs.NewMem(), errorfs.ErrInjected), Dirname: "secondary"},
+		PreallocateSize: func() int { return 4 },
+	}, nil /* initial logs */), "failed to write to WAL secondary dir: injected error")
+}
+
 // TODO(sumeer): test wrap around of history in dirProber.
 
 // TODO(sumeer): the failover datadriven test cases are not easy to write,
diff --git a/wal/testdata/manager_failover b/wal/testdata/manager_failover
index cd0b324a95..6ce45be820 100644
--- a/wal/testdata/manager_failover
+++ b/wal/testdata/manager_failover
@@ -42,6 +42,7 @@ ok
 list-fs
 ----
 pri/000001.log
+sec/failover_source
 
 create-writer wal-num=2
 ----
@@ -59,6 +60,7 @@ list-fs
 ----
 pri/000001.log
 pri/000002.log
+sec/failover_source
 
 close-manager
 ----
@@ -353,6 +355,7 @@ list-fs
 pri/000001-002.log
 pri/000002.log
 sec/000001-001.log
+sec/failover_source
 
 # Test with dampening of switching based on latency and secondary errors.
 #
@@ -417,6 +420,7 @@ now: 77ms
 
 list-fs
 ----
+sec/failover_source
 
 # Wait until monitor sees the error and switches back to primary.
 advance-time dur=75ms wait-monitor wait-prober
@@ -542,6 +546,7 @@ list-fs
 pri/000001-002.log
 pri/000001-004.log
 pri/000001.log
+sec/failover_source
 
 # Test failback after primary is healthy.
 init-manager inject-errors=((ErrInjected (And Writes (PathMatch "*/000001.log"))))
@@ -664,6 +669,7 @@ list-fs
 pri/000001-002.log
 pri/probe-file
 sec/000001-001.log
+sec/failover_source
 
 # Test that if UnhealthyOperationLatencyThreshold says not to allow failovers
 # yet, failover doesn't occur even if the primary errors.
@@ -704,3 +710,4 @@ ok
 
 list-fs
 ----
+sec/failover_source
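
For illustration, below is a minimal standalone sketch of the synchronous
writability probe that the patch adds to failoverManager.init. The
checkSecondaryWritable helper, the main harness, and the directory names are
hypothetical and not part of the change; the sketch only mirrors the pattern
above and assumes the one-argument vfs.FS.Create signature the patch uses.

package main

import (
	"fmt"
	"io"
	"time"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/vfs"
)

// checkSecondaryWritable is a hypothetical helper mirroring the check added
// to (*failoverManager).init: it writes a small human-readable metadata file
// into the secondary WAL directory and reports any failure synchronously, so
// an unwritable secondary is discovered at startup rather than at the first
// failover.
func checkSecondaryWritable(fs vfs.FS, secondaryDir, primaryDir string) error {
	f, err := fs.Create(fs.PathJoin(secondaryDir, "failover_source"))
	if err != nil {
		return errors.Newf("failed to write to WAL secondary dir: %v", err)
	}
	if _, err := io.WriteString(f, fmt.Sprintf("primary: %s\nprocess start: %s\n",
		primaryDir, time.Now())); err != nil {
		return errors.Newf("failed to write metadata to WAL secondary dir: %v", err)
	}
	// Sync and close, surfacing whichever error occurs first.
	return errors.CombineErrors(f.Sync(), f.Close())
}

func main() {
	// Use an in-memory filesystem so the sketch is self-contained.
	fs := vfs.NewMem()
	if err := fs.MkdirAll("sec", 0755); err != nil {
		panic(err)
	}
	if err := checkSecondaryWritable(fs, "sec", "pri"); err != nil {
		fmt.Println("secondary probe failed:", err)
		return
	}
	fmt.Println("secondary is writable")
}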