roachtest: make failure recovery independent
Previously, the multiple failures were all started, and later all recovered,
sequentially from a single goroutine. This caused a problem if recovering from
one failure depended on a different failure having recovered first. To mitigate
this, and to add a little more chaos, start and recover each failure in a
separate goroutine. This allows the "most important" failure to recover first
so that the others can recover if they depend on it.
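
As a rough sketch of the pattern (simplified and hypothetical: a stand-in
Failer interface and plain goroutines instead of the test monitor's m.Go; the
real change also draws the hold time with randutil.RandDuration):

package main

import (
	"context"
	"fmt"
	"math/rand"
	"sync"
	"time"
)

// Failer is a stand-in for the roachtest failure-injection interface.
type Failer interface {
	Fail(ctx context.Context, node int)
	Recover(ctx context.Context, node int)
}

// logFailer only logs; a real failer would kill processes, partition the
// network, stall disks, etc.
type logFailer struct{ name string }

func (f logFailer) Fail(_ context.Context, node int) {
	fmt.Printf("failing n%d (%s)\n", node, f.name)
}

func (f logFailer) Recover(_ context.Context, node int) {
	fmt.Printf("recovering n%d (%s)\n", node, f.name)
}

func main() {
	ctx := context.Background()
	rng := rand.New(rand.NewSource(time.Now().UnixNano()))
	nodeFailers := map[int]Failer{1: logFailer{"crash"}, 2: logFailer{"partition"}}

	var wg sync.WaitGroup
	for node, failer := range nodeFailers {
		node, failer := node, failer // capture loop variables for the goroutine
		// Pick the hold duration up front so the shared rng is not used from
		// multiple goroutines concurrently.
		holdFor := time.Duration(rng.Int63n(int64(3 * time.Second)))
		wg.Add(1)
		go func() {
			defer wg.Done()
			failer.Fail(ctx, node)
			time.Sleep(holdFor) // maintain the failure, then recover independently
			failer.Recover(ctx, node)
		}()
	}
	wg.Wait() // every failure has recovered before the chaos round ends
}

Here wg.Wait() keeps the chaos round from moving on until every failure has
recovered, while each goroutine injects, holds, and recovers its failure on its
own schedule.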

Note that this is more important today, while we don't yet handle all the
failure modes that the chaos test injects. Specifically, we don't handle
partial partitions yet.

Epic: none
Fixes: cockroachdb#119085
Fixes: cockroachdb#119347
Fixes: cockroachdb#119361
Fixes: cockroachdb#119454

Release note: None
andrewbaptist committed Feb 26, 2024
1 parent 903e2fc commit 1f0bef6
Showing 1 changed file with 31 additions and 18 deletions.
49 changes: 31 additions & 18 deletions pkg/cmd/roachtest/tests/failover.go
@@ -15,6 +15,7 @@ import (
 	gosql "database/sql"
 	"fmt"
 	"math/rand"
+	"sync"
 	"time"
 
 	"github.com/cockroachdb/cockroach/pkg/base"
@@ -320,29 +321,41 @@ func runFailoverChaos(ctx context.Context, t test.Test, c cluster.Cluster, readO
 			nodeFailers[node] = failer
 		}
 
+		// Run the failers on different goroutines. Otherwise, they can interact
+		// by having certain failures in place preventing other failures from
+		// recovering.
+		var wg sync.WaitGroup
 		for node, failer := range nodeFailers {
-			// If the failer supports partial failures (e.g. partial partitions), do
-			// one with 50% probability against a random node (including SQL
-			// gateways).
-			if partialFailer, ok := failer.(PartialFailer); ok && rng.Float64() < 0.5 {
-				var partialPeer int
-				for partialPeer == 0 || partialPeer == node {
-					partialPeer = 1 + rng.Intn(9)
+			node := node
+			failer := failer
+			wg.Add(1)
+			m.Go(func(ctx context.Context) error {
+				defer wg.Done()
+				// If the failer supports partial failures (e.g. partial partitions), do
+				// one with 50% probability against a random node (including SQL
+				// gateways).
+				if partialFailer, ok := failer.(PartialFailer); ok && rng.Float64() < 0.5 {
+					var partialPeer int
+					for partialPeer == 0 || partialPeer == node {
+						partialPeer = 1 + rng.Intn(9)
+					}
+					t.L().Printf("failing n%d to n%d (%s)", node, partialPeer, failer)
+					partialFailer.FailPartial(ctx, node, []int{partialPeer})
+				} else {
+					t.L().Printf("failing n%d (%s)", node, failer)
+					failer.Fail(ctx, node)
 				}
-				t.L().Printf("failing n%d to n%d (%s)", node, partialPeer, failer)
-				partialFailer.FailPartial(ctx, node, []int{partialPeer})
-			} else {
-				t.L().Printf("failing n%d (%s)", node, failer)
-				failer.Fail(ctx, node)
-			}
-		}
 
-		sleepFor(ctx, t, time.Minute)
+				// Maintain the failure for up to 90 seconds before recovering.
+				sleepFor(ctx, t, randutil.RandDuration(rng, 90*time.Second))
 
-		for node, failer := range nodeFailers {
-			t.L().Printf("recovering n%d (%s)", node, failer)
-			failer.Recover(ctx, node)
+				t.L().Printf("recovering n%d (%s)", node, failer)
+				failer.Recover(ctx, node)
+
+				return nil
+			})
 		}
+		wg.Wait()
 	}
 
 	sleepFor(ctx, t, time.Minute) // let cluster recover