From 234e513a0a9be5834ea29af6742894456c65646c Mon Sep 17 00:00:00 2001 From: Andrew Baptist Date: Mon, 26 Feb 2024 18:37:25 +0000 Subject: [PATCH] roachtest: make failure recovery independent Previously, the multiple failures were started and finished independently. This caused a problem if the ability to recover from one failure depended on a different failure recovering first. To mitigate this, recover each failure in a separate goroutine. This will allow the "most important" failure to recover first so that the others can recover if they depend on each other. This is more important today while we don't recover from all the failure modes that chaos implements. Specifically we don't handle partial partitions fully with epoch leases. Epic: none Fixes: #119085 Fixes: #119347 Fixes: #119361 Fixes: #119454 Release note: None --- pkg/cmd/roachtest/tests/failover.go | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/pkg/cmd/roachtest/tests/failover.go b/pkg/cmd/roachtest/tests/failover.go index e4f86256c1b5..98b5828fc88b 100644 --- a/pkg/cmd/roachtest/tests/failover.go +++ b/pkg/cmd/roachtest/tests/failover.go @@ -15,6 +15,7 @@ import ( gosql "database/sql" "fmt" "math/rand" + "sync" "time" "github.com/cockroachdb/cockroach/pkg/base" @@ -339,10 +340,23 @@ func runFailoverChaos(ctx context.Context, t test.Test, c cluster.Cluster, readO sleepFor(ctx, t, time.Minute) + // Recover the failers on different goroutines. Otherwise, they + // might interact as certain failures can prevent other failures + // from recovering. + var wg sync.WaitGroup for node, failer := range nodeFailers { - t.L().Printf("recovering n%d (%s)", node, failer) - failer.Recover(ctx, node) + wg.Add(1) + node := node + failer := failer + m.Go(func(ctx context.Context) error { + defer wg.Done() + t.L().Printf("recovering n%d (%s)", node, failer) + failer.Recover(ctx, node) + + return nil + }) } + wg.Wait() } sleepFor(ctx, t, time.Minute) // let cluster recover