From 234e513a0a9be5834ea29af6742894456c65646c Mon Sep 17 00:00:00 2001
From: Andrew Baptist <baptist@cockroachlabs.com>
Date: Mon, 26 Feb 2024 18:37:25 +0000
Subject: [PATCH] roachtest: make failure recovery independent

Previously, the multiple failures were started and finished
independently. This caused a problem if the ability to recover from one
failure depended on a different failure recovering first. To mitigate
this, recover each failure in a separate goroutine. This will allow the
"most important" failure to recover first so that the others can recover
if they depend on each other.

This is more important today while we don't recover from all the failure
modes that chaos implements. Specifically we don't handle partial
partitions fully with epoch leases.

Epic: none
Fixes: #119085
Fixes: #119347
Fixes: #119361
Fixes: #119454

Release note: None
---
 pkg/cmd/roachtest/tests/failover.go | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/pkg/cmd/roachtest/tests/failover.go b/pkg/cmd/roachtest/tests/failover.go
index e4f86256c1b5..98b5828fc88b 100644
--- a/pkg/cmd/roachtest/tests/failover.go
+++ b/pkg/cmd/roachtest/tests/failover.go
@@ -15,6 +15,7 @@ import (
 	gosql "database/sql"
 	"fmt"
 	"math/rand"
+	"sync"
 	"time"
 
 	"github.com/cockroachdb/cockroach/pkg/base"
@@ -339,10 +340,23 @@ func runFailoverChaos(ctx context.Context, t test.Test, c cluster.Cluster, readO
 
 			sleepFor(ctx, t, time.Minute)
 
+			// Recover the failers on different goroutines. Otherwise, they
+			// might interact as certain failures can prevent other failures
+			// from recovering.
+			var wg sync.WaitGroup
 			for node, failer := range nodeFailers {
-				t.L().Printf("recovering n%d (%s)", node, failer)
-				failer.Recover(ctx, node)
+				wg.Add(1)
+				node := node
+				failer := failer
+				m.Go(func(ctx context.Context) error {
+					defer wg.Done()
+					t.L().Printf("recovering n%d (%s)", node, failer)
+					failer.Recover(ctx, node)
+
+					return nil
+				})
 			}
+			wg.Wait()
 		}
 
 		sleepFor(ctx, t, time.Minute) // let cluster recover