roachtest: make failure recovery independent
Previously, the multiple failures were all started, and later all recovered,
sequentially from a single goroutine. This caused a problem if recovering from
one failure depended on a different failure having recovered first. To mitigate
this, and to add a little more chaos, start and recover each failure in a
separate goroutine. This allows the "most important" failure to recover first
so that the others can recover if they depend on it.
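
As a rough sketch of the pattern (simplified and hypothetical: a stand-in
Failer interface and plain goroutines instead of the test monitor's m.Go; the
real change also draws the hold time with randutil.RandDuration):

package main

import (
	"context"
	"fmt"
	"math/rand"
	"sync"
	"time"
)

// Failer is a stand-in for the roachtest failure-injection interface.
type Failer interface {
	Fail(ctx context.Context, node int)
	Recover(ctx context.Context, node int)
}

// logFailer only logs; a real failer would kill processes, partition the
// network, stall disks, etc.
type logFailer struct{ name string }

func (f logFailer) Fail(_ context.Context, node int) {
	fmt.Printf("failing n%d (%s)\n", node, f.name)
}

func (f logFailer) Recover(_ context.Context, node int) {
	fmt.Printf("recovering n%d (%s)\n", node, f.name)
}

func main() {
	ctx := context.Background()
	rng := rand.New(rand.NewSource(time.Now().UnixNano()))
	nodeFailers := map[int]Failer{1: logFailer{"crash"}, 2: logFailer{"partition"}}

	var wg sync.WaitGroup
	for node, failer := range nodeFailers {
		node, failer := node, failer // capture loop variables for the goroutine
		// Pick the hold duration up front so the shared rng is not used from
		// multiple goroutines concurrently.
		holdFor := time.Duration(rng.Int63n(int64(3 * time.Second)))
		wg.Add(1)
		go func() {
			defer wg.Done()
			failer.Fail(ctx, node)
			time.Sleep(holdFor) // maintain the failure, then recover independently
			failer.Recover(ctx, node)
		}()
	}
	wg.Wait() // every failure has recovered before the chaos round ends
}

Here wg.Wait() keeps the chaos round from moving on until every failure has
recovered, while each goroutine injects, holds, and recovers its failure on its
own schedule.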

Note that this is more important today, while we don't yet handle all the
failure modes that the chaos test injects. Specifically, we don't handle
partial partitions yet.

Epic: none
Fixes: cockroachdb#119085
Fixes: cockroachdb#119347
Fixes: cockroachdb#119361
Fixes: cockroachdb#119454

Release note: None
andrewbaptist committed Feb 26, 2024
1 parent 903e2fc commit 1f0bef6
Showing 1 changed file with 31 additions and 18 deletions.
49 changes: 31 additions & 18 deletions pkg/cmd/roachtest/tests/failover.go
@@ -15,6 +15,7 @@ import (
 	gosql "database/sql"
 	"fmt"
 	"math/rand"
+	"sync"
 	"time"
 
 	"github.com/cockroachdb/cockroach/pkg/base"
@@ -320,29 +321,41 @@ func runFailoverChaos(ctx context.Context, t test.Test, c cluster.Cluster, readO
 			nodeFailers[node] = failer
 		}
 
+		// Run the failers on different goroutines. Otherwise, they can interact
+		// by having certain failures in place preventing other failures from
+		// recovering.
+		var wg sync.WaitGroup
 		for node, failer := range nodeFailers {
-			// If the failer supports partial failures (e.g. partial partitions), do
-			// one with 50% probability against a random node (including SQL
-			// gateways).
-			if partialFailer, ok := failer.(PartialFailer); ok && rng.Float64() < 0.5 {
-				var partialPeer int
-				for partialPeer == 0 || partialPeer == node {
-					partialPeer = 1 + rng.Intn(9)
+			node := node
+			failer := failer
+			wg.Add(1)
+			m.Go(func(ctx context.Context) error {
+				defer wg.Done()
+				// If the failer supports partial failures (e.g. partial partitions), do
+				// one with 50% probability against a random node (including SQL
+				// gateways).
+				if partialFailer, ok := failer.(PartialFailer); ok && rng.Float64() < 0.5 {
+					var partialPeer int
+					for partialPeer == 0 || partialPeer == node {
+						partialPeer = 1 + rng.Intn(9)
+					}
+					t.L().Printf("failing n%d to n%d (%s)", node, partialPeer, failer)
+					partialFailer.FailPartial(ctx, node, []int{partialPeer})
+				} else {
+					t.L().Printf("failing n%d (%s)", node, failer)
+					failer.Fail(ctx, node)
 				}
-				t.L().Printf("failing n%d to n%d (%s)", node, partialPeer, failer)
-				partialFailer.FailPartial(ctx, node, []int{partialPeer})
-			} else {
-				t.L().Printf("failing n%d (%s)", node, failer)
-				failer.Fail(ctx, node)
-			}
-		}
 
-		sleepFor(ctx, t, time.Minute)
+				// Maintain the failure for up to 90 seconds before recovering.
+				sleepFor(ctx, t, randutil.RandDuration(rng, 90*time.Second))
 
-		for node, failer := range nodeFailers {
-			t.L().Printf("recovering n%d (%s)", node, failer)
-			failer.Recover(ctx, node)
+				t.L().Printf("recovering n%d (%s)", node, failer)
+				failer.Recover(ctx, node)
+
+				return nil
+			})
 		}
+		wg.Wait()
 	}
 
 	sleepFor(ctx, t, time.Minute) // let cluster recover