Skip to content

Commit

Permalink
roachtest: wait for stability in rebalance load
Browse files Browse the repository at this point in the history
The `rebalance/by-load` roachtests would immediately pass once the
balance target was hit. However, it was possible that the cluster was
only transiently balanced.

Bump the timeout of all tests to a uniform 10 minutes (previously
5-10 minutes), and require that the load remains balanced for at least
1 minute before the test passes.

Informs: cockroachdb#107247

Release note: None
  • Loading branch information
kvoli committed Jul 31, 2023
1 parent 480c277 commit e1f4739
Showing 1 changed file with 18 additions and 8 deletions.
26 changes: 18 additions & 8 deletions pkg/cmd/roachtest/tests/rebalance_load.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ const (
meanCPUTolerance = 0.15
// statSamplePeriod is the period at which timeseries stats are sampled.
statSamplePeriod = 10 * time.Second
// stableDuration is the duration for which the cluster's load must remain
// balanced in order for the test to pass.
stableDuration = time.Minute
)

func registerRebalanceLoad(r registry.Registry) {
Expand Down Expand Up @@ -150,7 +153,8 @@ func registerRebalanceLoad(r registry.Registry) {
}

var reason string
var done bool
var balancedStartTime time.Time
var prevIsBalanced bool
for tBegin := timeutil.Now(); timeutil.Since(tBegin) <= maxDuration; {
// Wait out the sample period initially to allow the timeseries to
// populate meaningful information for the test to query.
Expand All @@ -160,14 +164,20 @@ func registerRebalanceLoad(r registry.Registry) {
case <-time.After(statSamplePeriod):
}

now := timeutil.Now()
clusterStoresCPU, err := storeCPUFn(ctx)
if err != nil {
t.L().Printf("unable to get the cluster stores CPU %s\n", err.Error())
continue
}

done, reason = isLoadEvenlyDistributed(clusterStoresCPU, meanCPUTolerance)
var curIsBalanced bool
curIsBalanced, reason = isLoadEvenlyDistributed(clusterStoresCPU, meanCPUTolerance)
t.L().Printf("cpu %s", reason)
if done {
if !prevIsBalanced && curIsBalanced {
balancedStartTime = now
}
prevIsBalanced = curIsBalanced
if prevIsBalanced && now.Sub(balancedStartTime) > stableDuration {
t.Status("successfully achieved CPU balance; waiting for kv to finish running")
cancel()
return nil
Expand All @@ -194,7 +204,7 @@ func registerRebalanceLoad(r registry.Registry) {
concurrency = 32
fmt.Printf("lowering concurrency to %d in local testing\n", concurrency)
}
rebalanceLoadRun(ctx, t, c, "leases", 5*time.Minute, concurrency, false /* mixedVersion */)
rebalanceLoadRun(ctx, t, c, "leases", 10*time.Minute, concurrency, false /* mixedVersion */)
},
},
)
Expand All @@ -208,7 +218,7 @@ func registerRebalanceLoad(r registry.Registry) {
concurrency = 32
fmt.Printf("lowering concurrency to %d in local testing\n", concurrency)
}
rebalanceLoadRun(ctx, t, c, "leases", 5*time.Minute, concurrency, true /* mixedVersion */)
rebalanceLoadRun(ctx, t, c, "leases", 10*time.Minute, concurrency, true /* mixedVersion */)
},
},
)
Expand All @@ -224,7 +234,7 @@ func registerRebalanceLoad(r registry.Registry) {
fmt.Printf("lowering concurrency to %d in local testing\n", concurrency)
}
rebalanceLoadRun(
ctx, t, c, "leases and replicas", 5*time.Minute, concurrency, false, /* mixedVersion */
ctx, t, c, "leases and replicas", 10*time.Minute, concurrency, false, /* mixedVersion */
)
},
},
Expand All @@ -240,7 +250,7 @@ func registerRebalanceLoad(r registry.Registry) {
fmt.Printf("lowering concurrency to %d in local testing\n", concurrency)
}
rebalanceLoadRun(
ctx, t, c, "leases and replicas", 5*time.Minute, concurrency, true, /* mixedVersion */
ctx, t, c, "leases and replicas", 10*time.Minute, concurrency, true, /* mixedVersion */
)
},
},
Expand Down

0 comments on commit e1f4739

Please sign in to comment.