Skip to content

Commit

Permalink
roachtest: tpccbench: handle overload vm crash in last search iter
Browse files Browse the repository at this point in the history
`tpccbench` is set up to "handle" (ignore) crashes during its line
search on the assumption that these are due to pushing CRDB into
overload territory, which, at the time of writing, CRDB does not
handle gracefully.
There was a special case in which this was broken: when the final
step of the line search ended in a crash. In that case, the cluster
was left running with one node down, which roachtest detects and
reports as an error.

Unconditionally restart the cluster after the line search (assuming
it found a passing warehouse count, i.e. didn't error out itself)
to make roachtest happy.

Closes cockroachdb#64187.

Release note: None
  • Loading branch information
tbg committed Apr 26, 2021
1 parent d85d49d commit d72b855
Showing 1 changed file with 21 additions and 11 deletions.
32 changes: 21 additions & 11 deletions pkg/cmd/roachtest/tpcc.go
Original file line number Diff line number Diff line change
Expand Up @@ -793,18 +793,8 @@ func runTPCCBench(ctx context.Context, t *test, c *cluster, b tpccBenchSpec) {
t.Fatal(errors.Wrap(err, "failed to create temp dir"))
}
defer func() { _ = os.RemoveAll(resultsDir) }()
s := search.NewLineSearcher(1, b.LoadWarehouses, b.EstimatedMax, initStepSize, precision)
iteration := 0
if res, err := s.Search(func(warehouses int) (bool, error) {
iteration++
t.l.Printf("initializing cluster for %d warehouses (search attempt: %d)", warehouses, iteration)

// NB: for goroutines in this monitor, handle errors via `t.Fatal` to
// *abort* the line search and whole tpccbench run. Return the errors
// to indicate that the specific warehouse count failed, but that the
// line search ought to continue.
m := newMonitor(ctx, c, roachNodes)

restart := func() {
// We overload the clusters in tpccbench, which can lead to transient infra
// failures. These are a) really annoying to debug and b) hide the actual
// passing warehouse count, making the line search sensitive to the choice
Expand Down Expand Up @@ -841,13 +831,29 @@ func runTPCCBench(ctx context.Context, t *test, c *cluster, b tpccBenchSpec) {
}

c.Start(ctx, t, append(b.startOpts(), roachNodes)...)
}

s := search.NewLineSearcher(1, b.LoadWarehouses, b.EstimatedMax, initStepSize, precision)
iteration := 0
if res, err := s.Search(func(warehouses int) (bool, error) {
iteration++
t.l.Printf("initializing cluster for %d warehouses (search attempt: %d)", warehouses, iteration)

restart()

time.Sleep(restartWait)

// Set up the load generation configuration.
rampDur := 5 * time.Minute
loadDur := 10 * time.Minute
loadDone := make(chan time.Time, numLoadGroups)

// NB: for goroutines in this monitor, handle errors via `t.Fatal` to
// *abort* the line search and whole tpccbench run. Return the errors
// to indicate that the specific warehouse count failed, but that the
// line search ought to continue.
m := newMonitor(ctx, c, roachNodes)

// If we're running chaos in this configuration, modify this config.
if b.Chaos {
// Kill one node at a time.
Expand Down Expand Up @@ -981,6 +987,10 @@ func runTPCCBench(ctx context.Context, t *test, c *cluster, b tpccBenchSpec) {
}); err != nil {
t.Fatal(err)
} else {
// The last iteration may have been a failing run that overloaded
// nodes to the point of them crashing. Make roachtest happy by
// restarting the cluster so that it can run consistency checks.
restart()
ttycolor.Stdout(ttycolor.Green)
t.l.Printf("------\nMAX WAREHOUSES = %d\n------\n\n", res)
ttycolor.Stdout(ttycolor.Reset)
Expand Down

0 comments on commit d72b855

Please sign in to comment.