From d72b8559b69b8948de7a3c2792739c22beaa2882 Mon Sep 17 00:00:00 2001
From: Tobias Grieger
Date: Mon, 26 Apr 2021 14:43:40 +0200
Subject: [PATCH] roachtest: tpccbench: handle overload vm crash in last
 search iter

`tpccbench` is set up to "handle" (ignore) crashes during its line
search on the assumption that these are due to pushing CRDB into
overload territory, which at the time of writing it does not handle
gracefully.

There was a special case in which this was broken, namely that of the
line search terminating in a final step with a crash. In that case, the
cluster would be left running with one node down, which roachtest
checks and emits as an error.

Unconditionally restart the cluster after the line search (assuming it
found a passing warehouse count, i.e. didn't error out itself) to make
roachtest happy.

Closes #64187.

Release note: None
---
 pkg/cmd/roachtest/tpcc.go | 32 +++++++++++++++++++++-----------
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/pkg/cmd/roachtest/tpcc.go b/pkg/cmd/roachtest/tpcc.go
index 387cac670c17..f9f65da8ca84 100644
--- a/pkg/cmd/roachtest/tpcc.go
+++ b/pkg/cmd/roachtest/tpcc.go
@@ -793,18 +793,8 @@ func runTPCCBench(ctx context.Context, t *test, c *cluster, b tpccBenchSpec) {
 		t.Fatal(errors.Wrap(err, "failed to create temp dir"))
 	}
 	defer func() { _ = os.RemoveAll(resultsDir) }()
-	s := search.NewLineSearcher(1, b.LoadWarehouses, b.EstimatedMax, initStepSize, precision)
-	iteration := 0
-	if res, err := s.Search(func(warehouses int) (bool, error) {
-		iteration++
-		t.l.Printf("initializing cluster for %d warehouses (search attempt: %d)", warehouses, iteration)
-
-		// NB: for goroutines in this monitor, handle errors via `t.Fatal` to
-		// *abort* the line search and whole tpccbench run. Return the errors
-		// to indicate that the specific warehouse count failed, but that the
-		// line search ought to continue.
-		m := newMonitor(ctx, c, roachNodes)
 
+	restart := func() {
 		// We overload the clusters in tpccbench, which can lead to transient infra
 		// failures. These are a) really annoying to debug and b) hide the actual
 		// passing warehouse count, making the line search sensitive to the choice
@@ -841,6 +831,16 @@ func runTPCCBench(ctx context.Context, t *test, c *cluster, b tpccBenchSpec) {
 		}
 
 		c.Start(ctx, t, append(b.startOpts(), roachNodes)...)
+	}
+
+	s := search.NewLineSearcher(1, b.LoadWarehouses, b.EstimatedMax, initStepSize, precision)
+	iteration := 0
+	if res, err := s.Search(func(warehouses int) (bool, error) {
+		iteration++
+		t.l.Printf("initializing cluster for %d warehouses (search attempt: %d)", warehouses, iteration)
+
+		restart()
+
 		time.Sleep(restartWait)
 
 		// Set up the load generation configuration.
@@ -848,6 +848,12 @@ func runTPCCBench(ctx context.Context, t *test, c *cluster, b tpccBenchSpec) {
 		loadDur := 10 * time.Minute
 		loadDone := make(chan time.Time, numLoadGroups)
 
+		// NB: for goroutines in this monitor, handle errors via `t.Fatal` to
+		// *abort* the line search and whole tpccbench run. Return the errors
+		// to indicate that the specific warehouse count failed, but that the
+		// line search ought to continue.
+		m := newMonitor(ctx, c, roachNodes)
+
 		// If we're running chaos in this configuration, modify this config.
 		if b.Chaos {
 			// Kill one node at a time.
@@ -981,6 +987,10 @@ func runTPCCBench(ctx context.Context, t *test, c *cluster, b tpccBenchSpec) {
 	}); err != nil {
 		t.Fatal(err)
 	} else {
+		// The last iteration may have been a failing run that overloaded
+		// nodes to the point of them crashing. Make roachtest happy by
+		// restarting the cluster so that it can run consistency checks.
+		restart()
 		ttycolor.Stdout(ttycolor.Green)
 		t.l.Printf("------\nMAX WAREHOUSES = %d\n------\n\n", res)
 		ttycolor.Stdout(ttycolor.Reset)
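
The shape of the fix is easy to miss inside the hunks, so here is a minimal,
self-contained Go sketch of the pattern the patch establishes. `Cluster`,
`Restart`, and `lineSearch` are hypothetical stand-ins invented for
illustration (the real code uses the roachtest cluster handle, the `restart`
closure, and `search.NewLineSearcher`), and the crash threshold is made up.

// A minimal sketch, not roachtest code: Cluster, Restart, and lineSearch
// are invented stand-ins for the cluster handle, the restart() closure,
// and search.NewLineSearcher from the patch above.
package main

import (
	"errors"
	"fmt"
)

// Cluster stands in for the roachtest cluster handle.
type Cluster struct{ healthy bool }

// Restart mirrors the restart() closure from the patch: after it returns,
// no node is left in a crashed state.
func (c *Cluster) Restart() { c.healthy = true }

// lineSearch is a toy stand-in for search.NewLineSearcher: it bisects over
// warehouse counts and returns the largest count for which pass reported true.
func lineSearch(lo, hi int, pass func(warehouses int) (bool, error)) (int, error) {
	best := -1
	for lo <= hi {
		mid := lo + (hi-lo)/2
		ok, err := pass(mid)
		if err != nil {
			return 0, err // abort the whole search, like t.Fatal in the patch
		}
		if ok {
			best = mid
			lo = mid + 1
		} else {
			hi = mid - 1
		}
	}
	if best == -1 {
		return 0, errors.New("no passing warehouse count")
	}
	return best, nil
}

func main() {
	c := &Cluster{}
	res, err := lineSearch(1, 2000, func(warehouses int) (bool, error) {
		// Restart on entry: a crash in the *previous* iteration must not
		// poison this one. This is the restart() call inside the closure.
		c.Restart()

		// Pretend that overload crashes a node above some threshold.
		if crashed := warehouses > 1500; crashed {
			c.healthy = false
			// Report "this warehouse count failed" instead of an error, so
			// the line search continues rather than aborting the run.
			return false, nil
		}
		return true, nil
	})
	if err != nil {
		panic(err)
	}

	// The actual fix: the last iteration may have crashed a node, and no
	// later iteration will heal it. Restart unconditionally so that the
	// post-run checks (roachtest's dead-node and consistency checks) see
	// healthy nodes.
	c.Restart()
	fmt.Printf("MAX WAREHOUSES = %d (cluster healthy: %t)\n", res, c.healthy)
}

The point is the placement of the final Restart: every search iteration
already heals the cluster on entry, so only a crash in the very last
iteration would otherwise survive into the post-run checks.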