Skip to content

Commit

Permalink
roachtest: deflake bank/{node-restart,cluster-recovery}
Browse files Browse the repository at this point in the history
Fixes #38785.
Fixes #35326.

Because everything roachprod does, it does through SSH, we're
particularly susceptible to network delays, packet drops, etc. We've
seen this before, or at least pointed to this being a problem before,
over at #37001. Setting timeouts around our calls to roachprod helps to
better surface these kind of errors.

The underlying issue in #38785 and in #35326 is the fact that we're
running roachprod commands that may (reasonably) fail due to connection
issues, and we're unable to retry them safely (the underlying commands
are non-idempotent). Presently we simply fail the entire test, when
really we should be able to retry the commands. This is left unaddressed.

Release justification: Category 1: Non-production code changes
Release note: None
  • Loading branch information
irfansharif committed Sep 24, 2019
1 parent a5070c0 commit 5296a27
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 8 deletions.
14 changes: 6 additions & 8 deletions pkg/cmd/roachtest/bank.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ func (client *bankClient) transferMoney(ctx context.Context, numAccounts, maxTra
// statement timeout, which unfortunately precludes the use of prepared
// statements.
q := fmt.Sprintf(`
SET statement_timeout = '31s';
SET statement_timeout = '30s';
UPDATE bank.accounts
SET balance = CASE id WHEN %[1]d THEN balance-%[3]d WHEN %[2]d THEN balance+%[3]d END
WHERE id IN (%[1]d, %[2]d) AND (SELECT balance >= %[3]d FROM bank.accounts WHERE id = %[1]d);
Expand Down Expand Up @@ -237,9 +237,7 @@ func (s *bankState) startChaosMonkey(
break
}
t.l.Printf("round %d: restarting %d\n", curRound, i)

c.Stop(ctx, c.Node(i))
c.Start(ctx, t, c.Node(i))
c.Restart(ctx, t, c.Node(i))
}

preCount := s.counts()
Expand Down Expand Up @@ -438,7 +436,7 @@ func runBankClusterRecovery(ctx context.Context, t *test, c *cluster) {
}
s.startChaosMonkey(ctx, t, c, pickNodes, -1)

s.waitClientsStop(ctx, t, c, 30*time.Second)
s.waitClientsStop(ctx, t, c, 45*time.Second)

// Verify accounts.
s.verifyAccounts(ctx, t)
Expand Down Expand Up @@ -480,7 +478,7 @@ func runBankNodeRestart(ctx context.Context, t *test, c *cluster) {
}
s.startChaosMonkey(ctx, t, c, pickNodes, clientIdx)

s.waitClientsStop(ctx, t, c, 30*time.Second)
s.waitClientsStop(ctx, t, c, 45*time.Second)

// Verify accounts.
s.verifyAccounts(ctx, t)
Expand Down Expand Up @@ -511,7 +509,7 @@ func runBankNodeZeroSum(ctx context.Context, t *test, c *cluster) {
}

s.startSplitMonkey(ctx, 2*time.Second, c)
s.waitClientsStop(ctx, t, c, 30*time.Second)
s.waitClientsStop(ctx, t, c, 45*time.Second)

s.verifyAccounts(ctx, t)

Expand Down Expand Up @@ -559,7 +557,7 @@ func runBankZeroSumRestart(ctx context.Context, t *test, c *cluster) {
// Starting up the goroutines that restart and do splits and lease moves.
s.startChaosMonkey(ctx, t, c, pickNodes, -1)
s.startSplitMonkey(ctx, 2*time.Second, c)
s.waitClientsStop(ctx, t, c, 30*time.Second)
s.waitClientsStop(ctx, t, c, 45*time.Second)

// Verify accounts.
s.verifyAccounts(ctx, t)
Expand Down
26 changes: 26 additions & 0 deletions pkg/cmd/roachtest/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,12 @@ func execCmd(ctx context.Context, l *logger, args ...string) error {
if err := cmd.Run(); err != nil {
cancel()
wg.Wait() // synchronize access to ring buffer

// Context deadline exceeded errors opaquely appear as "signal killed" when
// manifested. We surface this error explicitly.
if ctx.Err() == context.DeadlineExceeded {
return ctx.Err()
}
return errors.Wrapf(
err,
"%s returned:\nstderr:\n%s\nstdout:\n%s",
Expand Down Expand Up @@ -1635,6 +1641,26 @@ func roachprodArgs(opts []option) []string {
return args
}

// Restart restarts the specified cockroach node. It takes a test and, on error,
// calls t.Fatal().
func (c *cluster) Restart(ctx context.Context, t *test, node nodeListOption) {
// We bound the time taken to restart a node through roachprod. Because
// roachprod uses SSH, it's particularly vulnerable to network flakiness (as
// seen in #35326) and may stall indefinitely. Setting up timeouts better
// surfaces this kind of failure.
//
// TODO(irfansharif): The underlying issue here is the fact that we're running
// roachprod commands that may (reasonably) fail due to connection issues, and
// we're unable to retry them safely (the underlying commands are
// non-idempotent). Presently we simply fail the entire test, when really we
// should be able to retry the specific roachprod commands.
var cancel func()
ctx, cancel = context.WithTimeout(ctx, 30*time.Second)
c.Stop(ctx, node)
c.Start(ctx, t, node)
cancel()
}

// StartE starts cockroach nodes on a subset of the cluster. The nodes parameter
// can either be a specific node, empty (to indicate all nodes), or a pair of
// nodes indicating a range.
Expand Down

0 comments on commit 5296a27

Please sign in to comment.