diff --git a/pkg/cmd/roachtest/cluster.go b/pkg/cmd/roachtest/cluster.go index d82c5bc2996d..c70dc12b755a 100644 --- a/pkg/cmd/roachtest/cluster.go +++ b/pkg/cmd/roachtest/cluster.go @@ -1466,43 +1466,6 @@ func (c *clusterImpl) FetchVMSpecs(ctx context.Context, l *logger.Logger) error }) } -// checkNoDeadNode returns an error if at least one of the nodes that have a populated -// data dir are found to be not running. It prints both to t.L() and the test -// output. -func (c *clusterImpl) assertNoDeadNode(ctx context.Context, t test.Test) error { - if c.spec.NodeCount == 0 { - // No nodes can happen during unit tests and implies nothing to do. - return nil - } - - t.L().Printf("checking for dead nodes") - eventsCh, err := roachprod.Monitor(ctx, t.L(), c.name, install.MonitorOpts{OneShot: true, IgnoreEmptyNodes: true}) - - // An error here means there was a problem initialising a SyncedCluster. - if err != nil { - return err - } - - deadProcesses := 0 - for info := range eventsCh { - t.L().Printf("%s", info) - - if _, isDeath := info.Event.(install.MonitorProcessDead); isDeath { - deadProcesses++ - } - } - - var plural string - if deadProcesses > 1 { - plural = "es" - } - - if deadProcesses > 0 { - return errors.Newf("%d dead cockroach process%s detected", deadProcesses, plural) - } - return nil -} - func selectedNodesOrDefault( opts []option.Option, defaultNodes option.NodeListOption, ) option.NodeListOption { @@ -1540,16 +1503,18 @@ func newHealthStatusResult(node int, status int, body []byte, err error) *Health func (c *clusterImpl) HealthStatus( ctx context.Context, l *logger.Logger, nodes option.NodeListOption, ) ([]*HealthStatusResult, error) { - if len(nodes) < 1 { + nodeCount := len(nodes) + if nodeCount < 1 { return nil, nil // unit tests } + adminAddrs, err := c.ExternalAdminUIAddr(ctx, l, nodes) if err != nil { return nil, errors.WithDetail(err, "Unable to get admin UI address(es)") } client := roachtestutil.DefaultHTTPClient(c, l) - getStatus := func(ctx context.Context, node int) *HealthStatusResult { - url := fmt.Sprintf(`https://%s/health?ready=1`, adminAddrs[node-1]) + getStatus := func(ctx context.Context, nodeIndex, node int) *HealthStatusResult { + url := fmt.Sprintf(`https://%s/health?ready=1`, adminAddrs[nodeIndex]) resp, err := client.Get(ctx, url) if err != nil { return newHealthStatusResult(node, 0, nil, err) @@ -1561,16 +1526,16 @@ func (c *clusterImpl) HealthStatus( return newHealthStatusResult(node, resp.StatusCode, body, err) } - results := make([]*HealthStatusResult, c.spec.NodeCount) + results := make([]*HealthStatusResult, nodeCount) _ = timeutil.RunWithTimeout(ctx, "health status", 15*time.Second, func(ctx context.Context) error { var wg sync.WaitGroup wg.Add(c.spec.NodeCount) - for i := 1; i <= c.spec.NodeCount; i++ { - go func(node int) { + for i := 0; i < nodeCount; i++ { + go func() { defer wg.Done() - results[node-1] = getStatus(ctx, node) - }(i) + results[i] = getStatus(ctx, i, nodes[i]) + }() } wg.Wait() return nil diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go index 538a43cfb008..720cfc29e9b4 100644 --- a/pkg/cmd/roachtest/test_runner.go +++ b/pkg/cmd/roachtest/test_runner.go @@ -1487,27 +1487,16 @@ func (r *testRunner) postTestAssertions( postAssertCh := make(chan struct{}) _ = r.stopper.RunAsyncTask(ctx, "test-post-assertions", func(ctx context.Context) { defer close(postAssertCh) - // When a dead node is detected, the subsequent post validation queries are likely - // to hang (reason unclear), and eventually timeout according to the statement_timeout. - // If this occurs frequently enough, we can look at skipping post validations on a node - // failure (or even on any test failure). - if err := c.assertNoDeadNode(ctx, t); err != nil { - // Some tests expect dead nodes, so they may opt out of this check. - if t.spec.SkipPostValidations®istry.PostValidationNoDeadNodes == 0 { - postAssertionErr(err) - } else { - t.L().Printf("dead node(s) detected but expected") - } - } // We collect all the admin health endpoints in parallel, // and select the first one that succeeds to run the validation queries - statuses, err := c.HealthStatus(ctx, t.L(), c.All()) + statuses, err := c.HealthStatus(ctx, t.L(), c.CRDBNodes()) if err != nil { postAssertionErr(errors.WithDetail(err, "Unable to check health status")) } validationNode := 0 + liveNodes := 0 for _, s := range statuses { if s.Err != nil { t.L().Printf("n%d:/health?ready=1 error=%s", s.Node, s.Err) @@ -1523,6 +1512,23 @@ func (r *testRunner) postTestAssertions( validationNode = s.Node // NB: s.Node is never zero } t.L().Printf("n%d:/health?ready=1 status=200 ok", s.Node) + liveNodes++ + } + + // When a dead node is detected, the subsequent post validation queries are likely + // to hang (reason unclear), and eventually timeout according to the statement_timeout. + // If this occurs frequently enough, we can look at skipping post validations on a node + // failure (or even on any test failure). + if liveNodes != len(c.CRDBNodes()) { + // Some tests expect dead nodes, so they may opt out of this check. + if t.spec.SkipPostValidations®istry.PostValidationNoDeadNodes == 0 { + postAssertionErr( + errors.Newf("unexpected dead node(s) detected, expected %d live nodes out of %d", + liveNodes, len(c.CRDBNodes())), + ) + } else { + t.L().Printf("dead node(s) detected but expected") + } } // We avoid trying to do this when t.Failed() (and in particular when there