From 22636b913bebf23e4a9055fc8947dc597ae5a9d3 Mon Sep 17 00:00:00 2001 From: Herko Lategan Date: Fri, 6 Dec 2024 12:10:36 +0000 Subject: [PATCH] roachtest: remove assert dead nodes Previously, the post assertions would check for dead nodes by using roachprod Monitor. It checks if the data directory of a cockroach node is non-trivial. This is not a reliable way to exclude empty nodes, or to check for dead nodes. It also complicates the Monitor functionality unnecessarily. This change refactors the code to rather use the HTTP health checks to determine if there are dead nodes. Since we now know which node is used for the workload, this check only needs to determine if the cockroach nodes are all still live. Informs: #118214 Epic: None --- pkg/cmd/roachtest/cluster.go | 55 ++++++-------------------------- pkg/cmd/roachtest/test_runner.go | 32 +++++++++++-------- 2 files changed, 29 insertions(+), 58 deletions(-) diff --git a/pkg/cmd/roachtest/cluster.go b/pkg/cmd/roachtest/cluster.go index d82c5bc2996d..c70dc12b755a 100644 --- a/pkg/cmd/roachtest/cluster.go +++ b/pkg/cmd/roachtest/cluster.go @@ -1466,43 +1466,6 @@ func (c *clusterImpl) FetchVMSpecs(ctx context.Context, l *logger.Logger) error }) } -// checkNoDeadNode returns an error if at least one of the nodes that have a populated -// data dir are found to be not running. It prints both to t.L() and the test -// output. -func (c *clusterImpl) assertNoDeadNode(ctx context.Context, t test.Test) error { - if c.spec.NodeCount == 0 { - // No nodes can happen during unit tests and implies nothing to do. - return nil - } - - t.L().Printf("checking for dead nodes") - eventsCh, err := roachprod.Monitor(ctx, t.L(), c.name, install.MonitorOpts{OneShot: true, IgnoreEmptyNodes: true}) - - // An error here means there was a problem initialising a SyncedCluster. - if err != nil { - return err - } - - deadProcesses := 0 - for info := range eventsCh { - t.L().Printf("%s", info) - - if _, isDeath := info.Event.(install.MonitorProcessDead); isDeath { - deadProcesses++ - } - } - - var plural string - if deadProcesses > 1 { - plural = "es" - } - - if deadProcesses > 0 { - return errors.Newf("%d dead cockroach process%s detected", deadProcesses, plural) - } - return nil -} - func selectedNodesOrDefault( opts []option.Option, defaultNodes option.NodeListOption, ) option.NodeListOption { @@ -1540,16 +1503,18 @@ func newHealthStatusResult(node int, status int, body []byte, err error) *Health func (c *clusterImpl) HealthStatus( ctx context.Context, l *logger.Logger, nodes option.NodeListOption, ) ([]*HealthStatusResult, error) { - if len(nodes) < 1 { + nodeCount := len(nodes) + if nodeCount < 1 { return nil, nil // unit tests } + adminAddrs, err := c.ExternalAdminUIAddr(ctx, l, nodes) if err != nil { return nil, errors.WithDetail(err, "Unable to get admin UI address(es)") } client := roachtestutil.DefaultHTTPClient(c, l) - getStatus := func(ctx context.Context, node int) *HealthStatusResult { - url := fmt.Sprintf(`https://%s/health?ready=1`, adminAddrs[node-1]) + getStatus := func(ctx context.Context, nodeIndex, node int) *HealthStatusResult { + url := fmt.Sprintf(`https://%s/health?ready=1`, adminAddrs[nodeIndex]) resp, err := client.Get(ctx, url) if err != nil { return newHealthStatusResult(node, 0, nil, err) @@ -1561,16 +1526,16 @@ func (c *clusterImpl) HealthStatus( return newHealthStatusResult(node, resp.StatusCode, body, err) } - results := make([]*HealthStatusResult, c.spec.NodeCount) + results := make([]*HealthStatusResult, nodeCount) _ = timeutil.RunWithTimeout(ctx, "health status", 15*time.Second, func(ctx context.Context) error { var wg sync.WaitGroup wg.Add(c.spec.NodeCount) - for i := 1; i <= c.spec.NodeCount; i++ { - go func(node int) { + for i := 0; i < nodeCount; i++ { + go func() { defer wg.Done() - results[node-1] = getStatus(ctx, node) - }(i) + results[i] = getStatus(ctx, i, nodes[i]) + }() } wg.Wait() return nil diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go index 538a43cfb008..720cfc29e9b4 100644 --- a/pkg/cmd/roachtest/test_runner.go +++ b/pkg/cmd/roachtest/test_runner.go @@ -1487,27 +1487,16 @@ func (r *testRunner) postTestAssertions( postAssertCh := make(chan struct{}) _ = r.stopper.RunAsyncTask(ctx, "test-post-assertions", func(ctx context.Context) { defer close(postAssertCh) - // When a dead node is detected, the subsequent post validation queries are likely - // to hang (reason unclear), and eventually timeout according to the statement_timeout. - // If this occurs frequently enough, we can look at skipping post validations on a node - // failure (or even on any test failure). - if err := c.assertNoDeadNode(ctx, t); err != nil { - // Some tests expect dead nodes, so they may opt out of this check. - if t.spec.SkipPostValidations®istry.PostValidationNoDeadNodes == 0 { - postAssertionErr(err) - } else { - t.L().Printf("dead node(s) detected but expected") - } - } // We collect all the admin health endpoints in parallel, // and select the first one that succeeds to run the validation queries - statuses, err := c.HealthStatus(ctx, t.L(), c.All()) + statuses, err := c.HealthStatus(ctx, t.L(), c.CRDBNodes()) if err != nil { postAssertionErr(errors.WithDetail(err, "Unable to check health status")) } validationNode := 0 + liveNodes := 0 for _, s := range statuses { if s.Err != nil { t.L().Printf("n%d:/health?ready=1 error=%s", s.Node, s.Err) @@ -1523,6 +1512,23 @@ func (r *testRunner) postTestAssertions( validationNode = s.Node // NB: s.Node is never zero } t.L().Printf("n%d:/health?ready=1 status=200 ok", s.Node) + liveNodes++ + } + + // When a dead node is detected, the subsequent post validation queries are likely + // to hang (reason unclear), and eventually timeout according to the statement_timeout. + // If this occurs frequently enough, we can look at skipping post validations on a node + // failure (or even on any test failure). + if liveNodes != len(c.CRDBNodes()) { + // Some tests expect dead nodes, so they may opt out of this check. + if t.spec.SkipPostValidations®istry.PostValidationNoDeadNodes == 0 { + postAssertionErr( + errors.Newf("unexpected dead node(s) detected, expected %d live nodes out of %d", + liveNodes, len(c.CRDBNodes())), + ) + } else { + t.L().Printf("dead node(s) detected but expected") + } } // We avoid trying to do this when t.Failed() (and in particular when there