From 987fc337df53e7c98dbcf4840b1988be83735003 Mon Sep 17 00:00:00 2001 From: Herko Lategan Date: Fri, 6 Dec 2024 12:10:36 +0000 Subject: [PATCH] roachtest: remove assert dead nodes Previously, the post assertions would check for dead nodes by using roachprod Monitor. It checks if the data directory of a cockroach node is non-trivial. This is not a reliable way to exclude empty nodes, or to check for dead nodes. It also complicates the Monitor functionality unnecessarily. This change refactors the code to rather use the HTTP health checks to determine if there are dead nodes. Since we now know which node is used for the workload, this check only needs to determine if the cockroach nodes are all still live. Informs: #118214 Epic: None --- pkg/cmd/roachtest/cluster.go | 86 ++++++++++++------------- pkg/cmd/roachtest/test_runner.go | 38 +++++++---- pkg/cmd/roachtest/tests/acceptance.go | 28 ++++---- pkg/roachprod/install/cluster_synced.go | 1 + 4 files changed, 82 insertions(+), 71 deletions(-) diff --git a/pkg/cmd/roachtest/cluster.go b/pkg/cmd/roachtest/cluster.go index d82c5bc2996d..e1480f341a00 100644 --- a/pkg/cmd/roachtest/cluster.go +++ b/pkg/cmd/roachtest/cluster.go @@ -16,6 +16,7 @@ import ( "log" "math/rand" "net" + "net/http" "net/url" "os" "os/exec" @@ -1466,43 +1467,6 @@ func (c *clusterImpl) FetchVMSpecs(ctx context.Context, l *logger.Logger) error }) } -// checkNoDeadNode returns an error if at least one of the nodes that have a populated -// data dir are found to be not running. It prints both to t.L() and the test -// output. -func (c *clusterImpl) assertNoDeadNode(ctx context.Context, t test.Test) error { - if c.spec.NodeCount == 0 { - // No nodes can happen during unit tests and implies nothing to do. 
- return nil - } - - t.L().Printf("checking for dead nodes") - eventsCh, err := roachprod.Monitor(ctx, t.L(), c.name, install.MonitorOpts{OneShot: true, IgnoreEmptyNodes: true}) - - // An error here means there was a problem initialising a SyncedCluster. - if err != nil { - return err - } - - deadProcesses := 0 - for info := range eventsCh { - t.L().Printf("%s", info) - - if _, isDeath := info.Event.(install.MonitorProcessDead); isDeath { - deadProcesses++ - } - } - - var plural string - if deadProcesses > 1 { - plural = "es" - } - - if deadProcesses > 0 { - return errors.Newf("%d dead cockroach process%s detected", deadProcesses, plural) - } - return nil -} - func selectedNodesOrDefault( opts []option.Option, defaultNodes option.NodeListOption, ) option.NodeListOption { @@ -1536,20 +1500,40 @@ func newHealthStatusResult(node int, status int, body []byte, err error) *Health } } +// SystemNodes looks for the system start script on each node and returns the +// node numbers that have it. +func (c *clusterImpl) SystemNodes() (option.NodeListOption, error) { + scriptPath := install.StartScriptPath(install.SystemInterfaceName, 0) + results, err := c.RunWithDetails(context.Background(), nil, option.WithNodes(c.All()), "test -f "+scriptPath) + if err != nil { + return nil, err + } + var nodes option.NodeListOption + for _, res := range results { + if res.RemoteExitStatus == 0 { + nodes = append(nodes, int(res.Node)) + } + } + return nodes, nil +} + // HealthStatus returns the result of the /health?ready=1 endpoint for each node. 
func (c *clusterImpl) HealthStatus( ctx context.Context, l *logger.Logger, nodes option.NodeListOption, ) ([]*HealthStatusResult, error) { - if len(nodes) < 1 { + nodeCount := len(nodes) + if nodeCount < 1 { return nil, nil // unit tests } + adminAddrs, err := c.ExternalAdminUIAddr(ctx, l, nodes) if err != nil { return nil, errors.WithDetail(err, "Unable to get admin UI address(es)") } client := roachtestutil.DefaultHTTPClient(c, l) - getStatus := func(ctx context.Context, node int) *HealthStatusResult { - url := fmt.Sprintf(`https://%s/health?ready=1`, adminAddrs[node-1]) + getStatus := func(ctx context.Context, nodeIndex, node int) *HealthStatusResult { + url := fmt.Sprintf(`https://%s/health?ready=1`, adminAddrs[nodeIndex]) + resp, err := client.Get(ctx, url) if err != nil { return newHealthStatusResult(node, 0, nil, err) @@ -1561,16 +1545,26 @@ func (c *clusterImpl) HealthStatus( return newHealthStatusResult(node, resp.StatusCode, body, err) } - results := make([]*HealthStatusResult, c.spec.NodeCount) + results := make([]*HealthStatusResult, nodeCount) _ = timeutil.RunWithTimeout(ctx, "health status", 15*time.Second, func(ctx context.Context) error { var wg sync.WaitGroup - wg.Add(c.spec.NodeCount) - for i := 1; i <= c.spec.NodeCount; i++ { - go func(node int) { + wg.Add(nodeCount) + for i := 0; i < nodeCount; i++ { + go func() { defer wg.Done() - results[node-1] = getStatus(ctx, node) - }(i) + for { + results[i] = getStatus(ctx, i, nodes[i]) + if results[i].Err == nil && results[i].Status == http.StatusOK { + return + } + select { + case <-ctx.Done(): + return + case <-time.After(3 * time.Second): + } + } + }() } wg.Wait() return nil diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go index 538a43cfb008..1a53a1e8faec 100644 --- a/pkg/cmd/roachtest/test_runner.go +++ b/pkg/cmd/roachtest/test_runner.go @@ -1487,27 +1487,23 @@ func (r *testRunner) postTestAssertions( postAssertCh := make(chan struct{}) _ = r.stopper.RunAsyncTask(ctx, 
"test-post-assertions", func(ctx context.Context) { defer close(postAssertCh) - // When a dead node is detected, the subsequent post validation queries are likely - // to hang (reason unclear), and eventually timeout according to the statement_timeout. - // If this occurs frequently enough, we can look at skipping post validations on a node - // failure (or even on any test failure). - if err := c.assertNoDeadNode(ctx, t); err != nil { - // Some tests expect dead nodes, so they may opt out of this check. - if t.spec.SkipPostValidations®istry.PostValidationNoDeadNodes == 0 { - postAssertionErr(err) - } else { - t.L().Printf("dead node(s) detected but expected") - } + + // Determine on which nodes the system tenant is running. + systemNodes, err := c.SystemNodes() + _, _ = fmt.Fprintf(os.Stderr, "SYSTEM NODES: %v\n", systemNodes) + if err != nil { + postAssertionErr(errors.WithDetail(err, "Unable to determine system nodes")) } // We collect all the admin health endpoints in parallel, // and select the first one that succeeds to run the validation queries - statuses, err := c.HealthStatus(ctx, t.L(), c.All()) + statuses, err := c.HealthStatus(ctx, t.L(), systemNodes) if err != nil { postAssertionErr(errors.WithDetail(err, "Unable to check health status")) } validationNode := 0 + liveNodes := 0 for _, s := range statuses { if s.Err != nil { t.L().Printf("n%d:/health?ready=1 error=%s", s.Node, s.Err) @@ -1523,6 +1519,24 @@ func (r *testRunner) postTestAssertions( validationNode = s.Node // NB: s.Node is never zero } t.L().Printf("n%d:/health?ready=1 status=200 ok", s.Node) + liveNodes++ + } + + // When a dead node is detected, the subsequent post validation queries are likely + // to hang (reason unclear), and eventually timeout according to the statement_timeout. + // If this occurs frequently enough, we can look at skipping post validations on a node + // failure (or even on any test failure). 
+ if liveNodes != len(systemNodes) { + t.L().Printf("validation: %d out of %d node(s) are alive", liveNodes, len(systemNodes)) + // Some tests expect dead nodes, so they may opt out of this check. + if t.spec.SkipPostValidations&registry.PostValidationNoDeadNodes == 0 { + postAssertionErr( + errors.Newf("unexpected dead node(s) detected, %d out of %d node(s) are alive", + liveNodes, len(systemNodes)), + ) + } else { + t.L().Printf("dead node(s) detected but expected") + } } // We avoid trying to do this when t.Failed() (and in particular when there diff --git a/pkg/cmd/roachtest/tests/acceptance.go b/pkg/cmd/roachtest/tests/acceptance.go index ebb8192b7113..e82745bc530d 100644 --- a/pkg/cmd/roachtest/tests/acceptance.go +++ b/pkg/cmd/roachtest/tests/acceptance.go @@ -33,6 +33,7 @@ func registerAcceptance(r registry.Registry) { randomized bool workloadNode bool incompatibleClouds registry.CloudSet + skipPostValidation registry.PostValidation }{ // NOTE: acceptance tests are lightweight tests that run as part // of CI. As such, they must: @@ -44,7 +45,7 @@ func registerAcceptance(r registry.Registry) { // properties, please register it separately (not as an acceptance // test). 
registry.OwnerKV: { - {name: "decommission-self", fn: runDecommissionSelf}, + {name: "decommission-self", fn: runDecommissionSelf, skipPostValidation: registry.PostValidationNoDeadNodes}, {name: "event-log", fn: runEventLog}, {name: "gossip/peerings", fn: runGossipPeerings}, {name: "gossip/restart", fn: runGossipRestart}, @@ -59,8 +60,8 @@ func registerAcceptance(r registry.Registry) { encryptionSupport: registry.EncryptionMetamorphic, }, {name: "cli/node-status", fn: runCLINodeStatus}, - {name: "cluster-init", fn: runClusterInit}, - {name: "rapid-restart", fn: runRapidRestart}, + {name: "cluster-init", fn: runClusterInit, skipPostValidation: registry.PostValidationNoDeadNodes}, + {name: "rapid-restart", fn: runRapidRestart, skipPostValidation: registry.PostValidationNoDeadNodes}, }, registry.OwnerObservability: { {name: "status-server", fn: runStatusServer}, @@ -139,16 +140,17 @@ func registerAcceptance(r registry.Registry) { } testSpec := registry.TestSpec{ - Name: "acceptance/" + tc.name, - Owner: owner, - Cluster: r.MakeClusterSpec(numNodes, extraOptions...), - Skip: tc.skip, - EncryptionSupport: tc.encryptionSupport, - Timeout: 10 * time.Minute, - CompatibleClouds: registry.AllClouds.Remove(tc.incompatibleClouds), - Suites: registry.Suites(registry.Nightly, registry.Quick, registry.Acceptance), - Randomized: tc.randomized, - RequiresLicense: tc.requiresLicense, + Name: "acceptance/" + tc.name, + Owner: owner, + Cluster: r.MakeClusterSpec(numNodes, extraOptions...), + Skip: tc.skip, + EncryptionSupport: tc.encryptionSupport, + Timeout: 10 * time.Minute, + CompatibleClouds: registry.AllClouds.Remove(tc.incompatibleClouds), + Suites: registry.Suites(registry.Nightly, registry.Quick, registry.Acceptance), + Randomized: tc.randomized, + RequiresLicense: tc.requiresLicense, + SkipPostValidations: tc.skipPostValidation, } if tc.timeout != 0 { diff --git a/pkg/roachprod/install/cluster_synced.go b/pkg/roachprod/install/cluster_synced.go index 
6676b6745271..84f2dd2d9a95 100644 --- a/pkg/roachprod/install/cluster_synced.go +++ b/pkg/roachprod/install/cluster_synced.go @@ -609,6 +609,7 @@ func (c *SyncedCluster) Wipe(ctx context.Context, l *logger.Logger, preserveCert rmCmds := []string{ fmt.Sprintf(`sudo find /mnt/data* -maxdepth 1 -type f -not -name %s -exec rm -f {} \;`, vm.InitializedFile), `sudo rm -fr /mnt/data*/{auxiliary,local,tmp,cassandra,cockroach,cockroach-temp*,mongo-data}`, + `sudo rm -fr *.sh`, `sudo rm -fr logs* data*`, } if !preserveCerts {