From c7c9e6f79ce76670c77ab613fe438879688d5d57 Mon Sep 17 00:00:00 2001 From: Herko Lategan Date: Fri, 6 Dec 2024 12:10:36 +0000 Subject: [PATCH] roachtest: remove assert dead nodes Previously, the post assertions would check for dead nodes by using `roachprod.Monitor`. It checks if the data directory of a cockroach node is non-trivial. This is not a reliable way to exclude empty nodes, or to check for dead nodes. It also complicates the Monitor functionality unnecessarily. It also no longer functions, since `roachprod.Monitor` only watches already live processes. This change removes this functionality as it no longer works. An issue (See: #137329) has been created to address this should it still be a requirement. Informs: #118214 Epic: None --- pkg/cmd/roachtest/cluster.go | 69 +++++++++---------------- pkg/cmd/roachtest/registry/test_spec.go | 3 ++ pkg/cmd/roachtest/test_runner.go | 16 ++---- pkg/cmd/roachtest/tests/acceptance.go | 28 +++++----- pkg/roachprod/install/cluster_synced.go | 1 + 5 files changed, 45 insertions(+), 72 deletions(-) diff --git a/pkg/cmd/roachtest/cluster.go b/pkg/cmd/roachtest/cluster.go index d82c5bc2996d..aef4286427a8 100644 --- a/pkg/cmd/roachtest/cluster.go +++ b/pkg/cmd/roachtest/cluster.go @@ -16,6 +16,7 @@ import ( "log" "math/rand" "net" + "net/http" "net/url" "os" "os/exec" @@ -1466,43 +1467,6 @@ func (c *clusterImpl) FetchVMSpecs(ctx context.Context, l *logger.Logger) error }) } -// checkNoDeadNode returns an error if at least one of the nodes that have a populated -// data dir are found to be not running. It prints both to t.L() and the test -// output. -func (c *clusterImpl) assertNoDeadNode(ctx context.Context, t test.Test) error { - if c.spec.NodeCount == 0 { - // No nodes can happen during unit tests and implies nothing to do. - return nil - } - - t.L().Printf("checking for dead nodes") - eventsCh, err := roachprod.Monitor(ctx, t.L(), c.name, install.MonitorOpts{OneShot: true, IgnoreEmptyNodes: true}) - - // An error here means there was a problem initialising a SyncedCluster. - if err != nil { - return err - } - - deadProcesses := 0 - for info := range eventsCh { - t.L().Printf("%s", info) - - if _, isDeath := info.Event.(install.MonitorProcessDead); isDeath { - deadProcesses++ - } - } - - var plural string - if deadProcesses > 1 { - plural = "es" - } - - if deadProcesses > 0 { - return errors.Newf("%d dead cockroach process%s detected", deadProcesses, plural) - } - return nil -} - func selectedNodesOrDefault( opts []option.Option, defaultNodes option.NodeListOption, ) option.NodeListOption { @@ -1540,16 +1504,19 @@ func newHealthStatusResult(node int, status int, body []byte, err error) *Health func (c *clusterImpl) HealthStatus( ctx context.Context, l *logger.Logger, nodes option.NodeListOption, ) ([]*HealthStatusResult, error) { - if len(nodes) < 1 { + nodeCount := len(nodes) + if nodeCount < 1 { return nil, nil // unit tests } + adminAddrs, err := c.ExternalAdminUIAddr(ctx, l, nodes) if err != nil { return nil, errors.WithDetail(err, "Unable to get admin UI address(es)") } client := roachtestutil.DefaultHTTPClient(c, l) - getStatus := func(ctx context.Context, node int) *HealthStatusResult { - url := fmt.Sprintf(`https://%s/health?ready=1`, adminAddrs[node-1]) + getStatus := func(ctx context.Context, nodeIndex, node int) *HealthStatusResult { + url := fmt.Sprintf(`https://%s/health?ready=1`, adminAddrs[nodeIndex]) + resp, err := client.Get(ctx, url) if err != nil { return newHealthStatusResult(node, 0, nil, err) @@ -1561,16 +1528,26 @@ func (c *clusterImpl) HealthStatus( return newHealthStatusResult(node, resp.StatusCode, body, err) } - results := make([]*HealthStatusResult, c.spec.NodeCount) + results := make([]*HealthStatusResult, nodeCount) _ = timeutil.RunWithTimeout(ctx, "health status", 15*time.Second, func(ctx context.Context) error { var wg sync.WaitGroup - wg.Add(c.spec.NodeCount) - for i := 1; i <= c.spec.NodeCount; i++ { - go func(node int) { + wg.Add(nodeCount) + for i := 0; i < nodeCount; i++ { + go func() { defer wg.Done() - results[node-1] = getStatus(ctx, node) - }(i) + for { + results[i] = getStatus(ctx, i, nodes[i]) + if results[i].Err == nil && results[i].Status == http.StatusOK { + return + } + select { + case <-ctx.Done(): + return + case <-time.After(3 * time.Second): + } + } + }() } wg.Wait() return nil diff --git a/pkg/cmd/roachtest/registry/test_spec.go b/pkg/cmd/roachtest/registry/test_spec.go index a61554cc0c02..d364a439bdd1 100644 --- a/pkg/cmd/roachtest/registry/test_spec.go +++ b/pkg/cmd/roachtest/registry/test_spec.go @@ -202,6 +202,9 @@ const ( // the crdb_internal.invalid_objects virtual table. PostValidationInvalidDescriptors // PostValidationNoDeadNodes checks if there are any dead nodes in the cluster. + // TODO: Deprecate or replace this functionality + // In its current state it is no longer functional. + // See: https://github.com/cockroachdb/cockroach/issues/137329 for details. PostValidationNoDeadNodes ) diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go index 538a43cfb008..429e7433bea4 100644 --- a/pkg/cmd/roachtest/test_runner.go +++ b/pkg/cmd/roachtest/test_runner.go @@ -1487,27 +1487,16 @@ func (r *testRunner) postTestAssertions( postAssertCh := make(chan struct{}) _ = r.stopper.RunAsyncTask(ctx, "test-post-assertions", func(ctx context.Context) { defer close(postAssertCh) - // When a dead node is detected, the subsequent post validation queries are likely - // to hang (reason unclear), and eventually timeout according to the statement_timeout. - // If this occurs frequently enough, we can look at skipping post validations on a node - // failure (or even on any test failure). - if err := c.assertNoDeadNode(ctx, t); err != nil { - // Some tests expect dead nodes, so they may opt out of this check. - if t.spec.SkipPostValidations®istry.PostValidationNoDeadNodes == 0 { - postAssertionErr(err) - } else { - t.L().Printf("dead node(s) detected but expected") - } - } // We collect all the admin health endpoints in parallel, // and select the first one that succeeds to run the validation queries - statuses, err := c.HealthStatus(ctx, t.L(), c.All()) + statuses, err := c.HealthStatus(ctx, t.L(), c.CRDBNodes()) if err != nil { postAssertionErr(errors.WithDetail(err, "Unable to check health status")) } validationNode := 0 + liveNodes := 0 for _, s := range statuses { if s.Err != nil { t.L().Printf("n%d:/health?ready=1 error=%s", s.Node, s.Err) @@ -1523,6 +1512,7 @@ func (r *testRunner) postTestAssertions( validationNode = s.Node // NB: s.Node is never zero } t.L().Printf("n%d:/health?ready=1 status=200 ok", s.Node) + liveNodes++ } // We avoid trying to do this when t.Failed() (and in particular when there diff --git a/pkg/cmd/roachtest/tests/acceptance.go b/pkg/cmd/roachtest/tests/acceptance.go index ebb8192b7113..e82745bc530d 100644 --- a/pkg/cmd/roachtest/tests/acceptance.go +++ b/pkg/cmd/roachtest/tests/acceptance.go @@ -33,6 +33,7 @@ func registerAcceptance(r registry.Registry) { randomized bool workloadNode bool incompatibleClouds registry.CloudSet + skipPostValidation registry.PostValidation }{ // NOTE: acceptance tests are lightweight tests that run as part // of CI. As such, they must: @@ -44,7 +45,7 @@ func registerAcceptance(r registry.Registry) { // properties, please register it separately (not as an acceptance // test). registry.OwnerKV: { - {name: "decommission-self", fn: runDecommissionSelf}, + {name: "decommission-self", fn: runDecommissionSelf, skipPostValidation: registry.PostValidationNoDeadNodes}, {name: "event-log", fn: runEventLog}, {name: "gossip/peerings", fn: runGossipPeerings}, {name: "gossip/restart", fn: runGossipRestart}, @@ -59,8 +60,8 @@ func registerAcceptance(r registry.Registry) { encryptionSupport: registry.EncryptionMetamorphic, }, {name: "cli/node-status", fn: runCLINodeStatus}, - {name: "cluster-init", fn: runClusterInit}, - {name: "rapid-restart", fn: runRapidRestart}, + {name: "cluster-init", fn: runClusterInit, skipPostValidation: registry.PostValidationNoDeadNodes}, + {name: "rapid-restart", fn: runRapidRestart, skipPostValidation: registry.PostValidationNoDeadNodes}, }, registry.OwnerObservability: { {name: "status-server", fn: runStatusServer}, @@ -139,16 +140,17 @@ func registerAcceptance(r registry.Registry) { } testSpec := registry.TestSpec{ - Name: "acceptance/" + tc.name, - Owner: owner, - Cluster: r.MakeClusterSpec(numNodes, extraOptions...), - Skip: tc.skip, - EncryptionSupport: tc.encryptionSupport, - Timeout: 10 * time.Minute, - CompatibleClouds: registry.AllClouds.Remove(tc.incompatibleClouds), - Suites: registry.Suites(registry.Nightly, registry.Quick, registry.Acceptance), - Randomized: tc.randomized, - RequiresLicense: tc.requiresLicense, + Name: "acceptance/" + tc.name, + Owner: owner, + Cluster: r.MakeClusterSpec(numNodes, extraOptions...), + Skip: tc.skip, + EncryptionSupport: tc.encryptionSupport, + Timeout: 10 * time.Minute, + CompatibleClouds: registry.AllClouds.Remove(tc.incompatibleClouds), + Suites: registry.Suites(registry.Nightly, registry.Quick, registry.Acceptance), + Randomized: tc.randomized, + RequiresLicense: tc.requiresLicense, + SkipPostValidations: tc.skipPostValidation, } if tc.timeout != 0 { diff --git a/pkg/roachprod/install/cluster_synced.go b/pkg/roachprod/install/cluster_synced.go index 6676b6745271..84f2dd2d9a95 100644 --- a/pkg/roachprod/install/cluster_synced.go +++ b/pkg/roachprod/install/cluster_synced.go @@ -609,6 +609,7 @@ func (c *SyncedCluster) Wipe(ctx context.Context, l *logger.Logger, preserveCert rmCmds := []string{ fmt.Sprintf(`sudo find /mnt/data* -maxdepth 1 -type f -not -name %s -exec rm -f {} \;`, vm.InitializedFile), `sudo rm -fr /mnt/data*/{auxiliary,local,tmp,cassandra,cockroach,cockroach-temp*,mongo-data}`, + `sudo rm -fr *.sh`, `sudo rm -fr logs* data*`, } if !preserveCerts {