diff --git a/pkg/cmd/roachtest/cluster.go b/pkg/cmd/roachtest/cluster.go
index d82c5bc2996d..e1480f341a00 100644
--- a/pkg/cmd/roachtest/cluster.go
+++ b/pkg/cmd/roachtest/cluster.go
@@ -16,6 +16,7 @@ import (
 	"log"
 	"math/rand"
 	"net"
+	"net/http"
 	"net/url"
 	"os"
 	"os/exec"
@@ -1466,43 +1467,6 @@ func (c *clusterImpl) FetchVMSpecs(ctx context.Context, l *logger.Logger) error
 	})
 }
 
-// checkNoDeadNode returns an error if at least one of the nodes that have a populated
-// data dir are found to be not running. It prints both to t.L() and the test
-// output.
-func (c *clusterImpl) assertNoDeadNode(ctx context.Context, t test.Test) error {
-	if c.spec.NodeCount == 0 {
-		// No nodes can happen during unit tests and implies nothing to do.
-		return nil
-	}
-
-	t.L().Printf("checking for dead nodes")
-	eventsCh, err := roachprod.Monitor(ctx, t.L(), c.name, install.MonitorOpts{OneShot: true, IgnoreEmptyNodes: true})
-
-	// An error here means there was a problem initialising a SyncedCluster.
-	if err != nil {
-		return err
-	}
-
-	deadProcesses := 0
-	for info := range eventsCh {
-		t.L().Printf("%s", info)
-
-		if _, isDeath := info.Event.(install.MonitorProcessDead); isDeath {
-			deadProcesses++
-		}
-	}
-
-	var plural string
-	if deadProcesses > 1 {
-		plural = "es"
-	}
-
-	if deadProcesses > 0 {
-		return errors.Newf("%d dead cockroach process%s detected", deadProcesses, plural)
-	}
-	return nil
-}
-
 func selectedNodesOrDefault(
 	opts []option.Option, defaultNodes option.NodeListOption,
 ) option.NodeListOption {
@@ -1536,20 +1500,40 @@ func newHealthStatusResult(node int, status int, body []byte, err error) *Health
 	}
 }
 
+// SystemNodes looks for the system start script on each node and returns the
+// node numbers that have it.
+func (c *clusterImpl) SystemNodes() (option.NodeListOption, error) {
+	scriptPath := install.StartScriptPath(install.SystemInterfaceName, 0)
+	results, err := c.RunWithDetails(context.Background(), nil, option.WithNodes(c.All()), "test -f "+scriptPath)
+	if err != nil {
+		return nil, err
+	}
+	var nodes option.NodeListOption
+	for _, res := range results {
+		if res.RemoteExitStatus == 0 {
+			nodes = append(nodes, int(res.Node))
+		}
+	}
+	return nodes, nil
+}
+
 // HealthStatus returns the result of the /health?ready=1 endpoint for each node.
 func (c *clusterImpl) HealthStatus(
 	ctx context.Context, l *logger.Logger, nodes option.NodeListOption,
 ) ([]*HealthStatusResult, error) {
-	if len(nodes) < 1 {
+	nodeCount := len(nodes)
+	if nodeCount < 1 {
 		return nil, nil // unit tests
 	}
+
 	adminAddrs, err := c.ExternalAdminUIAddr(ctx, l, nodes)
 	if err != nil {
 		return nil, errors.WithDetail(err, "Unable to get admin UI address(es)")
 	}
 	client := roachtestutil.DefaultHTTPClient(c, l)
-	getStatus := func(ctx context.Context, node int) *HealthStatusResult {
-		url := fmt.Sprintf(`https://%s/health?ready=1`, adminAddrs[node-1])
+	getStatus := func(ctx context.Context, nodeIndex, node int) *HealthStatusResult {
+		url := fmt.Sprintf(`https://%s/health?ready=1`, adminAddrs[nodeIndex])
+
 		resp, err := client.Get(ctx, url)
 		if err != nil {
 			return newHealthStatusResult(node, 0, nil, err)
@@ -1561,16 +1545,26 @@ func (c *clusterImpl) HealthStatus(
 		return newHealthStatusResult(node, resp.StatusCode, body, err)
 	}
 
-	results := make([]*HealthStatusResult, c.spec.NodeCount)
+	results := make([]*HealthStatusResult, nodeCount)
 
 	_ = timeutil.RunWithTimeout(ctx, "health status", 15*time.Second, func(ctx context.Context) error {
 		var wg sync.WaitGroup
-		wg.Add(c.spec.NodeCount)
-		for i := 1; i <= c.spec.NodeCount; i++ {
-			go func(node int) {
+		wg.Add(nodeCount)
+		for i := 0; i < nodeCount; i++ {
+			go func() {
 				defer wg.Done()
-				results[node-1] = getStatus(ctx, node)
-			}(i)
+				for {
+					results[i] = getStatus(ctx, i, nodes[i])
+					if results[i].Err == nil && results[i].Status == http.StatusOK {
+						return
+					}
+					select {
+					case <-ctx.Done():
+						return
+					case <-time.After(3 * time.Second):
+					}
+				}
+			}()
 		}
 		wg.Wait()
 		return nil
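Note: the retry loop added to HealthStatus above is the core behavioral change in this file: each node is now polled until it reports ready or the 15-second budget expires, instead of being sampled exactly once. Below is a minimal, self-contained sketch of that poll-until-healthy pattern using plain net/http; the real code layers roachtestutil.DefaultHTTPClient, HTTPS, and timeutil.RunWithTimeout on top, and the addresses here are hypothetical.

```go
package main

import (
	"context"
	"fmt"
	"net/http"
	"sync"
	"time"
)

// probe performs a single readiness check against one node.
func probe(ctx context.Context, url string) error {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return err
	}
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("not ready: status %d", resp.StatusCode)
	}
	return nil
}

// pollUntilReady polls each address's /health?ready=1 endpoint until it
// returns 200 OK or ctx expires, one goroutine per node, mirroring the
// retry loop HealthStatus now runs. The last observed error (or nil on
// success) is kept per node.
func pollUntilReady(ctx context.Context, addrs []string) []error {
	results := make([]error, len(addrs))
	var wg sync.WaitGroup
	wg.Add(len(addrs))
	for i := range addrs {
		go func(i int) {
			defer wg.Done()
			url := fmt.Sprintf("http://%s/health?ready=1", addrs[i])
			for {
				results[i] = probe(ctx, url)
				if results[i] == nil {
					return
				}
				select {
				case <-ctx.Done():
					return
				case <-time.After(3 * time.Second): // same backoff as above
				}
			}
		}(i)
	}
	wg.Wait()
	return results
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer cancel()
	for i, err := range pollUntilReady(ctx, []string{"localhost:8080", "localhost:8081"}) {
		fmt.Printf("n%d: err=%v\n", i+1, err)
	}
}
```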
diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go
index 538a43cfb008..1a53a1e8faec 100644
--- a/pkg/cmd/roachtest/test_runner.go
+++ b/pkg/cmd/roachtest/test_runner.go
@@ -1487,27 +1487,22 @@ func (r *testRunner) postTestAssertions(
 	postAssertCh := make(chan struct{})
 	_ = r.stopper.RunAsyncTask(ctx, "test-post-assertions", func(ctx context.Context) {
 		defer close(postAssertCh)
-		// When a dead node is detected, the subsequent post validation queries are likely
-		// to hang (reason unclear), and eventually timeout according to the statement_timeout.
-		// If this occurs frequently enough, we can look at skipping post validations on a node
-		// failure (or even on any test failure).
-		if err := c.assertNoDeadNode(ctx, t); err != nil {
-			// Some tests expect dead nodes, so they may opt out of this check.
-			if t.spec.SkipPostValidations&registry.PostValidationNoDeadNodes == 0 {
-				postAssertionErr(err)
-			} else {
-				t.L().Printf("dead node(s) detected but expected")
-			}
+
+		// Determine on which nodes the system tenant is running.
+		systemNodes, err := c.SystemNodes()
+		if err != nil {
+			postAssertionErr(errors.WithDetail(err, "Unable to determine system nodes"))
 		}
 
 		// We collect all the admin health endpoints in parallel,
 		// and select the first one that succeeds to run the validation queries
-		statuses, err := c.HealthStatus(ctx, t.L(), c.All())
+		statuses, err := c.HealthStatus(ctx, t.L(), systemNodes)
 		if err != nil {
 			postAssertionErr(errors.WithDetail(err, "Unable to check health status"))
 		}
 
 		validationNode := 0
+		liveNodes := 0
 		for _, s := range statuses {
 			if s.Err != nil {
 				t.L().Printf("n%d:/health?ready=1 error=%s", s.Node, s.Err)
@@ -1523,6 +1518,24 @@ func (r *testRunner) postTestAssertions(
 				validationNode = s.Node // NB: s.Node is never zero
 			}
 			t.L().Printf("n%d:/health?ready=1 status=200 ok", s.Node)
+			liveNodes++
+		}
+
+		// When a dead node is detected, the subsequent post validation queries are likely
+		// to hang (reason unclear), and eventually timeout according to the statement_timeout.
+		// If this occurs frequently enough, we can look at skipping post validations on a node
+		// failure (or even on any test failure).
+		if liveNodes != len(systemNodes) {
+			t.L().Printf("validation: %d out of %d node(s) are alive", liveNodes, len(systemNodes))
+			// Some tests expect dead nodes, so they may opt out of this check.
+			if t.spec.SkipPostValidations&registry.PostValidationNoDeadNodes == 0 {
+				postAssertionErr(
+					errors.Newf("unexpected dead node(s) detected, %d out of %d node(s) are alive",
+						liveNodes, len(systemNodes)),
+				)
+			} else {
+				t.L().Printf("dead node(s) detected but expected")
+			}
 		}
 
 		// We avoid trying to do this when t.Failed() (and in particular when there
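Note: the hunk above replaces the one-shot roachprod monitor with a liveness count derived from the health checks themselves: every node expected to run the system interface must answer 200, otherwise the run fails unless the test opted out. The opt-out is the same bitmask test as before; a tiny self-contained sketch of that check follows, with illustrative constant values (the real definitions live in the roachtest registry package).

```go
package main

import "fmt"

// PostValidation mirrors the roachtest registry's bitmask of post-test
// validations. The values here are illustrative, not the real ones.
type PostValidation int

const (
	PostValidationReplicaDivergence PostValidation = 1 << iota
	PostValidationInvalidDescriptors
	PostValidationNoDeadNodes
)

// deadNodeCheckEnabled reports whether the dead-node validation should
// run: a zero bit means the test did not opt out.
func deadNodeCheckEnabled(skip PostValidation) bool {
	return skip&PostValidationNoDeadNodes == 0
}

func main() {
	fmt.Println(deadNodeCheckEnabled(0))                               // true: check runs
	fmt.Println(deadNodeCheckEnabled(PostValidationNoDeadNodes))       // false: dead nodes expected
	fmt.Println(deadNodeCheckEnabled(PostValidationReplicaDivergence)) // true: unrelated opt-out
}
```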
diff --git a/pkg/cmd/roachtest/tests/acceptance.go b/pkg/cmd/roachtest/tests/acceptance.go
index ebb8192b7113..e82745bc530d 100644
--- a/pkg/cmd/roachtest/tests/acceptance.go
+++ b/pkg/cmd/roachtest/tests/acceptance.go
@@ -33,6 +33,7 @@ func registerAcceptance(r registry.Registry) {
 		randomized         bool
 		workloadNode       bool
 		incompatibleClouds registry.CloudSet
+		skipPostValidation registry.PostValidation
 	}{
 		// NOTE: acceptance tests are lightweight tests that run as part
 		// of CI. As such, they must:
@@ -44,7 +45,7 @@ func registerAcceptance(r registry.Registry) {
 		// properties, please register it separately (not as an acceptance
 		// test).
 		registry.OwnerKV: {
-			{name: "decommission-self", fn: runDecommissionSelf},
+			{name: "decommission-self", fn: runDecommissionSelf, skipPostValidation: registry.PostValidationNoDeadNodes},
 			{name: "event-log", fn: runEventLog},
 			{name: "gossip/peerings", fn: runGossipPeerings},
 			{name: "gossip/restart", fn: runGossipRestart},
@@ -59,8 +60,8 @@ func registerAcceptance(r registry.Registry) {
 				encryptionSupport: registry.EncryptionMetamorphic,
 			},
 			{name: "cli/node-status", fn: runCLINodeStatus},
-			{name: "cluster-init", fn: runClusterInit},
-			{name: "rapid-restart", fn: runRapidRestart},
+			{name: "cluster-init", fn: runClusterInit, skipPostValidation: registry.PostValidationNoDeadNodes},
+			{name: "rapid-restart", fn: runRapidRestart, skipPostValidation: registry.PostValidationNoDeadNodes},
 		},
 		registry.OwnerObservability: {
 			{name: "status-server", fn: runStatusServer},
@@ -139,16 +140,17 @@ func registerAcceptance(r registry.Registry) {
 		}
 
 		testSpec := registry.TestSpec{
-			Name:              "acceptance/" + tc.name,
-			Owner:             owner,
-			Cluster:           r.MakeClusterSpec(numNodes, extraOptions...),
-			Skip:              tc.skip,
-			EncryptionSupport: tc.encryptionSupport,
-			Timeout:           10 * time.Minute,
-			CompatibleClouds:  registry.AllClouds.Remove(tc.incompatibleClouds),
-			Suites:            registry.Suites(registry.Nightly, registry.Quick, registry.Acceptance),
-			Randomized:        tc.randomized,
-			RequiresLicense:   tc.requiresLicense,
+			Name:                "acceptance/" + tc.name,
+			Owner:               owner,
+			Cluster:             r.MakeClusterSpec(numNodes, extraOptions...),
+			Skip:                tc.skip,
+			EncryptionSupport:   tc.encryptionSupport,
+			Timeout:             10 * time.Minute,
+			CompatibleClouds:    registry.AllClouds.Remove(tc.incompatibleClouds),
+			Suites:              registry.Suites(registry.Nightly, registry.Quick, registry.Acceptance),
+			Randomized:          tc.randomized,
+			RequiresLicense:     tc.requiresLicense,
+			SkipPostValidations: tc.skipPostValidation,
 		}
 
 		if tc.timeout != 0 {
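Note: the acceptance.go change threads a per-test skipPostValidation field from the test table into the TestSpec. Here is a stripped-down sketch of that wiring with local stand-in types (the real TestSpec and PostValidation live in the registry package):

```go
package main

import "fmt"

// Local stand-ins for the registry types, for illustration only.
type PostValidation int

const PostValidationNoDeadNodes PostValidation = 1 << 2 // illustrative value

type TestSpec struct {
	Name                string
	SkipPostValidations PostValidation
}

func main() {
	cases := []struct {
		name               string
		skipPostValidation PostValidation
	}{
		// Tests that stop nodes on purpose opt out of the dead-node check.
		{name: "decommission-self", skipPostValidation: PostValidationNoDeadNodes},
		{name: "event-log"}, // zero value: the check still runs
	}
	for _, tc := range cases {
		spec := TestSpec{
			Name:                "acceptance/" + tc.name,
			SkipPostValidations: tc.skipPostValidation,
		}
		fmt.Printf("%s: skip mask %04b\n", spec.Name, spec.SkipPostValidations)
	}
}
```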
diff --git a/pkg/roachprod/install/cluster_synced.go b/pkg/roachprod/install/cluster_synced.go
index 6676b6745271..84f2dd2d9a95 100644
--- a/pkg/roachprod/install/cluster_synced.go
+++ b/pkg/roachprod/install/cluster_synced.go
@@ -609,6 +609,7 @@ func (c *SyncedCluster) Wipe(ctx context.Context, l *logger.Logger, preserveCert
 		rmCmds := []string{
 			fmt.Sprintf(`sudo find /mnt/data* -maxdepth 1 -type f -not -name %s -exec rm -f {} \;`, vm.InitializedFile),
 			`sudo rm -fr /mnt/data*/{auxiliary,local,tmp,cassandra,cockroach,cockroach-temp*,mongo-data}`,
+			`sudo rm -fr *.sh`,
 			`sudo rm -fr logs* data*`,
 		}
 		if !preserveCerts {
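Note: the Wipe change closes the loop with SystemNodes. The sibling commands (`sudo rm -fr logs* data*`) use relative paths, which suggests this list runs from the node's home directory; `sudo rm -fr *.sh` then clears out any start scripts a previous cluster left behind. If those survived a wipe, the `test -f` probe in SystemNodes would keep reporting the node as running the system interface. A small sketch of what the glob matches (the script layout is an assumption here):

```go
package main

import (
	"fmt"
	"path/filepath"
)

func main() {
	// filepath.Glob mirrors what the shell expands `*.sh` to in the
	// working directory: any leftover start script, e.g. the system
	// interface script that SystemNodes probes for.
	matches, err := filepath.Glob("*.sh")
	if err != nil {
		panic(err)
	}
	fmt.Println("would remove:", matches)
}
```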