Skip to content

Commit

Permalink
roachtest: remove assert dead nodes
Browse files Browse the repository at this point in the history
Previously, the post assertions would check for dead nodes by using
`roachprod.Monitor`. It checks if the data directory of a cockroach node is
non-trivial. This is not a reliable way to exclude empty nodes, or to check for
dead nodes. It also complicates the Monitor functionality unnecessarily. It also
no longer functions, since `roachprod.Monitor` only watches already live
processes.

This change removes this functionality as it no longer works. An issue (See:
cockroachdb#137329) has been created to address this should it still be a requirement.

Informs: cockroachdb#118214
Epic: None
  • Loading branch information
herkolategan committed Dec 12, 2024
1 parent 677943d commit c7c9e6f
Show file tree
Hide file tree
Showing 5 changed files with 45 additions and 72 deletions.
69 changes: 23 additions & 46 deletions pkg/cmd/roachtest/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"log"
"math/rand"
"net"
"net/http"
"net/url"
"os"
"os/exec"
Expand Down Expand Up @@ -1466,43 +1467,6 @@ func (c *clusterImpl) FetchVMSpecs(ctx context.Context, l *logger.Logger) error
})
}

// checkNoDeadNode returns an error if at least one of the nodes that have a populated
// data dir are found to be not running. It prints both to t.L() and the test
// output.
func (c *clusterImpl) assertNoDeadNode(ctx context.Context, t test.Test) error {
if c.spec.NodeCount == 0 {
// No nodes can happen during unit tests and implies nothing to do.
return nil
}

t.L().Printf("checking for dead nodes")
eventsCh, err := roachprod.Monitor(ctx, t.L(), c.name, install.MonitorOpts{OneShot: true, IgnoreEmptyNodes: true})

// An error here means there was a problem initialising a SyncedCluster.
if err != nil {
return err
}

deadProcesses := 0
for info := range eventsCh {
t.L().Printf("%s", info)

if _, isDeath := info.Event.(install.MonitorProcessDead); isDeath {
deadProcesses++
}
}

var plural string
if deadProcesses > 1 {
plural = "es"
}

if deadProcesses > 0 {
return errors.Newf("%d dead cockroach process%s detected", deadProcesses, plural)
}
return nil
}

func selectedNodesOrDefault(
opts []option.Option, defaultNodes option.NodeListOption,
) option.NodeListOption {
Expand Down Expand Up @@ -1540,16 +1504,19 @@ func newHealthStatusResult(node int, status int, body []byte, err error) *Health
func (c *clusterImpl) HealthStatus(
ctx context.Context, l *logger.Logger, nodes option.NodeListOption,
) ([]*HealthStatusResult, error) {
if len(nodes) < 1 {
nodeCount := len(nodes)
if nodeCount < 1 {
return nil, nil // unit tests
}

adminAddrs, err := c.ExternalAdminUIAddr(ctx, l, nodes)
if err != nil {
return nil, errors.WithDetail(err, "Unable to get admin UI address(es)")
}
client := roachtestutil.DefaultHTTPClient(c, l)
getStatus := func(ctx context.Context, node int) *HealthStatusResult {
url := fmt.Sprintf(`https://%s/health?ready=1`, adminAddrs[node-1])
getStatus := func(ctx context.Context, nodeIndex, node int) *HealthStatusResult {
url := fmt.Sprintf(`https://%s/health?ready=1`, adminAddrs[nodeIndex])

resp, err := client.Get(ctx, url)
if err != nil {
return newHealthStatusResult(node, 0, nil, err)
Expand All @@ -1561,16 +1528,26 @@ func (c *clusterImpl) HealthStatus(
return newHealthStatusResult(node, resp.StatusCode, body, err)
}

results := make([]*HealthStatusResult, c.spec.NodeCount)
results := make([]*HealthStatusResult, nodeCount)

_ = timeutil.RunWithTimeout(ctx, "health status", 15*time.Second, func(ctx context.Context) error {
var wg sync.WaitGroup
wg.Add(c.spec.NodeCount)
for i := 1; i <= c.spec.NodeCount; i++ {
go func(node int) {
wg.Add(nodeCount)
for i := 0; i < nodeCount; i++ {
go func() {
defer wg.Done()
results[node-1] = getStatus(ctx, node)
}(i)
for {
results[i] = getStatus(ctx, i, nodes[i])
if results[i].Err == nil && results[i].Status == http.StatusOK {
return
}
select {
case <-ctx.Done():
return
case <-time.After(3 * time.Second):
}
}
}()
}
wg.Wait()
return nil
Expand Down
3 changes: 3 additions & 0 deletions pkg/cmd/roachtest/registry/test_spec.go
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,9 @@ const (
// the crdb_internal.invalid_objects virtual table.
PostValidationInvalidDescriptors
// PostValidationNoDeadNodes checks if there are any dead nodes in the cluster.
// TODO: Deprecate or replace this functionality
// In its current state it is no longer functional.
// See: https://github.com/cockroachdb/cockroach/issues/137329 for details.
PostValidationNoDeadNodes
)

Expand Down
16 changes: 3 additions & 13 deletions pkg/cmd/roachtest/test_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -1487,27 +1487,16 @@ func (r *testRunner) postTestAssertions(
postAssertCh := make(chan struct{})
_ = r.stopper.RunAsyncTask(ctx, "test-post-assertions", func(ctx context.Context) {
defer close(postAssertCh)
// When a dead node is detected, the subsequent post validation queries are likely
// to hang (reason unclear), and eventually timeout according to the statement_timeout.
// If this occurs frequently enough, we can look at skipping post validations on a node
// failure (or even on any test failure).
if err := c.assertNoDeadNode(ctx, t); err != nil {
// Some tests expect dead nodes, so they may opt out of this check.
if t.spec.SkipPostValidations&registry.PostValidationNoDeadNodes == 0 {
postAssertionErr(err)
} else {
t.L().Printf("dead node(s) detected but expected")
}
}

// We collect all the admin health endpoints in parallel,
// and select the first one that succeeds to run the validation queries
statuses, err := c.HealthStatus(ctx, t.L(), c.All())
statuses, err := c.HealthStatus(ctx, t.L(), c.CRDBNodes())
if err != nil {
postAssertionErr(errors.WithDetail(err, "Unable to check health status"))
}

validationNode := 0
liveNodes := 0
for _, s := range statuses {
if s.Err != nil {
t.L().Printf("n%d:/health?ready=1 error=%s", s.Node, s.Err)
Expand All @@ -1523,6 +1512,7 @@ func (r *testRunner) postTestAssertions(
validationNode = s.Node // NB: s.Node is never zero
}
t.L().Printf("n%d:/health?ready=1 status=200 ok", s.Node)
liveNodes++
}

// We avoid trying to do this when t.Failed() (and in particular when there
Expand Down
28 changes: 15 additions & 13 deletions pkg/cmd/roachtest/tests/acceptance.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ func registerAcceptance(r registry.Registry) {
randomized bool
workloadNode bool
incompatibleClouds registry.CloudSet
skipPostValidation registry.PostValidation
}{
// NOTE: acceptance tests are lightweight tests that run as part
// of CI. As such, they must:
Expand All @@ -44,7 +45,7 @@ func registerAcceptance(r registry.Registry) {
// properties, please register it separately (not as an acceptance
// test).
registry.OwnerKV: {
{name: "decommission-self", fn: runDecommissionSelf},
{name: "decommission-self", fn: runDecommissionSelf, skipPostValidation: registry.PostValidationNoDeadNodes},
{name: "event-log", fn: runEventLog},
{name: "gossip/peerings", fn: runGossipPeerings},
{name: "gossip/restart", fn: runGossipRestart},
Expand All @@ -59,8 +60,8 @@ func registerAcceptance(r registry.Registry) {
encryptionSupport: registry.EncryptionMetamorphic,
},
{name: "cli/node-status", fn: runCLINodeStatus},
{name: "cluster-init", fn: runClusterInit},
{name: "rapid-restart", fn: runRapidRestart},
{name: "cluster-init", fn: runClusterInit, skipPostValidation: registry.PostValidationNoDeadNodes},
{name: "rapid-restart", fn: runRapidRestart, skipPostValidation: registry.PostValidationNoDeadNodes},
},
registry.OwnerObservability: {
{name: "status-server", fn: runStatusServer},
Expand Down Expand Up @@ -139,16 +140,17 @@ func registerAcceptance(r registry.Registry) {
}

testSpec := registry.TestSpec{
Name: "acceptance/" + tc.name,
Owner: owner,
Cluster: r.MakeClusterSpec(numNodes, extraOptions...),
Skip: tc.skip,
EncryptionSupport: tc.encryptionSupport,
Timeout: 10 * time.Minute,
CompatibleClouds: registry.AllClouds.Remove(tc.incompatibleClouds),
Suites: registry.Suites(registry.Nightly, registry.Quick, registry.Acceptance),
Randomized: tc.randomized,
RequiresLicense: tc.requiresLicense,
Name: "acceptance/" + tc.name,
Owner: owner,
Cluster: r.MakeClusterSpec(numNodes, extraOptions...),
Skip: tc.skip,
EncryptionSupport: tc.encryptionSupport,
Timeout: 10 * time.Minute,
CompatibleClouds: registry.AllClouds.Remove(tc.incompatibleClouds),
Suites: registry.Suites(registry.Nightly, registry.Quick, registry.Acceptance),
Randomized: tc.randomized,
RequiresLicense: tc.requiresLicense,
SkipPostValidations: tc.skipPostValidation,
}

if tc.timeout != 0 {
Expand Down
1 change: 1 addition & 0 deletions pkg/roachprod/install/cluster_synced.go
Original file line number Diff line number Diff line change
Expand Up @@ -609,6 +609,7 @@ func (c *SyncedCluster) Wipe(ctx context.Context, l *logger.Logger, preserveCert
rmCmds := []string{
fmt.Sprintf(`sudo find /mnt/data* -maxdepth 1 -type f -not -name %s -exec rm -f {} \;`, vm.InitializedFile),
`sudo rm -fr /mnt/data*/{auxiliary,local,tmp,cassandra,cockroach,cockroach-temp*,mongo-data}`,
`sudo rm -fr *.sh`,
`sudo rm -fr logs* data*`,
}
if !preserveCerts {
Expand Down

0 comments on commit c7c9e6f

Please sign in to comment.