Skip to content

Commit

Permalink
roachtest: remove assert dead nodes
Browse files Browse the repository at this point in the history
Previously, the post assertions would check for dead nodes by using roachprod
Monitor. It checks if the data directory of a cockroach node is non-trivial.
This is not a reliable way to exclude empty nodes, or to check for dead nodes.
It also complicates the Monitor functionality unnecessarily.

This change refactors the code to rather use the HTTP health checks to determine
if there are dead nodes. Since we now know which node is used for the workload,
this check only needs to determine if the cockroach nodes are all still live.

Informs: cockroachdb#118214
Epic: None
  • Loading branch information
herkolategan committed Dec 9, 2024
1 parent 4d7952c commit 22636b9
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 58 deletions.
55 changes: 10 additions & 45 deletions pkg/cmd/roachtest/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -1466,43 +1466,6 @@ func (c *clusterImpl) FetchVMSpecs(ctx context.Context, l *logger.Logger) error
})
}

// checkNoDeadNode returns an error if at least one of the nodes that have a populated
// data dir are found to be not running. It prints both to t.L() and the test
// output.
func (c *clusterImpl) assertNoDeadNode(ctx context.Context, t test.Test) error {
if c.spec.NodeCount == 0 {
// No nodes can happen during unit tests and implies nothing to do.
return nil
}

t.L().Printf("checking for dead nodes")
eventsCh, err := roachprod.Monitor(ctx, t.L(), c.name, install.MonitorOpts{OneShot: true, IgnoreEmptyNodes: true})

// An error here means there was a problem initialising a SyncedCluster.
if err != nil {
return err
}

deadProcesses := 0
for info := range eventsCh {
t.L().Printf("%s", info)

if _, isDeath := info.Event.(install.MonitorProcessDead); isDeath {
deadProcesses++
}
}

var plural string
if deadProcesses > 1 {
plural = "es"
}

if deadProcesses > 0 {
return errors.Newf("%d dead cockroach process%s detected", deadProcesses, plural)
}
return nil
}

func selectedNodesOrDefault(
opts []option.Option, defaultNodes option.NodeListOption,
) option.NodeListOption {
Expand Down Expand Up @@ -1540,16 +1503,18 @@ func newHealthStatusResult(node int, status int, body []byte, err error) *Health
func (c *clusterImpl) HealthStatus(
ctx context.Context, l *logger.Logger, nodes option.NodeListOption,
) ([]*HealthStatusResult, error) {
if len(nodes) < 1 {
nodeCount := len(nodes)
if nodeCount < 1 {
return nil, nil // unit tests
}

adminAddrs, err := c.ExternalAdminUIAddr(ctx, l, nodes)
if err != nil {
return nil, errors.WithDetail(err, "Unable to get admin UI address(es)")
}
client := roachtestutil.DefaultHTTPClient(c, l)
getStatus := func(ctx context.Context, node int) *HealthStatusResult {
url := fmt.Sprintf(`https://%s/health?ready=1`, adminAddrs[node-1])
getStatus := func(ctx context.Context, nodeIndex, node int) *HealthStatusResult {
url := fmt.Sprintf(`https://%s/health?ready=1`, adminAddrs[nodeIndex])
resp, err := client.Get(ctx, url)
if err != nil {
return newHealthStatusResult(node, 0, nil, err)
Expand All @@ -1561,16 +1526,16 @@ func (c *clusterImpl) HealthStatus(
return newHealthStatusResult(node, resp.StatusCode, body, err)
}

results := make([]*HealthStatusResult, c.spec.NodeCount)
results := make([]*HealthStatusResult, nodeCount)

_ = timeutil.RunWithTimeout(ctx, "health status", 15*time.Second, func(ctx context.Context) error {
var wg sync.WaitGroup
wg.Add(c.spec.NodeCount)
for i := 1; i <= c.spec.NodeCount; i++ {
go func(node int) {
for i := 0; i < nodeCount; i++ {
go func() {
defer wg.Done()
results[node-1] = getStatus(ctx, node)
}(i)
results[i] = getStatus(ctx, i, nodes[i])
}()
}
wg.Wait()
return nil
Expand Down
32 changes: 19 additions & 13 deletions pkg/cmd/roachtest/test_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -1487,27 +1487,16 @@ func (r *testRunner) postTestAssertions(
postAssertCh := make(chan struct{})
_ = r.stopper.RunAsyncTask(ctx, "test-post-assertions", func(ctx context.Context) {
defer close(postAssertCh)
// When a dead node is detected, the subsequent post validation queries are likely
// to hang (reason unclear), and eventually timeout according to the statement_timeout.
// If this occurs frequently enough, we can look at skipping post validations on a node
// failure (or even on any test failure).
if err := c.assertNoDeadNode(ctx, t); err != nil {
// Some tests expect dead nodes, so they may opt out of this check.
if t.spec.SkipPostValidations&registry.PostValidationNoDeadNodes == 0 {
postAssertionErr(err)
} else {
t.L().Printf("dead node(s) detected but expected")
}
}

// We collect all the admin health endpoints in parallel,
// and select the first one that succeeds to run the validation queries
statuses, err := c.HealthStatus(ctx, t.L(), c.All())
statuses, err := c.HealthStatus(ctx, t.L(), c.CRDBNodes())
if err != nil {
postAssertionErr(errors.WithDetail(err, "Unable to check health status"))
}

validationNode := 0
liveNodes := 0
for _, s := range statuses {
if s.Err != nil {
t.L().Printf("n%d:/health?ready=1 error=%s", s.Node, s.Err)
Expand All @@ -1523,6 +1512,23 @@ func (r *testRunner) postTestAssertions(
validationNode = s.Node // NB: s.Node is never zero
}
t.L().Printf("n%d:/health?ready=1 status=200 ok", s.Node)
liveNodes++
}

// When a dead node is detected, the subsequent post validation queries are likely
// to hang (reason unclear), and eventually timeout according to the statement_timeout.
// If this occurs frequently enough, we can look at skipping post validations on a node
// failure (or even on any test failure).
if liveNodes != len(c.CRDBNodes()) {
// Some tests expect dead nodes, so they may opt out of this check.
if t.spec.SkipPostValidations&registry.PostValidationNoDeadNodes == 0 {
postAssertionErr(
errors.Newf("unexpected dead node(s) detected, expected %d live nodes out of %d",
liveNodes, len(c.CRDBNodes())),
)
} else {
t.L().Printf("dead node(s) detected but expected")
}
}

// We avoid trying to do this when t.Failed() (and in particular when there
Expand Down

0 comments on commit 22636b9

Please sign in to comment.