From 67e1af755bfc114344051025a1183908763041ff Mon Sep 17 00:00:00 2001
From: Miral Gadani
Date: Fri, 28 Apr 2023 20:04:25 +0000
Subject: [PATCH] roachtest: remove `ConnectToLiveNode` and instead report on
 health of all nodes in test teardown.

Validations can use a DB connection from any node whose /health?ready=1
endpoint returns HTTP 200.

Epic: none
Release note: none
Informs: #102603
---
 pkg/cmd/roachtest/cluster.go     | 74 +++++++++++++++++++-------------
 pkg/cmd/roachtest/test_runner.go | 31 ++++++++++++-
 2 files changed, 74 insertions(+), 31 deletions(-)

diff --git a/pkg/cmd/roachtest/cluster.go b/pkg/cmd/roachtest/cluster.go
index 9601a1cb34cf..ac0cbec56bed 100644
--- a/pkg/cmd/roachtest/cluster.go
+++ b/pkg/cmd/roachtest/cluster.go
@@ -18,7 +18,6 @@ import (
 	"io"
 	"io/fs"
 	"net"
-	"net/http"
 	"net/url"
 	"os"
 	"os/exec"
@@ -1366,45 +1365,62 @@ func (c *clusterImpl) assertNoDeadNode(ctx context.Context, t test.Test) error {
 	return nil
 }
 
-// ConnectToLiveNode returns a connection to a live node in the cluster. If no
-// live node is found, it returns nil and -1. If a live node is found it returns
-// a connection to it and the node's index.
-func (c *clusterImpl) ConnectToLiveNode(ctx context.Context, t *testImpl) (*gosql.DB, int) {
-	if c.spec.NodeCount < 1 {
-		return nil, -1 // unit tests
-	}
+type HealthStatusResult struct {
+	Node   int
+	Status int
+	Body   []byte
+	Err    error
+}
 
-	checkReady := func(ctx context.Context, node int) bool {
-		adminAddr, err := c.ExternalAdminUIAddr(ctx, t.L(), c.Node(node))
-		if err != nil {
-			t.L().Printf("n%d not ready/live: %s", node, err)
-			return false
-		}
+func newHealthStatusResult(node int, status int, body []byte, err error) *HealthStatusResult {
+	return &HealthStatusResult{
+		Node:   node,
+		Status: status,
+		Body:   body,
+		Err:    err,
+	}
+}
 
-		url := fmt.Sprintf(`http://%s/health?ready=1`, adminAddr[0])
+// HealthStatus returns the result of the /health?ready=1 endpoint for each node.
+func (c *clusterImpl) HealthStatus(
+	ctx context.Context, l *logger.Logger, node option.NodeListOption,
+) ([]*HealthStatusResult, error) {
+	if len(node) < 1 {
+		return nil, nil // unit tests
+	}
+	adminAddrs, err := c.ExternalAdminUIAddr(ctx, l, node)
+	if err != nil {
+		return nil, errors.WithDetail(err, "Unable to get admin UI address(es)")
+	}
+	getStatus := func(ctx context.Context, node int) *HealthStatusResult {
+		url := fmt.Sprintf(`http://%s/health?ready=1`, adminAddrs[node-1])
 		resp, err := httputil.Get(ctx, url)
 		if err != nil {
-			t.L().Printf("n%d not ready/live: %s", node, err)
-			return false
+			return newHealthStatusResult(node, 0, nil, err)
 		}
 		defer resp.Body.Close()
 		body, err := io.ReadAll(resp.Body)
-		if err != nil || resp.StatusCode != http.StatusOK {
-			t.L().Printf("n%d not ready/live: HTTP %d \n%s", node, resp.StatusCode, body)
-			return false
-		}
-		return true
+		return newHealthStatusResult(node, resp.StatusCode, body, err)
 	}
 
-	// Find a live node to run against, if one exists.
-	for i := 1; i <= c.spec.NodeCount; i++ {
-		if checkReady(ctx, i) {
-			return c.Conn(ctx, t.L(), i), i
+	results := make([]*HealthStatusResult, c.spec.NodeCount)
+
+	_ = contextutil.RunWithTimeout(ctx, "health status", 15*time.Second, func(ctx context.Context) error {
+		var wg sync.WaitGroup
+		wg.Add(c.spec.NodeCount)
+		for i := 1; i <= c.spec.NodeCount; i++ {
+			go func(node int) {
+				defer wg.Done()
+				results[node-1] = getStatus(ctx, node)
+			}(i)
 		}
-	}
-	return nil, -1
+		wg.Wait()
+		return nil
+	})
+
+	return results, nil
 }
 
 // FailOnInvalidDescriptors fails the test if there exists any descriptors in
@@ -2299,7 +2315,7 @@ func (c *clusterImpl) InternalAdminUIAddr(
 	return addrs, nil
 }
 
-// ExternalAdminUIAddr returns the internal Admin UI address in the form host:port
+// ExternalAdminUIAddr returns the external Admin UI address in the form host:port
 // for the specified node.
 func (c *clusterImpl) ExternalAdminUIAddr(
 	ctx context.Context, l *logger.Logger, node option.NodeListOption,
diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go
index 39c24979d275..c1cf92026999 100644
--- a/pkg/cmd/roachtest/test_runner.go
+++ b/pkg/cmd/roachtest/test_runner.go
@@ -13,6 +13,7 @@ package main
 import (
 	"archive/zip"
 	"context"
+	gosql "database/sql"
 	"fmt"
 	"html"
 	"io"
@@ -1092,6 +1093,33 @@ func (r *testRunner) teardownTest(
 		}
 	}
 
+	// We collect the health status of every node in parallel and use the
+	// first healthy (HTTP 200) node to run the validation queries.
+	statuses, err := c.HealthStatus(ctx, t.L(), c.All())
+	if err != nil {
+		t.Error(errors.WithDetail(err, "Unable to check health status"))
+	}
+
+	var db *gosql.DB
+	var validationNode int
+	for _, s := range statuses {
+		if s.Err != nil {
+			t.L().Printf("n%d:/health?ready=1 error=%s", s.Node, s.Err)
+			continue
+		}
+
+		if s.Status != http.StatusOK {
+			t.L().Printf("n%d:/health?ready=1 status=%d body=%s", s.Node, s.Status, s.Body)
+			continue
+		}
+
+		if db == nil {
+			db = c.Conn(ctx, t.L(), s.Node)
+			validationNode = s.Node
+		}
+		t.L().Printf("n%d:/health?ready=1 status=200 ok", s.Node)
+	}
+
 	// We avoid trying to do this when t.Failed() (and in particular when there
 	// are dead nodes) because for reasons @tbg does not understand this gets
 	// stuck occasionally, which really ruins the roachtest run. The method
@@ -1100,10 +1128,9 @@ func (r *testRunner) teardownTest(
 	//
 	// TODO(testinfra): figure out why this can still get stuck despite the
 	// above.
-	db, node := c.ConnectToLiveNode(ctx, t)
 	if db != nil {
 		defer db.Close()
-		t.L().Printf("running validation checks on node %d (<10m)", node)
+		t.L().Printf("running validation checks on node %d (<10m)", validationNode)
 		// If this validation fails due to a timeout, it is very likely that
 		// the replica divergence check below will also fail.
 		if t.spec.SkipPostValidations&registry.PostValidationInvalidDescriptors == 0 {
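
Standalone sketch (not part of the patch): it mirrors the fan-out pattern the new HealthStatus helper and the teardown code use, probing every node's /health?ready=1 endpoint concurrently under one timeout and then picking the first node that answers HTTP 200 for the validation connection. The node addresses are hypothetical placeholders, and the plain net/http client stands in for the httputil wrapper used in the patch.

package main

import (
	"context"
	"fmt"
	"io"
	"net/http"
	"sync"
	"time"
)

// healthResult mirrors the shape of the patch's HealthStatusResult.
type healthResult struct {
	node   int
	status int
	body   []byte
	err    error
}

func main() {
	// Hypothetical admin UI addresses, one per node.
	adminAddrs := []string{"10.0.0.1:26258", "10.0.0.2:26258", "10.0.0.3:26258"}

	// One shared timeout for all probes, similar to the 15s bound in the patch.
	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer cancel()

	results := make([]healthResult, len(adminAddrs))
	var wg sync.WaitGroup
	wg.Add(len(adminAddrs))
	for i, addr := range adminAddrs {
		go func(node int, addr string) {
			defer wg.Done()
			url := fmt.Sprintf("http://%s/health?ready=1", addr)
			req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
			if err != nil {
				results[node-1] = healthResult{node: node, err: err}
				return
			}
			resp, err := http.DefaultClient.Do(req)
			if err != nil {
				results[node-1] = healthResult{node: node, err: err}
				return
			}
			defer resp.Body.Close()
			body, err := io.ReadAll(resp.Body)
			results[node-1] = healthResult{node: node, status: resp.StatusCode, body: body, err: err}
		}(i+1, addr)
	}
	wg.Wait()

	// Pick the first node that reported HTTP 200; validations would run there.
	for _, r := range results {
		if r.err == nil && r.status == http.StatusOK {
			fmt.Printf("n%d is healthy; run validations against it\n", r.node)
			return
		}
		fmt.Printf("n%d not ready: status=%d err=%v\n", r.node, r.status, r.err)
	}
	fmt.Println("no healthy node found; skipping validations")
}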