Skip to content

Commit

Permalink
roachtest: use health endpoints to determine whether a node is suitable
Browse files Browse the repository at this point in the history
for running post-test validations. This avoids connecting to drained
or decommissioned nodes.

Epic: none
Release note: none
Fixes: #102603
  • Loading branch information
Miral Gadani committed Apr 28, 2023
1 parent f8523ec commit 94c5d9a
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 22 deletions.
1 change: 1 addition & 0 deletions pkg/cmd/roachtest/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ go_library(
"//pkg/util/allstacks",
"//pkg/util/contextutil",
"//pkg/util/ctxgroup",
"//pkg/util/httputil",
"//pkg/util/log",
"//pkg/util/quotapool",
"//pkg/util/randutil",
Expand Down
54 changes: 32 additions & 22 deletions pkg/cmd/roachtest/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"io"
"io/fs"
"net"
"net/http"
"net/url"
"os"
"os/exec"
Expand Down Expand Up @@ -45,6 +46,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/roachprod/prometheus"
"github.com/cockroachdb/cockroach/pkg/roachprod/vm"
"github.com/cockroachdb/cockroach/pkg/util/contextutil"
"github.com/cockroachdb/cockroach/pkg/util/httputil"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/quotapool"
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
Expand Down Expand Up @@ -1355,7 +1357,7 @@ func (c *clusterImpl) assertNoDeadNode(ctx context.Context, t test.Test) error {
deadNodes++
}

t.L().Printf("node %d: err=%v,msg=%s", n.Node, n.Err, n.Msg)
t.L().Printf("n%d: err=%v,msg=%s", n.Node, n.Err, n.Msg)
}

if deadNodes > 0 {
Expand All @@ -1368,33 +1370,41 @@ func (c *clusterImpl) assertNoDeadNode(ctx context.Context, t test.Test) error {
// live node is found, it returns nil and -1. If a live node is found, it returns
// a connection to it and the node's index.
func (c *clusterImpl) ConnectToLiveNode(ctx context.Context, t *testImpl) (*gosql.DB, int) {
// NOTE(review): this span looks like a rendered diff in which removed and
// added lines are interleaved without +/- markers (e.g. the two consecutive
// returns just below, and the RunWithTimeout probe next to the checkReady
// call in the loop) — confirm against the actual file before relying on it.
node := -1
if c.spec.NodeCount < 1 {
return nil, node // unit tests
return nil, -1 // unit tests
}

// checkReady reports whether the given node answers its
// `/health?ready=1` HTTP endpoint with a 200 status. Every failure mode
// (admin address lookup, the GET itself, reading the body, or a non-200
// status) is logged through the test logger and reported as false.
checkReady := func(ctx context.Context, node int) bool {
adminAddr, err := c.ExternalAdminUIAddr(ctx, t.L(), c.Node(node))
if err != nil {
t.L().Printf("n%d not ready/live: %s", node, err)
return false
}

// Only the first address returned for the node is probed.
url := fmt.Sprintf(`http://%s/health?ready=1`, adminAddr[0])
resp, err := httputil.Get(ctx, url)
if err != nil {
t.L().Printf("n%d not ready/live: %s", node, err)
return false
}

// The body is read only so that it can be included in the log line when
// the node turns out not to be ready.
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
if err != nil || resp.StatusCode != http.StatusOK {
t.L().Printf("n%d not ready/live: HTTP %d \n%s", node, resp.StatusCode, body)
return false
}

return true
}

// Find a live node to run against, if one exists.
// Nodes are tried in ascending order starting at 1.
var db *gosql.DB
for i := 1; i <= c.spec.NodeCount; i++ {
// Don't hang forever.
if err := contextutil.RunWithTimeout(
ctx, "find live node", 5*time.Second,
func(ctx context.Context) error {
db = c.Conn(ctx, t.L(), i)
_, err := db.ExecContext(ctx, `;`)
return err
},
); err != nil {
_ = db.Close()
db = nil
continue
// The first node that passes the health probe is returned together with
// a connection to it.
if checkReady(ctx, i) {
return c.Conn(ctx, t.L(), i), i
}
node = i
break
}
if db == nil {
return nil, node
}
return db, node
// No node passed the probe.
return nil, -1
}

// FailOnInvalidDescriptors fails the test if there exists any descriptors in
Expand Down

0 comments on commit 94c5d9a

Please sign in to comment.