Merge pull request #104882 from renatolabs/roachtest-improvements-backport
renatolabs authored Jun 14, 2023
2 parents 88f9534 + f09f633 commit 69d57f3
Showing 6 changed files with 141 additions and 53 deletions.
1 change: 1 addition & 0 deletions pkg/cmd/roachtest/BUILD.bazel
@@ -38,6 +38,7 @@ go_library(
"//pkg/util/allstacks",
"//pkg/util/contextutil",
"//pkg/util/ctxgroup",
"//pkg/util/httputil",
"//pkg/util/log",
"//pkg/util/quotapool",
"//pkg/util/randutil",
137 changes: 96 additions & 41 deletions pkg/cmd/roachtest/cluster.go
@@ -45,6 +45,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/roachprod/prometheus"
"github.com/cockroachdb/cockroach/pkg/roachprod/vm"
"github.com/cockroachdb/cockroach/pkg/util/contextutil"
"github.com/cockroachdb/cockroach/pkg/util/httputil"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/quotapool"
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
@@ -1393,6 +1394,7 @@ func (c *clusterImpl) CopyRoachprodState(ctx context.Context) error {
//
// `COCKROACH_DEBUG_TS_IMPORT_FILE=tsdump.gob ./cockroach start-single-node --insecure --store=$(mktemp -d)`
func (c *clusterImpl) FetchTimeseriesData(ctx context.Context, l *logger.Logger) error {
l.Printf("fetching timeseries data\n")
return contextutil.RunWithTimeout(ctx, "fetch tsdata", 5*time.Minute, func(ctx context.Context) error {
node := 1
for ; node <= c.spec.NodeCount; node++ {
@@ -1507,63 +1509,115 @@ func (c *clusterImpl) FetchDebugZip(ctx context.Context, l *logger.Logger) error
})
}

// checkNoDeadNode reports an error (via `t.Error`) if nodes that have a populated
// checkNoDeadNode returns an error if at least one of the nodes that have a populated
// data dir are found to be not running. It prints both to t.L() and the test
// output.
func (c *clusterImpl) assertNoDeadNode(ctx context.Context, t test.Test) {
func (c *clusterImpl) assertNoDeadNode(ctx context.Context, t test.Test) error {
if c.spec.NodeCount == 0 {
// No nodes can happen during unit tests and implies nothing to do.
return
return nil
}

_, err := roachprod.Monitor(ctx, t.L(), c.name, install.MonitorOpts{OneShot: true, IgnoreEmptyNodes: true})
// If there's an error, it means either that the monitor command failed
// completely, or that it found a dead node worth complaining about.
t.L().Printf("checking for dead nodes")
ch, err := roachprod.Monitor(ctx, t.L(), c.name, install.MonitorOpts{OneShot: true, IgnoreEmptyNodes: true})

// An error here means there was a problem initialising a SyncedCluster.
if err != nil {
t.Errorf("dead node detection: %s", err)
}
}

// ConnectToLiveNode returns a connection to a live node in the cluster. If no
// live node is found, it returns nil and -1. If a live node is found it returns
// a connection to it and the node's index.
func (c *clusterImpl) ConnectToLiveNode(ctx context.Context, t *testImpl) (*gosql.DB, int) {
node := -1
if c.spec.NodeCount < 1 {
return nil, node // unit tests
}
// Find a live node to run against, if one exists.
var db *gosql.DB
for i := 1; i <= c.spec.NodeCount; i++ {
// Don't hang forever.
if err := contextutil.RunWithTimeout(
ctx, "find live node", 5*time.Second,
func(ctx context.Context) error {
db = c.Conn(ctx, t.L(), i)
_, err := db.ExecContext(ctx, `;`)
return err
},
); err != nil {
_ = db.Close()
db = nil
continue
return err
}

isDead := func(msg string) bool {
if msg == "" || msg == "skipped" {
return false
}
// A numeric message is a PID and implies that the node is running.
_, err := strconv.Atoi(msg)
return err != nil
}

deadNodes := 0
for n := range ch {
// If there's an error, it means either that the monitor command failed
// completely, or that it found a dead node worth complaining about.
if n.Err != nil || isDead(n.Msg) {
deadNodes++
}
node = i
break

t.L().Printf("n%d: err=%v,msg=%s", n.Node, n.Err, n.Msg)
}
if db == nil {
return nil, node

if deadNodes > 0 {
return errors.Newf("%d dead node(s) detected", deadNodes)
}
return db, node
return nil
}
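
For reference, a minimal standalone sketch of the liveness heuristic used by the new monitor loop above; the sample messages are hypothetical and only illustrate the PID-versus-text distinction:

package main

import (
	"fmt"
	"strconv"
)

// isDead mirrors the heuristic in assertNoDeadNode above: an empty or
// "skipped" message means nothing to report, a purely numeric message is a
// PID (the node is running), and anything else is treated as a dead node.
func isDead(msg string) bool {
	if msg == "" || msg == "skipped" {
		return false
	}
	_, err := strconv.Atoi(msg)
	return err != nil
}

func main() {
	// Hypothetical monitor messages, for illustration only.
	for _, msg := range []string{"", "skipped", "12345", "dead (exit status 137)"} {
		fmt.Printf("%-26q dead=%v\n", msg, isDead(msg))
	}
}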

type HealthStatusResult struct {
Node int
Status int
Body []byte
Err error
}

func newHealthStatusResult(node int, status int, body []byte, err error) *HealthStatusResult {
return &HealthStatusResult{
Node: node,
Status: status,
Body: body,
Err: err,
}
}

// HealthStatus returns the result of the /health?ready=1 endpoint for each node.
func (c *clusterImpl) HealthStatus(
ctx context.Context, l *logger.Logger, node option.NodeListOption,
) ([]*HealthStatusResult, error) {
if len(node) < 1 {
return nil, nil // unit tests
}
adminAddrs, err := c.ExternalAdminUIAddr(ctx, l, node)
if err != nil {
return nil, errors.WithDetail(err, "Unable to get admin UI address(es)")
}
getStatus := func(ctx context.Context, node int) *HealthStatusResult {
url := fmt.Sprintf(`http://%s/health?ready=1`, adminAddrs[node-1])
resp, err := httputil.Get(ctx, url)
if err != nil {
return newHealthStatusResult(node, 0, nil, err)
}

defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)

return newHealthStatusResult(node, resp.StatusCode, body, err)
}

results := make([]*HealthStatusResult, c.spec.NodeCount)

_ = contextutil.RunWithTimeout(ctx, "health status", 15*time.Second, func(ctx context.Context) error {
var wg sync.WaitGroup
wg.Add(c.spec.NodeCount)
for i := 1; i <= c.spec.NodeCount; i++ {
go func(node int) {
defer wg.Done()
results[node-1] = getStatus(ctx, node)
}(i)
}
wg.Wait()
return nil
})

return results, nil
}
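
For illustration, a rough standard-library equivalent of a single readiness probe, assuming a plain-HTTP admin UI address; the real helper above resolves addresses via ExternalAdminUIAddr and goes through the repo's httputil wrapper:

package main

import (
	"context"
	"fmt"
	"io"
	"net/http"
	"time"
)

// probe issues the readiness check against a single admin UI address. The
// address used in main is a placeholder for this sketch.
func probe(ctx context.Context, addr string) (int, []byte, error) {
	ctx, cancel := context.WithTimeout(ctx, 15*time.Second)
	defer cancel()

	req, err := http.NewRequestWithContext(ctx, http.MethodGet,
		fmt.Sprintf("http://%s/health?ready=1", addr), nil)
	if err != nil {
		return 0, nil, err
	}
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return 0, nil, err
	}
	defer resp.Body.Close()
	body, err := io.ReadAll(resp.Body)
	return resp.StatusCode, body, err
}

func main() {
	status, body, err := probe(context.Background(), "localhost:26258")
	fmt.Println(status, string(body), err)
}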

// FailOnInvalidDescriptors fails the test if there exists any descriptors in
// the crdb_internal.invalid_objects virtual table.
func (c *clusterImpl) FailOnInvalidDescriptors(ctx context.Context, db *gosql.DB, t *testImpl) {
t.L().Printf("checking for invalid descriptors")
if err := contextutil.RunWithTimeout(
ctx, "invalid descriptors check", 5*time.Minute,
ctx, "invalid descriptors check", 1*time.Minute,
func(ctx context.Context) error {
return roachtestutil.CheckInvalidDescriptors(db)
return roachtestutil.CheckInvalidDescriptors(ctx, db)
},
); err != nil {
t.Errorf("invalid descriptors check failed: %v", err)
@@ -1574,6 +1628,7 @@ func (c *clusterImpl) FailOnInvalidDescriptors(ctx context.Context, db *gosql.DB
// crdb_internal.check_consistency(true, '', '') indicates that any ranges'
// replicas are inconsistent with each other.
func (c *clusterImpl) FailOnReplicaDivergence(ctx context.Context, db *gosql.DB, t *testImpl) {
t.L().Printf("checking for replica divergence")
if err := contextutil.RunWithTimeout(
ctx, "consistency check", 5*time.Minute,
func(ctx context.Context) error {
@@ -2456,7 +2511,7 @@ func (c *clusterImpl) InternalAdminUIAddr(
return addrs, nil
}

// ExternalAdminUIAddr returns the internal Admin UI address in the form host:port
// ExternalAdminUIAddr returns the external Admin UI address in the form host:port
// for the specified node.
func (c *clusterImpl) ExternalAdminUIAddr(
ctx context.Context, l *logger.Logger, node option.NodeListOption,
@@ -133,7 +133,7 @@ func InstallFixtures(
}
}
// Extract fixture. Fail if there's already an LSM in the store dir.
c.Run(ctx, nodes, "cd {store-dir} && [ ! -f {store-dir}/CURRENT ] && tar -xf fixture.tgz")
c.Run(ctx, nodes, "ls {store-dir}/marker.* 1> /dev/null 2>&1 && exit 1 || (cd {store-dir} && tar -xf fixture.tgz)")
return nil
}
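
The shell change above swaps the old CURRENT-file check for a glob on the store's marker.* files before extracting the fixture. A hedged Go sketch of the same idea; the store path is a placeholder for roachprod's {store-dir}:

package main

import (
	"fmt"
	"path/filepath"
)

// hasLSM reports whether a store directory already holds a store, using the
// same signal as the shell one-liner above: the presence of any marker.*
// file in the directory.
func hasLSM(storeDir string) (bool, error) {
	matches, err := filepath.Glob(filepath.Join(storeDir, "marker.*"))
	if err != nil {
		return false, err
	}
	return len(matches) > 0, nil
}

func main() {
	ok, err := hasLSM("/mnt/data1/cockroach")
	fmt.Println(ok, err)
}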

6 changes: 4 additions & 2 deletions pkg/cmd/roachtest/roachtestutil/validation_check.go
@@ -64,12 +64,14 @@ WHERE t.status NOT IN ('RANGE_CONSISTENT', 'RANGE_INDETERMINATE')`)

// CheckInvalidDescriptors returns an error if there exists any descriptors in
// the crdb_internal.invalid_objects virtual table.
func CheckInvalidDescriptors(db *gosql.DB) error {
func CheckInvalidDescriptors(ctx context.Context, db *gosql.DB) error {
// Because crdb_internal.invalid_objects is a virtual table, by default, the
// query will take a lease on the database sqlDB is connected to and only run
// the query on the given database. The "" prefix prevents this lease
// acquisition and allows the query to fetch all descriptors in the cluster.
rows, err := db.Query(`SELECT id, obj_name, error FROM "".crdb_internal.invalid_objects`)
rows, err := db.QueryContext(ctx, `
SET statement_timeout = '1m';
SELECT id, obj_name, error FROM "".crdb_internal.invalid_objects`)
if err != nil {
return err
}
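
The visible hunk stops before the rows are consumed. A simplified, self-contained sketch of the same check; the driver choice, connection string, and report-only-the-first-failure behavior are assumptions for illustration:

package main

import (
	"context"
	gosql "database/sql"
	"fmt"
	"log"

	_ "github.com/lib/pq" // driver choice is an assumption for this sketch
)

// checkInvalidObjects bounds the query with a statement_timeout and uses the
// "" database prefix so the virtual table is scanned cluster-wide rather than
// only for the database the connection is leased on. For brevity, this sketch
// reports only the first invalid object it finds.
func checkInvalidObjects(ctx context.Context, db *gosql.DB) error {
	rows, err := db.QueryContext(ctx, `
SET statement_timeout = '1m';
SELECT id, obj_name, error FROM "".crdb_internal.invalid_objects`)
	if err != nil {
		return err
	}
	defer rows.Close()
	for rows.Next() {
		var id int64
		var objName, objErr string
		if err := rows.Scan(&id, &objName, &objErr); err != nil {
			return err
		}
		return fmt.Errorf("invalid descriptor %d (%s): %s", id, objName, objErr)
	}
	return rows.Err()
}

func main() {
	db, err := gosql.Open("postgres", "postgresql://root@localhost:26257?sslmode=disable")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()
	if err := checkInvalidObjects(context.Background(), db); err != nil {
		log.Fatal(err)
	}
}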
46 changes: 38 additions & 8 deletions pkg/cmd/roachtest/test_runner.go
@@ -13,6 +13,7 @@ package main
import (
"archive/zip"
"context"
gosql "database/sql"
"fmt"
"html"
"io"
@@ -1141,12 +1142,40 @@ func (r *testRunner) teardownTest(
}
}

// Detect dead nodes. This will call t.Error() when appropriate. Note that
// we do this even if t.Failed() since a down node is often the reason for
// the failure, and it's helpful to have the listing in the teardown logs
// as well (it is typically already in the main logs if the test used a
// monitor).
c.assertNoDeadNode(ctx, t)
// When a dead node is detected, the subsequent post validation queries are likely
// to hang (reason unclear), and eventually timeout according to the statement_timeout.
// If this occurs frequently enough, we can look at skipping post validations on a node
// failure (or even on any test failure).
if err := c.assertNoDeadNode(ctx, t); err != nil {
t.Error(err)
}

// We collect all the admin health endpoints in parallel,
// and select the first one that succeeds to run the validation queries
statuses, err := c.HealthStatus(ctx, t.L(), c.All())
if err != nil {
t.Error(errors.WithDetail(err, "Unable to check health status"))
}

var db *gosql.DB
var validationNode int
for _, s := range statuses {
if s.Err != nil {
t.L().Printf("n%d:/health?ready=1 error=%s", s.Node, s.Err)
continue
}

if s.Status != http.StatusOK {
t.L().Printf("n%d:/health?ready=1 status=%d body=%s", s.Node, s.Status, s.Body)
continue
}

if db == nil {
db = c.Conn(ctx, t.L(), s.Node)
validationNode = s.Node
}
t.L().Printf("n%d:/health?ready=1 status=200 ok", s.Node)
}

// We avoid trying to do this when t.Failed() (and in particular when there
// are dead nodes) because for reasons @tbg does not understand this gets
@@ -1156,10 +1185,11 @@ func (r *testRunner) teardownTest(
//
// TODO(testinfra): figure out why this can still get stuck despite the
// above.
db, node := c.ConnectToLiveNode(ctx, t)
if db != nil {
defer db.Close()
t.L().Printf("running validation checks on node %d (<10m)", node)
t.L().Printf("running validation checks on node %d (<10m)", validationNode)
// If this validation fails due to a timeout, it is very likely that
// the replica divergence check below will also fail.
if t.spec.SkipPostValidations&registry.PostValidationInvalidDescriptors == 0 {
c.FailOnInvalidDescriptors(ctx, db, t)
}
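
As a usage note, the selection loop above amounts to "pick the first node whose /health?ready=1 probe came back 200 OK". A small self-contained sketch of that selection, with a locally redeclared result type and made-up probe results for illustration:

package main

import (
	"fmt"
	"net/http"
)

// HealthStatusResult mirrors the fields the selection loop relies on.
type HealthStatusResult struct {
	Node   int
	Status int
	Body   []byte
	Err    error
}

// firstHealthyNode returns the 1-based node index of the first probe that
// came back HTTP 200 with no error, and false if no node qualified.
func firstHealthyNode(statuses []*HealthStatusResult) (int, bool) {
	for _, s := range statuses {
		if s.Err == nil && s.Status == http.StatusOK {
			return s.Node, true
		}
	}
	return 0, false
}

func main() {
	// Hypothetical probe results, for illustration only.
	statuses := []*HealthStatusResult{
		{Node: 1, Err: fmt.Errorf("connection refused")},
		{Node: 2, Status: http.StatusServiceUnavailable, Body: []byte("not ready")},
		{Node: 3, Status: http.StatusOK},
	}
	if n, ok := firstHealthyNode(statuses); ok {
		fmt.Printf("running validation checks against n%d\n", n)
	}
}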
2 changes: 1 addition & 1 deletion pkg/roachprod/install/cluster_synced.go
@@ -652,7 +652,7 @@ func (c *SyncedCluster) Monitor(

snippet := `
{{ if .IgnoreEmpty }}
if [ ! -f "{{.Store}}/CURRENT" ]; then
if ! ls {{.Store}}/marker.* 1> /dev/null 2>&1; then
echo "skipped"
exit 0
fi
