roachtest: backport recent improvements #104882

Merged
1 change: 1 addition & 0 deletions pkg/cmd/roachtest/BUILD.bazel
@@ -38,6 +38,7 @@ go_library(
"//pkg/util/allstacks",
"//pkg/util/contextutil",
"//pkg/util/ctxgroup",
"//pkg/util/httputil",
"//pkg/util/log",
"//pkg/util/quotapool",
"//pkg/util/randutil",
137 changes: 96 additions & 41 deletions pkg/cmd/roachtest/cluster.go
@@ -45,6 +45,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/roachprod/prometheus"
"github.com/cockroachdb/cockroach/pkg/roachprod/vm"
"github.com/cockroachdb/cockroach/pkg/util/contextutil"
"github.com/cockroachdb/cockroach/pkg/util/httputil"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/quotapool"
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
@@ -1393,6 +1394,7 @@ func (c *clusterImpl) CopyRoachprodState(ctx context.Context) error {
//
// `COCKROACH_DEBUG_TS_IMPORT_FILE=tsdump.gob ./cockroach start-single-node --insecure --store=$(mktemp -d)`
func (c *clusterImpl) FetchTimeseriesData(ctx context.Context, l *logger.Logger) error {
l.Printf("fetching timeseries data\n")
return contextutil.RunWithTimeout(ctx, "fetch tsdata", 5*time.Minute, func(ctx context.Context) error {
node := 1
for ; node <= c.spec.NodeCount; node++ {
@@ -1507,63 +1509,115 @@ func (c *clusterImpl) FetchDebugZip(ctx context.Context, l *logger.Logger) error
})
}

// checkNoDeadNode reports an error (via `t.Error`) if nodes that have a populated
// checkNoDeadNode returns an error if at least one of the nodes that have a populated
// data dir are found to be not running. It prints both to t.L() and the test
// output.
func (c *clusterImpl) assertNoDeadNode(ctx context.Context, t test.Test) {
func (c *clusterImpl) assertNoDeadNode(ctx context.Context, t test.Test) error {
if c.spec.NodeCount == 0 {
// No nodes can happen during unit tests and implies nothing to do.
return
return nil
}

_, err := roachprod.Monitor(ctx, t.L(), c.name, install.MonitorOpts{OneShot: true, IgnoreEmptyNodes: true})
// If there's an error, it means either that the monitor command failed
// completely, or that it found a dead node worth complaining about.
t.L().Printf("checking for dead nodes")
ch, err := roachprod.Monitor(ctx, t.L(), c.name, install.MonitorOpts{OneShot: true, IgnoreEmptyNodes: true})

// An error here means there was a problem initialising a SyncedCluster.
if err != nil {
t.Errorf("dead node detection: %s", err)
}
}

// ConnectToLiveNode returns a connection to a live node in the cluster. If no
// live node is found, it returns nil and -1. If a live node is found it returns
// a connection to it and the node's index.
func (c *clusterImpl) ConnectToLiveNode(ctx context.Context, t *testImpl) (*gosql.DB, int) {
node := -1
if c.spec.NodeCount < 1 {
return nil, node // unit tests
}
// Find a live node to run against, if one exists.
var db *gosql.DB
for i := 1; i <= c.spec.NodeCount; i++ {
// Don't hang forever.
if err := contextutil.RunWithTimeout(
ctx, "find live node", 5*time.Second,
func(ctx context.Context) error {
db = c.Conn(ctx, t.L(), i)
_, err := db.ExecContext(ctx, `;`)
return err
},
); err != nil {
_ = db.Close()
db = nil
continue
return err
}

isDead := func(msg string) bool {
if msg == "" || msg == "skipped" {
return false
}
// A numeric message is a PID and implies that the node is running.
_, err := strconv.Atoi(msg)
return err != nil
}

deadNodes := 0
for n := range ch {
// If there's an error, it means either that the monitor command failed
// completely, or that it found a dead node worth complaining about.
if n.Err != nil || isDead(n.Msg) {
deadNodes++
}
node = i
break

t.L().Printf("n%d: err=%v,msg=%s", n.Node, n.Err, n.Msg)
}
if db == nil {
return nil, node

if deadNodes > 0 {
return errors.Newf("%d dead node(s) detected", deadNodes)
}
return db, node
return nil
}
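To make the PID check above concrete, here is a small standalone sketch run against hypothetical monitor messages (the non-numeric message text is an assumption; only the numeric-PID vs. anything-else distinction matters):

package main

import (
	"fmt"
	"strconv"
)

// isDead mirrors the helper above: empty and "skipped" messages are ignored,
// a numeric message is a PID (node running), anything else counts as dead.
func isDead(msg string) bool {
	if msg == "" || msg == "skipped" {
		return false
	}
	_, err := strconv.Atoi(msg)
	return err != nil
}

func main() {
	for _, msg := range []string{"12345", "skipped", "", "dead (exit status 137)"} {
		fmt.Printf("%-26q dead=%t\n", msg, isDead(msg))
	}
}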

type HealthStatusResult struct {
Node int
Status int
Body []byte
Err error
}

func newHealthStatusResult(node int, status int, body []byte, err error) *HealthStatusResult {
return &HealthStatusResult{
Node: node,
Status: status,
Body: body,
Err: err,
}
}

// HealthStatus returns the result of the /health?ready=1 endpoint for each node.
func (c *clusterImpl) HealthStatus(
ctx context.Context, l *logger.Logger, node option.NodeListOption,
) ([]*HealthStatusResult, error) {
if len(node) < 1 {
return nil, nil // unit tests
}
adminAddrs, err := c.ExternalAdminUIAddr(ctx, l, node)
if err != nil {
return nil, errors.WithDetail(err, "Unable to get admin UI address(es)")
}
getStatus := func(ctx context.Context, node int) *HealthStatusResult {
url := fmt.Sprintf(`http://%s/health?ready=1`, adminAddrs[node-1])
resp, err := httputil.Get(ctx, url)
if err != nil {
return newHealthStatusResult(node, 0, nil, err)
}

defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)

return newHealthStatusResult(node, resp.StatusCode, body, err)
}

results := make([]*HealthStatusResult, c.spec.NodeCount)

_ = contextutil.RunWithTimeout(ctx, "health status", 15*time.Second, func(ctx context.Context) error {
var wg sync.WaitGroup
wg.Add(c.spec.NodeCount)
for i := 1; i <= c.spec.NodeCount; i++ {
go func(node int) {
defer wg.Done()
results[node-1] = getStatus(ctx, node)
}(i)
}
wg.Wait()
return nil
})

return results, nil
}
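A compressed sketch of the intended call pattern (the real consumer is in the test_runner.go hunk further down; pickValidationNode is an illustrative name, not part of this change, and it assumes the HealthStatusResult type above plus net/http):

// pickValidationNode returns the first node whose /health?ready=1 endpoint
// answered 200 OK, which can then be used to run the post-test validation queries.
func pickValidationNode(results []*HealthStatusResult) (int, bool) {
	for _, r := range results {
		if r == nil || r.Err != nil || r.Status != http.StatusOK {
			continue
		}
		return r.Node, true
	}
	return 0, false
}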

// FailOnInvalidDescriptors fails the test if there exists any descriptors in
// the crdb_internal.invalid_objects virtual table.
func (c *clusterImpl) FailOnInvalidDescriptors(ctx context.Context, db *gosql.DB, t *testImpl) {
t.L().Printf("checking for invalid descriptors")
if err := contextutil.RunWithTimeout(
ctx, "invalid descriptors check", 5*time.Minute,
ctx, "invalid descriptors check", 1*time.Minute,
func(ctx context.Context) error {
return roachtestutil.CheckInvalidDescriptors(db)
return roachtestutil.CheckInvalidDescriptors(ctx, db)
},
); err != nil {
t.Errorf("invalid descriptors check failed: %v", err)
@@ -1574,6 +1628,7 @@ func (c *clusterImpl) FailOnInvalidDescriptors(ctx context.Context, db *gosql.DB
// crdb_internal.check_consistency(true, '', '') indicates that any ranges'
// replicas are inconsistent with each other.
func (c *clusterImpl) FailOnReplicaDivergence(ctx context.Context, db *gosql.DB, t *testImpl) {
t.L().Printf("checking for replica divergence")
if err := contextutil.RunWithTimeout(
ctx, "consistency check", 5*time.Minute,
func(ctx context.Context) error {
@@ -2456,7 +2511,7 @@ func (c *clusterImpl) InternalAdminUIAddr(
return addrs, nil
}

// ExternalAdminUIAddr returns the internal Admin UI address in the form host:port
// ExternalAdminUIAddr returns the external Admin UI address in the form host:port
// for the specified node.
func (c *clusterImpl) ExternalAdminUIAddr(
ctx context.Context, l *logger.Logger, node option.NodeListOption,
@@ -133,7 +133,7 @@ func InstallFixtures(
}
}
// Extract fixture. Fail if there's already an LSM in the store dir.
c.Run(ctx, nodes, "cd {store-dir} && [ ! -f {store-dir}/CURRENT ] && tar -xf fixture.tgz")
c.Run(ctx, nodes, "ls {store-dir}/marker.* 1> /dev/null 2>&1 && exit 1 || (cd {store-dir} && tar -xf fixture.tgz)")
return nil
}

6 changes: 4 additions & 2 deletions pkg/cmd/roachtest/roachtestutil/validation_check.go
@@ -64,12 +64,14 @@ WHERE t.status NOT IN ('RANGE_CONSISTENT', 'RANGE_INDETERMINATE')`)

// CheckInvalidDescriptors returns an error if there exists any descriptors in
// the crdb_internal.invalid_objects virtual table.
func CheckInvalidDescriptors(db *gosql.DB) error {
func CheckInvalidDescriptors(ctx context.Context, db *gosql.DB) error {
// Because crdb_internal.invalid_objects is a virtual table, by default, the
// query will take a lease on the database sqlDB is connected to and only run
// the query on the given database. The "" prefix prevents this lease
// acquisition and allows the query to fetch all descriptors in the cluster.
rows, err := db.Query(`SELECT id, obj_name, error FROM "".crdb_internal.invalid_objects`)
rows, err := db.QueryContext(ctx, `
SET statement_timeout = '1m';
SELECT id, obj_name, error FROM "".crdb_internal.invalid_objects`)
if err != nil {
return err
}
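The rest of the function is collapsed in this view; a hedged sketch of how the returned rows are typically drained into an error (column order follows the query above; the exact error wording and helper usage are assumptions, not code from this PR):

// Sketch only: fold any invalid descriptors into a single error.
defer rows.Close()
var problems []string
for rows.Next() {
	var id int64
	var objName, errStr string
	if err := rows.Scan(&id, &objName, &errStr); err != nil {
		return err
	}
	problems = append(problems, fmt.Sprintf("descriptor %d (%s): %s", id, objName, errStr))
}
if err := rows.Err(); err != nil {
	return err
}
if len(problems) > 0 {
	return errors.Newf("invalid descriptors found:\n%s", strings.Join(problems, "\n"))
}
return nil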
46 changes: 38 additions & 8 deletions pkg/cmd/roachtest/test_runner.go
@@ -13,6 +13,7 @@ package main
import (
"archive/zip"
"context"
gosql "database/sql"
"fmt"
"html"
"io"
@@ -1141,12 +1142,40 @@ func (r *testRunner) teardownTest(
}
}

// Detect dead nodes. This will call t.Error() when appropriate. Note that
// we do this even if t.Failed() since a down node is often the reason for
// the failure, and it's helpful to have the listing in the teardown logs
// as well (it is typically already in the main logs if the test used a
// monitor).
c.assertNoDeadNode(ctx, t)
// When a dead node is detected, the subsequent post validation queries are likely
// to hang (reason unclear), and eventually timeout according to the statement_timeout.
// If this occurs frequently enough, we can look at skipping post validations on a node
// failure (or even on any test failure).
if err := c.assertNoDeadNode(ctx, t); err != nil {
t.Error(err)
}

// We collect all the admin health endpoints in parallel,
// and select the first one that succeeds to run the validation queries
statuses, err := c.HealthStatus(ctx, t.L(), c.All())
if err != nil {
t.Error(errors.WithDetail(err, "Unable to check health status"))
}

var db *gosql.DB
var validationNode int
for _, s := range statuses {
if s.Err != nil {
t.L().Printf("n%d:/health?ready=1 error=%s", s.Node, s.Err)
continue
}

if s.Status != http.StatusOK {
t.L().Printf("n%d:/health?ready=1 status=%d body=%s", s.Node, s.Status, s.Body)
continue
}

if db == nil {
db = c.Conn(ctx, t.L(), s.Node)
validationNode = s.Node
}
t.L().Printf("n%d:/health?ready=1 status=200 ok", s.Node)
}

// We avoid trying to do this when t.Failed() (and in particular when there
// are dead nodes) because for reasons @tbg does not understand this gets
@@ -1156,10 +1185,11 @@ func (r *testRunner) teardownTest(
//
// TODO(testinfra): figure out why this can still get stuck despite the
// above.
db, node := c.ConnectToLiveNode(ctx, t)
if db != nil {
defer db.Close()
t.L().Printf("running validation checks on node %d (<10m)", node)
t.L().Printf("running validation checks on node %d (<10m)", validationNode)
// If this validation fails due to a timeout, it is very likely that
// the replica divergence check below will also fail.
if t.spec.SkipPostValidations&registry.PostValidationInvalidDescriptors == 0 {
c.FailOnInvalidDescriptors(ctx, db, t)
}
2 changes: 1 addition & 1 deletion pkg/roachprod/install/cluster_synced.go
@@ -652,7 +652,7 @@ func (c *SyncedCluster) Monitor(

snippet := `
{{ if .IgnoreEmpty }}
if [ ! -f "{{.Store}}/CURRENT" ]; then
if ! ls {{.Store}}/marker.* 1> /dev/null 2>&1; then
echo "skipped"
exit 0
fi
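Both this monitor snippet and the fixture-install guard earlier now key off the marker.* files that Pebble writes, rather than the CURRENT file, to decide whether a store directory already holds an LSM. A rough Go equivalent of the shell test (the helper name and use of path/filepath are illustrative, not part of this change):

// storeIsInitialized is an illustrative stand-in for the shell check
// `ls <store>/marker.* 1> /dev/null 2>&1`.
func storeIsInitialized(storeDir string) (bool, error) {
	matches, err := filepath.Glob(filepath.Join(storeDir, "marker.*"))
	if err != nil {
		return false, err
	}
	return len(matches) > 0, nil
}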