diff --git a/pkg/cmd/roachtest/cluster.go b/pkg/cmd/roachtest/cluster.go index 2912588e41e6..e7c964708816 100644 --- a/pkg/cmd/roachtest/cluster.go +++ b/pkg/cmd/roachtest/cluster.go @@ -1338,15 +1338,14 @@ func (c *clusterImpl) assertNoDeadNode(ctx context.Context, t test.Test) { } } -// FailOnReplicaDivergence fails the test if -// crdb_internal.check_consistency(true, ”, ”) indicates that any ranges' -// replicas are inconsistent with each other. It uses the first node that -// is up to run the query. -func (c *clusterImpl) FailOnReplicaDivergence(ctx context.Context, t *testImpl) { +// ConnectToLiveNode returns a connection to a live node in the cluster. If no +// live node is found, it returns nil and -1. If a live node is found it returns +// a connection to it and the node's index. +func (c *clusterImpl) ConnectToLiveNode(ctx context.Context, t *testImpl) (*gosql.DB, int) { + node := -1 if c.spec.NodeCount < 1 { - return // unit tests + return nil, node // unit tests } - // Find a live node to run against, if one exists. var db *gosql.DB for i := 1; i <= c.spec.NodeCount; i++ { @@ -1363,15 +1362,32 @@ func (c *clusterImpl) FailOnReplicaDivergence(ctx context.Context, t *testImpl) db = nil continue } - t.L().Printf("running (fast) consistency checks on node %d", i) + node = i break } if db == nil { - t.L().Printf("no live node found, skipping consistency check") - return + return nil, node + } + return db, node +} + +// FailOnInvalidDescriptors fails the test if there exists any descriptors in +// the crdb_internal.invalid_objects virtual table. +func (c *clusterImpl) FailOnInvalidDescriptors(ctx context.Context, db *gosql.DB, t *testImpl) { + if err := contextutil.RunWithTimeout( + ctx, "invalid descriptors check", 5*time.Minute, + func(ctx context.Context) error { + return roachtestutil.CheckInvalidDescriptors(db) + }, + ); err != nil { + t.Errorf("invalid descriptors check failed: %v", err) } - defer db.Close() +} +// FailOnReplicaDivergence fails the test if +// crdb_internal.check_consistency(true, ”, ”) indicates that any ranges' +// replicas are inconsistent with each other. +func (c *clusterImpl) FailOnReplicaDivergence(ctx context.Context, db *gosql.DB, t *testImpl) { if err := contextutil.RunWithTimeout( ctx, "consistency check", 5*time.Minute, func(ctx context.Context) error { diff --git a/pkg/cmd/roachtest/roachtestutil/BUILD.bazel b/pkg/cmd/roachtest/roachtestutil/BUILD.bazel index affd838e4ff7..2472bf06ef2e 100644 --- a/pkg/cmd/roachtest/roachtestutil/BUILD.bazel +++ b/pkg/cmd/roachtest/roachtestutil/BUILD.bazel @@ -4,14 +4,15 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library") go_library( name = "roachtestutil", srcs = [ - "consistency_check.go", "jaeger.go", + "validation_check.go", ], importpath = "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil", visibility = ["//visibility:public"], deps = [ "//pkg/cmd/roachtest/cluster", "//pkg/roachprod/logger", + "//pkg/testutils/sqlutils", "@com_github_cockroachdb_errors//:errors", ], ) diff --git a/pkg/cmd/roachtest/roachtestutil/consistency_check.go b/pkg/cmd/roachtest/roachtestutil/validation_check.go similarity index 70% rename from pkg/cmd/roachtest/roachtestutil/consistency_check.go rename to pkg/cmd/roachtest/roachtestutil/validation_check.go index 529dbd0a9081..36afef6abcb8 100644 --- a/pkg/cmd/roachtest/roachtestutil/consistency_check.go +++ b/pkg/cmd/roachtest/roachtestutil/validation_check.go @@ -16,6 +16,7 @@ import ( gosql "database/sql" "github.com/cockroachdb/cockroach/pkg/roachprod/logger" + "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" "github.com/cockroachdb/errors" ) @@ -60,3 +61,24 @@ WHERE t.status NOT IN ('RANGE_CONSISTENT', 'RANGE_INDETERMINATE')`) } return finalErr } + +// CheckInvalidDescriptors returns an error if there exists any descriptors in +// the crdb_internal.invalid_objects virtual table. +func CheckInvalidDescriptors(db *gosql.DB) error { + // Because crdb_internal.invalid_objects is a virtual table, by default, the + // query will take a lease on the database sqlDB is connected to and only run + // the query on the given database. The "" prefix prevents this lease + // acquisition and allows the query to fetch all descriptors in the cluster. + rows, err := db.Query(`SELECT id, obj_name, error FROM "".crdb_internal.invalid_objects`) + if err != nil { + return err + } + invalidIDs, err := sqlutils.RowsToDataDrivenOutput(rows) + if err != nil { + return err + } + if invalidIDs != "" { + return errors.Errorf("the following descriptor ids are invalid\n%v", invalidIDs) + } + return nil +} diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go index ed3050bfbb34..5bbb8657b988 100644 --- a/pkg/cmd/roachtest/test_runner.go +++ b/pkg/cmd/roachtest/test_runner.go @@ -1068,9 +1068,6 @@ func (r *testRunner) teardownTest( // monitor). c.assertNoDeadNode(ctx, t) - // Detect replica divergence (i.e. ranges in which replicas have arrived - // at the same log position with different states). - // // We avoid trying to do this when t.Failed() (and in particular when there // are dead nodes) because for reasons @tbg does not understand this gets // stuck occasionally, which really ruins the roachtest run. The method @@ -1079,7 +1076,17 @@ func (r *testRunner) teardownTest( // // TODO(testinfra): figure out why this can still get stuck despite the // above. - c.FailOnReplicaDivergence(ctx, t) + db, node := c.ConnectToLiveNode(ctx, t) + if db != nil { + defer db.Close() + t.L().Printf("running validation checks on node %d (<10m)", node) + c.FailOnInvalidDescriptors(ctx, db, t) + // Detect replica divergence (i.e. ranges in which replicas have arrived + // at the same log position with different states). + c.FailOnReplicaDivergence(ctx, db, t) + } else { + t.L().Printf("no live node found, skipping validation checks") + } if timedOut || t.Failed() { r.collectClusterArtifacts(ctx, c, t.L())