diff --git a/pkg/cmd/roachtest/cluster.go b/pkg/cmd/roachtest/cluster.go index 76290919b6fc..7a8ed5efc453 100644 --- a/pkg/cmd/roachtest/cluster.go +++ b/pkg/cmd/roachtest/cluster.go @@ -1347,20 +1347,11 @@ func (c *clusterImpl) assertNoDeadNode(ctx context.Context, t test.Test) error { return err } - isDead := func(msg string) bool { - if msg == "" || msg == "skipped" { - return false - } - // A numeric message is a PID and implies that the node is running. - _, err := strconv.Atoi(msg) - return err != nil - } - deadNodes := 0 for n := range ch { // If there's an error, it means either that the monitor command failed // completely, or that it found a dead node worth complaining about. - if n.Err != nil || isDead(n.Msg) { + if n.Err != nil || strings.HasPrefix(n.Msg, "dead") { deadNodes++ } diff --git a/pkg/cmd/roachtest/registry/test_spec.go b/pkg/cmd/roachtest/registry/test_spec.go index 08728136e591..d68d895d2c35 100644 --- a/pkg/cmd/roachtest/registry/test_spec.go +++ b/pkg/cmd/roachtest/registry/test_spec.go @@ -110,6 +110,8 @@ const ( // PostValidationInvalidDescriptors checks if there exists any descriptors in // the crdb_internal.invalid_objects virtual table. PostValidationInvalidDescriptors + // PostValidationNoDeadNodes checks if there are any dead nodes in the cluster. + PostValidationNoDeadNodes ) // MatchType is the type of match a file has to a TestFilter. diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go index 69989a1f2313..39c24979d275 100644 --- a/pkg/cmd/roachtest/test_runner.go +++ b/pkg/cmd/roachtest/test_runner.go @@ -1084,7 +1084,12 @@ func (r *testRunner) teardownTest( // If this occurs frequently enough, we can look at skipping post validations on a node // failure (or even on any test failure). if err := c.assertNoDeadNode(ctx, t); err != nil { - t.Error(err) + // Some tests expect dead nodes, so they may opt out of this check. + if t.spec.SkipPostValidations®istry.PostValidationNoDeadNodes == 0 { + t.Error(err) + } else { + t.L().Printf("dead node(s) detected but expected") + } } // We avoid trying to do this when t.Failed() (and in particular when there diff --git a/pkg/cmd/roachtest/tests/decommissionbench.go b/pkg/cmd/roachtest/tests/decommissionbench.go index bc8b2fa39fd2..e0cb1de54dc5 100644 --- a/pkg/cmd/roachtest/tests/decommissionbench.go +++ b/pkg/cmd/roachtest/tests/decommissionbench.go @@ -292,9 +292,10 @@ func registerDecommissionBenchSpec(r registry.Registry, benchSpec decommissionBe benchSpec.nodes+addlNodeCount+1, specOptions..., ), - Timeout: timeout, - NonReleaseBlocker: true, - Skip: benchSpec.skip, + SkipPostValidations: registry.PostValidationNoDeadNodes, + Timeout: timeout, + NonReleaseBlocker: true, + Skip: benchSpec.skip, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { if benchSpec.duration > 0 { runDecommissionBenchLong(ctx, t, c, benchSpec, timeout) diff --git a/pkg/cmd/roachtest/tests/disk_stall.go b/pkg/cmd/roachtest/tests/disk_stall.go index 867617018154..3e68fb054e59 100644 --- a/pkg/cmd/roachtest/tests/disk_stall.go +++ b/pkg/cmd/roachtest/tests/disk_stall.go @@ -55,10 +55,11 @@ func registerDiskStalledDetection(r registry.Registry) { for name, makeStaller := range stallers { name, makeStaller := name, makeStaller r.Add(registry.TestSpec{ - Name: fmt.Sprintf("disk-stalled/%s", name), - Owner: registry.OwnerStorage, - Cluster: makeSpec(), - Timeout: 30 * time.Minute, + Name: fmt.Sprintf("disk-stalled/%s", name), + Owner: registry.OwnerStorage, + Cluster: makeSpec(), + Timeout: 30 * time.Minute, + SkipPostValidations: registry.PostValidationNoDeadNodes, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { runDiskStalledDetection(ctx, t, c, makeStaller(t, c), true /* doStall */) }, diff --git a/pkg/cmd/roachtest/tests/drain.go b/pkg/cmd/roachtest/tests/drain.go index 2e3188a36cc6..71df5b6db70a 100644 --- a/pkg/cmd/roachtest/tests/drain.go +++ b/pkg/cmd/roachtest/tests/drain.go @@ -55,9 +55,10 @@ func registerDrain(r registry.Registry) { }) r.Add(registry.TestSpec{ - Name: "drain/not-at-quorum", - Owner: registry.OwnerSQLSessions, - Cluster: r.MakeClusterSpec(3), + Name: "drain/not-at-quorum", + Owner: registry.OwnerSQLSessions, + Cluster: r.MakeClusterSpec(3), + SkipPostValidations: registry.PostValidationNoDeadNodes, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { runClusterNotAtQuorum(ctx, t, c) }, diff --git a/pkg/cmd/roachtest/tests/failover.go b/pkg/cmd/roachtest/tests/failover.go index ec1e9cadbef0..cadcdd1c25b6 100644 --- a/pkg/cmd/roachtest/tests/failover.go +++ b/pkg/cmd/roachtest/tests/failover.go @@ -55,29 +55,36 @@ func registerFailover(r registry.Registry) { } return s } + var postValidation registry.PostValidation = 0 + if failureMode == failureModeDiskStall { + postValidation = registry.PostValidationNoDeadNodes + } r.Add(registry.TestSpec{ - Name: fmt.Sprintf("failover/non-system/%s", failureMode), - Owner: registry.OwnerKV, - Timeout: 30 * time.Minute, - Cluster: makeSpec(7 /* nodes */, 4 /* cpus */), + Name: fmt.Sprintf("failover/non-system/%s", failureMode), + Owner: registry.OwnerKV, + Timeout: 30 * time.Minute, + SkipPostValidations: postValidation, + Cluster: makeSpec(7 /* nodes */, 4 /* cpus */), Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { runFailoverNonSystem(ctx, t, c, failureMode) }, }) r.Add(registry.TestSpec{ - Name: fmt.Sprintf("failover/liveness/%s", failureMode), - Owner: registry.OwnerKV, - Timeout: 30 * time.Minute, - Cluster: makeSpec(5 /* nodes */, 4 /* cpus */), + Name: fmt.Sprintf("failover/liveness/%s", failureMode), + Owner: registry.OwnerKV, + Timeout: 30 * time.Minute, + SkipPostValidations: postValidation, + Cluster: makeSpec(5 /* nodes */, 4 /* cpus */), Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { runFailoverLiveness(ctx, t, c, failureMode) }, }) r.Add(registry.TestSpec{ - Name: fmt.Sprintf("failover/system-non-liveness/%s", failureMode), - Owner: registry.OwnerKV, - Timeout: 30 * time.Minute, - Cluster: makeSpec(7 /* nodes */, 4 /* cpus */), + Name: fmt.Sprintf("failover/system-non-liveness/%s", failureMode), + Owner: registry.OwnerKV, + Timeout: 30 * time.Minute, + SkipPostValidations: postValidation, + Cluster: makeSpec(7 /* nodes */, 4 /* cpus */), Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { runFailoverSystemNonLiveness(ctx, t, c, failureMode) }, diff --git a/pkg/cmd/roachtest/tests/kv.go b/pkg/cmd/roachtest/tests/kv.go index 373b7b4477c1..cf6ca81a1d58 100644 --- a/pkg/cmd/roachtest/tests/kv.go +++ b/pkg/cmd/roachtest/tests/kv.go @@ -410,9 +410,10 @@ func registerKVContention(r registry.Registry) { func registerKVQuiescenceDead(r registry.Registry) { r.Add(registry.TestSpec{ - Name: "kv/quiescence/nodes=3", - Owner: registry.OwnerKV, - Cluster: r.MakeClusterSpec(4), + Name: "kv/quiescence/nodes=3", + Owner: registry.OwnerKV, + Cluster: r.MakeClusterSpec(4), + SkipPostValidations: registry.PostValidationNoDeadNodes, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { nodes := c.Spec().NodeCount - 1 c.Put(ctx, t.Cockroach(), "./cockroach", c.Range(1, nodes)) diff --git a/pkg/cmd/roachtest/tests/loss_of_quorum_recovery.go b/pkg/cmd/roachtest/tests/loss_of_quorum_recovery.go index 51eca1a16fb8..bfacd635bbbd 100644 --- a/pkg/cmd/roachtest/tests/loss_of_quorum_recovery.go +++ b/pkg/cmd/roachtest/tests/loss_of_quorum_recovery.go @@ -73,7 +73,7 @@ func registerLOQRecovery(r registry.Registry) { Owner: registry.OwnerReplication, Tags: registry.Tags(`default`), Cluster: spec, - SkipPostValidations: registry.PostValidationInvalidDescriptors, + SkipPostValidations: registry.PostValidationInvalidDescriptors | registry.PostValidationNoDeadNodes, NonReleaseBlocker: true, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { runRecoverLossOfQuorum(ctx, t, c, testSpec) @@ -84,7 +84,7 @@ func registerLOQRecovery(r registry.Registry) { Owner: registry.OwnerReplication, Tags: registry.Tags(`default`), Cluster: spec, - SkipPostValidations: registry.PostValidationInvalidDescriptors, + SkipPostValidations: registry.PostValidationInvalidDescriptors | registry.PostValidationNoDeadNodes, NonReleaseBlocker: true, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { runHalfOnlineRecoverLossOfQuorum(ctx, t, c, testSpec)