Skip to content

Commit

Permalink
roachtest: Allow roachtests to opt out of failing in post validation when dead nodes are detected.
Browse files Browse the repository at this point in the history

Epic: none
Fixes: cockroachdb#102131

Release note: None
  • Loading branch information
Miral Gadani committed Apr 24, 2023
1 parent 06a8d34 commit b4bdafc
Show file tree
Hide file tree
Showing 9 changed files with 47 additions and 38 deletions.
11 changes: 1 addition & 10 deletions pkg/cmd/roachtest/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -1347,20 +1347,11 @@ func (c *clusterImpl) assertNoDeadNode(ctx context.Context, t test.Test) error {
return err
}

isDead := func(msg string) bool {
if msg == "" || msg == "skipped" {
return false
}
// A numeric message is a PID and implies that the node is running.
_, err := strconv.Atoi(msg)
return err != nil
}

deadNodes := 0
for n := range ch {
// If there's an error, it means either that the monitor command failed
// completely, or that it found a dead node worth complaining about.
if n.Err != nil || isDead(n.Msg) {
if n.Err != nil || strings.HasPrefix(n.Msg, "dead") {
deadNodes++
}

Expand Down
2 changes: 2 additions & 0 deletions pkg/cmd/roachtest/registry/test_spec.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,8 @@ const (
// PostValidationInvalidDescriptors checks if there exists any descriptors in
// the crdb_internal.invalid_objects virtual table.
PostValidationInvalidDescriptors
// PostValidationNoDeadNodes checks if there are any dead nodes in the cluster.
PostValidationNoDeadNodes
)

// MatchType is the type of match a file has to a TestFilter.
Expand Down
7 changes: 6 additions & 1 deletion pkg/cmd/roachtest/test_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -1084,7 +1084,12 @@ func (r *testRunner) teardownTest(
// If this occurs frequently enough, we can look at skipping post validations on a node
// failure (or even on any test failure).
if err := c.assertNoDeadNode(ctx, t); err != nil {
t.Error(err)
// Some tests expect dead nodes, so they may opt out of this check.
if t.spec.SkipPostValidations&registry.PostValidationNoDeadNodes == 0 {
t.Error(err)
} else {
t.L().Printf("dead node(s) detected but expected")
}
}

// We avoid trying to do this when t.Failed() (and in particular when there
Expand Down
7 changes: 4 additions & 3 deletions pkg/cmd/roachtest/tests/decommissionbench.go
Original file line number Diff line number Diff line change
Expand Up @@ -292,9 +292,10 @@ func registerDecommissionBenchSpec(r registry.Registry, benchSpec decommissionBe
benchSpec.nodes+addlNodeCount+1,
specOptions...,
),
Timeout: timeout,
NonReleaseBlocker: true,
Skip: benchSpec.skip,
SkipPostValidations: registry.PostValidationNoDeadNodes,
Timeout: timeout,
NonReleaseBlocker: true,
Skip: benchSpec.skip,
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
if benchSpec.duration > 0 {
runDecommissionBenchLong(ctx, t, c, benchSpec, timeout)
Expand Down
9 changes: 5 additions & 4 deletions pkg/cmd/roachtest/tests/disk_stall.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,11 @@ func registerDiskStalledDetection(r registry.Registry) {
for name, makeStaller := range stallers {
name, makeStaller := name, makeStaller
r.Add(registry.TestSpec{
Name: fmt.Sprintf("disk-stalled/%s", name),
Owner: registry.OwnerStorage,
Cluster: makeSpec(),
Timeout: 30 * time.Minute,
Name: fmt.Sprintf("disk-stalled/%s", name),
Owner: registry.OwnerStorage,
Cluster: makeSpec(),
Timeout: 30 * time.Minute,
SkipPostValidations: registry.PostValidationNoDeadNodes,
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
runDiskStalledDetection(ctx, t, c, makeStaller(t, c), true /* doStall */)
},
Expand Down
7 changes: 4 additions & 3 deletions pkg/cmd/roachtest/tests/drain.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,10 @@ func registerDrain(r registry.Registry) {
})

r.Add(registry.TestSpec{
Name: "drain/not-at-quorum",
Owner: registry.OwnerSQLSessions,
Cluster: r.MakeClusterSpec(3),
Name: "drain/not-at-quorum",
Owner: registry.OwnerSQLSessions,
Cluster: r.MakeClusterSpec(3),
SkipPostValidations: registry.PostValidationNoDeadNodes,
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
runClusterNotAtQuorum(ctx, t, c)
},
Expand Down
31 changes: 19 additions & 12 deletions pkg/cmd/roachtest/tests/failover.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,29 +55,36 @@ func registerFailover(r registry.Registry) {
}
return s
}
var postValidation registry.PostValidation = 0
if failureMode == failureModeDiskStall {
postValidation = registry.PostValidationNoDeadNodes
}
r.Add(registry.TestSpec{
Name: fmt.Sprintf("failover/non-system/%s", failureMode),
Owner: registry.OwnerKV,
Timeout: 30 * time.Minute,
Cluster: makeSpec(7 /* nodes */, 4 /* cpus */),
Name: fmt.Sprintf("failover/non-system/%s", failureMode),
Owner: registry.OwnerKV,
Timeout: 30 * time.Minute,
SkipPostValidations: postValidation,
Cluster: makeSpec(7 /* nodes */, 4 /* cpus */),
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
runFailoverNonSystem(ctx, t, c, failureMode)
},
})
r.Add(registry.TestSpec{
Name: fmt.Sprintf("failover/liveness/%s", failureMode),
Owner: registry.OwnerKV,
Timeout: 30 * time.Minute,
Cluster: makeSpec(5 /* nodes */, 4 /* cpus */),
Name: fmt.Sprintf("failover/liveness/%s", failureMode),
Owner: registry.OwnerKV,
Timeout: 30 * time.Minute,
SkipPostValidations: postValidation,
Cluster: makeSpec(5 /* nodes */, 4 /* cpus */),
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
runFailoverLiveness(ctx, t, c, failureMode)
},
})
r.Add(registry.TestSpec{
Name: fmt.Sprintf("failover/system-non-liveness/%s", failureMode),
Owner: registry.OwnerKV,
Timeout: 30 * time.Minute,
Cluster: makeSpec(7 /* nodes */, 4 /* cpus */),
Name: fmt.Sprintf("failover/system-non-liveness/%s", failureMode),
Owner: registry.OwnerKV,
Timeout: 30 * time.Minute,
SkipPostValidations: postValidation,
Cluster: makeSpec(7 /* nodes */, 4 /* cpus */),
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
runFailoverSystemNonLiveness(ctx, t, c, failureMode)
},
Expand Down
7 changes: 4 additions & 3 deletions pkg/cmd/roachtest/tests/kv.go
Original file line number Diff line number Diff line change
Expand Up @@ -410,9 +410,10 @@ func registerKVContention(r registry.Registry) {

func registerKVQuiescenceDead(r registry.Registry) {
r.Add(registry.TestSpec{
Name: "kv/quiescence/nodes=3",
Owner: registry.OwnerKV,
Cluster: r.MakeClusterSpec(4),
Name: "kv/quiescence/nodes=3",
Owner: registry.OwnerKV,
Cluster: r.MakeClusterSpec(4),
SkipPostValidations: registry.PostValidationNoDeadNodes,
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
nodes := c.Spec().NodeCount - 1
c.Put(ctx, t.Cockroach(), "./cockroach", c.Range(1, nodes))
Expand Down
4 changes: 2 additions & 2 deletions pkg/cmd/roachtest/tests/loss_of_quorum_recovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ func registerLOQRecovery(r registry.Registry) {
Owner: registry.OwnerReplication,
Tags: registry.Tags(`default`),
Cluster: spec,
SkipPostValidations: registry.PostValidationInvalidDescriptors,
SkipPostValidations: registry.PostValidationInvalidDescriptors | registry.PostValidationNoDeadNodes,
NonReleaseBlocker: true,
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
runRecoverLossOfQuorum(ctx, t, c, testSpec)
Expand All @@ -84,7 +84,7 @@ func registerLOQRecovery(r registry.Registry) {
Owner: registry.OwnerReplication,
Tags: registry.Tags(`default`),
Cluster: spec,
SkipPostValidations: registry.PostValidationInvalidDescriptors,
SkipPostValidations: registry.PostValidationInvalidDescriptors | registry.PostValidationNoDeadNodes,
NonReleaseBlocker: true,
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
runHalfOnlineRecoverLossOfQuorum(ctx, t, c, testSpec)
Expand Down

0 comments on commit b4bdafc

Please sign in to comment.