Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
119655: roachtest: skip initialization on failure restarts r=nvanbenschoten a=andrewbaptist

Previously, a node restart in the failover code would pass SkipInit = false. This resulted in re-issuing grants, which caused a schema change on every restart. By passing SkipInit = true, this code is bypassed.

Informs: cockroachdb#119347

Release note: None

Co-authored-by: Andrew Baptist <[email protected]>
  • Loading branch information
craig[bot] and andrewbaptist committed Apr 12, 2024
2 parents 7bc6b2c + 9245035 commit 7618f92
Showing 1 changed file with 32 additions and 41 deletions.
73 changes: 32 additions & 41 deletions pkg/cmd/roachtest/tests/failover.go
Original file line number Diff line number Diff line change
Expand Up @@ -206,8 +206,6 @@ func runFailoverChaos(ctx context.Context, t test.Test, c cluster.Cluster, readO
rng, _ := randutil.NewTestRand()

// Create cluster, and set up failers for all failure modes.
opts := option.DefaultStartOpts()
opts.RoachprodOpts.ScheduleBackups = false
settings := install.MakeClusterSettings()
settings.Env = append(settings.Env, "COCKROACH_ENABLE_UNSAFE_TEST_BUILTINS=true")
settings.Env = append(settings.Env, "COCKROACH_SCAN_MAX_IDLE_TIME=100ms") // speed up replication
Expand All @@ -216,7 +214,7 @@ func runFailoverChaos(ctx context.Context, t test.Test, c cluster.Cluster, readO

failers := []Failer{}
for _, failureMode := range allFailureModes {
failer := makeFailerWithoutLocalNoop(t, c, m, failureMode, opts, settings, rng)
failer := makeFailerWithoutLocalNoop(t, c, m, failureMode, settings, rng)
if c.IsLocal() && !failer.CanUseLocal() {
t.L().Printf("skipping failure mode %q on local cluster", failureMode)
continue
Expand All @@ -226,7 +224,7 @@ func runFailoverChaos(ctx context.Context, t test.Test, c cluster.Cluster, readO
failers = append(failers, failer)
}

c.Start(ctx, t.L(), opts, settings, c.Range(1, 9))
c.Start(ctx, t.L(), failoverStartOpts(), settings, c.Range(1, 9))

conn := c.Conn(ctx, t.L(), 1)

Expand Down Expand Up @@ -401,18 +399,16 @@ func runFailoverPartialLeaseGateway(ctx context.Context, t test.Test, c cluster.
rng, _ := randutil.NewTestRand()

// Create cluster.
opts := option.DefaultStartOpts()
opts.RoachprodOpts.ScheduleBackups = false
settings := install.MakeClusterSettings()
settings.Env = append(settings.Env, "COCKROACH_SCAN_MAX_IDLE_TIME=100ms") // speed up replication

m := c.NewMonitor(ctx, c.Range(1, 7))

failer := makeFailer(t, c, m, failureModeBlackhole, opts, settings, rng).(PartialFailer)
failer := makeFailer(t, c, m, failureModeBlackhole, settings, rng).(PartialFailer)
failer.Setup(ctx)
defer failer.Cleanup(ctx)

c.Start(ctx, t.L(), opts, settings, c.Range(1, 7))
c.Start(ctx, t.L(), failoverStartOpts(), settings, c.Range(1, 7))

conn := c.Conn(ctx, t.L(), 1)

Expand Down Expand Up @@ -536,19 +532,17 @@ func runFailoverPartialLeaseLeader(ctx context.Context, t test.Test, c cluster.C
// Create cluster, disabling leader/leaseholder colocation. We only start
// n1-n3, to precisely place system ranges, since we'll have to disable the
// replicate queue shortly.
opts := option.DefaultStartOpts()
opts.RoachprodOpts.ScheduleBackups = false
settings := install.MakeClusterSettings()
settings.Env = append(settings.Env, "COCKROACH_DISABLE_LEADER_FOLLOWS_LEASEHOLDER=true")
settings.Env = append(settings.Env, "COCKROACH_SCAN_MAX_IDLE_TIME=100ms") // speed up replication

m := c.NewMonitor(ctx, c.Range(1, 6))

failer := makeFailer(t, c, m, failureModeBlackhole, opts, settings, rng).(PartialFailer)
failer := makeFailer(t, c, m, failureModeBlackhole, settings, rng).(PartialFailer)
failer.Setup(ctx)
defer failer.Cleanup(ctx)

c.Start(ctx, t.L(), opts, settings, c.Range(1, 3))
c.Start(ctx, t.L(), failoverStartOpts(), settings, c.Range(1, 3))

conn := c.Conn(ctx, t.L(), 1)

Expand All @@ -560,7 +554,7 @@ func runFailoverPartialLeaseLeader(ctx context.Context, t test.Test, c cluster.C
require.NoError(t, WaitForReplication(ctx, t, t.L(), conn, 3, exactlyReplicationFactor))

// Now that system ranges are properly placed on n1-n3, start n4-n6.
c.Start(ctx, t.L(), opts, settings, c.Range(4, 6))
c.Start(ctx, t.L(), failoverStartOpts(), settings, c.Range(4, 6))

// Create the kv database on n4-n6.
t.L().Printf("creating workload database")
Expand Down Expand Up @@ -668,18 +662,16 @@ func runFailoverPartialLeaseLiveness(ctx context.Context, t test.Test, c cluster
rng, _ := randutil.NewTestRand()

// Create cluster.
opts := option.DefaultStartOpts()
opts.RoachprodOpts.ScheduleBackups = false
settings := install.MakeClusterSettings()
settings.Env = append(settings.Env, "COCKROACH_SCAN_MAX_IDLE_TIME=100ms") // speed up replication

m := c.NewMonitor(ctx, c.Range(1, 7))

failer := makeFailer(t, c, m, failureModeBlackhole, opts, settings, rng).(PartialFailer)
failer := makeFailer(t, c, m, failureModeBlackhole, settings, rng).(PartialFailer)
failer.Setup(ctx)
defer failer.Cleanup(ctx)

c.Start(ctx, t.L(), opts, settings, c.Range(1, 7))
c.Start(ctx, t.L(), failoverStartOpts(), settings, c.Range(1, 7))

conn := c.Conn(ctx, t.L(), 1)

Expand Down Expand Up @@ -785,19 +777,17 @@ func runFailoverNonSystem(
rng, _ := randutil.NewTestRand()

// Create cluster.
opts := option.DefaultStartOpts()
opts.RoachprodOpts.ScheduleBackups = false
settings := install.MakeClusterSettings()
settings.Env = append(settings.Env, "COCKROACH_ENABLE_UNSAFE_TEST_BUILTINS=true")
settings.Env = append(settings.Env, "COCKROACH_SCAN_MAX_IDLE_TIME=100ms") // speed up replication

m := c.NewMonitor(ctx, c.Range(1, 6))

failer := makeFailer(t, c, m, failureMode, opts, settings, rng)
failer := makeFailer(t, c, m, failureMode, settings, rng)
failer.Setup(ctx)
defer failer.Cleanup(ctx)

c.Start(ctx, t.L(), opts, settings, c.Range(1, 6))
c.Start(ctx, t.L(), failoverStartOpts(), settings, c.Range(1, 6))

conn := c.Conn(ctx, t.L(), 1)

Expand Down Expand Up @@ -894,19 +884,17 @@ func runFailoverLiveness(
rng, _ := randutil.NewTestRand()

// Create cluster.
opts := option.DefaultStartOpts()
opts.RoachprodOpts.ScheduleBackups = false
settings := install.MakeClusterSettings()
settings.Env = append(settings.Env, "COCKROACH_ENABLE_UNSAFE_TEST_BUILTINS=true")
settings.Env = append(settings.Env, "COCKROACH_SCAN_MAX_IDLE_TIME=100ms") // speed up replication

m := c.NewMonitor(ctx, c.Range(1, 4))

failer := makeFailer(t, c, m, failureMode, opts, settings, rng)
failer := makeFailer(t, c, m, failureMode, settings, rng)
failer.Setup(ctx)
defer failer.Cleanup(ctx)

c.Start(ctx, t.L(), opts, settings, c.Range(1, 4))
c.Start(ctx, t.L(), failoverStartOpts(), settings, c.Range(1, 4))

conn := c.Conn(ctx, t.L(), 1)

Expand Down Expand Up @@ -1009,19 +997,17 @@ func runFailoverSystemNonLiveness(
rng, _ := randutil.NewTestRand()

// Create cluster.
opts := option.DefaultStartOpts()
opts.RoachprodOpts.ScheduleBackups = false
settings := install.MakeClusterSettings()
settings.Env = append(settings.Env, "COCKROACH_ENABLE_UNSAFE_TEST_BUILTINS=true")
settings.Env = append(settings.Env, "COCKROACH_SCAN_MAX_IDLE_TIME=100ms") // speed up replication

m := c.NewMonitor(ctx, c.Range(1, 6))

failer := makeFailer(t, c, m, failureMode, opts, settings, rng)
failer := makeFailer(t, c, m, failureMode, settings, rng)
failer.Setup(ctx)
defer failer.Cleanup(ctx)

c.Start(ctx, t.L(), opts, settings, c.Range(1, 6))
c.Start(ctx, t.L(), failoverStartOpts(), settings, c.Range(1, 6))

conn := c.Conn(ctx, t.L(), 1)

Expand Down Expand Up @@ -1131,11 +1117,10 @@ func makeFailer(
c cluster.Cluster,
m cluster.Monitor,
failureMode failureMode,
opts option.StartOpts,
settings install.ClusterSettings,
rng *rand.Rand,
) Failer {
f := makeFailerWithoutLocalNoop(t, c, m, failureMode, opts, settings, rng)
f := makeFailerWithoutLocalNoop(t, c, m, failureMode, settings, rng)
if c.IsLocal() && !f.CanUseLocal() {
t.L().Printf(
`failure mode %q not supported on local clusters, using "noop" failure mode instead`,
Expand All @@ -1150,7 +1135,6 @@ func makeFailerWithoutLocalNoop(
c cluster.Cluster,
m cluster.Monitor,
failureMode failureMode,
opts option.StartOpts,
settings install.ClusterSettings,
rng *rand.Rand,
) Failer {
Expand Down Expand Up @@ -1179,7 +1163,6 @@ func makeFailerWithoutLocalNoop(
t: t,
c: c,
m: m,
startOpts: opts,
startSettings: settings,
}
case failureModeDeadlock:
Expand All @@ -1188,7 +1171,6 @@ func makeFailerWithoutLocalNoop(
c: c,
m: m,
rng: rng,
startOpts: opts,
startSettings: settings,
onlyLeaseholders: true,
numReplicas: 5,
Expand All @@ -1198,7 +1180,6 @@ func makeFailerWithoutLocalNoop(
t: t,
c: c,
m: m,
startOpts: opts,
startSettings: settings,
staller: &dmsetupDiskStaller{t: t, c: c},
}
Expand Down Expand Up @@ -1377,7 +1358,6 @@ type crashFailer struct {
t test.Test
c cluster.Cluster
m cluster.Monitor
startOpts option.StartOpts
startSettings install.ClusterSettings
}

Expand All @@ -1395,7 +1375,7 @@ func (f *crashFailer) Fail(ctx context.Context, nodeID int) {
}

func (f *crashFailer) Recover(ctx context.Context, nodeID int) {
f.c.Start(ctx, f.t.L(), f.startOpts, f.startSettings, f.c.Node(nodeID))
f.c.Start(ctx, f.t.L(), failoverRestartOpts(), f.startSettings, f.c.Node(nodeID))
}

// deadlockFailer deadlocks replicas. In addition to deadlocks, this failure
Expand All @@ -1406,7 +1386,6 @@ type deadlockFailer struct {
c cluster.Cluster
m cluster.Monitor
rng *rand.Rand
startOpts option.StartOpts
startSettings install.ClusterSettings
onlyLeaseholders bool
numReplicas int
Expand Down Expand Up @@ -1521,7 +1500,7 @@ func (f *deadlockFailer) Recover(ctx context.Context, nodeID int) {
f.t.L().Printf("failed to unlock replicas on n%d, restarting node: %s", nodeID, err)
f.m.ExpectDeath()
f.c.Stop(ctx, f.t.L(), option.DefaultStopOpts(), f.c.Node(nodeID))
f.c.Start(ctx, f.t.L(), f.startOpts, f.startSettings, f.c.Node(nodeID))
f.c.Start(ctx, f.t.L(), failoverRestartOpts(), f.startSettings, f.c.Node(nodeID))
}
delete(f.locks, nodeID)
}
Expand All @@ -1532,7 +1511,6 @@ type diskStallFailer struct {
t test.Test
c cluster.Cluster
m cluster.Monitor
startOpts option.StartOpts
startSettings install.ClusterSettings
staller diskStaller
}
Expand Down Expand Up @@ -1574,7 +1552,7 @@ func (f *diskStallFailer) Recover(ctx context.Context, nodeID int) {
// Pebble's disk stall detector should have terminated the node, but in case
// it didn't, we explicitly stop it first.
f.c.Stop(ctx, f.t.L(), option.DefaultStopOpts(), f.c.Node(nodeID))
f.c.Start(ctx, f.t.L(), f.startOpts, f.startSettings, f.c.Node(nodeID))
f.c.Start(ctx, f.t.L(), failoverRestartOpts(), f.startSettings, f.c.Node(nodeID))
}

// pauseFailer pauses the process, but keeps the OS (and thus network
Expand Down Expand Up @@ -1788,3 +1766,16 @@ func sleepFor(ctx context.Context, t test.Test, duration time.Duration) {
t.Fatalf("sleep failed: %s", ctx.Err())
}
}

// failoverStartOpts returns the start options shared by all failover tests:
// the roachprod defaults with scheduled backups disabled, so background
// backup jobs don't interfere with the test workload.
func failoverStartOpts() option.StartOpts {
	opts := option.DefaultStartOpts()
	opts.RoachprodOpts.ScheduleBackups = false
	return opts
}

// failoverRestartOpts returns the options used when restarting a node after
// an induced failure. It builds on failoverStartOpts (scheduled backups
// disabled) and additionally sets SkipInit so that restarts don't re-run
// cluster initialization (e.g. re-issuing grants), which would otherwise
// trigger a schema change on every restart. See cockroachdb#119347.
func failoverRestartOpts() option.StartOpts {
	// Reuse the common failover start options rather than duplicating them.
	startOpts := failoverStartOpts()
	startOpts.RoachprodOpts.SkipInit = true
	return startOpts
}

0 comments on commit 7618f92

Please sign in to comment.