From 758b4250f177bb50d6117b3c55c96d6f9dfc21d2 Mon Sep 17 00:00:00 2001 From: Masha Schneider Date: Mon, 11 Jun 2018 15:31:32 -0400 Subject: [PATCH 1/2] workload: add "gentle" chaos to tpccbench Before we used the roachprod stop command to stop cockroach for our chaos scenarios, which is a kill -9. Now for chaos we'll have an option to do a graceful drain of the node. Closes #26387 Release note: None --- pkg/cmd/roachtest/chaos.go | 13 +++++++++++-- pkg/cmd/roachtest/cluster.go | 14 +++++++++++++- pkg/cmd/roachtest/tpcc.go | 10 ++++++---- 3 files changed, 30 insertions(+), 7 deletions(-) diff --git a/pkg/cmd/roachtest/chaos.go b/pkg/cmd/roachtest/chaos.go index 05955fd7ab12..7bff308612e8 100644 --- a/pkg/cmd/roachtest/chaos.go +++ b/pkg/cmd/roachtest/chaos.go @@ -45,6 +45,9 @@ type Chaos struct { // Stopper is a channel that the chaos agent listens on. The agent will // terminate cleanly once it receives on the channel. Stopper <-chan time.Time + // DrainAndQuit is used to determine if we want to kill the node vs draining it // first and shutting down gracefully. 
+ DrainAndQuit bool } // Runner returns a closure that runs chaos against the given cluster without @@ -67,9 +70,15 @@ func (ch *Chaos) Runner(c *cluster, m *monitor) func(context.Context) error { } target := ch.Target() - l.printf("killing %v (slept %s)\n", target, before) m.ExpectDeath() - c.Stop(ctx, target) + + if ch.DrainAndQuit { + l.printf("stopping and draining %v (slept %s)\n", target, before) + c.Stop(ctx, target, stopArgs("--sig=15")) + } else { + l.printf("killing %v (slept %s)\n", target, before) + c.Stop(ctx, target) + } select { case <-ch.Stopper: diff --git a/pkg/cmd/roachtest/cluster.go b/pkg/cmd/roachtest/cluster.go index cf80b5d9368b..712d69584574 100644 --- a/pkg/cmd/roachtest/cluster.go +++ b/pkg/cmd/roachtest/cluster.go @@ -678,6 +678,11 @@ func startArgs(extraArgs ...string) option { return roachprodArgOption(extraArgs) } +// stopArgs specifies extra arguments that are passed to `roachprod` during `c.Stop`. +func stopArgs(extraArgs ...string) option { + return roachprodArgOption(extraArgs) +} + type roachprodArgOption []string func (o roachprodArgOption) option() {} @@ -741,12 +746,19 @@ func (c *cluster) Stop(ctx context.Context, opts ...option) { // If the test has failed, don't try to limp along. return } + + args := []string{ + roachprod, + "stop", + } + args = append(args, roachprodArgs(opts)...) + args = append(args, c.makeNodes(opts...)) if atomic.LoadInt32(&interrupted) == 1 { c.t.Fatal("interrupted") } c.status("stopping cluster") defer c.status() - err := execCmd(ctx, c.l, roachprod, "stop", c.makeNodes(opts...)) + err := execCmd(ctx, c.l, args...) 
if err != nil { c.t.Fatal(err) } diff --git a/pkg/cmd/roachtest/tpcc.go b/pkg/cmd/roachtest/tpcc.go index cf990e1d099f..b4ae6ee98b18 100644 --- a/pkg/cmd/roachtest/tpcc.go +++ b/pkg/cmd/roachtest/tpcc.go @@ -19,6 +19,7 @@ import ( "bytes" "context" "fmt" + "math" "strconv" "strings" "time" @@ -256,7 +257,7 @@ func runTPCCBench(ctx context.Context, t *test, c *cluster, b tpccBenchSpec) { // Search between 1 and b.LoadWarehouses for the largest number of // warehouses that can be operated on while sustaining a throughput // threshold, set to a fraction of max tpmC. - precision := b.LoadWarehouses / 200 + precision := int(math.Max(1.0, float64(b.LoadWarehouses/200))) initStepSize := precision s := search.NewLineSearcher(1, b.LoadWarehouses, b.EstimatedMax, initStepSize, precision) res, err := s.Search(func(warehouses int) (bool, error) { @@ -280,9 +281,10 @@ func runTPCCBench(ctx context.Context, t *test, c *cluster, b tpccBenchSpec) { // Kill one node at a time. ch := Chaos{ - Timer: Periodic{Down: 1 * time.Second, Up: 90 * time.Second}, - Target: roachNodes.randNode, - Stopper: loadDone, + Timer: Periodic{Down: 1 * time.Second, Up: 90 * time.Second}, + Target: roachNodes.randNode, + Stopper: loadDone, + DrainAndQuit: true, } m.Go(ch.Runner(c, m)) } From b7a35851d0afdd61310dae7db99205b1e9f1d645 Mon Sep 17 00:00:00 2001 From: Masha Schneider Date: Tue, 12 Jun 2018 16:24:33 -0400 Subject: [PATCH 2/2] roachtest: change Up and Down in ChaosTimer to Period and Downtime. Release note: None --- pkg/cmd/roachtest/chaos.go | 17 +++++++++-------- pkg/cmd/roachtest/scaledata.go | 2 +- pkg/cmd/roachtest/tpcc.go | 2 +- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/pkg/cmd/roachtest/chaos.go b/pkg/cmd/roachtest/chaos.go index 7bff308612e8..2d44a2182ac6 100644 --- a/pkg/cmd/roachtest/chaos.go +++ b/pkg/cmd/roachtest/chaos.go @@ -26,12 +26,12 @@ type ChaosTimer interface { // Periodic is a chaos timing using fixed durations. 
type Periodic struct { - Down, Up time.Duration + Period, DownTime time.Duration } // Timing implements ChaosTimer. func (p Periodic) Timing() (time.Duration, time.Duration) { - return p.Down, p.Up + return p.Period, p.DownTime } // Chaos stops and restarts nodes in a cluster. @@ -59,24 +59,25 @@ func (ch *Chaos) Runner(c *cluster, m *monitor) func(context.Context) error { if err != nil { return err } + period, downTime := ch.Timer.Timing() + t := time.NewTicker(period) for { - before, between := ch.Timer.Timing() select { case <-ch.Stopper: return nil case <-ctx.Done(): return ctx.Err() - case <-time.After(before): + case <-t.C: } target := ch.Target() m.ExpectDeath() if ch.DrainAndQuit { - l.printf("stopping and draining %v (slept %s)\n", target, before) + l.printf("stopping and draining %v\n", target) c.Stop(ctx, target, stopArgs("--sig=15")) } else { - l.printf("killing %v (slept %s)\n", target, before) + l.printf("killing %v\n", target) c.Stop(ctx, target) } @@ -85,10 +86,10 @@ func (ch *Chaos) Runner(c *cluster, m *monitor) func(context.Context) error { return nil case <-ctx.Done(): return ctx.Err() - case <-time.After(between): + case <-time.After(downTime): } - c.l.printf("restarting %v after %s of downtime\n", target, between) + c.l.printf("restarting %v after %s of downtime\n", target, downTime) c.Start(ctx, target) } } diff --git a/pkg/cmd/roachtest/scaledata.go b/pkg/cmd/roachtest/scaledata.go index bcf3df4a7a0f..498244a5755e 100644 --- a/pkg/cmd/roachtest/scaledata.go +++ b/pkg/cmd/roachtest/scaledata.go @@ -90,7 +90,7 @@ func runSqlapp(ctx context.Context, t *test, c *cluster, app, flags string, dur // Kill one node at a time, with a minute of healthy cluster and thirty // seconds of down node. 
ch := Chaos{ - Timer: Periodic{Down: 30 * time.Second, Up: 1 * time.Minute}, + Timer: Periodic{Period: 90 * time.Second, DownTime: 30 * time.Second}, Target: roachNodes.randNode, Stopper: time.After(dur), } diff --git a/pkg/cmd/roachtest/tpcc.go b/pkg/cmd/roachtest/tpcc.go index b4ae6ee98b18..3fce83aca900 100644 --- a/pkg/cmd/roachtest/tpcc.go +++ b/pkg/cmd/roachtest/tpcc.go @@ -281,7 +281,7 @@ func runTPCCBench(ctx context.Context, t *test, c *cluster, b tpccBenchSpec) { // Kill one node at a time. ch := Chaos{ - Timer: Periodic{Down: 1 * time.Second, Up: 90 * time.Second}, + Timer: Periodic{Period: 90 * time.Second, DownTime: 1 * time.Second}, Target: roachNodes.randNode, Stopper: loadDone, DrainAndQuit: true,