From b77ce2c1973ae8fec6babf07e933d1cf17f432dc Mon Sep 17 00:00:00 2001 From: Peter Mattis Date: Fri, 31 Aug 2018 22:11:09 -0400 Subject: [PATCH] roachtest: add acceptance/rapid-restart Move the rapid-restart acceptance test to a new acceptance/rapid-restart roachtest. See #29151 Release note: None --- pkg/acceptance/localcluster/cluster.go | 3 + pkg/acceptance/rapid_restart_test.go | 152 ------------------------- pkg/cmd/roachtest/acceptance.go | 1 + pkg/cmd/roachtest/rapid_restart.go | 105 +++++++++++++++++ 4 files changed, 109 insertions(+), 152 deletions(-) delete mode 100644 pkg/acceptance/rapid_restart_test.go create mode 100644 pkg/cmd/roachtest/rapid_restart.go diff --git a/pkg/acceptance/localcluster/cluster.go b/pkg/acceptance/localcluster/cluster.go index eec640330358..966c357c7f8f 100644 --- a/pkg/acceptance/localcluster/cluster.go +++ b/pkg/acceptance/localcluster/cluster.go @@ -832,3 +832,6 @@ func (n *Node) Wait() *exec.ExitError { ee, _ := n.waitErr.Load().(*exec.ExitError) return ee } + +// Silence unused warning. +var _ = (*Node)(nil).Wait diff --git a/pkg/acceptance/rapid_restart_test.go b/pkg/acceptance/rapid_restart_test.go deleted file mode 100644 index fef18a9fe60e..000000000000 --- a/pkg/acceptance/rapid_restart_test.go +++ /dev/null @@ -1,152 +0,0 @@ -// Copyright 2017 The Cockroach Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the License. - -package acceptance - -import ( - "context" - "math/rand" - "net/http" - "os" - "os/exec" - "testing" - "time" - - "golang.org/x/sync/errgroup" - - "github.com/pkg/errors" - - "github.com/cockroachdb/cockroach/pkg/acceptance/cluster" - "github.com/cockroachdb/cockroach/pkg/acceptance/localcluster" - "github.com/cockroachdb/cockroach/pkg/util/log" - "github.com/cockroachdb/cockroach/pkg/util/sysutil" - "github.com/cockroachdb/cockroach/pkg/util/timeutil" -) - -func TestRapidRestarts(t *testing.T) { - s := log.Scope(t) - defer s.Close(t) - - ctx := context.Background() - cfg := ReadConfigFromFlags() - RunLocal(t, func(t *testing.T) { - deadline := timeutil.Now().Add(cfg.Duration) - // In a loop, bootstrap a new node and immediately kill it. This is more - // effective at finding problems that restarting an existing node since - // there are more moving parts the first time around. Since there could be - // future issues that only occur on a restart, each invocation of the test - // also restart-kills the existing node once. - for timeutil.Now().Before(deadline) { - testRapidRestartSingle(ctx, t, cfg) - } - }) -} - -func unexpectedExitCode(exitErr *exec.ExitError) error { - if exitErr == nil { - // Server shut down cleanly. Note that returning `err` here would create - // an error interface wrapping a nil *ExitError, which is *not* nil - // itself. - return nil - } - - switch status := sysutil.ExitStatus(exitErr); status { - case -1: - // Received SIGINT before setting up our own signal handlers. - case 1: - // Exit code from a SIGINT received by our signal handlers. - default: - return errors.Wrapf(exitErr, "unexpected exit status %d", status) - } - return nil -} - -func testRapidRestartSingle(ctx context.Context, t *testing.T, cfg cluster.TestConfig) { - // Make this a single-node cluster which unlocks optimizations in - // LocalCluster that skip all the waiting so that we get to kill the process - // early in its boot sequence. - cfg.Nodes = cfg.Nodes[:1] - // Make sure StartCluster doesn't wait for replication but just hands us the - // cluster straight away. - cfg.NoWait = true - - c := StartCluster(ctx, t, cfg) - defer c.AssertAndStop(ctx, t) - - lc := c.(*localcluster.LocalCluster) - - interrupt := func() { - t.Helper() - time.Sleep(time.Duration(rand.Int63n(int64(time.Second)))) - lc.Nodes[0].Signal(os.Interrupt) - } - - check := func() { - t.Helper() - if err := unexpectedExitCode(lc.Nodes[0].Wait()); err != nil { - lc.Cfg.Ephemeral = false // keep log dir - t.Fatalf("node did not terminate cleanly: %v", err) - } - } - - const count = 2 - // NB: the use of Group makes no sense with count=2, but this way you can - // bump it and the whole thing still works. - var g errgroup.Group - - getVars := func(ch <-chan error) func() error { - return func() error { - for { - base := c.URL(ctx, 0) - if base != "" { - // Torture the prometheus endpoint to prevent regression of #19559. - const varsEndpoint = "/_status/vars" - resp, err := cluster.HTTPClient.Get(base + varsEndpoint) - if err == nil { - if resp.StatusCode != http.StatusNotFound && resp.StatusCode != http.StatusOK { - return errors.Errorf("unexpected status code from %s: %d", varsEndpoint, resp.StatusCode) - } - } - } - select { - case err := <-ch: - return err - default: - time.Sleep(time.Millisecond) - } - } - } - } - closedCh := make(chan error) - close(closedCh) - - for i := 0; i < count; i++ { - g.Go(getVars(closedCh)) - - if i > 0 { - ch := lc.RestartAsync(ctx, 0) - g.Go(getVars(ch)) - } - - log.Info(ctx, "interrupting node") - interrupt() - - log.Info(ctx, "waiting for exit code") - check() - } - - if err := g.Wait(); err != nil { - t.Fatal(err) - } -} diff --git a/pkg/cmd/roachtest/acceptance.go b/pkg/cmd/roachtest/acceptance.go index 9ed5e5c2b441..1c8fd047ed2e 100644 --- a/pkg/cmd/roachtest/acceptance.go +++ b/pkg/cmd/roachtest/acceptance.go @@ -32,6 +32,7 @@ func registerAcceptance(r *registry) { {"build-info", runBuildInfo}, {"cli/node-status", runCLINodeStatus}, {"event-log", runEventLog}, + {"rapid-restart", runRapidRestart}, {"status-server", runStatusServer}, } for _, tc := range testCases { diff --git a/pkg/cmd/roachtest/rapid_restart.go b/pkg/cmd/roachtest/rapid_restart.go new file mode 100644 index 000000000000..d514dc30aba9 --- /dev/null +++ b/pkg/cmd/roachtest/rapid_restart.go @@ -0,0 +1,105 @@ +// Copyright 2018 The Cockroach Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. + +package main + +import ( + "context" + "math/rand" + "net/http" + "os/exec" + "time" + + "github.com/cockroachdb/cockroach/pkg/util/sysutil" + "github.com/cockroachdb/cockroach/pkg/util/timeutil" + "github.com/pkg/errors" +) + +func runRapidRestart(ctx context.Context, t *test, c *cluster) { + // Use a single-node cluster which speeds the stop/start cycle. + nodes := c.Node(1) + c.Put(ctx, cockroach, "./cockroach", nodes) + c.Wipe(ctx, c.All()) + + // In a loop, bootstrap a new single-node cluster and immediately kill + // it. This is more effective at finding problems than restarting an existing + // node since there are more moving parts the first time around. Since there + // could be future issues that only occur on a restart, each invocation of + // the test also restart-kills the existing node twice. + deadline := timeutil.Now().Add(time.Minute) + done := func() bool { + return timeutil.Now().After(deadline) + } + for j := 1; !done(); j++ { + c.Wipe(ctx, nodes) + + // The first 2 iterations we start the cockroach node and kill it right + // away. The 3rd iteration we let cockroach run so that we can check after + // the loop that everything is ok. + for i := 0; i < 3; i++ { + exitCh := make(chan error, 1) + go func() { + err := c.RunE(ctx, nodes, + `mkdir -p {log-dir} && ./cockroach start --insecure --store={store-dir} `+ + `--log-dir={log-dir} --cache=10% --max-sql-memory=10% `+ + `--listen-addr=:{pgport:1} --http-port=$[{pgport:1}+1] `+ + `> {log-dir}/cockroach.stdout 2> {log-dir}/cockroach.stderr`) + exitCh <- err + }() + if i == 2 { + break + } + + time.Sleep(time.Duration(rand.Int63n(int64(time.Second)))) + sig := [2]string{"2", "9"}[rand.Intn(2)] + c.Stop(ctx, nodes, stopArgs("--sig="+sig)) + select { + case <-ctx.Done(): + return + case err := <-exitCh: + cause := errors.Cause(err) + if exitErr, ok := cause.(*exec.ExitError); ok { + switch status := sysutil.ExitStatus(exitErr); status { + case -1: + // Received SIGINT before setting up our own signal handlers or + // SIGKILL. + case 1: + // Exit code from a SIGINT received by our signal handlers. + default: + t.Fatalf("unexpected exit status %d", status) + } + } else { + t.Fatalf("unexpected exit err: %v", err) + } + } + } + + // Verify the cluster is ok by torturing the prometheus endpoint until it + // returns success. A side-effect is to prevent regression of #19559. + for !done() { + base := `http://` + c.ExternalAdminUIAddr(ctx, nodes)[0] + // Torture the prometheus endpoint to prevent regression of #19559. + url := base + `/_status/vars` + resp, err := http.Get(url) + if err == nil { + if resp.StatusCode != http.StatusNotFound && resp.StatusCode != http.StatusOK { + t.Fatalf("unexpected status code from %s: %d", url, resp.StatusCode) + } + break + } + } + + c.l.Printf("%d OK\n", j) + } +}