Skip to content

Commit

Permalink
roachtest: add acceptance/rapid-restart
Browse files Browse the repository at this point in the history
Move the rapid-restart acceptance test to a new acceptance/rapid-restart
roachtest.

See cockroachdb#29151

Release note: None
  • Loading branch information
petermattis committed Sep 5, 2018
1 parent 2e037d6 commit b77ce2c
Show file tree
Hide file tree
Showing 4 changed files with 109 additions and 152 deletions.
3 changes: 3 additions & 0 deletions pkg/acceptance/localcluster/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -832,3 +832,6 @@ func (n *Node) Wait() *exec.ExitError {
ee, _ := n.waitErr.Load().(*exec.ExitError)
return ee
}

// Silence unused warning. Assigning the method value to the blank identifier
// references Node.Wait without ever invoking it.
var _ = (*Node)(nil).Wait
152 changes: 0 additions & 152 deletions pkg/acceptance/rapid_restart_test.go

This file was deleted.

1 change: 1 addition & 0 deletions pkg/cmd/roachtest/acceptance.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ func registerAcceptance(r *registry) {
{"build-info", runBuildInfo},
{"cli/node-status", runCLINodeStatus},
{"event-log", runEventLog},
{"rapid-restart", runRapidRestart},
{"status-server", runStatusServer},
}
for _, tc := range testCases {
Expand Down
105 changes: 105 additions & 0 deletions pkg/cmd/roachtest/rapid_restart.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
// Copyright 2018 The Cockroach Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.

package main

import (
"context"
"math/rand"
"net/http"
"os/exec"
"time"

"github.com/cockroachdb/cockroach/pkg/util/sysutil"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/pkg/errors"
)

// runRapidRestart is the acceptance/rapid-restart roachtest. For one minute it
// repeatedly wipes node 1, starts cockroach on it and kills the process after
// a random sub-second delay (twice per cycle), then starts it a third time and
// polls the /_status/vars endpoint until a request succeeds. The test fails if
// a killed process exits with an unexpected status or if the endpoint answers
// with a status code other than 200 or 404.
func runRapidRestart(ctx context.Context, t *test, c *cluster) {
	// Use a single-node cluster which speeds the stop/start cycle.
	nodes := c.Node(1)
	c.Put(ctx, cockroach, "./cockroach", nodes)
	c.Wipe(ctx, c.All())

	// In a loop, bootstrap a new single-node cluster and immediately kill
	// it. This is more effective at finding problems than restarting an existing
	// node since there are more moving parts the first time around. Since there
	// could be future issues that only occur on a restart, each cycle kills the
	// node twice: once right after bootstrap and once right after a restart.
	deadline := timeutil.Now().Add(time.Minute)
	done := func() bool {
		return timeutil.Now().After(deadline)
	}
	for j := 1; !done(); j++ {
		c.Wipe(ctx, nodes)

		// The first 2 iterations we start the cockroach node and kill it right
		// away. The 3rd iteration we let cockroach run so that we can check after
		// the loop that everything is ok.
		for i := 0; i < 3; i++ {
			// Buffered so the goroutine can always deliver its result and exit,
			// even on the 3rd iteration where nobody receives from the channel.
			exitCh := make(chan error, 1)
			go func() {
				err := c.RunE(ctx, nodes,
					`mkdir -p {log-dir} && ./cockroach start --insecure --store={store-dir} `+
						`--log-dir={log-dir} --cache=10% --max-sql-memory=10% `+
						`--listen-addr=:{pgport:1} --http-port=$[{pgport:1}+1] `+
						`> {log-dir}/cockroach.stdout 2> {log-dir}/cockroach.stderr`)
				exitCh <- err
			}()
			if i == 2 {
				break
			}

			// Let the node run for a random duration below one second, then stop
			// it with either signal 2 (SIGINT) or signal 9 (SIGKILL).
			time.Sleep(time.Duration(rand.Int63n(int64(time.Second))))
			sig := [2]string{"2", "9"}[rand.Intn(2)]
			c.Stop(ctx, nodes, stopArgs("--sig="+sig))
			select {
			case <-ctx.Done():
				return
			case err := <-exitCh:
				cause := errors.Cause(err)
				if exitErr, ok := cause.(*exec.ExitError); ok {
					switch status := sysutil.ExitStatus(exitErr); status {
					case -1:
						// Received SIGINT before setting up our own signal handlers or
						// SIGKILL.
					case 1:
						// Exit code from a SIGINT received by our signal handlers.
					default:
						t.Fatalf("unexpected exit status %d", status)
					}
				} else {
					t.Fatalf("unexpected exit err: %v", err)
				}
			}
		}

		// Verify the cluster is ok by torturing the prometheus endpoint until it
		// returns success. A side-effect is to prevent regression of #19559.
		for !done() {
			base := `http://` + c.ExternalAdminUIAddr(ctx, nodes)[0]
			url := base + `/_status/vars`
			resp, err := http.Get(url)
			if err == nil {
				// Only the status code is inspected; close the body right away so
				// the underlying connection is not leaked on every cycle.
				resp.Body.Close()
				if resp.StatusCode != http.StatusNotFound && resp.StatusCode != http.StatusOK {
					t.Fatalf("unexpected status code from %s: %d", url, resp.StatusCode)
				}
				break
			}
		}

		c.l.Printf("%d OK\n", j)
	}
}

0 comments on commit b77ce2c

Please sign in to comment.