Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

roachtest/mixedversion: monitor nodes in every test #107548

Merged
merged 4 commits into from
Aug 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 5 additions & 11 deletions pkg/cmd/roachprod/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -650,21 +650,15 @@ of nodes, outputting a line whenever a change is detected:
`,
Args: cobra.ExactArgs(1),
Run: wrap(func(cmd *cobra.Command, args []string) error {
messages, err := roachprod.Monitor(context.Background(), config.Logger, args[0], monitorOpts)
eventChan, err := roachprod.Monitor(context.Background(), config.Logger, args[0], monitorOpts)
if err != nil {
return err
}
for msg := range messages {
if msg.Err != nil {
msg.Msg += "error: " + msg.Err.Error()
}
thisError := errors.Newf("%d: %s", msg.Node, msg.Msg)
if msg.Err != nil || strings.Contains(msg.Msg, "dead") {
err = errors.CombineErrors(err, thisError)
}
fmt.Println(thisError.Error())
for info := range eventChan {
fmt.Println(info.String())
}
return err

return nil
}),
}

Expand Down
12 changes: 5 additions & 7 deletions pkg/cmd/roachtest/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -1528,22 +1528,20 @@ func (c *clusterImpl) assertNoDeadNode(ctx context.Context, t test.Test) error {
}

t.L().Printf("checking for dead nodes")
ch, err := roachprod.Monitor(ctx, t.L(), c.name, install.MonitorOpts{OneShot: true, IgnoreEmptyNodes: true})
eventsCh, err := roachprod.Monitor(ctx, t.L(), c.name, install.MonitorOpts{OneShot: true, IgnoreEmptyNodes: true})

// An error here means there was a problem initialising a SyncedCluster.
if err != nil {
return err
}

deadNodes := 0
for n := range ch {
// If there's an error, it means either that the monitor command failed
// completely, or that it found a dead node worth complaining about.
if n.Err != nil || strings.HasPrefix(n.Msg, "dead") {
for info := range eventsCh {
t.L().Printf("%s", info)

if _, isDeath := info.Event.(install.MonitorNodeDead); isDeath {
deadNodes++
}

t.L().Printf("n%d: err=%v,msg=%s", n.Node, n.Err, n.Msg)
}

if deadNodes > 0 {
Expand Down
65 changes: 35 additions & 30 deletions pkg/cmd/roachtest/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ package main
import (
"context"
"fmt"
"strings"
"sync"
"sync/atomic"

Expand All @@ -32,11 +31,13 @@ type monitorImpl struct {
Failed() bool
WorkerStatus(...interface{})
}
l *logger.Logger
nodes string
ctx context.Context
cancel func()
g *errgroup.Group
l *logger.Logger
nodes string
ctx context.Context
cancel func()
g *errgroup.Group

numTasks int32 // atomically
expDeaths int32 // atomically
}

Expand Down Expand Up @@ -80,6 +81,8 @@ func (m *monitorImpl) ResetDeaths() {
var errTestFatal = errors.New("t.Fatal() was called")

func (m *monitorImpl) Go(fn func(context.Context) error) {
atomic.AddInt32(&m.numTasks, 1)

m.g.Go(func() (err error) {
defer func() {
r := recover()
Expand Down Expand Up @@ -171,15 +174,21 @@ func (m *monitorImpl) wait() error {
}

// 1. The first goroutine waits for the worker errgroup to exit.
// Note that this only happens if the caller created at least one
// task for the monitor. This check enables the roachtest monitor to
// be used in cases where we just want to monitor events in the
// cluster without running any background tasks through the monitor.
var wg sync.WaitGroup
wg.Add(1)
go func() {
defer func() {
m.cancel()
wg.Done()
if atomic.LoadInt32(&m.numTasks) > 0 {
wg.Add(1)
go func() {
defer func() {
m.cancel()
wg.Done()
}()
setErr(errors.Wrap(m.g.Wait(), "function passed to monitor.Go failed"))
}()
setErr(errors.Wrap(m.g.Wait(), "monitor task failed"))
}()
}

// 2. The second goroutine reads from the monitoring channel, watching for any
// unexpected death events.
Expand All @@ -190,28 +199,24 @@ func (m *monitorImpl) wait() error {
wg.Done()
}()

messagesChannel, err := roachprod.Monitor(m.ctx, m.l, m.nodes, install.MonitorOpts{})
eventsCh, err := roachprod.Monitor(m.ctx, m.l, m.nodes, install.MonitorOpts{})
if err != nil {
setErr(errors.Wrap(err, "monitor command failure"))
return
}
var monitorErr error
for msg := range messagesChannel {
if msg.Err != nil {
msg.Msg += "error: " + msg.Err.Error()
}
thisError := errors.Newf("%d: %s", msg.Node, msg.Msg)
if msg.Err != nil || strings.Contains(msg.Msg, "dead") {
monitorErr = errors.CombineErrors(monitorErr, thisError)

for info := range eventsCh {
_, isDeath := info.Event.(install.MonitorNodeDead)
isExpectedDeath := isDeath && atomic.AddInt32(&m.expDeaths, -1) >= 0
var expectedDeathStr string
if isExpectedDeath {
expectedDeathStr = ": expected"
}
var id int
var s string
newMsg := thisError.Error()
if n, _ := fmt.Sscanf(newMsg, "%d: %s", &id, &s); n == 2 {
if strings.Contains(s, "dead") && atomic.AddInt32(&m.expDeaths, -1) < 0 {
setErr(errors.Wrap(fmt.Errorf("unexpected node event: %s", newMsg), "monitor command failure"))
return
}
m.l.Printf("Monitor event: %s%s", info, expectedDeathStr)

if isDeath && !isExpectedDeath {
setErr(fmt.Errorf("unexpected node event: %s", info))
return
}
}
}()
Expand Down
1 change: 1 addition & 0 deletions pkg/cmd/roachtest/roachtestutil/mixedversion/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
go_library(
name = "mixedversion",
srcs = [
"helper.go",
"mixedversion.go",
"planner.go",
"runner.go",
Expand Down
14 changes: 13 additions & 1 deletion pkg/cmd/roachtest/roachtestutil/mixedversion/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,18 @@ for {
}
```

#### Use test helpers

Every test hook receives as parameter a `*mixedversion.Helper` instance. This helper struct contains convenience functions that make it easy to perform certain operations while automatically implementing some of the best practices described here. For a full list of helpers, check out the [`helper.go`](https://github.com/cockroachdb/cockroach/blob/master/pkg/cmd/roachtest/roachtestutil/mixedversion/helper.go) file in the framework's source tree.

##### Aside: background functions

One particularly important helper functionality to highlight relates to the management of functions that need to run in the background during a test. Typically, roachtests are expected to use a [monitor](https://github.com/cockroachdb/cockroach/blob/master/pkg/cmd/roachtest/monitor.go) for that purpose; not only does the monitor protect from panics inadvertently crashing the test, it also preempts their execution (via context cancelation) if a node dies unexpectedly.

However, if a mixedversion test needs to perform a task in the background, they **must not use the roachtest monitor**. The reason for this is that mixedversion tests are [randomized](#embrace-randomness); as such, user-created monitors would not be able to predict when a death is expected since it does not know, by design, when a node restarts.

To run functions in the background, use the API provided by the mixedversion framework. Long running tasks that run throughout the test can be defined with `(*mixedversion.Test).BackgroundFunc`. If a test hook needs to perform a task in the background, the `*mixedversion.Helper` instance has a `Background` function that can be used for that purpose. As usual, check the documentation for the public API for more details on usage and behaviour of these functions.

#### Log progress

Logging events in the test as it runs can make debugging failures a lot easier. All hooks passed to the `mixedversion` API receive a `*logger.Logger` instance as parameter. **Functions should use that logger instead of the test logger** (`t.L()`). Doing so has two main advantages:
Expand Down Expand Up @@ -398,6 +410,6 @@ $ COCKROACH_RANDOM_SEED=7357315251706229449 roachtest run --versions-binary-over

### Final Notes

* This is a high level document and does not include API documentation. The `mixedversion` package includes a lot of documentation in the form of source code comments, and that should be the source of truth when it comes to finding out what functionality is available and how to use it.
* This is a high level document and does not include API documentation. The `mixedversion` package includes a lot of documentation in the form of source code comments, and that should be the source of truth when it comes to finding out what functionality is available and how to use it. Most of the public API is in the [`mixedversion.go`](https://github.com/cockroachdb/cockroach/blob/master/pkg/cmd/roachtest/roachtestutil/mixedversion/mixedversion.go) and [`helper.go`](https://github.com/cockroachdb/cockroach/blob/master/pkg/cmd/roachtest/roachtestutil/mixedversion/helper.go) files.
* For a simple application of the `mixedversion` framework, check out the `acceptance/version-upgrade` roachtest. For a more complex example, see `backup-restore/mixed-version`.
* For any other questions, please reach out to `#test-eng`!
134 changes: 134 additions & 0 deletions pkg/cmd/roachtest/roachtestutil/mixedversion/helper.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
// Copyright 2023 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package mixedversion

import (
"context"
gosql "database/sql"
"fmt"
"math/rand"
"path"
"strings"
"sync/atomic"

"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
)

func (h *Helper) RandomNode(prng *rand.Rand, nodes option.NodeListOption) int {
return nodes[prng.Intn(len(nodes))]
}

// RandomDB returns a (nodeID, connection) tuple for a randomly picked
// cockroach node according to the parameters passed.
func (h *Helper) RandomDB(prng *rand.Rand, nodes option.NodeListOption) (int, *gosql.DB) {
node := h.RandomNode(prng, nodes)
return node, h.Connect(node)
}

// QueryRow performs `db.QueryRowContext` on a randomly picked
// database node. The query and the node picked are logged in the logs
// of the step that calls this function.
func (h *Helper) QueryRow(rng *rand.Rand, query string, args ...interface{}) *gosql.Row {
node, db := h.RandomDB(rng, h.runner.crdbNodes)
h.stepLogger.Printf("running SQL statement:\n%s\nArgs: %v\nNode: %d", query, args, node)
return db.QueryRowContext(h.ctx, query, args...)
}

// Exec performs `db.ExecContext` on a randomly picked database node.
// The query and the node picked are logged in the logs of the step
// that calls this function.
func (h *Helper) Exec(rng *rand.Rand, query string, args ...interface{}) error {
node, db := h.RandomDB(rng, h.runner.crdbNodes)
h.stepLogger.Printf("running SQL statement:\n%s\nArgs: %v\nNode: %d", query, args, node)
_, err := db.ExecContext(h.ctx, query, args...)
return err
}

func (h *Helper) Connect(node int) *gosql.DB {
return h.runner.conn(node)
}

// SetContext should be called by steps that need access to the test
// context, as that is only visible to them.
func (h *Helper) SetContext(c *Context) {
h.testContext = c
}

// Context returns the test context associated with a certain step. It
// is made available for user-functions (see runHookStep).
func (h *Helper) Context() *Context {
return h.testContext
}

// Background allows test authors to create functions that run in the
// background in mixed-version hooks.
func (h *Helper) Background(
name string, fn func(context.Context, *logger.Logger) error,
) context.CancelFunc {
return h.runner.background.Start(name, func(ctx context.Context) error {
bgLogger, err := h.loggerFor(name)
if err != nil {
return fmt.Errorf("failed to create logger for background function %q: %w", name, err)
}

err = panicAsError(bgLogger, func() error { return fn(ctx, bgLogger) })
if err != nil {
if isContextCanceled(ctx) {
return err
}

desc := fmt.Sprintf("error in background function %s: %s", name, err)
return h.runner.testFailure(desc, bgLogger)
}

return nil
})
}

// BackgroundCommand has the same semantics of `Background()`; the
// command passed will run and the test will fail if the command is
// not successful.
func (h *Helper) BackgroundCommand(cmd string, nodes option.NodeListOption) context.CancelFunc {
desc := fmt.Sprintf("run command: %q", cmd)
return h.Background(desc, func(ctx context.Context, l *logger.Logger) error {
l.Printf("running command `%s` on nodes %v in the background", cmd, nodes)
return h.runner.cluster.RunE(ctx, nodes, cmd)
})
}

// ExpectDeath alerts the testing infrastructure that a node is
// expected to die. Regular restarts as part of the mixedversion
// testing are already taken into account. This function should only
// be used by tests that perform their own node restarts or chaos
// events.
func (h *Helper) ExpectDeath() {
h.ExpectDeaths(1)
}

// ExpectDeaths is the general version of `ExpectDeath()`.
func (h *Helper) ExpectDeaths(n int) {
h.runner.monitor.ExpectDeaths(n)
}

// loggerFor creates a logger instance to be used by background
// functions (created by calling `Background` on the helper
// instance). It is similar to the logger instances created for
// mixed-version steps, but with the `background_` prefix.
func (h *Helper) loggerFor(name string) (*logger.Logger, error) {
atomic.AddInt64(&h.bgCount, 1)

fileName := invalidChars.ReplaceAllString(strings.ToLower(name), "")
fileName = fmt.Sprintf("background_%s_%d", fileName, h.bgCount)
fileName = path.Join(logPrefix, fileName)

return prefixedLogger(h.runner.logger, fileName)
}
Original file line number Diff line number Diff line change
Expand Up @@ -653,6 +653,7 @@ func (s restartWithNewBinaryStep) Description() string {
func (s restartWithNewBinaryStep) Run(
ctx context.Context, l *logger.Logger, c cluster.Cluster, h *Helper,
) error {
h.ExpectDeath()
return clusterupgrade.RestartNodesWithNewBinary(
ctx,
s.rt,
Expand Down
Loading