Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
100584: roachtest: fix `gossip/restart` tests r=erikgrinaker a=erikgrinaker

**roachtest: add `WaitForReady` helper**

This patch adds a `WaitForReady()` helper that will wait until the given nodes report ready via health checks.

Epic: none
Release note: None

**roachtest: fix `gossip/restart` tests**

This patch fixes `acceptance/gossip/restart` and `gossip/restart` by waiting for all nodes to report ready before restarting nodes, and unskips them.

Resolves cockroachdb#96091.
Touches cockroachdb#48423.

Epic: none
Release note: None



Co-authored-by: Erik Grinaker <[email protected]>
  • Loading branch information
craig[bot] and erikgrinaker committed Apr 10, 2023
2 parents 85e41ca + c454fe1 commit 652fc0e
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 3 deletions.
2 changes: 1 addition & 1 deletion pkg/cmd/roachtest/tests/acceptance.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ func registerAcceptance(r registry.Registry) {
registry.OwnerKV: {
{name: "decommission-self", fn: runDecommissionSelf},
{name: "event-log", fn: runEventLog},
{name: "gossip/peerings", fn: runGossipPeerings, skip: "flaky test. tracked in #96091"},
{name: "gossip/peerings", fn: runGossipPeerings},
{name: "gossip/restart", fn: runGossipRestart},
{
name: "gossip/restart-node-one",
Expand Down
4 changes: 2 additions & 2 deletions pkg/cmd/roachtest/tests/gossip.go
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,7 @@ func runGossipPeerings(ctx context.Context, t test.Test, c cluster.Cluster) {
deadline := timeutil.Now().Add(time.Minute)

for i := 1; timeutil.Now().Before(deadline); i++ {
WaitForReady(ctx, t, c, c.All())
if err := g.check(ctx, c, g.hasPeers(c.Spec().NodeCount)); err != nil {
t.Fatal(err)
}
Expand All @@ -308,8 +309,6 @@ func runGossipPeerings(ctx context.Context, t test.Test, c cluster.Cluster) {
}

func runGossipRestart(ctx context.Context, t test.Test, c cluster.Cluster) {
t.Skip("skipping flaky acceptance/gossip/restart", "https://github.com/cockroachdb/cockroach/issues/48423")

c.Put(ctx, t.Cockroach(), "./cockroach")
c.Start(ctx, t.L(), option.DefaultStartOpts(), install.MakeClusterSettings())

Expand All @@ -322,6 +321,7 @@ func runGossipRestart(ctx context.Context, t test.Test, c cluster.Cluster) {
deadline := timeutil.Now().Add(time.Minute)

for i := 1; timeutil.Now().Before(deadline); i++ {
WaitForReady(ctx, t, c, c.All())
g.checkConnectedAndFunctional(ctx, t, c)
t.L().Printf("%d: OK\n", i)

Expand Down
46 changes: 46 additions & 0 deletions pkg/cmd/roachtest/tests/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,21 @@ import (
"context"
gosql "database/sql"
"fmt"
"io"
"math/rand"
"net/http"
"time"

"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
"github.com/cockroachdb/cockroach/pkg/util/contextutil"
"github.com/cockroachdb/cockroach/pkg/util/httputil"
"github.com/cockroachdb/cockroach/pkg/util/retry"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/errors"
"github.com/stretchr/testify/require"
)

// WaitFor3XReplication is like WaitForReplication but specifically requires
Expand All @@ -32,6 +37,47 @@ func WaitFor3XReplication(ctx context.Context, t test.Test, db *gosql.DB) error
return WaitForReplication(ctx, t, db, 3 /* replicationFactor */)
}

// WaitForReady waits until the given nodes report ready via health checks.
// This implies that the node has completed server startup, is heartbeating its
// liveness record, and can serve SQL clients.
func WaitForReady(
ctx context.Context, t test.Test, c cluster.Cluster, nodes option.NodeListOption,
) {
checkReady := func(ctx context.Context, url string) error {
resp, err := httputil.Get(ctx, url)
if err != nil {
return err
}
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
if err != nil {
return err
}
if resp.StatusCode != http.StatusOK {
return errors.Errorf("HTTP %d: %s", resp.StatusCode, body)
}
return nil
}

adminAddrs, err := c.ExternalAdminUIAddr(ctx, t.L(), nodes)
require.NoError(t, err)

require.NoError(t, contextutil.RunWithTimeout(
ctx, "waiting for ready", time.Minute, func(ctx context.Context) error {
for i, adminAddr := range adminAddrs {
url := fmt.Sprintf(`http://%s/health?ready=1`, adminAddr)

for err := checkReady(ctx, url); err != nil; err = checkReady(ctx, url) {
t.L().Printf("n%d not ready, retrying: %s", nodes[i], err)
time.Sleep(time.Second)
}
t.L().Printf("n%d is ready", nodes[i])
}
return nil
},
))
}

// WaitForReplication waits until all ranges in the system are on at least
// replicationFactor voters.
func WaitForReplication(
Expand Down

0 comments on commit 652fc0e

Please sign in to comment.