From baa8fa0ed88f6e6e91fc0c0f7cb6582f8bb29cb8 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 4 Apr 2023 12:38:52 +0000 Subject: [PATCH 1/2] roachtest: add `WaitForReady` helper This patch adds a `WaitForReady()` helper that will wait until the given nodes report ready via health checks. Epic: none Release note: None --- pkg/cmd/roachtest/tests/util.go | 46 +++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/pkg/cmd/roachtest/tests/util.go b/pkg/cmd/roachtest/tests/util.go index 99924ef89eeb..068acc73e840 100644 --- a/pkg/cmd/roachtest/tests/util.go +++ b/pkg/cmd/roachtest/tests/util.go @@ -14,16 +14,21 @@ import ( "context" gosql "database/sql" "fmt" + "io" "math/rand" + "net/http" "time" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/roachprod/install" + "github.com/cockroachdb/cockroach/pkg/util/contextutil" + "github.com/cockroachdb/cockroach/pkg/util/httputil" "github.com/cockroachdb/cockroach/pkg/util/retry" "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/cockroachdb/errors" + "github.com/stretchr/testify/require" ) // WaitFor3XReplication is like WaitForReplication but specifically requires @@ -32,6 +37,47 @@ func WaitFor3XReplication(ctx context.Context, t test.Test, db *gosql.DB) error return WaitForReplication(ctx, t, db, 3 /* replicationFactor */) } +// WaitForReady waits until the given nodes report ready via health checks. +// This implies that the node has completed server startup, is heartbeating its +// liveness record, and can serve SQL clients. +func WaitForReady( + ctx context.Context, t test.Test, c cluster.Cluster, nodes option.NodeListOption, +) { + checkReady := func(ctx context.Context, url string) error { + resp, err := httputil.Get(ctx, url) + if err != nil { + return err + } + defer resp.Body.Close() + body, err := io.ReadAll(resp.Body) + if err != nil { + return err + } + if resp.StatusCode != http.StatusOK { + return errors.Errorf("HTTP %d: %s", resp.StatusCode, body) + } + return nil + } + + adminAddrs, err := c.ExternalAdminUIAddr(ctx, t.L(), nodes) + require.NoError(t, err) + + require.NoError(t, contextutil.RunWithTimeout( + ctx, "waiting for ready", time.Minute, func(ctx context.Context) error { + for i, adminAddr := range adminAddrs { + url := fmt.Sprintf(`http://%s/health?ready=1`, adminAddr) + + for err := checkReady(ctx, url); err != nil; err = checkReady(ctx, url) { + t.L().Printf("n%d not ready, retrying: %s", nodes[i], err) + time.Sleep(time.Second) + } + t.L().Printf("n%d is ready", nodes[i]) + } + return nil + }, + )) +} + // WaitForReplication waits until all ranges in the system are on at least // replicationFactor voters. func WaitForReplication( From c454fe1e8e237bd0f11e1355ea8c591d374ebe90 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 4 Apr 2023 12:39:42 +0000 Subject: [PATCH 2/2] roachtest: fix `gossip/restart` tests This patch fixes `acceptance/gossip/restart` and `gossip/restart` by waiting for all nodes to report ready before restarting nodes, and unskips them. Epic: none Release note: None --- pkg/cmd/roachtest/tests/acceptance.go | 2 +- pkg/cmd/roachtest/tests/gossip.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/cmd/roachtest/tests/acceptance.go b/pkg/cmd/roachtest/tests/acceptance.go index 22b9a737b6e9..a8018902ed6e 100644 --- a/pkg/cmd/roachtest/tests/acceptance.go +++ b/pkg/cmd/roachtest/tests/acceptance.go @@ -31,7 +31,7 @@ func registerAcceptance(r registry.Registry) { registry.OwnerKV: { {name: "decommission-self", fn: runDecommissionSelf}, {name: "event-log", fn: runEventLog}, - {name: "gossip/peerings", fn: runGossipPeerings, skip: "flaky test. tracked in #96091"}, + {name: "gossip/peerings", fn: runGossipPeerings}, {name: "gossip/restart", fn: runGossipRestart}, { name: "gossip/restart-node-one", diff --git a/pkg/cmd/roachtest/tests/gossip.go b/pkg/cmd/roachtest/tests/gossip.go index 7d2d73e695c8..b0c616bc4628 100644 --- a/pkg/cmd/roachtest/tests/gossip.go +++ b/pkg/cmd/roachtest/tests/gossip.go @@ -285,6 +285,7 @@ func runGossipPeerings(ctx context.Context, t test.Test, c cluster.Cluster) { deadline := timeutil.Now().Add(time.Minute) for i := 1; timeutil.Now().Before(deadline); i++ { + WaitForReady(ctx, t, c, c.All()) if err := g.check(ctx, c, g.hasPeers(c.Spec().NodeCount)); err != nil { t.Fatal(err) } @@ -308,8 +309,6 @@ func runGossipPeerings(ctx context.Context, t test.Test, c cluster.Cluster) { } func runGossipRestart(ctx context.Context, t test.Test, c cluster.Cluster) { - t.Skip("skipping flaky acceptance/gossip/restart", "https://github.com/cockroachdb/cockroach/issues/48423") - c.Put(ctx, t.Cockroach(), "./cockroach") c.Start(ctx, t.L(), option.DefaultStartOpts(), install.MakeClusterSettings()) @@ -322,6 +321,7 @@ func runGossipRestart(ctx context.Context, t test.Test, c cluster.Cluster) { deadline := timeutil.Now().Add(time.Minute) for i := 1; timeutil.Now().Before(deadline); i++ { + WaitForReady(ctx, t, c, c.All()) g.checkConnectedAndFunctional(ctx, t, c) t.L().Printf("%d: OK\n", i)