diff --git a/pkg/cmd/roachtest/cluster.go b/pkg/cmd/roachtest/cluster.go
index 2dd100483af4..73d14e3e8295 100644
--- a/pkg/cmd/roachtest/cluster.go
+++ b/pkg/cmd/roachtest/cluster.go
@@ -1940,11 +1940,23 @@ func (m *monitor) wait(args ...string) error {
 }
 
+// waitForFullReplication blocks until the smallest replica count reported by
+// crdb_internal.ranges is at least three, polling once per second. It calls
+// t.Fatal if the query or the scan of its result returns an error.
 func waitForFullReplication(t *test, db *gosql.DB) {
+	// Report progress at most once per logEvery so that a cluster which never
+	// up-replicates does not flood the test log with one line per poll.
+	const logEvery = 30 * time.Second
+	lastLog := timeutil.Now()
 	for ok := false; !ok; time.Sleep(time.Second) {
 		if err := db.QueryRow(
 			"SELECT min(array_length(replicas, 1)) >= 3 FROM crdb_internal.ranges",
 		).Scan(&ok); err != nil {
 			t.Fatal(err)
 		}
+		// Stay quiet on the iteration that just observed full replication.
+		if !ok && timeutil.Since(lastLog) > logEvery {
+			t.l.Printf("still waiting for full replication")
+			lastLog = timeutil.Now()
+		}
 	}
 }
diff --git a/pkg/cmd/roachtest/cluster_init.go b/pkg/cmd/roachtest/cluster_init.go
index 307758f292c8..3518b2cb1564 100644
--- a/pkg/cmd/roachtest/cluster_init.go
+++ b/pkg/cmd/roachtest/cluster_init.go
@@ -34,6 +34,13 @@ func runClusterInit(ctx context.Context, t *test, c *cluster) {
 
 	addrs := c.InternalAddr(ctx, c.All())
 
+	// TODO(tbg): this should never happen, but I saw it locally. The result
+	// is the test hanging forever, because all nodes will create their own
+	// single node cluster and waitForFullReplication never returns.
+	if addrs[0] == "" {
+		t.Fatal("no address for first node")
+	}
+
 	// Legacy-style init where we start node 1 without a join flag and then point
 	// the other nodes at it.
 	func() {