From c22a04735e48c8be4f174bdcf100ec521735f050 Mon Sep 17 00:00:00 2001 From: Nick Travers Date: Thu, 24 Mar 2022 22:10:51 +0000 Subject: [PATCH] roachtest: wait for ranges to replicate before filling disk Currently, the `disk-full` roachtest creates a cluster and immediately places a ballast file on one node, which causes it to crash. If this node is the only replica for a range containing a system table, when the node crashes due to a full disk, certain system queries may not complete. This results in the test being unable to make forward progress, as the one dead node prevents a system query from completing, and this query prevents the node from being restarted. Wait for all ranges to have at least two replicas before placing the ballast file on the one node. Touches #78337, #78270. Release note: None. --- pkg/cmd/roachtest/tests/disk_full.go | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pkg/cmd/roachtest/tests/disk_full.go b/pkg/cmd/roachtest/tests/disk_full.go index 603ab8571858..abb652d296a4 100644 --- a/pkg/cmd/roachtest/tests/disk_full.go +++ b/pkg/cmd/roachtest/tests/disk_full.go @@ -22,6 +22,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/cockroach/pkg/util/timeutil" + "github.com/stretchr/testify/require" ) func registerDiskFull(r registry.Registry) { @@ -38,6 +39,17 @@ func registerDiskFull(r registry.Registry) { c.Put(ctx, t.Cockroach(), "./cockroach", c.Range(1, c.Spec().NodeCount)) c.Start(ctx, t.L(), option.DefaultStartOpts(), install.MakeClusterSettings(), c.Range(1, nodes)) + // Node 1 will soon be killed, when the ballast file fills up its disk. To + // ensure that the ranges containing system tables are available on other + // nodes, we wait here for at least two replicas of each range. 
Without + // this, it's possible that we end up deadlocked on a system query that + // requires a range on node 1, but node 1 will not restart until the query + // completes. + db := c.Conn(ctx, t.L(), 1) + err := WaitForReplication(ctx, t, db, 2) + require.NoError(t, err) + _ = db.Close() + t.Status("running workload") m := c.NewMonitor(ctx, c.Range(1, nodes)) m.Go(func(ctx context.Context) error {