From 3b0775abf9a576692e020ef671cf5cbf84cf5f21 Mon Sep 17 00:00:00 2001 From: Nick Travers Date: Thu, 24 Mar 2022 22:10:51 +0000 Subject: [PATCH] roachtest: wait for ranges to replicate before filling disk Currently, the `disk-full` roachtest creates a cluster and immediately places a ballast file on one node, which causes it to crash. If this node is the only replica for a range containing a system table, when the node crashes due to a full disk, certain system queries may not complete. This results in the test being unable to make forward progress, as the one dead node prevents a system query from completing, and this query prevents the node from being restarted. Wait for all ranges to have at least two replicas before placing the ballast file on the one node. Touches #78337, #78270. Release note: None. --- pkg/cmd/roachtest/tests/disk_full.go | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pkg/cmd/roachtest/tests/disk_full.go b/pkg/cmd/roachtest/tests/disk_full.go index e76cfa44e542..6631fd800537 100644 --- a/pkg/cmd/roachtest/tests/disk_full.go +++ b/pkg/cmd/roachtest/tests/disk_full.go @@ -21,6 +21,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/util/timeutil" + "github.com/stretchr/testify/require" ) func registerDiskFull(r registry.Registry) { @@ -37,6 +38,17 @@ func registerDiskFull(r registry.Registry) { c.Put(ctx, t.Cockroach(), "./cockroach", c.Range(1, c.Spec().NodeCount)) c.Start(ctx, c.Range(1, nodes)) + // Node 1 will soon be killed, when the ballast file fills up its disk. To + // ensure that the ranges containing system tables are available on other + // nodes, we wait here for at least two replicas of each range. Without + // this, it's possible that we end up deadlocked on a system query that + // requires a range on node 1, but node 1 will not restart until the query + // completes. 
+ db := c.Conn(ctx, t.L(), 1) + err := WaitForReplication(ctx, t, db, 2) + require.NoError(t, err) + _ = db.Close() + t.Status("running workload") m := c.NewMonitor(ctx, c.Range(1, nodes)) m.Go(func(ctx context.Context) error {