From 97f17ff3f71735a0947fe6a423893a7917f5411e Mon Sep 17 00:00:00 2001 From: Bilal Akhtar Date: Fri, 7 Jul 2023 16:09:42 -0400 Subject: [PATCH] cmd/roachtest: add disagg-rebalance roachtest This test adds a roachtest that spins up a cluster with 3 nodes using S3 as the --experimental-shared-storage, and then adds a fourth node after loading a tpcc fixture and with a foreground workload running on it. It confirms the fourth node gets hydrated without transferring all live bytes over the wire. Epic: none Fixes: #103030 Release note: None --- pkg/cmd/roachtest/tests/BUILD.bazel | 1 + pkg/cmd/roachtest/tests/disagg_rebalance.go | 135 ++++++++++++++++++++ pkg/cmd/roachtest/tests/registry.go | 1 + 3 files changed, 137 insertions(+) create mode 100644 pkg/cmd/roachtest/tests/disagg_rebalance.go diff --git a/pkg/cmd/roachtest/tests/BUILD.bazel b/pkg/cmd/roachtest/tests/BUILD.bazel index 9a8d81b6c65f..e29a40e21d79 100644 --- a/pkg/cmd/roachtest/tests/BUILD.bazel +++ b/pkg/cmd/roachtest/tests/BUILD.bazel @@ -48,6 +48,7 @@ go_library( "decommission.go", "decommission_self.go", "decommissionbench.go", + "disagg_rebalance.go", "disk_full.go", "disk_stall.go", "django.go", diff --git a/pkg/cmd/roachtest/tests/disagg_rebalance.go b/pkg/cmd/roachtest/tests/disagg_rebalance.go new file mode 100644 index 000000000000..f0cfe8855707 --- /dev/null +++ b/pkg/cmd/roachtest/tests/disagg_rebalance.go @@ -0,0 +1,135 @@ +// Copyright 2023 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package tests + +import ( + "context" + "fmt" + "time" + + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" + "github.com/cockroachdb/cockroach/pkg/roachprod/install" + "github.com/cockroachdb/cockroach/pkg/testutils" + "github.com/dustin/go-humanize" +) + +func registerDisaggRebalance(r registry.Registry) { + disaggRebalanceSpec := r.MakeClusterSpec(4) + r.Add(registry.TestSpec{ + Name: fmt.Sprintf("disagg-rebalance/aws/%s", disaggRebalanceSpec), + Tags: registry.Tags("aws"), + Owner: registry.OwnerStorage, + Cluster: disaggRebalanceSpec, + EncryptionSupport: registry.EncryptionAlwaysDisabled, + Timeout: 1 * time.Hour, + Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { + if c.Spec().Cloud != spec.AWS { + t.Skip("disagg-rebalance is only configured to run on AWS") + } + c.Put(ctx, t.Cockroach(), "./cockroach") + s3dir := fmt.Sprintf("s3://%s/disagg-rebalance/%s?AUTH=implicit", testutils.BackupTestingBucket(), c.Name()) + startOpts := option.DefaultStartOptsNoBackups() + startOpts.RoachprodOpts.ExtraArgs = append(startOpts.RoachprodOpts.ExtraArgs, fmt.Sprintf("--experimental-shared-storage=%s", s3dir)) + c.Start(ctx, t.L(), startOpts, install.MakeClusterSettings(), c.Range(1, 3)) + + initialWaitDuration := 2 * time.Minute + warehouses := 20 + + t.Status("workload initialization") + cmd := fmt.Sprintf( + "./cockroach workload fixtures import tpcc --warehouses=%d {pgurl:1}", + warehouses, + ) + m := c.NewMonitor(ctx, c.Range(1, 3)) + m.Go(func(ctx context.Context) error { + return c.RunE(ctx, c.Node(1), cmd) + }) + m.Wait() + + m2 := c.NewMonitor(ctx, c.Range(1, 3)) + + m2.Go(func(ctx context.Context) error { + t.Status("run tpcc") + + cmd := fmt.Sprintf( + "./cockroach workload run tpcc --warehouses=%d --duration=10m {pgurl:1-3}", + warehouses, + ) + + return c.RunE(ctx, c.Node(1), cmd) + }) + + select { + case <-time.After(initialWaitDuration): + case <-ctx.Done(): + return + } + + t.Status("starting fourth node") + // Start the fourth node. + c.Start(ctx, t.L(), startOpts, install.MakeClusterSettings(), c.Node(4)) + + t.Status("verify rebalance") + + db := c.Conn(ctx, t.L(), 4) + defer func() { + _ = db.Close() + }() + + if err := waitForRebalance(ctx, t.L(), db, 10 /* maxStdDev */, 20 /* stableSeconds */); err != nil { + t.Fatal(err) + } + + var count int + if err := db.QueryRow( + // Check if the down node has any replicas. + "SELECT count(*) FROM crdb_internal.ranges WHERE array_position(replicas, $1) IS NOT NULL", + 4, + ).Scan(&count); err != nil { + t.Fatal(err) + } + if count < 10 { + t.Fatalf("did not replicate to n4 quickly enough, only found %d replicas", count) + } + + var bytesInRanges int64 + if err := db.QueryRow( + "SELECT sum(used) "+ + "FROM crdb_internal.kv_store_status WHERE node_id = $1 GROUP BY node_id LIMIT 1", + 4, + ).Scan(&bytesInRanges); err != nil { + t.Fatal(err) + } + var bytesSnapshotted int64 + if err := db.QueryRow( + "SELECT metrics['range.snapshots.rcvd-bytes'] FROM crdb_internal.kv_store_status WHERE node_id = $1 LIMIT 1", + 4, + ).Scan(&bytesSnapshotted); err != nil { + t.Fatal(err) + } + + t.L().PrintfCtx(ctx, "got snapshot received bytes = %s, logical bytes in ranges = %s", humanize.IBytes(uint64(bytesSnapshotted)), humanize.IBytes(uint64(bytesInRanges))) + if bytesSnapshotted > bytesInRanges { + t.Fatalf("unexpected snapshot received bytes %d > bytes in all replicas on n4 %d, did not do a disaggregated rebalance?", bytesSnapshotted, bytesInRanges) + } + + t.Status("continue tpcc") + + if err := m2.WaitE(); err != nil { + t.Fatal(err) + } + }, + }) +} diff --git a/pkg/cmd/roachtest/tests/registry.go b/pkg/cmd/roachtest/tests/registry.go index b77a77c08a41..9ae91336788e 100644 --- a/pkg/cmd/roachtest/tests/registry.go +++ b/pkg/cmd/roachtest/tests/registry.go @@ -45,6 +45,7 @@ func RegisterTests(r registry.Registry) { registerCostFuzz(r) registerDecommission(r) registerDecommissionBench(r) + registerDisaggRebalance(r) registerDiskFull(r) registerDiskStalledDetection(r) registerDjango(r)