From 56125990f65e7d20bc147fefe76d075150f4c71a Mon Sep 17 00:00:00 2001 From: Renato Costa Date: Thu, 11 Jul 2024 19:48:43 +0000 Subject: [PATCH] roachtest: update rebalance-load mixed-version tests for shared-process In this commit, we update the `rebalance/by-load` mixed-version tests so that they can run on shared-process deployments. As usual, we need to enable some features on tenants before initializing the workload. In addition, we pass the virtual cluster cookie when fetching system metrics. Informs: #127378 Release note: None --- .../tests/admission_control_tpcc_overload.go | 2 +- pkg/cmd/roachtest/tests/disk_stall.go | 6 ++-- pkg/cmd/roachtest/tests/rebalance_load.go | 35 ++++++++++--------- pkg/cmd/roachtest/tests/ts_util.go | 17 ++++++--- 4 files changed, 35 insertions(+), 25 deletions(-) diff --git a/pkg/cmd/roachtest/tests/admission_control_tpcc_overload.go b/pkg/cmd/roachtest/tests/admission_control_tpcc_overload.go index e0c63bae5127..1432265d09d9 100644 --- a/pkg/cmd/roachtest/tests/admission_control_tpcc_overload.go +++ b/pkg/cmd/roachtest/tests/admission_control_tpcc_overload.go @@ -121,7 +121,7 @@ func verifyNodeLiveness( if err := retry.WithMaxAttempts(ctx, retry.Options{ MaxBackoff: 500 * time.Millisecond, }, 60, func() (err error) { - response, err = getMetrics(ctx, c, t, adminURLs[0], now.Add(-runDuration), now, []tsQuery{ + response, err = getMetrics(ctx, c, t, adminURLs[0], "", now.Add(-runDuration), now, []tsQuery{ { name: "cr.node.liveness.heartbeatfailures", queryType: total, diff --git a/pkg/cmd/roachtest/tests/disk_stall.go b/pkg/cmd/roachtest/tests/disk_stall.go index b160da88f27b..999ef47a6f54 100644 --- a/pkg/cmd/roachtest/tests/disk_stall.go +++ b/pkg/cmd/roachtest/tests/disk_stall.go @@ -159,7 +159,7 @@ func runDiskStalledWALFailover( t.Fatal("process exited unexectedly") } - data := mustGetMetrics(ctx, c, t, adminURL, + data := mustGetMetrics(ctx, c, t, adminURL, install.SystemInterfaceName, workloadStartAt.Add(5*time.Minute), timeutil.Now().Add(-time.Minute), []tsQuery{ @@ -308,7 +308,7 @@ func runDiskStalledDetection( } stallAt := timeutil.Now() - response := mustGetMetrics(ctx, c, t, adminURL, workloadStartAt, stallAt, []tsQuery{ + response := mustGetMetrics(ctx, c, t, adminURL, install.SystemInterfaceName, workloadStartAt, stallAt, []tsQuery{ {name: "cr.node.sql.query.count", queryType: total}, }) cum := response.Results[0].Datapoints @@ -360,7 +360,7 @@ func runDiskStalledDetection( { now := timeutil.Now() - response := mustGetMetrics(ctx, c, t, adminURL, workloadStartAt, now, []tsQuery{ + response := mustGetMetrics(ctx, c, t, adminURL, install.SystemInterfaceName, workloadStartAt, now, []tsQuery{ {name: "cr.node.sql.query.count", queryType: total}, }) cum := response.Results[0].Datapoints diff --git a/pkg/cmd/roachtest/tests/rebalance_load.go b/pkg/cmd/roachtest/tests/rebalance_load.go index 36343751aa73..dc2558108ae7 100644 --- a/pkg/cmd/roachtest/tests/rebalance_load.go +++ b/pkg/cmd/roachtest/tests/rebalance_load.go @@ -99,13 +99,15 @@ func registerRebalanceLoad(r registry.Registry) { mixedversion.ClusterSettingOption( install.ClusterSettingsOption(settings.ClusterSettings), ), - // Multi-tenant deployments are currently unsupported. See #127378. - mixedversion.EnabledDeploymentModes(mixedversion.SystemOnlyDeployment), ) + mvt.OnStartup("maybe enable split/scatter on tenant", + func(ctx context.Context, l *logger.Logger, r *rand.Rand, h *mixedversion.Helper) error { + return enableTenantSplitScatter(l, r, h) + }) mvt.InMixedVersion("rebalance load run", func(ctx context.Context, l *logger.Logger, r *rand.Rand, h *mixedversion.Helper) error { return rebalanceByLoad( - ctx, t, c, rebalanceMode, maxDuration, concurrency, appNode, numStores, numNodes) + ctx, t, l, c, rebalanceMode, maxDuration, concurrency, appNode, numStores, numNodes) }) mvt.Run() } else { @@ -122,7 +124,7 @@ func registerRebalanceLoad(r registry.Registry) { settings.ClusterSettings["server.cpu_profile.cpu_usage_combined_threshold"] = "90" c.Start(ctx, t.L(), startOpts, settings, roachNodes) require.NoError(t, rebalanceByLoad( - ctx, t, c, rebalanceMode, maxDuration, + ctx, t, t.L(), c, rebalanceMode, maxDuration, concurrency, appNode, numStores, numNodes, )) } @@ -193,7 +195,7 @@ func registerRebalanceLoad(r registry.Registry) { Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { if c.IsLocal() { concurrency = 32 - fmt.Printf("lowering concurrency to %d in local testing\n", concurrency) + t.L().Printf("lowering concurrency to %d in local testing", concurrency) } rebalanceLoadRun( ctx, t, c, "leases and replicas", 10*time.Minute, concurrency, true, /* mixedVersion */ @@ -231,6 +233,7 @@ func registerRebalanceLoad(r registry.Registry) { func rebalanceByLoad( ctx context.Context, t test.Test, + l *logger.Logger, c cluster.Cluster, rebalanceMode string, maxDuration time.Duration, @@ -246,10 +249,10 @@ func rebalanceByLoad( splits := (numStores * storeToRangeFactor) - 1 c.Run(ctx, option.WithNodes(appNode), fmt.Sprintf("./cockroach workload init kv --drop --splits=%d {pgurl:1}", splits)) - db := c.Conn(ctx, t.L(), 1) + db := c.Conn(ctx, l, 1) defer db.Close() - require.NoError(t, WaitFor3XReplication(ctx, t, t.L(), db)) + require.NoError(t, WaitFor3XReplication(ctx, t, l, db)) var m *errgroup.Group m, ctx = errgroup.WithContext(ctx) @@ -260,7 +263,7 @@ func rebalanceByLoad( ctx, cancel := context.WithCancel(ctx) m.Go(func() error { - t.L().Printf("starting load generator\n") + l.Printf("starting load generator") err := c.RunE(ctx, option.WithNodes(appNode), fmt.Sprintf( "./cockroach workload run kv --read-percent=95 --tolerate-errors --concurrency=%d "+ "--duration=%v {pgurl:1-%d}", @@ -275,9 +278,9 @@ func rebalanceByLoad( }) m.Go(func() error { - t.Status("checking for CPU balance") + l.Printf("checking for CPU balance") - storeCPUFn, err := makeStoreCPUFn(ctx, c, t, numNodes, numStores) + storeCPUFn, err := makeStoreCPUFn(ctx, t, l, c, numNodes, numStores) if err != nil { return err } @@ -297,18 +300,18 @@ func rebalanceByLoad( now := timeutil.Now() clusterStoresCPU, err := storeCPUFn(ctx) if err != nil { - t.L().Printf("unable to get the cluster stores CPU %s\n", err.Error()) + l.Printf("unable to get the cluster stores CPU: %v", err) continue } var curIsBalanced bool curIsBalanced, reason = isLoadEvenlyDistributed(clusterStoresCPU, meanCPUTolerance) - t.L().Printf("cpu %s", reason) + l.Printf("cpu %s", reason) if !prevIsBalanced && curIsBalanced { balancedStartTime = now } prevIsBalanced = curIsBalanced if prevIsBalanced && now.Sub(balancedStartTime) > stableDuration { - t.Status("successfully achieved CPU balance; waiting for kv to finish running") + l.Printf("successfully achieved CPU balance; waiting for kv to finish running") cancel() return nil } @@ -322,9 +325,9 @@ func rebalanceByLoad( // the cluster stores. When there are multiple stores per node, stores on the // same node will report identical CPU. func makeStoreCPUFn( - octx context.Context, c cluster.Cluster, t test.Test, numNodes, numStores int, + ctx context.Context, t test.Test, l *logger.Logger, c cluster.Cluster, numNodes, numStores int, ) (func(ctx context.Context) ([]float64, error), error) { - adminURLs, err := c.ExternalAdminUIAddr(octx, t.L(), c.Node(1)) + adminURLs, err := c.ExternalAdminUIAddr(ctx, l, c.Node(1)) if err != nil { return nil, err } @@ -342,7 +345,7 @@ func makeStoreCPUFn( return func(ctx context.Context) ([]float64, error) { now := timeutil.Now() resp, err := getMetricsWithSamplePeriod( - ctx, c, t, url, startTime, now, statSamplePeriod, tsQueries) + ctx, c, t, url, install.SystemInterfaceName, startTime, now, statSamplePeriod, tsQueries) if err != nil { return nil, err } diff --git a/pkg/cmd/roachtest/tests/ts_util.go b/pkg/cmd/roachtest/tests/ts_util.go index e922c7247621..ce314f9f71c7 100644 --- a/pkg/cmd/roachtest/tests/ts_util.go +++ b/pkg/cmd/roachtest/tests/ts_util.go @@ -18,6 +18,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" + "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/cockroach/pkg/ts/tspb" ) @@ -51,10 +52,11 @@ func mustGetMetrics( c cluster.Cluster, t test.Test, adminURL string, + virtualCluster string, start, end time.Time, tsQueries []tsQuery, ) tspb.TimeSeriesQueryResponse { - response, err := getMetrics(ctx, c, t, adminURL, start, end, tsQueries) + response, err := getMetrics(ctx, c, t, adminURL, virtualCluster, start, end, tsQueries) if err != nil { t.Fatal(err) } @@ -66,10 +68,11 @@ func getMetrics( c cluster.Cluster, t test.Test, adminURL string, + virtualCluster string, start, end time.Time, tsQueries []tsQuery, ) (tspb.TimeSeriesQueryResponse, error) { - return getMetricsWithSamplePeriod(ctx, c, t, adminURL, start, end, defaultSamplePeriod, tsQueries) + return getMetricsWithSamplePeriod(ctx, c, t, adminURL, virtualCluster, start, end, defaultSamplePeriod, tsQueries) } func getMetricsWithSamplePeriod( @@ -77,6 +80,7 @@ func getMetricsWithSamplePeriod( c cluster.Cluster, t test.Test, adminURL string, + virtualCluster string, start, end time.Time, samplePeriod time.Duration, tsQueries []tsQuery, @@ -114,7 +118,10 @@ func getMetricsWithSamplePeriod( Queries: queries, } var response tspb.TimeSeriesQueryResponse - client := roachtestutil.DefaultHTTPClient(c, t.L(), roachtestutil.HTTPTimeout(500*time.Millisecond)) + client := roachtestutil.DefaultHTTPClient( + c, t.L(), roachtestutil.HTTPTimeout(500*time.Millisecond), + roachtestutil.VirtualCluster(virtualCluster), + ) err := client.PostProtobuf(ctx, url, &request, &response) return response, err @@ -134,7 +141,7 @@ func verifyTxnPerSecond( t.Fatal(err) } adminURL := adminUIAddrs[0] - response := mustGetMetrics(ctx, c, t, adminURL, start, end, []tsQuery{ + response := mustGetMetrics(ctx, c, t, adminURL, install.SystemInterfaceName, start, end, []tsQuery{ {name: "cr.node.txn.commits", queryType: rate}, {name: "cr.node.txn.commits", queryType: total}, }) @@ -185,7 +192,7 @@ func verifyLookupsPerSec( t.Fatal(err) } adminURL := adminUIAddrs[0] - response := mustGetMetrics(ctx, c, t, adminURL, start, end, []tsQuery{ + response := mustGetMetrics(ctx, c, t, adminURL, install.SystemInterfaceName, start, end, []tsQuery{ {name: "cr.node.distsender.rangelookups", queryType: rate}, })