From e5d33a197a4a88a9c58360881690f32b64c586ce Mon Sep 17 00:00:00 2001
From: Alex Sarkesian
Date: Fri, 1 Jul 2022 18:56:54 -0400
Subject: [PATCH] roachtest: unskip large decommissionBench test

This extends the timeout of the large, 3000-warehouse decommission
benchmark roachtest to 3 hours, since it can take up to an hour for the
test to import data, achieve range count balance, and ramp up its
workload. The test was skipped in #83445 due to frequent timeouts at
the 1-hour mark. This change also adds a `--max-rate` parameter to the
workload generator to keep the cluster from becoming overloaded.

Release note: None
---
 pkg/cmd/roachtest/tests/decommissionbench.go | 17 +++++++++++++----
 pkg/cmd/roachtest/tests/tpcc.go              | 12 +++++++++---
 2 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/pkg/cmd/roachtest/tests/decommissionbench.go b/pkg/cmd/roachtest/tests/decommissionbench.go
index 8d0bdf6d646c..37447771c488 100644
--- a/pkg/cmd/roachtest/tests/decommissionbench.go
+++ b/pkg/cmd/roachtest/tests/decommissionbench.go
@@ -44,6 +44,9 @@ type decommissionBenchSpec struct {
 	// When true, the test will attempt to stop the node prior to decommission.
 	whileDown bool
 
+	// An override for the default timeout, if needed.
+	timeout time.Duration
+
 	skip string
 }
 
@@ -86,7 +89,9 @@ func registerDecommissionBench(r registry.Registry) {
 			warehouses:       3000,
 			load:             true,
 			admissionControl: true,
-			skip:             "https://github.com/cockroachdb/cockroach/issues/82870",
+			// This test can take nearly an hour to import and achieve balance, so
+			// we extend the timeout to let it complete.
+			timeout: 3 * time.Hour,
 		},
 	} {
 		registerDecommissionBenchSpec(r, benchSpec)
@@ -96,6 +101,9 @@ func registerDecommissionBench(r registry.Registry) {
 // registerDecommissionBenchSpec adds a test using the specified configuration to the registry.
 func registerDecommissionBenchSpec(r registry.Registry, benchSpec decommissionBenchSpec) {
 	timeout := defaultTimeout
+	if benchSpec.timeout != time.Duration(0) {
+		timeout = benchSpec.timeout
+	}
 
 	extraNameParts := []string{""}
 	if benchSpec.snapshotRate != 0 {
@@ -166,12 +174,13 @@ func runDecommissionBench(
 		c.Start(ctx, t.L(), startOpts, install.MakeClusterSettings(), c.Node(i))
 	}
 
+	maxRate := tpccMaxRate(benchSpec.warehouses)
 	rampDuration := 3 * time.Minute
 	rampStarted := make(chan struct{}, 1)
 	importCmd := fmt.Sprintf(`./cockroach workload fixtures import tpcc --warehouses=%d`,
 		benchSpec.warehouses)
-	workloadCmd := fmt.Sprintf("./workload run tpcc --warehouses=%d --duration=%s "+
-		"--histograms=%s/stats.json --ramp=%s --tolerate-errors {pgurl:1-%d}", benchSpec.warehouses,
+	workloadCmd := fmt.Sprintf("./workload run tpcc --warehouses=%d --max-rate=%d --duration=%s "+
+		"--histograms=%s/stats.json --ramp=%s --tolerate-errors {pgurl:1-%d}", benchSpec.warehouses, maxRate,
 		testTimeout, t.PerfArtifactsDir(), rampDuration, benchSpec.nodes)
 	t.Status(fmt.Sprintf("initializing cluster with %d warehouses", benchSpec.warehouses))
 	c.Run(ctx, c.Node(pinnedNode), importCmd)
@@ -230,7 +239,7 @@ func runDecommissionBench(
 	// per-second "tick", we will simply tick at the start of the decommission
 	// and again at the completion. Roachperf will use the elapsed time between
 	// these ticks to plot the duration of the decommission.
-	tick, perfBuf := initBulkJobPerfArtifacts("decommission", defaultTimeout)
+	tick, perfBuf := initBulkJobPerfArtifacts("decommission", testTimeout)
 	recorder := &decommBenchTicker{pre: tick, post: tick}
 
 	m.Go(func(ctx context.Context) error {
diff --git a/pkg/cmd/roachtest/tests/tpcc.go b/pkg/cmd/roachtest/tests/tpcc.go
index a7aa3f24d588..c0685eb054e3 100644
--- a/pkg/cmd/roachtest/tests/tpcc.go
+++ b/pkg/cmd/roachtest/tests/tpcc.go
@@ -291,6 +291,14 @@ var tpccSupportedWarehouses = []struct {
 	{hardware: "gce-n5cpu16", v: version.MustParse(`v2.1.0-0`), warehouses: 1300},
 }
 
+// tpccMaxRate calculates the max rate of the workload given a number of warehouses.
+func tpccMaxRate(warehouses int) int {
+	const txnsPerWarehousePerSecond = 12.8 * (23.0 / 10.0) * (1.0 / 60.0) // max_tpmC/warehouse * all_txns/new_order_txns * minutes/seconds
+	rateAtExpected := txnsPerWarehousePerSecond * float64(warehouses)
+	maxRate := int(rateAtExpected / 2)
+	return maxRate
+}
+
 func maxSupportedTPCCWarehouses(
 	buildVersion version.Version, cloud string, nodes spec.ClusterSpec,
 ) int {
@@ -1016,9 +1024,7 @@ func loadTPCCBench(
 	// the desired distribution. This should allow for load-based rebalancing to
 	// help distribute load. Optionally pass some load configuration-specific
 	// flags.
-	const txnsPerWarehousePerSecond = 12.8 * (23.0 / 10.0) * (1.0 / 60.0) // max_tpmC/warehouse * all_txns/new_order_txns * minutes/seconds
-	rateAtExpected := txnsPerWarehousePerSecond * float64(b.EstimatedMax)
-	maxRate := int(rateAtExpected / 2)
+	maxRate := tpccMaxRate(b.EstimatedMax)
 	rampTime := (1 * rebalanceWait) / 4
 	loadTime := (3 * rebalanceWait) / 4
 	cmd = fmt.Sprintf("./cockroach workload run tpcc --warehouses=%d --workers=%d --max-rate=%d "+
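
For reference (not part of the patch): a minimal standalone sketch of what the
new tpccMaxRate helper yields for the 3000-warehouse benchmark. The function
body and constants are taken from the diff above; the main wrapper and printed
flag string are only illustrative scaffolding.

package main

import "fmt"

// Mirrors the tpccMaxRate helper added in pkg/cmd/roachtest/tests/tpcc.go:
// 12.8 tpmC per warehouse (the TPC-C spec maximum) is scaled by the ratio of
// all transaction types to new-order transactions (23/10), converted from
// per-minute to per-second, and then halved to leave headroom against overload.
func tpccMaxRate(warehouses int) int {
	const txnsPerWarehousePerSecond = 12.8 * (23.0 / 10.0) * (1.0 / 60.0)
	rateAtExpected := txnsPerWarehousePerSecond * float64(warehouses)
	return int(rateAtExpected / 2)
}

func main() {
	// For the 3000-warehouse decommission benchmark this caps the workload at
	// roughly 736 transactions per second.
	fmt.Printf("--max-rate=%d\n", tpccMaxRate(3000))
}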