-
Notifications
You must be signed in to change notification settings - Fork 3.8k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
roachtest: introduce admission-control/elastic-cdc
Part of #89208. This test sets up a 3-node CRDB cluster on 8vCPU machines running 1000-warehouse TPC-C, and kicks off a few changefeed backfills concurrently. We've observed latency spikes during backfills because of its CPU/scan-heavy nature -- it can elevate CPU scheduling latencies which in turn translates to an increase in foreground latency. Also in this commit: routing std{err,out} from prometheus/grafana setup that roachtests do to the logger in scope. Release note: None
- Loading branch information
1 parent
fa47f7b
commit 42d08d8
Showing
5 changed files
with
172 additions
and
20 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
147 changes: 147 additions & 0 deletions
147
pkg/cmd/roachtest/tests/admission_control_elastic_cdc.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
// Copyright 2022 The Cockroach Authors. | ||
// | ||
// Use of this software is governed by the Business Source License | ||
// included in the file licenses/BSL.txt. | ||
// | ||
// As of the Change Date specified in that file, in accordance with | ||
// the Business Source License, use of this software will be governed | ||
// by the Apache License, Version 2.0, included in the file | ||
// licenses/APL.txt. | ||
|
||
package tests | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
"time" | ||
|
||
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" | ||
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" | ||
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" | ||
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" | ||
"github.com/cockroachdb/cockroach/pkg/roachprod/prometheus" | ||
) | ||
|
||
// This test sets up a 3-node CRDB cluster on 8vCPU machines running
// 1000-warehouse TPC-C, and kicks off a few changefeed backfills concurrently.
// We've observed latency spikes during backfills because of its CPU/scan-heavy
// nature -- it can elevate CPU scheduling latencies which in turn translates to
// an increase in foreground latency.
//
// registerElasticControlForCDC registers the admission-control/elastic-cdc
// roachtest with the given registry. The cluster spec asks for 4 nodes
// (3 CRDB + 1 workload/prometheus node) with 8 vCPUs each.
func registerElasticControlForCDC(r registry.Registry) {
	r.Add(registry.TestSpec{
		Name:  "admission-control/elastic-cdc",
		Owner: registry.OwnerAdmissionControl,
		// TODO(irfansharif): After two weeks of nightly baking time, reduce
		// this to a weekly cadence. This is a long-running test and serves only
		// as a coarse-grained benchmark.
		// Tags: []string{`weekly`},
		Cluster:         r.MakeClusterSpec(4, spec.CPU(8)),
		RequiresLicense: true,
		Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
			// The layout below assumes the last node is reserved for the
			// workload (and prometheus); everything else runs CRDB.
			if c.Spec().NodeCount < 4 {
				t.Fatalf("expected at least 4 nodes, found %d", c.Spec().NodeCount)
			}

			crdbNodes := c.Spec().NodeCount - 1 // nodes 1..crdbNodes run CRDB
			workloadNode := crdbNodes + 1       // final node runs the TPC-C workload
			numWarehouses, workloadDuration, estimatedSetupTime := 1000, 60*time.Minute, 10*time.Minute
			if c.IsLocal() {
				// Shrink everything drastically for local (dev-machine) runs.
				numWarehouses, workloadDuration, estimatedSetupTime = 1, time.Minute, 2*time.Minute
			}

			// Prometheus/grafana run on the workload node; node-exporter and
			// CRDB scraping cover the CRDB nodes (1..NodeCount-1). The
			// workload itself is scraped via the extra scrape config below.
			promCfg := &prometheus.Config{}
			promCfg.WithPrometheusNode(c.Node(workloadNode).InstallNodes()[0]).
				WithNodeExporter(c.Range(1, c.Spec().NodeCount-1).InstallNodes()).
				WithCluster(c.Range(1, c.Spec().NodeCount-1).InstallNodes()).
				WithGrafanaDashboard("http://go.crdb.dev/p/changefeed-admission-control-grafana").
				WithScrapeConfigs(
					prometheus.MakeWorkloadScrapeConfig("workload", "/",
						makeWorkloadScrapeNodes(
							c.Node(workloadNode).InstallNodes()[0],
							[]workloadInstance{{nodes: c.Node(workloadNode)}},
						),
					),
				)

			// NOTE(review): the "(<%s)" durations below are hard-coded
			// (time.Minute / 10*time.Minute) rather than derived from
			// estimatedSetupTime — presumably intentional as rough status
			// hints; confirm before relying on them for local runs.
			if t.SkipInit() {
				t.Status(fmt.Sprintf("running tpcc for %s (<%s)", workloadDuration, time.Minute))
			} else {
				t.Status(fmt.Sprintf("initializing + running tpcc for %s (<%s)", workloadDuration, 10*time.Minute))
			}

			// padDuration: quiet window per iteration so effects show up in
			// metrics; stopFeedsDuration: buffer for changefeed cancellations
			// to take effect. Both are shortened for local runs.
			padDuration, err := time.ParseDuration(ifLocal(c, "5s", "5m"))
			if err != nil {
				t.Fatal(err)
			}
			stopFeedsDuration, err := time.ParseDuration(ifLocal(c, "5s", "1m"))
			if err != nil {
				t.Fatal(err)
			}

			runTPCC(ctx, t, c, tpccOptions{
				Warehouses:         numWarehouses,
				Duration:           workloadDuration,
				SetupType:          usingImport,
				EstimatedSetupTime: estimatedSetupTime,
				SkipPostRunCheck:   true,
				ExtraSetupArgs:     "--checks=false",
				PrometheusConfig:   promCfg,
				// During runs concurrently with the TPC-C workload: it
				// repeatedly tears down and re-creates changefeed backfills
				// to exercise elastic admission control.
				During: func(ctx context.Context) error {
					db := c.Conn(ctx, t.L(), crdbNodes)
					defer db.Close()

					t.Status(fmt.Sprintf("configuring cluster (<%s)", 30*time.Second))
					{
						setAdmissionControl(ctx, t, c, true)

						// Changefeeds depend on rangefeeds being enabled.
						if _, err := db.Exec("SET CLUSTER SETTING kv.rangefeed.enabled = true"); err != nil {
							return err
						}
					}

					stopFeeds(db) // stop stray feeds (from repeated runs against the same cluster for ex.)
					defer stopFeeds(db)

					// Monitor only the CRDB nodes; the goroutine below drives
					// the changefeed churn for `iters` rounds.
					m := c.NewMonitor(ctx, c.Range(1, crdbNodes))
					m.Go(func(ctx context.Context) error {
						const iters, changefeeds = 5, 10
						for i := 0; i < iters; i++ {
							if i == 0 {
								t.Status(fmt.Sprintf("setting performance baseline (<%s)", padDuration))
							}
							time.Sleep(padDuration) // each iteration lasts long enough to observe effects in metrics

							t.Status(fmt.Sprintf("during: round %d: stopping extant changefeeds (<%s)", i, stopFeedsDuration))
							stopFeeds(db)
							time.Sleep(stopFeedsDuration) // buffer for cancellations to take effect/show up in metrics

							t.Status(fmt.Sprintf("during: round %d: creating %d changefeeds (<%s)", i, changefeeds, time.Minute))
							for j := 0; j < changefeeds; j++ {
								// The cursor reaches back (i+1) pad windows, so
								// each round's backfill re-scans progressively
								// more history.
								stmtWithCursor := fmt.Sprintf(`
									CREATE CHANGEFEED FOR tpcc.order_line, tpcc.stock, tpcc.customer
									INTO 'null://' WITH cursor = '-%ds'
								`, int64(float64(i+1)*padDuration.Seconds())) // scanning as far back as possible (~ when the workload started)
								if _, err := db.ExecContext(ctx, stmtWithCursor); err != nil {
									return err
								}
							}

							// TODO(irfansharif): Add a version of this test
							// with initial_scan = 'only' to demonstrate the
							// need+efficacy of using elastic CPU control in
							// changefeed workers. That too has a severe effect
							// on scheduling latencies.
						}
						return nil
					})

					t.Status(fmt.Sprintf("waiting for workload to finish (<%s)", workloadDuration))
					m.Wait()

					return nil
				},
			})
		},
	})
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters