diff --git a/pkg/cmd/roachtest/tests/admission_control.go b/pkg/cmd/roachtest/tests/admission_control.go index 4d3835f14493..a6bf482efd9c 100644 --- a/pkg/cmd/roachtest/tests/admission_control.go +++ b/pkg/cmd/roachtest/tests/admission_control.go @@ -32,6 +32,7 @@ func registerAdmission(r registry.Registry) { registerMultiStoreOverload(r) registerSnapshotOverload(r) registerTPCCOverload(r) + registerTPCCSevereOverload(r) // TODO(irfansharif): Once registerMultiTenantFairness is unskipped and // observed to be non-flaky for 3-ish months, transfer ownership to the AC diff --git a/pkg/cmd/roachtest/tests/admission_control_tpcc_overload.go b/pkg/cmd/roachtest/tests/admission_control_tpcc_overload.go index 0117163f2845..01a3d8fac016 100644 --- a/pkg/cmd/roachtest/tests/admission_control_tpcc_overload.go +++ b/pkg/cmd/roachtest/tests/admission_control_tpcc_overload.go @@ -17,9 +17,11 @@ import ( "time" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" + "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/cockroach/pkg/ts/tspb" "github.com/cockroachdb/cockroach/pkg/util/retry" "github.com/cockroachdb/cockroach/pkg/util/timeutil" @@ -144,3 +146,38 @@ func registerTPCCOverload(r registry.Registry) { }) } } + +// This test begins a ramping TPCC workload that will overwhelm the CRDB nodes. +// There is no way to "pass" this test since the 6 nodes can't possibly handle +// 10K warehouses. If they could handle this load, then the test should be +// changed to increase that count. The purpose of the test is to make sure that +// the nodes don't fail under unsustainable overload. As of today (v22.2), the +// CRDB nodes will eventually OOM around 3-4 hours through the ramp period. +func registerTPCCSevereOverload(r registry.Registry) { + r.Add(registry.TestSpec{ + Name: "admission-control/tpcc-severe-overload", + Owner: registry.OwnerAdmissionControl, + // TODO(abaptist): This test will require a lot of admission control work + // to pass. Just putting it here to make easy to run at any time. + Skip: "#89142", + Cluster: r.MakeClusterSpec(7, spec.CPU(8)), + Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { + roachNodes := c.Range(1, c.Spec().NodeCount-1) + workloadNode := c.Spec().NodeCount + + c.Put(ctx, t.Cockroach(), "./cockroach", c.All()) + c.Start(ctx, t.L(), option.DefaultStartOpts(), install.MakeClusterSettings(), roachNodes) + + t.Status("initializing (~1h)") + c.Run(ctx, c.Node(workloadNode), "./cockroach workload fixtures import tpcc --checks=false --warehouses=10000 {pgurl:1}") + + // This run passes through 4 "phases" + // 1) No admission control, low latencies (up to ~1500 warehouses). + // 2) Admission control delays, growing latencies (up to ~3000 warehouses). + // 3) High latencies (100s+), queues building (up to ~4500 warehouse). + // 4) Memory and goroutine unbounded growth with eventual node crashes (up to ~6000 warehouse). + t.Status("running workload (fails in ~3-4 hours)") + c.Run(ctx, c.Node(workloadNode), "./cockroach workload run tpcc --ramp=6h --tolerate-errors --warehouses=10000 '{pgurl:1-6}'") + }, + }) +}