From d8976d94f355f5ae3144e8e7ae7ba62fe083fe1d Mon Sep 17 00:00:00 2001 From: Andrew Baptist Date: Fri, 14 Oct 2022 15:45:07 -0400 Subject: [PATCH] roachtest: Introduce a test to overwhelm nodes This test is here to check behavior of the system as the SQL load greatly exceeds what the nodes are able to handle. In the future we want to evaluate how this is handled, but today this will cause nodes to OOM. Informs #89142. Release note: None --- pkg/cmd/roachtest/tests/admission_control.go | 1 + .../tests/admission_control_tpcc_overload.go | 37 +++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/pkg/cmd/roachtest/tests/admission_control.go b/pkg/cmd/roachtest/tests/admission_control.go index 4d3835f14493..a6bf482efd9c 100644 --- a/pkg/cmd/roachtest/tests/admission_control.go +++ b/pkg/cmd/roachtest/tests/admission_control.go @@ -32,6 +32,7 @@ func registerAdmission(r registry.Registry) { registerMultiStoreOverload(r) registerSnapshotOverload(r) registerTPCCOverload(r) + registerTPCCSevereOverload(r) // TODO(irfansharif): Once registerMultiTenantFairness is unskipped and // observed to be non-flaky for 3-ish months, transfer ownership to the AC diff --git a/pkg/cmd/roachtest/tests/admission_control_tpcc_overload.go b/pkg/cmd/roachtest/tests/admission_control_tpcc_overload.go index 0117163f2845..01a3d8fac016 100644 --- a/pkg/cmd/roachtest/tests/admission_control_tpcc_overload.go +++ b/pkg/cmd/roachtest/tests/admission_control_tpcc_overload.go @@ -17,9 +17,11 @@ import ( "time" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" + "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/cockroach/pkg/ts/tspb" "github.com/cockroachdb/cockroach/pkg/util/retry" 
"github.com/cockroachdb/cockroach/pkg/util/timeutil" @@ -144,3 +146,38 @@ func registerTPCCOverload(r registry.Registry) { }) } } + +// This test begins a ramping TPCC workload that will overwhelm the CRDB nodes. +// There is no way to "pass" this test since the 6 nodes can't possibly handle +// 10K warehouses. If they could handle this load, then the test should be +// changed to increase that count. The purpose of the test is to make sure that +// the nodes don't fail under unsustainable overload. As of today (v22.2), the +// CRDB nodes will eventually OOM around 3-4 hours through the ramp period. +func registerTPCCSevereOverload(r registry.Registry) { + r.Add(registry.TestSpec{ + Name: "admission-control/tpcc-severe-overload", + Owner: registry.OwnerAdmissionControl, + // TODO(abaptist): This test will require a lot of admission control work + // to pass. Just putting it here to make easy to run at any time. + Skip: "#89142", + Cluster: r.MakeClusterSpec(7, spec.CPU(8)), + Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { + roachNodes := c.Range(1, c.Spec().NodeCount-1) + workloadNode := c.Spec().NodeCount + + c.Put(ctx, t.Cockroach(), "./cockroach", c.All()) + c.Start(ctx, t.L(), option.DefaultStartOpts(), install.MakeClusterSettings(), roachNodes) + + t.Status("initializing (~1h)") + c.Run(ctx, c.Node(workloadNode), "./cockroach workload fixtures import tpcc --checks=false --warehouses=10000 {pgurl:1}") + + // This run passes through 4 "phases" + // 1) No admission control, low latencies (up to ~1500 warehouses). + // 2) Admission control delays, growing latencies (up to ~3000 warehouses). + // 3) High latencies (100s+), queues building (up to ~4500 warehouse). + // 4) Memory and goroutine unbounded growth with eventual node crashes (up to ~6000 warehouse). 
+ t.Status("running workload (fails in ~3-4 hours)") + c.Run(ctx, c.Node(workloadNode), "./cockroach workload run tpcc --ramp=6h --tolerate-errors --warehouses=10000 '{pgurl:1-6}'") + }, + }) +}