Skip to content

Commit

Permalink
roachtest: Introduce a test to overwhelm nodes
Browse files Browse the repository at this point in the history
This test is here to check behavior of the system as the SQL load
greatly exceeds what the nodes are able to handle. In the future
we want to evaluate how this is handled, but today this will cause
nodes to OOM.

Informs cockroachdb#89142.

Release note: None
  • Loading branch information
andrewbaptist committed Oct 17, 2022
1 parent 98b1e0c commit d8976d9
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 0 deletions.
1 change: 1 addition & 0 deletions pkg/cmd/roachtest/tests/admission_control.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ func registerAdmission(r registry.Registry) {
registerMultiStoreOverload(r)
registerSnapshotOverload(r)
registerTPCCOverload(r)
registerTPCCSevereOverload(r)

// TODO(irfansharif): Once registerMultiTenantFairness is unskipped and
// observed to be non-flaky for 3-ish months, transfer ownership to the AC
Expand Down
37 changes: 37 additions & 0 deletions pkg/cmd/roachtest/tests/admission_control_tpcc_overload.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@ import (
"time"

"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
"github.com/cockroachdb/cockroach/pkg/ts/tspb"
"github.com/cockroachdb/cockroach/pkg/util/retry"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
Expand Down Expand Up @@ -144,3 +146,38 @@ func registerTPCCOverload(r registry.Registry) {
})
}
}

// registerTPCCSevereOverload registers the
// "admission-control/tpcc-severe-overload" roachtest, which drives a ramping
// TPCC workload that is expected to overwhelm the CRDB nodes.
//
// The test is not meant to "pass": the CRDB nodes in this cluster cannot
// sustain 10K warehouses, and if they ever could, the warehouse count should
// be raised. Its purpose is to verify that nodes do not fail under
// unsustainable overload. As of v22.2 the CRDB nodes eventually OOM roughly
// 3-4 hours into the ramp period, so the test is skipped (see #89142) and is
// kept registered only so that it is easy to run on demand.
func registerTPCCSevereOverload(r registry.Registry) {
	run := func(ctx context.Context, t test.Test, c cluster.Cluster) {
		// The last node of the cluster drives the workload; every node before
		// it runs CockroachDB.
		nodeCount := c.Spec().NodeCount
		crdbNodes := c.Range(1, nodeCount-1)
		loadNode := c.Node(nodeCount)

		// The binary goes everywhere because the workload node invokes
		// "./cockroach workload" as well.
		c.Put(ctx, t.Cockroach(), "./cockroach", c.All())
		c.Start(ctx, t.L(), option.DefaultStartOpts(), install.MakeClusterSettings(), crdbNodes)

		t.Status("initializing (~1h)")
		importCmd := "./cockroach workload fixtures import tpcc --checks=false --warehouses=10000 {pgurl:1}"
		c.Run(ctx, loadNode, importCmd)

		// The ramp is expected to move through four phases:
		//  1. No admission control activity, low latencies (up to ~1500
		//     warehouses).
		//  2. Admission control delays, growing latencies (up to ~3000
		//     warehouses).
		//  3. High latencies (100s+), queues building (up to ~4500
		//     warehouses).
		//  4. Unbounded memory and goroutine growth, ending in node crashes
		//     (up to ~6000 warehouses).
		t.Status("running workload (fails in ~3-4 hours)")
		runCmd := "./cockroach workload run tpcc --ramp=6h --tolerate-errors --warehouses=10000 '{pgurl:1-6}'"
		c.Run(ctx, loadNode, runCmd)
	}

	r.Add(registry.TestSpec{
		Name:  "admission-control/tpcc-severe-overload",
		Owner: registry.OwnerAdmissionControl,
		// TODO(abaptist): Passing this test will require a lot of admission
		// control work. Until then it stays skipped and is registered only to
		// make it easy to run at any time.
		Skip:    "#89142",
		Cluster: r.MakeClusterSpec(7, spec.CPU(8)),
		Run:     run,
	})
}

0 comments on commit d8976d9

Please sign in to comment.