roachtest: tpcc: don't look at cloud during registration
This commit cleans up the tpcc code to not look at the cloud (leaked
through `TestSpec`) during registration. Instead, we define both GCE
and AWS values in the spec and decide between them when the test is
run.

Informs cockroachdb#104029
Release note: None
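
The gist of the change, as a minimal self-contained Go sketch: per-cloud values now live side by side in the spec, and the test resolves them when it runs. Here benchSpec and the "gce"/"aws" literals are stand-ins for the real tpccBenchSpec and the spec.GCE/spec.AWS constants.

package main

import "fmt"

// benchSpec mirrors the shape tpccBenchSpec takes in this commit: instead of
// baking a single cloud-dependent number into the spec at registration, it
// carries one value per cloud.
type benchSpec struct {
	LoadWarehousesGCE int
	LoadWarehousesAWS int
}

// LoadWarehouses resolves the per-cloud value at run time, like the
// tpccBenchSpec.LoadWarehouses accessor added in this commit.
func (s benchSpec) LoadWarehouses(cloud string) int {
	switch cloud {
	case "gce":
		return s.LoadWarehousesGCE
	case "aws":
		return s.LoadWarehousesAWS
	default:
		panic(fmt.Sprintf("unknown cloud %q", cloud))
	}
}

func main() {
	s := benchSpec{LoadWarehousesGCE: 3500, LoadWarehousesAWS: 3900}
	// Previously this number was fixed at registration via gceOrAws(cloud, ...);
	// now it is chosen based on the cluster the test actually runs on.
	fmt.Println(s.LoadWarehouses("aws")) // 3900
}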
RaduBerinde committed Oct 17, 2023
1 parent 675cb0c commit 227ddf5
Showing 4 changed files with 104 additions and 57 deletions.
4 changes: 4 additions & 0 deletions pkg/cmd/roachtest/cluster.go
@@ -2525,6 +2525,10 @@ func (c *clusterImpl) MakeNodes(opts ...option.Option) string {
return c.name + r.String()
}

func (c *clusterImpl) Cloud() string {
return c.spec.Cloud
}

func (c *clusterImpl) IsLocal() bool {
return config.IsLocalClusterName(c.name)
}
1 change: 1 addition & 0 deletions pkg/cmd/roachtest/cluster/cluster_interface.go
@@ -108,6 +108,7 @@ type Cluster interface {

Spec() spec.ClusterSpec
Name() string
Cloud() string
IsLocal() bool
// IsSecure returns true iff the cluster uses TLS.
IsSecure() bool
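To illustrate how test code consumes the new interface method, a hypothetical sketch (the trimmed Cluster interface and fakeCluster below are stand-ins for the real cluster.Cluster and clusterImpl):

package main

import "fmt"

// Cluster is a trimmed stand-in for roachtest's cluster.Cluster, showing
// where the new Cloud() accessor sits alongside the existing methods.
type Cluster interface {
	Name() string
	Cloud() string
}

// fakeCluster plays the role of clusterImpl, whose Cloud() simply returns
// the Cloud field of its spec (see cluster.go above).
type fakeCluster struct{ name, cloud string }

func (c fakeCluster) Name() string  { return c.name }
func (c fakeCluster) Cloud() string { return c.cloud }

func main() {
	var c Cluster = fakeCluster{name: "demo-cluster", cloud: "gce"}
	// A test's Run function can now branch on the actual cloud instead of a
	// value captured at registration time.
	fmt.Printf("%s is on %s\n", c.Name(), c.Cloud())
}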
10 changes: 7 additions & 3 deletions pkg/cmd/roachtest/spec/cluster_spec.go
@@ -62,9 +62,13 @@ func (m MemPerCPU) String() string {
// ClusterSpec represents a test's description of what its cluster needs to
// look like. It becomes part of a clusterConfig when the cluster is created.
type ClusterSpec struct {
Cloud string
Arch vm.CPUArch // CPU architecture; auto-chosen if left empty
InstanceType string // auto-chosen if left empty
// TODO(#104029): We should remove the Cloud field; the tests now specify
// their compatible clouds.
Cloud string
Arch vm.CPUArch // CPU architecture; auto-chosen if left empty
// TODO(radu): An InstanceType can only make sense in the context of a
// specific cloud. We should replace this with cloud-specific arguments.
InstanceType string // auto-chosen if left empty
NodeCount int
// CPUs is the number of CPUs per node.
CPUs int
146 changes: 92 additions & 54 deletions pkg/cmd/roachtest/tests/tpcc.go
@@ -357,12 +357,12 @@ func maxSupportedTPCCWarehouses(
// TPCC workload is running. The number of database upgrades is
// controlled by the `versionsToUpgrade` parameter.
func runTPCCMixedHeadroom(
ctx context.Context, t test.Test, c cluster.Cluster, cloud string, versionsToUpgrade int,
ctx context.Context, t test.Test, c cluster.Cluster, versionsToUpgrade int,
) {
crdbNodes := c.Range(1, c.Spec().NodeCount-1)
workloadNode := c.Node(c.Spec().NodeCount)

maxWarehouses := maxSupportedTPCCWarehouses(*t.BuildVersion(), cloud, c.Spec())
maxWarehouses := maxSupportedTPCCWarehouses(*t.BuildVersion(), c.Cloud(), c.Spec())
headroomWarehouses := int(float64(maxWarehouses) * 0.7)
if c.IsLocal() {
headroomWarehouses = 10
@@ -484,7 +484,6 @@ runTPCCMixedHeadroom(
}

func registerTPCC(r registry.Registry) {
cloud := r.MakeClusterSpec(1).Cloud
headroomSpec := r.MakeClusterSpec(4, spec.CPU(16), spec.RandomlyUseZfs())
r.Add(registry.TestSpec{
// w=headroom runs tpcc for a semi-extended period with some amount of
@@ -499,7 +498,7 @@
EncryptionSupport: registry.EncryptionMetamorphic,
Leases: registry.MetamorphicLeases,
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
maxWarehouses := maxSupportedTPCCWarehouses(*t.BuildVersion(), cloud, t.Spec().(*registry.TestSpec).Cluster)
maxWarehouses := maxSupportedTPCCWarehouses(*t.BuildVersion(), c.Cloud(), c.Spec())
headroomWarehouses := int(float64(maxWarehouses) * 0.7)
t.L().Printf("computed headroom warehouses of %d\n", headroomWarehouses)
runTPCC(ctx, t, c, tpccOptions{
@@ -526,7 +525,7 @@
Cluster: mixedHeadroomSpec,
EncryptionSupport: registry.EncryptionMetamorphic,
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
runTPCCMixedHeadroom(ctx, t, c, cloud, 1)
runTPCCMixedHeadroom(ctx, t, c, 1)
},
})

@@ -540,7 +539,7 @@
Cluster: mixedHeadroomSpec,
EncryptionSupport: registry.EncryptionMetamorphic,
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
runTPCCMixedHeadroom(ctx, t, c, cloud, 2)
runTPCCMixedHeadroom(ctx, t, c, 2)
},
})
r.Add(registry.TestSpec{
@@ -839,8 +838,10 @@ func registerTPCC(r registry.Registry) {
Nodes: 3,
CPUs: 4,

LoadWarehouses: 1000,
EstimatedMax: gceOrAws(cloud, 750, 900),
LoadWarehousesGCE: 1000,
LoadWarehousesAWS: 1000,
EstimatedMaxGCE: 750,
EstimatedMaxAWS: 900,

Clouds: registry.AllExceptAWS,
Suites: registry.Suites(registry.Nightly),
@@ -849,18 +850,22 @@
Nodes: 3,
CPUs: 16,

LoadWarehouses: gceOrAws(cloud, 3500, 3900),
EstimatedMax: gceOrAws(cloud, 2900, 3500),
Clouds: registry.AllClouds,
Suites: registry.Suites(registry.Nightly),
Tags: registry.Tags(`aws`),
LoadWarehousesGCE: 3500,
LoadWarehousesAWS: 3900,
EstimatedMaxGCE: 2900,
EstimatedMaxAWS: 3500,
Clouds: registry.AllClouds,
Suites: registry.Suites(registry.Nightly),
Tags: registry.Tags(`aws`),
})
registerTPCCBenchSpec(r, tpccBenchSpec{
Nodes: 12,
CPUs: 16,

LoadWarehouses: gceOrAws(cloud, 11500, 11500),
EstimatedMax: gceOrAws(cloud, 10000, 10000),
LoadWarehousesGCE: 11500,
LoadWarehousesAWS: 11500,
EstimatedMaxGCE: 10000,
EstimatedMaxAWS: 10000,

Clouds: registry.AllExceptAWS,
Suites: registry.Suites(registry.Weekly),
@@ -871,8 +876,10 @@
CPUs: 16,
Distribution: multiZone,

LoadWarehouses: 6500,
EstimatedMax: 5000,
LoadWarehousesGCE: 6500,
LoadWarehousesAWS: 6500,
EstimatedMaxGCE: 5000,
EstimatedMaxAWS: 5000,

Clouds: registry.AllExceptAWS,
Suites: registry.Suites(registry.Nightly),
@@ -884,8 +891,10 @@
Distribution: multiRegion,
LoadConfig: multiLoadgen,

LoadWarehouses: 3000,
EstimatedMax: 2000,
LoadWarehousesGCE: 3000,
LoadWarehousesAWS: 3000,
EstimatedMaxGCE: 2000,
EstimatedMaxAWS: 2000,

Clouds: registry.AllExceptAWS,
Suites: registry.Suites(registry.Nightly),
@@ -896,8 +905,10 @@
Chaos: true,
LoadConfig: singlePartitionedLoadgen,

LoadWarehouses: 2000,
EstimatedMax: 900,
LoadWarehousesGCE: 2000,
LoadWarehousesAWS: 2000,
EstimatedMaxGCE: 900,
EstimatedMaxAWS: 900,

Clouds: registry.AllExceptAWS,
Suites: registry.Suites(registry.Nightly),
@@ -909,8 +920,10 @@
Nodes: 3,
CPUs: 4,

LoadWarehouses: 1000,
EstimatedMax: gceOrAws(cloud, 750, 900),
LoadWarehousesGCE: 1000,
LoadWarehousesAWS: 1000,
EstimatedMaxGCE: 750,
EstimatedMaxAWS: 900,
EncryptionEnabled: true,

Clouds: registry.AllExceptAWS,
@@ -920,8 +933,10 @@
Nodes: 3,
CPUs: 16,

LoadWarehouses: gceOrAws(cloud, 3500, 3900),
EstimatedMax: gceOrAws(cloud, 2900, 3500),
LoadWarehousesGCE: 3500,
LoadWarehousesAWS: 3900,
EstimatedMaxGCE: 2900,
EstimatedMaxAWS: 3500,
EncryptionEnabled: true,
Clouds: registry.AllClouds,
Suites: registry.Suites(registry.Nightly),
@@ -931,8 +946,10 @@
Nodes: 12,
CPUs: 16,

LoadWarehouses: gceOrAws(cloud, 11500, 11500),
EstimatedMax: gceOrAws(cloud, 10000, 10000),
LoadWarehousesGCE: 11500,
LoadWarehousesAWS: 11500,
EstimatedMaxGCE: 10000,
EstimatedMaxAWS: 10000,
EncryptionEnabled: true,

Clouds: registry.AllExceptAWS,
@@ -945,9 +962,11 @@
Nodes: 3,
CPUs: 4,

LoadWarehouses: 1000,
EstimatedMax: gceOrAws(cloud, 750, 900),
ExpirationLeases: true,
LoadWarehousesGCE: 1000,
LoadWarehousesAWS: 1000,
EstimatedMaxGCE: 750,
EstimatedMaxAWS: 900,
ExpirationLeases: true,

Clouds: registry.AllExceptAWS,
Suites: registry.Suites(registry.Nightly),
@@ -956,20 +975,24 @@
Nodes: 3,
CPUs: 16,

LoadWarehouses: gceOrAws(cloud, 3500, 3900),
EstimatedMax: gceOrAws(cloud, 2900, 3500),
ExpirationLeases: true,
Clouds: registry.AllClouds,
Suites: registry.Suites(registry.Nightly),
Tags: registry.Tags(`aws`),
LoadWarehousesGCE: 3500,
LoadWarehousesAWS: 3900,
EstimatedMaxGCE: 2900,
EstimatedMaxAWS: 3500,
ExpirationLeases: true,
Clouds: registry.AllClouds,
Suites: registry.Suites(registry.Nightly),
Tags: registry.Tags(`aws`),
})
registerTPCCBenchSpec(r, tpccBenchSpec{
Nodes: 12,
CPUs: 16,

LoadWarehouses: gceOrAws(cloud, 11500, 11500),
EstimatedMax: gceOrAws(cloud, 10000, 10000),
ExpirationLeases: true,
LoadWarehousesGCE: 11500,
LoadWarehousesAWS: 11500,
EstimatedMaxGCE: 10000,
EstimatedMaxAWS: 10000,
ExpirationLeases: true,

Clouds: registry.AllExceptAWS,
Suites: registry.Suites(registry.Weekly),
@@ -978,10 +1001,14 @@ func registerTPCC(r registry.Registry) {
}

func gceOrAws(cloud string, gce, aws int) int {
if cloud == "aws" {
switch cloud {
case spec.AWS:
return aws
case spec.GCE:
return gce
default:
panic(fmt.Sprintf("unknown cloud %s", cloud))
}
return gce
}

// tpccBenchDistribution represents a distribution of nodes in a tpccbench
@@ -1052,13 +1079,15 @@ type tpccBenchSpec struct {
// The number of warehouses to load into the cluster before beginning
// benchmarking. Should be larger than EstimatedMax and should be a
// value that is unlikely to be achievable.
LoadWarehouses int
LoadWarehousesGCE int
LoadWarehousesAWS int
// An estimate of the maximum number of warehouses achievable in the
// cluster config. The closer this is to the actual max achievable
// warehouse count, the faster the benchmark will be in producing a
// result. This can be adjusted over time as performance characteristics
// change (i.e. CockroachDB gets faster!).
EstimatedMax int
EstimatedMaxGCE int
EstimatedMaxAWS int

// MinVersion to pass to testRegistryImpl.Add.
MinVersion string
@@ -1074,6 +1103,14 @@ type tpccBenchSpec struct {
ExpirationLeases bool
}

func (s tpccBenchSpec) EstimatedMax(cloud string) int {
return gceOrAws(cloud, s.EstimatedMaxGCE, s.EstimatedMaxAWS)
}

func (s tpccBenchSpec) LoadWarehouses(cloud string) int {
return gceOrAws(cloud, s.LoadWarehousesGCE, s.LoadWarehousesAWS)
}

// partitions returns the number of partitions specified to the load generator.
func (s tpccBenchSpec) partitions() int {
switch s.LoadConfig {
@@ -1205,7 +1242,7 @@ func loadTPCCBench(
).Scan(&curWarehouses); err != nil {
return err
}
if curWarehouses >= b.LoadWarehouses {
if curWarehouses >= b.LoadWarehouses(c.Cloud()) {
// The cluster has enough warehouses. Nothing to do.
return nil
}
@@ -1222,17 +1259,18 @@

var loadArgs string
var rebalanceWait time.Duration
loadWarehouses := b.LoadWarehouses(c.Cloud())
switch b.LoadConfig {
case singleLoadgen:
loadArgs = `--checks=false`
rebalanceWait = time.Duration(b.LoadWarehouses/250) * time.Minute
rebalanceWait = time.Duration(loadWarehouses/250) * time.Minute
case singlePartitionedLoadgen:
loadArgs = fmt.Sprintf(`--checks=false --partitions=%d`, b.partitions())
rebalanceWait = time.Duration(b.LoadWarehouses/125) * time.Minute
rebalanceWait = time.Duration(loadWarehouses/125) * time.Minute
case multiLoadgen:
loadArgs = fmt.Sprintf(`--checks=false --partitions=%d --zones="%s"`,
b.partitions(), strings.Join(b.Distribution.zones(), ","))
rebalanceWait = time.Duration(b.LoadWarehouses/50) * time.Minute
rebalanceWait = time.Duration(loadWarehouses/50) * time.Minute
default:
panic("unexpected")
}
@@ -1241,7 +1279,7 @@
t.L().Printf("restoring tpcc fixture\n")
err := WaitFor3XReplication(ctx, t, db)
require.NoError(t, err)
cmd := tpccImportCmd(b.LoadWarehouses, loadArgs)
cmd := tpccImportCmd(loadWarehouses, loadArgs)
if err = c.RunE(ctx, roachNodes[:1], cmd); err != nil {
return err
}
@@ -1259,12 +1297,12 @@
// the desired distribution. This should allow for load-based rebalancing to
// help distribute load. Optionally pass some load configuration-specific
// flags.
maxRate := tpccMaxRate(b.EstimatedMax)
maxRate := tpccMaxRate(b.EstimatedMax(c.Cloud()))
rampTime := (1 * rebalanceWait) / 4
loadTime := (3 * rebalanceWait) / 4
cmd = fmt.Sprintf("./cockroach workload run tpcc --warehouses=%d --workers=%d --max-rate=%d "+
"--wait=false --ramp=%s --duration=%s --scatter --tolerate-errors {pgurl%s}",
b.LoadWarehouses, b.LoadWarehouses, maxRate, rampTime, loadTime, roachNodes)
b.LoadWarehouses(c.Cloud()), b.LoadWarehouses(c.Cloud()), maxRate, rampTime, loadTime, roachNodes)
if _, err := c.RunWithDetailsSingleNode(ctx, t.L(), loadNode, cmd); err != nil {
return err
}
@@ -1327,7 +1365,7 @@ func runTPCCBench(ctx context.Context, t test.Test, c cluster.Cluster, b tpccBen
// 10k warehouses requires at least 20,000 connections, so add a
// bit of breathing room and check the warehouse count.
c.Run(ctx, loadNodes, "sed -i 's/maxconn [0-9]\\+/maxconn 21000/' haproxy.cfg")
if b.LoadWarehouses > 1e4 {
if b.LoadWarehouses(c.Cloud()) > 1e4 {
t.Fatal("HAProxy config supports up to 10k warehouses")
}
c.Run(ctx, loadNodes, "haproxy -f haproxy.cfg -D")
@@ -1344,7 +1382,7 @@
// Search between 1 and b.LoadWarehouses for the largest number of
// warehouses that can be operated on while sustaining a throughput
// threshold, set to a fraction of max tpmC.
precision := int(math.Max(1.0, float64(b.LoadWarehouses/200)))
precision := int(math.Max(1.0, float64(b.LoadWarehouses(c.Cloud())/200)))
initStepSize := precision

// Create a temp directory to store the local copy of results from the
@@ -1361,7 +1399,7 @@
c.Start(ctx, t.L(), startOpts, settings, roachNodes)
}

s := search.NewLineSearcher(1, b.LoadWarehouses, b.EstimatedMax, initStepSize, precision)
s := search.NewLineSearcher(1, b.LoadWarehouses(c.Cloud()), b.EstimatedMax(c.Cloud()), initStepSize, precision)
iteration := 0
if res, err := s.Search(func(warehouses int) (bool, error) {
iteration++
@@ -1434,7 +1472,7 @@ func runTPCCBench(ctx context.Context, t test.Test, c cluster.Cluster, b tpccBen
histogramsPath := fmt.Sprintf("%s/warehouses=%d/stats.json", t.PerfArtifactsDir(), warehouses)
cmd := fmt.Sprintf("./cockroach workload run tpcc --warehouses=%d --active-warehouses=%d "+
"--tolerate-errors --ramp=%s --duration=%s%s --histograms=%s {pgurl%s}",
b.LoadWarehouses, warehouses, rampDur,
b.LoadWarehouses(c.Cloud()), warehouses, rampDur,
loadDur, extraFlags, histogramsPath, sqlGateways)
err := c.RunE(ctx, group.loadNodes, cmd)
loadDone <- timeutil.Now()
