Merge pull request #112553 from RaduBerinde/backport23.1-111285
release-23.1: roachtest: tpcc: don't look at cloud during registration
RaduBerinde authored Oct 18, 2023
2 parents 4afc8fe + 227ddf5 commit 2da1165
Showing 4 changed files with 104 additions and 57 deletions.
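The heart of this change: tests are registered once, before any cluster exists, so registration can no longer peek at the cloud (previously grabbed via r.MakeClusterSpec(1).Cloud). Benchmark specs now carry a value per cloud, and the running test resolves the right one through the new Cluster.Cloud() accessor. Below is a minimal, runnable sketch of that pattern; names are simplified, and the plain "gce"/"aws" strings stand in for the spec.GCE/spec.AWS constants used in the real code.

package main

import "fmt"

// benchSpec mirrors the shape tpccBenchSpec takes in this PR: per-cloud
// values are recorded at registration time instead of resolved eagerly.
type benchSpec struct {
	LoadWarehousesGCE int
	LoadWarehousesAWS int
}

// LoadWarehouses picks the value for the cloud the test actually runs on,
// like the new tpccBenchSpec.LoadWarehouses(cloud) helper in this diff.
func (s benchSpec) LoadWarehouses(cloud string) int {
	switch cloud {
	case "aws":
		return s.LoadWarehousesAWS
	case "gce":
		return s.LoadWarehousesGCE
	default:
		panic(fmt.Sprintf("unknown cloud %s", cloud))
	}
}

func main() {
	// Registration: record both values; nothing cloud-specific runs yet.
	s := benchSpec{LoadWarehousesGCE: 3500, LoadWarehousesAWS: 3900}
	// Run time: the cluster reports its cloud and the value is resolved.
	fmt.Println(s.LoadWarehouses("aws")) // prints 3900
}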
4 changes: 4 additions & 0 deletions pkg/cmd/roachtest/cluster.go
@@ -2525,6 +2525,10 @@ func (c *clusterImpl) MakeNodes(opts ...option.Option) string {
return c.name + r.String()
}

+func (c *clusterImpl) Cloud() string {
+return c.spec.Cloud
+}

func (c *clusterImpl) IsLocal() bool {
return config.IsLocalClusterName(c.name)
}
1 change: 1 addition & 0 deletions pkg/cmd/roachtest/cluster/cluster_interface.go
@@ -108,6 +108,7 @@ type Cluster interface {

Spec() spec.ClusterSpec
Name() string
+Cloud() string
IsLocal() bool
// IsSecure returns true iff the cluster uses TLS.
IsSecure() bool
10 changes: 7 additions & 3 deletions pkg/cmd/roachtest/spec/cluster_spec.go
@@ -62,9 +62,13 @@ func (m MemPerCPU) String() string {
// ClusterSpec represents a test's description of what its cluster needs to
// look like. It becomes part of a clusterConfig when the cluster is created.
type ClusterSpec struct {
-Cloud string
-Arch vm.CPUArch // CPU architecture; auto-chosen if left empty
-InstanceType string // auto-chosen if left empty
+// TODO(#104029): We should remove the Cloud field; the tests now specify
+// their compatible clouds.
+Cloud string
+Arch vm.CPUArch // CPU architecture; auto-chosen if left empty
+// TODO(radu): An InstanceType can only make sense in the context of a
+// specific cloud. We should replace this with cloud-specific arguments.
+InstanceType string // auto-chosen if left empty
NodeCount int
// CPUs is the number of CPUs per node.
CPUs int
146 changes: 92 additions & 54 deletions pkg/cmd/roachtest/tests/tpcc.go
@@ -357,12 +357,12 @@ func maxSupportedTPCCWarehouses(
// TPCC workload is running. The number of database upgrades is
// controlled by the `versionsToUpgrade` parameter.
func runTPCCMixedHeadroom(
-ctx context.Context, t test.Test, c cluster.Cluster, cloud string, versionsToUpgrade int,
+ctx context.Context, t test.Test, c cluster.Cluster, versionsToUpgrade int,
) {
crdbNodes := c.Range(1, c.Spec().NodeCount-1)
workloadNode := c.Node(c.Spec().NodeCount)

-maxWarehouses := maxSupportedTPCCWarehouses(*t.BuildVersion(), cloud, c.Spec())
+maxWarehouses := maxSupportedTPCCWarehouses(*t.BuildVersion(), c.Cloud(), c.Spec())
headroomWarehouses := int(float64(maxWarehouses) * 0.7)
if c.IsLocal() {
headroomWarehouses = 10
@@ -484,7 +484,6 @@ func runTPCCMixedHeadroom(
}

func registerTPCC(r registry.Registry) {
-cloud := r.MakeClusterSpec(1).Cloud
headroomSpec := r.MakeClusterSpec(4, spec.CPU(16), spec.RandomlyUseZfs())
r.Add(registry.TestSpec{
// w=headroom runs tpcc for a semi-extended period with some amount of
@@ -499,7 +498,7 @@
EncryptionSupport: registry.EncryptionMetamorphic,
Leases: registry.MetamorphicLeases,
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
-maxWarehouses := maxSupportedTPCCWarehouses(*t.BuildVersion(), cloud, t.Spec().(*registry.TestSpec).Cluster)
+maxWarehouses := maxSupportedTPCCWarehouses(*t.BuildVersion(), c.Cloud(), c.Spec())
headroomWarehouses := int(float64(maxWarehouses) * 0.7)
t.L().Printf("computed headroom warehouses of %d\n", headroomWarehouses)
runTPCC(ctx, t, c, tpccOptions{
@@ -526,7 +525,7 @@
Cluster: mixedHeadroomSpec,
EncryptionSupport: registry.EncryptionMetamorphic,
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
-runTPCCMixedHeadroom(ctx, t, c, cloud, 1)
+runTPCCMixedHeadroom(ctx, t, c, 1)
},
})

@@ -540,7 +539,7 @@
Cluster: mixedHeadroomSpec,
EncryptionSupport: registry.EncryptionMetamorphic,
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
-runTPCCMixedHeadroom(ctx, t, c, cloud, 2)
+runTPCCMixedHeadroom(ctx, t, c, 2)
},
})
r.Add(registry.TestSpec{
@@ -839,8 +838,10 @@
Nodes: 3,
CPUs: 4,

-LoadWarehouses: 1000,
-EstimatedMax: gceOrAws(cloud, 750, 900),
+LoadWarehousesGCE: 1000,
+LoadWarehousesAWS: 1000,
+EstimatedMaxGCE: 750,
+EstimatedMaxAWS: 900,

Clouds: registry.AllExceptAWS,
Suites: registry.Suites(registry.Nightly),
@@ -849,18 +850,22 @@
Nodes: 3,
CPUs: 16,

-LoadWarehouses: gceOrAws(cloud, 3500, 3900),
-EstimatedMax: gceOrAws(cloud, 2900, 3500),
-Clouds: registry.AllClouds,
-Suites: registry.Suites(registry.Nightly),
-Tags: registry.Tags(`aws`),
+LoadWarehousesGCE: 3500,
+LoadWarehousesAWS: 3900,
+EstimatedMaxGCE: 2900,
+EstimatedMaxAWS: 3500,
+Clouds: registry.AllClouds,
+Suites: registry.Suites(registry.Nightly),
+Tags: registry.Tags(`aws`),
})
registerTPCCBenchSpec(r, tpccBenchSpec{
Nodes: 12,
CPUs: 16,

-LoadWarehouses: gceOrAws(cloud, 11500, 11500),
-EstimatedMax: gceOrAws(cloud, 10000, 10000),
+LoadWarehousesGCE: 11500,
+LoadWarehousesAWS: 11500,
+EstimatedMaxGCE: 10000,
+EstimatedMaxAWS: 10000,

Clouds: registry.AllExceptAWS,
Suites: registry.Suites(registry.Weekly),
@@ -871,8 +876,10 @@
CPUs: 16,
Distribution: multiZone,

-LoadWarehouses: 6500,
-EstimatedMax: 5000,
+LoadWarehousesGCE: 6500,
+LoadWarehousesAWS: 6500,
+EstimatedMaxGCE: 5000,
+EstimatedMaxAWS: 5000,

Clouds: registry.AllExceptAWS,
Suites: registry.Suites(registry.Nightly),
@@ -884,8 +891,10 @@
Distribution: multiRegion,
LoadConfig: multiLoadgen,

-LoadWarehouses: 3000,
-EstimatedMax: 2000,
+LoadWarehousesGCE: 3000,
+LoadWarehousesAWS: 3000,
+EstimatedMaxGCE: 2000,
+EstimatedMaxAWS: 2000,

Clouds: registry.AllExceptAWS,
Suites: registry.Suites(registry.Nightly),
@@ -896,8 +905,10 @@
Chaos: true,
LoadConfig: singlePartitionedLoadgen,

-LoadWarehouses: 2000,
-EstimatedMax: 900,
+LoadWarehousesGCE: 2000,
+LoadWarehousesAWS: 2000,
+EstimatedMaxGCE: 900,
+EstimatedMaxAWS: 900,

Clouds: registry.AllExceptAWS,
Suites: registry.Suites(registry.Nightly),
@@ -909,8 +920,10 @@
Nodes: 3,
CPUs: 4,

-LoadWarehouses: 1000,
-EstimatedMax: gceOrAws(cloud, 750, 900),
+LoadWarehousesGCE: 1000,
+LoadWarehousesAWS: 1000,
+EstimatedMaxGCE: 750,
+EstimatedMaxAWS: 900,
EncryptionEnabled: true,

Clouds: registry.AllExceptAWS,
@@ -920,8 +933,10 @@
Nodes: 3,
CPUs: 16,

-LoadWarehouses: gceOrAws(cloud, 3500, 3900),
-EstimatedMax: gceOrAws(cloud, 2900, 3500),
+LoadWarehousesGCE: 3500,
+LoadWarehousesAWS: 3900,
+EstimatedMaxGCE: 2900,
+EstimatedMaxAWS: 3500,
EncryptionEnabled: true,
Clouds: registry.AllClouds,
Suites: registry.Suites(registry.Nightly),
@@ -931,8 +946,10 @@
Nodes: 12,
CPUs: 16,

-LoadWarehouses: gceOrAws(cloud, 11500, 11500),
-EstimatedMax: gceOrAws(cloud, 10000, 10000),
+LoadWarehousesGCE: 11500,
+LoadWarehousesAWS: 11500,
+EstimatedMaxGCE: 10000,
+EstimatedMaxAWS: 10000,
EncryptionEnabled: true,

Clouds: registry.AllExceptAWS,
@@ -945,9 +962,11 @@
Nodes: 3,
CPUs: 4,

-LoadWarehouses: 1000,
-EstimatedMax: gceOrAws(cloud, 750, 900),
-ExpirationLeases: true,
+LoadWarehousesGCE: 1000,
+LoadWarehousesAWS: 1000,
+EstimatedMaxGCE: 750,
+EstimatedMaxAWS: 900,
+ExpirationLeases: true,

Clouds: registry.AllExceptAWS,
Suites: registry.Suites(registry.Nightly),
@@ -956,20 +975,24 @@
Nodes: 3,
CPUs: 16,

-LoadWarehouses: gceOrAws(cloud, 3500, 3900),
-EstimatedMax: gceOrAws(cloud, 2900, 3500),
-ExpirationLeases: true,
-Clouds: registry.AllClouds,
-Suites: registry.Suites(registry.Nightly),
-Tags: registry.Tags(`aws`),
+LoadWarehousesGCE: 3500,
+LoadWarehousesAWS: 3900,
+EstimatedMaxGCE: 2900,
+EstimatedMaxAWS: 3500,
+ExpirationLeases: true,
+Clouds: registry.AllClouds,
+Suites: registry.Suites(registry.Nightly),
+Tags: registry.Tags(`aws`),
})
registerTPCCBenchSpec(r, tpccBenchSpec{
Nodes: 12,
CPUs: 16,

-LoadWarehouses: gceOrAws(cloud, 11500, 11500),
-EstimatedMax: gceOrAws(cloud, 10000, 10000),
-ExpirationLeases: true,
+LoadWarehousesGCE: 11500,
+LoadWarehousesAWS: 11500,
+EstimatedMaxGCE: 10000,
+EstimatedMaxAWS: 10000,
+ExpirationLeases: true,

Clouds: registry.AllExceptAWS,
Suites: registry.Suites(registry.Weekly),
@@ -978,10 +1001,14 @@
}

func gceOrAws(cloud string, gce, aws int) int {
-if cloud == "aws" {
+switch cloud {
+case spec.AWS:
return aws
+case spec.GCE:
+return gce
+default:
+panic(fmt.Sprintf("unknown cloud %s", cloud))
}
-return gce
}

// tpccBenchDistribution represents a distribution of nodes in a tpccbench
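One behavioral consequence of the gceOrAws rewrite above: the old if/else quietly returned the GCE value for any cloud other than AWS, while the new switch panics on anything it does not recognize. A small hedged sketch of that difference, where demoGceOrAws is a hypothetical driver and "azure" is just an example of an unrecognized value:

func demoGceOrAws() {
	fmt.Println(gceOrAws(spec.GCE, 750, 900)) // 750
	fmt.Println(gceOrAws(spec.AWS, 750, 900)) // 900
	defer func() {
		// The old helper would have silently returned 750 here; the
		// new one makes a misdetected cloud fail loudly.
		fmt.Println("recovered:", recover())
	}()
	_ = gceOrAws("azure", 750, 900)
}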
@@ -1052,13 +1079,15 @@ type tpccBenchSpec struct {
// The number of warehouses to load into the cluster before beginning
// benchmarking. Should be larger than EstimatedMax and should be a
// value that is unlikely to be achievable.
-LoadWarehouses int
+LoadWarehousesGCE int
+LoadWarehousesAWS int
// An estimate of the maximum number of warehouses achievable in the
// cluster config. The closer this is to the actual max achievable
// warehouse count, the faster the benchmark will be in producing a
// result. This can be adjusted over time as performance characteristics
// change (i.e. CockroachDB gets faster!).
-EstimatedMax int
+EstimatedMaxGCE int
+EstimatedMaxAWS int

// MinVersion to pass to testRegistryImpl.Add.
MinVersion string
@@ -1074,6 +1103,14 @@ type tpccBenchSpec struct {
ExpirationLeases bool
}

+func (s tpccBenchSpec) EstimatedMax(cloud string) int {
+return gceOrAws(cloud, s.EstimatedMaxGCE, s.EstimatedMaxAWS)
+}
+
+func (s tpccBenchSpec) LoadWarehouses(cloud string) int {
+return gceOrAws(cloud, s.LoadWarehousesGCE, s.LoadWarehousesAWS)
+}

// partitions returns the number of partitions specified to the load generator.
func (s tpccBenchSpec) partitions() int {
switch s.LoadConfig {
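For concreteness, a short usage sketch of the two accessors added above, assuming the cluster's spec records one of the spec.GCE/spec.AWS constants (field set abbreviated; c is the cluster.Cluster handed to the test):

b := tpccBenchSpec{
	LoadWarehousesGCE: 3500,
	LoadWarehousesAWS: 3900,
	EstimatedMaxGCE:   2900,
	EstimatedMaxAWS:   3500,
}
// On AWS these resolve to 3900 and 3500; on GCE, to 3500 and 2900.
warehouses := b.LoadWarehouses(c.Cloud())
estimate := b.EstimatedMax(c.Cloud())
_, _ = warehouses, estimate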
@@ -1205,7 +1242,7 @@ func loadTPCCBench(
).Scan(&curWarehouses); err != nil {
return err
}
-if curWarehouses >= b.LoadWarehouses {
+if curWarehouses >= b.LoadWarehouses(c.Cloud()) {
// The cluster has enough warehouses. Nothing to do.
return nil
}
@@ -1222,17 +1259,18 @@

var loadArgs string
var rebalanceWait time.Duration
+loadWarehouses := b.LoadWarehouses(c.Cloud())
switch b.LoadConfig {
case singleLoadgen:
loadArgs = `--checks=false`
-rebalanceWait = time.Duration(b.LoadWarehouses/250) * time.Minute
+rebalanceWait = time.Duration(loadWarehouses/250) * time.Minute
case singlePartitionedLoadgen:
loadArgs = fmt.Sprintf(`--checks=false --partitions=%d`, b.partitions())
-rebalanceWait = time.Duration(b.LoadWarehouses/125) * time.Minute
+rebalanceWait = time.Duration(loadWarehouses/125) * time.Minute
case multiLoadgen:
loadArgs = fmt.Sprintf(`--checks=false --partitions=%d --zones="%s"`,
b.partitions(), strings.Join(b.Distribution.zones(), ","))
-rebalanceWait = time.Duration(b.LoadWarehouses/50) * time.Minute
+rebalanceWait = time.Duration(loadWarehouses/50) * time.Minute
default:
panic("unexpected")
}
@@ -1241,7 +1279,7 @@
t.L().Printf("restoring tpcc fixture\n")
err := WaitFor3XReplication(ctx, t, db)
require.NoError(t, err)
-cmd := tpccImportCmd(b.LoadWarehouses, loadArgs)
+cmd := tpccImportCmd(loadWarehouses, loadArgs)
if err = c.RunE(ctx, roachNodes[:1], cmd); err != nil {
return err
}
@@ -1259,12 +1297,12 @@
// the desired distribution. This should allow for load-based rebalancing to
// help distribute load. Optionally pass some load configuration-specific
// flags.
-maxRate := tpccMaxRate(b.EstimatedMax)
+maxRate := tpccMaxRate(b.EstimatedMax(c.Cloud()))
rampTime := (1 * rebalanceWait) / 4
loadTime := (3 * rebalanceWait) / 4
cmd = fmt.Sprintf("./cockroach workload run tpcc --warehouses=%d --workers=%d --max-rate=%d "+
"--wait=false --ramp=%s --duration=%s --scatter --tolerate-errors {pgurl%s}",
-b.LoadWarehouses, b.LoadWarehouses, maxRate, rampTime, loadTime, roachNodes)
+b.LoadWarehouses(c.Cloud()), b.LoadWarehouses(c.Cloud()), maxRate, rampTime, loadTime, roachNodes)
if _, err := c.RunWithDetailsSingleNode(ctx, t.L(), loadNode, cmd); err != nil {
return err
}
@@ -1327,7 +1365,7 @@ func runTPCCBench(ctx context.Context, t test.Test, c cluster.Cluster, b tpccBenchSpec) {
// 10k warehouses requires at least 20,000 connections, so add a
// bit of breathing room and check the warehouse count.
c.Run(ctx, loadNodes, "sed -i 's/maxconn [0-9]\\+/maxconn 21000/' haproxy.cfg")
-if b.LoadWarehouses > 1e4 {
+if b.LoadWarehouses(c.Cloud()) > 1e4 {
t.Fatal("HAProxy config supports up to 10k warehouses")
}
c.Run(ctx, loadNodes, "haproxy -f haproxy.cfg -D")
@@ -1344,7 +1382,7 @@
// Search between 1 and b.LoadWarehouses for the largest number of
// warehouses that can be operated on while sustaining a throughput
// threshold, set to a fraction of max tpmC.
-precision := int(math.Max(1.0, float64(b.LoadWarehouses/200)))
+precision := int(math.Max(1.0, float64(b.LoadWarehouses(c.Cloud())/200)))
initStepSize := precision

// Create a temp directory to store the local copy of results from the
@@ -1361,7 +1399,7 @@
c.Start(ctx, t.L(), startOpts, settings, roachNodes)
}

-s := search.NewLineSearcher(1, b.LoadWarehouses, b.EstimatedMax, initStepSize, precision)
+s := search.NewLineSearcher(1, b.LoadWarehouses(c.Cloud()), b.EstimatedMax(c.Cloud()), initStepSize, precision)
iteration := 0
if res, err := s.Search(func(warehouses int) (bool, error) {
iteration++
@@ -1434,7 +1472,7 @@ func runTPCCBench(ctx context.Context, t test.Test, c cluster.Cluster, b tpccBenchSpec) {
histogramsPath := fmt.Sprintf("%s/warehouses=%d/stats.json", t.PerfArtifactsDir(), warehouses)
cmd := fmt.Sprintf("./cockroach workload run tpcc --warehouses=%d --active-warehouses=%d "+
"--tolerate-errors --ramp=%s --duration=%s%s --histograms=%s {pgurl%s}",
-b.LoadWarehouses, warehouses, rampDur,
+b.LoadWarehouses(c.Cloud()), warehouses, rampDur,
loadDur, extraFlags, histogramsPath, sqlGateways)
err := c.RunE(ctx, group.loadNodes, cmd)
loadDone <- timeutil.Now()
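Tying the tpccbench pieces together: the warehouse ceiling, the estimate that seeds the searcher, and the search granularity are all now derived from the run-time cloud. A condensed sketch of the flow visible in the last hunks; runOneLoad is a hypothetical stand-in for the predicate body, which runs TPCC at the proposed warehouse count and reports whether the throughput threshold held:

loadWarehouses := b.LoadWarehouses(c.Cloud())
estimatedMax := b.EstimatedMax(c.Cloud())
// Search granularity scales with the load size, as in the diff.
precision := int(math.Max(1.0, float64(loadWarehouses/200)))
initStepSize := precision
s := search.NewLineSearcher(1, loadWarehouses, estimatedMax, initStepSize, precision)
if res, err := s.Search(func(warehouses int) (bool, error) {
	return runOneLoad(warehouses) // hypothetical predicate
}); err == nil {
	t.L().Printf("max warehouses: %d\n", res)
}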
