Skip to content

Commit

Permalink
roachtest: clean up instance type specification
Browse files Browse the repository at this point in the history
`ClusterSpec` now provides options for GCE and AWS machine types.

Epic: none
Release note: None
  • Loading branch information
RaduBerinde committed Sep 30, 2023
1 parent 45a9cf2 commit 2e4c708
Show file tree
Hide file tree
Showing 9 changed files with 60 additions and 33 deletions.
2 changes: 1 addition & 1 deletion build/release/teamcity-mark-build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ mark_build() {
fi

log_into_gcloud
gcloud container images add-tag "${gcr_repository}:${TC_BUILD_BRANCH}" "${gcr_repository}:latest-${release_branch}-${build_label}-build"
gawscloud container images add-tag "${gcr_repository}:${TC_BUILD_BRANCH}" "${gcr_repository}:latest-${release_branch}-${build_label}-build"
tc_end_block "Push new docker image tag"
}

Expand Down
8 changes: 4 additions & 4 deletions pkg/cmd/roachtest/cluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -319,15 +319,15 @@ func TestAWSMachineType(t *testing.T) {

for _, tc := range testCases {
t.Run(fmt.Sprintf("%d/%s/%t/%s", tc.cpus, tc.mem, tc.localSSD, tc.arch), func(t *testing.T) {
machineType, selectedArch := spec.AWSMachineType(tc.cpus, tc.mem, tc.localSSD, tc.arch)
machineType, selectedArch := spec.SelectAWSMachineType(tc.cpus, tc.mem, tc.localSSD, tc.arch)

require.Equal(t, tc.expectedMachineType, machineType)
require.Equal(t, tc.expectedArch, selectedArch)
})
}
// spec.Low is not supported.
require.Panics(t, func() { spec.AWSMachineType(4, spec.Low, false, vm.ArchAMD64) })
require.Panics(t, func() { spec.AWSMachineType(16, spec.Low, false, vm.ArchARM64) })
require.Panics(t, func() { spec.SelectAWSMachineType(4, spec.Low, false, vm.ArchAMD64) })
require.Panics(t, func() { spec.SelectAWSMachineType(16, spec.Low, false, vm.ArchARM64) })
}

func TestGCEMachineType(t *testing.T) {
Expand Down Expand Up @@ -413,7 +413,7 @@ func TestGCEMachineType(t *testing.T) {

for _, tc := range testCases {
t.Run(fmt.Sprintf("%d/%s/%s", tc.cpus, tc.mem, tc.arch), func(t *testing.T) {
machineType, selectedArch := spec.GCEMachineType(tc.cpus, tc.mem, tc.arch)
machineType, selectedArch := spec.SelectGCEMachineType(tc.cpus, tc.mem, tc.arch)

require.Equal(t, tc.expectedMachineType, machineType)
require.Equal(t, tc.expectedArch, selectedArch)
Expand Down
48 changes: 32 additions & 16 deletions pkg/cmd/roachtest/spec/cluster_spec.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,11 +65,14 @@ type ClusterSpec struct {
// TODO(#104029): We should remove the Cloud field; the tests now specify
// their compatible clouds.
Cloud string
Arch vm.CPUArch // CPU architecture; auto-chosen if left empty
// TODO(radu): An InstanceType can only make sense in the context of a
// specific cloud. We should replace this with cloud-specific arguments.
InstanceType string // auto-chosen if left empty
NodeCount int

// TODO(radu): defaultInstanceType is the default machine type used (unless
// overridden by GCE.MachineType or AWS.MachineType); it does not belong in
// the spec.
defaultInstanceType string

Arch vm.CPUArch // CPU architecture; auto-chosen if left empty
NodeCount int
// CPUs is the number of CPUs per node.
CPUs int
Mem MemPerCPU
Expand All @@ -93,20 +96,22 @@ type ClusterSpec struct {

// GCE-specific arguments. These values apply only on clusters instantiated on GCE.
GCE struct {
MachineType string
MinCPUPlatform string
VolumeType string
}

// AWS-specific arguments. These values apply only on clusters instantiated on AWS.
AWS struct {
MachineType string
// VolumeThroughput is the min provisioned EBS volume throughput.
VolumeThroughput int
}
}

// MakeClusterSpec makes a ClusterSpec.
func MakeClusterSpec(cloud string, instanceType string, nodeCount int, opts ...Option) ClusterSpec {
spec := ClusterSpec{Cloud: cloud, InstanceType: instanceType, NodeCount: nodeCount}
spec := ClusterSpec{Cloud: cloud, defaultInstanceType: instanceType, NodeCount: nodeCount}
defaultOpts := []Option{CPU(4), nodeLifetime(12 * time.Hour), ReuseAny()}
for _, o := range append(defaultOpts, opts...) {
o(&spec)
Expand Down Expand Up @@ -257,31 +262,42 @@ func (s *ClusterSpec) RoachprodOpts(

createVMOpts.GeoDistributed = s.Geo
createVMOpts.Arch = string(arch)
machineType := s.InstanceType
ssdCount := s.SSDs

machineType := s.defaultInstanceType
switch s.Cloud {
case AWS:
if s.AWS.MachineType != "" {
machineType = s.AWS.MachineType
}
case GCE:
if s.GCE.MachineType != "" {
machineType = s.GCE.MachineType
}
}

if s.CPUs != 0 {
// Default to the user-supplied machine type, if any.
// Otherwise, pick based on requested CPU count.
var selectedArch vm.CPUArch

if len(machineType) == 0 {
if machineType == "" {
// If no machine type was specified, choose one
// based on the cloud and CPU count.
switch s.Cloud {
case AWS:
machineType, selectedArch = AWSMachineType(s.CPUs, s.Mem, s.PreferLocalSSD && s.VolumeSize == 0, arch)
machineType, selectedArch = SelectAWSMachineType(s.CPUs, s.Mem, s.PreferLocalSSD && s.VolumeSize == 0, arch)
case GCE:
machineType, selectedArch = GCEMachineType(s.CPUs, s.Mem, arch)
machineType, selectedArch = SelectGCEMachineType(s.CPUs, s.Mem, arch)
case Azure:
machineType = AzureMachineType(s.CPUs, s.Mem)
}
}
if selectedArch != "" && selectedArch != arch {
// TODO(srosenberg): we need a better way to monitor the rate of this mismatch, i.e.,
// other than grepping cluster creation logs.
fmt.Printf("WARN: requested arch %s for machineType %s, but selected %s\n", arch, machineType, selectedArch)
createVMOpts.Arch = string(selectedArch)
if selectedArch != "" && selectedArch != arch {
// TODO(srosenberg): we need a better way to monitor the rate of this mismatch, i.e.,
// other than grepping cluster creation logs.
fmt.Printf("WARN: requested arch %s for machineType %s, but selected %s\n", arch, machineType, selectedArch)
createVMOpts.Arch = string(selectedArch)
}
}

// Local SSD can only be requested
Expand Down
4 changes: 2 additions & 2 deletions pkg/cmd/roachtest/spec/machine_type.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ import (
//
// At the time of writing, the intel machines are all third-generation Xeon, "Ice Lake" which are isomorphic to
// GCE's n2-(standard|highmem|custom) _with_ --minimum-cpu-platform="Intel Ice Lake" (roachprod's default).
func AWSMachineType(
func SelectAWSMachineType(
cpus int, mem MemPerCPU, shouldSupportLocalSSD bool, arch vm.CPUArch,
) (string, vm.CPUArch) {
family := "m6i" // 4 GB RAM per CPU
Expand Down Expand Up @@ -118,7 +118,7 @@ func AWSMachineType(
// At the time of writing, the intel machines are all third-generation xeon, "Ice Lake" assuming
// --minimum-cpu-platform="Intel Ice Lake" (roachprod's default). This is isomorphic to AWS's m6i or c6i.
// The only exception is low memory machines (n2-highcpu-xxx), which aren't available in AWS.
func GCEMachineType(cpus int, mem MemPerCPU, arch vm.CPUArch) (string, vm.CPUArch) {
func SelectGCEMachineType(cpus int, mem MemPerCPU, arch vm.CPUArch) (string, vm.CPUArch) {
series := "n2"
selectedArch := vm.ArchAMD64

Expand Down
14 changes: 14 additions & 0 deletions pkg/cmd/roachtest/spec/option.go
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,13 @@ func RandomlyUseZfs() Option {
}
}

// GCEMachineType sets the machine (instance) type when the cluster is on GCE.
func GCEMachineType(machineType string) Option {
return func(spec *ClusterSpec) {
spec.GCE.MachineType = machineType
}
}

// GCEMinCPUPlatform sets the minimum CPU platform when the cluster is on GCE.
func GCEMinCPUPlatform(platform string) Option {
return func(spec *ClusterSpec) {
Expand All @@ -212,6 +219,13 @@ func GCEVolumeType(volumeType string) Option {
}
}

// AWSMachineType sets the machine (instance) type when the cluster is on AWS.
func AWSMachineType(machineType string) Option {
return func(spec *ClusterSpec) {
spec.AWS.MachineType = machineType
}
}

// AWSVolumeThroughput sets the minimum provisioned EBS volume throughput when
// the cluster is on AWS.
func AWSVolumeThroughput(throughput int) Option {
Expand Down
3 changes: 0 additions & 3 deletions pkg/cmd/roachtest/test_registry_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,21 +32,18 @@ func TestMakeTestRegistry(t *testing.T) {
s := r.MakeClusterSpec(100, spec.Geo(), spec.Zones("zone99"), spec.CPU(12),
spec.PreferLocalSSD(true))
require.EqualValues(t, 100, s.NodeCount)
require.Equal(t, "foo", s.InstanceType)
require.True(t, s.Geo)
require.Equal(t, "zone99", s.Zones)
require.EqualValues(t, 12, s.CPUs)
require.True(t, s.PreferLocalSSD)

s = r.MakeClusterSpec(100, spec.CPU(4), spec.TerminateOnMigration())
require.EqualValues(t, 100, s.NodeCount)
require.Equal(t, "foo", s.InstanceType)
require.EqualValues(t, 4, s.CPUs)
require.True(t, s.TerminateOnMigration)

s = r.MakeClusterSpec(10, spec.CPU(16), spec.Arch(vm.ArchARM64))
require.EqualValues(t, 10, s.NodeCount)
require.Equal(t, "foo", s.InstanceType)
require.EqualValues(t, 16, s.CPUs)
require.EqualValues(t, vm.ArchARM64, s.Arch)
})
Expand Down
4 changes: 2 additions & 2 deletions pkg/cmd/roachtest/tests/admission_control_database_drop.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,15 +36,15 @@ func registerDatabaseDrop(r registry.Registry) {
spec.Cloud(spec.GCE),
spec.GCEMinCPUPlatform("Intel Ice Lake"),
spec.GCEVolumeType("pd-ssd"),
spec.GCEMachineType("n2-standard-8"),
)
clusterSpec.InstanceType = "n2-standard-8"

r.Add(registry.TestSpec{
Name: "admission-control/database-drop",
Timeout: 10 * time.Hour,
Owner: registry.OwnerAdmissionControl,
Benchmark: true,
CompatibleClouds: registry.AllExceptAWS,
CompatibleClouds: registry.OnlyGCE,
Suites: registry.Suites(registry.Weekly),
Tags: registry.Tags(`weekly`),
Cluster: clusterSpec,
Expand Down
4 changes: 2 additions & 2 deletions pkg/cmd/roachtest/tests/admission_control_index_backfill.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,15 +37,15 @@ func registerIndexBackfill(r registry.Registry) {
spec.Cloud(spec.GCE),
spec.GCEMinCPUPlatform("Intel Ice Lake"),
spec.GCEVolumeType("pd-ssd"),
spec.GCEMachineType("n2-standard-8"),
)
clusterSpec.InstanceType = "n2-standard-8"

r.Add(registry.TestSpec{
Name: "admission-control/index-backfill",
Timeout: 6 * time.Hour,
Owner: registry.OwnerAdmissionControl,
Benchmark: true,
CompatibleClouds: registry.AllExceptAWS,
CompatibleClouds: registry.OnlyGCE,
Suites: registry.Suites(registry.Weekly),
Tags: registry.Tags(`weekly`),
Cluster: clusterSpec,
Expand Down
6 changes: 3 additions & 3 deletions pkg/cmd/roachtest/tests/restore.go
Original file line number Diff line number Diff line change
Expand Up @@ -537,13 +537,13 @@ func (hw hardwareSpecs) makeClusterSpecs(r registry.Registry, backupCloud string
}
s := r.MakeClusterSpec(hw.nodes+addWorkloadNode, clusterOpts...)

if backupCloud == spec.AWS && s.Cloud == spec.AWS && s.VolumeSize != 0 {
if backupCloud == spec.AWS && s.VolumeSize != 0 {
// Work around an issue that RAID0s local NVMe and GP3 storage together:
// https://github.com/cockroachdb/cockroach/issues/98783.
//
// TODO(srosenberg): Remove this workaround when 98783 is addressed.
s.InstanceType, _ = spec.AWSMachineType(s.CPUs, s.Mem, s.PreferLocalSSD && s.VolumeSize == 0, vm.ArchAMD64)
s.InstanceType = strings.Replace(s.InstanceType, "d.", ".", 1)
s.AWS.MachineType, _ = spec.SelectAWSMachineType(s.CPUs, s.Mem, s.PreferLocalSSD && s.VolumeSize == 0, vm.ArchAMD64)
s.AWS.MachineType = strings.Replace(s.AWS.MachineType, "d.", ".", 1)
s.Arch = vm.ArchAMD64
}
return s
Expand Down

0 comments on commit 2e4c708

Please sign in to comment.