From 199969fcfbc6c9d317a5c71040fcdbaeec22f9db Mon Sep 17 00:00:00 2001
From: Tobias Grieger
Date: Thu, 16 Mar 2023 16:44:47 +0100
Subject: [PATCH] [wip] roachprod: don't use RAID0 by default

This is a WIP because the behavior for machine types with local SSDs is
unclear. For example, on AWS, roachtest prefers the c5d family, all of
which come with local SSD storage. But looking at
`awsStartupScriptTemplate`, it is unclear how to ensure that the EBS
disk(s) get mounted as /mnt/data1 (which is probably what the default
should be).

We could also entertain outright preventing combinations that would
lead to an inhomogeneous RAID0. I imagine we'd have to take a round of
failures to find all of the places in which that happens, but perhaps a
"snitch" can be inserted instead, so that we can detect all such
callers and fix them up before arming the check.

By the way, EBS disks on AWS come with a default throughput of
125 MB/s, which is less than this RAID0 achieves "most of the time" -
so we can expect some tests to behave differently after this change. I
still believe this is worth it - debugging is so much harder when
you're on top of storage that is hard to predict and doesn't resemble
any production deployment.

----

I wasted weeks of my life on this before, and it almost happened again!
When you run a roachtest that asks for an AWS cXd machine (i.e. compute
optimized with NVMe local disk) and you specify a VolumeSize, you also
get an EBS volume. Prior to this commit, the two would be RAID0'ed
together. That isn't sane - the resulting gp3 EBS volume is very
different from the local NVMe volume in every way, and it led to
hard-to-understand write throughput behavior. This commit defaults to
*not* using RAID0.

Touches https://github.com/cockroachdb/cockroach/pull/98767.
Touches https://github.com/cockroachdb/cockroach/pull/98576.
Touches https://github.com/cockroachdb/cockroach/issues/97019.

Epic: none
Release note: None
---
 pkg/cmd/roachtest/spec/cluster_spec.go | 17 +++++++++--------
 pkg/cmd/roachtest/spec/machine_type.go |  3 +++
 pkg/roachprod/vm/aws/aws.go            |  1 +
 pkg/roachprod/vm/gce/gcloud.go         |  1 +
 4 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/pkg/cmd/roachtest/spec/cluster_spec.go b/pkg/cmd/roachtest/spec/cluster_spec.go
index 601856f928f2..57cdda164108 100644
--- a/pkg/cmd/roachtest/spec/cluster_spec.go
+++ b/pkg/cmd/roachtest/spec/cluster_spec.go
@@ -104,11 +104,16 @@ func awsMachineSupportsSSD(machineType string) bool {
 	return false
 }
 
-func getAWSOpts(machineType string, zones []string, volumeSize int, localSSD bool) vm.ProviderOpts {
+func getAWSOpts(
+	machineType string, zones []string, volumeSize int, localSSD bool, RAID0 bool,
+) vm.ProviderOpts {
 	opts := aws.DefaultProviderOpts()
 	if volumeSize != 0 {
 		opts.DefaultEBSVolume.Disk.VolumeSize = volumeSize
 	}
+	if RAID0 {
+		opts.UseMultipleDisks = false // NB: the default is true
+	}
 	if localSSD {
 		opts.SSDMachineType = machineType
 	} else {
@@ -137,12 +142,8 @@ func getGCEOpts(
 		opts.Zones = zones
 	}
 	opts.SSDCount = localSSDCount
-	if localSSD && localSSDCount > 0 {
-		// NB: As the default behavior for _roachprod_ (at least in AWS/GCP) is
-		// to mount multiple disks as a single store using a RAID 0 array, we
-		// must explicitly ask for multiple stores to be enabled, _unless_ the
-		// test has explicitly asked for RAID0.
-		opts.UseMultipleDisks = !RAID0
+	if RAID0 {
+		opts.UseMultipleDisks = false // NB: the default is true, i.e. no RAID0
 	}
 	opts.TerminateOnMigration = terminateOnMigration
 
@@ -250,7 +251,7 @@ func (s *ClusterSpec) RoachprodOpts(
 	var providerOpts vm.ProviderOpts
 	switch s.Cloud {
 	case AWS:
-		providerOpts = getAWSOpts(machineType, zones, s.VolumeSize, createVMOpts.SSDOpts.UseLocalSSD)
+		providerOpts = getAWSOpts(machineType, zones, s.VolumeSize, createVMOpts.SSDOpts.UseLocalSSD, s.RAID0)
 	case GCE:
 		providerOpts = getGCEOpts(machineType, zones, s.VolumeSize, ssdCount,
 			createVMOpts.SSDOpts.UseLocalSSD, s.RAID0, s.TerminateOnMigration)
diff --git a/pkg/cmd/roachtest/spec/machine_type.go b/pkg/cmd/roachtest/spec/machine_type.go
index af77a4e62e64..286dc6e01704 100644
--- a/pkg/cmd/roachtest/spec/machine_type.go
+++ b/pkg/cmd/roachtest/spec/machine_type.go
@@ -42,6 +42,9 @@ func AWSMachineType(cpus int, highmem bool) string {
 	}
 
 	// There is no c5d.24xlarge.
+	//
+	// TODO(tbg): there seems to be, see:
+	// https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/compute-optimized-instances.html
 	if family == "c5d" && size == "24xlarge" {
 		family = "m5d"
 	}
diff --git a/pkg/roachprod/vm/aws/aws.go b/pkg/roachprod/vm/aws/aws.go
index 2d36c4e04b2e..b18b771dd40f 100644
--- a/pkg/roachprod/vm/aws/aws.go
+++ b/pkg/roachprod/vm/aws/aws.go
@@ -203,6 +203,7 @@ func DefaultProviderOpts() *ProviderOpts {
 		RemoteUserName:   "ubuntu",
 		DefaultEBSVolume: defaultEBSVolumeValue,
 		CreateRateLimit:  2,
+		UseMultipleDisks: true, // don't default to RAID0
 	}
 }
 
diff --git a/pkg/roachprod/vm/gce/gcloud.go b/pkg/roachprod/vm/gce/gcloud.go
index 7dd364637f11..7b1efdeb2dbb 100644
--- a/pkg/roachprod/vm/gce/gcloud.go
+++ b/pkg/roachprod/vm/gce/gcloud.go
@@ -230,6 +230,7 @@ func DefaultProviderOpts() *ProviderOpts {
 		PDVolumeType:         "pd-ssd",
 		PDVolumeSize:         500,
 		TerminateOnMigration: false,
+		UseMultipleDisks:     true, // don't default to RAID0
 		useSharedUser:        true,
 		preemptible:          false,
 	}
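
----

Reviewer aid, not part of the patch: a minimal, runnable Go sketch of
the semantics that `UseMultipleDisks` is meant to have after this
change. The helper `mountPoints` and its numDisks parameter are
hypothetical illustrations; only `UseMultipleDisks` and the /mnt/dataN
mount points come from the actual code above.

package main

import "fmt"

// mountPoints is a hypothetical helper mirroring the intended behavior:
// with UseMultipleDisks=true (the new default), each disk becomes its
// own store directory; only an explicit RAID0 request
// (UseMultipleDisks=false) stripes all disks into a single array,
// however dissimilar they are (e.g. local NVMe plus a gp3 EBS volume).
func mountPoints(useMultipleDisks bool, numDisks int) []string {
	if numDisks > 1 && !useMultipleDisks {
		// Old default: one RAID0 array over all disks.
		return []string{"/mnt/data1 (RAID0 across all disks)"}
	}
	// New default: one mount point per disk, which is more predictable
	// and closer to production deployments.
	mounts := make([]string, numDisks)
	for i := range mounts {
		mounts[i] = fmt.Sprintf("/mnt/data%d", i+1)
	}
	return mounts
}

func main() {
	fmt.Println(mountPoints(true, 2))  // new default: [/mnt/data1 /mnt/data2]
	fmt.Println(mountPoints(false, 2)) // explicit RAID0: [/mnt/data1 (RAID0 across all disks)]
}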