From abd64206a125c21c4c38d7bfdc076285c4a5600f Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Thu, 16 Mar 2023 15:05:06 +0100 Subject: [PATCH] roachtest: use m5, not m5d, for restore 8tb test This works around https://github.com/cockroachdb/cockroach/issues/98783: ``` Instance type c5.2xlarge ``` Now the roachtest runs on standard EBS volumes (provisioned to 125mb/s, i.e. pretty weak ones): ``` $ df -h /mnt/data1/ Filesystem Size Used Avail Use% Mounted on /dev/nvme1n1 2.0T 4.0G 2.0T 1% /mnt/data1 $ sudo nvme list | grep nvme1n1 /dev/nvme1n1 vol065ed9110066bb362 Amazon Elastic Block Store 1 2.15 TB / 2.15 TB 512 B + 0 B 1.0 ``` Let's see how this fares. The theory is that the test previously failed due to RAID0 because some nodes would unpredictably be slower than others (depending on the striping, etc., across the raided inhomogeneous volumes), which we don't handle well. Now, there's symmetry and hopefully things will be slower (since we only have 125mb/s per volume now) but functional, i.e. no more OOMs. I verified this via ``` ./pkg/cmd/roachtest/roachstress.sh -c 10 restore/tpce/8TB/aws/nodes=10/cpus=8 -- --cloud aws --parallelism 1 ``` Epic: CRDB-25503 Release note: None --- pkg/cmd/roachtest/tests/restore.go | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/pkg/cmd/roachtest/tests/restore.go b/pkg/cmd/roachtest/tests/restore.go index 9d76abed92b1..f569183bafac 100644 --- a/pkg/cmd/roachtest/tests/restore.go +++ b/pkg/cmd/roachtest/tests/restore.go @@ -312,7 +312,7 @@ func registerRestore(r registry.Registry) { }, { // The nightly 8TB Restore test. 
- hardware: makeHardwareSpecs(hardwareSpecs{nodes: 10, volumeSize: 2000}), + hardware: makeHardwareSpecs(hardwareSpecs{cpus: 8, nodes: 10, volumeSize: 2000}), backup: makeBackupSpecs(backupSpecs{ version: "v22.2.1", workload: tpceRestore{customers: 500000}}), @@ -411,7 +411,17 @@ func (hw hardwareSpecs) makeClusterSpecs(r registry.Registry) spec.ClusterSpec { if hw.mem != spec.Auto { clusterOpts = append(clusterOpts, spec.Mem(hw.mem)) } - return r.MakeClusterSpec(hw.nodes, clusterOpts...) + s := r.MakeClusterSpec(hw.nodes, clusterOpts...) + if s.Cloud == "aws" && s.VolumeSize != 0 && s.Mem != spec.Low { + // Work around an issue that RAID0s local NVMe and GP3 storage together: + // https://github.com/cockroachdb/cockroach/issues/98783. + // + // This should be removed once we have found a real solution that avoids + // ever creating such a RAID. + s.InstanceType = spec.AWSMachineType(s.CPUs, s.Mem) + s.InstanceType = strings.Replace(s.InstanceType, "d.", ".", 1) + } + return s } // String prints the hardware specs. If verbose==true, verbose specs are printed.