From 3a89d5588075acdad99bea74d37a89d5aa87965d Mon Sep 17 00:00:00 2001 From: Bhaskarjyoti Bora Date: Tue, 5 Mar 2024 13:08:19 +0530 Subject: [PATCH] roachprod: RAID0 only network disks if both local and network are present Today, if a machine has both local and network disks, both the disks are selected for RAID'ing. But, RAID'ing different types of disks causes performance differences. To address this, local disks are ignored for RAID'ing only if network disks are present. Fixes: #98783 Epic: none --- pkg/cmd/roachtest/tests/restore.go | 10 ---------- pkg/roachprod/vm/aws/support.go | 29 +++++++++++++++++++++++++++-- pkg/roachprod/vm/gce/utils.go | 18 +++++++++++++++++- 3 files changed, 44 insertions(+), 13 deletions(-) diff --git a/pkg/cmd/roachtest/tests/restore.go b/pkg/cmd/roachtest/tests/restore.go index e52b76040330..78fcf43d9e64 100644 --- a/pkg/cmd/roachtest/tests/restore.go +++ b/pkg/cmd/roachtest/tests/restore.go @@ -32,7 +32,6 @@ import ( "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/roachprod/install" - "github.com/cockroachdb/cockroach/pkg/roachprod/vm" "github.com/cockroachdb/cockroach/pkg/testutils" "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" "github.com/cockroachdb/cockroach/pkg/ts/tspb" @@ -474,15 +473,6 @@ func (hw hardwareSpecs) makeClusterSpecs(r registry.Registry, backupCloud string } s := r.MakeClusterSpec(hw.nodes, clusterOpts...) - if backupCloud == spec.AWS && s.VolumeSize != 0 { - // Work around an issue that RAID0s local NVMe and GP3 storage together: - // https://github.com/cockroachdb/cockroach/issues/98783. - // - // TODO(srosenberg): Remove this workaround when 98783 is addressed. 
- s.AWS.MachineType, _, _ = spec.SelectAWSMachineType(s.CPUs, s.Mem, false /* shouldSupportLocalSSD */, vm.ArchAMD64)
- s.AWS.MachineType = strings.Replace(s.AWS.MachineType, "d.", ".", 1)
- s.Arch = vm.ArchAMD64
- }
 return s
}
diff --git a/pkg/roachprod/vm/aws/support.go b/pkg/roachprod/vm/aws/support.go
index 5437a9df7e2d..b1abe03e79b4 100644
--- a/pkg/roachprod/vm/aws/support.go
+++ b/pkg/roachprod/vm/aws/support.go
@@ -50,19 +50,44 @@ mount_opts="defaults"
use_multiple_disks='{{if .UseMultipleDisks}}true{{end}}'
-disks=()
mount_prefix="/mnt/data"
+# if use_multiple_disks is not set and there is more than 1 disk (excluding the boot disk),
+# then the disks will be selected for RAID'ing. If there are both EC2 NVMe Instance Storage and
+# EBS volumes, RAID'ing in this case can cause performance differences. So, to avoid this,
+# EC2 NVMe Instance Storage is ignored.
+# Scenarios:
+# (local SSD = 0, Network Disk = 1) - no RAID'ing and mount network disk
+# (local SSD = 1, Network Disk = 0) - no RAID'ing and mount local SSD
+# (local SSD >= 1, Network Disk = 1) - no RAID'ing and mount network disk
+# (local SSD > 1, Network Disk = 0) - local SSDs selected for RAID'ing
+# (local SSD >= 0, Network Disk > 1) - network disks selected for RAID'ing
+# Keep track of the local SSDs and EBS volumes for RAID'ing
+local_disks=()
+ebs_volumes=()
+
# On different machine types, the drives are either called nvme... or xvdd.
for d in $(ls /dev/nvme?n1 /dev/xvdd); do
 if ! mount | grep ${d}; then
-  disks+=("${d}")
+  if udevadm info --query=property --name=${d} | grep "ID_MODEL=Amazon Elastic Block Store"; then
+   echo "EBS Volume ${d} identified!"
+   ebs_volumes+=("${d}")
+  else
+   local_disks+=("${d}")
+  fi
  echo "Disk ${d} not mounted, need to mount..."
 else
  echo "Disk ${d} already mounted, skipping..."
fi
done
+# use only EBS volumes if available and ignore EC2 NVMe Instance Storage
+disks=()
+if [ "${#ebs_volumes[@]}" -gt "0" ]; then
+ disks=("${ebs_volumes[@]}")
+else
+ disks=("${local_disks[@]}")
+fi
if [ "${#disks[@]}" -eq "0" ]; then
 mountpoint="${mount_prefix}1"
diff --git a/pkg/roachprod/vm/gce/utils.go b/pkg/roachprod/vm/gce/utils.go
index 7334c31002fb..59077ddd5118 100644
--- a/pkg/roachprod/vm/gce/utils.go
+++ b/pkg/roachprod/vm/gce/utils.go
@@ -71,7 +71,23 @@ for d in $(ls /dev/nvme?n? /dev/disk/by-id/google-persistent-disk-[1-9]); do
 zpool list -v -P | grep ${d} > /dev/null
 if [ $? -ne 0 ]; then
{{ else }}
-for d in $(ls /dev/disk/by-id/google-local-* /dev/disk/by-id/google-persistent-disk-[1-9]); do
+# if use_multiple_disks is not set and there is more than 1 disk (excluding the boot disk),
+# then the disks will be selected for RAID'ing. If there are both local SSDs and Persistent Disks,
+# RAID'ing in this case can cause performance differences. So, to avoid this, local SSDs are ignored.
+# Scenarios:
+# (local SSD = 0, Persistent Disk = 1) - no RAID'ing and Persistent Disk mounted
+# (local SSD = 1, Persistent Disk = 0) - no RAID'ing and local SSD mounted
+# (local SSD >= 1, Persistent Disk = 1) - no RAID'ing and Persistent Disk mounted
+# (local SSD > 1, Persistent Disk = 0) - local SSDs selected for RAID'ing
+# (local SSD >= 0, Persistent Disk > 1) - Persistent Disks selected for RAID'ing
+disk_list=()
+if [ "$(ls /dev/disk/by-id/google-persistent-disk-[1-9]|wc -l)" -eq "0" ]; then
+ disk_list=$(ls /dev/disk/by-id/google-local-*)
+else
+ echo "Only persistent disks are selected."
+ disk_list=$(ls /dev/disk/by-id/google-persistent-disk-[1-9])
+fi
+for d in ${disk_list}; do
 if ! mount | grep ${d}; then
{{ end }}
 disks+=("${d}")