From cb8c7fb8d2ba49d78b1ba0ae4a802fb0827e985e Mon Sep 17 00:00:00 2001 From: Alyssa Date: Mon, 5 Aug 2024 18:17:07 +0000 Subject: [PATCH] Update a3-megagpu-8 example to use local SSD solution --- .../a3-megagpu-8g/slurm-a3mega-cluster.yaml | 8 +++ .../a3-megagpu-8g/slurm-a3mega-image.yaml | 72 ------------------- 2 files changed, 8 insertions(+), 72 deletions(-) diff --git a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml index 5ec78fc43e..5dae64cb67 100644 --- a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml +++ b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml @@ -80,6 +80,14 @@ deployment_groups: - id: a3mega_startup source: modules/scripts/startup-script settings: + # When shutting down a VM with local SSD disks, we strongly recommend the + # automatic migration of data following these instructions: + # https://cloud.google.com/compute/docs/disks/local-ssd#stop_instance + # Failure to do so will result in VMs that lose data and do not automatically + # mount local SSD filesystems + local_ssd_filesystem: + mountpoint: /mnt/localssd + permissions: "0755" # must quote numeric filesystem permissions! 
runners: - type: shell destination: setup_aperture.sh diff --git a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml index e9572dc52c..e9f4a6bd89 100644 --- a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml +++ b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml @@ -228,78 +228,6 @@ deployment_groups: ansible.builtin.apt: name: dmabuf-import-helper state: present - - type: ansible-local - destination: mount-local-ssd-service.yml - content: | - --- - - name: Enable mount-local-ssd.service - hosts: all - become: true - tasks: - - name: Install mdadm - ansible.builtin.package: - name: mdadm - state: present - - name: Install local SSD formatting script - ansible.builtin.copy: - dest: /usr/local/ghpc/mount_localssd.sh - owner: root - group: root - mode: 0o755 - content: | - #!/bin/bash - set -e -o pipefail - - RAID_DEVICE=/dev/md0 - DST_MNT=/mnt/localssd - DISK_LABEL=LOCALSSD - OPTIONS=discard,defaults - - # if mount is successful, do nothing - if mount --source LABEL="$DISK_LABEL" --target="$DST_MNT" -o "$OPTIONS"; then - exit 0 - fi - - # Create new RAID, format ext4 and mount - # TODO: handle case of zero or 1 local SSD disk - # TODO: handle case when /dev/md0 exists but was not mountable for - # some reason - DEVICES=`nvme list | grep nvme_ | grep -v nvme_card-pd | awk '{print $1}' | paste -sd ' '` - NB_DEVICES=`nvme list | grep nvme_ | grep -v nvme_card-pd | awk '{print $1}' | wc -l` - mdadm --create "$RAID_DEVICE" --level=0 --raid-devices=$NB_DEVICES $DEVICES - mkfs.ext4 -F "$RAID_DEVICE" - tune2fs "$RAID_DEVICE" -r 131072 - e2label "$RAID_DEVICE" "$DISK_LABEL" - mkdir -p "$DST_MNT" - mount --source LABEL="$DISK_LABEL" --target="$DST_MNT" -o "$OPTIONS" - chmod 1777 "$DST_MNT" - - name: Configure mount-local-ssd.service - ansible.builtin.copy: - dest: /etc/systemd/system/mount-local-ssd.service - owner: root - group: root - mode: 0o644 - content: | - 
[Unit] - Description=Assemble local SSDs as software RAID; then format and mount - - [Service] - ExecCondition=bash -c '/usr/bin/curl -s -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/machine-type | grep -q "/a3-megagpu-8g$"' - ExecStart=/bin/bash /usr/local/ghpc/mount_localssd.sh - - [Install] - WantedBy=local-fs.target - notify: Reload SystemD - handlers: - - name: Reload SystemD - ansible.builtin.systemd: - daemon_reload: true - post_tasks: - - name: Start Local SSD service - ansible.builtin.service: - name: mount-local-ssd.service - state: started - enabled: true - type: ansible-local destination: timesyncd.yml content: |