Skip to content

Commit

Permalink
Merge pull request #2871 from alyssa-sm/update-megagpu-example
Browse files Browse the repository at this point in the history
Update a3-megagpu-8 example to use local ssd solution
  • Loading branch information
alyssa-sm authored Aug 6, 2024
2 parents c9d4dfe + cb8c7fb commit 2b62970
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 72 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,14 @@ deployment_groups:
- id: a3mega_startup
source: modules/scripts/startup-script
settings:
# When shutting down a VM with local SSD disks, we strongly recommend the
# automatic migration of data following these instructions:
# https://cloud.google.com/compute/docs/disks/local-ssd#stop_instance
# Failure to do so will result in VMs that lose data and do not automatically
# mount local SSD filesystems
local_ssd_filesystem:
mountpoint: /mnt/localssd
permissions: "0755" # must quote numeric filesystem permissions!
runners:
- type: shell
destination: setup_aperture.sh
Expand Down
72 changes: 0 additions & 72 deletions examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -228,78 +228,6 @@ deployment_groups:
ansible.builtin.apt:
name: dmabuf-import-helper
state: present
# Runner: installs mdadm, a script that assembles the local SSDs into a RAID-0
# array mounted at /mnt/localssd, and a systemd unit that runs that script on
# a3-megagpu-8g machines.
- type: ansible-local
  destination: mount-local-ssd-service.yml
  content: |
    ---
    - name: Enable mount-local-ssd.service
      hosts: all
      become: true
      tasks:
        - name: Install mdadm
          ansible.builtin.package:
            name: mdadm
            state: present
        - name: Install local SSD formatting script
          ansible.builtin.copy:
            dest: /usr/local/ghpc/mount_localssd.sh
            owner: root
            group: root
            # quoted octal string, as recommended by the Ansible copy module
            mode: "0755"
            # NOTE: this content is templated by Ansible; avoid bash syntax
            # that collides with Jinja delimiters (for example "{#")
            content: |
              #!/bin/bash
              set -e -o pipefail
              RAID_DEVICE=/dev/md0
              DST_MNT=/mnt/localssd
              DISK_LABEL=LOCALSSD
              OPTIONS=discard,defaults
              # Create the mount point first so that a missing directory can
              # never be the reason the mount below fails (which would send
              # us down the reformat path)
              mkdir -p "$DST_MNT"
              # if mount is successful, do nothing
              if mount --source LABEL="$DISK_LABEL" --target="$DST_MNT" -o "$OPTIONS"; then
                exit 0
              fi
              # Create new RAID, format ext4 and mount
              # TODO: handle case when /dev/md0 exists but was not mountable for
              # some reason
              # List local SSD devices once; "|| true" keeps "set -e" from
              # aborting silently when grep matches nothing, so that the
              # explicit check below can report the problem
              DEVICES=$(nvme list | grep nvme_ | grep -v nvme_card-pd | awk '{print $1}' | paste -sd ' ' || true)
              NB_DEVICES=$(wc -w <<< "$DEVICES")
              if [ "$NB_DEVICES" -eq 0 ]; then
                echo "No local SSD devices found; cannot create $RAID_DEVICE" >&2
                exit 1
              fi
              # mdadm refuses to create a single-device array unless forced
              FORCE_OPT=""
              if [ "$NB_DEVICES" -eq 1 ]; then
                FORCE_OPT="--force"
              fi
              # $FORCE_OPT and $DEVICES are intentionally unquoted: they must
              # expand to zero or more separate arguments
              mdadm --create "$RAID_DEVICE" --level=0 $FORCE_OPT --raid-devices="$NB_DEVICES" $DEVICES
              mkfs.ext4 -F "$RAID_DEVICE"
              tune2fs "$RAID_DEVICE" -r 131072
              e2label "$RAID_DEVICE" "$DISK_LABEL"
              mount --source LABEL="$DISK_LABEL" --target="$DST_MNT" -o "$OPTIONS"
              chmod 1777 "$DST_MNT"
        - name: Configure mount-local-ssd.service
          ansible.builtin.copy:
            dest: /etc/systemd/system/mount-local-ssd.service
            owner: root
            group: root
            mode: "0644"
            content: |
              [Unit]
              Description=Assemble local SSDs as software RAID; then format and mount
              [Service]
              ExecCondition=bash -c '/usr/bin/curl -s -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/machine-type | grep -q "/a3-megagpu-8g$"'
              ExecStart=/bin/bash /usr/local/ghpc/mount_localssd.sh
              [Install]
              WantedBy=local-fs.target
          notify: Reload SystemD
      handlers:
        - name: Reload SystemD
          ansible.builtin.systemd:
            daemon_reload: true
      post_tasks:
        - name: Start Local SSD service
          ansible.builtin.service:
            name: mount-local-ssd.service
            state: started
            enabled: true
- type: ansible-local
destination: timesyncd.yml
content: |
Expand Down

0 comments on commit 2b62970

Please sign in to comment.