From e1455af9e4a2bb20a2736e7d24c6161697ffc972 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Mon, 4 Nov 2024 22:34:41 +0000 Subject: [PATCH] Refactor mount/mode setting for local SSD RAID The local SSD RAID solution is written in Ansible which will successfully handle re-creating the RAID array and mounting it in scenarios where the VM has been re-created and the contents of local SSD have been discarded. The Slurm solutions do not re-run startup scripts after the first boot using a given persistent disk. During maintenance events, the persistent disk is retained while the local SSD disks are discarded. PR #3129 addressed re-creating, formatting and mounting the RAID array but left a gap in setting the mode of the mounted directory after power off/on cycles. This PR refactors mounting and mode-setting to resolve this gap. --- .../startup-script/files/setup-raid.yml | 44 ++++++++++++------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/modules/scripts/startup-script/files/setup-raid.yml b/modules/scripts/startup-script/files/setup-raid.yml index d7590069a8..5ebf35e522 100644 --- a/modules/scripts/startup-script/files/setup-raid.yml +++ b/modules/scripts/startup-script/files/setup-raid.yml @@ -53,10 +53,11 @@ [Unit] After=local-fs.target Before=slurmd.service - ConditionPathIsMountPoint=!{{ mountpoint }} + ConditionPathExists=!{{ array_dev }} [Service] Type=oneshot + RemainAfterExit=yes ExecStart=/usr/bin/bash -c "/usr/sbin/mdadm --create {{ array_dev }} --name={{ raid_name }} --homehost=any --level=0 --raid-devices={{ local_ssd_devices.files | length }} /dev/disk/by-id/google-local-nvme-ssd-*{{ " --force" if local_ssd_devices.files | length == 1 else "" }}" ExecStartPost=/usr/sbin/mkfs -t {{ fstype }}{{ " -m 0" if fstype == "ext4" else "" }} {{ array_dev }} @@ -70,19 +71,30 @@ enabled: true daemon_reload: true - - name: Mount RAID array - ansible.posix.mount: - src: "{{ array_dev }}" - path: "{{ mountpoint }}" - fstype: "{{ fstype }}" - # the nofail 
option is critical as it enables the boot process to complete on machines - # that were powered off and had local SSD contents discarded; without this option - # VMs may fail to join the network - opts: discard,defaults,nofail - state: mounted + - name: Install service to mount local SSD array + ansible.builtin.copy: + dest: /etc/systemd/system/mount-localssd-raid.service + mode: 0644 + content: | + [Unit] + After=local-fs.target create-localssd-raid.service + Before=slurmd.service + Wants=create-localssd-raid.service + ConditionPathIsMountPoint=!{{ mountpoint }} - - name: Set mount permissions - ansible.builtin.file: - path: "{{ mountpoint }}" - state: directory - mode: "{{ mode }}" + [Service] + Type=oneshot + RemainAfterExit=yes + ExecStart=/usr/bin/systemd-mount -t {{ fstype }} -o discard,defaults,nofail {{ array_dev }} {{ mountpoint }} + ExecStartPost=/usr/bin/chmod {{ mode }} {{ mountpoint }} + ExecStop=/usr/bin/systemd-umount {{ mountpoint }} + + [Install] + WantedBy=slurmd.service + + - name: Mount RAID array and set permissions + ansible.builtin.systemd: + name: mount-localssd-raid.service + state: started + enabled: true + daemon_reload: true