Skip to content

Commit

Permalink
Merge pull request #3206 from tpdownes/a3_docker
Browse files Browse the repository at this point in the history
Adopt local SSD storage for A3 docker images
  • Loading branch information
tpdownes authored Nov 1, 2024
2 parents 38f5344 + d35a85c commit f7dabee
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ vars:
enable_ops_agent: true
# enable the NVIDIA DCGM daemon and integration into Cloud Ops Agent
enable_nvidia_dcgm: true
localssd_mountpoint: /mnt/localssd

deployment_groups:
- group: cluster
Expand Down Expand Up @@ -114,8 +115,17 @@ deployment_groups:
# Failure to do so will result in VMs that lose data and do not automatically
# mount local SSD filesystems
local_ssd_filesystem:
mountpoint: /mnt/localssd
mountpoint: $(vars.localssd_mountpoint)
permissions: "1777" # must quote numeric filesystem permissions!
# Docker was successfully installed in the image; this configures it
# to use the A3-specific local SSD volumes to store container images.
docker:
enabled: true
world_writable: true
daemon_config: |
{
"data-root": "$(vars.localssd_mountpoint)/docker"
}
runners:
- type: ansible-local
destination: enable_nvidia_dcgm.yml
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ vars:
project: $(vars.project_id)
enable_login_public_ips: true
enable_controller_public_ips: true
localssd_mountpoint: /mnt/localssd

deployment_groups:
- group: cluster
Expand Down Expand Up @@ -89,8 +90,17 @@ deployment_groups:
# Failure to do so will result in VMs that lose data and do not automatically
# mount local SSD filesystems
local_ssd_filesystem:
mountpoint: /mnt/localssd
mountpoint: $(vars.localssd_mountpoint)
permissions: "1777" # must quote numeric filesystem permissions!
# Docker was successfully installed in the image; this configures it
# to use the A3-specific local SSD volumes to store container images.
docker:
enabled: true
world_writable: true
daemon_config: |
{
"data-root": "$(vars.localssd_mountpoint)/docker"
}
runners:
- type: ansible-local
destination: slurm_aperture.yml
Expand Down
5 changes: 4 additions & 1 deletion modules/scripts/startup-script/files/install_docker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,10 @@
dest: /etc/docker/daemon.json
mode: '0644'
content: '{{ docker_daemon_config }}'
validate: /usr/bin/dockerd --validate --config-file %s
# The dockerd --validate flag requires Docker server version 23.0 or later.
# The private A3 DLVM image ships with Docker version 20.10.17, so validation
# is disabled; restore the line below once that image is deprecated.
# validate: /usr/bin/dockerd --validate --config-file %s
when: docker_daemon_config
notify:
- Restart Docker
Expand Down

0 comments on commit f7dabee

Please sign in to comment.