From 466d1b7ee9a4b9ee0e0d20e78dc9ad50f2457464 Mon Sep 17 00:00:00 2001
From: Alyssa <alyssasm@google.com>
Date: Wed, 13 Mar 2024 20:48:16 +0000
Subject: [PATCH] Migrate gpu.yaml to Slurm-GCP v6

---
 .../validate_configs/test_configs/gpu-v6.yaml | 195 ++++++++++++++++++
 1 file changed, 195 insertions(+)
 create mode 100644 tools/validate_configs/test_configs/gpu-v6.yaml

diff --git a/tools/validate_configs/test_configs/gpu-v6.yaml b/tools/validate_configs/test_configs/gpu-v6.yaml
new file mode 100644
index 0000000000..8d92e62cad
--- /dev/null
+++ b/tools/validate_configs/test_configs/gpu-v6.yaml
@@ -0,0 +1,195 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+---
+
+blueprint_name: gpu-vm-v6
+
+vars:
+  project_id:  ## Set GCP Project ID Here ##
+  deployment_name: gpu-vm-v6
+  region: us-central1
+  zone: us-central1-c
+  instance_image_vm:  # referenced by every vm-instance module in this blueprint
+    family: common-dl-gpu-debian-10
+    project: ml-images
+
+# Broken into 4 groups to better manage GPU quotas
+deployment_groups:
+- group: high-count-auto  # one 16-GPU-class VM, no explicit accelerator block
+  modules:
+  - id: network-hca
+    source: modules/network/pre-existing-vpc
+
+  - id: auto-megagpu
+    source: modules/compute/vm-instance
+    use:
+    - network-hca
+    settings:
+      name_prefix: auto-megagpu
+      machine_type: a2-megagpu-16g  # no guest_accelerator; cf. manual-megagpu
+      instance_image: $(vars.instance_image_vm)
+
+- group: high-count-manual  # same machine type as auto-megagpu, GPUs spelled out
+  modules:
+  - id: network-hcm
+    source: modules/network/pre-existing-vpc
+
+  - id: manual-megagpu
+    source: modules/compute/vm-instance
+    use:
+    - network-hcm
+    settings:
+      name_prefix: manual-megagpu
+      machine_type: a2-megagpu-16g
+      instance_image: $(vars.instance_image_vm)
+      guest_accelerator:  # explicit type/count; auto-megagpu omits this block
+      - type: nvidia-tesla-a100
+        count: 16
+
+- group: low-count  # N1 and 1-2 GPU A2 VMs, all attached to one shared network
+  modules:
+  # Source is an embedded module, denoted by "modules/*" (or
+  # "community/modules/*") without ./, ../, / as a prefix. To refer to a
+  # module by local path, prefix with ./, ../ or / (e.g. ./modules/network/vpc)
+  - id: network
+    source: modules/network/pre-existing-vpc
+
+  - id: nogpu-n1
+    source: ./modules/compute/vm-instance  # local-path form (./ prefix)
+    use:
+    - network
+    settings:
+      name_prefix: nogpu-n1
+      machine_type: n1-standard-8  # no guest_accelerator on this module
+      instance_image: $(vars.instance_image_vm)
+
+  - id: manual-n1
+    source: ./modules/compute/vm-instance  # local-path form (./ prefix)
+    use:
+    - network
+    settings:
+      name_prefix: manual-n1
+      machine_type: n1-standard-32
+      on_host_maintenance: TERMINATE  # only module here that sets this
+      instance_image: $(vars.instance_image_vm)
+      guest_accelerator:  # GPU attached explicitly to an N1 machine type
+      - type: nvidia-tesla-t4
+        count: 1
+
+  - id: auto-highgpu
+    source: modules/compute/vm-instance
+    use:
+    - network
+    settings:
+      name_prefix: auto-highgpu
+      machine_type: a2-highgpu-1g  # no guest_accelerator; cf. manual-highgpu
+      instance_image: $(vars.instance_image_vm)
+
+  - id: manual-highgpu
+    source: modules/compute/vm-instance
+    use:
+    - network
+    settings:
+      name_prefix: manual-highgpu
+      machine_type: a2-highgpu-2g  # NOTE(review): auto-highgpu uses the 1g size
+      instance_image: $(vars.instance_image_vm)
+      guest_accelerator:  # explicit type/count; auto-highgpu omits this block
+      - type: nvidia-tesla-a100
+        count: 2
+
+  - id: auto-ultragpu
+    source: modules/compute/vm-instance
+    use:
+    - network
+    settings:
+      name_prefix: auto-ultragpu
+      machine_type: a2-ultragpu-2g  # no guest_accelerator; cf. manual-ultragpu
+      instance_image: $(vars.instance_image_vm)
+
+  - id: manual-ultragpu
+    source: modules/compute/vm-instance
+    use:
+    - network
+    settings:
+      name_prefix: manual-ultragpu
+      machine_type: a2-ultragpu-2g
+      instance_image: $(vars.instance_image_vm)
+      guest_accelerator:  # explicit type/count; auto-ultragpu omits this block
+      - type: nvidia-a100-80gb
+        count: 2
+
+- group: slurm-gcp-v6  # Slurm cluster: 3 nodesets -> 1 partition -> controller
+  modules:
+  - id: network_slurm
+    source: modules/network/pre-existing-vpc
+
+  - id: nogpu_nodegroup
+    source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
+    use: [network_slurm]
+    settings:
+      name: nogpu
+      enable_placement: false
+      node_count_dynamic_max: 4
+      machine_type: n2-standard-2  # no guest_accelerator on this nodeset
+
+  - id: manual_nodegroup
+    source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
+    use: [network_slurm]
+    settings:
+      name: man
+      enable_placement: false
+      node_count_dynamic_max: 4
+      machine_type: a2-ultragpu-2g
+      guest_accelerator:  # explicit type/count; auto_nodegroup omits this block
+      - type: nvidia-a100-80gb
+        count: 2
+
+  - id: auto_nodegroup
+    source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
+    use: [network_slurm]
+    settings:
+      name: auto
+      enable_placement: false
+      node_count_dynamic_max: 4
+      machine_type: a2-ultragpu-2g  # no guest_accelerator; cf. manual_nodegroup
+
+  - id: partition
+    source: community/modules/compute/schedmd-slurm-gcp-v6-partition
+    use:  # all three nodesets defined above feed this single partition
+    - nogpu_nodegroup
+    - manual_nodegroup
+    - auto_nodegroup
+    settings:
+      partition_name: debug
+      is_default: true
+
+  - id: slurm_login
+    source: community/modules/scheduler/schedmd-slurm-gcp-v6-login
+    use:
+    - network_slurm
+    settings:
+      name_prefix: login
+      disable_login_public_ips: false
+      machine_type: a2-highgpu-1g  # login node on a GPU machine type
+
+  - id: slurm_controller
+    source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller
+    use:  # consumes the network, the login module and the partition
+    - network_slurm
+    - slurm_login
+    - partition
+    settings:
+      disable_controller_public_ips: false
+      machine_type: a2-highgpu-2g  # controller on a GPU machine type