From fc6d088494076d5998c1ed61a07d787161625675 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Sat, 6 Jan 2024 06:08:18 +0000 Subject: [PATCH] Move `examples/hpc-slurm` to V6 * Move `examples/hpc-slurm` to V6; * Update `examples/README`; * Remove `slurm-v5-hpc-centos7` test. --- community/examples/hpc-slurm6.yaml | 87 ------------------- .../schedmd-slurm-gcp-v5-login/README.md | 3 - examples/README.md | 36 +------- examples/hpc-slurm.yaml | 66 +++++++------- .../builds/slurm-gcp-v5-hpc-centos7.yaml | 54 ------------ .../tests/slurm-v5-hpc-centos7.yml | 43 --------- .../daily-tests/tests/slurm-v6-rocky8.yml | 3 +- 7 files changed, 37 insertions(+), 255 deletions(-) delete mode 100644 community/examples/hpc-slurm6.yaml delete mode 100644 tools/cloud-build/daily-tests/builds/slurm-gcp-v5-hpc-centos7.yaml delete mode 100644 tools/cloud-build/daily-tests/tests/slurm-v5-hpc-centos7.yml diff --git a/community/examples/hpc-slurm6.yaml b/community/examples/hpc-slurm6.yaml deleted file mode 100644 index cf6a15b072..0000000000 --- a/community/examples/hpc-slurm6.yaml +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- - -blueprint_name: hpc-slurm6 - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: slurm-gcp-v6 - region: us-west4 - zone: us-west4-c - instance_image: - family: slurm-gcp-6-1-hpc-rocky-linux-8 - project: schedmd-slurm-public - -deployment_groups: -- group: primary - modules: - - id: network - source: modules/network/vpc - - - id: homefs - source: modules/file-system/filestore - use: [network] - settings: - local_mount: /home - - - id: debug_nodeset - source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset - use: [network] - settings: - node_count_dynamic_max: 4 - machine_type: n2-standard-2 - enable_placement: false # the default is: true - - - id: debug_partition - source: community/modules/compute/schedmd-slurm-gcp-v6-partition - use: [debug_nodeset, homefs] - settings: - partition_name: debug - exclusive: false # allows nodes to stay up after jobs are done - is_default: true - - - id: compute_nodeset - source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset - use: [network] - settings: - name: ns2 - node_count_dynamic_max: 20 - bandwidth_tier: gvnic_enabled - - - id: compute_partition - source: community/modules/compute/schedmd-slurm-gcp-v6-partition - use: [compute_nodeset, homefs] - settings: - partition_name: compute - - - id: slurm_login - source: community/modules/scheduler/schedmd-slurm-gcp-v6-login - use: [network] - settings: - name_prefix: login - machine_type: n2-standard-4 - disable_login_public_ips: false - - - id: slurm_controller - source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller - use: - - network - - debug_partition - - compute_partition - - slurm_login - - homefs - settings: - disable_controller_public_ips: false diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md index 4833a999e5..09979f2320 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md +++ 
b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md @@ -28,9 +28,6 @@ This creates a Slurm login node which is: `use` * of VM machine type `n2-standard-4` -For a complete example using this module, see -[hpc-slurm.yaml](../../../../examples/hpc-slurm.yaml). - ## Custom Images For more information on creating valid custom images for the login node VM diff --git a/examples/README.md b/examples/README.md index a677b8ffb5..60ba8a54e9 100644 --- a/examples/README.md +++ b/examples/README.md @@ -13,7 +13,6 @@ md_toc github examples/README.md | sed -e "s/\s-\s/ * /" * [Blueprint Descriptions](#blueprint-descriptions) * [hpc-slurm.yaml](#hpc-slurmyaml-) ![core-badge] * [hpc-enterprise-slurm.yaml](#hpc-enterprise-slurmyaml-) ![core-badge] - * [hpc-slurm6.yaml](#hpc-slurm6yaml-) ![community-badge] ![experimental-badge] * [hpc-slurm6-tpu.yaml](#hpc-slurm6-tpuyaml-) ![community-badge] ![experimental-badge] * [ml-slurm.yaml](#ml-slurmyaml-) ![core-badge] * [image-builder.yaml](#image-builderyaml-) ![core-badge] @@ -119,13 +118,11 @@ the experimental badge (![experimental-badge]). ### [hpc-slurm.yaml] ![core-badge] -> **Warning**: The variables `enable_reconfigure`, -> `enable_cleanup_compute`, and `enable_cleanup_subscriptions`, if set to -> `true`, require additional dependencies **to be installed on the system deploying the infrastructure**. +> **Warning**: Requires additional dependencies **to be installed on the system deploying the infrastructure**. > > ```shell > # Install Python3 and run -> pip3 install -r https://raw.githubusercontent.com/SchedMD/slurm-gcp/5.9.1/scripts/requirements.txt +> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/6.2.1/scripts/requirements.txt > ``` Creates a basic auto-scaling Slurm cluster with mostly default settings. 
The @@ -266,35 +263,6 @@ to 256 [hpc-enterprise-slurm.yaml]: ./hpc-enterprise-slurm.yaml -### [hpc-slurm6.yaml] ![community-badge] ![experimental-badge] - -> **Warning**: Requires additional dependencies **to be installed on the system deploying the infrastructure**. -> -> ```shell -> # Install Python3 and run -> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/6.2.1/scripts/requirements.txt -> ``` - -Creates a basic auto-scaling Slurm cluster with mostly default settings. The -blueprint also creates a new VPC network, and a filestore instance mounted to -`/home`. - -There are 2 partitions in this example: `debug`, and `compute`. The `debug` -partition uses `n2-standard-2` VMs, which should work out of the box without -needing to request additional quota. The purpose of the `debug` partition is to -make sure that first time users are not immediately blocked by quota -limitations. - -[hpc-slurm6.yaml]: ../community/examples/hpc-slurm6.yaml - -#### Compute Partition - -There is a `compute` partition that achieves higher performance. Any -performance analysis should be done on the `compute` partition. By default it -uses `c2-standard-60` VMs with placement groups enabled. You may need to request -additional quota for `C2 CPUs` in the region you are deploying in. You can -select the compute partition using the `-p compute` argument when running `srun`. - ### [hpc-slurm6-tpu.yaml] ![community-badge] ![experimental-badge] > **Warning**: Requires additional dependencies **to be installed on the system deploying the infrastructure**. 
diff --git a/examples/hpc-slurm.yaml b/examples/hpc-slurm.yaml index 439870b8fe..59edc0586b 100644 --- a/examples/hpc-slurm.yaml +++ b/examples/hpc-slurm.yaml @@ -18,7 +18,7 @@ blueprint_name: hpc-slurm vars: project_id: ## Set GCP Project ID Here ## - deployment_name: hpc-small + deployment_name: hpc-slurm region: us-central1 zone: us-central1-a @@ -28,53 +28,54 @@ vars: deployment_groups: - group: primary modules: - # Source is an embedded resource, denoted by "resources/*" without ./, ../, / - # as a prefix. To refer to a local resource, prefix with ./, ../ or / - # Example - ./resources/network/vpc - - id: network1 + # Source is an embedded module, denoted by "modules/*" without ./, ../, / + # as a prefix. To refer to a local module, prefix with ./, ../ or / + # Example - ./modules/network/vpc + - id: network source: modules/network/vpc - id: homefs source: modules/file-system/filestore - use: [network1] + use: [network] settings: local_mount: /home - - id: debug_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group + - id: debug_nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network] settings: node_count_dynamic_max: 4 machine_type: n2-standard-2 + enable_placement: false # the default is: true - id: debug_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition use: - - network1 - homefs - - debug_node_group + - debug_nodeset settings: partition_name: debug exclusive: false # allows nodes to stay up after jobs are done - enable_placement: false # the default is: true is_default: true - - id: compute_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group + - id: compute_nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network] settings: node_count_dynamic_max: 20 bandwidth_tier: gvnic_enabled - id: compute_partition - source: 
community/modules/compute/schedmd-slurm-gcp-v5-partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition use: - - network1 - homefs - - compute_node_group + - compute_nodeset settings: partition_name: compute - - id: h3_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group + - id: h3_nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network] settings: node_count_dynamic_max: 20 machine_type: h3-standard-88 @@ -84,30 +85,29 @@ deployment_groups: bandwidth_tier: gvnic_enabled - id: h3_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition use: - - network1 - homefs - - h3_node_group + - h3_nodeset settings: partition_name: h3 + - id: slurm_login + source: community/modules/scheduler/schedmd-slurm-gcp-v6-login + use: [network] + settings: + name_prefix: login + machine_type: n2-standard-4 + disable_login_public_ips: false + - id: slurm_controller - source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller use: - - network1 + - network - debug_partition - compute_partition - h3_partition - homefs + - slurm_login settings: disable_controller_public_ips: false - - - id: slurm_login - source: community/modules/scheduler/schedmd-slurm-gcp-v5-login - use: - - network1 - - slurm_controller - settings: - machine_type: n2-standard-4 - disable_login_public_ips: false diff --git a/tools/cloud-build/daily-tests/builds/slurm-gcp-v5-hpc-centos7.yaml b/tools/cloud-build/daily-tests/builds/slurm-gcp-v5-hpc-centos7.yaml deleted file mode 100644 index 3896883092..0000000000 --- a/tools/cloud-build/daily-tests/builds/slurm-gcp-v5-hpc-centos7.yaml +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- -timeout: 14400s # 4hr -steps: -## Test simple golang build -- id: build_ghpc - waitFor: ["-"] - name: "golang:bullseye" - entrypoint: /bin/bash - args: - - -c - - | - cd /workspace - make -- id: fetch_builder - waitFor: ["-"] - name: >- - us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder - entrypoint: /bin/bash - args: - - -c - - echo "done fetching builder" -## Test Slurm v5 HPC Centos7 Example -- id: slurm-gcp-v5-hpc-centos7 - waitFor: ["fetch_builder", "build_ghpc"] - name: >- - us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder - entrypoint: /bin/bash - env: - - "ANSIBLE_HOST_KEY_CHECKING=false" - - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - args: - - -c - - | - set -x -e - BUILD_ID_FULL=$BUILD_ID - BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - - ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/slurm-v5-hpc-centos7.yml" diff --git a/tools/cloud-build/daily-tests/tests/slurm-v5-hpc-centos7.yml b/tools/cloud-build/daily-tests/tests/slurm-v5-hpc-centos7.yml deleted file mode 100644 index fe646b74da..0000000000 --- a/tools/cloud-build/daily-tests/tests/slurm-v5-hpc-centos7.yml +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -test_name: hpc-slurm -deployment_name: "cent-v5-{{ build }}" -# Manually adding the slurm_cluster_name for use in node names, which filters -# non-alphanumeric chars and is capped at 10 chars. -slurm_cluster_name: "centv5{{ build[0:4] }}" -zone: us-west4-c -cli_deployment_vars: - enable_cleanup_compute: true - region: us-west4 - zone: "{{ zone }}" - zones: "[us-west4-a,us-west4-b,us-west4-c]" -workspace: /workspace -blueprint_yaml: "{{ workspace }}/examples/hpc-slurm.yaml" -network: "{{ deployment_name }}-net" -max_nodes: 5 -# Note: Pattern matching in gcloud only supports 1 wildcard, centv5*-login-* won't work. -login_node: "{{ slurm_cluster_name }}-login-*" -controller_node: "{{ slurm_cluster_name }}-controller" -post_deploy_tests: -- test-validation/test-mounts.yml -- test-validation/test-partitions.yml -custom_vars: - partitions: - - compute - - debug - mounts: - - /home diff --git a/tools/cloud-build/daily-tests/tests/slurm-v6-rocky8.yml b/tools/cloud-build/daily-tests/tests/slurm-v6-rocky8.yml index b5f0a8655b..b77baf5382 100644 --- a/tools/cloud-build/daily-tests/tests/slurm-v6-rocky8.yml +++ b/tools/cloud-build/daily-tests/tests/slurm-v6-rocky8.yml @@ -26,13 +26,14 @@ cli_deployment_vars: zone: us-west4-c workspace: /workspace -blueprint_yaml: "{{ workspace }}/community/examples/hpc-slurm6.yaml" +blueprint_yaml: "{{ workspace }}/examples/hpc-slurm.yaml" network: "{{ deployment_name }}-net" max_nodes: 5 # Note: Pattern matching in gcloud only supports 1 wildcard, a*-login-* won't work. 
login_node: "{{ slurm_cluster_name }}-login-*" controller_node: "{{ slurm_cluster_name }}-controller" post_deploy_tests: +- test-validation/test-mounts.yml - test-validation/test-partitions.yml custom_vars: partitions: