diff --git a/cmd/root.go b/cmd/root.go index e58b8a743d..f46cc18e47 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -53,7 +53,7 @@ HPC deployments on the Google Cloud Platform.`, logging.Fatal("cmd.Help function failed: %s", err) } }, - Version: "v1.41.0", + Version: "v1.42.0", Annotations: annotation, } ) diff --git a/community/examples/AMD/hpc-amd-slurm.yaml b/community/examples/AMD/hpc-amd-slurm.yaml index 282d5b7816..211f41e9cc 100644 --- a/community/examples/AMD/hpc-amd-slurm.yaml +++ b/community/examples/AMD/hpc-amd-slurm.yaml @@ -168,7 +168,7 @@ deployment_groups: # these images must match the images used by Slurm modules below because # we are building OpenMPI with PMI support in libraries contained in # Slurm installation - family: slurm-gcp-6-7-hpc-rocky-linux-8 + family: slurm-gcp-6-8-hpc-rocky-linux-8 project: schedmd-slurm-public - id: low_cost_nodeset diff --git a/community/examples/flux-framework/flux-cluster.yaml b/community/examples/flux-framework/flux-cluster.yaml index e38393fe01..7004b3af88 100644 --- a/community/examples/flux-framework/flux-cluster.yaml +++ b/community/examples/flux-framework/flux-cluster.yaml @@ -34,7 +34,7 @@ deployment_groups: settings: local_mount: /home - id: fluxfw-gcp - source: github.com/GoogleCloudPlatform/scientific-computing-examples//fluxfw-gcp/tf?ref=867e558 + source: github.com/GoogleCloudPlatform/scientific-computing-examples//fluxfw-gcp/tf?ref=cb36377 settings: compute_node_specs: - name_prefix: gfluxfw-compute diff --git a/community/examples/hpc-build-slurm-image.yaml b/community/examples/hpc-build-slurm-image.yaml index a1fa81767e..67994a4be5 100644 --- a/community/examples/hpc-build-slurm-image.yaml +++ b/community/examples/hpc-build-slurm-image.yaml @@ -23,7 +23,7 @@ vars: image_build_machine_type: n2d-standard-16 build_from_image_family: hpc-rocky-linux-8 build_from_image_project: cloud-hpc-image-public - build_from_git_ref: 6.8.2 + build_from_git_ref: 6.8.5 built_image_family: my-custom-slurm built_instance_image: family: $(vars.built_image_family) diff --git a/community/examples/hpc-slurm-ubuntu2004.yaml b/community/examples/hpc-slurm-ubuntu2004.yaml index 34037a1052..475f65a317 100644 --- a/community/examples/hpc-slurm-ubuntu2004.yaml +++ b/community/examples/hpc-slurm-ubuntu2004.yaml @@ -24,7 +24,7 @@ vars: slurm_image: # Please refer to the following link for the latest images: # https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#supported-operating-systems - family: slurm-gcp-6-7-ubuntu-2004-lts + family: slurm-gcp-6-8-ubuntu-2004-lts project: schedmd-slurm-public instance_image_custom: true diff --git a/community/examples/hpc-slurm6-apptainer.yaml b/community/examples/hpc-slurm6-apptainer.yaml index 47e9c267aa..fba9e18f87 100644 --- a/community/examples/hpc-slurm6-apptainer.yaml +++ b/community/examples/hpc-slurm6-apptainer.yaml @@ -60,7 +60,7 @@ deployment_groups: settings: source_image_project_id: [schedmd-slurm-public] # see latest in https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family - source_image_family: slurm-gcp-6-7-hpc-rocky-linux-8 + source_image_family: slurm-gcp-6-8-hpc-rocky-linux-8 # You can find size of source image by using following command # gcloud compute images describe-from-family --project schedmd-slurm-public disk_size: $(vars.disk_size) diff --git a/community/examples/intel/README.md b/community/examples/intel/README.md deleted file mode 100644 index e83bd27391..0000000000 --- a/community/examples/intel/README.md +++ /dev/null @@ 
-1,454 +0,0 @@ -# Intel Solutions for the Cluster Toolkit (formerly HPC Toolkit) - -> **_NOTE:_** The [hpc-slurm-daos.yaml](hpc-slurm-daos.yaml) will not be compatible -> with newer versions of slurm-gcp v6. - - - - -- [Intel Solutions for the Cluster Toolkit](#intel-solutions-for-the-cluster-toolkit) - - [DAOS Cluster](#daos-cluster) - - [Initial Setup for DAOS Cluster](#initial-setup-for-daos-cluster) - - [Deploy the DAOS Cluster](#deploy-the-daos-cluster) - - [Connect to a client node](#connect-to-a-client-node) - - [Verify the DAOS storage system](#verify-the-daos-storage-system) - - [Create a DAOS Pool and Container](#create-a-daos-pool-and-container) - - [About the DAOS Command Line Tools](#about-the-daos-command-line-tools) - - [View Free Space](#view-free-space) - - [Create a Pool](#create-a-pool) - - [Create a Container](#create-a-container) - - [Mount the DAOS Container](#mount-the-daos-container) - - [Use DAOS Storage](#use-daos-storage) - - [Unmount the DAOS Container](#unmount-the-daos-container) - - [Delete the DAOS infrastructure when not in use](#delete-the-daos-infrastructure-when-not-in-use) - - [DAOS Server with Slurm cluster](#daos-server-with-slurm-cluster) - - [Initial Setup for the DAOS/Slurm cluster](#initial-setup-for-the-daosslurm-cluster) - - [Deploy the DAOS/Slurm Cluster](#deploy-the-daosslurm-cluster) - - [Connect to the DAOS/Slurm Cluster login node](#connect-to-the-daosslurm-cluster-login-node) - - [Create and Mount a DAOS Container](#create-and-mount-a-daos-container) - - [Run a Job that uses the DAOS Container](#run-a-job-that-uses-the-daos-container) - - [Unmount the Container](#unmount-the-container) - - [Delete the DAOS/Slurm Cluster infrastructure when not in use](#delete-the-daosslurm-cluster-infrastructure-when-not-in-use) - -## DAOS Cluster - -The [pfs-daos.yaml](pfs-daos.yaml) blueprint describes an environment with -- Two DAOS server instances -- Two DAOS client instances - -The [pfs-daos.yaml](pfs-daos.yaml) blueprint uses a Packer template and -Terraform modules from the [Google Cloud DAOS][google-cloud-daos] repository. -Please review the [introduction to image building](../../../docs/image-building.md) -for general information on building custom images using the Toolkit. - -Identify a project to work in and substitute its unique id wherever you see -`<>` in the instructions below. - -[google-cloud-daos]: https://github.com/daos-stack/google-cloud-daos -[pre-deployment_guide]: https://github.com/daos-stack/google-cloud-daos/blob/main/docs/pre-deployment_guide.md -[DAOS Yum Repository]: https://packages.daos.io - -### Initial Setup for DAOS Cluster - -Before provisioning the DAOS cluster you must follow the steps listed in the [Google Cloud DAOS Pre-deployment Guide][pre-deployment_guide]. - -Skip the "Build DAOS Images" step at the end of the [Pre-deployment Guide][pre-deployment_guide]. The [pfs-daos.yaml](pfs-daos.yaml) blueprint will build the images as part of the deployment. - -The Pre-deployment Guide provides instructions for: -- installing the Google Cloud CLI -- enabling service accounts -- enabling APIs -- establishing minimum resource quotas -- creating a Cloud NAT to allow instances without public IPs to access the [DAOS Yum Repository] repository. 
- -### Deploy the DAOS Cluster - -After completing the steps in the [Pre-deployment Guide][pre-deployment_guide] use `gcluster` to provision the blueprint - -```text -gcluster create community/examples/intel/pfs-daos.yaml \ - --vars project_id=<> \ - [--backend-config bucket=] -``` - -This will create the deployment directory containing Terraform modules and -Packer templates. The `--backend-config` option is not required but recommended. -It will save the terraform state in a pre-existing [Google Cloud Storage -bucket][bucket]. For more information see [Setting up a remote terraform -state][backend]. Use `gcluster deploy` to provision your DAOS storage cluster: - -```text -gcluster deploy pfs-daos --auto-approve -``` - -[backend]: ../../../examples/README.md#optional-setting-up-a-remote-terraform-state -[bucket]: https://cloud.google.com/storage/docs/creating-buckets - -### Connect to a client node - -1. Open the following URL in a new tab. - - https://console.cloud.google.com/compute - - This will take you to **Compute Engine > VM instances** in the Google Cloud Console. - - Select the project in which the DAOS cluster will be provisioned. - -2. Click on the **SSH** button associated with the **daos-client-0001** - instance to open a window with a terminal into the first DAOS client instance. - -### Verify the DAOS storage system - -The `community/examples/intel/pfs-daos.yaml` blueprint does not contain configuration for DAOS pools and containers. Therefore, pools and containers will need to be created manually. - -Before pools and containers can be created the storage system must be formatted. Formatting the storage is done automatically by the startup script that runs on the *daos-server-0001* instance. The startup script will run the [dmg storage format](https://docs.daos.io/v2.4/admin/deployment/?h=dmg+storage#storage-formatting) command. It may take a few minutes for all daos server instances to join. - -Verify that the storage system has been formatted and that the daos-server instances have joined. - -```bash -sudo dmg system query -v -``` - -The command will not return output until the system is ready. - -The output will look similar to - -```text -Rank UUID Control Address Fault Domain State Reason ----- ---- --------------- ------------ ----- ------ -0 225a0a51-d4ed-4ac3-b1a5-04b31c08b559 10.128.0.51:10001 /daos-server-0001 Joined -1 553ab1dc-99af-460e-a57c-3350611d1d09 10.128.0.43:10001 /daos-server-0002 Joined -``` - -Both daos-server instances should show a state of *Joined*. - -### Create a DAOS Pool and Container - -#### About the DAOS Command Line Tools - -The DAOS Management tool `dmg` is used by System Administrators to manage the DAOS storage [system](https://docs.daos.io/v2.4/overview/architecture/#daos-system) and DAOS [pools](https://docs.daos.io/v2.4/overview/storage/#daos-pool). Therefore, `sudo` must be used when running `dmg`. - -The DAOS CLI `daos` is used by both users and System Administrators to create and manage [containers](https://docs.daos.io/v2.4/overview/storage/#daos-container). It is not necessary to use `sudo` with the `daos` command. - -#### View Free Space - -View how much free space is available. - -```bash -sudo dmg storage query usage -``` - -#### Create a Pool - -Create a single pool owned by root which uses 100% of the available free space. - -```bash -sudo dmg pool create --size=100% --user=root pool1 -``` - -Set ACLs to allow any user to create a container in *pool1*. 
- -```bash -sudo dmg pool update-acl -e A::EVERYONE@:rcta pool1 -``` - -See the [Pool Operations](https://docs.daos.io/v2.4/admin/pool_operations) section of the DAOS Administration Guide for more information about creating pools. - -#### Create a Container - -At this point it is necessary to determine who will need to access the container -and how it will be used. The ACLs will need to be set properly to allow users and/or groups to access the container. - -For the purpose of this demo create the container without specifying ACLs. The container will be owned by your user account and you will have full access to the container. - -```bash -daos container create --type=POSIX --properties=rf:0 pool1 cont1 -``` - -See the [Container Management](https://docs.daos.io/v2.4/user/container) section of the DAOS User Guide for more information about creating containers. - -#### Mount the DAOS Container - -Mount the container with dfuse (DAOS Fuse) - -```bash -mkdir -p "${HOME}/daos/cont1" -dfuse --singlethread --pool=pool1 --container=cont1 --mountpoint="${HOME}/daos/cont1" -``` - -Verify that the container is mounted - -```bash -df -h -t fuse.daos -``` - -### Use DAOS Storage - -The `cont1` container is now mounted on `${HOME}/daos/cont1` - -Create a 20GiB file which will be stored in the DAOS filesystem. - -```bash -time LD_PRELOAD=/usr/lib64/libioil.so \ -dd if=/dev/zero of="${HOME}/daos/cont1/test20GiB.img" iflag=fullblock bs=1G count=20 -``` - -**Known Issue:** - -When you run `ls -lh "${HOME}/daos/cont1"` you may see that the `test20GiB.img` file shows a size of 0 bytes. - -If you unmount the container and mount it again, the file size will show as 20G. - -```bash -fusermount3 -u "${HOME}/daos/cont1" -dfuse --singlethread --pool=pool1 --container=cont1 --mountpoint="${HOME}/daos/cont1" -ls -lh "${HOME}/daos/cont1" -``` - -A work-around for this issue is to disable caching when mounting the container. - -```bash -dfuse --singlethread --disable-caching --pool=pool1 --container=cont1 --mountpoint="${HOME}/daos/cont1" -``` - -See the [File System](https://docs.daos.io/v2.4/user/filesystem/) section of the DAOS User Guide for more information about DFuse. - -### Unmount the DAOS Container - -The container will need to be unmounted before you log out. If this is not done it can leave open file handles and prevent the container from being mounted when you log in again. - -Verify that the container is unmounted - -```bash -df -h -t fuse.daos -``` - -Logout of the DAOS client instance. - -```bash -logout -``` - -See the [DFuse (DAOS FUSE)](https://docs.daos.io/v2.4/user/filesystem/?h=dfuse#dfuse-daos-fuse) section of the DAOS User Guide for more information about mounting POSIX containers. - -### Delete the DAOS infrastructure when not in use - -> **_NOTE:_** Data stored in the DAOS container will be permanently lost after cluster deletion. - -Delete the remaining infrastructure - -```bash -gcluster destroy pfs-daos --auto-approve -``` - -## DAOS Server with Slurm cluster - -The [hpc-slurm-daos.yaml](hpc-slurm-daos.yaml) blueprint can be used to deploy a Slurm cluster and four DAOS server instances. The Slurm compute instances are configured as DAOS clients. 
- -The blueprint uses modules from -- [google-cloud-daos][google-cloud-daos] -- [community/modules/compute/schedmd-slurm-gcp-v6-nodeset][schedmd-slurm-gcp-v6-nodeset] -- [community/modules/compute/schedmd-slurm-gcp-v6-partition][schedmd-slurm-gcp-v6-partition] -- [community/modules/scheduler/schedmd-slurm-gcp-v6-login][schedmd-slurm-gcp-v6-login] -- [community/modules/scheduler/schedmd-slurm-gcp-v6-controller][schedmd-slurm-gcp-v6-controller] - -The blueprint also uses a Packer template from the [Google Cloud -DAOS][google-cloud-daos] repository. Please review the [introduction to image -building](../../../docs/image-building.md) for general information on building -custom images using the Toolkit. - -Substitute your project ID wherever you see `<>` in the instructions below. - -### Initial Setup for the DAOS/Slurm cluster - -Before provisioning the DAOS cluster you must follow the steps listed in the [Google Cloud DAOS Pre-deployment Guide][pre-deployment_guide]. - -Skip the "Build DAOS Images" step at the end of the [Pre-deployment Guide][pre-deployment_guide]. The [hpc-slurm-daos.yaml](hpc-slurm-daos.yaml) blueprint will build the DAOS server image as part of the deployment. - -The [Pre-deployment Guide][pre-deployment_guide] provides instructions for enabling service accounts, APIs, establishing minimum resource quotas and other necessary steps to prepare your project for DAOS server deployment. - -[google-cloud-daos]: https://github.com/daos-stack/google-cloud-daos -[pre-deployment_guide]: https://github.com/daos-stack/google-cloud-daos/blob/main/docs/pre-deployment_guide.md -[packer-template]: https://github.com/daos-stack/google-cloud-daos/blob/main/images/daos.pkr.hcl -[apis]: ../../../README.md#enable-gcp-apis -[schedmd-slurm-gcp-v6-nodeset]: ../../modules/compute/schedmd-slurm-gcp-v6-nodeset -[schedmd-slurm-gcp-v6-partition]: ../../modules/compute/schedmd-slurm-gcp-v6-partition -[schedmd-slurm-gcp-v6-controller]: ../../modules/scheduler/schedmd-slurm-gcp-v6-controller -[schedmd-slurm-gcp-v6-login]: ../../modules/scheduler/schedmd-slurm-gcp-v6-login - -Follow the Toolkit guidance to enable [APIs][apis] and establish minimum resource [quotas][quotas] for Slurm. - -[apis]: ../../../README.md#enable-gcp-apis -[quotas]: ../../../README.md#gcp-quotas - -The following available quota is required in the region used by Slurm: - -- Filestore: 2560GB -- C2 CPUs: 6000 (fully-scaled "compute" partition) - - This quota is not necessary at initial deployment, but will be required to - successfully scale the partition to its maximum size -- C2 CPUs: 4 (login node) - -### Deploy the DAOS/Slurm Cluster - -Use `gcluster` to provision the blueprint, supplying your project ID - -```text -gcluster create community/examples/intel/hpc-slurm-daos.yaml \ - --vars project_id=<> \ - [--backend-config bucket=] -``` - -This will create a set of directories containing Terraform modules and Packer -templates. - -The `--backend-config` option is not required but recommended. It will save the terraform state in a pre-existing [Google Cloud Storage bucket][bucket]. For more information see [Setting up a remote terraform state][backend]. 
- -Follow `gcluster` instructions to deploy the environment - -```text -gcluster deploy hpc-slurm-daos --auto-approve -``` - -[backend]: ../../../examples/README.md#optional-setting-up-a-remote-terraform-state -[bucket]: https://cloud.google.com/storage/docs/creating-buckets - -### Connect to the DAOS/Slurm Cluster login node - -Once the startup script has completed and Slurm reports readiness, connect to the login node. - -1. Open the following URL in a new tab. - - https://console.cloud.google.com/compute - - This will take you to **Compute Engine > VM instances** in the Google Cloud Console - - Select the project in which the cluster will be provisioned. - -2. Click on the `SSH` button associated with the `hpcslurmda-login-login-001` - instance. - - This will open a separate pop-up window with a terminal into our newly created - Slurm login VM. - -### Create and Mount a DAOS Container - -The [community/examples/intel/hpc-slurm-daos.yaml](hpc-slurm-daos.yaml) blueprint defines a single DAOS pool named `pool1`. The pool will be created when the *daos-server* instances are provisioned. - -You will need to create your own DAOS container in the pool that can be used by your Slurm jobs. - -While logged into the login node create a container named `cont1` in the `pool1` pool: - -```bash -daos cont create --type=POSIX --properties=rf:0 pool1 cont1 -``` - -NOTE: If you encounter an error `daos: command not found`, it's likely that the startup scripts have not finished running yet. Wait a few minutes and try again. - -Since the `cont1` container is owned by your account, your Slurm jobs will need to run as your user account to access the container. - -Create a mount point for the container and mount it with dfuse (DAOS Fuse) - -```bash -mkdir -p ${HOME}/daos/cont1 - -dfuse --singlethread \ ---pool=pool1 \ ---container=cont1 \ ---mountpoint=${HOME}/daos/cont1 -``` - -Verify that the container is mounted - -```bash -df -h -t fuse.daos -``` - -### Run a Job that uses the DAOS Container - -On the login node create a `daos_job.sh` file with the following content - -```bash -#!/bin/bash -JOB_HOSTNAME="$(hostname)" -TIMESTAMP="$(date '+%Y%m%d%H%M%S')" - -echo "Timestamp = ${TIMESTAMP}" -echo "Date = $(date)" -echo "Hostname = $(hostname)" -echo "User = $(whoami)" -echo "Working Directory = $(pwd)" -echo "" -echo "Number of Nodes Allocated = $SLURM_JOB_NUM_NODES" -echo "Number of Tasks Allocated = $SLURM_NTASKS" - -MOUNT_DIR="${HOME}/daos/cont1" -LOG_FILE="${MOUNT_DIR}/${JOB_HOSTNAME}.log" - -echo "${JOB_HOSTNAME} : Creating directory: ${MOUNT_DIR}" -mkdir -p "${MOUNT_DIR}" - -echo "${JOB_HOSTNAME} : Mounting with dfuse" -dfuse --singlethread --pool=pool1 --container=cont1 --mountpoint="${MOUNT_DIR}" -sleep 5 - -echo "${JOB_HOSTNAME} : Creating log file" -echo "Job ${SLURM_JOB_ID} running on ${JOB_HOSTNAME}" | tee "${MOUNT_DIR}/${TIMESTAMP}_${JOB_HOSTNAME}.log" - -echo "${JOB_HOSTNAME} : Unmounting dfuse" -fusermount3 -u "${MOUNT_DIR}" - -``` - -Run the `daos_job.sh` script in an interactive Slurm job on 4 nodes - -```bash -srun --nodes=4 \ - --ntasks-per-node=1 \ - --time=00:10:00 \ - --job-name=daos \ - --output=srunjob_%j.log \ - --partition=compute \ - daos_job.sh & -``` - -Run `squeue` to see the status of the job. The `daos_job.sh` script will run once on each of the 4 nodes. Each time it runs it creates a log file which is stored in the `cont1` DAOS container. - -Wait for the job to complete and then view the files that were created in the `cont1` DAOS container mounted on `${HOME}/daos/cont1`. 
- -```bash -ls -l ${HOME}/daos/cont1/*.log -cat ${HOME}/daos/cont1/*.log -``` - -### Unmount the Container - -The container will need to be unmounted before you log out. If this is not done it can leave open file handles and prevent the container from being mounted when you log in again. - -```bash -fusermount3 -u ${HOME}/daos/cont1 -``` - -Verify that the container is unmounted - -```bash -df -h -t fuse.daos -``` - -See the [DFuse (DAOS FUSE)](https://docs.daos.io/v2.4/user/filesystem/?h=dfuse#dfuse-daos-fuse) section of the DAOS User Guide for more information about mounting POSIX containers. - -### Delete the DAOS/Slurm Cluster infrastructure when not in use - -> **_NOTE:_** -> -> - Data on the DAOS file system will be permanently lost after cluster deletion. -> - If the Slurm controller is shut down before the auto-scale instances are destroyed, those compute instances will be left running. - -Open your browser to the VM instances page and ensure that instances named "compute" -have been shutdown and deleted by the Slurm autoscaler. - -Delete the remaining infrastructure: - -```bash -gcluster destroy hpc-slurm-daos --auto-approve -``` diff --git a/community/examples/intel/hpc-slurm-daos.yaml b/community/examples/intel/hpc-slurm-daos.yaml deleted file mode 100644 index b3c217474c..0000000000 --- a/community/examples/intel/hpc-slurm-daos.yaml +++ /dev/null @@ -1,188 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- - -blueprint_name: hpc-slurm-daos - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: hpc-slurm-daos - region: us-central1 - zone: us-central1-c - daos_server_image_family: daos-server-hpc-rocky-8 - daos_version: "2.4" - tags: [] - -# Note: this blueprint assumes the existence of a default global network and -# subnetwork in the region chosen above - -validators: -- validator: test_module_not_used - inputs: {} - skip: true - -deployment_groups: -- group: primary - modules: - - id: network1 - source: github.com/GoogleCloudPlatform/hpc-toolkit//modules/network/vpc?ref=v1.33.0&depth=1 - - - id: homefs - source: github.com/GoogleCloudPlatform/hpc-toolkit//modules/file-system/filestore?ref=v1.33.0&depth=1 - use: [network1] - settings: - local_mount: /home - -- group: daos-server-image - modules: - # more info: https://github.com/daos-stack/google-cloud-daos/tree/main/images - - id: daos-server-image - source: "github.com/daos-stack/google-cloud-daos//images?ref=v0.5.0&depth=1" - kind: packer - settings: - daos_version: $(vars.daos_version) - daos_repo_base_url: https://packages.daos.io/ - daos_packages_repo_file: EL8/packages/x86_64/daos_packages.repo - use_iap: true - enable_oslogin: false - machine_type: n2-standard-32 - source_image_family: hpc-rocky-linux-8 - source_image_project_id: cloud-hpc-image-public - image_guest_os_features: ["GVNIC"] - disk_size: "20" - state_timeout: "10m" - scopes: ["https://www.googleapis.com/auth/cloud-platform"] - use_internal_ip: true - omit_external_ip: false - daos_install_type: server - image_family: $(vars.daos_server_image_family) - -- group: cluster - modules: - # more info: https://github.com/daos-stack/google-cloud-daos/tree/main/terraform/modules/daos_server - - id: daos - source: "github.com/daos-stack/google-cloud-daos//terraform/modules/daos_server?ref=v0.5.0&depth=1" - use: [network1] - settings: - labels: {ghpc_role: file-system} - machine_type: "n2-standard-16" - os_family: $(vars.daos_server_image_family) - daos_disk_count: 4 - tags: $(vars.tags) - pools: - - name: "pool1" - size: "100%" - # Do not set value for scm_size when size=100% - daos_scm_size: - user: "root@" - group: "root@" - acls: - - "A::OWNER@:rwdtTaAo" - - "A:G:GROUP@:rwtT" - - "A::EVERYONE@:rcta" - properties: - reclaim: "lazy" - containers: [] - - - id: daos-client-script - source: github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.33.0&depth=1 - settings: - runners: - - type: data - content: $(daos.daos_agent_yml) - destination: /etc/daos/daos_agent.yml - - type: data - content: $(daos.daos_control_yml) - destination: /etc/daos/daos_control.yml - - type: shell - content: $(daos.daos_client_install_script) - destination: /tmp/daos_client_install.sh - - type: shell - content: $(daos.daos_client_config_script) - destination: /tmp/daos_client_config.sh - - - id: debug_nodeset - source: github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/compute/schedmd-slurm-gcp-v6-nodeset?ref=v1.33.0&depth=1 - use: [network1] - settings: - name: ns1 - node_count_dynamic_max: 4 - machine_type: n2-standard-2 - enable_placement: false # the default is: true - service_account_scopes: - - "https://www.googleapis.com/auth/monitoring.write" - - "https://www.googleapis.com/auth/logging.write" - - "https://www.googleapis.com/auth/devstorage.read_only" - - "https://www.googleapis.com/auth/cloud-platform" - - - id: debug_partition - source: 
github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/compute/schedmd-slurm-gcp-v6-partition?ref=v1.33.0&depth=1 - use: [debug_nodeset] - settings: - partition_name: debug - exclusive: false # allows nodes to stay up after jobs are done - is_default: true - - - id: compute_nodeset - source: github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/compute/schedmd-slurm-gcp-v6-nodeset?ref=v1.33.0&depth=1 - use: [network1] - settings: - name: ns2 - node_count_dynamic_max: 20 - bandwidth_tier: gvnic_enabled - service_account_scopes: - - "https://www.googleapis.com/auth/monitoring.write" - - "https://www.googleapis.com/auth/logging.write" - - "https://www.googleapis.com/auth/devstorage.read_only" - - "https://www.googleapis.com/auth/cloud-platform" - - - id: compute_partition - source: github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/compute/schedmd-slurm-gcp-v6-partition?ref=v1.33.0&depth=1 - use: [compute_nodeset] - settings: - partition_name: compute - - - id: slurm_login - source: github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scheduler/schedmd-slurm-gcp-v6-login?ref=v1.33.0&depth=1 - use: [network1] - settings: - name_prefix: login - machine_type: n2-standard-4 - enable_login_public_ips: true - tags: $(vars.tags) - service_account_scopes: - - "https://www.googleapis.com/auth/monitoring.write" - - "https://www.googleapis.com/auth/logging.write" - - "https://www.googleapis.com/auth/devstorage.read_only" - - "https://www.googleapis.com/auth/cloud-platform" - - - id: slurm_controller - source: github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scheduler/schedmd-slurm-gcp-v6-controller?ref=v1.33.0&depth=1 - use: - - network1 - - debug_partition - - compute_partition - - slurm_login - - homefs - - daos-client-script - settings: - enable_controller_public_ips: true - compute_startup_script: $(daos-client-script.startup_script) - controller_startup_script: $(daos-client-script.startup_script) - login_startup_script: $(daos-client-script.startup_script) - compute_startup_scripts_timeout: 1000 - controller_startup_scripts_timeout: 1000 - login_startup_scripts_timeout: 1000 - tags: $(vars.tags) diff --git a/community/examples/intel/pfs-daos.yaml b/community/examples/intel/pfs-daos.yaml deleted file mode 100644 index 3abf5c9778..0000000000 --- a/community/examples/intel/pfs-daos.yaml +++ /dev/null @@ -1,109 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- - -blueprint_name: pfs-daos - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: pfs-daos - region: us-central1 - zone: us-central1-c - daos_server_image_family: daos-server-hpc-rocky-8 - daos_client_image_family: daos-client-hpc-rocky-8 - daos_version: "2.4" - tags: [] - -# Note: this blueprint assumes the existence of a default global network and -# subnetwork in the region chosen above - -deployment_groups: -- group: primary - modules: - - id: network1 - source: modules/network/pre-existing-vpc - -- group: daos-server-image - modules: - # more info: https://github.com/daos-stack/google-cloud-daos/tree/main/images - - id: daos-server-image - source: "github.com/daos-stack/google-cloud-daos//images?ref=v0.5.0&depth=1" - kind: packer - settings: - daos_version: $(vars.daos_version) - daos_repo_base_url: https://packages.daos.io - daos_packages_repo_file: EL8/packages/x86_64/daos_packages.repo - use_iap: true - enable_oslogin: false - machine_type: n2-standard-32 - source_image_family: hpc-rocky-linux-8 - source_image_project_id: cloud-hpc-image-public - image_guest_os_features: ["GVNIC"] - disk_size: "20" - state_timeout: "10m" - scopes: ["https://www.googleapis.com/auth/cloud-platform"] - use_internal_ip: true - omit_external_ip: false - daos_install_type: server - image_family: $(vars.daos_server_image_family) - -- group: daos-client-image - modules: - # more info: https://github.com/daos-stack/google-cloud-daos/tree/v0.5.0/images - - id: daos-client-image - source: "github.com/daos-stack/google-cloud-daos//images?ref=v0.5.0&depth=1" - kind: packer - settings: - daos_version: $(vars.daos_version) - daos_repo_base_url: https://packages.daos.io - daos_packages_repo_file: EL8/packages/x86_64/daos_packages.repo - use_iap: true - enable_oslogin: false - machine_type: n2-standard-32 - source_image_family: hpc-rocky-linux-8 - source_image_project_id: cloud-hpc-image-public - image_guest_os_features: ["GVNIC"] - disk_size: "20" - state_timeout: "10m" - scopes: ["https://www.googleapis.com/auth/cloud-platform"] - use_internal_ip: true - omit_external_ip: false - daos_install_type: client - image_family: $(vars.daos_client_image_family) - -- group: daos-cluster - modules: - # more info: https://github.com/daos-stack/google-cloud-daos/tree/develop/terraform/modules/daos_server - - id: daos-server - # source: $(vars.daos_server_module_source_url) - source: "github.com/daos-stack/google-cloud-daos//terraform/modules/daos_server?ref=v0.5.0&depth=1" - use: [network1] - settings: - number_of_instances: 2 - labels: {ghpc_role: file-system} - os_family: $(vars.daos_server_image_family) - daos_scm_size: "172" - tags: $(vars.tags) - - # more info: https://github.com/daos-stack/google-cloud-daos/tree/develop/terraform/modules/daos_client - - id: daos-client - # source: $(vars.daos_client_module_source_url) - source: "github.com/daos-stack/google-cloud-daos//terraform/modules/daos_client?ref=v0.5.0&depth=1" - use: [network1, daos-server] - settings: - number_of_instances: 2 - labels: {ghpc_role: compute} - os_family: $(vars.daos_client_image_family) - tags: $(vars.tags) diff --git a/community/examples/tutorial-starccm-slurm.yaml b/community/examples/tutorial-starccm-slurm.yaml index 9e64014ea7..ebf52861ff 100644 --- a/community/examples/tutorial-starccm-slurm.yaml +++ b/community/examples/tutorial-starccm-slurm.yaml @@ -15,8 +15,6 @@ --- blueprint_name: starccm-on-slurm -toolkit_modules_url: github.com/GoogleCloudPlatform/cluster-toolkit -toolkit_modules_version: v1.41.0 vars: 
project_id: ## Set GCP Project ID Here ## diff --git a/community/modules/compute/htcondor-execute-point/README.md b/community/modules/compute/htcondor-execute-point/README.md index 5767904901..c7068a4522 100644 --- a/community/modules/compute/htcondor-execute-point/README.md +++ b/community/modules/compute/htcondor-execute-point/README.md @@ -210,9 +210,9 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| -| [execute\_point\_instance\_template](#module\_execute\_point\_instance\_template) | terraform-google-modules/vm/google//modules/instance_template | 10.1.1 | -| [mig](#module\_mig) | terraform-google-modules/vm/google//modules/mig | 10.1.1 | -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.39.0&depth=1 | +| [execute\_point\_instance\_template](#module\_execute\_point\_instance\_template) | terraform-google-modules/vm/google//modules/instance_template | ~> 12.1 | +| [mig](#module\_mig) | terraform-google-modules/vm/google//modules/mig | ~> 12.1 | +| [startup\_script](#module\_startup\_script) | ../../../../modules/scripts/startup-script | n/a | ## Resources diff --git a/community/modules/compute/htcondor-execute-point/gpu_definition.tf b/community/modules/compute/htcondor-execute-point/gpu_definition.tf index c6c3944332..6c5d96d286 100644 --- a/community/modules/compute/htcondor-execute-point/gpu_definition.tf +++ b/community/modules/compute/htcondor-execute-point/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) # Select in priority order: # (1) var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [{ count = 0, type = "" }]) + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) } diff --git a/community/modules/compute/htcondor-execute-point/main.tf b/community/modules/compute/htcondor-execute-point/main.tf index c4cb0589c7..0d8171092a 100644 --- a/community/modules/compute/htcondor-execute-point/main.tf +++ b/community/modules/compute/htcondor-execute-point/main.tf @@ -120,7 +120,7 @@ resource "google_storage_bucket_object" "execute_config" { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.39.0&depth=1" + source = "../../../../modules/scripts/startup-script" project_id = var.project_id region = var.region @@ -132,7 +132,7 @@ module "startup_script" { module "execute_point_instance_template" { source = "terraform-google-modules/vm/google//modules/instance_template" - version = "10.1.1" + version = "~> 12.1" name_prefix = local.name_prefix project_id = var.project_id @@ -160,7 +160,7 @@ module "execute_point_instance_template" { module "mig" { source = "terraform-google-modules/vm/google//modules/mig" - version = "10.1.1" + version = "~> 12.1" project_id = var.project_id region = var.region diff --git a/community/modules/compute/htcondor-execute-point/versions.tf b/community/modules/compute/htcondor-execute-point/versions.tf index 3f320827a1..db3d320aad 100644 --- 
a/community/modules/compute/htcondor-execute-point/versions.tf +++ b/community/modules/compute/htcondor-execute-point/versions.tf @@ -25,6 +25,6 @@ terraform { } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.42.0" } } diff --git a/community/modules/compute/mig/versions.tf b/community/modules/compute/mig/versions.tf index 8e5b3caa45..4c0dd383c6 100644 --- a/community/modules/compute/mig/versions.tf +++ b/community/modules/compute/mig/versions.tf @@ -22,6 +22,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:mig/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:mig/v1.42.0" } } diff --git a/community/modules/compute/notebook/README.md b/community/modules/compute/notebook/README.md index 26b726418f..c173b155bb 100644 --- a/community/modules/compute/notebook/README.md +++ b/community/modules/compute/notebook/README.md @@ -66,8 +66,8 @@ No modules. | Name | Type | |------|------| -| [google_notebooks_instance.instance](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/notebooks_instance) | resource | | [google_storage_bucket_object.mount_script](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | +| [google_workbench_instance.instance](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/workbench_instance) | resource | | [random_id.resource_name_suffix](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/id) | resource | ## Inputs diff --git a/community/modules/compute/notebook/main.tf b/community/modules/compute/notebook/main.tf index 3652667ffb..43b29c8474 100644 --- a/community/modules/compute/notebook/main.tf +++ b/community/modules/compute/notebook/main.tf @@ -20,8 +20,9 @@ locals { } locals { - suffix = random_id.resource_name_suffix.hex - name = "${var.deployment_name}-notebook-${local.suffix}" + suffix = random_id.resource_name_suffix.hex + #name = "thenotebook" + name = "notebook-${var.deployment_name}-${local.suffix}" bucket = replace(var.gcs_bucket_path, "gs://", "") post_script_filename = "mount-${local.suffix}.sh" @@ -54,15 +55,19 @@ resource "google_storage_bucket_object" "mount_script" { bucket = local.bucket } -resource "google_notebooks_instance" "instance" { - name = local.name - location = var.zone - machine_type = var.machine_type - project = var.project_id - post_startup_script = "${var.gcs_bucket_path}/${google_storage_bucket_object.mount_script.name}" - labels = local.labels - vm_image { - project = var.instance_image.project - image_family = var.instance_image.family +resource "google_workbench_instance" "instance" { + name = local.name + location = var.zone + project = var.project_id + labels = local.labels + gce_setup { + machine_type = var.machine_type + metadata = { + post-startup-script = "${var.gcs_bucket_path}/${google_storage_bucket_object.mount_script.name}" + } + vm_image { + project = var.instance_image.project + family = var.instance_image.family + } } } diff --git a/community/modules/compute/notebook/variables.tf b/community/modules/compute/notebook/variables.tf index 5a2c803f30..21eb9518bd 100644 --- a/community/modules/compute/notebook/variables.tf +++ b/community/modules/compute/notebook/variables.tf @@ -22,6 +22,11 @@ variable "project_id" { variable "deployment_name" { description = "Name of the HPC deployment; used as 
part of name of the notebook." type = string + # notebook name can have: lowercase letters, numbers, or hyphens (-) and cannot end with a hyphen + validation { + error_message = "The notebook name uses 'deployment_name' -- can only have: lowercase letters, numbers, or hyphens" + condition = can(regex("^[a-z0-9]+(?:-[a-z0-9]+)*$", var.deployment_name)) + } } variable "zone" { diff --git a/community/modules/compute/pbspro-execution/README.md b/community/modules/compute/pbspro-execution/README.md index 757ccd1f48..4fa927cbee 100644 --- a/community/modules/compute/pbspro-execution/README.md +++ b/community/modules/compute/pbspro-execution/README.md @@ -74,9 +74,9 @@ No providers. | Name | Source | Version | |------|--------|---------| -| [execution\_startup\_script](#module\_execution\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.39.0&depth=1 | -| [pbs\_execution](#module\_pbs\_execution) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | 09ae2725 | -| [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.39.0&depth=1 | +| [execution\_startup\_script](#module\_execution\_startup\_script) | ../../../../modules/scripts/startup-script | n/a | +| [pbs\_execution](#module\_pbs\_execution) | ../../../../modules/compute/vm-instance | n/a | +| [pbs\_install](#module\_pbs\_install) | ../../scripts/pbspro-install | n/a | ## Resources diff --git a/community/modules/compute/pbspro-execution/main.tf b/community/modules/compute/pbspro-execution/main.tf index f66148e829..a87bf0863e 100644 --- a/community/modules/compute/pbspro-execution/main.tf +++ b/community/modules/compute/pbspro-execution/main.tf @@ -42,7 +42,7 @@ locals { } module "pbs_install" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install?ref=v1.39.0&depth=1" + source = "../../scripts/pbspro-install" pbs_exec = var.pbs_exec pbs_home = var.pbs_home @@ -53,7 +53,7 @@ module "pbs_install" { } module "execution_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.39.0&depth=1" + source = "../../../../modules/scripts/startup-script" deployment_name = var.deployment_name project_id = var.project_id @@ -68,7 +68,7 @@ module "execution_startup_script" { } module "pbs_execution" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=09ae2725" + source = "../../../../modules/compute/vm-instance" instance_count = var.instance_count spot = var.spot diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/gpu_definition.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/gpu_definition.tf index c6c3944332..6c5d96d286 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/gpu_definition.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) # Select in priority order: # (1) var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, 
local.generated_guest_accelerator), [{ count = 0, type = "" }]) + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) } diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf index 51f49882a1..f1c7fedf63 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-node-group/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-node-group/v1.42.0" } required_version = ">= 1.1" } diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md index cecea973e1..36cd855d67 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md @@ -74,7 +74,7 @@ No providers. | Name | Source | Version | |------|--------|---------| -| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 5.12.0 | +| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 5.12.2 | ## Resources @@ -93,7 +93,7 @@ No resources. | [project\_id](#input\_project\_id) | Project in which the HPC deployment will be created. | `string` | n/a | yes | | [region](#input\_region) | The default region for Cloud resources. | `string` | n/a | yes | | [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | Cluster name, used for resource naming and slurm accounting. If not provided it will default to the first 8 characters of the deployment name (removing any invalid characters). | `string` | `null` | no | -| [subnetwork\_project](#input\_subnetwork\_project) | The project the subnetwork belongs to. | `string` | `""` | no | +| [subnetwork\_project](#input\_subnetwork\_project) | The project the subnetwork belongs to. | `string` | `null` | no | | [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | Subnet to deploy to. 
| `string` | `null` | no | ## Outputs diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf index ed9365c4e6..38fd95b761 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf @@ -29,7 +29,7 @@ locals { } module "slurm_partition" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=5.12.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=5.12.2" slurm_cluster_name = local.slurm_cluster_name enable_job_exclusive = var.exclusive diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/variables.tf index d0c1ba162d..653862e030 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/variables.tf @@ -75,7 +75,7 @@ variable "subnetwork_self_link" { variable "subnetwork_project" { description = "The project the subnetwork belongs to." type = string - default = "" + default = null } variable "exclusive" { diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md index f9fcc59bed..552f50c50e 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md @@ -151,7 +151,7 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| -| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 5.12.0 | +| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 5.12.2 | ## Resources @@ -178,7 +178,7 @@ limitations under the License. | [region](#input\_region) | The default region for Cloud resources. | `string` | n/a | yes | | [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | Cluster name, used for resource naming and slurm accounting. If not provided it will default to the first 8 characters of the deployment name (removing any invalid characters). | `string` | `null` | no | | [startup\_script](#input\_startup\_script) | Startup script that will be used by the partition VMs. | `string` | `""` | no | -| [subnetwork\_project](#input\_subnetwork\_project) | The project the subnetwork belongs to. | `string` | `""` | no | +| [subnetwork\_project](#input\_subnetwork\_project) | The project the subnetwork belongs to. | `string` | `null` | no | | [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | Subnet to deploy to. | `string` | `null` | no | | [zone](#input\_zone) | Zone in which to create compute VMs. Additional zones in the same region can be specified in var.zones. | `string` | n/a | yes | | [zone\_target\_shape](#input\_zone\_target\_shape) | Strategy for distributing VMs across zones in a region.
ANY
GCE picks zones for creating VM instances to fulfill the requested number of VMs
within present resource constraints and to maximize utilization of unused zonal
reservations.
ANY\_SINGLE\_ZONE (default)
GCE always selects a single zone for all the VMs, optimizing for resource quotas,
available reservations and general capacity.
BALANCED
GCE prioritizes acquisition of resources, scheduling VMs in zones where resources
are available while distributing VMs as evenly as possible across allowed zones
to minimize the impact of zonal failure. | `string` | `"ANY_SINGLE_ZONE"` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf index 52e9d0a7d2..2bf5bb7b30 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf @@ -40,7 +40,7 @@ data "google_compute_zones" "available" { } module "slurm_partition" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=5.12.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=5.12.2" slurm_cluster_name = local.slurm_cluster_name partition_nodes = var.node_groups diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf index 45e87da037..1ca9b96eaa 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf @@ -137,7 +137,7 @@ variable "subnetwork_self_link" { variable "subnetwork_project" { description = "The project the subnetwork belongs to." type = string - default = "" + default = null } variable "exclusive" { diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf index 4f00828f19..e26e07735f 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-partition/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-partition/v1.42.0" } required_version = ">= 0.13.0" } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md index d251dff2af..1ee007d0a1 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md @@ -74,7 +74,7 @@ modules. For support with the underlying modules, see the instructions in the | Name | Source | Version | |------|--------|---------| -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.2 | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.5 | ## Resources @@ -87,7 +87,7 @@ modules. For support with the underlying modules, see the instructions in the | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [access\_config](#input\_access\_config) | Access configurations, i.e. IPs via which the VM instance can be accessed via the Internet. |
list(object({
nat_ip = string
network_tier = string
}))
| `[]` | no | -| [additional\_disks](#input\_additional\_disks) | Configurations of additional disks to be included on the partition nodes. (do not use "disk\_type: local-ssd"; known issue being addressed) |
list(object({
disk_name = string
device_name = string
disk_size_gb = number
disk_type = string
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
| `[]` | no | +| [additional\_disks](#input\_additional\_disks) | Configurations of additional disks to be included on the partition nodes. |
list(object({
disk_name = string
device_name = string
disk_size_gb = number
disk_type = string
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
| `[]` | no | | [additional\_networks](#input\_additional\_networks) | Additional network interface details for GCE, if any. |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | | [allow\_automatic\_updates](#input\_allow\_automatic\_updates) | If false, disables automatic system package updates on the created instances. This feature is
only available on supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `true` | no | | [bandwidth\_tier](#input\_bandwidth\_tier) | Configures the network interface card and the maximum egress bandwidth for VMs.
- Setting `platform_default` respects the Google Cloud Platform API default values for networking.
- Setting `virtio_enabled` explicitly selects the VirtioNet network adapter.
- Setting `gvnic_enabled` selects the gVNIC network adapter (without Tier 1 high bandwidth).
- Setting `tier_1_enabled` selects both the gVNIC adapter and Tier 1 high bandwidth networking.
- Note: both gVNIC and Tier 1 networking require a VM image with gVNIC support as well as specific VM families and shapes.
- See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"platform_default"` | no | @@ -104,7 +104,7 @@ modules. For support with the underlying modules, see the instructions in the | [enable\_spot\_vm](#input\_enable\_spot\_vm) | Enable the partition to use spot VMs (https://cloud.google.com/spot-vms). | `bool` | `false` | no | | [feature](#input\_feature) | The node feature, used to bind nodes to the nodeset. If not set, the nodeset name will be used. | `string` | `null` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-7-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-8-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [labels](#input\_labels) | Labels to add to partition compute instances. Key-value pairs. | `map(string)` | `{}` | no | | [machine\_type](#input\_machine\_type) | Compute Platform machine type to use for this partition compute nodes. | `string` | `"c2-standard-60"` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf index c6c3944332..6c5d96d286 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) # Select in priority order: # (1) var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [{ count = 0, type = "" }]) + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf index 7ca868a049..bab7de7eaa 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf @@ -56,7 +56,7 @@ locals { } module "slurm_nodeset_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.2" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.5" project_id = var.project_id region = var.region diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/source_image_logic.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/source_image_logic.tf index a86c28ffc2..a4a2579989 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/source_image_logic.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/source_image_logic.tf @@ -18,10 +18,10 @@ locals { # Currently supported images and projects known_project_families = { schedmd-slurm-public = [ - "slurm-gcp-6-7-debian-11", - "slurm-gcp-6-7-hpc-rocky-linux-8", - "slurm-gcp-6-7-ubuntu-2004-lts", - "slurm-gcp-6-7-ubuntu-2204-lts-arm64" + "slurm-gcp-6-8-debian-11", + "slurm-gcp-6-8-hpc-rocky-linux-8", + "slurm-gcp-6-8-ubuntu-2004-lts", + "slurm-gcp-6-8-ubuntu-2204-lts-arm64" ] } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf index 5d5f71c9c0..9be7e48dbb 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf @@ -68,7 +68,7 @@ variable "instance_image" { EOD type = map(string) default = { - family = "slurm-gcp-6-7-hpc-rocky-linux-8" + family = "slurm-gcp-6-8-hpc-rocky-linux-8" project = "schedmd-slurm-public" 
} @@ -142,7 +142,7 @@ variable "disk_labels" { } variable "additional_disks" { - description = "Configurations of additional disks to be included on the partition nodes. (do not use \"disk_type: local-ssd\"; known issue being addressed)" + description = "Configurations of additional disks to be included on the partition nodes." type = list(object({ disk_name = string device_name = string diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf index 9e7273093a..ed469721ae 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-dynamic/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-dynamic/v1.42.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md index f0fb08ee1d..024931b2d8 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md @@ -59,7 +59,7 @@ No resources. | [accelerator\_config](#input\_accelerator\_config) | Nodeset accelerator config, see https://cloud.google.com/tpu/docs/supported-tpu-configurations for details. |
object({
topology = string
version = string
})
|
{
"topology": "",
"version": ""
}
| no | | [data\_disks](#input\_data\_disks) | The data disks to include in the TPU node | `list(string)` | `[]` | no | | [disable\_public\_ips](#input\_disable\_public\_ips) | DEPRECATED: Use `enable_public_ips` instead. | `bool` | `null` | no | -| [docker\_image](#input\_docker\_image) | The gcp container registry id docker image to use in the TPU vms, it defaults to gcr.io/schedmd-slurm-public/tpu:slurm-gcp-6-7-tf- | `string` | `null` | no | +| [docker\_image](#input\_docker\_image) | The gcp container registry id docker image to use in the TPU vms, it defaults to gcr.io/schedmd-slurm-public/tpu:slurm-gcp-6-8-tf- | `string` | `null` | no | | [enable\_public\_ips](#input\_enable\_public\_ips) | If set to true. The node group VMs will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `false` | no | | [name](#input\_name) | Name of the nodeset. Automatically populated by the module id if not set.
If setting manually, ensure a unique value across all nodesets. | `string` | n/a | yes | | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on nodes. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
}))
| `[]` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf index 3302e0ea4c..0831588b83 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf @@ -112,7 +112,7 @@ variable "data_disks" { } variable "docker_image" { - description = "The gcp container registry id docker image to use in the TPU vms, it defaults to gcr.io/schedmd-slurm-public/tpu:slurm-gcp-6-7-tf-" + description = "The gcp container registry id docker image to use in the TPU vms, it defaults to gcr.io/schedmd-slurm-public/tpu:slurm-gcp-6-8-tf-" type = string default = null } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf index f519a18161..52d7873c81 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf @@ -18,6 +18,6 @@ terraform { required_version = ">= 1.3" provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-tpu/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-tpu/v1.42.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md index 115ac451e7..8ad8c304f0 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md @@ -159,7 +159,7 @@ No modules. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [access\_config](#input\_access\_config) | Access configurations, i.e. IPs via which the VM instance can be accessed via the Internet. |
list(object({
nat_ip = string
network_tier = string
}))
| `[]` | no | -| [additional\_disks](#input\_additional\_disks) | Configurations of additional disks to be included on the partition nodes. (do not use "disk\_type: local-ssd"; known issue being addressed) |
list(object({
disk_name = string
device_name = string
disk_size_gb = number
disk_type = string
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
| `[]` | no | +| [additional\_disks](#input\_additional\_disks) | Configurations of additional disks to be included on the partition nodes. |
list(object({
disk_name = string
device_name = string
disk_size_gb = number
disk_type = string
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
| `[]` | no | | [additional\_networks](#input\_additional\_networks) | Additional network interface details for GCE, if any. |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | | [allow\_automatic\_updates](#input\_allow\_automatic\_updates) | If false, disables automatic system package updates on the created instances. This feature is
only available on supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `true` | no | | [bandwidth\_tier](#input\_bandwidth\_tier) | Configures the network interface card and the maximum egress bandwidth for VMs.
- Setting `platform_default` respects the Google Cloud Platform API default values for networking.
- Setting `virtio_enabled` explicitly selects the VirtioNet network adapter.
- Setting `gvnic_enabled` selects the gVNIC network adapter (without Tier 1 high bandwidth).
- Setting `tier_1_enabled` selects both the gVNIC adapter and Tier 1 high bandwidth networking.
- Note: both gVNIC and Tier 1 networking require a VM image with gVNIC support as well as specific VM families and shapes.
- See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"platform_default"` | no | @@ -169,9 +169,10 @@ No modules. | [disk\_labels](#input\_disk\_labels) | Labels specific to the boot disk. These will be merged with var.labels. | `map(string)` | `{}` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Size of boot disk to create for the partition compute nodes. | `number` | `50` | no | | [disk\_type](#input\_disk\_type) | Boot disk type, can be either hyperdisk-balanced, pd-ssd, pd-standard, pd-balanced, or pd-extreme. | `string` | `"pd-standard"` | no | -| [dws\_flex](#input\_dws\_flex) | If set and `enabled = true`, will utilize the DWS Flex Start to provision nodes.
See: https://cloud.google.com/blog/products/compute/introducing-dynamic-workload-scheduler
Options:
- enable: Enable DWS Flex Start
- max\_run\_duration: Maximum duration in seconds for the job to run, should not exceed 1,209,600 (2 weeks).

Limitations:
- CAN NOT be used with reservations;
- CAN NOT be used with placement groups; |
object({
enabled = optional(bool, true)
max_run_duration = optional(number, 1209600) # 2 weeks
})
|
{
"enabled": false
}
| no | +| [dws\_flex](#input\_dws\_flex) | If set and `enabled = true`, will utilize the DWS Flex Start to provision nodes.
See: https://cloud.google.com/blog/products/compute/introducing-dynamic-workload-scheduler
Options:
- enable: Enable DWS Flex Start
- max\_run\_duration: Maximum duration in seconds for the job to run, should not exceed 1,209,600 (2 weeks).
- use\_job\_duration: Use the job duration to determine the max\_run\_duration; if the job duration is not set, max\_run\_duration will be used.

Limitations:
- CAN NOT be used with reservations;
- CAN NOT be used with placement groups;
- If `use_job_duration` is enabled, the nodeset can be used in "exclusive" partitions only |
object({
enabled = optional(bool, true)
max_run_duration = optional(number, 1209600) # 2 weeks
use_job_duration = optional(bool, false)
})
|
{
"enabled": false
}
| no | | [enable\_confidential\_vm](#input\_enable\_confidential\_vm) | Enable the Confidential VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [enable\_maintenance\_reservation](#input\_enable\_maintenance\_reservation) | Enables slurm reservation for scheduled maintenance. | `bool` | `false` | no | +| [enable\_opportunistic\_maintenance](#input\_enable\_opportunistic\_maintenance) | On receiving maintenance notification, maintenance will be performed as soon as nodes become idle. | `bool` | `false` | no | | [enable\_oslogin](#input\_enable\_oslogin) | Enables Google Cloud os-login for user login and authentication for VMs.
See https://cloud.google.com/compute/docs/oslogin | `bool` | `true` | no | | [enable\_placement](#input\_enable\_placement) | Enable placement groups. | `bool` | `true` | no | | [enable\_public\_ips](#input\_enable\_public\_ips) | If set to true. The node group VMs will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `false` | no | @@ -179,7 +180,7 @@ No modules. | [enable\_smt](#input\_enable\_smt) | Enables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `false` | no | | [enable\_spot\_vm](#input\_enable\_spot\_vm) | Enable the partition to use spot VMs (https://cloud.google.com/spot-vms). | `bool` | `false` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-7-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-8-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [instance\_properties](#input\_instance\_properties) | Override the instance properties. Used to test features not supported by Slurm GCP,
recommended for advanced usage only.
See https://cloud.google.com/compute/docs/reference/rest/v1/regionInstances/bulkInsert
If any sub-field (e.g. scheduling) is set, it will override the values computed by
SlurmGCP and ignoring values of provided vars. | `any` | `null` | no | | [instance\_template](#input\_instance\_template) | DEPRECATED: Instance template can not be specified for compute nodes. | `string` | `null` | no | @@ -197,7 +198,7 @@ No modules. | [preemptible](#input\_preemptible) | Should use preemptibles to burst. | `bool` | `false` | no | | [project\_id](#input\_project\_id) | Project ID to create resources in. | `string` | n/a | yes | | [region](#input\_region) | The default region for Cloud resources. | `string` | n/a | yes | -| [reservation\_name](#input\_reservation\_name) | Name of the reservation to use for VM resources, should be in one of the following formats:
- projects/PROJECT\_ID/reservations/RESERVATION\_NAME
- RESERVATION\_NAME

Must be a "SPECIFIC" reservation
Set to empty string if using no reservation or automatically-consumed reservations | `string` | `""` | no | +| [reservation\_name](#input\_reservation\_name) | Name of the reservation to use for VM resources, should be in one of the following formats:
- projects/PROJECT\_ID/reservations/RESERVATION\_NAME[/SUFF/IX]
- RESERVATION\_NAME[/SUFF/IX]

Must be a "SPECIFIC" reservation
Set to empty string if using no reservation or automatically-consumed reservations | `string` | `""` | no | | [service\_account](#input\_service\_account) | DEPRECATED: Use `service_account_email` and `service_account_scopes` instead. |
object({
email = string
scopes = set(string)
})
| `null` | no | | [service\_account\_email](#input\_service\_account\_email) | Service account e-mail address to attach to the compute instances. | `string` | `null` | no | | [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to attach to the compute instances. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf index c6c3944332..6c5d96d286 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) # Select in priority order: # (1) var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [{ count = 0, type = "" }]) + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf index 217328277b..3f283ffade 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf @@ -105,7 +105,8 @@ locals { startup_script = local.ghpc_startup_script network_storage = var.network_storage - enable_maintenance_reservation = var.enable_maintenance_reservation + enable_maintenance_reservation = var.enable_maintenance_reservation + enable_opportunistic_maintenance = var.enable_opportunistic_maintenance } } @@ -130,26 +131,22 @@ data "google_compute_zones" "available" { } locals { - res_name_split = split("/", var.reservation_name) - reservation = var.reservation_name == "" ? null : ( - length(local.res_name_split) == 4 ? { - project : local.res_name_split[1], - name : local.res_name_split[3] - } : { - project : var.project_id, - name : var.reservation_name - } - ) + res_match = regex("^(?P<whole>(?P<prefix>projects/(?P<project>[a-z0-9-]+)/reservations/)?(?P<name>[a-z0-9-]+)(?P<suffix>/[a-z0-9-]+/[a-z0-9-]+)?)?$", var.reservation_name) + + res_short_name = local.res_match.name + res_project = coalesce(local.res_match.project, var.project_id) + res_prefix = coalesce(local.res_match.prefix, "projects/${local.res_project}/reservations/") + res_suffix = local.res_match.suffix == null ? "" : local.res_match.suffix - reservation_name = local.reservation == null ? "" : "projects/${local.reservation.project}/reservations/${local.reservation.name}" + reservation_name = local.res_match.whole == null ? "" : "${local.res_prefix}${local.res_short_name}${local.res_suffix}" } # tflint-ignore: terraform_unused_declarations data "google_compute_reservation" "reservation" { - count = local.reservation != null ? 1 : 0 + count = length(local.reservation_name) > 0 ?
1 : 0 - name = local.reservation.name - project = local.reservation.project + name = local.res_short_name + project = local.res_project zone = var.zone lifecycle { diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf index 671d542584..b957db13c1 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf @@ -44,6 +44,7 @@ output "nodeset" { condition = !var.enable_placement || var.node_count_static == 0 || var.node_count_dynamic_max == 0 error_message = "Cannot use placement with static and auto-scaling nodes in the same node set." } + precondition { condition = var.reservation_name == "" || !var.dws_flex.enabled error_message = "Cannot use reservations with DWS Flex." diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf index a86c28ffc2..a4a2579989 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf @@ -18,10 +18,10 @@ locals { # Currently supported images and projects known_project_families = { schedmd-slurm-public = [ - "slurm-gcp-6-7-debian-11", - "slurm-gcp-6-7-hpc-rocky-linux-8", - "slurm-gcp-6-7-ubuntu-2004-lts", - "slurm-gcp-6-7-ubuntu-2204-lts-arm64" + "slurm-gcp-6-8-debian-11", + "slurm-gcp-6-8-hpc-rocky-linux-8", + "slurm-gcp-6-8-ubuntu-2004-lts", + "slurm-gcp-6-8-ubuntu-2204-lts-arm64" ] } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf index 536659f136..c35faad4e9 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf @@ -88,7 +88,7 @@ variable "instance_image" { EOD type = map(string) default = { - family = "slurm-gcp-6-7-hpc-rocky-linux-8" + family = "slurm-gcp-6-8-hpc-rocky-linux-8" project = "schedmd-slurm-public" } @@ -161,7 +161,7 @@ variable "disk_labels" { } variable "additional_disks" { - description = "Configurations of additional disks to be included on the partition nodes. (do not use \"disk_type: local-ssd\"; known issue being addressed)" + description = "Configurations of additional disks to be included on the partition nodes." type = list(object({ disk_name = string device_name = string @@ -447,8 +447,8 @@ variable "access_config" { variable "reservation_name" { description = <<-EOD Name of the reservation to use for VM resources, should be in one of the following formats: - - projects/PROJECT_ID/reservations/RESERVATION_NAME - - RESERVATION_NAME + - projects/PROJECT_ID/reservations/RESERVATION_NAME[/SUFF/IX] + - RESERVATION_NAME[/SUFF/IX] Must be a "SPECIFIC" reservation Set to empty string if using no reservation or automatically-consumed reservations @@ -458,8 +458,8 @@ variable "reservation_name" { nullable = false validation { - condition = var.reservation_name == "" || length(regexall("^projects/[a-z0-9-]+/reservations/[a-z0-9-]+$", var.reservation_name)) > 0 || length(regexall("^[a-z0-9-]+$", var.reservation_name)) > 0 - error_message = "Reservation name must be in the format 'projects/PROJECT_ID/reservations/RESERVATION_NAME' or 'RESERVATION_NAME'." 
+ condition = length(regexall("^((projects/([a-z0-9-]+)/reservations/)?([a-z0-9-]+)(/[a-z0-9-]+/[a-z0-9-]+)?)?$", var.reservation_name)) > 0 + error_message = "Reservation name must be either empty or in the format '[projects/PROJECT_ID/reservations/]RESERVATION_NAME[/SUFF/IX]', [...] are optional parts." } } @@ -513,6 +513,14 @@ variable "enable_maintenance_reservation" { default = false } + +variable "enable_opportunistic_maintenance" { + type = bool + description = "On receiving maintenance notification, maintenance will be performed as soon as nodes become idle." + default = false +} + + variable "dws_flex" { description = <<-EOD If set and `enabled = true`, will utilize the DWS Flex Start to provision nodes. @@ -520,16 +528,19 @@ variable "dws_flex" { Options: - enable: Enable DWS Flex Start - max_run_duration: Maximum duration in seconds for the job to run, should not exceed 1,209,600 (2 weeks). - + - use_job_duration: Use the job duration to determine the max_run_duration; if the job duration is not set, max_run_duration will be used. + Limitations: - CAN NOT be used with reservations; - CAN NOT be used with placement groups; + - If `use_job_duration` is enabled, the nodeset can be used in "exclusive" partitions only EOD type = object({ enabled = optional(bool, true) max_run_duration = optional(number, 1209600) # 2 weeks + use_job_duration = optional(bool, false) }) default = { enabled = false diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf index 242244c5f7..05edfba039 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset/v1.42.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-partition/main.tf index e877d0865b..4d2e0eead4 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/main.tf @@ -15,6 +15,7 @@ locals { non_static_ns_with_placement = [for ns in var.nodeset : ns.nodeset_name if ns.enable_placement && ns.node_count_static == 0] use_static = [for ns in concat(var.nodeset, var.nodeset_tpu) : ns.nodeset_name if ns.node_count_static > 0] + uses_job_duration = length([for ns in var.nodeset : ns.dws_flex.use_job_duration if ns.dws_flex.use_job_duration]) > 0 ? true : false has_node = length(var.nodeset) > 0 has_dyn = length(var.nodeset_dyn) > 0 diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/outputs.tf b/community/modules/compute/schedmd-slurm-gcp-v6-partition/outputs.tf index e75c6293f1..4a06593b32 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/outputs.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/outputs.tf @@ -37,6 +37,11 @@ output "partitions" { condition = sum([for b in [local.has_node, local.has_dyn, local.has_tpu] : b ? 1 : 0]) == 1 error_message = "Partition must contain exactly one type of nodeset."
} + + precondition { + condition = !local.uses_job_duration || var.exclusive + error_message = "`use_job_duration` can only be used in exclusive partitions" + } } output "nodeset" { diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf index 17489d3f93..d5e2163add 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf @@ -18,6 +18,6 @@ terraform { required_version = ">= 1.3" provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-partition/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-partition/v1.42.0" } } diff --git a/community/modules/database/slurm-cloudsql-federation/README.md b/community/modules/database/slurm-cloudsql-federation/README.md index d15ddea672..178d05d476 100644 --- a/community/modules/database/slurm-cloudsql-federation/README.md +++ b/community/modules/database/slurm-cloudsql-federation/README.md @@ -63,6 +63,8 @@ No modules. | Name | Type | |------|------| | [google_bigquery_connection.connection](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/bigquery_connection) | resource | +| [google_compute_address.psc](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_address) | resource | +| [google_compute_forwarding_rule.psc_consumer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_forwarding_rule) | resource | | [google_sql_database.database](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/sql_database) | resource | | [google_sql_database_instance.instance](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/sql_database_instance) | resource | | [google_sql_user.users](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/sql_user) | resource | @@ -74,9 +76,14 @@ No modules. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [authorized\_networks](#input\_authorized\_networks) | IP address ranges as authorized networks of the Cloud SQL for MySQL instances | `list(string)` | `[]` | no | +| [data\_cache\_enabled](#input\_data\_cache\_enabled) | Whether data cache is enabled for the instance. Can be used with ENTERPRISE\_PLUS edition. | `bool` | `false` | no | | [database\_version](#input\_database\_version) | The version of the database to be created. | `string` | `"MYSQL_5_7"` | no | | [deletion\_protection](#input\_deletion\_protection) | Whether or not to allow Terraform to destroy the instance. | `string` | `false` | no | | [deployment\_name](#input\_deployment\_name) | The name of the current deployment | `string` | n/a | yes | +| [disk\_autoresize](#input\_disk\_autoresize) | Set to false to disable automatic disk growth. | `bool` | `true` | no | +| [disk\_size\_gb](#input\_disk\_size\_gb) | Size of the database disk in GiB. | `number` | `null` | no | +| [edition](#input\_edition) | Edition of the Cloud SQL instance. Can be either ENTERPRISE or ENTERPRISE\_PLUS. | `string` | `"ENTERPRISE"` | no | +| [enable\_backups](#input\_enable\_backups) | Set to true to enable backups | `bool` | `false` | no | | [labels](#input\_labels) | Labels to add to the instances. Key-value pairs. | `map(string)` | n/a | yes | | [network\_id](#input\_network\_id) | The ID of the GCE VPC network to which the instance is going to be created in.:
`projects/<project_id>/global/networks/<network_name>`" | `string` | n/a | yes | | [private\_vpc\_connection\_peering](#input\_private\_vpc\_connection\_peering) | The name of the VPC Network peering connection, used only as dependency for Cloud SQL creation. | `string` | `null` | no | @@ -85,7 +92,9 @@ | [sql\_instance\_name](#input\_sql\_instance\_name) | name given to the sql instance for ease of identification | `string` | n/a | yes | | [sql\_password](#input\_sql\_password) | Password for the SQL database. | `any` | `null` | no | | [sql\_username](#input\_sql\_username) | Username for the SQL database | `string` | `"slurm"` | no | +| [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | Self link of the network where Cloud SQL instance PSC endpoint will be created | `string` | `null` | no | | [tier](#input\_tier) | The machine type to use for the SQL instance | `string` | n/a | yes | +| [use\_psc\_connection](#input\_use\_psc\_connection) | Create Private Service Connection instead of using Private Service Access peering | `bool` | `false` | no | | [user\_managed\_replication](#input\_user\_managed\_replication) | Replication parameters that will be used for defined secrets |
list(object({
location = string
kms_key_name = optional(string)
}))
| `[]` | no | ## Outputs diff --git a/community/modules/database/slurm-cloudsql-federation/main.tf b/community/modules/database/slurm-cloudsql-federation/main.tf index 6e2bfaceeb..638e592f74 100644 --- a/community/modules/database/slurm-cloudsql-federation/main.tf +++ b/community/modules/database/slurm-cloudsql-federation/main.tf @@ -47,23 +47,76 @@ resource "google_sql_database_instance" "instance" { database_version = var.database_version settings { - user_labels = local.labels - tier = var.tier + disk_size = var.disk_size_gb + disk_autoresize = var.disk_autoresize + edition = var.edition + tier = var.tier + user_labels = local.labels + + dynamic "data_cache_config" { + for_each = var.edition == "ENTERPRISE_PLUS" ? [""] : [] + content { + data_cache_enabled = var.data_cache_enabled + } + } ip_configuration { ipv4_enabled = false - private_network = var.network_id + private_network = var.use_psc_connection ? null : var.network_id enable_private_path_for_google_cloud_services = true dynamic "authorized_networks" { - for_each = var.authorized_networks + for_each = var.use_psc_connection ? [] : var.authorized_networks iterator = ip_range content { value = ip_range.value } } + dynamic "psc_config" { + for_each = var.use_psc_connection ? [""] : [] + content { + psc_enabled = true + allowed_consumer_projects = [var.project_id] + } + } + } + + backup_configuration { + enabled = var.enable_backups + # to allow easy switching between ENTERPRISE and ENTERPRISE_PLUS + transaction_log_retention_days = 7 } } + lifecycle { + precondition { + condition = var.disk_autoresize && var.disk_size_gb == null || !var.disk_autoresize + error_message = "If setting disk_size_gb, set disk_autoresize to false to prevent re-provisioning of the instance after disk auto-expansion." + } + } +} + + + +resource "google_compute_address" "psc" { + count = var.use_psc_connection ? 1 : 0 + project = var.project_id + name = local.sql_instance_name + address_type = "INTERNAL" + region = var.region + subnetwork = var.subnetwork_self_link + labels = local.labels +} + +resource "google_compute_forwarding_rule" "psc_consumer" { + count = var.use_psc_connection ? 1 : 0 + name = local.sql_instance_name + project = var.project_id + region = var.region + subnetwork = var.subnetwork_self_link + ip_address = google_compute_address.psc[0].self_link + load_balancing_scheme = "" + recreate_closed_psc = true + target = google_sql_database_instance.instance.psc_service_attachment_link } resource "google_sql_database" "database" { diff --git a/community/modules/database/slurm-cloudsql-federation/outputs.tf b/community/modules/database/slurm-cloudsql-federation/outputs.tf index 21d8bbfcc9..0d05221cd8 100644 --- a/community/modules/database/slurm-cloudsql-federation/outputs.tf +++ b/community/modules/database/slurm-cloudsql-federation/outputs.tf @@ -18,7 +18,7 @@ output "cloudsql" { description = "Describes the cloudsql instance." sensitive = true value = { - server_ip = google_sql_database_instance.instance.ip_address[0].ip_address + server_ip = var.use_psc_connection ?
google_compute_address.psc[0].address : google_sql_database_instance.instance.ip_address[0].ip_address user = google_sql_user.users.name password = google_sql_user.users.password db_name = google_sql_database.database.name diff --git a/community/modules/database/slurm-cloudsql-federation/variables.tf b/community/modules/database/slurm-cloudsql-federation/variables.tf index ec41c70e9d..a921d60d65 100644 --- a/community/modules/database/slurm-cloudsql-federation/variables.tf +++ b/community/modules/database/slurm-cloudsql-federation/variables.tf @@ -26,16 +26,50 @@ variable "database_version" { type = string default = "MYSQL_5_7" validation { - condition = var.database_version == "MYSQL_5_7" || var.database_version == "MYSQL_8_0" - error_message = "The database version must be either MYSQL_5_7 or MYSQL_8_0." + condition = contains(["MYSQL_5_7", "MYSQL_8_0", "MYSQL_8_4"], var.database_version) + error_message = "The database version must be either MYSQL_5_7, MYSQL_8_0 or MYSQL_8_4." } } +variable "data_cache_enabled" { + description = "Whether data cache is enabled for the instance. Can be used with ENTERPRISE_PLUS edition." + type = bool + default = false +} + variable "deployment_name" { description = "The name of the current deployment" type = string } +variable "disk_autoresize" { + description = "Set to false to disable automatic disk growth." + type = bool + default = true +} + +variable "disk_size_gb" { + description = "Size of the database disk in GiB." + type = number + default = null +} + +variable "edition" { + description = "Edition of the Cloud SQL instance. Can be either ENTERPRISE or ENTERPRISE_PLUS." + type = string + validation { + condition = contains(["ENTERPRISE", "ENTERPRISE_PLUS"], var.edition) + error_message = "The database edition must be either ENTERPRISE or ENTERPRISE_PLUS" + } + default = "ENTERPRISE" +} + +variable "enable_backups" { + description = "Set to true to enable backups" + type = bool + default = false +} + variable "project_id" { description = "Project in which the HPC deployment will be created" type = string @@ -97,6 +131,12 @@ variable "private_vpc_connection_peering" { default = null } +variable "subnetwork_self_link" { + description = "Self link of the network where Cloud SQL instance PSC endpoint will be created" + type = string + default = null +} + variable "user_managed_replication" { type = list(object({ location = string @@ -105,3 +145,9 @@ variable "user_managed_replication" { description = "Replication parameters that will be used for defined secrets" default = [] } + +variable "use_psc_connection" { + description = "Create Private Service Connection instead of using Private Service Access peering" + type = bool + default = false +} diff --git a/community/modules/database/slurm-cloudsql-federation/versions.tf b/community/modules/database/slurm-cloudsql-federation/versions.tf index 1e92271e3a..f40eb68805 100644 --- a/community/modules/database/slurm-cloudsql-federation/versions.tf +++ b/community/modules/database/slurm-cloudsql-federation/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.42.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.42.0" } required_version = ">= 0.13.0" diff --git a/community/modules/file-system/Intel-DAOS/README.md
b/community/modules/file-system/Intel-DAOS/README.md index 410189eceb..04db0acb8c 100644 --- a/community/modules/file-system/Intel-DAOS/README.md +++ b/community/modules/file-system/Intel-DAOS/README.md @@ -1,65 +1 @@ -## Description - -This module allows creating an instance of Distributed Asynchronous Object Storage ([DAOS](https://docs.daos.io/)) on Google Cloud Platform ([GCP](https://cloud.google.com/)). - -> **_NOTE:_** -> DAOS on GCP does not require an Cluster Toolkit wrapper. -> Terraform modules are sourced directly from GitHub. -> It will not work as a [local or embedded module](../../../../modules/README.md#embedded-modules). - -Terraform modules for DAOS servers and clients are located in the [Google Cloud DAOS repo on GitHub](https://github.com/daos-stack/google-cloud-daos). - -DAOS Terraform module parameters can be found in the README.md files in each module directory. - -- [DAOS Server module](https://github.com/daos-stack/google-cloud-daos/tree/main/terraform/modules/daos_server#readme) -- [DAOS Client module](https://github.com/daos-stack/google-cloud-daos/tree/main/terraform/modules/daos_client#readme) - -For more information on this and other network storage options in the Cluster Toolkit, see the extended [Network Storage documentation](../../../../docs/network_storage.md). -## Examples - -The [community examples folder](../../../examples/intel/) contains two example blueprints for deploying DAOS. - -- [community/examples/intel/pfs-daos.yml](../../../examples/intel/pfs-daos.yml) - Blueprint for deploying a DAOS cluster consisting of servers and clients. - After deploying this example the DAOS storage system will be formatted but no pools or containers will exist. - The instructions in the [community/examples/intel/README.md](../../../examples/intel/README.md#create-a-daos-pool-and-container) describe how to - - - Deploy a DAOS cluster - - Manage storage (create a [pool](https://docs.daos.io/v2.2/overview/storage/?h=container#daos-pool) and a [container](https://docs.daos.io/v2.2/overview/storage/?h=container#daos-container)) - - Mount a container on a client - - Store a large file in a DAOS container - -- [community/examples/intel/hpc-slurm-daos.yaml](../../../examples/intel/hpc-slurm-daos.yaml) - Blueprint for deploying a Slurm cluster and DAOS storage with 4 servers. - The Slurm compute nodes are configured as DAOS clients and have the ability to use the DAOS filesystem. - The instructions in the [community/examples/intel/README.md](../../../examples/intel/README.md#deploy-the-daosslurm-cluster) describe how to deploy the Slurm cluster and run a job which uses the DAOS file system. - -## Support - -Content in the [google-cloud-daos](https://github.com/daos-stack/google-cloud-daos) repository is licensed under the [Apache License Version 2.0](https://github.com/daos-stack/google-cloud-daos/blob/main/LICENSE) open-source license. - -[DAOS](https://github.com/daos-stack/daos) is distributed under the BSD-2-Clause-Patent open-source license. - -Intel Corporation provides two options for technical support: - -1. Community Support - - Community support is available to anyone through Jira and via the DAOS channel for Google Cloud users on Slack. - - JIRA: https://daosio.atlassian.net/jira/software/c/projects/DAOS/issues/ - - - An Atlassian account is not needed for read only access to Jira. - - An Atlassian account is required to create and update tickets. 
- To create an account follow the steps at https://support.atlassian.com/atlassian-account/docs/create-an-atlassian-account. - - Slack: https://daos-stack.slack.com/archives/C03GLTLHA59 - - Community support is provided on a best-effort basis. - -2. Commercial L3 Support - - Commercial L3 support is available on an on-demand basis. - - Contact Intel Corporation to obtain more information about Commercial L3 support. - - You may inquire about L3 support via the [Slack channel](https://daos-stack.slack.com/archives/C03GLTLHA59). +> **_NOTE:_** Cluster Toolkit is dropping support for the external [Google Cloud DAOS](https://github.com/daos-stack/google-cloud-daos/tree/main) repository. The DAOS example blueprints (`hpc-slurm-daos.yaml` and `pfs-daos.yaml`) have been removed from the Cluster Toolkit. We recommend migrating to the first-party [Parallelstore](../../../../modules/file-system/parallelstore/) module for similar functionality. To help with this transition, see the Parallelstore example blueprints ([pfs-parallelstore.yaml](../../../../examples/pfs-parallelstore.yaml) and [ps-slurm.yaml](../../../../examples/ps-slurm.yaml)). If the external [Google Cloud DAOS](https://github.com/daos-stack/google-cloud-daos/tree/main) repository is necessary, we recommend using the last Cluster Toolkit [v1.41.0](https://github.com/GoogleCloudPlatform/cluster-toolkit/releases/tag/v1.41.0). diff --git a/community/modules/file-system/cloud-storage-bucket/versions.tf b/community/modules/file-system/cloud-storage-bucket/versions.tf index 0a6664171a..f36bbb2e2b 100644 --- a/community/modules/file-system/cloud-storage-bucket/versions.tf +++ b/community/modules/file-system/cloud-storage-bucket/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.42.0" } required_version = ">= 0.14.0" } diff --git a/community/modules/file-system/nfs-server/versions.tf b/community/modules/file-system/nfs-server/versions.tf index 5251b527b0..85db83d596 100644 --- a/community/modules/file-system/nfs-server/versions.tf +++ b/community/modules/file-system/nfs-server/versions.tf @@ -30,7 +30,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.42.0" } required_version = ">= 0.14.0" diff --git a/community/modules/files/fsi-montecarlo-on-batch/versions.tf b/community/modules/files/fsi-montecarlo-on-batch/versions.tf index 469e310bc0..1d1848a59b 100644 --- a/community/modules/files/fsi-montecarlo-on-batch/versions.tf +++ b/community/modules/files/fsi-montecarlo-on-batch/versions.tf @@ -35,9 +35,9 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.42.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.42.0" } } diff --git a/community/modules/network/private-service-access/versions.tf b/community/modules/network/private-service-access/versions.tf index efb0f8f2d1..26477fbb8f 100644 --- a/community/modules/network/private-service-access/versions.tf +++ b/community/modules/network/private-service-access/versions.tf @@ -30,11 +30,11 @@ terraform { } } 
provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.42.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.42.0" } required_version = ">= 1.2" diff --git a/community/modules/project/new-project/README.md b/community/modules/project/new-project/README.md index 850b06b602..5e5cabe9d5 100644 --- a/community/modules/project/new-project/README.md +++ b/community/modules/project/new-project/README.md @@ -6,16 +6,13 @@ access, Service Accounts, and API enablement to follow best practices. This module is meant for use with Terraform 0.13. +**Note:** This module has been removed from the Cluster Toolkit. The upstream module (`terraform-google-project-factory`) is now the recommended way to create and manage GCP projects. + ### Example ```yaml - id: project - source: community/modules/project/new-project - settings: - project_id: test_project - folder_id: 334688113020 # random number - billing_account: "111110-M2N704-854685" # random billing number - org_id: 123456789 # random org id + source: github.com/terraform-google-modules/terraform-google-project-factory?rev=v17.0.0&depth=1 ``` This creates a new project with pre-defined project ID, a designated folder and diff --git a/community/modules/project/new-project/main.tf b/community/modules/project/new-project/main.tf deleted file mode 100644 index 5a9a611a27..0000000000 --- a/community/modules/project/new-project/main.tf +++ /dev/null @@ -1,74 +0,0 @@ -/** - * Copyright 2022 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. -*/ - -locals { - # This label allows for billing report tracking based on module. - labels = merge(var.labels, { ghpc_module = "new-project", ghpc_role = "project" }) -} - -locals { - name = var.name != null ? 
var.name : var.project_id -} - -module "project_factory" { - source = "terraform-google-modules/project-factory/google" - version = "~> 11.3" - - random_project_id = var.random_project_id - org_id = var.org_id - domain = var.domain - name = local.name - project_id = var.project_id - svpc_host_project_id = var.svpc_host_project_id - enable_shared_vpc_host_project = var.enable_shared_vpc_host_project - billing_account = var.billing_account - folder_id = var.folder_id - group_name = var.group_name - group_role = var.group_role - create_project_sa = var.create_project_sa - project_sa_name = var.project_sa_name - sa_role = var.sa_role - activate_apis = var.activate_apis - activate_api_identities = var.activate_api_identities - usage_bucket_name = var.usage_bucket_name - usage_bucket_prefix = var.usage_bucket_prefix - shared_vpc_subnets = var.shared_vpc_subnets - labels = local.labels - bucket_project = var.bucket_project - bucket_name = var.bucket_name - bucket_location = var.bucket_location - bucket_versioning = var.bucket_versioning - bucket_labels = var.bucket_labels - bucket_force_destroy = var.bucket_force_destroy - bucket_ula = var.bucket_ula - auto_create_network = var.auto_create_network - lien = var.lien - disable_services_on_destroy = var.disable_services_on_destroy - default_service_account = var.default_service_account - disable_dependent_services = var.disable_dependent_services - budget_amount = var.budget_amount - budget_display_name = var.budget_display_name - budget_alert_pubsub_topic = var.budget_alert_pubsub_topic - budget_monitoring_notification_channels = var.budget_monitoring_notification_channels - budget_alert_spent_percents = var.budget_alert_spent_percents - vpc_service_control_attach_enabled = var.vpc_service_control_attach_enabled - vpc_service_control_perimeter_name = var.vpc_service_control_perimeter_name - grant_services_security_admin_role = var.grant_services_security_admin_role - grant_services_network_role = var.grant_services_network_role - consumer_quotas = var.consumer_quotas - default_network_tier = var.default_network_tier - -} diff --git a/community/modules/project/new-project/outputs.tf b/community/modules/project/new-project/outputs.tf deleted file mode 100644 index 07d2e038eb..0000000000 --- a/community/modules/project/new-project/outputs.tf +++ /dev/null @@ -1,100 +0,0 @@ -/** - * Copyright 2022 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -output "project_name" { - value = module.project_factory.project_name - description = "Name of the project that was created" -} - -output "project_id" { - value = module.project_factory.project_id - description = "ID of the project that was created" -} - -output "project_number" { - value = module.project_factory.project_number - description = "Number of the project that was created" -} - -output "domain" { - value = module.project_factory.domain - description = "The organization's domain" -} - -output "group_email" { - value = module.project_factory.group_email - description = "The email of the G Suite group with group_name" -} - -output "service_account_id" { - value = module.project_factory.service_account_id - description = "The id of the default service account" -} - -output "service_account_display_name" { - value = module.project_factory.service_account_display_name - description = "The display name of the default service account" -} - -output "service_account_email" { - value = module.project_factory.service_account_email - description = "The email of the default service account" -} - -output "service_account_name" { - value = module.project_factory.service_account_name - description = "The fully-qualified name of the default service account" -} - -output "service_account_unique_id" { - value = module.project_factory.service_account_unique_id - description = "The unique id of the default service account" -} - -output "project_bucket_self_link" { - value = module.project_factory.project_bucket_self_link - description = "Project's bucket selfLink" -} - -output "project_bucket_url" { - value = module.project_factory.project_bucket_url - description = "Project's bucket url" -} - -output "api_s_account" { - value = module.project_factory.api_s_account - description = "API service account email" -} - -output "api_s_account_fmt" { - value = module.project_factory.api_s_account_fmt - description = "API service account email formatted for terraform use" -} - -output "enabled_apis" { - value = module.project_factory.enabled_apis - description = "Enabled APIs in the project" -} - -output "enabled_api_identities" { - value = module.project_factory.enabled_api_identities - description = "Enabled API identities in the project" -} - -output "budget_name" { - value = module.project_factory.budget_name - description = "The name of the budget if created" -} diff --git a/community/modules/project/new-project/variables.tf b/community/modules/project/new-project/variables.tf deleted file mode 100644 index 8c776feebf..0000000000 --- a/community/modules/project/new-project/variables.tf +++ /dev/null @@ -1,288 +0,0 @@ -/** - * Copyright 2022 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. -*/ - -variable "random_project_id" { - description = "Adds a suffix of 4 random characters to the `project_id`" - type = bool - default = false -} - -variable "org_id" { - description = "The organization ID." 
- type = string -} - -variable "domain" { - description = "The domain name (optional)." - type = string - default = "" -} - -variable "name" { - description = "The name for the project" - type = string - default = null -} - -variable "project_id" { - description = "The ID to give the project. If not provided, the `name` will be used." - type = string - default = "" -} - -variable "svpc_host_project_id" { - description = "The ID of the host project which hosts the shared VPC" - type = string - default = "" -} - -variable "enable_shared_vpc_host_project" { - description = "If this project is a shared VPC host project. If true, you must *not* set svpc_host_project_id variable. Default is false." - type = bool - default = false -} - -variable "billing_account" { - description = "The ID of the billing account to associate this project with" - type = string -} - -variable "folder_id" { - description = "The ID of a folder to host this project" - type = string - default = "" -} - -variable "group_name" { - description = "A group to control the project by being assigned group_role (defaults to project editor)" - type = string - default = "" -} - -variable "group_role" { - description = "The role to give the controlling group (group_name) over the project (defaults to project editor)" - type = string - default = "roles/editor" -} - -variable "create_project_sa" { - description = "Whether the default service account for the project shall be created" - type = bool - default = true -} - -variable "project_sa_name" { - description = "Default service account name for the project." - type = string - default = "project-service-account" -} - -variable "sa_role" { - description = "A role to give the default Service Account for the project (defaults to none)" - type = string - default = "" -} - -variable "activate_apis" { - description = "The list of apis to activate within the project" - type = list(string) - default = [ - "compute.googleapis.com", - "serviceusage.googleapis.com", - "storage.googleapis.com", - ] -} - -variable "activate_api_identities" { - description = < [client\_startup\_script](#module\_client\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.39.0&depth=1 | -| [instances](#module\_instances) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | 09ae2725 | +| [client\_startup\_script](#module\_client\_startup\_script) | ../../../../modules/scripts/startup-script | n/a | +| [instances](#module\_instances) | ../../../../modules/compute/vm-instance | n/a | ## Resources diff --git a/community/modules/remote-desktop/chrome-remote-desktop/main.tf b/community/modules/remote-desktop/chrome-remote-desktop/main.tf index 5d7165a2c2..1091c2cff5 100644 --- a/community/modules/remote-desktop/chrome-remote-desktop/main.tf +++ b/community/modules/remote-desktop/chrome-remote-desktop/main.tf @@ -55,7 +55,7 @@ locals { } module "client_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.39.0&depth=1" + source = "../../../../modules/scripts/startup-script" deployment_name = var.deployment_name project_id = var.project_id @@ -71,7 +71,7 @@ module "client_startup_script" { } module "instances" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=09ae2725" + source = "../../../../modules/compute/vm-instance" instance_count = var.instance_count name_prefix = var.name_prefix diff --git a/community/modules/scheduler/htcondor-access-point/README.md 
b/community/modules/scheduler/htcondor-access-point/README.md index a50aba0f39..05c24cb6e8 100644 --- a/community/modules/scheduler/htcondor-access-point/README.md +++ b/community/modules/scheduler/htcondor-access-point/README.md @@ -120,9 +120,9 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| -| [access\_point\_instance\_template](#module\_access\_point\_instance\_template) | github.com/terraform-google-modules/terraform-google-vm//modules/instance_template | 73dc845 | -| [htcondor\_ap](#module\_htcondor\_ap) | terraform-google-modules/vm/google//modules/mig | 10.1.1 | -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.39.0&depth=1 | +| [access\_point\_instance\_template](#module\_access\_point\_instance\_template) | terraform-google-modules/vm/google//modules/instance_template | ~> 12.1 | +| [htcondor\_ap](#module\_htcondor\_ap) | terraform-google-modules/vm/google//modules/mig | ~> 12.1 | +| [startup\_script](#module\_startup\_script) | ../../../../modules/scripts/startup-script | n/a | ## Resources diff --git a/community/modules/scheduler/htcondor-access-point/main.tf b/community/modules/scheduler/htcondor-access-point/main.tf index 89de24a67c..30a71679f7 100644 --- a/community/modules/scheduler/htcondor-access-point/main.tf +++ b/community/modules/scheduler/htcondor-access-point/main.tf @@ -183,7 +183,7 @@ resource "google_storage_bucket_object" "ap_config" { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.39.0&depth=1" + source = "../../../../modules/scripts/startup-script" project_id = var.project_id region = var.region @@ -226,7 +226,8 @@ resource "google_compute_disk" "spool" { } module "access_point_instance_template" { - source = "github.com/terraform-google-modules/terraform-google-vm//modules/instance_template?ref=73dc845" + source = "terraform-google-modules/vm/google//modules/instance_template" + version = "~> 12.1" name_prefix = local.name_prefix project_id = var.project_id @@ -261,7 +262,7 @@ module "access_point_instance_template" { module "htcondor_ap" { source = "terraform-google-modules/vm/google//modules/mig" - version = "10.1.1" + version = "~> 12.1" project_id = var.project_id region = var.region diff --git a/community/modules/scheduler/htcondor-access-point/versions.tf b/community/modules/scheduler/htcondor-access-point/versions.tf index 3d452c24bb..260bf47cc0 100644 --- a/community/modules/scheduler/htcondor-access-point/versions.tf +++ b/community/modules/scheduler/htcondor-access-point/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.42.0" } required_version = ">= 1.1" diff --git a/community/modules/scheduler/htcondor-central-manager/README.md b/community/modules/scheduler/htcondor-central-manager/README.md index 3e6a33dfeb..5a94c7bc5f 100644 --- a/community/modules/scheduler/htcondor-central-manager/README.md +++ b/community/modules/scheduler/htcondor-central-manager/README.md @@ -104,9 +104,9 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [central\_manager\_instance\_template](#module\_central\_manager\_instance\_template) | terraform-google-modules/vm/google//modules/instance_template | 10.1.1 | -| [htcondor\_cm](#module\_htcondor\_cm) | terraform-google-modules/vm/google//modules/mig | 10.1.1 | -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.39.0&depth=1 | +| [central\_manager\_instance\_template](#module\_central\_manager\_instance\_template) | terraform-google-modules/vm/google//modules/instance_template | ~> 12.1 | +| [htcondor\_cm](#module\_htcondor\_cm) | terraform-google-modules/vm/google//modules/mig | ~> 12.1 | +| [startup\_script](#module\_startup\_script) | ../../../../modules/scripts/startup-script | n/a | ## Resources diff --git a/community/modules/scheduler/htcondor-central-manager/main.tf b/community/modules/scheduler/htcondor-central-manager/main.tf index eec76139c2..35da433f5f 100644 --- a/community/modules/scheduler/htcondor-central-manager/main.tf +++ b/community/modules/scheduler/htcondor-central-manager/main.tf @@ -122,7 +122,7 @@ resource "google_storage_bucket_object" "cm_config" { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.39.0&depth=1" + source = "../../../../modules/scripts/startup-script" project_id = var.project_id region = var.region @@ -134,7 +134,7 @@ module "startup_script" { module "central_manager_instance_template" { source = "terraform-google-modules/vm/google//modules/instance_template" - version = "10.1.1" + version = "~> 12.1" name_prefix = local.name_prefix project_id = var.project_id @@ -160,7 +160,7 @@ module "central_manager_instance_template" { module "htcondor_cm" { source = "terraform-google-modules/vm/google//modules/mig" - version = "10.1.1" + version = "~> 12.1" project_id = var.project_id region = var.region diff --git a/community/modules/scheduler/htcondor-central-manager/versions.tf b/community/modules/scheduler/htcondor-central-manager/versions.tf index 432b506666..8b1837acae 100644 --- a/community/modules/scheduler/htcondor-central-manager/versions.tf +++ b/community/modules/scheduler/htcondor-central-manager/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.42.0" } required_version = ">= 1.1.0" diff --git a/community/modules/scheduler/htcondor-pool-secrets/versions.tf b/community/modules/scheduler/htcondor-pool-secrets/versions.tf index 103fe43a30..766a26ced7 100644 --- a/community/modules/scheduler/htcondor-pool-secrets/versions.tf +++ b/community/modules/scheduler/htcondor-pool-secrets/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.42.0" } required_version = ">= 1.3.0" diff --git a/community/modules/scheduler/htcondor-service-accounts/README.md b/community/modules/scheduler/htcondor-service-accounts/README.md index 1246d39994..5a403c0a38 100644 --- a/community/modules/scheduler/htcondor-service-accounts/README.md +++ b/community/modules/scheduler/htcondor-service-accounts/README.md @@ -100,9 +100,9 @@ No providers. 
| Name | Source | Version | |------|--------|---------| -| [access\_point\_service\_account](#module\_access\_point\_service\_account) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account | v1.39.0&depth=1 | -| [central\_manager\_service\_account](#module\_central\_manager\_service\_account) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account | v1.39.0&depth=1 | -| [execute\_point\_service\_account](#module\_execute\_point\_service\_account) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account | v1.39.0&depth=1 | +| [access\_point\_service\_account](#module\_access\_point\_service\_account) | ../../../../community/modules/project/service-account | n/a | +| [central\_manager\_service\_account](#module\_central\_manager\_service\_account) | ../../../../community/modules/project/service-account | n/a | +| [execute\_point\_service\_account](#module\_execute\_point\_service\_account) | ../../../../community/modules/project/service-account | n/a | ## Resources diff --git a/community/modules/scheduler/htcondor-service-accounts/main.tf b/community/modules/scheduler/htcondor-service-accounts/main.tf index 9d72da114d..9d97b18642 100644 --- a/community/modules/scheduler/htcondor-service-accounts/main.tf +++ b/community/modules/scheduler/htcondor-service-accounts/main.tf @@ -21,7 +21,7 @@ # require them module "access_point_service_account" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account?ref=v1.39.0&depth=1" + source = "../../../../community/modules/project/service-account" project_id = var.project_id display_name = "HTCondor Access Point" @@ -31,7 +31,7 @@ module "access_point_service_account" { } module "execute_point_service_account" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account?ref=v1.39.0&depth=1" + source = "../../../../community/modules/project/service-account" project_id = var.project_id display_name = "HTCondor Execute Point" @@ -41,7 +41,7 @@ module "execute_point_service_account" { } module "central_manager_service_account" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account?ref=v1.39.0&depth=1" + source = "../../../../community/modules/project/service-account" project_id = var.project_id display_name = "HTCondor Central Manager" diff --git a/community/modules/scheduler/htcondor-setup/README.md b/community/modules/scheduler/htcondor-setup/README.md index 254bb08241..9c46f0a3c8 100644 --- a/community/modules/scheduler/htcondor-setup/README.md +++ b/community/modules/scheduler/htcondor-setup/README.md @@ -90,8 +90,8 @@ No providers. 
| Name | Source | Version | |------|--------|---------| -| [health\_check\_firewall\_rule](#module\_health\_check\_firewall\_rule) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/network/firewall-rules | 9e695aab | -| [htcondor\_bucket](#module\_htcondor\_bucket) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/file-system/cloud-storage-bucket/ | 9e695aab | +| [health\_check\_firewall\_rule](#module\_health\_check\_firewall\_rule) | ../../../../modules/network/firewall-rules | n/a | +| [htcondor\_bucket](#module\_htcondor\_bucket) | ../../../../community/modules/file-system/cloud-storage-bucket | n/a | ## Resources diff --git a/community/modules/scheduler/htcondor-setup/main.tf b/community/modules/scheduler/htcondor-setup/main.tf index ae4dca1b73..ad471fddad 100644 --- a/community/modules/scheduler/htcondor-setup/main.tf +++ b/community/modules/scheduler/htcondor-setup/main.tf @@ -33,7 +33,7 @@ locals { } module "health_check_firewall_rule" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/network/firewall-rules?ref=9e695aab" + source = "../../../../modules/network/firewall-rules" subnetwork_self_link = var.subnetwork_self_link @@ -54,7 +54,7 @@ module "health_check_firewall_rule" { } module "htcondor_bucket" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/file-system/cloud-storage-bucket/?ref=9e695aab" + source = "../../../../community/modules/file-system/cloud-storage-bucket" project_id = var.project_id deployment_name = var.deployment_name diff --git a/community/modules/scheduler/pbspro-client/README.md b/community/modules/scheduler/pbspro-client/README.md index cc18153ccd..edcfa9b591 100644 --- a/community/modules/scheduler/pbspro-client/README.md +++ b/community/modules/scheduler/pbspro-client/README.md @@ -74,9 +74,9 @@ No providers. 
| Name | Source | Version | |------|--------|---------| -| [client\_startup\_script](#module\_client\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.39.0&depth=1 | -| [pbs\_client](#module\_pbs\_client) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | 09ae2725 | -| [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.39.0&depth=1 | +| [client\_startup\_script](#module\_client\_startup\_script) | ../../../../modules/scripts/startup-script | n/a | +| [pbs\_client](#module\_pbs\_client) | ../../../../modules/compute/vm-instance | n/a | +| [pbs\_install](#module\_pbs\_install) | ../../../../community/modules/scripts/pbspro-install | n/a | ## Resources diff --git a/community/modules/scheduler/pbspro-client/main.tf b/community/modules/scheduler/pbspro-client/main.tf index fb51718b86..4da020dac1 100644 --- a/community/modules/scheduler/pbspro-client/main.tf +++ b/community/modules/scheduler/pbspro-client/main.tf @@ -32,7 +32,7 @@ locals { } module "pbs_install" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install?ref=v1.39.0&depth=1" + source = "../../../../community/modules/scripts/pbspro-install" pbs_exec = var.pbs_exec pbs_home = var.pbs_home @@ -43,7 +43,7 @@ module "pbs_install" { } module "client_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.39.0&depth=1" + source = "../../../../modules/scripts/startup-script" deployment_name = var.deployment_name project_id = var.project_id @@ -57,7 +57,7 @@ module "client_startup_script" { } module "pbs_client" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=09ae2725" + source = "../../../../modules/compute/vm-instance" instance_count = var.instance_count spot = var.spot diff --git a/community/modules/scheduler/pbspro-server/README.md b/community/modules/scheduler/pbspro-server/README.md index 95bd3c74a7..53b0c51f95 100644 --- a/community/modules/scheduler/pbspro-server/README.md +++ b/community/modules/scheduler/pbspro-server/README.md @@ -69,10 +69,10 @@ No providers. 
| Name | Source | Version | |------|--------|---------| -| [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.39.0&depth=1 | -| [pbs\_qmgr](#module\_pbs\_qmgr) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-qmgr | v1.39.0&depth=1 | -| [pbs\_server](#module\_pbs\_server) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | 09ae2725 | -| [server\_startup\_script](#module\_server\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.39.0&depth=1 | +| [pbs\_install](#module\_pbs\_install) | ../../../../community/modules/scripts/pbspro-install | n/a | +| [pbs\_qmgr](#module\_pbs\_qmgr) | ../../../../community/modules/scripts/pbspro-qmgr | n/a | +| [pbs\_server](#module\_pbs\_server) | ../../../../modules/compute/vm-instance | n/a | +| [server\_startup\_script](#module\_server\_startup\_script) | ../../../../modules/scripts/startup-script | n/a | ## Resources diff --git a/community/modules/scheduler/pbspro-server/main.tf b/community/modules/scheduler/pbspro-server/main.tf index 5671829ade..657cb6e86b 100644 --- a/community/modules/scheduler/pbspro-server/main.tf +++ b/community/modules/scheduler/pbspro-server/main.tf @@ -32,7 +32,7 @@ locals { } module "pbs_install" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install?ref=v1.39.0&depth=1" + source = "../../../../community/modules/scripts/pbspro-install" pbs_data_service_user = var.pbs_data_service_user pbs_exec = var.pbs_exec @@ -45,7 +45,7 @@ module "pbs_install" { } module "pbs_qmgr" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-qmgr?ref=v1.39.0&depth=1" + source = "../../../../community/modules/scripts/pbspro-qmgr" client_host_count = var.client_host_count client_hostname_prefix = var.client_hostname_prefix @@ -55,7 +55,7 @@ module "pbs_qmgr" { } module "server_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.39.0&depth=1" + source = "../../../../modules/scripts/startup-script" deployment_name = var.deployment_name project_id = var.project_id @@ -70,7 +70,7 @@ module "server_startup_script" { } module "pbs_server" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=09ae2725" + source = "../../../../modules/compute/vm-instance" instance_count = var.instance_count spot = var.spot diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md index b9ae2ce50c..fb020afb9f 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md @@ -22,14 +22,14 @@ controller for optimal performance at different scales. 
> > ```shell > # Install Python3 and run -> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.0/scripts/requirements.txt +> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.2/scripts/requirements.txt > ``` -[SchedMD/slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0 -[slurm\_controller\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0/terraform/slurm_cluster/modules/slurm_controller_instance -[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0/terraform/slurm_cluster/modules/slurm_instance_template +[SchedMD/slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2 +[slurm\_controller\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2/terraform/slurm_cluster/modules/slurm_controller_instance +[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2/terraform/slurm_cluster/modules/slurm_instance_template [slurm-ug]: https://goo.gle/slurm-gcp-user-guide. -[requirements.txt]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.0/scripts/requirements.txt +[requirements.txt]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.2/scripts/requirements.txt [enable\_cleanup\_compute]: #input\_enable\_cleanup\_compute [enable\_cleanup\_subscriptions]: #input\_enable\_cleanup\_subscriptions [enable\_reconfigure]: #input\_enable\_reconfigure @@ -99,12 +99,12 @@ This option has some additional requirements: development environment deploying the cluster. One can use following commands: ```bash - pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.0/scripts/requirements.txt + pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.2/scripts/requirements.txt ``` For more information, see the [description][optdeps] of this module. -[optdeps]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0/terraform/slurm_cluster#optional +[optdeps]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2/terraform/slurm_cluster#optional ## Custom Images @@ -220,8 +220,8 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_instance | 5.12.0 | -| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 5.12.0 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_instance | 5.12.2 | +| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 5.12.2 | ## Resources diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/gpu_definition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/gpu_definition.tf index c6c3944332..6c5d96d286 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/gpu_definition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) # Select in priority order: # (1) var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [{ count = 0, type = "" }]) + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf index 92d7a9d840..fca4d3e203 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf @@ -61,7 +61,7 @@ data "google_compute_default_service_account" "default" { } module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_instance?ref=5.12.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_instance?ref=5.12.2" access_config = local.access_config slurm_cluster_name = local.slurm_cluster_name @@ -70,7 +70,7 @@ module "slurm_controller_instance" { region = var.region network = var.network_self_link == null ? "" : var.network_self_link subnetwork = var.subnetwork_self_link == null ? "" : var.subnetwork_self_link - subnetwork_project = var.subnetwork_project == null ? 
"" : var.subnetwork_project + subnetwork_project = var.subnetwork_project zone = var.zone static_ips = var.static_ips cgroup_conf_tpl = var.cgroup_conf_tpl @@ -99,7 +99,7 @@ module "slurm_controller_instance" { } module "slurm_controller_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=5.12.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=5.12.2" additional_disks = local.additional_disks can_ip_forward = var.can_ip_forward @@ -127,7 +127,7 @@ module "slurm_controller_template" { source_image_project = local.source_image_project_normalized # requires source_image_logic.tf source_image = local.source_image # requires source_image_logic.tf network = var.network_self_link == null ? "" : var.network_self_link - subnetwork_project = var.subnetwork_project == null ? "" : var.subnetwork_project + subnetwork_project = var.subnetwork_project subnetwork = var.subnetwork_self_link == null ? "" : var.subnetwork_self_link tags = concat([local.slurm_cluster_name], var.tags) service_account = var.service_account != null ? var.service_account : { diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf index d9e1f9b600..b6b84844fe 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-controller/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-controller/v1.42.0" } required_version = ">= 1.1" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md index 56cbc33b07..73b6c5fcb0 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md @@ -44,7 +44,7 @@ manually. This will require addition configuration and verification of permissions. For more information see the [hybrid.md] documentation on [slurm-gcp]. -[slurm-controller-hybrid]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0/terraform/slurm_cluster/modules/slurm_controller_hybrid +[slurm-controller-hybrid]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2/terraform/slurm_cluster/modules/slurm_controller_hybrid > **_NOTE:_** The hybrid module requires the following dependencies to be > installed on the system deploying the module: @@ -64,15 +64,15 @@ permissions. For more information see the [hybrid.md] documentation on [pyyaml]: https://pypi.org/project/PyYAML/ [google-api-python-client]: https://pypi.org/project/google-api-python-client/ [google-cloud-pubsub]: https://pypi.org/project/google-cloud-pubsub/ -[requirements.txt]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.0/scripts/requirements.txt +[requirements.txt]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.2/scripts/requirements.txt ### Manual Configuration This module *does not* complete the installation of hybrid partitions on your slurm cluster. After deploying, you must follow the steps listed out in the [hybrid.md] documentation under [manual steps]. 
-[hybrid.md]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.0/docs/hybrid.md -[manual steps]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.0/docs/hybrid.md#manual-configurations +[hybrid.md]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.2/docs/hybrid.md +[manual steps]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.2/docs/hybrid.md#manual-configurations ### Example Usage The hybrid module can be added to a blueprint as follows: @@ -152,10 +152,10 @@ strongly advise only using versions 21 or 22 when using this module. Attempting to use this module with any version older than 21 may lead to unexpected results. -[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0 +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2 [pre-existing-network-storage]: ../../../../modules/file-system/pre-existing-network-storage/ [schedmd-slurm-gcp-v5-partition]: ../../compute/schedmd-slurm-gcp-v5-partition/ -[packer templates]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0/packer +[packer templates]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2/packer ## License @@ -187,7 +187,7 @@ No providers. | Name | Source | Version | |------|--------|---------| -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid | 5.12.0 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid | 5.12.2 | ## Resources diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf index 7b8eb1171d..c721a13bb3 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf @@ -28,7 +28,7 @@ locals { } module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid?ref=5.12.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid?ref=5.12.2" project_id = var.project_id slurm_cluster_name = local.slurm_cluster_name diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md index 44b337ec78..787ece124c 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md @@ -10,9 +10,9 @@ This module creates a login node for a Slurm cluster based on the terraform modules. The login node is used in conjunction with the [Slurm controller](../schedmd-slurm-gcp-v5-controller/README.md). 
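In a blueprint, the login node is typically declared next to the controller and wired to it with `use`, as in the minimal sketch below. The module ids (`network1`, `slurm_controller`) and the `machine_type` value are assumptions for illustration only; the Example section that follows shows the supported usage.

```yaml
  - id: slurm_login
    source: community/modules/scheduler/schedmd-slurm-gcp-v5-login
    use:
    - network1          # assumed id of the VPC module defined earlier in the blueprint
    - slurm_controller  # assumed id of the schedmd-slurm-gcp-v5-controller module
    settings:
      machine_type: n2-standard-4  # illustrative size for an interactive login node
```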
-[SchedMD/slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0 -[slurm\_login\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0/terraform/slurm_cluster/modules/slurm_login_instance -[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0/terraform/slurm_cluster/modules/slurm_instance_template +[SchedMD/slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2 +[slurm\_login\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2/terraform/slurm_cluster/modules/slurm_login_instance +[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2/terraform/slurm_cluster/modules/slurm_instance_template ### Example @@ -51,8 +51,8 @@ The Cluster Toolkit team maintains the wrapper around the [slurm-on-gcp] terrafo modules. For support with the underlying modules, see the instructions in the [slurm-gcp README][slurm-gcp-readme]. -[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0 -[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0#slurm-on-google-cloud-platform +[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2 +[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2#slurm-on-google-cloud-platform ## License @@ -87,8 +87,8 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| -| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance | 5.12.0 | -| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 5.12.0 | +| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance | 5.12.2 | +| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 5.12.2 | ## Resources diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf index c6c3944332..6c5d96d286 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) # Select in priority order: # (1) var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [{ count = 0, type = "" }]) + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf index af9254ae74..3046dbac9d 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf +++ 
b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf @@ -57,7 +57,7 @@ data "google_compute_default_service_account" "default" { } module "slurm_login_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=5.12.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=5.12.2" additional_disks = local.additional_disks can_ip_forward = var.can_ip_forward @@ -85,7 +85,7 @@ module "slurm_login_template" { source_image_project = local.source_image_project_normalized # requires source_image_logic.tf source_image = local.source_image # requires source_image_logic.tf network = var.network_self_link == null ? "" : var.network_self_link - subnetwork_project = var.subnetwork_project == null ? "" : var.subnetwork_project + subnetwork_project = var.subnetwork_project subnetwork = var.subnetwork_self_link == null ? "" : var.subnetwork_self_link tags = concat([local.slurm_cluster_name], var.tags) service_account = var.service_account != null ? var.service_account : { @@ -95,7 +95,7 @@ module "slurm_login_template" { } module "slurm_login_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance?ref=5.12.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance?ref=5.12.2" access_config = local.access_config slurm_cluster_name = local.slurm_cluster_name diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf index c52321d462..af509f4827 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-login/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-login/v1.42.0" } required_version = ">= 1.1" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 9f4933a1fa..65f4b437f5 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -11,9 +11,9 @@ The [user guide][slurm-ug] provides detailed instructions on customizing and enhancing the Slurm on GCP cluster as well as recommendations on configuring the controller for optimal performance at different scales. -[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2 -[slurm\_controller\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2/terraform/slurm_cluster/modules/slurm_controller_instance -[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2/terraform/slurm_cluster/modules/slurm_instance_template +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.5 +[slurm\_controller\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.5/terraform/slurm_cluster/modules/slurm_controller_instance +[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.5/terraform/slurm_cluster/modules/slurm_instance_template [slurm-ug]: https://goo.gle/slurm-gcp-user-guide. 
[enable\_cleanup\_compute]: #input\_enable\_cleanup\_compute [enable\_cleanup\_subscriptions]: #input\_enable\_cleanup\_subscriptions @@ -234,22 +234,22 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| -| [bucket](#module\_bucket) | terraform-google-modules/cloud-storage/google | ~> 5.0 | -| [daos\_network\_storage\_scripts](#module\_daos\_network\_storage\_scripts) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.39.0&depth=1 | +| [bucket](#module\_bucket) | terraform-google-modules/cloud-storage/google | ~> 6.1 | +| [daos\_network\_storage\_scripts](#module\_daos\_network\_storage\_scripts) | ../../../../modules/scripts/startup-script | n/a | | [nodeset\_cleanup](#module\_nodeset\_cleanup) | ./modules/cleanup_compute | n/a | | [nodeset\_cleanup\_tpu](#module\_nodeset\_cleanup\_tpu) | ./modules/cleanup_tpu | n/a | -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.8.2 | -| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.2 | +| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.5 | | [slurm\_files](#module\_slurm\_files) | ./modules/slurm_files | n/a | -| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.8.2 | -| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.2 | -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.2 | -| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.8.2 | +| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.8.5 | +| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.5 | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.5 | +| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.8.5 | ## Resources | Name | Type | |------|------| +| [google_compute_instance_from_template.controller](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_from_template) | resource | | [google_secret_manager_secret.cloudsql](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret) | resource | | [google_secret_manager_secret_iam_member.cloudsql_secret_accessor](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret_iam_member) | resource | | 
[google_secret_manager_secret_version.cloudsql_version](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret_version) | resource | @@ -269,7 +269,7 @@ limitations under the License. | [bucket\_name](#input\_bucket\_name) | Name of GCS bucket.
Ignored when 'create\_bucket' is true. | `string` | `null` | no | | [can\_ip\_forward](#input\_can\_ip\_forward) | Enable IP forwarding, for NAT instances for example. | `bool` | `false` | no | | [cgroup\_conf\_tpl](#input\_cgroup\_conf\_tpl) | Slurm cgroup.conf template file path. | `string` | `null` | no | -| [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. Defaults inherited from [Slurm GCP repo](https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/terraform/slurm_cluster/modules/slurm_files/README_TF.md#input_cloud_parameters) |
object({
no_comma_params = optional(bool)
resume_rate = optional(number)
resume_timeout = optional(number)
suspend_rate = optional(number)
suspend_timeout = optional(number)
topology_plugin = optional(string)
topology_param = optional(string)
tree_width = optional(number)
})
| `{}` | no | +| [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. Defaults inherited from [Slurm GCP repo](https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/terraform/slurm_cluster/modules/slurm_files/README_TF.md#input_cloud_parameters) |
object({
no_comma_params = optional(bool, false)
private_data = optional(list(string))
scheduler_parameters = optional(list(string))
resume_rate = optional(number)
resume_timeout = optional(number)
suspend_rate = optional(number)
suspend_timeout = optional(number)
topology_plugin = optional(string)
topology_param = optional(string)
tree_width = optional(number)
})
| `{}` | no | | [cloudsql](#input\_cloudsql) | Use this database instead of the one on the controller.
server\_ip : Address of the database server.
user : The user to access the database as.
password : The password, given the user, to access the given database. (sensitive)
db\_name : The database to access.
user\_managed\_replication : The list of location and (optional) kms\_key\_name for secret |
object({
server_ip = string
user = string
password = string # sensitive
db_name = string
user_managed_replication = optional(list(object({
location = string
kms_key_name = optional(string)
})), [])
})
| `null` | no | | [compute\_startup\_script](#input\_compute\_startup\_script) | Startup script used by the compute VMs. | `string` | `"# no-op"` | no | | [compute\_startup\_scripts\_timeout](#input\_compute\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in compute\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | @@ -289,7 +289,7 @@ limitations under the License. | [enable\_confidential\_vm](#input\_enable\_confidential\_vm) | Enable the Confidential VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [enable\_controller\_public\_ips](#input\_enable\_controller\_public\_ips) | If set to true. The controller will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `false` | no | | [enable\_debug\_logging](#input\_enable\_debug\_logging) | Enables debug logging mode. | `bool` | `false` | no | -| [enable\_default\_mounts](#input\_enable\_default\_mounts) | Enable default global network storage from the controller
- /usr/local/etc/slurm
- /etc/munge
- /home
- /apps
Warning: If these are disabled, the slurm etc and munge dirs must be added
manually, or some other mechanism must be used to synchronize the slurm conf
files and the munge key across the cluster. | `bool` | `true` | no | +| [enable\_default\_mounts](#input\_enable\_default\_mounts) | Enable default global network storage from the controller
- /home
- /apps
Warning: If these are disabled, the slurm etc and munge dirs must be added
manually, or some other mechanism must be used to synchronize the slurm conf
files and the munge key across the cluster. | `bool` | `true` | no | | [enable\_devel](#input\_enable\_devel) | DEPRECATED: `enable_devel` is always on. | `bool` | `null` | no | | [enable\_external\_prolog\_epilog](#input\_enable\_external\_prolog\_epilog) | Automatically enable a script that will execute prolog and epilog scripts
shared by NFS from the controller to compute nodes. Find more details at:
https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/tools/prologs-epilogs/README.md | `bool` | `null` | no | | [enable\_oslogin](#input\_enable\_oslogin) | Enables Google Cloud os-login for user login and authentication for VMs.
See https://cloud.google.com/compute/docs/oslogin | `bool` | `true` | no | @@ -301,7 +301,7 @@ limitations under the License. | [extra\_logging\_flags](#input\_extra\_logging\_flags) | The only available flag is `trace_api` | `map(bool)` | `{}` | no | | [gcloud\_path\_override](#input\_gcloud\_path\_override) | Directory of the gcloud executable to be used during cleanup | `string` | `""` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-7-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-8-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [instance\_template](#input\_instance\_template) | DEPRECATED: Instance template can not be specified for controller. | `string` | `null` | no | | [labels](#input\_labels) | Labels, provided as a map. | `map(string)` | `{}` | no | @@ -313,7 +313,7 @@ limitations under the License. | [metadata](#input\_metadata) | Metadata, provided as a map. | `map(string)` | `{}` | no | | [min\_cpu\_platform](#input\_min\_cpu\_platform) | Specifies a minimum CPU platform. Applicable values are the friendly names of
CPU platforms, such as Intel Haswell or Intel Skylake. See the complete list:
https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform | `string` | `null` | no | | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on all instances. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
}))
| `[]` | no | -| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
enable_maintenance_reservation = optional(bool, true)
gpu = optional(object({
count = number
type = string
}))
dws_flex = object({
enabled = bool
max_run_duration = number
})
labels = optional(map(string), {})
machine_type = optional(string)
maintenance_interval = optional(string)
instance_properties_json = string
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
reservation_name = optional(string)
startup_script = optional(list(object({
filename = string
content = string })), [])

zone_target_shape = string
zone_policy_allow = set(string)
zone_policy_deny = set(string)
}))
| `[]` | no | +| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
enable_maintenance_reservation = optional(bool, false)
enable_opportunistic_maintenance = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
dws_flex = object({
enabled = bool
max_run_duration = number
use_job_duration = bool
})
labels = optional(map(string), {})
machine_type = optional(string)
maintenance_interval = optional(string)
instance_properties_json = string
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
reservation_name = optional(string)
startup_script = optional(list(object({
filename = string
content = string })), [])

zone_target_shape = string
zone_policy_allow = set(string)
zone_policy_deny = set(string)
}))
| `[]` | no | | [nodeset\_dyn](#input\_nodeset\_dyn) | Defines dynamic nodesets, as a list. |
list(object({
nodeset_name = string
nodeset_feature = string
}))
| `[]` | no | | [nodeset\_tpu](#input\_nodeset\_tpu) | Define TPU nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 5)
nodeset_name = string
enable_public_ip = optional(bool, false)
node_type = string
accelerator_config = optional(object({
topology = string
version = string
}), {
topology = ""
version = ""
})
tf_version = string
preemptible = optional(bool, false)
preserve_tpu = optional(bool, false)
zone = string
data_disks = optional(list(string), [])
docker_image = optional(string, "")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
subnetwork = string
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
project_id = string
reserved = optional(string, false)
}))
| `[]` | no | | [on\_host\_maintenance](#input\_on\_host\_maintenance) | Instance availability Policy. | `string` | `"MIGRATE"` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf index 1ce6ed158f..4b455ed5bd 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf @@ -43,7 +43,7 @@ locals { # INSTANCE TEMPLATE module "slurm_controller_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.2" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.5" project_id = var.project_id region = var.region @@ -73,7 +73,6 @@ module "slurm_controller_template" { metadata = local.metadata min_cpu_platform = var.min_cpu_platform - # network_ip = TODO: add support for network_ip on_host_maintenance = var.on_host_maintenance preemptible = var.preemptible service_account = local.service_account @@ -82,7 +81,6 @@ module "slurm_controller_template" { source_image_project = local.source_image_project_normalized # requires source_image_logic.tf source_image = local.source_image # requires source_image_logic.tf - # spot = TODO: add support for spot (?) subnetwork = var.subnetwork_self_link tags = concat([local.slurm_cluster_name], var.tags) @@ -90,32 +88,31 @@ module "slurm_controller_template" { } # INSTANCE -locals { - # TODO: add support for proper access_config - access_config = { - nat_ip = null - network_tier = null +resource "google_compute_instance_from_template" "controller" { + name = "${local.slurm_cluster_name}-controller" + project = var.project_id + zone = var.zone + source_instance_template = module.slurm_controller_template.self_link + + allow_stopping_for_update = true + + # Can't rely on template to specify nics due to usage of static_ip + network_interface { + dynamic "access_config" { + for_each = var.enable_controller_public_ips ? ["unit"] : [] + content { + nat_ip = null + network_tier = null + } + } + network_ip = length(var.static_ips) == 0 ? "" : var.static_ips[0] + subnetwork = var.subnetwork_self_link } } -module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.8.2" - - access_config = var.enable_controller_public_ips ? 
[local.access_config] : [] - add_hostname_suffix = false - hostname = "${local.slurm_cluster_name}-controller" - instance_template = module.slurm_controller_template.self_link - - project_id = var.project_id - region = var.region - slurm_cluster_name = local.slurm_cluster_name - slurm_instance_role = "controller" - static_ips = var.static_ips - subnetwork = var.subnetwork_self_link - zone = var.zone - metadata = var.metadata - - labels = local.labels +moved { + from = module.slurm_controller_instance.google_compute_instance_from_template.slurm_instance[0] + to = google_compute_instance_from_template.controller } # SECRETS: CLOUDSQL diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/gpu_definition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/gpu_definition.tf index c6c3944332..6c5d96d286 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/gpu_definition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) # Select in priority order: # (1) var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [{ count = 0, type = "" }]) + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf index 998a8e0867..dd8e4699ec 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf @@ -14,7 +14,7 @@ # TEMPLATE module "slurm_login_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.2" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.5" for_each = { for x in var.login_nodes : x.name_prefix => x } @@ -56,7 +56,7 @@ module "slurm_login_template" { # INSTANCE module "slurm_login_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.8.2" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.8.5" for_each = { for x in var.login_nodes : x.name_prefix => x } access_config = each.value.access_config @@ -78,5 +78,5 @@ module "slurm_login_instance" { zone = each.value.zone # trigger replacement of login nodes when the controller instance is replaced - replace_trigger = module.slurm_controller_instance.instances_self_links[0] + replace_trigger = google_compute_instance_from_template.controller.self_link } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md index 3033d59f43..1b1db61cc4 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md +++ 
b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md @@ -66,7 +66,7 @@ No modules. | [bucket\_dir](#input\_bucket\_dir) | Bucket directory for cluster files to be put into. | `string` | `null` | no | | [bucket\_name](#input\_bucket\_name) | Name of GCS bucket to use. | `string` | n/a | yes | | [cgroup\_conf\_tpl](#input\_cgroup\_conf\_tpl) | Slurm cgroup.conf template file path. | `string` | `null` | no | -| [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. Default behavior defined in scripts/conf.py |
object({
no_comma_params = optional(bool)
resume_rate = optional(number)
resume_timeout = optional(number)
suspend_rate = optional(number)
suspend_timeout = optional(number)
topology_plugin = optional(string)
topology_param = optional(string)
tree_width = optional(number)
})
| `{}` | no | +| [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. Default behavior defined in scripts/conf.py |
object({
no_comma_params = optional(bool, false)
private_data = optional(list(string))
scheduler_parameters = optional(list(string))
resume_rate = optional(number)
resume_timeout = optional(number)
suspend_rate = optional(number)
suspend_timeout = optional(number)
topology_plugin = optional(string)
topology_param = optional(string)
tree_width = optional(number)
})
| `{}` | no | | [cloudsql\_secret](#input\_cloudsql\_secret) | Secret URI to cloudsql secret. | `string` | `null` | no | | [compute\_startup\_scripts](#input\_compute\_startup\_scripts) | List of scripts to be run on compute VM startup. |
list(object({
filename = string
content = string
}))
| `[]` | no | | [compute\_startup\_scripts\_timeout](#input\_compute\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in compute\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/files/setup_external.sh b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/files/setup_external.sh index c21f7cbdbd..21454bd52c 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/files/setup_external.sh +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/files/setup_external.sh @@ -41,6 +41,8 @@ if [ ! -f "${SLURM_EXTERNAL_ROOT}/${SLURM_MUX_FILE}" ]; then # See the License for the specific language governing permissions and # limitations under the License. +set -e + CMD="${0##*/}" # Locate script BASE=$(readlink -f $0) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py index 120ae7f1e8..7ee06332f1 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py @@ -83,9 +83,6 @@ def get(key, default): any_dynamic = any(bool(p.partition_feature) for p in lkp.cfg.partitions.values()) comma_params = { - "PrivateData": [ - "cloud", - ], "LaunchParameters": [ "enable_nss_slurm", "use_interactive_step", @@ -95,11 +92,6 @@ def get(key, default): "enable_configless", "idle_on_node_suspend", ], - "SchedulerParameters": [ - "bf_continue", - "salloc_wait_nodes", - "ignore_prefer_validation", - ], "GresTypes": [ "gpu" if any_gpus else None, ], @@ -114,11 +106,17 @@ def get(key, default): **(comma_params if not no_comma_params else {}), "Prolog": f"{prolog_path}/*" if lkp.cfg.prolog_scripts else None, "Epilog": f"{epilog_path}/*" if lkp.cfg.epilog_scripts else None, - "SuspendProgram": f"{scripts_dir}/suspend.py", + "PrivateData": get("private_data", []), + "SchedulerParameters": get("scheduler_parameters", [ + "bf_continue", + "salloc_wait_nodes", + "ignore_prefer_validation", + ]), "ResumeProgram": f"{scripts_dir}/resume.py", "ResumeFailProgram": f"{scripts_dir}/suspend.py", "ResumeRate": get("resume_rate", 0), "ResumeTimeout": get("resume_timeout", 300), + "SuspendProgram": f"{scripts_dir}/suspend.py", "SuspendRate": get("suspend_rate", 0), "SuspendTimeout": get("suspend_timeout", 300), "TreeWidth": get("tree_width", default_tree_width), diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py index 1bc1150c58..229c8b60b7 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py @@ -15,9 +15,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
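The cloud_parameters changes above (and the matching conf.py hunk) make PrivateData and SchedulerParameters configurable per deployment while keeping the previously hard-coded values as defaults when the new fields are left unset. Below is a minimal, self-contained sketch of that fallback-and-join behavior; the resolve/render helpers and the plain dict standing in for the parsed cloud_parameters are illustrative stand-ins, not the toolkit's actual conf.py API.

# Illustrative sketch only (not the toolkit's conf.py): an unset field falls
# back to the previous hard-coded default, an explicit list overrides it, and
# an empty default means the slurm.conf line is simply omitted.
from typing import List, Optional

def resolve(cloud_parameters: dict, key: str, default: List[str]) -> List[str]:
    value = cloud_parameters.get(key)
    return default if value is None else value

def render(name: str, values: List[str]) -> Optional[str]:
    # slurm.conf expects comma-separated values, e.g. SchedulerParameters=a,b,c
    return f"{name}={','.join(values)}" if values else None

params = {"scheduler_parameters": ["bf_busy_nodes", "bf_continue"]}

print(render("SchedulerParameters",
             resolve(params, "scheduler_parameters",
                     ["bf_continue", "salloc_wait_nodes", "ignore_prefer_validation"])))
# -> SchedulerParameters=bf_busy_nodes,bf_continue

print(render("PrivateData", resolve(params, "private_data", [])))
# -> None: the line is omitted, matching the new default of no PrivateData entry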
-from typing import List +from typing import List, Optional, Dict import argparse import collections +from datetime import timedelta +import shlex import json import logging import os @@ -57,7 +59,7 @@ BULK_INSERT_LIMIT = 5000 -def instance_properties(nodeset, model, placement_group, labels=None): +def instance_properties(nodeset:object, model:str, placement_group:Optional[str], labels:Optional[dict], job_id:Optional[int]): props = NSDict() if labels: # merge in extra labels on instance and disks @@ -99,18 +101,28 @@ def instance_properties(nodeset, model, placement_group, labels=None): props.scheduling.maintenanceInterval = nodeset.maintenance_interval if nodeset.dws_flex.enabled: - update_props_dws(props,nodeset.dws_flex) + update_props_dws(props, nodeset.dws_flex, job_id) # Override with properties explicit specified in the nodeset props.update(nodeset.get("instance_properties") or {}) return props -def update_props_dws(props:dict,dws_flex:dict) -> None: +def update_props_dws(props:object, dws_flex:object, job_id: Optional[int]) -> None: props.scheduling.onHostMaintenance = "TERMINATE" props.scheduling.instanceTerminationAction = "DELETE" - props.scheduling.maxRunDuration['seconds'] = dws_flex.max_run_duration props.reservationAffinity['consumeReservationType'] = "NO_RESERVATION" + props.scheduling.maxRunDuration['seconds'] = dws_flex_duration(dws_flex, job_id) + +def dws_flex_duration(dws_flex:object, job_id: Optional[int]) -> int: + max_duration = dws_flex.max_run_duration + if dws_flex.use_job_duration and job_id is not None and (job := lookup().job(job_id)) and job.duration: + if timedelta(seconds=30) <= job.duration <= timedelta(weeks=2): + max_duration = int(job.duration.total_seconds()) + else: + log.info("Job TimeLimit cannot be less than 30 seconds or exceed 2 weeks") + return max_duration + def per_instance_properties(node): props = NSDict() @@ -120,11 +132,7 @@ def per_instance_properties(node): def create_instances_request(nodes, partition_name, placement_group, job_id=None): """Call regionInstances.bulkInsert to create instances""" - assert len(nodes) > 0 - if placement_group: - assert len(nodes) <= min(PLACEMENT_MAX_CNT, BULK_INSERT_LIMIT) - else: - assert len(nodes) <= BULK_INSERT_LIMIT + assert 0 < len(nodes) <= BULK_INSERT_LIMIT # model here indicates any node that can be used to describe the rest model = next(iter(nodes)) @@ -134,8 +142,14 @@ def create_instances_request(nodes, partition_name, placement_group, job_id=None log.debug(f"create_instances_request: {model} placement: {placement_group}") body = NSDict() + body.count = len(nodes) - body.minCount = 1 + + if placement_group: + assert len(nodes) <= PLACEMENT_MAX_CNT + pass # do not set minCount to force "all or nothing" behavior + else: + body.minCount = 1 # source of instance properties body.sourceInstanceTemplate = template @@ -147,7 +161,7 @@ def create_instances_request(nodes, partition_name, placement_group, job_id=None ) # overwrites properties across all instances body.instanceProperties = instance_properties( - nodeset, model, placement_group, labels + nodeset, model, placement_group, labels, job_id ) # key is instance name, value overwrites properties @@ -467,7 +481,9 @@ def down_nodes(nodelist, reason): if isinstance(nodelist, list): nodelist = util.to_hostlist(nodelist) update_job_comment(nodelist, reason) - run(f"{lookup().scontrol} update nodename={nodelist} state=down reason='{reason}'") + reason_quoted = shlex.quote(reason) + log.error(f"Marking nodes {nodelist} as DOWN, reason: {reason}") + 
run(f"{lookup().scontrol} update nodename={nodelist} state=down reason={reason_quoted}") def hold_job(job_id, reason): @@ -495,21 +511,25 @@ def create_placement_request(pg_name, region): return request -def create_placement_groups(node_list: list, job_id=0): +def create_placement_groups(node_list: List[str], job_id:int=0) -> Dict[str, List[str]]: pgs = {} node_map = lookup().nodeset_map(node_list) for _, nodes in node_map.items(): - pgs.update(create_nodeset_placement_groups(nodes, job_id=job_id)) + pgs.update(create_nodeset_placement_groups(nodes, job_id)) return pgs -def create_nodeset_placement_groups(node_list: list, job_id=0): +def create_nodeset_placement_groups(node_list: List[str], job_id:int) -> Dict[str, List[str]]: + no_pg = {None: node_list} # canned result for no placement policies created + + if len(node_list) < 2: + return no_pg # don't create placement_policy for just one node + model = next(iter(node_list)) nodeset = lookup().node_nodeset(model) - if not nodeset.enable_placement: - return {None: node_list} - if not valid_placement_nodes(node_list): - return {None: node_list} + if not (nodeset.enable_placement and valid_placement_nodes(node_list)): + return no_pg + region = lookup().node_region(model) groups = { @@ -525,8 +545,7 @@ def create_nodeset_placement_groups(node_list: list, job_id=0): f"creating {len(groups)} placement groups: \n{yaml.safe_dump(debug_groups).rstrip()}" ) requests = { - group: create_placement_request(group, region) - for group, incl_nodes in groups.items() + group: create_placement_request(group, region) for group in groups.keys() } ops = dict( zip(requests.keys(), map_with_futures(ensure_execute, requests.values())) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py index 37532f6285..46b86e77a6 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py @@ -83,7 +83,7 @@ SSSSSSSSSSSS SSS SSSSSSSSSSSSS SSSS SSSS SSSS SSSS """ - +_MAINTENANCE_SBATCH_SCRIPT_PATH = dirs.custom_scripts / "perform_maintenance.sh" def start_motd(): """advise in motd that slurm is currently configuring""" @@ -224,6 +224,26 @@ def setup_sudoers(): sudoers_file.chmod(0o0440) +def setup_maintenance_script(): + perform_maintenance = """#!/bin/bash + +#SBATCH --priority=low +#SBATCH --time=180 + +VM_NAME=$(curl -s "http://metadata.google.internal/computeMetadata/v1/instance/name" -H "Metadata-Flavor: Google") +ZONE=$(curl -s "http://metadata.google.internal/computeMetadata/v1/instance/zone" -H "Metadata-Flavor: Google" | cut -d '/' -f 4) + +gcloud compute instances perform-maintenance $VM_NAME \ + --zone=$ZONE +""" + + + with open(_MAINTENANCE_SBATCH_SCRIPT_PATH, "w") as f: + f.write(perform_maintenance) + + util.chown_slurm(_MAINTENANCE_SBATCH_SCRIPT_PATH, mode=0o755) + + def update_system_config(file, content): """Add system defaults options for service files""" sysconfig = Path("/etc/sysconfig") @@ -279,10 +299,10 @@ def configure_mysql(): def configure_dirs(): for p in dirs.values(): util.mkdirp(p) - + for p in (dirs.slurm, dirs.scripts, dirs.custom_scripts): util.chown_slurm(p) - + for p in slurmdirs.values(): util.mkdirp(p) util.chown_slurm(p) @@ -357,6 +377,9 @@ def setup_controller(): run("systemctl start slurm_load_bq.timer", timeout=30) run("systemctl status 
slurm_load_bq.timer", timeout=30) + # Add script to perform maintenance + setup_maintenance_script() + log.info("Done setting up controller") pass @@ -400,7 +423,7 @@ def setup_compute(): slurmd_options = [ f'--conf-server="{slurmctld_host}:{lookup().control_host_port}"', ] - + try: slurmd_feature = util.instance_metadata("attributes/slurmd_feature") except Exception: @@ -439,7 +462,7 @@ def setup_compute(): def main(): start_motd() - + log.info("Starting setup, fetching config") sleep_seconds = 5 while True: diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py index 112e2d5748..1bd876a56f 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py @@ -28,6 +28,7 @@ import datetime as dt from datetime import datetime from typing import Dict, Tuple +from functools import lru_cache import util from util import ( @@ -41,6 +42,7 @@ NSDict, TPU, chunked, + dirs, ) from util import lookup from suspend import delete_instances @@ -50,7 +52,7 @@ log = logging.getLogger() TOT_REQ_CNT = 1000 - +_MAINTENANCE_SBATCH_SCRIPT_PATH = dirs.custom_scripts / "perform_maintenance.sh" NodeStatus = Enum( "NodeStatus", @@ -334,13 +336,14 @@ def sync_placement_groups(): "STOPPED", "SUSPENDED", "COMPLETING", + "PENDING", ] ) keep_jobs = { - str(job["job_id"]) - for job in json.loads(run(f"{lookup().scontrol} show jobs --json").stdout)["jobs"] - if "job_state" in job and set(job["job_state"]) & keep_states + str(job.id) + for job in lookup().get_jobs() + if job.job_state in keep_states } keep_jobs.add("0") # Job 0 is a placeholder for static node placement @@ -350,7 +353,7 @@ def sync_placement_groups(): op = act.aggregatedList(project=lookup().project, fields=fields, filter=flt) placement_groups = {} pg_regex = re.compile( - rf"{lookup().cfg.slurm_cluster_name}-(?P[^\s\-]+)-(?P\d+)-(?P\d+)" + rf"{lookup().cfg.slurm_cluster_name}-slurmgcp-managed-(?P[^\s\-]+)-(?P\d+)-(?P\d+)" ) while op is not None: result = ensure_execute(op) @@ -482,7 +485,7 @@ def get_slurm_reservation_maintenance(lkp: util.Lookup) -> Dict[str, datetime]: return reservation_map - +@lru_cache def get_upcoming_maintenance(lkp: util.Lookup) -> Dict[str, Tuple[str, datetime]]: upc_maint_map = {} @@ -534,6 +537,65 @@ def sync_maintenance_reservation(lkp: util.Lookup) -> None: create_reservation(lkp, res_name, node, start_time) +def delete_maintenance_job(job_name: str) -> None: + util.run(f"scancel --name={job_name}") + + +def create_maintenance_job(job_name: str, node: str) -> None: + util.run(f"sbatch --job-name={job_name} --nodelist={node} {_MAINTENANCE_SBATCH_SCRIPT_PATH}") + + +def get_slurm_maintenance_job(lkp: util.Lookup) -> Dict[str, str]: + jobs = {} + + for job in lkp.get_jobs(): + if job.name is None or job.required_nodes is None or job.job_state is None: + continue + + if job.name != f"{job.required_nodes}_maintenance": + continue + + if job.job_state != "PENDING": + continue + + jobs[job.name] = job.required_nodes + + return jobs + + +def sync_opportunistic_maintenance(lkp: util.Lookup) -> None: + upc_maint_map = get_upcoming_maintenance(lkp) # map job_name -> (node_name, time) + log.debug(f"upcoming-maintenance-vms: {upc_maint_map}") + + curr_jobs = get_slurm_maintenance_job(lkp) # map job_name -> node. 
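sync_opportunistic_maintenance (continued below) reconciles the two maps just built: pending "<node>_maintenance" jobs on one side and upcoming maintenance windows on the other. The following is a compact standalone sketch of that reconciliation, assuming plain dicts and a simple per-node opt-in set rather than the real Lookup and nodeset objects.

# Standalone sketch of the reconciliation step; job names follow the
# "<node>_maintenance" convention used by get_slurm_maintenance_job above.
from typing import Dict, Set, Tuple

def reconcile(upcoming: Dict[str, Tuple[str, object]],
              pending: Dict[str, str],
              opted_in: Set[str]) -> Tuple[Set[str], Dict[str, str]]:
    # Cancel pending maintenance jobs whose upcoming window disappeared.
    to_cancel = set(pending) - set(upcoming)
    to_create: Dict[str, str] = {}
    for job_name, (node, _start) in upcoming.items():
        if node not in opted_in:
            if job_name in pending:       # node opted out while a job is queued
                to_cancel.add(job_name)
            continue
        if job_name not in pending:       # upcoming window with no queued job yet
            to_create[job_name] = node
    return to_cancel, to_create

cancel, create = reconcile(
    upcoming={"a-0_maintenance": ("a-0", "2024-09-01T03:00:00Z")},
    pending={"b-1_maintenance": "b-1"},
    opted_in={"a-0"},
)
print(cancel)  # {'b-1_maintenance'}
print(create)  # {'a-0_maintenance': 'a-0'}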
+ log.debug(f"curr-maintenance-job-map: {curr_jobs}") + + del_jobs = set(curr_jobs.keys() - upc_maint_map.keys()) + create_jobs = {} + + for job_name, (node, _) in upc_maint_map.items(): + try: + enabled = lkp.node_nodeset(node).enable_opportunistic_maintenance + except Exception: + enabled = False + + if not enabled: + if job_name in curr_jobs: + del_jobs.add(job_name) + continue + + if job_name not in curr_jobs: + create_jobs[job_name] = node + + log.debug(f"del-maintenance-job: {del_jobs}") + for job_name in del_jobs: + delete_maintenance_job(job_name) + + log.debug(f"create-maintenance-job: {create_jobs}") + for job_name, node in create_jobs.items(): + create_maintenance_job(job_name, node) + + def main(): try: reconfigure_slurm() @@ -561,6 +623,12 @@ def main(): except Exception: log.exception("failed to sync slurm reservation for scheduled maintenance") + try: + sync_opportunistic_maintenance(lookup()) + except Exception: + log.exception("failed to sync opportunistic reservation for scheduled maintenance") + + try: # TODO: it performs 1 to 4 GCS list requests, # use cached version, combine with `_list_config_blobs` diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_conf.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_conf.py index 0b25b0df58..6585b2fcd1 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_conf.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_conf.py @@ -88,15 +88,14 @@ def test_dict_to_conf(value: dict, want: str): (TstCfg( install_dir="ukulele", ), - """PrivateData=cloud -LaunchParameters=enable_nss_slurm,use_interactive_step + """LaunchParameters=enable_nss_slurm,use_interactive_step SlurmctldParameters=cloud_dns,enable_configless,idle_on_node_suspend SchedulerParameters=bf_continue,salloc_wait_nodes,ignore_prefer_validation -SuspendProgram=ukulele/suspend.py ResumeProgram=ukulele/resume.py ResumeFailProgram=ukulele/suspend.py ResumeRate=0 ResumeTimeout=300 +SuspendProgram=ukulele/suspend.py SuspendRate=0 SuspendTimeout=300 TreeWidth=128 @@ -106,6 +105,8 @@ def test_dict_to_conf(value: dict, want: str): install_dir="ukulele", cloud_parameters={ "no_comma_params": True, + "private_data": None, + "scheduler_parameters": None, "resume_rate": None, "resume_timeout": None, "suspend_rate": None, @@ -115,11 +116,12 @@ def test_dict_to_conf(value: dict, want: str): "tree_width": None, }, ), - """SuspendProgram=ukulele/suspend.py + """SchedulerParameters=bf_continue,salloc_wait_nodes,ignore_prefer_validation ResumeProgram=ukulele/resume.py ResumeFailProgram=ukulele/suspend.py ResumeRate=0 ResumeTimeout=300 +SuspendProgram=ukulele/suspend.py SuspendRate=0 SuspendTimeout=300 TreeWidth=128 @@ -129,6 +131,16 @@ def test_dict_to_conf(value: dict, want: str): install_dir="ukulele", cloud_parameters={ "no_comma_params": True, + "private_data": [ + "events", + "jobs", + ], + "scheduler_parameters": [ + "bf_busy_nodes", + "bf_continue", + "ignore_prefer_validation", + "nohold_on_prolog_fail", + ], "resume_rate": 1, "resume_timeout": 2, "suspend_rate": 3, @@ -138,11 +150,13 @@ def test_dict_to_conf(value: dict, want: str): "tree_width": 5, }, ), - """SuspendProgram=ukulele/suspend.py + """PrivateData=events,jobs +SchedulerParameters=bf_busy_nodes,bf_continue,ignore_prefer_validation,nohold_on_prolog_fail ResumeProgram=ukulele/resume.py 
ResumeFailProgram=ukulele/suspend.py ResumeRate=1 ResumeTimeout=2 +SuspendProgram=ukulele/suspend.py SuspendRate=3 SuspendTimeout=4 TreeWidth=5 diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py index 14b7a7bf62..4104e948c5 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py @@ -16,6 +16,7 @@ from mock import Mock from common import TstNodeset, TstCfg # needed to import util import util +from datetime import timedelta from google.api_core.client_options import ClientOptions # noqa: E402 # Note: need to install pytest-mock @@ -158,14 +159,14 @@ def test_nodeset_reservation_err(nodeset, err): with pytest.raises(err): lkp.nodeset_reservation(nodeset) lkp._get_reservation.assert_not_called() - + @pytest.mark.parametrize( "nodeset,policies,expected", [ (TstNodeset(), [], None), # no reservation (TstNodeset( reservation_name="projects/bobin/reservations/robin", - zone_policy_allow=["eine"]), + zone_policy_allow=["eine"]), [], util.ReservationDetails( project="bobin", @@ -175,7 +176,7 @@ def test_nodeset_reservation_err(nodeset, err): bulk_insert_name="projects/bobin/reservations/robin")), (TstNodeset( reservation_name="projects/bobin/reservations/robin", - zone_policy_allow=["eine"]), + zone_policy_allow=["eine"]), ["seven/wanders", "five/red/apples", "yum"], util.ReservationDetails( project="bobin", @@ -185,7 +186,7 @@ def test_nodeset_reservation_err(nodeset, err): bulk_insert_name="projects/bobin/reservations/robin")), (TstNodeset( reservation_name="projects/bobin/reservations/robin/snek/cheese-brie-6", - zone_policy_allow=["eine"]), + zone_policy_allow=["eine"]), [], util.ReservationDetails( project="bobin", @@ -199,16 +200,90 @@ def test_nodeset_reservation_err(nodeset, err): def test_nodeset_reservation_ok(nodeset, policies, expected): lkp = util.Lookup(TstCfg()) lkp._get_reservation = Mock() - + if not expected: assert lkp.nodeset_reservation(nodeset) is None lkp._get_reservation.assert_not_called() return - + lkp._get_reservation.return_value = { "resourcePolicies": {i: p for i, p in enumerate(policies)}, } assert lkp.nodeset_reservation(nodeset) == expected lkp._get_reservation.assert_called_once_with(expected.project, expected.zone, expected.name) - - + + +@pytest.mark.parametrize( + "job_info,expected_job", + [ + ( + """JobId=123 + TimeLimit=02:00:00 + JobName=myjob + JobState=PENDING + ReqNodeList=node-[1-10]""", + util.Job( + id=123, + duration=timedelta(days=0, hours=2, minutes=0, seconds=0), + name="myjob", + job_state="PENDING", + required_nodes="node-[1-10]" + ), + ), + ( + """JobId=456 + JobName=anotherjob + JobState=PENDING + ReqNodeList=node-group1""", + util.Job( + id=456, + duration=None, + name="anotherjob", + job_state="PENDING", + required_nodes="node-group1" + ), + ), + ( + """JobId=789 + TimeLimit=00:30:00 + JobState=COMPLETED""", + util.Job( + id=789, + duration=timedelta(minutes=30), + name=None, + job_state="COMPLETED", + required_nodes=None + ), + ), + ( + """JobId=101112 + TimeLimit=1-00:30:00 + JobState=COMPLETED, + ReqNodeList=node-[1-10],grob-pop-[2,1,44-77]""", + util.Job( + id=101112, + duration=timedelta(days=1, hours=0, minutes=30, seconds=0), + name=None, + job_state="COMPLETED", + 
required_nodes="node-[1-10],grob-pop-[2,1,44-77]" + ), + ), + ( + """JobId=131415 + TimeLimit=1-00:30:00 + JobName=mynode-1_maintenance + JobState=COMPLETED, + ReqNodeList=node-[1-10],grob-pop-[2,1,44-77]""", + util.Job( + id=131415, + duration=timedelta(days=1, hours=0, minutes=30, seconds=0), + name="mynode-1_maintenance", + job_state="COMPLETED", + required_nodes="node-[1-10],grob-pop-[2,1,44-77]" + ), + ), + ], +) +def test_parse_job_info(job_info, expected_job): + lkp = util.Lookup(TstCfg()) + assert lkp._parse_job_info(job_info) == expected_job diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tools/gpu-test b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tools/gpu-test new file mode 100644 index 0000000000..0aaaeb2fc0 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tools/gpu-test @@ -0,0 +1,91 @@ +#!/bin/bash +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Fail gracefully if nvidia-smi or dcgmi doesn't exist +if ! type -P nvidia-smi 1>/dev/null; then + echo "nvidia-smi not found - this script requires nvidia-smi to function" >&2 + exit 0 +fi + +if ! type -P dcgmi 1>/dev/null; then + echo "dcgmi not found - this script requires dcgmi to function" >&2 + exit 0 +fi + +if ! type -P nv-hostengine 1>/dev/null; then + echo "nv-hostengine not found - this script requires nv-hostengine to function" >&2 + exit 0 +fi + +# Exit if GPU isn't H100 +GPU_MODEL=$(nvidia-smi --query-gpu=name --format=csv,noheader) +if [[ "$GPU_MODEL" != *"H100"* ]]; then + echo "Non-H100 GPU detected" >&2 + exit 0 +fi + +NUMGPUS=$(nvidia-smi -L | wc -l) + +# Check that all GPUs are healthy via DCGM and check for ECC errors +if [ $NUMGPUS -gt 0 ]; then + echo "Execute DCGM health check and ECC error check for GPUs" + GPULIST=$(nvidia-smi --query-gpu=index --format=csv,noheader | tr '\n' ',' | sed 's/,$//') + rm -f /tmp/dcgm.out + rm -f /tmp/ecc_errors.out + + # Run DCGM checks + START_HOSTENGINE=false + if ! pidof nv-hostengine > /dev/null; then + echo "Starting nv-hostengine..." + nv-hostengine + sleep 1 # Give it a moment to start up + START_HOSTENGINE=true + fi + GROUPID=$(dcgmi group -c gpuinfo | awk '{print $NF}' | tr -d ' ') + dcgmi group -g $GROUPID -a $GPULIST + dcgmi diag -g $GROUPID -r 1 1> /tmp/dcgm.out + dcgmi group -d $GROUPID + + # Terminate the host engine if it was manually started + if [ "$START_HOSTENGINE" = true ]; then + echo "Terminating nv-hostengine..." + nv-hostengine -t + fi + + # Check for DCGM failures + DCGM_FAILED=0 + grep -i fail /tmp/dcgm.out > /dev/null || DCGM_FAILED=$? 
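The new test cases above exercise Slurm's TimeLimit format ([days-]HH:MM:SS), which util._parse_job_info turns into a timedelta and which resume.py's dws_flex_duration consumes when use_job_duration is set, clamping to the 30-second-to-2-week window noted in its log message. A small sketch of that parse-and-clamp path follows; it reuses the same regex shape but with standalone helper names that are illustrative, not the toolkit's actual API.

# Standalone sketch: parse a Slurm TimeLimit and clamp it to the DWS Flex
# bounds shown in resume.py (30 seconds to 2 weeks).
import re
from datetime import timedelta
from typing import Optional

def parse_time_limit(job_info: str) -> Optional[timedelta]:
    m = re.search(r"TimeLimit=(?:(\d+)-)?(\d{2}):(\d{2}):(\d{2})", job_info)
    if not m:
        return None
    days, hours, minutes, seconds = m.groups()
    return timedelta(days=int(days or 0), hours=int(hours),
                     minutes=int(minutes), seconds=int(seconds))

def flex_duration(default_seconds: int, duration: Optional[timedelta]) -> int:
    if duration and timedelta(seconds=30) <= duration <= timedelta(weeks=2):
        return int(duration.total_seconds())
    return default_seconds  # out-of-range or missing TimeLimit keeps the nodeset default

d = parse_time_limit("JobId=123 TimeLimit=1-00:30:00 JobState=PENDING")
print(d, flex_duration(3600, d))  # 1 day, 0:30:00  88200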
+ + # Check for ECC errors + nvidia-smi --query-gpu=ecc.errors.uncorrected.volatile.total --format=csv,noheader > /tmp/ecc_errors.out + ECC_ERRORS=$(awk -F', ' '{sum += $2} END {print sum}' /tmp/ecc_errors.out) + + # Check for NVLink errors + NVLINK_ERRORS=$(nvidia-smi nvlink -sc 0bz -i 0 2>/dev/null | grep -i "Error Count" | awk '{sum += $3} END {print sum}') + # Set to 0 if empty/null + NVLINK_ERRORS=${NVLINK_ERRORS:-0} + + if [ $DCGM_FAILED -eq 0 ] || \ + [ $ECC_ERRORS -gt 0 ] || \ + [ $NVLINK_ERRORS -gt 0 ]; then + REASON="H100 GPU issues detected: " + [ $DCGM_FAILED -eq 0 ] && REASON+="DCGM test failed, " + [ $ECC_ERRORS -gt 0 ] && REASON+="ECC errors found ($ECC_ERRORS double-bit errors), " + [ $NVLINK_ERRORS -gt 0 ] && REASON+="NVLink errors detected ($NVLINK_ERRORS errors), " + REASON+="see /tmp/dcgm.out and /tmp/ecc_errors.out" + echo "$REASON" + exit 1 + fi +fi diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index 8467e300e2..999c454090 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -19,6 +19,7 @@ import base64 import collections from dataclasses import dataclass +from datetime import timedelta import hashlib import inspect import json @@ -410,7 +411,7 @@ def _fill_cfg_defaults(cfg: NSDict) -> NSDict: "mount_options": "defaults,hard,intr,_netdev", } ) - + network_storage_iter = filter( None, ( @@ -453,7 +454,7 @@ def _list_config_blobs() -> Tuple[Any, str]: if res["core"] is None: raise DeffetiveStoredConfigError("config.yaml not found in bucket") return res, hash.hexdigest() - + def _fetch_config(old_hash: Optional[str]) -> Optional[Tuple[NSDict, str]]: """Fetch config from bucket, returns None if no changes are detected.""" @@ -473,8 +474,8 @@ def _download(bs) -> List[Any]: ), hash def _assemble_config( - core: Any, - partitions: List[Any], + core: Any, + partitions: List[Any], nodesets: List[Any], nodesets_dyn: List[Any], nodesets_tpu: List[Any], @@ -509,17 +510,17 @@ def _add_nodesets(yamls: List[Any], target: dict): for ns_name in chain(p.partition_nodeset, p.partition_nodeset_dyn, p.partition_nodeset_tpu): if ns_name not in ns_names: raise DeffetiveStoredConfigError(f"nodeset {ns_name} not defined in config") - + return _fill_cfg_defaults(cfg) def fetch_config() -> Tuple[bool, NSDict]: """ - Fetches config from bucket and saves it locally + Fetches config from bucket and saves it locally Returns True if new (updated) config was fetched """ hash_file = Path("/slurm/scripts/.config.hash") old_hash = hash_file.read_text() if hash_file.exists() else None - + cfg_and_hash = _fetch_config(old_hash=old_hash) if not cfg_and_hash: return False, _load_config() @@ -1147,6 +1148,10 @@ def machine_type_sockets(template) -> int: "h3": 2, "c2d": 2 if guestCpus > 56 else 1, "a3": 2, + "c2": 2 if guestCpus > 30 else 1, + "c3": 2 if guestCpus > 88 else 1, + "c3d": 2 if guestCpus > 180 else 1, + "c4": 2 if guestCpus > 96 else 1, }.get( machine_type_family(template.machineType), 1, # assume 1 socket for all other families @@ -1155,7 +1160,12 @@ def machine_type_sockets(template) -> int: def isSmt(template) -> bool: # https://cloud.google.com/compute/docs/cpu-platforms - noSmtFamily = ("t2a", "t2d", "h3",) + noSmtFamily = ( + "t2a", + "t2d", + "h3", + "c4a", + ) if 
machine_type_family(template.machineType) in noSmtFamily: return False if template.machine_info.guestCpus == 1: @@ -1451,6 +1461,15 @@ class ReservationDetails: policies: List[str] # names (not URLs) of resource policies bulk_insert_name: str # name in format suitable for bulk insert (currently identical to user supplied name in long format) +@dataclass +class Job: + id: int + name: Optional[str] = None + required_nodes: Optional[str] = None + job_state: Optional[str] = None + duration: Optional[timedelta] = None + + class Lookup: """Wrapper class for cached data access""" @@ -1746,11 +1765,11 @@ def _get_reservation(self, project: str, zone: str, name: str) -> object: """See https://cloud.google.com/compute/docs/reference/rest/v1/reservations""" return self.compute.reservations().get( project=project, zone=zone, reservation=name).execute() - + def nodeset_reservation(self, nodeset: object) -> Optional[ReservationDetails]: if not nodeset.reservation_name: return None - + zones = list(nodeset.zone_policy_allow or []) assert len(zones) == 1, "Only single zone is supported if using a reservation" zone = zones[0] @@ -1760,7 +1779,7 @@ def nodeset_reservation(self, nodeset: object) -> Optional[ReservationDetails]: raise ValueError( f"Invalid reservation name: '{nodeset.reservation_name}', expected format is 'projects/PROJECT/reservations/NAME'" ) - + project, name = match.group("project", "reservation") reservation = self._get_reservation(project, zone, name) @@ -1917,6 +1936,55 @@ def nodeset_map(self, hostnames: list): nodeset_map[self.node_nodeset_name(node)].append(node) return nodeset_map + def _parse_job_info(self, job_info: str) -> Job: + """Extract job details""" + if match:= re.search(r"JobId=(\d+)", job_info): + job_id = int(match.group(1)) + else: + raise ValueError(f"Job ID not found in the job info: {job_info}") + + if match:= re.search(r"TimeLimit=(?:(\d+)-)?(\d{2}):(\d{2}):(\d{2})", job_info): + days, hours, minutes, seconds = match.groups() + duration = timedelta( + days=int(days) if days else 0, + hours=int(hours), + minutes=int(minutes), + seconds=int(seconds) + ) + else: + duration = None + + if match := re.search(r"JobName=([^\n]+)", job_info): + name = match.group(1) + else: + name = None + + if match := re.search(r"JobState=(\w+)", job_info): + job_state = match.group(1) + else: + job_state = None + + if match := re.search(r"ReqNodeList=([^ ]+)", job_info): + required_nodes = match.group(1) + else: + required_nodes = None + + return Job(id=job_id, duration=duration, name=name, job_state=job_state, required_nodes=required_nodes) + + @lru_cache + def get_jobs(self) -> List[Job]: + res = run(f"{self.scontrol} show jobs", timeout=30) + + return [self._parse_job_info(job) for job in res.stdout.split("\n\n")[:-1]] + + @lru_cache + def job(self, job_id: int) -> Optional[Job]: + job_info = run(f"{self.scontrol} show jobid {job_id}", check=False).stdout.rstrip() + if not job_info: + return None + + return self._parse_job_info(job_info=job_info) + @property def etc_dir(self) -> Path: return Path(self.cfg.output_dir or slurmdirs.etc) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf index 91026fc267..308a42e639 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf @@ -316,16 +316,19 @@ 
variable "partitions" { variable "cloud_parameters" { description = "cloud.conf options. Default behavior defined in scripts/conf.py" type = object({ - no_comma_params = optional(bool) - resume_rate = optional(number) - resume_timeout = optional(number) - suspend_rate = optional(number) - suspend_timeout = optional(number) - topology_plugin = optional(string) - topology_param = optional(string) - tree_width = optional(number) + no_comma_params = optional(bool, false) + private_data = optional(list(string)) + scheduler_parameters = optional(list(string)) + resume_rate = optional(number) + resume_timeout = optional(number) + suspend_rate = optional(number) + suspend_timeout = optional(number) + topology_plugin = optional(string) + topology_param = optional(string) + tree_width = optional(number) }) - default = {} + default = {} + nullable = false } ########## diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/outputs.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/outputs.tf index 06ffb93594..400a58b437 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/outputs.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/outputs.tf @@ -19,7 +19,7 @@ output "slurm_cluster_name" { output "slurm_controller_instance" { description = "Compute instance of controller node" - value = module.slurm_controller_instance.slurm_instances[0] + value = google_compute_instance_from_template.controller } output "slurm_login_instances" { @@ -36,6 +36,6 @@ output "instructions" { description = "Post deployment instructions." value = <<-EOT To SSH to the controller (may need to add '--tunnel-through-iap'): - gcloud compute ssh ${module.slurm_controller_instance.instances_self_links[0]} + gcloud compute ssh ${google_compute_instance_from_template.controller.self_link} EOT } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf index 0d05c71f91..71a44a7236 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf @@ -26,7 +26,7 @@ locals { # NODESET # TODO: remove dependency on slurm-gcp repo, move to local template module module "slurm_nodeset_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.2" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.5" for_each = local.nodeset_map project_id = var.project_id @@ -81,28 +81,29 @@ module "nodeset_cleanup" { locals { nodesets = [for name, ns in local.nodeset_map : { - nodeset_name = ns.nodeset_name - node_conf = ns.node_conf - dws_flex = ns.dws_flex - instance_template = module.slurm_nodeset_template[ns.nodeset_name].self_link - node_count_dynamic_max = ns.node_count_dynamic_max - node_count_static = ns.node_count_static - subnetwork = ns.subnetwork_self_link - reservation_name = ns.reservation_name - maintenance_interval = ns.maintenance_interval - instance_properties_json = ns.instance_properties_json - enable_placement = ns.enable_placement - network_storage = ns.network_storage - zone_target_shape = ns.zone_target_shape - zone_policy_allow = ns.zone_policy_allow - zone_policy_deny = ns.zone_policy_deny - enable_maintenance_reservation = ns.enable_maintenance_reservation + nodeset_name = ns.nodeset_name + node_conf = ns.node_conf + 
dws_flex = ns.dws_flex + instance_template = module.slurm_nodeset_template[ns.nodeset_name].self_link + node_count_dynamic_max = ns.node_count_dynamic_max + node_count_static = ns.node_count_static + subnetwork = ns.subnetwork_self_link + reservation_name = ns.reservation_name + maintenance_interval = ns.maintenance_interval + instance_properties_json = ns.instance_properties_json + enable_placement = ns.enable_placement + network_storage = ns.network_storage + zone_target_shape = ns.zone_target_shape + zone_policy_allow = ns.zone_policy_allow + zone_policy_deny = ns.zone_policy_deny + enable_maintenance_reservation = ns.enable_maintenance_reservation + enable_opportunistic_maintenance = ns.enable_opportunistic_maintenance }] } # NODESET TPU module "slurm_nodeset_tpu" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.8.2" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.8.5" for_each = local.nodeset_tpu_map project_id = var.project_id diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf index a5e30ea64d..8fcbe78ddc 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf @@ -24,7 +24,7 @@ locals { module "bucket" { source = "terraform-google-modules/cloud-storage/google" - version = "~> 5.0" + version = "~> 6.1" count = var.create_bucket ? 1 : 0 @@ -122,7 +122,7 @@ locals { module "daos_network_storage_scripts" { count = length(local.daos_ns) > 0 ? 1 : 0 - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.39.0&depth=1" + source = "../../../../modules/scripts/startup-script" labels = local.labels project_id = var.project_id deployment_name = var.deployment_name diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf index a86c28ffc2..a4a2579989 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf @@ -18,10 +18,10 @@ locals { # Currently supported images and projects known_project_families = { schedmd-slurm-public = [ - "slurm-gcp-6-7-debian-11", - "slurm-gcp-6-7-hpc-rocky-linux-8", - "slurm-gcp-6-7-ubuntu-2004-lts", - "slurm-gcp-6-7-ubuntu-2204-lts-arm64" + "slurm-gcp-6-8-debian-11", + "slurm-gcp-6-8-hpc-rocky-linux-8", + "slurm-gcp-6-8-ubuntu-2004-lts", + "slurm-gcp-6-8-ubuntu-2204-lts-arm64" ] } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf index 95e5c20d0a..b06d62b39f 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf @@ -196,18 +196,19 @@ variable "nodeset" { auto_delete = optional(bool, true) boot = optional(bool, false) })), []) - bandwidth_tier = optional(string, "platform_default") - can_ip_forward = optional(bool, false) - disable_smt = optional(bool, false) - disk_auto_delete = optional(bool, true) - disk_labels = optional(map(string), {}) - disk_size_gb = optional(number) - disk_type = optional(string) - 
enable_confidential_vm = optional(bool, false) - enable_placement = optional(bool, false) - enable_oslogin = optional(bool, true) - enable_shielded_vm = optional(bool, false) - enable_maintenance_reservation = optional(bool, true) + bandwidth_tier = optional(string, "platform_default") + can_ip_forward = optional(bool, false) + disable_smt = optional(bool, false) + disk_auto_delete = optional(bool, true) + disk_labels = optional(map(string), {}) + disk_size_gb = optional(number) + disk_type = optional(string) + enable_confidential_vm = optional(bool, false) + enable_placement = optional(bool, false) + enable_oslogin = optional(bool, true) + enable_shielded_vm = optional(bool, false) + enable_maintenance_reservation = optional(bool, false) + enable_opportunistic_maintenance = optional(bool, false) gpu = optional(object({ count = number type = string @@ -215,6 +216,7 @@ variable "nodeset" { dws_flex = object({ enabled = bool max_run_duration = number + use_job_duration = bool }) labels = optional(map(string), {}) machine_type = optional(string) @@ -406,23 +408,24 @@ EOD variable "cloud_parameters" { description = "cloud.conf options. Defaults inherited from [Slurm GCP repo](https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/terraform/slurm_cluster/modules/slurm_files/README_TF.md#input_cloud_parameters)" type = object({ - no_comma_params = optional(bool) - resume_rate = optional(number) - resume_timeout = optional(number) - suspend_rate = optional(number) - suspend_timeout = optional(number) - topology_plugin = optional(string) - topology_param = optional(string) - tree_width = optional(number) + no_comma_params = optional(bool, false) + private_data = optional(list(string)) + scheduler_parameters = optional(list(string)) + resume_rate = optional(number) + resume_timeout = optional(number) + suspend_rate = optional(number) + suspend_timeout = optional(number) + topology_plugin = optional(string) + topology_param = optional(string) + tree_width = optional(number) }) - default = {} + default = {} + nullable = false } variable "enable_default_mounts" { description = <<-EOD Enable default global network storage from the controller - - /usr/local/etc/slurm - - /etc/munge - /home - /apps Warning: If these are disabled, the slurm etc and munge dirs must be added diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf index 0df835e322..2d684b0e62 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf @@ -267,7 +267,7 @@ variable "instance_image" { EOD type = map(string) default = { - family = "slurm-gcp-6-7-hpc-rocky-linux-8" + family = "slurm-gcp-6-8-hpc-rocky-linux-8" project = "schedmd-slurm-public" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf index c1fc007bf0..1a4982e158 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-controller/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-controller/v1.42.0" } } diff --git 
a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md index 4ad20a6352..eff81b42c1 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md @@ -5,9 +5,9 @@ This module creates a login node for a Slurm cluster based on the terraform modules. The login node is used in conjunction with the [Slurm controller](../schedmd-slurm-gcp-v5-controller/README.md). -[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2 -[slurm\_login\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2/terraform/slurm_cluster/modules/slurm_login_instance -[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2/terraform/slurm_cluster/modules/slurm_instance_template +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.5 +[slurm\_login\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.5/terraform/slurm_cluster/modules/slurm_login_instance +[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.5/terraform/slurm_cluster/modules/slurm_instance_template ### Example @@ -53,7 +53,7 @@ modules. For support with the underlying modules, see the instructions in the [slurm-gcp README][slurm-gcp-readme]. [slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/7 -[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2#slurm-on-google-cloud-platform +[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.5#slurm-on-google-cloud-platform ## Requirements @@ -100,7 +100,7 @@ No modules. | [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [enable\_smt](#input\_enable\_smt) | Enables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `false` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-7-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-8-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [instance\_template](#input\_instance\_template) | DEPRECATED: Instance template can not be specified for login nodes. | `string` | `null` | no | | [labels](#input\_labels) | Labels, provided as a map. | `map(string)` | `{}` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf index c6c3944332..6c5d96d286 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) # Select in priority order: # (1) var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [{ count = 0, type = "" }]) + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf index a86c28ffc2..a4a2579989 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf @@ -18,10 +18,10 @@ locals { # Currently supported images and projects known_project_families = { schedmd-slurm-public = [ - "slurm-gcp-6-7-debian-11", - "slurm-gcp-6-7-hpc-rocky-linux-8", - "slurm-gcp-6-7-ubuntu-2004-lts", - "slurm-gcp-6-7-ubuntu-2204-lts-arm64" + "slurm-gcp-6-8-debian-11", + "slurm-gcp-6-8-hpc-rocky-linux-8", + "slurm-gcp-6-8-ubuntu-2004-lts", + "slurm-gcp-6-8-ubuntu-2204-lts-arm64" ] } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf index 2b53c8f9e5..104b9f4a33 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf @@ -325,7 +325,7 @@ variable "instance_image" { EOD type = map(string) default = { - family = "slurm-gcp-6-7-hpc-rocky-linux-8" + family = "slurm-gcp-6-8-hpc-rocky-linux-8" project = "schedmd-slurm-public" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf index dbcebd21c1..74c4e35664 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-login/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-login/v1.42.0" } } diff --git a/community/modules/scripts/ramble-execute/README.md b/community/modules/scripts/ramble-execute/README.md index 8b9fa844c6..55c2fc7e4e 100644 --- a/community/modules/scripts/ramble-execute/README.md +++ b/community/modules/scripts/ramble-execute/README.md @@ -77,7 
+77,7 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.39.0&depth=1 | +| [startup\_script](#module\_startup\_script) | ../../../../modules/scripts/startup-script | n/a | ## Resources diff --git a/community/modules/scripts/ramble-execute/main.tf b/community/modules/scripts/ramble-execute/main.tf index d2470e3821..7ef0b029e3 100644 --- a/community/modules/scripts/ramble-execute/main.tf +++ b/community/modules/scripts/ramble-execute/main.tf @@ -55,7 +55,7 @@ locals { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.39.0&depth=1" + source = "../../../../modules/scripts/startup-script" labels = local.labels project_id = var.project_id diff --git a/community/modules/scripts/ramble-setup/README.md b/community/modules/scripts/ramble-setup/README.md index 09f8c5511d..9891088105 100644 --- a/community/modules/scripts/ramble-setup/README.md +++ b/community/modules/scripts/ramble-setup/README.md @@ -86,7 +86,7 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.39.0&depth=1 | +| [startup\_script](#module\_startup\_script) | ../../../../modules/scripts/startup-script | n/a | ## Resources diff --git a/community/modules/scripts/ramble-setup/main.tf b/community/modules/scripts/ramble-setup/main.tf index 205c980c03..4389af7d33 100644 --- a/community/modules/scripts/ramble-setup/main.tf +++ b/community/modules/scripts/ramble-setup/main.tf @@ -97,7 +97,7 @@ resource "google_storage_bucket" "bucket" { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.39.0&depth=1" + source = "../../../../modules/scripts/startup-script" labels = local.labels project_id = var.project_id diff --git a/community/modules/scripts/spack-execute/README.md b/community/modules/scripts/spack-execute/README.md index 1e1a8a78ee..8cbb75fb42 100644 --- a/community/modules/scripts/spack-execute/README.md +++ b/community/modules/scripts/spack-execute/README.md @@ -104,7 +104,7 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.39.0&depth=1 | +| [startup\_script](#module\_startup\_script) | ../../../../modules/scripts/startup-script | n/a | ## Resources diff --git a/community/modules/scripts/spack-execute/main.tf b/community/modules/scripts/spack-execute/main.tf index 6f8055cd59..04ebcf7d49 100644 --- a/community/modules/scripts/spack-execute/main.tf +++ b/community/modules/scripts/spack-execute/main.tf @@ -54,7 +54,7 @@ locals { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.39.0&depth=1" + source = "../../../../modules/scripts/startup-script" labels = local.labels project_id = var.project_id diff --git a/community/modules/scripts/spack-setup/README.md b/community/modules/scripts/spack-setup/README.md index 0b1ca7810b..68fc68ee80 100644 --- a/community/modules/scripts/spack-setup/README.md +++ b/community/modules/scripts/spack-setup/README.md @@ -340,7 +340,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.39.0&depth=1 | +| [startup\_script](#module\_startup\_script) | ../../../../modules/scripts/startup-script | n/a | ## Resources diff --git a/community/modules/scripts/spack-setup/main.tf b/community/modules/scripts/spack-setup/main.tf index b705ccc06c..d45f5d1be3 100644 --- a/community/modules/scripts/spack-setup/main.tf +++ b/community/modules/scripts/spack-setup/main.tf @@ -104,7 +104,7 @@ resource "google_storage_bucket" "bucket" { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.39.0&depth=1" + source = "../../../../modules/scripts/startup-script" labels = local.labels project_id = var.project_id diff --git a/community/modules/scripts/wait-for-startup/versions.tf b/community/modules/scripts/wait-for-startup/versions.tf index e60ec22c3c..e7e0af9bd1 100644 --- a/community/modules/scripts/wait-for-startup/versions.tf +++ b/community/modules/scripts/wait-for-startup/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.42.0" } required_version = ">= 0.14.0" diff --git a/community/modules/scripts/windows-startup-script/versions.tf b/community/modules/scripts/windows-startup-script/versions.tf index 1a2aa18a3b..5e6507714d 100644 --- a/community/modules/scripts/windows-startup-script/versions.tf +++ b/community/modules/scripts/windows-startup-script/versions.tf @@ -16,7 +16,7 @@ terraform { provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:windows-startup-script/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:windows-startup-script/v1.42.0" } required_version = ">= 0.14.0" diff --git a/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml b/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml index 813a90f0b6..45312348ed 100644 --- a/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml +++ b/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml @@ -15,8 +15,6 @@ --- blueprint_name: hpc-cluster-hybrid-v5 -toolkit_modules_url: github.com/GoogleCloudPlatform/cluster-toolkit -toolkit_modules_version: v1.41.0 vars: project_id: ## <> diff --git a/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md b/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md index fbc851e1db..928caff5a4 100644 --- a/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md +++ b/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md @@ -22,7 +22,7 @@ for use with an on-premise slurm-cluster. > further testing is done, documentation on applying the hybrid module to > on-premise slurm clusters will be added and expanded. 
-[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0 +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2 ## Definitions diff --git a/docs/hybrid-slurm-cluster/deploy-instructions.md b/docs/hybrid-slurm-cluster/deploy-instructions.md index b03f5403a1..95fb2a067f 100644 --- a/docs/hybrid-slurm-cluster/deploy-instructions.md +++ b/docs/hybrid-slurm-cluster/deploy-instructions.md @@ -264,8 +264,8 @@ sudo systemctl restart slurmctld If the restart did not succeed, the logs at `/var/log/slurm/slurmctld.log` should point you in the right direction. -[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0 -[slurm-gcp-hybrid]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.0/docs/hybrid.md +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2 +[slurm-gcp-hybrid]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.2/docs/hybrid.md [demo-with-cloud-controller-instructions.md]: ./demo-with-cloud-controller-instructions.md ## Validate the Hybrid Cluster diff --git a/docs/hybrid-slurm-cluster/on-prem-instructions.md b/docs/hybrid-slurm-cluster/on-prem-instructions.md index 30e721dad0..fbfb8750c6 100644 --- a/docs/hybrid-slurm-cluster/on-prem-instructions.md +++ b/docs/hybrid-slurm-cluster/on-prem-instructions.md @@ -39,9 +39,9 @@ detail, as well as how to customize many of these assumptions to fit your needs. deployments in their [hybrid.md] documentation. [hybridmodule]: ../../community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md -[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0 +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2 [slurm\_controller\_hybrid]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/master/terraform/slurm_cluster/modules/slurm_controller_hybrid -[hybrid.md]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.0/docs/hybrid.md +[hybrid.md]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.2/docs/hybrid.md ### NFS Mounts @@ -235,12 +235,12 @@ image created with slurm 21.08.8: partition_name: compute ``` -[slurmgcppacker]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0/packer -[example.pkrvars.hcl]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0/packer/example.pkrvars.hcl -[slurmversion]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.0/packer/variables.pkr.hcl#L97 -[`service_account_scopes`]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.0/packer/variables.pkr.hcl#L166 -[`munge_user`]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.0/ansible/roles/munge/defaults/main.yml#L17 -[`slurm_user`]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.0/ansible/roles/slurm/defaults/main.yml#L31 +[slurmgcppacker]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2/packer +[example.pkrvars.hcl]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2/packer/example.pkrvars.hcl +[slurmversion]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.2/packer/variables.pkr.hcl#L97 +[`service_account_scopes`]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.2/packer/variables.pkr.hcl#L166 +[`munge_user`]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.2/ansible/roles/munge/defaults/main.yml#L17 +[`slurm_user`]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.2/ansible/roles/slurm/defaults/main.yml#L31 ## On Premise Setup diff --git a/docs/image-building.md b/docs/image-building.md index 
488c1b19ef..98ae56e203 100644 --- a/docs/image-building.md +++ b/docs/image-building.md @@ -27,7 +27,6 @@ below demonstrate each approach: - [Customizing a Slurm cluster (Hello, World)](../examples/README.md#image-builderyaml-) - [Customizing a Slurm cluster (AI/ML applications)](../examples/README.md#ml-slurmyaml-) - [Provisioning an HTCondor pool (installing scheduler)](../examples/README.md#htc-htcondoryaml--) -- [Provisioning a DAOS storage cluster](../community/examples/intel/README.md#daos-cluster) ## Why build an image? @@ -168,7 +167,7 @@ deployment_groups: - group: packer modules: - id: custom-image - source: github.com/GoogleCloudPlatform/slurm-gcp//packer?ref=5.12.0&depth=1 + source: github.com/GoogleCloudPlatform/slurm-gcp//packer?ref=5.12.2&depth=1 kind: packer settings: use_iap: true diff --git a/docs/network_storage.md b/docs/network_storage.md index 28a39594d6..40065edba3 100644 --- a/docs/network_storage.md +++ b/docs/network_storage.md @@ -7,7 +7,7 @@ The Toolkit contains modules that will **provision**: - [Filestore (GCP managed NFS)][filestore] - [DDN EXAScaler lustre][ddn-exascaler] -- [Intel DAOS][intel-daos] +- [Parallelstore][parallelstore] - [NFS server (non-GCP managed)][nfs-server] The Toolkit also provides a **[pre-existing-network-storage]** module to work @@ -104,12 +104,12 @@ filestore | via USE | via USE | via USE | via USE | via STARTUP | via USE | via nfs-server | via USE | via USE | via USE | via USE | via STARTUP | via USE | via USE cloud-storage-bucket (GCS)| via USE | via USE | via USE | via USE | via STARTUP | via USE | via USE DDN EXAScaler lustre | via USE | via USE | via USE | via USE | Needs Testing | via USE | via USE -Intel DAOS** | Needs Testing | Needs Testing | Needs Testing | Needs Testing | Needs Testing | Needs Testing | Needs Testing +Parallelstore | via USE | Needs Testing | Needs Testing | via USE | Needs Testing | Needs Testing | Needs Testing |  |   |   |   |   |   |   filestore (pre-existing) | via USE | via USE | via USE | via USE | via STARTUP | via USE | via USE nfs-server (pre-existing) | via USE | via USE | via USE | via USE | via STARTUP | via USE | via USE DDN EXAScaler lustre (pre-existing) | via USE | via USE | via USE | via USE | Needs Testing | via USE | via USE -Intel DAOS (pre-existing) | Not Supported | Not Supported | Not Supported | Not Supported | Not Supported | Not Supported | Not Supported +Parallelstore (pre-existing) | via USE | Needs Testing | Needs Testing | via USE | Needs Testing | Needs Testing | Needs Testing GCS FUSE (pre-existing) | via USE | via USE | via USE | via USE | via STARTUP | via USE | Needs Testing - **via USE:** Client installation and mounting occur automatically when @@ -122,10 +122,9 @@ GCS FUSE (pre-existing) | via USE | via USE | via USE | via USE | via STARTUP | - **Not Supported:** This feature is not supported right now. 
\* only supported on CentOS 7\ -** DAOS has additional pre-req steps and does not yet support automatic mounting [filestore]: ../modules/file-system/filestore/README.md [pre-existing-network-storage]: ../modules/file-system/pre-existing-network-storage/README.md [ddn-exascaler]: ../community/modules/file-system/DDN-EXAScaler/README.md -[intel-daos]: ../community/modules/file-system/Intel-DAOS/README.md +[parallelstore]: ../modules/file-system/parallelstore/README.md [nfs-server]: ../community/modules/file-system/nfs-server/README.md diff --git a/docs/slurm-dws-flex.md b/docs/slurm-dws-flex.md index 8b1c38bb01..6f2d38cd2b 100644 --- a/docs/slurm-dws-flex.md +++ b/docs/slurm-dws-flex.md @@ -13,7 +13,7 @@ With Dynamic Workload Scheduler in Flex Start mode, you submit a GPU capacity re > The project needs to be allowlisted for private preview access. > Fill out the [form](https://docs.google.com/forms/d/1etaaXMW9jJUTTxfUC7TIIMttLWT5H-3Q8_3-sG6vwKk/edit). -In order to make use of DWS Flex Start mode with SlurmGCP, you must use the `dws_flex` variable in the `schedmd-slurm-gcp-v6-nodeset` module. From there you can specify the desired maximum duration (in seconds) with `max_run_duration`. See the example below: +In order to make use of DWS Flex Start mode with SlurmGCP, you must use the `dws_flex` variable in the `schedmd-slurm-gcp-v6-nodeset` module. From there you can specify the desired maximum duration (in seconds) with `max_run_duration`. You can also use `use_job_duration` which will utilize the job's `TimeLimit` within Slurm as the duration. If `use_job_duration` is enabled but `TimeLimit` is not set, it will default to `max_run_duration`. See the example below: ```yaml - id: flex_nodeset diff --git a/docs/slurm-topology.md b/docs/slurm-topology.md new file mode 100644 index 0000000000..e83c11047e --- /dev/null +++ b/docs/slurm-topology.md @@ -0,0 +1,73 @@ +# Network topology aware scheduling + +Slurm can be [configured](https://slurm.schedmd.com/topology.html) to support topology-aware +resource allocation to optimize job performance. + +If you are using Slurm via ClusterToolkit, the Slurm Topology Plugin is automatically configured with: + +```ini +TopologyPlugin=topology/tree +TopologyParam=SwitchAsNodeRank +``` + +This does two things: + +* **Minimizes inter-rack communication:** For jobs smaller than the full cluster size, Slurm will assign the job to as few racks as possible. +* **Optimizes rank placement:** Within a job, the Slurm node rank (used to assign global Slurm / MPI ranks) is ordered by the Switch that the node is on, such that ranks are ordered by rack. + +SlurmGCP automatically updates topology information for all nodes in the cluster, according to their [physical location](https://cloud.google.com/compute/docs/instances/use-compact-placement-policies#verify-vm-location). + +> [!NOTE] +> The physical location information is available for VMs configured with a placement policy. +> VMs without a defined placement policy will be assigned a less efficient 'fake' topology. + +Applications that incorporate either the `SLURM_PROCID`/`NODE_RANK`/etc or the MPI Rank into their task assignment may see performance benefits. +In other cases, such as with PyTorch's `distributed`, you may need to modify the rank assignment to incorporate this information, see [example](../examples/machine-learning/a3-megagpu-8g/topological-pytorch/README.md). 
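+
+As a rough illustration, the gist of that modification is to derive the distributed rank from Slurm rather than from the launcher. The sketch below is a minimal example, assuming the script is started with `srun` (so Slurm populates `SLURM_PROCID`, `SLURM_NPROCS`, and `SLURM_LOCALID`) and that `MASTER_ADDR`/`MASTER_PORT` are exported by the job script; the linked example walks through a complete version.
+
+```python
+# Minimal sketch: let the topology-ordered SLURM_PROCID become the distributed rank.
+import os
+
+import torch
+import torch.distributed as dist
+
+rank = int(os.environ["SLURM_PROCID"])        # ordered by switch/rack when the topology plugin is enabled
+world_size = int(os.environ["SLURM_NPROCS"])
+local_rank = int(os.environ["SLURM_LOCALID"])
+
+# MASTER_ADDR and MASTER_PORT are assumed to be exported by the sbatch script.
+dist.init_process_group("nccl", rank=rank, world_size=world_size)
+torch.cuda.set_device(local_rank)
+```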
+ +## Inspect topology + +You can inspect topology used by Slurm by running: + +```sh +scontrol show topology + +# Or by listing the configuration file: +cat /etc/slurm/topology.conf +``` + +To inspect the "real" topology and verify the physical host placement, you can list the `physical_host` property of nodes: + +```sh +#!/bin/bash + +# /home/where.sh - echo machines hostname and its physicalHost +echo "$(hostname) $(curl 'http://metadata.google.internal/computeMetadata/v1/instance/attributes/physical_host' -H 'Metadata-Flavor: Google' -s)" +``` + +```sh +srun --nodelist={nodes_to_inspect} -l /home/where.sh | sort -V +``` + +## Disabling SlurmGCP topology integration + +Updates to `topology.conf` require reconfiguration of Slurm controller. This can be a costly operation that affects the responsiveness of the controller. + +You have the option to disable the Slurm Topology Plugin (along with automatic updates) by providing the following settings to controller module in your blueprint: + +```yaml +settings: + cloud_parameters: + topology_plugin: "" +``` + +Even with the Topology Plugin disabled, you can still optimize rank placement by using the `sort_nodes` +util in your [sbatch](https://slurm.schedmd.com/sbatch.html) scripts. For example: + +```sh +#SBATCH --ntasks-per-node=8 +#SBATCH --nodes=64 + +export SLURM_HOSTFILE=$(sort_nodes.py) + +srun -l hostname | sort +``` diff --git a/docs/vm-images.md b/docs/vm-images.md index 4b968d68b9..76cc1249be 100644 --- a/docs/vm-images.md +++ b/docs/vm-images.md @@ -105,20 +105,9 @@ project and the new image name in the `instance_image` field discussed in ## Cluster Toolkit Supported Images -### HPC CentOS 7 - -The Cluster Toolkit has officially supported the [HPC CentOS 7 VM Image][hpcimage] -as the primary VM image for HPC workloads on Google Cloud since it's release. -Since the [HPC CentOS 7 VM Image][hpcimage] comes pre-tuned for optimal -performance on typical HPC workloads, it is the default VM image in our modules, -unless there is specific requirement for a different OS distribution. - -[hpcimage]: https://cloud.google.com/blog/topics/hpc/introducing-hpc-vm-images - ### HPC Rocky Linux 8 -HPC Rocky Linux 8 is planned to become the primary supported VM image for HPC -workloads on Google Cloud from 2024. +HPC Rocky Linux 8 is the primary supported VM image for HPC workloads on Google Cloud. ### Debian 11 @@ -142,20 +131,19 @@ description of our support for Windows images. Deployment Type/Scheduler Feature - CentOS 7Debian 11Rocky Linux 8Ubuntu 20.04 + Debian 11Rocky Linux 8Ubuntu 20.04 - + Cloud Batch Lustre - ✓ - ✓ + ✓ Shared filestore @@ -163,7 +151,6 @@ description of our support for Windows images. ✓ ✓ ✓ - ✓ Startup script @@ -171,14 +158,12 @@ description of our support for Windows images. ✓ ✓ ✓ - ✓ Slurm Chrome Remote Desktop - ✓ @@ -186,25 +171,22 @@ description of our support for Windows images. Lustre - ✓ - ✓ + ✓ Shared filestore ✓ - ✓ ✓ ✓ Startup script - ✓ - ✓ ✓ + ✓ ✓ @@ -212,7 +194,6 @@ description of our support for Windows images. VM Instance Chrome Remote Desktop - ✓ * @@ -220,7 +201,6 @@ description of our support for Windows images. Lustre - ✓ ✓ ✓ @@ -231,7 +211,6 @@ description of our support for Windows images. ✓ ✓ ✓ - ✓ Startup script @@ -239,13 +218,11 @@ description of our support for Windows images. ✓ ✓ ✓ - ✓ HTCondor - ✓ ✓ @@ -255,7 +232,6 @@ description of our support for Windows images. 
- ✓ diff --git a/examples/README.md b/examples/README.md index 9257a98d0a..53a84d1a08 100644 --- a/examples/README.md +++ b/examples/README.md @@ -39,8 +39,6 @@ md_toc github examples/README.md | sed -e "s/\s-\s/ * /" * [hpc-build-slurm-image.yaml](#hpc-build-slurm-imageyaml--) ![community-badge] ![experimental-badge] * [hpc-slurm-ubuntu2004-v5-legacy.yaml](#hpc-slurm-ubuntu2004-v5-legacyyaml--) ![community-badge] ![deprecated-badge] * [hpc-slurm-ubuntu2004.yaml](#hpc-slurm-ubuntu2004yaml--) ![community-badge] - * [pfs-daos.yaml](#pfs-daosyaml-) ![community-badge] - * [hpc-slurm-daos.yaml](#hpc-slurm-daosyaml-) ![community-badge] * [hpc-amd-slurm-v5-legacy.yaml](#hpc-amd-slurm-v5-legacyyaml--) ![community-badge] ![deprecated-badge] * [hpc-amd-slurm.yaml](#hpc-amd-slurmyaml-) ![community-badge] * [hpc-slurm-sharedvpc.yaml](#hpc-slurm-sharedvpcyaml--) ![community-badge] ![experimental-badge] @@ -218,7 +216,7 @@ the experimental badge (![experimental-badge]). > > ```shell > # Install Python3 and run -> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.0/scripts/requirements.txt +> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.2/scripts/requirements.txt > ``` Creates a basic auto-scaling Slurm cluster with mostly default settings. The @@ -585,7 +583,7 @@ An example benchmarking job for PyTorch can be run under Slurm: ```shell cp /var/tmp/torch_test.* . -sbatch -N 1 torch_test.sh +sbatch -N 1 --gpus-per-node=1 torch_test.sh ``` When you are done, clean up the resources in reverse order of creation: @@ -634,7 +632,7 @@ An example benchmarking job for PyTorch can be run under Slurm: ```shell cp /var/tmp/torch_test.* . -sbatch -N 1 torch_test.sh +sbatch -N 1 --gpus-per-node=1 torch_test.sh ``` When you are done, clean up the resources in reverse order of creation: @@ -1151,7 +1149,7 @@ The blueprint contains 3 groups: > > ```shell > # Install Python3 and run -> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.0/scripts/requirements.txt +> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.2/scripts/requirements.txt > ``` Similar to the [hpc-slurm-v5-legacy.yaml] example, but using Ubuntu 20.04 instead of CentOS 7. @@ -1214,22 +1212,6 @@ For this example the following is needed in the selected region: * Compute Engine API: Resource policies: **one for each job in parallel** - _only needed for `compute` partition_ -### [pfs-daos.yaml] ![community-badge] - -This example provisions a DAOS cluster with [managed instance groups][migs] for the servers and for clients. It is more extensively discussed in a dedicated [README for Intel -examples][intel-examples-readme]. - -[pfs-daos.yaml]: ../community/examples/intel/pfs-daos.yaml -[migs]: https://cloud.google.com/compute/docs/instance-groups - -### [hpc-slurm-daos.yaml] ![community-badge] - -This example provisions DAOS servers and a Slurm cluster. It is -more extensively discussed in a dedicated [README for Intel -examples][intel-examples-readme]. - -[hpc-slurm-daos.yaml]: ../community/examples/intel/hpc-slurm-daos.yaml - ### [hpc-amd-slurm-v5-legacy.yaml] ![community-badge] ![deprecated-badge] This example provisions a Slurm cluster using AMD VM machine types. 
It @@ -1481,10 +1463,10 @@ guest_accelerator: - type: nvidia-l4 count: 1 gpu_sharing_config: - - max_shared_clients_per_gpu: 2 + max_shared_clients_per_gpu: 2 gpu_sharing_strategy: "TIME_SHARING" gpu_driver_installation_config: - - gpu_driver_version: "LATEST" + gpu_driver_version: "LATEST" ``` * Configuration of the cluster using default drivers provided by GKE. @@ -1694,6 +1676,28 @@ the controller and login nodes. Also since this blueprint doesn't use external IPs for compute nodes, one must needs to [set up cloud nat][cloudnat] and [set up iap][iap]. +Now, one needs to update the blueprint to include shared vpc details. In the +network configuration, update the details for shared vpc as mentioned below, + +```yaml +vars: + project_id: # update /w the service project id in which shared network will be used. + host_project_id: # update /w the host project id in which shared network is created. + deployment_name: hpc-small-shared-vpc + region: us-central1 + zone: us-central1-c + +deployment_groups: +- group: primary + modules: + - id: network1 + source: modules/network/pre-existing-vpc + settings: + project_id: $(vars.host_project_id) + network_name: # update /w shared network name + subnetwork_name: # update /w shared sub-net name +``` + [hpc-slurm-sharedvpc.yaml]: ../community/examples/hpc-slurm-sharedvpc.yaml [fs-shared-vpc]: https://cloud.google.com/filestore/docs/shared-vpc @@ -1751,7 +1755,7 @@ deployment_groups: # GitHub module over HTTPS, prefixed with github.com - source: github.com/org/repo//path/to/module - # Local absolute source, prefixed with / + # Local absolute source, prefixed with / - source: /path/to/module # Local relative (to current working directory) source, prefixed with ./ or ../ diff --git a/examples/cae/cae-slurm-v5-legacy.yaml b/examples/cae/cae-slurm-v5-legacy.yaml index e1a5411252..01dddbecdb 100644 --- a/examples/cae/cae-slurm-v5-legacy.yaml +++ b/examples/cae/cae-slurm-v5-legacy.yaml @@ -134,13 +134,13 @@ deployment_groups: local_mount: /scratch # If you require maximum IO performance, you can consider to bring up a dedicated parallel - # file system, e.g. DDN Exascaler Lustre, Sycomp GPFS, or Intel DAOS. + # file system, e.g. DDN Exascaler Lustre, Sycomp GPFS, or Parallelstore. # Note: Those solutions may have associated license cost. # # Please visit here for more information # - DDN Exascaler Lustre: https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/community/modules/file-system/DDN-EXAScaler/README.md # - Sycomp IBM Spectrum Scale: https://console.developers.google.com/marketplace/product/sycomp/sycomp-storage-fueled-by-ibm-spectrum-scale - # - Intel DAOS: https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/community/modules/file-system/Intel-DAOS/README.md + # - Parallelstore: https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/modules/file-system/parallelstore/README.md ######## Remote Desktop(s) ####### # This block enables a partition for nodes that support Chrome Remote Desktop diff --git a/examples/cae/cae-slurm.yaml b/examples/cae/cae-slurm.yaml index 34096a7080..ab6b129219 100644 --- a/examples/cae/cae-slurm.yaml +++ b/examples/cae/cae-slurm.yaml @@ -40,7 +40,7 @@ vars: # for a list of valid family options with Slurm; note: the image types for the compute nodes # and the Chrome Remote Desktop (CRD) need to have the same Slurm base. 
instance_image: - family: slurm-gcp-6-7-hpc-rocky-linux-8 + family: slurm-gcp-6-8-hpc-rocky-linux-8 project: schedmd-slurm-public # Documentation for each of the modules used below can be found at @@ -131,13 +131,13 @@ deployment_groups: local_mount: /scratch # If you require maximum IO performance, you can consider to bring up a dedicated parallel - # file system, e.g. DDN Exascaler Lustre, Sycomp GPFS, or Intel DAOS. + # file system, e.g. DDN Exascaler Lustre, Sycomp GPFS, or Parallelstore. # Note: Those solutions may have associated license cost. # # Please visit here for more information # - DDN Exascaler Lustre: https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/community/modules/file-system/DDN-EXAScaler/README.md # - Sycomp IBM Spectrum Scale: https://console.developers.google.com/marketplace/product/sycomp/sycomp-storage-fueled-by-ibm-spectrum-scale - # - Intel DAOS: https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/community/modules/file-system/Intel-DAOS/README.md + # - Parallelstore: https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/modules/file-system/parallelstore/README.md ######## Remote Desktop(s) ####### # This block creates chrome remote desktop. diff --git a/examples/gke-a3-highgpu.yaml b/examples/gke-a3-highgpu.yaml index f7f4018b0d..1c19dcd2e6 100644 --- a/examples/gke-a3-highgpu.yaml +++ b/examples/gke-a3-highgpu.yaml @@ -33,6 +33,7 @@ deployment_groups: source: modules/network/vpc settings: subnetwork_name: gke-subnet-a3-highgpu + mtu: 8244 secondary_ranges: gke-subnet-a3-highgpu: - range_name: pods @@ -59,6 +60,7 @@ deployment_groups: global_ip_address_range: 192.169.0.0/16 network_count: 4 subnetwork_cidr_suffix: 24 + mtu: 8244 - id: gke_cluster source: modules/scheduler/gke-cluster diff --git a/examples/gke-a3-megagpu.yaml b/examples/gke-a3-megagpu.yaml index 30edb3974c..1198b520c0 100644 --- a/examples/gke-a3-megagpu.yaml +++ b/examples/gke-a3-megagpu.yaml @@ -33,6 +33,7 @@ deployment_groups: source: modules/network/vpc settings: subnetwork_name: gke-subnet-a3-mega + mtu: 8244 secondary_ranges: gke-subnet-a3-mega: - range_name: pods @@ -59,6 +60,7 @@ deployment_groups: global_ip_address_range: 192.169.0.0/16 network_count: 8 subnetwork_cidr_suffix: 24 + mtu: 8244 - id: gke_cluster source: modules/scheduler/gke-cluster diff --git a/examples/gke-storage-parallelstore.yaml b/examples/gke-storage-parallelstore.yaml index 413e523da7..9ffe737e83 100644 --- a/examples/gke-storage-parallelstore.yaml +++ b/examples/gke-storage-parallelstore.yaml @@ -67,7 +67,7 @@ deployment_groups: sc_volume_binding_mode: Immediate sc_reclaim_policy: Delete # Use Retain if you want to volume and parallelstore resource will remain after sc_topology_zones: [$(vars.zone)] - pvc_count: 2 + pvc_count: 1 capacity_gb: 12000 # from 12,000 GiB to 100,000 GiB, in multiples of 4,000 GiB - id: sample-pool @@ -76,9 +76,10 @@ deployment_groups: settings: name: sample-pool zones: [$(vars.zone)] - machine_type: n2-standard-4 + machine_type: n2-standard-16 - ### Parallelstore enabled Job ### + # Train a TensorFlow model with Keras and Parallelstore on GKE + # Tutorial: https://cloud.google.com/parallelstore/docs/tensorflow-sample - id: parallelstore-job source: modules/compute/gke-job-template @@ -86,22 +87,35 @@ deployment_groups: - gke_cluster - parallelstore-setup settings: - image: busybox + name: tensorflow + image: jupyter/tensorflow-notebook@sha256:173f124f638efe870bb2b535e01a76a80a95217e66ed00751058c51c09d6d85d + security_context: # to make sure the job have 
enough access to execute the jobs and r/w from parallelstore + - key: runAsUser + value: 1000 + - key: runAsGroup + value: 100 + - key: fsGroup + value: 100 command: - - bin/sh + - bash - -c - | - echo "Set up job folders" - shopt -s extglob; JOB=${HOSTNAME%%-+([[:digit:]])} - mkdir /data/parallelstore-pvc-0/${JOB}/ -p; - mkdir /data/parallelstore-pvc-1/${JOB}/ -p; - - echo "Writing seed data to Parallelstore volumes" - dd if=/dev/urandom of=/data/parallelstore-pvc-0/${JOB}/${JOB_COMPLETION_INDEX}.dat bs=1K count=1000 - dd if=/dev/urandom of=/data/parallelstore-pvc-1/${JOB}/${JOB_COMPLETION_INDEX}.dat bs=1K count=1000 - - # echo "Hash file and write between the 2 hyerpdisk balanced volumes" - # md5sum /data/parallelstore-pvc-0/${JOB}/${JOB_COMPLETION_INDEX}.dat > /data/parallelstore-pvc-1/${JOB}/${JOB_COMPLETION_INDEX}.md5 - # md5sum /data/parallelstore-pvc-1/${JOB}/${JOB_COMPLETION_INDEX}.dat > /data/parallelstore-pvc-0/${JOB}/${JOB_COMPLETION_INDEX}.md5 - node_count: 5 + pip install transformers datasets + python - < --project schedmd-slurm-public disk_size: $(vars.disk_size) diff --git a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml index 705e1299eb..64bb96f0a1 100644 --- a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml +++ b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml @@ -46,8 +46,9 @@ deployment_groups: source: modules/scripts/startup-script settings: install_ansible: true - install_docker: true - enable_docker_world_writable: true + docker: + enabled: true + world_writable: true configure_ssh_host_patterns: - 10.0.0.* - 10.1.0.* @@ -81,7 +82,7 @@ deployment_groups: "reboot": false, "install_cuda": false, "install_gcsfuse": true, - "install_lustre": true, + "install_lustre": false, "install_ompi": true, "monitoring_agent": "cloud-ops", "nvidia_version": "latest", @@ -94,7 +95,7 @@ deployment_groups: set -e -o pipefail ansible-galaxy role install googlecloudplatform.google_cloud_ops_agents ansible-pull \ - -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.8.2 \ + -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.8.5 \ -i localhost, --limit localhost --connection=local \ -e @/var/tmp/slurm_vars.json \ ansible/playbook.yml diff --git a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-2-cluster.yaml b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-2-cluster.yaml index c51114d9bf..57dafd1f48 100644 --- a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-2-cluster.yaml +++ b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-2-cluster.yaml @@ -52,6 +52,7 @@ vars: enable_ops_agent: true # enable the NVIDIA DCGM daemon and integration into Cloud Ops Agent enable_nvidia_dcgm: true + localssd_mountpoint: /mnt/localssd deployment_groups: - group: cluster @@ -114,8 +115,17 @@ deployment_groups: # Failure to do will result in VMs that lose data and do not automatically # mount local SSD filesystems local_ssd_filesystem: - mountpoint: /mnt/localssd + mountpoint: $(vars.localssd_mountpoint) permissions: "1777" # must quote numeric filesystem permissions! 
+ # Docker was successfully installed in the image, this configures it + # to use the A3-specific local SSD volumes to store container images + docker: + enabled: true + world_writable: true + daemon_config: | + { + "data-root": "$(vars.localssd_mountpoint)/docker" + } runners: - type: ansible-local destination: enable_nvidia_dcgm.yml diff --git a/examples/machine-learning/a3-highgpu-8g/v5-legacy/README.md b/examples/machine-learning/a3-highgpu-8g/v5-legacy/README.md index 9800ad7128..96087ef64c 100644 --- a/examples/machine-learning/a3-highgpu-8g/v5-legacy/README.md +++ b/examples/machine-learning/a3-highgpu-8g/v5-legacy/README.md @@ -40,7 +40,7 @@ installing them in a Python virtual environment: python3 -m venv toolkit-a3 source toolkit-a3/bin/activate pip3 install -r \ - https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.0/scripts/requirements.txt + https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.2/scripts/requirements.txt ``` **Always** activate the environment before running any gcluster commands such as diff --git a/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-1-image-v5-legacy.yaml b/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-1-image-v5-legacy.yaml index 42a823bf8e..08060286b6 100644 --- a/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-1-image-v5-legacy.yaml +++ b/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-1-image-v5-legacy.yaml @@ -46,8 +46,9 @@ deployment_groups: source: modules/scripts/startup-script settings: install_ansible: true - install_docker: true - enable_docker_world_writable: true + docker: + enabled: true + world_writable: true configure_ssh_host_patterns: - 10.0.0.* - 10.1.0.* @@ -81,7 +82,7 @@ deployment_groups: "install_cuda": false, "nvidia_version": "latest", "install_ompi": true, - "install_lustre": true, + "install_lustre": false, "install_gcsfuse": true, "monitoring_agent": "cloud-ops" } @@ -92,7 +93,7 @@ deployment_groups: set -e -o pipefail ansible-galaxy role install googlecloudplatform.google_cloud_ops_agents ansible-pull \ - -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 5.12.0 \ + -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 5.12.2 \ -i localhost, --limit localhost --connection=local \ -e @/var/tmp/slurm_vars.json \ ansible/playbook.yml diff --git a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml index 8d46b10c40..4117b2c7f8 100644 --- a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml +++ b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml @@ -30,6 +30,7 @@ vars: project: $(vars.project_id) enable_login_public_ips: true enable_controller_public_ips: true + localssd_mountpoint: /mnt/localssd deployment_groups: - group: cluster @@ -89,8 +90,17 @@ deployment_groups: # Failure to do will result in VMs that lose data and do not automatically # mount local SSD filesystems local_ssd_filesystem: - mountpoint: /mnt/localssd + mountpoint: $(vars.localssd_mountpoint) permissions: "1777" # must quote numeric filesystem permissions! 
+ # Docker was successfully installed in the image, this configures it + # to use the A3-specific local SSD volumes to store container images + docker: + enabled: true + world_writable: true + daemon_config: | + { + "data-root": "$(vars.localssd_mountpoint)/docker" + } runners: - type: ansible-local destination: slurm_aperture.yml @@ -216,6 +226,8 @@ deployment_groups: chmod 0755 "${SLURM_ROOT}/scripts/rxdm" ln -s "${SLURM_ROOT}/scripts/rxdm" "${SLURM_ROOT}/partition-$(vars.a3mega_partition_name)-prolog_slurmd.d/rxdm.prolog_slurmd" ln -s "${SLURM_ROOT}/scripts/rxdm" "${SLURM_ROOT}/partition-$(vars.a3mega_partition_name)-epilog_slurmd.d/rxdm.epilog_slurmd" + # Uncomment the line below to enable epilog that will check health of GPUs and drain node if problem is detected. + # ln -s "/slurm/scripts/tools/gpu-test" "${SLURM_ROOT}/partition-$(vars.a3mega_partition_name)-epilog_slurmd.d/gpu-test.epilog_slurmd" - type: shell destination: reset_enroot.sh content: | diff --git a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml index dfc4d4ab4c..805f9c5057 100644 --- a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml +++ b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml @@ -49,8 +49,9 @@ deployment_groups: - 10.6.0.* - 10.7.0.* - $(vars.slurm_cluster_name)* - enable_docker_world_writable: true - install_docker: true + docker: + enabled: true + world_writable: true runners: # it is important that kernel upgrades do not occur before running the # solution for building Slurm (which doesn't handle them well on the fly) @@ -83,6 +84,11 @@ deployment_groups: ansible.builtin.get_url: url: "{{ package_url }}" dest: "{{ package_filename }}" + retries: 3 + delay: 60 + register: result + until: result is success + failed_when: result is failure - name: Install kernel headers ansible.builtin.apt: deb: "{{ package_filename }}" @@ -108,7 +114,7 @@ deployment_groups: apt-get install -y git ansible-galaxy role install googlecloudplatform.google_cloud_ops_agents ansible-pull \ - -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.8.2 \ + -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.8.5 \ -i localhost, --limit localhost --connection=local \ -e @/var/tmp/slurm_vars.json \ ansible/playbook.yml diff --git a/examples/machine-learning/a3-megagpu-8g/topological-pytorch/README.md b/examples/machine-learning/a3-megagpu-8g/topological-pytorch/README.md new file mode 100644 index 0000000000..2a46614f0e --- /dev/null +++ b/examples/machine-learning/a3-megagpu-8g/topological-pytorch/README.md @@ -0,0 +1,156 @@ + +# Topologically-aware Pytorch Distributed + +This example demonstrates how to incorporate topology information into a +pytorch distributed workload. + +Note: This requires that your nodes were created using a compact placement +policy. + +The main concept is that pytorch should incorporate the information from topologically-aware Slurm into its `dist.init_process_group` function. [Slurm topology plugin is automatically configured for ClusterToolkit](https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/docs/slurm-topology.md). 
+ +## Quickstart + +Run the following commands to demonstrate topologically aware pytorch: + + # Creates a local python3 env and installs pytorch + jobid=$(sbatch --parsable install.sh) + + # Run an example of setting SLURM_HOSTFILE based on topology + sbatch --dependency=afterok:$jobid topological_pytorch.sh + +Once submitted, you should be able to view the state of the jobs with `squeue`: + + JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) + 124 a3mega topologi username PD 0:00 8 (Dependency) + 123 a3mega install. username R 2:14 1 a3mega-a3meganodeset-0 + +Wait until job 124 is complete, then review the output in `slurm-124.out`. It +will look something like this (illustative values used, your physical host will +have random characters): + + Standard + rank hostname physical_host + 0 a3mega-a3meganodeset-0.c..internal /CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC/bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb/00000000000000000000000000000000 + 8 a3mega-a3meganodeset-1.c..internal /CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC/dddddddddddddddddddddddddddddddd/11111111111111111111111111111111 + 16 a3mega-a3meganodeset-2.c..internal /CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC/aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa/22222222222222222222222222222222 + 24 a3mega-a3meganodeset-3.c..internal /CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC/cccccccccccccccccccccccccccccccc/33333333333333333333333333333333 + 32 a3mega-a3meganodeset-4.c..internal /CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC/eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee/44444444444444444444444444444444 + 40 a3mega-a3meganodeset-5.c..internal /CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC/ffffffffffffffffffffffffffffffff/55555555555555555555555555555555 + 48 a3mega-a3meganodeset-6.c..internal /CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC/bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb/66666666666666666666666666666666 + 54 a3mega-a3meganodeset-7.c..internal /CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC/ffffffffffffffffffffffffffffffff/77777777777777777777777777777777 + Sorted by topology + rank hostname physical_host + 0 a3mega-a3meganodeset-2.c..internal /CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC/aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa/22222222222222222222222222222222 + 8 a3mega-a3meganodeset-0.c..internal /CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC/bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb/00000000000000000000000000000000 + 16 a3mega-a3meganodeset-6.c..internal /CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC/bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb/66666666666666666666666666666666 + 24 a3mega-a3meganodeset-3.c..internal /CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC/cccccccccccccccccccccccccccccccc/33333333333333333333333333333333 + 32 a3mega-a3meganodeset-1.c..internal /CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC/dddddddddddddddddddddddddddddddd/11111111111111111111111111111111 + 40 a3mega-a3meganodeset-4.c..internal /CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC/eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee/44444444444444444444444444444444 + 48 a3mega-a3meganodeset-5.c..internal /CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC/ffffffffffffffffffffffffffffffff/55555555555555555555555555555555 + 56 a3mega-a3meganodeset-7.c..internal /CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC/ffffffffffffffffffffffffffffffff/77777777777777777777777777777777 + +Which shows that the ranks are ordered by the "rack" component of the `physical_host`. +See [here](https://cloud.google.com/compute/docs/instances/use-compact-placement-policies#verify-vm-location) +for more information on compact placement policies. + +## Detailed Explanation + +### Setup + +First we need to install pytorch. 
While these same concepts transfer to using +enroot/pyxis to launch containerized workloads, in this example we will just +use a local python environment: + + # Creates a local python3 env and installs pytorch + sbatch install.sh + +### Job Submission Script +Now let's review the `topological_pytorch.sh` batch job submission script. + +First we set the requisite GPUDirect-TCPXO environment variables: + + NCCL_LIB_DIR="/var/lib/tcpxo/lib64" source /var/lib/tcpxo/lib64/nccl-env-profile.sh + export NCCL_FASTRAK_CTRL_DEV=enp0s12 + export NCCL_FASTRAK_IFNAME=enp6s0,enp7s0,enp13s0,enp14s0,enp134s0,enp135s0,enp141s0,enp142s0 + export NCCL_SOCKET_IFNAME=enp0s12 + export NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY=/dev/aperture_devices + +and activate our python environment: + + source env/bin/activate + +Next we demonstrate the standard behavior that torchrun would use, which does +not incorporate topology into how it orders ranks among the nodes. + + # Demonstrate standard behavior + echo "Standard" + # Set the MASTER_ADDR to the first node in the Slurm Job Nodelist + export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) + # For torchrun, we only launch 1 task per node, and instruct torchrun to create + # 8 (SLURM_GPUS_PER_NODE) processes per node. + srun --ntasks-per-node=1 --nodes "${SLURM_NNODES}" \ + python -m torch.distributed.run \ + --nproc_per_node "${SLURM_GPUS_PER_NODE}" \ + --rdzv_endpoint "${MASTER_ADDR}":"${MASTER_PORT}" \ + --rdzv_backend c10d \ + --nnodes "${SLURM_NNODES}" topological_pytorch.py + +torchrun will launch 8 tasks per node, and assign ranks lexicographically +across nodes according to the hostnames. + +For topologically-aware behavior, we launch all the tasks using Slurm's `srun`, +and will use the Slurm environment variables to initialize the torch distributed +process group, as we'll describe in the next section. + +Note: [Topology aware Slurm is enabled by default in ClusterToolkit](https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/docs/slurm-topology.md) + +Slurm sets the `SLURM_PROCID` according to topology, which we will later use to +order NCCL ranks in Pytorch. The last thing we need to do is launch the job, +adding `--topology` to the script arguments to trigger the topology logic. + + srun python topological_pytorch.py --topology + +Note: Alternatively, you can set the required environment variables to be populated by Slurm in the srun command. + + srun sh -c "WORLD_SIZE=\${SLURM_NPROCS} RANK=\${SLURM_PROCID} LOCAL_RANK=\${SLURM_LOCALID} LOCAL_WORLD_SIZE=\${SLURM_NTASKS_PER_NODE} python topological_pytorch.py" + +### Test Script +Next review the `topological_pytorch.py` script. There is a top-level flag, +`--topology`, which controls whether pytorch is initialized using torchrun (when +`False`) or using Slurm (when `True`). The Slurm environment variables ensure +that the node ordering that Slurm uses gets translated to the Pytorch ranks.
+ + if args.topology: + # These are populated by Slurm + local_rank = int(os.environ["SLURM_LOCALID"]) + global_rank = int(os.environ["SLURM_PROCID"]) + world_size = int(os.environ["SLURM_NPROCS"]) + procs_per_node = int(os.environ["SLURM_NTASKS_PER_NODE"]) + + # Must set rank and world_size based on SLURM_PROCID and SLURM_NPROCS + dist.init_process_group("nccl", rank=global_rank, world_size=world_size) + else: + # These are populated by torchrun + local_rank = int(os.environ["LOCAL_RANK"]) + global_rank = int(os.environ["RANK"]) + world_size = int(os.environ["WORLD_SIZE"]) + procs_per_node = int(os.environ["LOCAL_WORLD_SIZE"]) + + # Torchrun handles rank allocation + dist.init_process_group("nccl") + +The remainder of the script is meant to demonstrate functionality. We use +`dist.all_gather_object` to collect the rank, hostname, and `physical_host` from +each pytorch worker, and then print the order out from global rank 0. What you +should see is that depending on the topology that Slurm uses to launch the jobs, +the ordering of this output will vary. + +### Running the Test + +Run the following commands to demonstrate topologically aware pytorch: + + sbatch topological_pytorch.sh + +The output shows the standard vs topologically-aware behavior. See +the Quickstart section above for an example. diff --git a/community/modules/project/new-project/metadata.yaml b/examples/machine-learning/a3-megagpu-8g/topological-pytorch/install.sh similarity index 65% rename from community/modules/project/new-project/metadata.yaml rename to examples/machine-learning/a3-megagpu-8g/topological-pytorch/install.sh index 806241c118..ba44f14bdc 100644 --- a/community/modules/project/new-project/metadata.yaml +++ b/examples/machine-learning/a3-megagpu-8g/topological-pytorch/install.sh @@ -1,4 +1,5 @@ -# Copyright 2023 "Google LLC" +#!/bin/bash +# Copyright 2024 "Google LLC" # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,12 +12,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ---- -spec: - requirements: - services: - - admin.googleapis.com - - cloudresourcemanager.googleapis.com - - cloudbilling.googleapis.com - - iam.googleapis.com +#filename: install.sh +#submit with `sbatch install.sh` + +#SBATCH --partition=a3mega +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=1 +#SBATCH --nodes 1 + +python3 -m venv env +source env/bin/activate +pip3 install --pre torch torchvision torchaudio diff --git a/examples/machine-learning/a3-megagpu-8g/topological-pytorch/topological_pytorch.py b/examples/machine-learning/a3-megagpu-8g/topological-pytorch/topological_pytorch.py new file mode 100644 index 0000000000..ed71e4ea49 --- /dev/null +++ b/examples/machine-learning/a3-megagpu-8g/topological-pytorch/topological_pytorch.py @@ -0,0 +1,69 @@ + +#!/usr/bin/env python +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +#filename: topological_pytorch.py +import os +import torch +import torch.distributed as dist +import socket +import subprocess +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument("--topology", action=argparse.BooleanOptionalAction) +args = parser.parse_args() + +hostname = socket.getfqdn() +if args.topology: + # These are populated by Slurm + local_rank = int(os.environ["SLURM_LOCALID"]) + global_rank = int(os.environ["SLURM_PROCID"]) + world_size = int(os.environ["SLURM_NPROCS"]) + procs_per_node = int(os.environ["SLURM_NTASKS_PER_NODE"]) + + # Must set rank and world_size based on SLURM_PROCID and SLURM_NPROCS + dist.init_process_group("nccl", rank=global_rank, world_size=world_size) +else: + # These are populated by torchrun + local_rank = int(os.environ["LOCAL_RANK"]) + global_rank = int(os.environ["RANK"]) + world_size = int(os.environ["WORLD_SIZE"]) + procs_per_node = int(os.environ["LOCAL_WORLD_SIZE"]) + + # Torchrun handles rank allocation + dist.init_process_group("nccl") + +# Must attach device based on the local rank. +torch.cuda.set_device(local_rank) + +# Get the physical host for the current task to print later +physical_host = subprocess.check_output([ + "curl", "-s", + "http://metadata.google.internal/computeMetadata/v1/instance/attributes/physical_host", + "-H", "Metadata-Flavor: Google" +]).decode('utf-8') + +# Create an output to collect from the all-gather +output = [None for _ in range(world_size)] +dist.all_gather_object(output, [global_rank, hostname, physical_host]) +if global_rank == 0: + # Print out ordered set of hostnames from all-gather + print("rank\thostname\tphysical_host") + # Skip to print every procs_per_node to keep output compact + for result in output[::procs_per_node]: + print("\t".join(map(str,result))) + +dist.destroy_process_group() diff --git a/examples/machine-learning/a3-megagpu-8g/topological-pytorch/topological_pytorch.sh b/examples/machine-learning/a3-megagpu-8g/topological-pytorch/topological_pytorch.sh new file mode 100644 index 0000000000..477c92b0b5 --- /dev/null +++ b/examples/machine-learning/a3-megagpu-8g/topological-pytorch/topological_pytorch.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# shellcheck disable=SC2016 +# shellcheck disable=SC2155 + +#filename: topological_pytorch.sh +#submit with `sbatch topological_pytorch.sh` +#SBATCH --partition=a3mega +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --nodes 8 + +NCCL_LIB_DIR="/var/lib/tcpxo/lib64" source /var/lib/tcpxo/lib64/nccl-env-profile.sh +export NCCL_FASTRAK_CTRL_DEV=enp0s12 +export NCCL_FASTRAK_IFNAME=enp6s0,enp7s0,enp13s0,enp14s0,enp134s0,enp135s0,enp141s0,enp142s0 +export NCCL_SOCKET_IFNAME=enp0s12 +export NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY=/dev/aperture_devices + +source env/bin/activate + +export MASTER_PORT=12345 +export OMP_NUM_THREADS=12 + +# Demonstrate standard behavior +echo "Standard" +# Set the MASTER_ADDR to the first node in the Slurm Job Nodelist +export MASTER_ADDR=$(scontrol show hostnames "${SLURM_JOB_NODELIST}" | head -n 1) +# For torchrun, we only launch 1 task per node, and instruct torchrun to create +# 8 (SLURM_GPUS_PER_NODE) processes per node. +srun --ntasks-per-node=1 --nodes "${SLURM_NNODES}" \ + python -m torch.distributed.run \ + --nproc_per_node "${SLURM_GPUS_PER_NODE}" \ + --rdzv_endpoint "${MASTER_ADDR}":"${MASTER_PORT}" \ + --rdzv_backend c10d \ + --nnodes "${SLURM_NNODES}" topological_pytorch.py + +# Demonstrate how to incorporate topology +echo "Topologically aware" +# Run 8 tasks per node (inherited from the job script), since we aren't using +# torchrun in this case. Supply the --topology flag to the script to set +# global rank and world size of variables based on Slurm +srun python topological_pytorch.py --topology diff --git a/examples/ml-slurm-v5-legacy.yaml b/examples/ml-slurm-v5-legacy.yaml index 6c0fb8aa30..113c052405 100644 --- a/examples/ml-slurm-v5-legacy.yaml +++ b/examples/ml-slurm-v5-legacy.yaml @@ -28,6 +28,8 @@ vars: family: ml-slurm project: $(vars.project_id) disk_size_gb: 200 + metadata: # Workaround for https://github.com/GoogleCloudPlatform/cluster-toolkit/discussions/3243 + VmDnsSetting: GlobalOnly # Recommended to use GCS backend for Terraform state # See https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/examples#optional-setting-up-a-remote-terraform-state @@ -94,9 +96,8 @@ deployment_groups: content: | #!/bin/bash # this script is designed to execute on Slurm images published by SchedMD that: - # - are based on Debian 11 distribution of Linux - # - have NVIDIA Drivers v530 pre-installed - # - have CUDA Toolkit 12.1 pre-installed. + # - are based on Debian distribution of Linux + # - have NVIDIA drivers pre-installed set -e -o pipefail @@ -112,8 +113,8 @@ deployment_groups: DL_DIR=\$(mktemp -d) cd $DL_DIR - curl -O https://repo.anaconda.com/miniconda/Miniconda3-py310_23.3.1-0-Linux-x86_64.sh - HOME=$DL_DIR bash Miniconda3-py310_23.3.1-0-Linux-x86_64.sh -b -p $CONDA_BASE + curl -L -O https://github.com/conda-forge/miniforge/releases/download/24.7.1-2/Miniforge3-24.7.1-2-Linux-x86_64.sh + HOME=$DL_DIR bash Miniforge3-24.7.1-2-Linux-x86_64.sh -b -p $CONDA_BASE cd - rm -rf $DL_DIR unset DL_DIR @@ -123,39 +124,12 @@ deployment_groups: conda config --system --set auto_activate_base False # following channel ordering is important! use strict_priority! 
conda config --system --set channel_priority strict - conda config --system --remove channels defaults - conda config --system --add channels conda-forge - conda config --system --add channels nvidia - conda config --system --add channels nvidia/label/cuda-11.8.0 - conda update -n base conda --yes ### create a virtual environment for tensorflow - conda create -n tf python=3.10 --yes + conda create -n tf python=3.11 --yes conda activate tf - conda install -n tf cuda-toolkit --yes - pip install nvidia-cudnn-cu11 nvidia-nccl-cu11 - - cd $CONDA_PREFIX/lib/python3.10/site-packages/nvidia/nccl/lib/ - ln -s libnccl.so.2 libnccl.so - cd - - - mkdir -p $CONDA_PREFIX/etc/conda/activate.d - echo 'export OLD_LD_LIBRARY_PATH=$LD_LIBRARY_PATH' > $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh - echo 'NVIDIA_PYTHON_PATH=$CONDA_PREFIX/lib/python3.10/site-packages/nvidia' >> $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh - echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib/:$NVIDIA_PYTHON_PATH/cudnn/lib/:$NVIDIA_PYTHON_PATH/nccl/lib/' >> $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh - mkdir -p $CONDA_PREFIX/etc/conda/deactivate.d - echo 'export LD_LIBRARY_PATH=${OLD_LD_LIBRARY_PATH}' > $CONDA_PREFIX/etc/conda/deactivate.d/env_vars.sh - echo 'unset OLD_LD_LIBRARY_PATH' >> $CONDA_PREFIX/etc/conda/deactivate.d/env_vars.sh - - pip install tensorflow==2.12.* - pip install tensorrt==8.6.* - - ### create a virtual environment for pytorch - conda create -n pytorch python=3.10 --yes - conda activate pytorch - conda config --env --add channels pytorch - conda install -n pytorch pytorch torchvision torchaudio pytorch-cuda=11.8 --yes + pip install tensorflow[and-cuda]==2.18.* - group: packer modules: @@ -175,6 +149,7 @@ deployment_groups: # You can find size of source image by using following command # gcloud compute images describe-from-family --project schedmd-slurm-public disk_size: $(vars.disk_size_gb) + disk_type: pd-ssd image_family: $(vars.new_image.family) # building this image does not require a GPU-enabled VM machine_type: c2-standard-4 diff --git a/examples/ml-slurm.yaml b/examples/ml-slurm.yaml index 4baaaf07ce..1eea66d407 100644 --- a/examples/ml-slurm.yaml +++ b/examples/ml-slurm.yaml @@ -29,6 +29,8 @@ vars: family: ml-slurm project: $(vars.project_id) disk_size_gb: 200 + metadata: # Workaround for https://github.com/GoogleCloudPlatform/cluster-toolkit/discussions/3243 + VmDnsSetting: GlobalOnly # Recommended to use GCS backend for Terraform state # See https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/examples#optional-setting-up-a-remote-terraform-state @@ -62,9 +64,8 @@ deployment_groups: content: | #!/bin/bash # this script is designed to execute on Slurm images published by SchedMD that: - # - are based on Debian 11 distribution of Linux - # - have NVIDIA Drivers v530 pre-installed - # - have CUDA Toolkit 12.1 pre-installed. 
+ # - are based on Debian distribution of Linux + # - have NVIDIA drivers pre-installed set -e -o pipefail @@ -80,8 +81,8 @@ deployment_groups: DL_DIR=\$(mktemp -d) cd $DL_DIR - curl -O https://repo.anaconda.com/miniconda/Miniconda3-py310_23.3.1-0-Linux-x86_64.sh - HOME=$DL_DIR bash Miniconda3-py310_23.3.1-0-Linux-x86_64.sh -b -p $CONDA_BASE + curl -L -O https://github.com/conda-forge/miniforge/releases/download/24.7.1-2/Miniforge3-24.7.1-2-Linux-x86_64.sh + HOME=$DL_DIR bash Miniforge3-24.7.1-2-Linux-x86_64.sh -b -p $CONDA_BASE cd - rm -rf $DL_DIR unset DL_DIR @@ -91,39 +92,18 @@ deployment_groups: conda config --system --set auto_activate_base False # following channel ordering is important! use strict_priority! conda config --system --set channel_priority strict - conda config --system --remove channels defaults - conda config --system --add channels conda-forge - conda config --system --add channels nvidia - conda config --system --add channels nvidia/label/cuda-11.8.0 - conda update -n base conda --yes ### create a virtual environment for tensorflow - conda create -n tf python=3.10 --yes + conda create -n tf python=3.11 --yes conda activate tf - conda install -n tf cuda-toolkit --yes - pip install nvidia-cudnn-cu11 nvidia-nccl-cu11 - - cd $CONDA_PREFIX/lib/python3.10/site-packages/nvidia/nccl/lib/ - ln -s libnccl.so.2 libnccl.so - cd - - - mkdir -p $CONDA_PREFIX/etc/conda/activate.d - echo 'export OLD_LD_LIBRARY_PATH=$LD_LIBRARY_PATH' > $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh - echo 'NVIDIA_PYTHON_PATH=$CONDA_PREFIX/lib/python3.10/site-packages/nvidia' >> $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh - echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib/:$NVIDIA_PYTHON_PATH/cudnn/lib/:$NVIDIA_PYTHON_PATH/nccl/lib/' >> $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh - mkdir -p $CONDA_PREFIX/etc/conda/deactivate.d - echo 'export LD_LIBRARY_PATH=${OLD_LD_LIBRARY_PATH}' > $CONDA_PREFIX/etc/conda/deactivate.d/env_vars.sh - echo 'unset OLD_LD_LIBRARY_PATH' >> $CONDA_PREFIX/etc/conda/deactivate.d/env_vars.sh - - pip install tensorflow==2.12.* - pip install tensorrt==8.6.* + pip install tensorflow[and-cuda]==2.18.* + pip install tensorrt==10.6.* ### create a virtual environment for pytorch - conda create -n pytorch python=3.10 --yes + conda create -n pytorch python=3.11 --yes conda activate pytorch - conda config --env --add channels pytorch - conda install -n pytorch pytorch torchvision torchaudio pytorch-cuda=11.8 --yes + pip install torch torchvision torchaudio - group: packer modules: @@ -139,10 +119,11 @@ deployment_groups: omit_external_ip: false source_image_project_id: [schedmd-slurm-public] # see latest in https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family - source_image_family: slurm-gcp-6-7-debian-11 + source_image_family: slurm-gcp-6-8-debian-11 # You can find size of source image by using following command # gcloud compute images describe-from-family --project schedmd-slurm-public disk_size: $(vars.disk_size_gb) + disk_type: pd-ssd image_family: $(vars.new_image.family) # building this image does not require a GPU-enabled VM machine_type: c2-standard-4 diff --git a/go.mod b/go.mod index 56808e3f4e..012b792751 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module hpc-toolkit -go 1.21 +go 1.22 require ( cloud.google.com/go/storage v1.41.0 // indirect @@ -14,15 +14,15 @@ require ( github.com/spf13/afero v1.11.0 github.com/spf13/cobra v1.8.1 github.com/zclconf/go-cty v1.15.0 - golang.org/x/exp 
v0.0.0-20231110203233-9a3e6036ecaa + golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 google.golang.org/genproto v0.0.0-20240617180043-68d350f18fd4 // indirect gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c gopkg.in/yaml.v3 v3.0.1 ) require ( - github.com/fatih/color v1.17.0 - github.com/go-git/go-billy/v5 v5.5.0 + github.com/fatih/color v1.18.0 + github.com/go-git/go-billy/v5 v5.6.0 github.com/google/go-cmp v0.6.0 github.com/hashicorp/terraform-exec v0.21.0 github.com/mattn/go-isatty v0.0.20 @@ -35,7 +35,7 @@ require ( cloud.google.com/go/auth/oauth2adapt v0.2.2 // indirect dario.cat/mergo v1.0.0 // indirect github.com/apparentlymart/go-textseg/v15 v15.0.0 // indirect - github.com/cyphar/filepath-securejoin v0.2.4 // indirect + github.com/cyphar/filepath-securejoin v0.2.5 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/felixge/httpsnoop v1.0.4 // indirect github.com/go-logr/logr v1.4.1 // indirect @@ -50,10 +50,10 @@ require ( go.opentelemetry.io/otel v1.24.0 // indirect go.opentelemetry.io/otel/metric v1.24.0 // indirect go.opentelemetry.io/otel/trace v1.24.0 // indirect - golang.org/x/mod v0.17.0 // indirect + golang.org/x/mod v0.19.0 // indirect golang.org/x/sync v0.7.0 // indirect golang.org/x/time v0.5.0 // indirect - golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d // indirect + golang.org/x/tools v0.23.0 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20240610135401-a8a62080eff3 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20240617180043-68d350f18fd4 // indirect ) @@ -95,10 +95,10 @@ require ( github.com/ulikunitz/xz v0.5.10 // indirect github.com/xanzy/ssh-agent v0.3.3 // indirect go.opencensus.io v0.24.0 // indirect - golang.org/x/crypto v0.24.0 // indirect - golang.org/x/net v0.26.0 // indirect + golang.org/x/crypto v0.25.0 // indirect + golang.org/x/net v0.27.0 // indirect golang.org/x/oauth2 v0.21.0 // indirect - golang.org/x/sys v0.25.0 + golang.org/x/sys v0.26.0 golang.org/x/text v0.16.0 // indirect google.golang.org/grpc v1.64.1 // indirect google.golang.org/protobuf v1.34.2 // indirect diff --git a/go.sum b/go.sum index ca8c11adb3..3d4849db05 100644 --- a/go.sum +++ b/go.sum @@ -231,8 +231,8 @@ github.com/cncf/xds/go v0.0.0-20211001041855-01bcc9b48dfe/go.mod h1:eXthEFrGJvWH github.com/cncf/xds/go v0.0.0-20211011173535-cb28da3451f1/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= -github.com/cyphar/filepath-securejoin v0.2.4 h1:Ugdm7cg7i6ZK6x3xDF1oEu1nfkyfH53EtKeQYTC3kyg= -github.com/cyphar/filepath-securejoin v0.2.4/go.mod h1:aPGpWjXOXUn2NCNjFvBE6aRxGGx79pTxQpKOJNYHHl4= +github.com/cyphar/filepath-securejoin v0.2.5 h1:6iR5tXJ/e6tJZzzdMc1km3Sa7RRIVBKAK32O2s7AYfo= +github.com/cyphar/filepath-securejoin v0.2.5/go.mod h1:aPGpWjXOXUn2NCNjFvBE6aRxGGx79pTxQpKOJNYHHl4= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= @@ -252,8 +252,8 @@ github.com/envoyproxy/go-control-plane v0.9.10-0.20210907150352-cf90f659a021/go. 
github.com/envoyproxy/go-control-plane v0.10.2-0.20220325020618-49ff273808a1/go.mod h1:KJwIaB5Mv44NWtYuAOFCVOjcI94vtpEz2JU/D2v6IjE= github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= -github.com/fatih/color v1.17.0 h1:GlRw1BRJxkpqUCBKzKOw098ed57fEsKeNjpTe3cSjK4= -github.com/fatih/color v1.17.0/go.mod h1:YZ7TlrGPkiz6ku9fK3TLD/pl3CpsiFyu8N92HLgmosI= +github.com/fatih/color v1.18.0 h1:S8gINlzdQ840/4pfAwic/ZE0djQEH3wM94VfqLTZcOM= +github.com/fatih/color v1.18.0/go.mod h1:4FelSpRwEGDpQ12mAdzqdOukCy4u8WUtOY6lkT/6HfU= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= @@ -261,8 +261,8 @@ github.com/gliderlabs/ssh v0.3.7 h1:iV3Bqi942d9huXnzEF2Mt+CY9gLu8DNM4Obd+8bODRE= github.com/gliderlabs/ssh v0.3.7/go.mod h1:zpHEXBstFnQYtGnB8k8kQLol82umzn/2/snG7alWVD8= github.com/go-git/gcfg v1.5.1-0.20230307220236-3a3c6141e376 h1:+zs/tPmkDkHx3U66DAb0lQFJrpS6731Oaa12ikc+DiI= github.com/go-git/gcfg v1.5.1-0.20230307220236-3a3c6141e376/go.mod h1:an3vInlBmSxCcxctByoQdvwPiA7DTK7jaaFDBTtu0ic= -github.com/go-git/go-billy/v5 v5.5.0 h1:yEY4yhzCDuMGSv83oGxiBotRzhwhNr8VZyphhiu+mTU= -github.com/go-git/go-billy/v5 v5.5.0/go.mod h1:hmexnoNsr2SJU1Ju67OaNz5ASJY3+sHgFRpCtpDCKow= +github.com/go-git/go-billy/v5 v5.6.0 h1:w2hPNtoehvJIxR00Vb4xX94qHQi/ApZfX+nBE2Cjio8= +github.com/go-git/go-billy/v5 v5.6.0/go.mod h1:sFDq7xD3fn3E0GOwUSZqHo9lrkmx8xJhA0ZrfvjBRGM= github.com/go-git/go-git-fixtures/v4 v4.3.2-0.20231010084843-55a94097c399 h1:eMje31YglSBqCdIqdhKBW8lokaMrL3uTkpGYlE2OOT4= github.com/go-git/go-git-fixtures/v4 v4.3.2-0.20231010084843-55a94097c399/go.mod h1:1OCfN199q1Jm3HZlxleg+Dw/mwps2Wbk9frAWm+4FII= github.com/go-git/go-git/v5 v5.12.0 h1:7Md+ndsjrzZxbddRDZjF14qK+NN56sy6wkqaVrjZtys= @@ -440,8 +440,8 @@ github.com/mitchellh/go-testing-interface v1.14.2-0.20210821155943-2d9075ca8770 github.com/mitchellh/go-testing-interface v1.14.2-0.20210821155943-2d9075ca8770/go.mod h1:SO/iHr6q2EzbqRApt+8/E9wqebTwQn5y+UlB04bxzo0= github.com/mitchellh/go-wordwrap v1.0.1 h1:TLuKupo69TCn6TQSyGxwI1EblZZEsQ0vMlAFQflz0v0= github.com/mitchellh/go-wordwrap v1.0.1/go.mod h1:R62XHJLzvMFRBbcrT7m7WgmE1eOyTSsCt+hzestvNj0= -github.com/onsi/gomega v1.27.10 h1:naR28SdDFlqrG6kScpT8VWpu1xWY5nJRCF3XaYyBjhI= -github.com/onsi/gomega v1.27.10/go.mod h1:RsS8tutOdbdgzbPtzzATp12yT7kM5I5aElG3evPbQ0M= +github.com/onsi/gomega v1.34.1 h1:EUMJIKUjM8sKjYbtxQI9A4z2o+rruxnzNvpknOXie6k= +github.com/onsi/gomega v1.34.1/go.mod h1:kU1QgUvBDLXBJq618Xvm2LUX6rSAfRaFRTcdOeDLwwY= github.com/otiai10/copy v1.14.0 h1:dCI/t1iTdYGtkvCuBG2BgR6KZa83PTclw4U5n2wAllU= github.com/otiai10/copy v1.14.0/go.mod h1:ECfuL02W+/FkTWZWgQqXPWZgW9oeKCSQ5qVfSc4qc4w= github.com/otiai10/mint v1.5.1 h1:XaPLeE+9vGbuyEHem1JNk3bYc7KKqyI/na0/mLd/Kks= @@ -529,8 +529,8 @@ golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8U golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= -golang.org/x/crypto v0.24.0 h1:mnl8DM0o513X8fdIkmyFE/5hTYxbwYOjDS/+rK6qpRI= -golang.org/x/crypto 
v0.24.0/go.mod h1:Z1PMYSOR5nyMcyAVAIQSKCDwalqy85Aqn1x3Ws4L5DM= +golang.org/x/crypto v0.25.0 h1:ypSNr+bnYL2YhwoMt2zPxHFmbAN1KZs/njMG3hxUp30= +golang.org/x/crypto v0.25.0/go.mod h1:T+wALwcMOSE0kXgUAnPAHqTLW+XHgcELELW8VaDgm/M= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= @@ -541,8 +541,8 @@ golang.org/x/exp v0.0.0-20191227195350-da58074b4299/go.mod h1:2RIsYlXP63K8oxa1u0 golang.org/x/exp v0.0.0-20200119233911-0405dc783f0a/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= golang.org/x/exp v0.0.0-20200207192155-f17229e696bd/go.mod h1:J/WKrq2StrnmMY6+EHIKF9dgMWnmCNThgcyBT1FY9mM= golang.org/x/exp v0.0.0-20200224162631-6cc2880d07d6/go.mod h1:3jZMyOhIsHpP37uCMkUooju7aAi5cS1Q23tOzKc+0MU= -golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa h1:FRnLl4eNAQl8hwxVVC17teOw8kdjVDVAiFMtgUdTSRQ= -golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa/go.mod h1:zk2irFbV9DP96SEBUUAy67IdHUaZuSnrz1n472HUCLE= +golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 h1:2dVuKD2vS7b0QIHQbpyTISPd0LeHDbnYEryqj5Q1ug8= +golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY= golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= @@ -569,8 +569,8 @@ golang.org/x/mod v0.4.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.1/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= -golang.org/x/mod v0.17.0 h1:zY54UmvipHiNd+pm+m0x9KhZ9hl1/7QNMyxXbc6ICqA= -golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/mod v0.19.0 h1:fEdghXQSo20giMthA7cd28ZC+jts4amQ3YMXiP5oMQ8= +golang.org/x/mod v0.19.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -619,8 +619,8 @@ golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug golang.org/x/net v0.0.0-20220909164309-bea034e7d591/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk= golang.org/x/net v0.0.0-20221014081412-f15817d10f9b/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk= golang.org/x/net v0.1.0/go.mod h1:Cx3nUiGt4eDBEyega/BKRp+/AlGL8hYe7U9odMt2Cco= -golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ= -golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE= +golang.org/x/net v0.27.0 h1:5K3Njcw06/l2y9vpGCSdcxWOYHOUk3dVNGDXN+FvAys= +golang.org/x/net v0.27.0/go.mod h1:dDi0PyhWNoiUOrAS8uXv/vnScO4wnHQO4mj9fn/RytE= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 
v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -732,13 +732,13 @@ golang.org/x/sys v0.0.0-20220728004956-3c1f35247d10/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.25.0 h1:r+8e+loiHxRqhXVl6ML1nO3l1+oFoWbnlu2Ehimmi34= -golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo= +golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= -golang.org/x/term v0.21.0 h1:WVXCp+/EBEHOj53Rvu+7KiT/iElMrO8ACK16SMZ3jaA= -golang.org/x/term v0.21.0/go.mod h1:ooXLefLobQVslOqselCNF4SxFAaoS6KujMbsGzSDmX0= +golang.org/x/term v0.22.0 h1:BbsgPEJULsl2fV/AT3v15Mjva5yXKQDyKf+TbDz7QJk= +golang.org/x/term v0.22.0/go.mod h1:F3qCibpT5AMpCRfhfT53vVJwhLtIVHhB9XDjfFvnMI4= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -808,8 +808,8 @@ golang.org/x/tools v0.1.3/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.1.4/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.1.5/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= -golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d h1:vU5i/LfpvrRCpgM/VPfJLg5KjxD3E+hfT1SH+d9zLwg= -golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= +golang.org/x/tools v0.23.0 h1:SGsXPZ+2l4JsgaCKkx+FQ9YZ5XEtA1GZYuoDjenLjvg= +golang.org/x/tools v0.23.0/go.mod h1:pnu6ufv6vQkll6szChhK3C3L/ruaIv5eBeztNG8wtsI= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= diff --git a/modules/README.md b/modules/README.md index 722449e6e6..c5f1df282a 100644 --- a/modules/README.md +++ b/modules/README.md @@ -103,8 +103,6 @@ Modules that are still in development and less stable are labeled with the a [DDN EXAscaler lustre](https://www.ddn.com/partners/google-cloud-platform/) file system. This module has [license costs](https://console.developers.google.com/marketplace/product/ddnstorage/exascaler-cloud). -* **[Intel-DAOS]** ![community-badge] : Creates - a [DAOS](https://docs.daos.io/) file system. * **[cloud-storage-bucket]** ![community-badge] ![experimental-badge] : Creates a Google Cloud Storage (GCS) bucket. 
* **[gke-persistent-volume]** ![core-badge] ![experimental-badge] : Creates persistent volumes and persistent volume claims for shared storage. * **[nfs-server]** ![community-badge] ![experimental-badge] : Creates a VM and @@ -114,7 +112,6 @@ Modules that are still in development and less stable are labeled with the [parallelstore]: file-system/parallelstore/README.md [pre-existing-network-storage]: file-system/pre-existing-network-storage/README.md [ddn-exascaler]: ../community/modules/file-system/DDN-EXAScaler/README.md -[intel-daos]: ../community/modules/file-system/Intel-DAOS/README.md [nfs-server]: ../community/modules/file-system/nfs-server/README.md [cloud-storage-bucket]: ../community/modules/file-system/cloud-storage-bucket/README.md [gke-persistent-volume]: ../modules/file-system/gke-persistent-volume/README.md @@ -156,15 +153,12 @@ Modules that are still in development and less stable are labeled with the ### Project -* **[new-project]** ![community-badge] ![experimental-badge] : Creates a Google - Cloud Project. * **[service-account]** ![community-badge] ![experimental-badge] : Creates [service accounts](https://cloud.google.com/iam/docs/service-accounts) for a GCP project. * **[service-enablement]** ![community-badge] ![experimental-badge] : Allows enabling various APIs for a Google Cloud Project. -[new-project]: ../community/modules/project/new-project/README.md [service-account]: ../community/modules/project/service-account/README.md [service-enablement]: ../community/modules/project/service-enablement/README.md @@ -229,8 +223,8 @@ Pub/Sub subscription. Primarily used for [FSI - MonteCarlo Tutorial][fsi-monteca [schedmd-slurm-gcp-v5-controller]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md [schedmd-slurm-gcp-v5-login]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md [schedmd-slurm-gcp-v5-hybrid]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md -[slurm-gcp-version-5]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0 -[slurm-gcp-version-6]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2 +[slurm-gcp-version-5]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2 +[slurm-gcp-version-6]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.5 [pbspro-client]: ../community/modules/scheduler/pbspro-client/README.md [pbspro-server]: ../community/modules/scheduler/pbspro-server/README.md @@ -360,10 +354,6 @@ following module definition refers the local pre-existing-vpc modules. #### GitHub-hosted Modules and Packages -The [Intel DAOS blueprint][pfs-daos.yaml] makes extensive use of GitHub-hosted -Terraform and Packer modules. You may wish to use it as an example reference for -this documentation. - To use a Terraform module available on GitHub, set the source to a path starting with `github.com` (HTTPS) or `git@github.com` (SSH). For instance, the following module definition sources the Toolkit vpc module: @@ -401,7 +391,6 @@ into a hidden folder when you run `terraform init`. [tfrev]: https://www.terraform.io/language/modules/sources#selecting-a-revision [gitref]: https://git-scm.com/book/en/v2/Git-Tools-Revision-Selection#_single_revisions [tfsubdir]: https://www.terraform.io/language/modules/sources#modules-in-package-sub-directories -[pfs-daos.yaml]: ../community/examples/intel/pfs-daos.yaml ##### GitHub-hosted Packer modules @@ -413,12 +402,6 @@ repository to the module path: `deployment_name/group_name/module_id`. 
However, when `gcluster deploy` is invoked, it will run Packer from the subdirectory `deployment_name/group_name/module_id/subdirectory/after/double_slash`. -Referring back to the [Intel DAOS blueprint][pfs-daos.yaml], we see that it will -create 2 deployment groups at `pfs-daos/daos-client-image` and -`pfs-daos/daos-server-image`. However, Packer will actually be invoked from -a subdirectories ending in `daos-client-image/images` and -`daos-server-image/images`. - If the module does not use `//` package notation, `gcluster create` will copy only the final directory in the path to `deployment_name/group_name/module_id`. diff --git a/modules/compute/gke-job-template/README.md b/modules/compute/gke-job-template/README.md index 1b5780188a..f2a50de63b 100644 --- a/modules/compute/gke-job-template/README.md +++ b/modules/compute/gke-job-template/README.md @@ -117,6 +117,7 @@ No modules. | [random\_name\_sufix](#input\_random\_name\_sufix) | Appends a random suffix to the job name to avoid clashes. | `bool` | `true` | no | | [requested\_cpu\_per\_pod](#input\_requested\_cpu\_per\_pod) | The requested cpu per pod. If null, allocatable\_cpu\_per\_node will be used to claim whole nodes. If provided will override allocatable\_cpu\_per\_node. | `number` | `-1` | no | | [restart\_policy](#input\_restart\_policy) | Job restart policy. Only a RestartPolicy equal to `Never` or `OnFailure` is allowed. | `string` | `"Never"` | no | +| [security\_context](#input\_security\_context) | The security options the container should be run with. More info: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ |
list(object({
key = string
value = string
}))
| `[]` | no | | [tolerations](#input\_tolerations) | Tolerations allow the scheduler to schedule pods with matching taints. Generally populated from gke-node-pool via `use` field. |
list(object({
key = string
operator = string
value = string
effect = string
}))
|
[
{
"effect": "NoSchedule",
"key": "user-workload",
"operator": "Equal",
"value": "true"
}
]
| no | ## Outputs diff --git a/modules/compute/gke-job-template/main.tf b/modules/compute/gke-job-template/main.tf index cded3fbb1d..2e21c7c394 100644 --- a/modules/compute/gke-job-template/main.tf +++ b/modules/compute/gke-job-template/main.tf @@ -129,6 +129,7 @@ locals { restart_policy = var.restart_policy backoff_limit = var.backoff_limit tolerations = distinct(var.tolerations) + security_context = var.security_context labels = local.labels empty_dir_volumes = local.empty_dir_volumes diff --git a/modules/compute/gke-job-template/templates/gke-job-base.yaml.tftpl b/modules/compute/gke-job-template/templates/gke-job-base.yaml.tftpl index 61c34f8b25..431a519b9c 100644 --- a/modules/compute/gke-job-template/templates/gke-job-base.yaml.tftpl +++ b/modules/compute/gke-job-template/templates/gke-job-base.yaml.tftpl @@ -18,6 +18,12 @@ spec: gke-gcsfuse/volumes: "true" %{~ endif ~} spec: + %{~ if length(security_context) > 0 ~} + securityContext: + %{~ for context in security_context ~} + ${context.key}: ${context.value} + %{~ endfor ~} + %{~ endif ~} %{~ if k8s_service_account_name != null ~} serviceAccountName: ${k8s_service_account_name} %{~ endif ~} diff --git a/modules/compute/gke-job-template/variables.tf b/modules/compute/gke-job-template/variables.tf index 279293cf26..6a37c344c1 100644 --- a/modules/compute/gke-job-template/variables.tf +++ b/modules/compute/gke-job-template/variables.tf @@ -92,6 +92,15 @@ variable "tolerations" { ] } +variable "security_context" { + description = "The security options the container should be run with. More info: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/" + type = list(object({ + key = string + value = string + })) + default = [] +} + variable "machine_family" { description = "The machine family to use in the node selector (example: `n2`). If null then machine family will not be used as selector criteria." type = string diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 880e1834e4..7359e934ca 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -151,7 +151,7 @@ The following is an example of guest_accelerator: - gpu_partition_size: 1g.5gb gpu_sharing_config: - - gpu_sharing_strategy: TIME_SHARING + gpu_sharing_strategy: TIME_SHARING max_shared_clients_per_gpu: 3 ``` @@ -181,9 +181,9 @@ The following is an example of using a GPU (with sharing config) attached to an - type: nvidia-tesla-t4 count: 2 gpu_driver_installation_config: - - gpu_driver_version: "LATEST" + gpu_driver_version: "LATEST" gpu_sharing_config: - - max_shared_clients_per_gpu: 2 + max_shared_clients_per_gpu: 2 gpu_sharing_strategy: "TIME_SHARING" ``` @@ -223,6 +223,40 @@ Finally, the following is adding multivpc to a node pool: ... ``` +## Using GCE Reservations +You can reserve Google Compute Engine instances in a specific zone to ensure resources are available for their workloads when needed. For more details on how to manage reservations, see [Reserving Compute Engine zonal resources](https://cloud.google.com/compute/docs/instances/reserving-zonal-resources). + +After creating a reservation, you can consume the reserved GCE VM instances in GKE. GKE clusters deployed using Cluster Toolkit support the same consumption modes as Compute Engine: NO_RESERVATION(default), ANY_RESERVATION, SPECIFIC_RESERVATION. 
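As a point of reference (not part of this change), creating a reservation that a node pool can later target by name might look like the sketch below; the reservation name, project, zone, machine type, and VM count are placeholders and should match the node pool's configuration:

```shell
# Illustrative sketch only: create a zonal reservation that must be targeted
# explicitly (consumption type "specific") by workloads that name it.
gcloud compute reservations create specific-reservation-1 \
  --project=<project-id> \
  --zone=us-central1-a \
  --machine-type=n2-standard-4 \
  --vm-count=2 \
  --require-specific-reservation
```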
+ +This can be accomplished using [`reservation_affinity`](https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/modules/compute/gke-node-pool/README.md#input_reservation_affinity). + +```yaml +# Target any reservation +reservation_affinity: + consume_reservation_type: ANY_RESERVATION + +# Target a specific reservation +reservation_affinity: + consume_reservation_type: SPECIFIC_RESERVATION + specific_reservations: + - name: specific-reservation-1 +``` + +The following requirements must be satisfied for the node pool nodes to use a specific reservation: +1. A reservation with that name must exist in the specified project (`var.project_id`) and in one of the specified zones (`var.zones`). +2. Its consumption type must be `specific`. +3. Its GCE VM properties must match those of the node pool: machine type, accelerators (GPU type and count), and Local SSD disk type and count. + +To use a shared reservation, the owner project of the shared reservation must be specified explicitly, as shown below. Note that a shared reservation can be used by the project that hosts the reservation (owner project) and by the projects the reservation is shared with (consumer projects). See how to [create and use a shared reservation](https://cloud.google.com/compute/docs/instances/reservations-shared). + +```yaml +reservation_affinity: + consume_reservation_type: SPECIFIC_RESERVATION + specific_reservations: + - name: specific-reservation-shared + project: shared_reservation_owner_project_id +``` + ## License @@ -244,17 +278,17 @@ limitations under the License. | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.2 | -| [google](#requirement\_google) | ~> 5.0 | -| [google-beta](#requirement\_google-beta) | ~> 5.0 | +| [terraform](#requirement\_terraform) | >= 1.5 | +| [google](#requirement\_google) | > 5 | +| [google-beta](#requirement\_google-beta) | > 5 | | [null](#requirement\_null) | ~> 3.0 | ## Providers | Name | Version | |------|---------| -| [google](#provider\_google) | ~> 5.0 | -| [google-beta](#provider\_google-beta) | ~> 5.0 | +| [google](#provider\_google) | > 5 | +| [google-beta](#provider\_google-beta) | > 5 | | [null](#provider\_null) | ~> 3.0 | ## Modules @@ -288,7 +322,7 @@ limitations under the License. | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | | [enable\_secure\_boot](#input\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. | `bool` | `true` | no | | [gke\_version](#input\_gke\_version) | GKE version | `string` | n/a | yes | -| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | +| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(object({
gpu_driver_version = string
}), { gpu_driver_version = "DEFAULT" })
gpu_partition_size = optional(string)
gpu_sharing_config = optional(object({
gpu_sharing_strategy = string
max_shared_clients_per_gpu = number
}))
}))
| `[]` | no | | [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `""` | no | | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | | [initial\_node\_count](#input\_initial\_node\_count) | The initial number of nodes for the pool. In regional clusters, this is the number of nodes per zone. Changing this setting after node pool creation will not make any effect. It cannot be set with static\_node\_count and must be set to a value between autoscaling\_total\_min\_nodes and autoscaling\_total\_max\_nodes. | `number` | `null` | no | @@ -307,7 +341,7 @@ limitations under the License. | [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | | [spot](#input\_spot) | Provision VMs using discounted Spot pricing, allowing for preemption | `bool` | `false` | no | | [static\_node\_count](#input\_static\_node\_count) | The static number of nodes in the node pool. If set, autoscaling will be disabled. | `number` | `null` | no | -| [taints](#input\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "user-workload",
"value": true
}
]
| no | +| [taints](#input\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
| `[]` | no | | [threads\_per\_core](#input\_threads\_per\_core) | Sets the number of threads per physical core. By setting threads\_per\_core
to 2, Simultaneous Multithreading (SMT) is enabled extending the total number
of virtual cores. For example, a machine of type c2-standard-60 will have 60
virtual cores with threads\_per\_core equal to 2. With threads\_per\_core equal
to 1 (SMT turned off), only the 30 physical cores will be available on the VM.

The default value of \"0\" will turn off SMT for supported machine types, and
will fall back to GCE defaults for unsupported machine types (t2d, shared-core
instances, or instances with less than 2 vCPU).

Disabling SMT can be more performant in many HPC workloads, therefore it is
disabled by default where compatible.

null = SMT configuration will use the GCE defaults for the machine type
0 = SMT will be disabled where compatible (default)
1 = SMT will always be disabled (will fail on incompatible machine types)
2 = SMT will always be enabled (will fail on incompatible machine types) | `number` | `0` | no | | [timeout\_create](#input\_timeout\_create) | Timeout for creating a node pool | `string` | `null` | no | | [timeout\_update](#input\_timeout\_update) | Timeout for updating a node pool | `string` | `null` | no | diff --git a/modules/compute/gke-node-pool/disk_definitions.tf b/modules/compute/gke-node-pool/disk_definitions.tf index f7dbebea0a..b5933bf316 100644 --- a/modules/compute/gke-node-pool/disk_definitions.tf +++ b/modules/compute/gke-node-pool/disk_definitions.tf @@ -22,8 +22,8 @@ locals { local_ssd_machines = { - "a3-highgpu-8g" = { local_ssd_count_ephemeral_storage = 16, local_ssd_count_nvme_block = null }, - "a3-megagpu-8g" = { local_ssd_count_ephemeral_storage = 16, local_ssd_count_nvme_block = null }, + "a3-highgpu-8g" = { local_ssd_count_ephemeral_storage = null, local_ssd_count_nvme_block = 16 }, + "a3-megagpu-8g" = { local_ssd_count_ephemeral_storage = null, local_ssd_count_nvme_block = 16 }, } generated_local_ssd_config = lookup(local.local_ssd_machines, var.machine_type, { local_ssd_count_ephemeral_storage = null, local_ssd_count_nvme_block = null }) diff --git a/modules/compute/gke-node-pool/gpu_definition.tf b/modules/compute/gke-node-pool/gpu_definition.tf index c6c3944332..6c5d96d286 100644 --- a/modules/compute/gke-node-pool/gpu_definition.tf +++ b/modules/compute/gke-node-pool/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) # Select in priority order: # (1) var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [{ count = 0, type = "" }]) + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) } diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index 02518f705a..3e38564988 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -20,8 +20,7 @@ locals { } locals { - preattached_gpu_machine_family = contains(["a2", "a3", "g2"], local.machine_family) - has_gpu = (local.guest_accelerator != null && (length([for ga in local.guest_accelerator : ga if ga.count > 0]) > 0)) || local.preattached_gpu_machine_family + has_gpu = length(local.guest_accelerator) > 0 gpu_taint = local.has_gpu ? [{ key = "nvidia.com/gpu" value = "present" @@ -85,13 +84,31 @@ resource "google_container_node_pool" "node_pool" { image_type = var.image_type dynamic "guest_accelerator" { - for_each = { for idx, ga in local.guest_accelerator : idx => ga if ga.count > 0 } + for_each = local.guest_accelerator + iterator = ga content { - type = coalesce(guest_accelerator.value.type, try(local.generated_guest_accelerator[0].type, "")) - count = coalesce(try(guest_accelerator.value.count, 0) > 0 ? 
guest_accelerator.value.count : try(local.generated_guest_accelerator[0].count, "0")) - gpu_driver_installation_config = coalescelist(try(guest_accelerator.value.gpu_driver_installation_config, []), [{ gpu_driver_version = "DEFAULT" }]) - gpu_partition_size = try(guest_accelerator.value.gpu_partition_size, "") - gpu_sharing_config = try(guest_accelerator.value.gpu_sharing_config, null) + type = coalesce(ga.value.type, try(local.generated_guest_accelerator[0].type, "")) + count = coalesce(try(ga.value.count, 0) > 0 ? ga.value.count : try(local.generated_guest_accelerator[0].count, "0")) + + gpu_partition_size = try(ga.value.gpu_partition_size, null) + + dynamic "gpu_driver_installation_config" { + # in case user did not specify guest_accelerator settings, we need a try to default to [] + for_each = try([ga.value.gpu_driver_installation_config], [{ gpu_driver_version = "DEFAULT" }]) + iterator = gdic + content { + gpu_driver_version = gdic.value.gpu_driver_version + } + } + + dynamic "gpu_sharing_config" { + for_each = try(ga.value.gpu_sharing_config == null, true) ? [] : [ga.value.gpu_sharing_config] + iterator = gsc + content { + gpu_sharing_strategy = gsc.value.gpu_sharing_strategy + max_shared_clients_per_gpu = gsc.value.max_shared_clients_per_gpu + } + } } } @@ -225,9 +242,12 @@ resource "google_container_node_pool" "node_pool" { ) error_message = <<-EOT Check if your reservation is configured correctly: - 1. A reservation with the name must exist in the specified project and one of the specified zones - 2. Its consumption type must be "specific" - 3. Its VM Properties must match with those of the Node Pool; Machine type, Accelerators (GPU Type and count), Local SSD disk type and count + - A reservation with the name must exist in the specified project and one of the specified zones + + - Its consumption type must be "specific" + %{for property in local.specific_reservation_requirement_violations} + - ${local.specific_reservation_requirement_violation_messages[property]} + %{endfor} EOT } } diff --git a/modules/compute/gke-node-pool/reservation_definitions.tf b/modules/compute/gke-node-pool/reservation_definitions.tf index a75246b185..26ab22808f 100644 --- a/modules/compute/gke-node-pool/reservation_definitions.tf +++ b/modules/compute/gke-node-pool/reservation_definitions.tf @@ -55,7 +55,7 @@ locals { }] nodepool_vm_properties = { "machine_type" : var.machine_type - "guest_accelerators" : { for acc in try(local.guest_accelerator, []) : (acc.count > 0 ? coalesce(acc.type, try(local.generated_guest_accelerator[0].type, "")) : "") => acc.count if acc.count > 0 }, + "guest_accelerators" : { for acc in try(local.guest_accelerator, []) : coalesce(acc.type, try(local.generated_guest_accelerator[0].type, "")) => coalesce(acc.count, try(local.generated_guest_accelerator[0].count, 0)) }, "local_ssds" : { "NVME" : coalesce(local.local_ssd_config.local_ssd_count_nvme_block, 0), "SCSI" : coalesce(local.local_ssd_config.local_ssd_count_ephemeral_storage, 0) @@ -66,4 +66,16 @@ locals { # Know that in map comparison the order of keys does not matter. That is {NVME: x, SCSI: y} and {SCSI: y, NVME: x} are equal # As of this writing, there is only one reservation supported by the Node Pool API. So, directly accessing it from the list specific_reservation_requirement_violations = length(local.reservation_vm_properties) == 0 ? 
[] : [for k, v in local.nodepool_vm_properties : k if v != local.reservation_vm_properties[0][k]] + + specific_reservation_requirement_violation_messages = { + "machine_type" : <<-EOT + The reservation has "${try(local.reservation_vm_properties[0].machine_type, "")}" machine type and the node pool has "${local.nodepool_vm_properties.machine_type}". Check the relevant node pool setting: "machine_type" + EOT + "guest_accelerators" : <<-EOT + The reservation has ${jsonencode(try(local.reservation_vm_properties[0].guest_accelerators, {}))} accelerators and the node pool has ${jsonencode(try(local.nodepool_vm_properties.guest_accelerators, {}))}. Check the relevant node pool setting: "guest_accelerator". When unspecified, for the machine_type=${var.machine_type}, the default is guest_accelerator=${jsonencode(try(local.generated_guest_accelerator, [{}]))}. + EOT + "local_ssds" : <<-EOT + The reservation has ${jsonencode(try(local.reservation_vm_properties[0].local_ssds, {}))} local SSDs and the node pool has ${jsonencode(try(local.nodepool_vm_properties.local_ssds, {}))}. Check the relevant node pool settings: {local_ssd_count_ephemeral_storage, local_ssd_count_nvme_block}. When unspecified, for the machine_type=${var.machine_type} the defaults are: {local_ssd_count_ephemeral_storage=${coalesce(local.generated_local_ssd_config.local_ssd_count_ephemeral_storage, 0)}, local_ssd_count_nvme_block=${coalesce(local.generated_local_ssd_config.local_ssd_count_nvme_block, 0)}}. + EOT + } } diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index f5f31abde0..eecc4634c1 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -79,16 +79,32 @@ variable "guest_accelerator" { type = list(object({ type = optional(string) count = optional(number, 0) - gpu_driver_installation_config = optional(list(object({ + gpu_driver_installation_config = optional(object({ gpu_driver_version = string - }))) + }), { gpu_driver_version = "DEFAULT" }) gpu_partition_size = optional(string) - gpu_sharing_config = optional(list(object({ - gpu_sharing_strategy = optional(string) - max_shared_clients_per_gpu = optional(number) - }))) + gpu_sharing_config = optional(object({ + gpu_sharing_strategy = string + max_shared_clients_per_gpu = number + })) })) - default = null + default = [] + nullable = false + + validation { + condition = alltrue([for ga in var.guest_accelerator : ga.count != null]) + error_message = "var.guest_accelerator[*].count cannot be null" + } + + validation { + condition = alltrue([for ga in var.guest_accelerator : ga.count >= 0]) + error_message = "var.guest_accelerator[*].count must never be negative" + } + + validation { + condition = alltrue([for ga in var.guest_accelerator : ga.gpu_driver_installation_config != null]) + error_message = "var.guest_accelerator[*].gpu_driver_installation_config must not be null; leave unset to enable GKE to select default GPU driver installation" + } } variable "image_type" { @@ -230,11 +246,7 @@ variable "taints" { value = any effect = string })) - default = [{ - key = "user-workload" - value = true - effect = "NO_SCHEDULE" - }] + default = [] } variable "labels" { diff --git a/modules/compute/gke-node-pool/versions.tf b/modules/compute/gke-node-pool/versions.tf index 0f4cb13c2f..72fe6c75ba 100644 --- a/modules/compute/gke-node-pool/versions.tf +++ b/modules/compute/gke-node-pool/versions.tf @@ -13,16 +13,16 @@ # limitations under the License. 
terraform { - required_version = ">= 1.2" + required_version = ">= 1.5" required_providers { google = { source = "hashicorp/google" - version = "~> 5.0" + version = "> 5" } google-beta = { source = "hashicorp/google-beta" - version = "~> 5.0" + version = "> 5" } null = { source = "hashicorp/null" @@ -30,6 +30,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.42.0" } } diff --git a/modules/compute/resource-policy/README.md b/modules/compute/resource-policy/README.md index f3f00e3437..85d8baee1c 100644 --- a/modules/compute/resource-policy/README.md +++ b/modules/compute/resource-policy/README.md @@ -43,13 +43,13 @@ limitations under the License. | Name | Version | |------|---------| | [terraform](#requirement\_terraform) | >= 1.3 | -| [google-beta](#requirement\_google-beta) | ~> 5.0 | +| [google-beta](#requirement\_google-beta) | > 4.56.0 | ## Providers | Name | Version | |------|---------| -| [google-beta](#provider\_google-beta) | ~> 5.0 | +| [google-beta](#provider\_google-beta) | > 4.56.0 | ## Modules diff --git a/modules/compute/resource-policy/versions.tf b/modules/compute/resource-policy/versions.tf index 4b7b6158c9..89aea79811 100644 --- a/modules/compute/resource-policy/versions.tf +++ b/modules/compute/resource-policy/versions.tf @@ -18,7 +18,7 @@ terraform { required_providers { google-beta = { source = "hashicorp/google-beta" - version = "~> 5.0" + version = "> 4.56.0" } } diff --git a/modules/compute/vm-instance/README.md b/modules/compute/vm-instance/README.md index 3332be5e6a..149f472d68 100644 --- a/modules/compute/vm-instance/README.md +++ b/modules/compute/vm-instance/README.md @@ -169,16 +169,16 @@ limitations under the License. 
| Name | Version | |------|---------| | [terraform](#requirement\_terraform) | >= 1.3.0 | -| [google](#requirement\_google) | >= 4.73.0, <6.0 | -| [google-beta](#requirement\_google-beta) | >= 4.73.0, <6.0 | +| [google](#requirement\_google) | >= 4.73.0 | +| [google-beta](#requirement\_google-beta) | >= 4.73.0 | | [null](#requirement\_null) | >= 3.0 | ## Providers | Name | Version | |------|---------| -| [google](#provider\_google) | >= 4.73.0, <6.0 | -| [google-beta](#provider\_google-beta) | >= 4.73.0, <6.0 | +| [google](#provider\_google) | >= 4.73.0 | +| [google-beta](#provider\_google-beta) | >= 4.73.0 | | [null](#provider\_null) | >= 3.0 | ## Modules diff --git a/modules/compute/vm-instance/gpu_definition.tf b/modules/compute/vm-instance/gpu_definition.tf index c6c3944332..6c5d96d286 100644 --- a/modules/compute/vm-instance/gpu_definition.tf +++ b/modules/compute/vm-instance/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) # Select in priority order: # (1) var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [{ count = 0, type = "" }]) + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) } diff --git a/modules/compute/vm-instance/main.tf b/modules/compute/vm-instance/main.tf index 01207d701f..c639f075d6 100644 --- a/modules/compute/vm-instance/main.tf +++ b/modules/compute/vm-instance/main.tf @@ -39,7 +39,7 @@ locals { # compact_placement : true when placement policy is provided and collocation set; false if unset compact_placement = try(var.placement_policy.collocation, null) != null - gpu_attached = contains(["a2", "g2"], local.machine_family) || (length([for ga in local.guest_accelerator : ga if ga.count > 0]) > 0) + gpu_attached = contains(["a2", "g2"], local.machine_family) || length(local.guest_accelerator) > 0 # both of these must be false if either compact placement or preemptible/spot instances are used # automatic restart is tolerant of GPUs while on host maintenance is not @@ -239,7 +239,14 @@ resource "google_compute_instance" "compute_vm" { scopes = var.service_account_scopes } - guest_accelerator = local.guest_accelerator + dynamic "guest_accelerator" { + for_each = local.guest_accelerator + content { + count = guest_accelerator.value.count + type = guest_accelerator.value.type + } + } + scheduling { on_host_maintenance = local.on_host_maintenance automatic_restart = local.automatic_restart diff --git a/modules/compute/vm-instance/versions.tf b/modules/compute/vm-instance/versions.tf index 2d35e5c50e..95be3897cd 100644 --- a/modules/compute/vm-instance/versions.tf +++ b/modules/compute/vm-instance/versions.tf @@ -18,12 +18,12 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.73.0, <6.0" + version = ">= 4.73.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.73.0, <6.0" + version = ">= 4.73.0" } null = { source = "hashicorp/null" @@ -31,10 +31,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.41.0" + module_name = 
"blueprints/terraform/hpc-toolkit:vm-instance/v1.42.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.42.0" } required_version = ">= 1.3.0" diff --git a/modules/file-system/filestore/README.md b/modules/file-system/filestore/README.md index e6de03f9bd..193c5f4dc9 100644 --- a/modules/file-system/filestore/README.md +++ b/modules/file-system/filestore/README.md @@ -7,6 +7,32 @@ mounted to one or more compute VMs. For more information on this and other network storage options in the Cluster Toolkit, see the extended [Network Storage documentation](../../../docs/network_storage.md). +### Deletion protection + +We recommend considering enabling [Filestore deletion protection][fdp]. Deletion +protection will prevent unintentional deletion of an entire Filestore instance. +It does not prevent deletion of files within the Filestore instance when mounted +by a VM. It is not available on some [tiers](#filestore-tiers), including the +default BASIC\_HDD tier or BASIC\_SSD tier. Follow the documentation link for +up to date details. + +Usage can be enabled in a blueprint with, for example: + +```yaml + - id: homefs + source: modules/file-system/filestore + use: [network] + settings: + deletion_protection: + enabled: true + reason: Avoid data loss + filestore_tier: ZONAL + local_mount: /home + size_gb: 1024 +``` + +[fdp]: https://cloud.google.com/filestore/docs/deletion-protection + ### Filestore tiers At the time of writing, Filestore supports 5 [tiers of service][tiers] that are @@ -149,14 +175,14 @@ limitations under the License. | Name | Version | |------|---------| | [terraform](#requirement\_terraform) | >= 0.14.0 | -| [google](#requirement\_google) | >= 4.19 | +| [google](#requirement\_google) | >= 6.4 | | [random](#requirement\_random) | ~> 3.0 | ## Providers | Name | Version | |------|---------| -| [google](#provider\_google) | >= 4.19 | +| [google](#provider\_google) | >= 6.4 | | [random](#provider\_random) | ~> 3.0 | ## Modules @@ -175,6 +201,7 @@ No modules. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [connect\_mode](#input\_connect\_mode) | Used to select mode - supported values DIRECT\_PEERING and PRIVATE\_SERVICE\_ACCESS. | `string` | `"DIRECT_PEERING"` | no | +| [deletion\_protection](#input\_deletion\_protection) | Configure Filestore instance deletion protection |
object({
enabled = optional(bool, false)
reason = optional(string)
})
|
{
"enabled": false
}
| no | | [deployment\_name](#input\_deployment\_name) | Name of the HPC deployment, used as name of the filestore instance if no name is specified. | `string` | n/a | yes | | [filestore\_share\_name](#input\_filestore\_share\_name) | Name of the file system share on the instance. | `string` | `"nfsshare"` | no | | [filestore\_tier](#input\_filestore\_tier) | The service tier of the instance. | `string` | `"BASIC_HDD"` | no | diff --git a/modules/file-system/filestore/main.tf b/modules/file-system/filestore/main.tf index 53d24db8a0..8075a7848b 100644 --- a/modules/file-system/filestore/main.tf +++ b/modules/file-system/filestore/main.tf @@ -56,6 +56,9 @@ resource "google_filestore_instance" "filestore_instance" { location = var.filestore_tier == "ENTERPRISE" ? var.region : var.zone tier = var.filestore_tier + deletion_protection_enabled = var.deletion_protection.enabled + deletion_protection_reason = var.deletion_protection.reason + file_shares { capacity_gb = var.size_gb name = var.filestore_share_name diff --git a/modules/file-system/filestore/variables.tf b/modules/file-system/filestore/variables.tf index d48619c741..e3dbf1c9ff 100644 --- a/modules/file-system/filestore/variables.tf +++ b/modules/file-system/filestore/variables.tf @@ -147,3 +147,20 @@ variable "mount_options" { type = string default = "defaults,_netdev" } + +variable "deletion_protection" { + description = "Configure Filestore instance deletion protection" + type = object({ + enabled = optional(bool, false) + reason = optional(string) + }) + default = { + enabled = false + } + nullable = false + + validation { + condition = !can(coalesce(var.deletion_protection.reason)) || var.deletion_protection.enabled + error_message = "Cannot set Filestore var.deletion_protection.reason unless var.deletion_protection.enabled is true" + } +} diff --git a/modules/file-system/filestore/versions.tf b/modules/file-system/filestore/versions.tf index 3454ca00c6..3722efa557 100644 --- a/modules/file-system/filestore/versions.tf +++ b/modules/file-system/filestore/versions.tf @@ -18,7 +18,7 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.19" + version = ">= 6.4" } random = { source = "hashicorp/random" @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.42.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.42.0" } required_version = ">= 0.14.0" diff --git a/modules/file-system/gke-persistent-volume/versions.tf b/modules/file-system/gke-persistent-volume/versions.tf index b87efd8a16..e717d4c42c 100644 --- a/modules/file-system/gke-persistent-volume/versions.tf +++ b/modules/file-system/gke-persistent-volume/versions.tf @@ -29,6 +29,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-persistent-volume/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-persistent-volume/v1.42.0" } } diff --git a/modules/file-system/gke-storage/versions.tf b/modules/file-system/gke-storage/versions.tf index 27f82792ab..ba5b8164f8 100644 --- a/modules/file-system/gke-storage/versions.tf +++ b/modules/file-system/gke-storage/versions.tf @@ -16,6 +16,6 @@ terraform { required_version = ">= 1.0" provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-storage/v1.41.0" + module_name = 
"blueprints/terraform/hpc-toolkit:gke-storage/v1.42.0" } } diff --git a/modules/monitoring/dashboard/versions.tf b/modules/monitoring/dashboard/versions.tf index dbf59fa86f..3615d37090 100644 --- a/modules/monitoring/dashboard/versions.tf +++ b/modules/monitoring/dashboard/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:dashboard/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:dashboard/v1.42.0" } required_version = ">= 0.14.0" diff --git a/modules/network/firewall-rules/main.tf b/modules/network/firewall-rules/main.tf index 322d4c2e7f..f3f24e78d3 100644 --- a/modules/network/firewall-rules/main.tf +++ b/modules/network/firewall-rules/main.tf @@ -20,6 +20,14 @@ data "google_compute_subnetwork" "subnetwork" { self_link = var.subnetwork_self_link } +# Module-level check for Private Google Access on the subnetwork +check "private_google_access_enabled_subnetwork" { + assert { + condition = data.google_compute_subnetwork.subnetwork.private_ip_google_access + error_message = "Private Google Access is disabled for subnetwork '${data.google_compute_subnetwork.subnetwork.name}'. This may cause connectivity issues for instances without external IPs trying to access Google APIs and services." + } +} + module "firewall_rule" { source = "terraform-google-modules/network/google//modules/firewall-rules" version = "~> 9.0" diff --git a/modules/network/firewall-rules/versions.tf b/modules/network/firewall-rules/versions.tf index 5312b04355..dfa3e8f332 100644 --- a/modules/network/firewall-rules/versions.tf +++ b/modules/network/firewall-rules/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:firewall-rules/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:firewall-rules/v1.42.0" } required_version = ">= 1.3" diff --git a/modules/network/multivpc/README.md b/modules/network/multivpc/README.md index 0c0a1811d1..605a6a3603 100644 --- a/modules/network/multivpc/README.md +++ b/modules/network/multivpc/README.md @@ -88,7 +88,7 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| -| [vpcs](#module\_vpcs) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/network/vpc | v1.39.0&depth=1 | +| [vpcs](#module\_vpcs) | ../vpc | n/a | ## Resources diff --git a/modules/network/multivpc/main.tf b/modules/network/multivpc/main.tf index 603ab057d4..3b04195f8a 100644 --- a/modules/network/multivpc/main.tf +++ b/modules/network/multivpc/main.tf @@ -48,7 +48,7 @@ resource "terraform_data" "global_ip_cidr_suffix" { } module "vpcs" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/network/vpc?ref=v1.39.0&depth=1" + source = "../vpc" count = var.network_count diff --git a/modules/network/pre-existing-subnetwork/main.tf b/modules/network/pre-existing-subnetwork/main.tf index 8042f6472a..9fb206f969 100644 --- a/modules/network/pre-existing-subnetwork/main.tf +++ b/modules/network/pre-existing-subnetwork/main.tf @@ -28,3 +28,11 @@ data "google_compute_subnetwork" "primary_subnetwork" { } } } + +# Module-level check for Private Google Access on the subnetwork +check "private_google_access_enabled_subnetwork" { + assert { + condition = data.google_compute_subnetwork.primary_subnetwork.private_ip_google_access + error_message = "Private Google Access is disabled for subnetwork '${data.google_compute_subnetwork.primary_subnetwork.name}'. 
This may cause connectivity issues for instances without external IPs trying to access Google APIs and services." + } +} diff --git a/modules/network/pre-existing-subnetwork/versions.tf index 7a38f30404..e8a3464fa4 100644 --- a/modules/network/pre-existing-subnetwork/versions.tf +++ b/modules/network/pre-existing-subnetwork/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:pre-existing-subnetwork/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:pre-existing-subnetwork/v1.42.0" } required_version = ">= 0.14.0" diff --git a/modules/network/pre-existing-vpc/README.md index b1257a4097..ecd4584ef5 100644 --- a/modules/network/pre-existing-vpc/README.md +++ b/modules/network/pre-existing-vpc/README.md @@ -35,6 +35,17 @@ VM will be created. > **_NOTE:_** The `project_id` and `region` settings would be inferred from the > deployment variables of the same name, but they are included here for clarity. +### Use shared-vpc + +If a network is created in a different project, this module can be used to +reference that network. To use a network from a different project, first make sure +you have a [Cloud NAT][cloudnat] and [IAP][iap] TCP forwarding configured. For more details, +refer to [shared-vpc][shared-vpc-doc]. + +[cloudnat]: https://cloud.google.com/nat/docs/overview +[iap]: https://cloud.google.com/iap/docs/using-tcp-forwarding +[shared-vpc-doc]: ../../../examples/README.md#hpc-slurm-sharedvpcyaml-community-badge-experimental-badge + ## License diff --git a/modules/network/pre-existing-vpc/main.tf index 88ccd5f93d..ed332bab72 100644 --- a/modules/network/pre-existing-vpc/main.tf +++ b/modules/network/pre-existing-vpc/main.tf @@ -43,3 +43,11 @@ data "google_compute_subnetwork" "primary_subnetwork" { } } } + +# Module-level check for Private Google Access on the subnetwork +check "private_google_access_enabled_subnetwork" { + assert { + condition = data.google_compute_subnetwork.primary_subnetwork.private_ip_google_access + error_message = "Private Google Access is disabled for subnetwork '${data.google_compute_subnetwork.primary_subnetwork.name}'. This may cause connectivity issues for instances without external IPs trying to access Google APIs and services." + } +} diff --git a/modules/network/pre-existing-vpc/versions.tf index c9f1ec5992..0585447957 100644 --- a/modules/network/pre-existing-vpc/versions.tf +++ b/modules/network/pre-existing-vpc/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:pre-existing-vpc/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:pre-existing-vpc/v1.42.0" } required_version = ">= 0.14.0" diff --git a/modules/network/vpc/README.md index 215c13a4f2..97e28d548a 100644 --- a/modules/network/vpc/README.md +++ b/modules/network/vpc/README.md @@ -172,7 +172,7 @@ No providers.
| Name | Source | Version | |------|--------|---------| | [cloud\_router](#module\_cloud\_router) | terraform-google-modules/cloud-router/google | ~> 6.0 | -| [nat\_ip\_addresses](#module\_nat\_ip\_addresses) | terraform-google-modules/address/google | ~> 3.1 | +| [nat\_ip\_addresses](#module\_nat\_ip\_addresses) | terraform-google-modules/address/google | ~> 4.1 | | [vpc](#module\_vpc) | terraform-google-modules/network/google | ~> 9.0 | ## Resources @@ -196,6 +196,7 @@ No resources. | [firewall\_log\_config](#input\_firewall\_log\_config) | Firewall log configuration for Toolkit firewall rules (var.enable\_iap\_ssh\_ingress and others) | `string` | `"DISABLE_LOGGING"` | no | | [firewall\_rules](#input\_firewall\_rules) | List of firewall rules | `any` | `[]` | no | | [ips\_per\_nat](#input\_ips\_per\_nat) | The number of IP addresses to allocate for each regional Cloud NAT (set to 0 to disable NAT) | `number` | `2` | no | +| [labels](#input\_labels) | Labels to add to network resources that support labels. Key-value pairs of strings. | `map(string)` | `{}` | no | | [mtu](#input\_mtu) | The network MTU (default: 8896). Recommended values: 0 (use Compute Engine default), 1460 (default outside HPC environments), 1500 (Internet default), or 8896 (for Jumbo packets). Allowed are all values in the range 1300 to 8896, inclusively. | `number` | `8896` | no | | [network\_address\_range](#input\_network\_address\_range) | IP address range (CIDR) for global network | `string` | `"10.0.0.0/9"` | no | | [network\_description](#input\_network\_description) | An optional description of this resource (changes will trigger resource destroy/create) | `string` | `""` | no | diff --git a/modules/network/vpc/main.tf b/modules/network/vpc/main.tf index 76de7a3f57..3c1ceff0d2 100644 --- a/modules/network/vpc/main.tf +++ b/modules/network/vpc/main.tf @@ -14,6 +14,11 @@ * limitations under the License. */ +locals { + # This label allows for billing report tracking based on module. + labels = merge(var.labels, { ghpc_module = "vpc", ghpc_role = "network" }) +} + locals { autoname = replace(var.deployment_name, "_", "-") network_name = var.network_name == null ? "${local.autoname}-net" : var.network_name @@ -175,7 +180,7 @@ module "vpc" { # https://github.com/terraform-google-modules/terraform-google-address/blob/v3.1.1/outputs.tf module "nat_ip_addresses" { source = "terraform-google-modules/address/google" - version = "~> 3.1" + version = "~> 4.1" for_each = toset(local.regions) @@ -184,6 +189,7 @@ module "nat_ip_addresses" { # an external, regional (not global) IP address is suited for a regional NAT address_type = "EXTERNAL" global = false + labels = local.labels names = [for idx in range(var.ips_per_nat) : "${local.network_name}-nat-ips-${each.value}-${idx}"] } diff --git a/modules/network/vpc/variables.tf b/modules/network/vpc/variables.tf index 996c5b7273..12495b6770 100644 --- a/modules/network/vpc/variables.tf +++ b/modules/network/vpc/variables.tf @@ -19,6 +19,13 @@ variable "project_id" { type = string } +variable "labels" { + description = "Labels to add to network resources that support labels. Key-value pairs of strings." 
+ type = map(string) + default = {} + nullable = false +} + variable "network_name" { description = "The name of the network to be created (if unsupplied, will default to \"{deployment_name}-net\")" type = string diff --git a/modules/packer/custom-image/README.md b/modules/packer/custom-image/README.md index 873d3b993b..2e28ae66c0 100644 --- a/modules/packer/custom-image/README.md +++ b/modules/packer/custom-image/README.md @@ -36,7 +36,7 @@ This can be achieved by one of the following 2 approaches: 1. Using a public IP address on the VM -- Set [var.omit_external_ip](#input_omit_external_ip) to `true` +- Set [var.omit_external_ip](#input_omit_external_ip) to `false` 1. Configuring a VPC with a Cloud NAT in the region of the VM @@ -210,24 +210,14 @@ to the console. For example: ==> example.googlecompute.toolkit_image: Startup script, if any, has finished running. ``` -Using the default value for \[var.scopes\]\[#input_scopes\], the output of -startup script execution will be stored in Cloud Logging. It can be examined -using the [Cloud Logging Console][logging-console] or with a -[gcloud logging read][logging-read-docs] command (substituting `<>` -with your project ID): +### Debugging startup-script failures -```shell -$ gcloud logging --project <> read \ - 'logName="projects/<>/logs/GCEMetadataScripts" AND jsonPayload.message=~"^startup-script: "' \ - --format="table[box](timestamp, resource.labels.instance_id, jsonPayload.message)" --freshness 2h -``` +> [!NOTE] +> There can be a delay in the propagation of the logs from the instance to +> Cloud Logging, so it may require waiting a few minutes to see the full logs. -Note that this command will print **all** startup script entries within the -project within the "freshness" window **in reverse order**. You may need to -identify the instance ID of the Packer VM and filter further by that value using -`gcloud` or `grep`. To print the entries in the order they would have appeared -on your console, we recommend piping the output of this command to the standard -Linux utility `tac`. +If the Packer image build fails, the module will output a `gcloud` command +that can be used directly to review startup-script execution. ## License diff --git a/modules/packer/custom-image/image.pkr.hcl b/modules/packer/custom-image/image.pkr.hcl index e4f30dfb58..9282cf7433 100644 --- a/modules/packer/custom-image/image.pkr.hcl +++ b/modules/packer/custom-image/image.pkr.hcl @@ -21,6 +21,9 @@ locals { image_name_default = "${local.image_family}-${formatdate("YYYYMMDD't'hhmmss'z'", timestamp())}" image_name = var.image_name != null ? var.image_name : local.image_name_default + # construct vm image name for use when getting logs + instance_name = "packer-${substr(uuidv4(), 0, 6)}" + # default to explicit var.communicator, otherwise in-order: ssh/winrm/none shell_script_communicator = length(var.shell_scripts) > 0 ? "ssh" : "" ansible_playbook_communicator = length(var.ansible_playbooks) > 0 ? 
"ssh" : "" @@ -96,6 +99,7 @@ source "googlecompute" "toolkit_image" { image_name = local.image_name image_family = local.image_family image_labels = local.labels + instance_name = local.instance_name machine_type = var.machine_type accelerator_type = local.accelerator_type accelerator_count = var.accelerator_count @@ -189,12 +193,24 @@ build { } } - # if the jq command is present, this will print the image name to stdout - # if jq is not present, this exits silently with code 0 - post-processor "shell-local" { + # If there is an error during image creation, print out command for getting packer VM logs + error-cleanup-provisioner "shell-local" { + environment_vars = [ + "PRJ_ID=${var.project_id}", + "INST_NAME=${local.instance_name}", + "ZONE=${var.zone}", + ] + inline_shebang = "/bin/bash -e" inline = [ - "command -v jq > /dev/null || exit 0", - "echo \"Image built: $(jq -r '.builds[-1].artifact_id' ${var.manifest_file} | cut -d ':' -f2)\"", + "type -P gcloud > /dev/null || exit 0", + "INST_ID=$(gcloud compute instances describe $INST_NAME --project $PRJ_ID --format=\"value(id)\" --zone=$ZONE)", + "echo 'Error building image try checking logs:'", + join(" ", ["echo \"gcloud logging --project $PRJ_ID read", + "'logName=(\\\"projects/$PRJ_ID/logs/GCEMetadataScripts\\\" OR \\\"projects/$PRJ_ID/logs/google_metadata_script_runner\\\") AND resource.labels.instance_id=$INST_ID'", + "--format=\\\"table(timestamp, resource.labels.instance_id, jsonPayload.message)\\\"", + "--order=asc\"" + ] + ) ] } } diff --git a/modules/scheduler/batch-job-template/README.md b/modules/scheduler/batch-job-template/README.md index a3a0b176b6..d4068a93c3 100644 --- a/modules/scheduler/batch-job-template/README.md +++ b/modules/scheduler/batch-job-template/README.md @@ -140,7 +140,7 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| -| [instance\_template](#module\_instance\_template) | terraform-google-modules/vm/google//modules/instance_template | ~> 10.1.1 | +| [instance\_template](#module\_instance\_template) | terraform-google-modules/vm/google//modules/instance_template | ~> 12.1 | | [netstorage\_startup\_script](#module\_netstorage\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.39.0 | ## Resources diff --git a/modules/scheduler/batch-job-template/main.tf b/modules/scheduler/batch-job-template/main.tf index bb378ea7b1..0d681536c9 100644 --- a/modules/scheduler/batch-job-template/main.tf +++ b/modules/scheduler/batch-job-template/main.tf @@ -90,7 +90,7 @@ locals { module "instance_template" { source = "terraform-google-modules/vm/google//modules/instance_template" - version = "~> 10.1.1" + version = "~> 12.1" name_prefix = var.instance_template == null ? "${var.job_id}-instance-template" : "unused-template" project_id = var.project_id diff --git a/modules/scheduler/batch-login-node/README.md b/modules/scheduler/batch-login-node/README.md index ef76446250..c20ca7dbeb 100644 --- a/modules/scheduler/batch-login-node/README.md +++ b/modules/scheduler/batch-login-node/README.md @@ -89,7 +89,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [login\_startup\_script](#module\_login\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.39.0&depth=1 | +| [login\_startup\_script](#module\_login\_startup\_script) | ../../scripts/startup-script | n/a | ## Resources diff --git a/modules/scheduler/batch-login-node/main.tf b/modules/scheduler/batch-login-node/main.tf index b5eb8bc7dd..6f539af122 100644 --- a/modules/scheduler/batch-login-node/main.tf +++ b/modules/scheduler/batch-login-node/main.tf @@ -94,7 +94,7 @@ locals { } module "login_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.39.0&depth=1" + source = "../../scripts/startup-script" labels = local.labels project_id = var.project_id deployment_name = var.deployment_name diff --git a/modules/scheduler/batch-login-node/versions.tf b/modules/scheduler/batch-login-node/versions.tf index 599294a84e..9600f1ad02 100644 --- a/modules/scheduler/batch-login-node/versions.tf +++ b/modules/scheduler/batch-login-node/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:batch-login-node/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:batch-login-node/v1.42.0" } required_version = ">= 0.14.0" diff --git a/modules/scheduler/gke-cluster/README.md b/modules/scheduler/gke-cluster/README.md index 56b6236066..637a73ab1e 100644 --- a/modules/scheduler/gke-cluster/README.md +++ b/modules/scheduler/gke-cluster/README.md @@ -125,7 +125,7 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| | [kubectl\_apply](#module\_kubectl\_apply) | ../../management/kubectl-apply | n/a | -| [workload\_identity](#module\_workload\_identity) | terraform-google-modules/kubernetes-engine/google//modules/workload-identity | 29.0.0 | +| [workload\_identity](#module\_workload\_identity) | terraform-google-modules/kubernetes-engine/google//modules/workload-identity | ~> 34.0 | ## Resources diff --git a/modules/scheduler/gke-cluster/main.tf b/modules/scheduler/gke-cluster/main.tf index b57fec89cd..c7aaf52eef 100644 --- a/modules/scheduler/gke-cluster/main.tf +++ b/modules/scheduler/gke-cluster/main.tf @@ -290,7 +290,7 @@ provider "kubernetes" { module "workload_identity" { count = var.configure_workload_identity_sa ? 
1 : 0 source = "terraform-google-modules/kubernetes-engine/google//modules/workload-identity" - version = "29.0.0" + version = "~> 34.0" use_existing_gcp_sa = true name = "workload-identity-k8-sa" diff --git a/modules/scheduler/gke-cluster/versions.tf b/modules/scheduler/gke-cluster/versions.tf index 67c30a9e84..a8d1ecfd89 100644 --- a/modules/scheduler/gke-cluster/versions.tf +++ b/modules/scheduler/gke-cluster/versions.tf @@ -34,6 +34,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-cluster/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-cluster/v1.42.0" } } diff --git a/modules/scheduler/pre-existing-gke-cluster/versions.tf b/modules/scheduler/pre-existing-gke-cluster/versions.tf index 328bdda8e1..3b15bab237 100644 --- a/modules/scheduler/pre-existing-gke-cluster/versions.tf +++ b/modules/scheduler/pre-existing-gke-cluster/versions.tf @@ -23,7 +23,7 @@ terraform { } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:pre-existing-gke-cluster/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:pre-existing-gke-cluster/v1.42.0" } required_version = ">= 1.3" diff --git a/modules/scripts/startup-script/README.md b/modules/scripts/startup-script/README.md index 189e627732..3cbaedb363 100644 --- a/modules/scripts/startup-script/README.md +++ b/modules/scripts/startup-script/README.md @@ -318,13 +318,14 @@ No modules. | [configure\_ssh\_host\_patterns](#input\_configure\_ssh\_host\_patterns) | If specified, it will automate ssh configuration by:
- Defining a Host block for every element of this variable and setting StrictHostKeyChecking to 'No'.
Ex: "hpc*", "hpc01*", "ml*"
- The first time users log-in, it will create ssh keys that are added to the authorized keys list
This requires a shared /home filesystem and relies on specifying the right prefix. | `list(string)` | `[]` | no | | [debug\_file](#input\_debug\_file) | Path to an optional local to be written with 'startup\_script'. | `string` | `null` | no | | [deployment\_name](#input\_deployment\_name) | Name of the HPC deployment, used to name GCS bucket for startup scripts. | `string` | n/a | yes | -| [enable\_docker\_world\_writable](#input\_enable\_docker\_world\_writable) | Configure Docker daemon to be writable by all users (if var.install\_docker is set to true). | `bool` | `false` | no | +| [docker](#input\_docker) | Install and configure Docker |
object({
enabled = optional(bool, false)
world_writable = optional(bool, false)
daemon_config = optional(string, "")
})
|
{
"enabled": false
}
| no | +| [enable\_docker\_world\_writable](#input\_enable\_docker\_world\_writable) | DEPRECATED: use var.docker | `bool` | `null` | no | | [gcs\_bucket\_path](#input\_gcs\_bucket\_path) | The GCS path for storage bucket and the object, starting with `gs://`. | `string` | `null` | no | | [http\_no\_proxy](#input\_http\_no\_proxy) | Domains for which to disable http\_proxy behavior. Honored only if var.http\_proxy is set | `string` | `".google.com,.googleapis.com,metadata.google.internal,localhost,127.0.0.1"` | no | | [http\_proxy](#input\_http\_proxy) | Web (http and https) proxy configuration for pip, apt, and yum/dnf and interactive shells | `string` | `""` | no | | [install\_ansible](#input\_install\_ansible) | Run Ansible installation script if either set to true or unset and runner of type 'ansible-local' are used. | `bool` | `null` | no | | [install\_cloud\_ops\_agent](#input\_install\_cloud\_ops\_agent) | Warning: Consider using `install_stackdriver_agent` for better performance. Run Google Ops Agent installation script if set to true. | `bool` | `false` | no | -| [install\_docker](#input\_install\_docker) | Install Docker command line tool and daemon. | `bool` | `false` | no | +| [install\_docker](#input\_install\_docker) | DEPRECATED: use var.docker. | `bool` | `null` | no | | [install\_stackdriver\_agent](#input\_install\_stackdriver\_agent) | Run Google Stackdriver Agent installation script if set to true. Preferred over ops agent for performance. | `bool` | `false` | no | | [labels](#input\_labels) | Labels for the created GCS bucket. Key-value pairs. | `map(string)` | n/a | yes | | [local\_ssd\_filesystem](#input\_local\_ssd\_filesystem) | Create and mount a filesystem from local SSD disks (data will be lost if VMs are powered down without enabling migration); enable by setting mountpoint field to a valid directory path. |
object({
fs_type = optional(string, "ext4")
mountpoint = optional(string, "")
permissions = optional(string, "0755")
})
|
{
"fs_type": "ext4",
"mountpoint": "",
"permissions": "0755"
}
| no | diff --git a/modules/scripts/startup-script/files/install_docker.yml b/modules/scripts/startup-script/files/install_docker.yml index 61a74ea0df..c169b62cd9 100644 --- a/modules/scripts/startup-script/files/install_docker.yml +++ b/modules/scripts/startup-script/files/install_docker.yml @@ -17,6 +17,8 @@ hosts: all become: true vars: + docker_data_root: '' + docker_daemon_config: '' enable_docker_world_writable: false tasks: - name: Check if docker is installed @@ -29,25 +31,54 @@ dest: /tmp/get-docker.sh owner: root group: root - mode: 0644 + mode: '0644' when: not docker_binary.stat.exists - name: Install Docker ansible.builtin.command: sh /tmp/get-docker.sh register: docker_installed changed_when: docker_installed.rc != 0 when: not docker_binary.stat.exists + - name: Create Docker daemon configuration + ansible.builtin.copy: + dest: /etc/docker/daemon.json + mode: '0644' + content: '{{ docker_daemon_config }}' + # validate flag requires Docker server version 23.0 and above + # can add this back after private A3 DLVM image is deprecated + # this image comes with Docker version 20.10.17 + # validate: /usr/bin/dockerd --validate --config-file %s + when: docker_daemon_config + notify: + - Restart Docker + - name: Create Docker service override directory + ansible.builtin.file: + path: /etc/systemd/system/docker.service.d + state: directory + owner: root + group: root + mode: '0755' + - name: Create Docker service override configuration + ansible.builtin.copy: + dest: /etc/systemd/system/docker.service.d/data-root.conf + mode: '0644' + content: | + [Unit] + {% if docker_data_root %} + RequiresMountsFor={{ docker_data_root }} + {% endif %} + After=create-localssd-raid.service - name: Create Docker socket override directory ansible.builtin.file: path: /etc/systemd/system/docker.socket.d state: directory owner: root group: root - mode: 0755 + mode: '0755' when: enable_docker_world_writable - name: Create Docker socket override configuration ansible.builtin.copy: dest: /etc/systemd/system/docker.socket.d/world-writable.conf - mode: 0644 + mode: '0644' content: | [Socket] SocketMode=0666 @@ -72,10 +103,14 @@ ansible.builtin.service: name: docker.socket state: restarted + - name: Restart Docker + ansible.builtin.service: + name: docker.service + state: restarted post_tasks: - name: Start Docker ansible.builtin.service: - name: docker + name: docker.service state: started enabled: true diff --git a/modules/scripts/startup-script/files/setup-raid.yml b/modules/scripts/startup-script/files/setup-raid.yml index d7590069a8..5ebf35e522 100644 --- a/modules/scripts/startup-script/files/setup-raid.yml +++ b/modules/scripts/startup-script/files/setup-raid.yml @@ -53,10 +53,11 @@ [Unit] After=local-fs.target Before=slurmd.service - ConditionPathIsMountPoint=!{{ mountpoint }} + ConditionPathExists=!{{ array_dev }} [Service] Type=oneshot + RemainAfterExit=yes ExecStart=/usr/bin/bash -c "/usr/sbin/mdadm --create {{ array_dev }} --name={{ raid_name }} --homehost=any --level=0 --raid-devices={{ local_ssd_devices.files | length }} /dev/disk/by-id/google-local-nvme-ssd-*{{ " --force" if local_ssd_devices.files | length == 1 else "" }}" ExecStartPost=/usr/sbin/mkfs -t {{ fstype }}{{ " -m 0" if fstype == "ext4" else "" }} {{ array_dev }} @@ -70,19 +71,30 @@ enabled: true daemon_reload: true - - name: Mount RAID array - ansible.posix.mount: - src: "{{ array_dev }}" - path: "{{ mountpoint }}" - fstype: "{{ fstype }}" - # the nofail option is critical as it enables the boot process to complete on machines - # 
that were powered off and had local SSD contents discarded; without this option - # VMs may fail to join the network - opts: discard,defaults,nofail - state: mounted + - name: Install service to mount local SSD array + ansible.builtin.copy: + dest: /etc/systemd/system/mount-localssd-raid.service + mode: 0644 + content: | + [Unit] + After=local-fs.target create-localssd-raid.service + Before=slurmd.service + Wants=create-localssd-raid.service + ConditionPathIsMountPoint=!{{ mountpoint }} - - name: Set mount permissions - ansible.builtin.file: - path: "{{ mountpoint }}" - state: directory - mode: "{{ mode }}" + [Service] + Type=oneshot + RemainAfterExit=yes + ExecStart=/usr/bin/systemd-mount -t {{ fstype }} -o discard,defaults,nofail {{ array_dev }} {{ mountpoint }} + ExecStartPost=/usr/bin/chmod {{ mode }} {{ mountpoint }} + ExecStop=/usr/bin/systemd-umount {{ mountpoint }} + + [Install] + WantedBy=slurmd.service + + - name: Mount RAID array and set permissions + ansible.builtin.systemd: + name: mount-localssd-raid.service + state: started + enabled: true + daemon_reload: true diff --git a/modules/scripts/startup-script/main.tf b/modules/scripts/startup-script/main.tf index 8a6c1dd6cb..3d41bf262f 100644 --- a/modules/scripts/startup-script/main.tf +++ b/modules/scripts/startup-script/main.tf @@ -44,7 +44,11 @@ locals { host_name_prefix = var.configure_ssh_host_patterns } - prefix_file = "/tmp/prefix_file.json" + prefix_file = "/tmp/prefix_file.json" + ansible_docker_settings_file = "/tmp/ansible_docker_settings.json" + + docker_config = try(jsondecode(var.docker.daemon_config), {}) + docker_data_root = try(local.docker_config.data-root, null) configure_ssh_runners = local.configure_ssh ? [ { @@ -89,12 +93,21 @@ locals { } ] - docker_runner = !var.install_docker ? [] : [ + docker_runner = !var.docker.enabled ? [] : [ + { + type = "data" + destination = local.ansible_docker_settings_file + content = jsonencode({ + enable_docker_world_writable = var.docker.world_writable + docker_daemon_config = var.docker.daemon_config + docker_data_root = local.docker_data_root + }) + }, { type = "ansible-local" destination = "install_docker.yml" content = file("${path.module}/files/install_docker.yml") - args = "-e enable_docker_world_writable=${var.enable_docker_world_writable}" + args = "-e \"@${local.ansible_docker_settings_file}\"" }, ] @@ -113,7 +126,7 @@ locals { ] supplied_ansible_runners = anytrue([for r in var.runners : r.type == "ansible-local"]) - has_ansible_runners = anytrue([local.supplied_ansible_runners, local.configure_ssh, var.install_docker, local.local_ssd_filesystem_enabled]) + has_ansible_runners = anytrue([local.supplied_ansible_runners, local.configure_ssh, var.docker.enabled, local.local_ssd_filesystem_enabled]) install_ansible = coalesce(var.install_ansible, local.has_ansible_runners) ansible_installer = local.install_ansible ? [{ type = "shell" @@ -134,9 +147,9 @@ locals { local.proxy_runner, local.monitoring_agent_installer, local.ansible_installer, + local.raid_setup, # order RAID early to ensure filesystem is ready for subsequent runners local.configure_ssh_runners, local.docker_runner, - local.raid_setup, var.runners ) @@ -188,6 +201,19 @@ locals { } } +check "health_check" { + assert { + condition = local.docker_config == {} + error_message = <<-EOT + This message is only a warning. The Toolkit performs no validation of the + Docker daemon configuration. VM startup scripts will fail if the file is not + a valid Docker JSON configuration. 
Please review the Docker documentation: + + https://docs.docker.com/engine/daemon/ + EOT + } +} + resource "random_id" "resource_name_suffix" { byte_length = 4 } @@ -225,10 +251,6 @@ resource "google_storage_bucket_object" "scripts" { condition = !(var.install_cloud_ops_agent && var.install_stackdriver_agent) error_message = "Only one of var.install_stackdriver_agent or var.install_cloud_ops_agent can be set. Stackdriver is recommended for best performance." } - precondition { - condition = !var.enable_docker_world_writable || var.install_docker - error_message = "If var.enable_docker_world_writable is set to true, var.install_docker must also be set to true." - } } } diff --git a/modules/scripts/startup-script/variables.tf b/modules/scripts/startup-script/variables.tf index 3975a69614..026f0d624d 100644 --- a/modules/scripts/startup-script/variables.tf +++ b/modules/scripts/startup-script/variables.tf @@ -112,18 +112,56 @@ EOT default = [] } +variable "docker" { + description = "Install and configure Docker" + type = object({ + enabled = optional(bool, false) + world_writable = optional(bool, false) + daemon_config = optional(string, "") + }) + default = { + enabled = false + } + + validation { + condition = !coalesce(var.docker.world_writable) || var.docker.enabled + error_message = "var.docker.world_writable should only be set if var.docker.enabled is set to true" + } + + validation { + condition = !can(coalesce(var.docker.daemon_config)) || var.docker.enabled + error_message = "var.docker.daemon_config should only be set if var.docker.enabled is set to true" + } + + validation { + condition = !can(coalesce(var.docker.daemon_config)) || can(jsondecode(var.docker.daemon_config)) + error_message = "var.docker.daemon_config should be set to a valid Docker daemon JSON configuration" + } + +} + +# tflint-ignore: terraform_unused_declarations variable "enable_docker_world_writable" { - description = "Configure Docker daemon to be writable by all users (if var.install_docker is set to true)." + description = "DEPRECATED: use var.docker" type = bool - default = false - nullable = false + default = null + + validation { + condition = var.enable_docker_world_writable == null + error_message = "The variable enable_docker_world_writable has been removed. Use var.docker instead" + } } +# tflint-ignore: terraform_unused_declarations variable "install_docker" { - description = "Install Docker command line tool and daemon." + description = "DEPRECATED: use var.docker." type = bool - default = false - nullable = false + default = null + + validation { + condition = var.install_docker == null + error_message = "The variable install_docker has been removed. 
Use var.docker instead" + } } variable "local_ssd_filesystem" { diff --git a/modules/scripts/startup-script/versions.tf b/modules/scripts/startup-script/versions.tf index c954c7e6fa..826cf6f810 100644 --- a/modules/scripts/startup-script/versions.tf +++ b/modules/scripts/startup-script/versions.tf @@ -30,7 +30,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:startup-script/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:startup-script/v1.42.0" } required_version = ">= 1.3" diff --git a/pkg/config/expand.go b/pkg/config/expand.go index b79babfbe5..9bad4dd2d1 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -199,11 +199,11 @@ func getDefaultGoogleProviders(bp Blueprint) map[string]TerraformProvider { return map[string]TerraformProvider{ "google": { Source: "hashicorp/google", - Version: ">= 4.84.0, < 6.8.0", + Version: "~> 6.10.0", Configuration: gglConf}, "google-beta": { Source: "hashicorp/google-beta", - Version: ">= 4.84.0, < 6.8.0", + Version: "~> 6.10.0", Configuration: gglConf}} } diff --git a/pkg/config/expand_test.go b/pkg/config/expand_test.go index 5abdd6620d..ad00218133 100644 --- a/pkg/config/expand_test.go +++ b/pkg/config/expand_test.go @@ -93,10 +93,10 @@ func (s *zeroSuite) TestExpandProviders(c *C) { c.Check(g.TerraformProviders, DeepEquals, map[string]PR{ "google": TerraformProvider{ Source: "hashicorp/google", - Version: ">= 4.84.0, < 6.8.0"}, + Version: "~> 6.10.0"}, "google-beta": TerraformProvider{ Source: "hashicorp/google-beta", - Version: ">= 4.84.0, < 6.8.0"}}) + Version: "~> 6.10.0"}}) } { // no def PR, group PR diff --git a/pkg/inspect/modules_test.go b/pkg/inspect/modules_test.go index 5ecac1b11c..03e9d570b7 100644 --- a/pkg/inspect/modules_test.go +++ b/pkg/inspect/modules_test.go @@ -234,7 +234,7 @@ func TestOutputForbiddenNames(t *testing.T) { nowhere := []string{} allowed := map[string][]string{ // Global blueprint variables we don't want to get overwritten. 
- "project_id": {"community/modules/project/new-project"}, + "project_id": nowhere, "labels": nowhere, "region": nowhere, "zone": nowhere, diff --git a/pkg/modulereader/metadata_legacy.go b/pkg/modulereader/metadata_legacy.go index 1c55ddf3fc..7f4e22c1ec 100644 --- a/pkg/modulereader/metadata_legacy.go +++ b/pkg/modulereader/metadata_legacy.go @@ -62,20 +62,9 @@ func defaultAPIList(source string) []string { "iam.googleapis.com", "runtimeconfig.googleapis.com", }, - "community/modules/file-system/Intel-DAOS": { - "compute.googleapis.com", - "iam.googleapis.com", - "secretmanager.googleapis.com", - }, "community/modules/file-system/nfs-server": { "compute.googleapis.com", }, - "community/modules/project/new-project": { - "admin.googleapis.com", - "cloudresourcemanager.googleapis.com", - "cloudbilling.googleapis.com", - "iam.googleapis.com", - }, "community/modules/project/service-account": { "iam.googleapis.com", }, diff --git a/pkg/modulereader/resreader.go b/pkg/modulereader/resreader.go index 220525c4f8..ba6aed0e16 100644 --- a/pkg/modulereader/resreader.go +++ b/pkg/modulereader/resreader.go @@ -135,6 +135,9 @@ func GetModuleInfo(source string, kind string) (ModuleInfo, error) { switch { case sourcereader.IsEmbeddedPath(source) || sourcereader.IsLocalPath(source): modPath = source + if sourcereader.IsLocalPath(source) && sourcereader.LocalModuleIsEmbedded(source) { + return ModuleInfo{}, fmt.Errorf("using embedded modules with local paths is no longer supported; use embedded path and rebuild gcluster binary") + } default: pkgAddr, subDir := getter.SourceDirSubdir(source) if cachedModPath, ok := modDownloadCache[pkgAddr]; ok { diff --git a/pkg/sourcereader/embedded.go b/pkg/sourcereader/embedded.go index 5410f4ca76..3e9afd8a61 100644 --- a/pkg/sourcereader/embedded.go +++ b/pkg/sourcereader/embedded.go @@ -20,6 +20,7 @@ import ( "os" "path" "path/filepath" + "strings" ) // ModuleFS contains embedded modules (./modules) for use in building @@ -53,6 +54,31 @@ func copyFileOut(bfs BaseFS, src string, dst string) error { return nil } +func LocalModuleIsEmbedded(source string) bool { + if ModuleFS == nil { + return false + } + + if !IsLocalPath(source) { + return false + } + + pathBits := strings.SplitN(filepath.Clean(source), string(os.PathSeparator), 5) + lengthPath := len(pathBits) + if lengthPath < 3 { + return false + } + + for i := 3; i <= lengthPath; i++ { + lastBits := filepath.Join(pathBits[lengthPath-i:]...) 
+ _, err := ModuleFS.ReadDir(lastBits) + if err == nil { + return true + } + } + return false +} + // copyDir copies an FS directory to a local path func copyDir(bfs BaseFS, source string, dest string) error { dirEntries, err := bfs.ReadDir(source) diff --git a/pkg/sourcereader/embedded_test.go b/pkg/sourcereader/embedded_test.go index 8ee22ce125..a930d65127 100644 --- a/pkg/sourcereader/embedded_test.go +++ b/pkg/sourcereader/embedded_test.go @@ -98,6 +98,28 @@ func (s *embeddedSuite) TestGetModule_Embedded(c *C) { c.Assert(err, ErrorMatches, "source is not valid: .*") } +func (s *embeddedSuite) TestLocalModuleIsEmbedded(c *C) { + { // Invalid: Cannot use embedded modules locally + found := LocalModuleIsEmbedded("./modules/network/vpc") + c.Check(found, Equals, true) + } + + { // Invalid: Cannot use embedded modules locally + found := LocalModuleIsEmbedded("../hpc-toolkit/modules/compute/../network/vpc") + c.Check(found, Equals, true) + } + + { // Valid: use non-embedded modules locally + found := LocalModuleIsEmbedded("../hpc-toolkit/modules/compute/../foo/bar") + c.Check(found, Equals, false) + } + + { // Invalid: must be a local path + found := LocalModuleIsEmbedded("modules/network/vpc") + c.Check(found, Equals, false) + } +} + func (s *embeddedSuite) TestGetModule_NilFs(c *C) { ModuleFS = nil c.Assert(s.r.GetModule("here", "there"), NotNil) diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml b/tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml index 8fd04acac8..04baa266e5 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml @@ -93,7 +93,7 @@ ansible.builtin.debug: var: remote_ip - ## Setup firewall for cloud build + # Setup firewall for cloud build - name: Create firewall rule register: fw_result changed_when: fw_result.rc == 0 @@ -132,8 +132,11 @@ groups: [remote_host] when: remote_ip | ansible.utils.ipaddr - - name: Wait for cluster - ansible.builtin.wait_for_connection: + - name: Wait for host tasks + ansible.builtin.include_tasks: tasks/wait-for-host.yml + vars: + host_ip: "{{ remote_ip }}" + ansible_ssh_private_key_file: "/builder/home/.ssh/id_rsa" ## Cleanup and fail gracefully rescue: @@ -166,6 +169,7 @@ vars: startup_timeout_seconds: 600 # 10 minutes gather_facts: false + ignore_unreachable: true # ensure always block will run even if SSH fails tasks: - name: Remote Test Block vars: diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/htcondor-integration-test.yml b/tools/cloud-build/daily-tests/ansible_playbooks/htcondor-integration-test.yml index 74c7c2ea46..480042a810 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/htcondor-integration-test.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/htcondor-integration-test.yml @@ -102,6 +102,11 @@ - --ttl - 2h - "--key-file=/builder/home/.ssh/id_rsa.pub" + - name: Wait for host tasks + ansible.builtin.include_tasks: tasks/wait-for-host.yml + vars: + host_ip: "{{ access_ip.stdout }}" + ansible_ssh_private_key_file: "/builder/home/.ssh/id_rsa" rescue: - name: Delete Firewall Rule register: fw_deleted @@ -146,10 +151,6 @@ vars: ansible_ssh_private_key_file: "/builder/home/.ssh/id_rsa" block: - - name: Wait until host is reachable - ansible.builtin.wait_for_connection: - delay: 60 - timeout: 300 - name: Gather facts ansible.builtin.setup: - name: Wait until HTCondor daemon is up diff --git 
a/tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml b/tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml index 4afc25457a..8c11fc2848 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml @@ -133,7 +133,7 @@ - --rules=tcp:22 - --source-ranges={{ build_ip.stdout }} - - name: 'Add SSH Keys to OS-Login' + - name: Add SSH Keys to OS-Login register: key_created changed_when: key_created.rc == 0 ansible.builtin.command: @@ -153,6 +153,12 @@ groups: [remote_host] when: login_ip | ansible.utils.ipaddr + - name: Wait for host tasks + ansible.builtin.include_tasks: tasks/wait-for-host.yml + vars: + host_ip: "{{ login_ip }}" + ansible_ssh_private_key_file: "/builder/home/.ssh/id_rsa" + ## Cleanup and fail gracefully rescue: - name: Capture gcluster stderr @@ -186,14 +192,9 @@ tasks: - name: Slurm Test Block vars: - ansible_ssh_private_key_file: "/builder/home/.ssh/id_rsa" ansible_remote_tmp: "/tmp/gcluster/" + ansible_ssh_private_key_file: "/builder/home/.ssh/id_rsa" block: - - name: Wait until host is reachable - ansible.builtin.wait_for_connection: - delay: 60 - timeout: 300 - - name: Gather facts ansible.builtin.setup: diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/wait-for-host.yml b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/wait-for-host.yml new file mode 100644 index 0000000000..27dbf53109 --- /dev/null +++ b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/wait-for-host.yml @@ -0,0 +1,41 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Assert variables are defined + ansible.builtin.assert: + that: + - host_ip is defined + +- name: Wait for firewall to allow port 22 connection + ansible.builtin.wait_for: + host: "{{ host_ip }}" + port: 22 + delay: 60 + timeout: 300 + delegate_to: localhost + ignore_errors: true + register: port_out + +- name: Check connection to remote host + ansible.builtin.wait_for_connection: + delay: 10 + delegate_to: "{{ host_ip }}" + ignore_unreachable: true + register: connect_out + +- name: Fail on bad connections + ansible.builtin.fail: + msg: "Failed to connect to remote host {{ host_ip }}" + when: port_out is failed or connect_out is failed diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-storage-parallelstore.yml b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-storage-parallelstore.yml index 424908f436..adceaa1087 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-storage-parallelstore.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-storage-parallelstore.yml @@ -19,7 +19,7 @@ - name: Execute the job delegate_to: localhost ansible.builtin.shell: | - jobs=({{ workspace }}/{{ deployment_name }}/primary/my-job*) + jobs=({{ workspace }}/{{ deployment_name }}/primary/tensorflow*) for job in "${jobs[@]}"; do kubectl create -f "$job" done @@ -30,10 +30,10 @@ - name: Wait for job to complete delegate_to: localhost ansible.builtin.command: | - kubectl get job --field-selector status.successful=5 + kubectl get job --field-selector status.successful=1 register: job_completion until: job_completion.stdout_lines | length > 1 - retries: 40 + retries: 80 delay: 15 - name: Print job_completion debug output diff --git a/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml b/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml index 44900430a7..d6ddeeadcc 100644 --- a/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml +++ b/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml @@ -27,7 +27,7 @@ vars: # on_host_maintenance: MIGRATE num_nodes: 1 rocky_image: - family: slurm-gcp-6-7-hpc-rocky-linux-8 + family: slurm-gcp-6-8-hpc-rocky-linux-8 project: schedmd-slurm-public deployment_groups: @@ -79,7 +79,7 @@ deployment_groups: # settings: # node_count_dynamic_max: $(vars.num_nodes) # instance_image: - # family: slurm-gcp-6-7-ubuntu-2004-lts + # family: slurm-gcp-6-8-ubuntu-2004-lts # project: schedmd-slurm-public # - id: ubuntu_partition diff --git a/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml b/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml index d7be384115..83ceb58c65 100644 --- a/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml +++ b/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml @@ -70,9 +70,9 @@ deployment_groups: machine_type: g2-standard-4 guest_accelerator: - gpu_driver_installation_config: - - gpu_driver_version: "LATEST" + gpu_driver_version: "LATEST" gpu_sharing_config: - - max_shared_clients_per_gpu: 2 + max_shared_clients_per_gpu: 2 gpu_sharing_strategy: "MPS" - id: job_template_g2_latest_driver @@ -131,9 +131,9 @@ deployment_groups: - type: nvidia-tesla-t4 count: 2 gpu_driver_installation_config: - - gpu_driver_version: "LATEST" + gpu_driver_version: "LATEST" gpu_sharing_config: - - max_shared_clients_per_gpu: 2 + max_shared_clients_per_gpu: 2 gpu_sharing_strategy: "TIME_SHARING" - id: job_template_n1_pool_full_spec diff --git a/tools/cloud-build/daily-tests/builds/ml-slurm-v5-legacy.yaml 
b/tools/cloud-build/daily-tests/builds/ml-slurm-v5-legacy.yaml index 7169240059..3382f342b6 100644 --- a/tools/cloud-build/daily-tests/builds/ml-slurm-v5-legacy.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-slurm-v5-legacy.yaml @@ -25,7 +25,7 @@ tags: - m.startup-script - slurm5 -timeout: 14400s # 4hr +timeout: 18000s # 5hr steps: # test image creation by provisioning a new VPC and using Packer to build an # image in it diff --git a/tools/cloud-build/daily-tests/builds/ml-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-slurm.yaml index 79c8502590..c06c110a54 100644 --- a/tools/cloud-build/daily-tests/builds/ml-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-slurm.yaml @@ -24,7 +24,7 @@ tags: - m.startup-script - slurm6 -timeout: 14400s # 4hr +timeout: 18000s # 5hr steps: # While using static network names we are gaurding against more than 1 instance running at a time (for multi-group tests) - id: check_for_running_build diff --git a/tools/cloud-build/daily-tests/tests/slurm-v6-debian.yml b/tools/cloud-build/daily-tests/tests/slurm-v6-debian.yml index 8d5e724b0b..90338aabd9 100644 --- a/tools/cloud-build/daily-tests/tests/slurm-v6-debian.yml +++ b/tools/cloud-build/daily-tests/tests/slurm-v6-debian.yml @@ -22,7 +22,7 @@ slurm_cluster_name: "debiv6{{ build[0:4] }}" cli_deployment_vars: network_name: "{{ network }}" - slurm_image: "{family: slurm-gcp-6-7-debian-11, project: schedmd-slurm-public}" + slurm_image: "{family: slurm-gcp-6-8-debian-11, project: schedmd-slurm-public}" region: us-west4 zone: us-west4-c diff --git a/tools/cloud-build/provision/pr-go-build-test.tf b/tools/cloud-build/provision/pr-go-build-test.tf index 32da3ff0ed..b2103ecf0f 100644 --- a/tools/cloud-build/provision/pr-go-build-test.tf +++ b/tools/cloud-build/provision/pr-go-build-test.tf @@ -14,7 +14,7 @@ resource "google_cloudbuild_trigger" "pr_go_build_test" { - for_each = toset(["1.21", "1.22"]) + for_each = toset(["1.22", "1.23"]) name = "PR-Go-${replace(each.key, ".", "-")}-build-test" description = "Test that the PR builds with Go ${each.key}" diff --git a/tools/maintenance/maintenance.py b/tools/maintenance/maintenance.py index 7a22623a8f..c65edd6368 100755 --- a/tools/maintenance/maintenance.py +++ b/tools/maintenance/maintenance.py @@ -35,6 +35,13 @@ "upcomingMaintenance.startTimeWindow.earliest," \ "upcomingMaintenance.startTimeWindow.latest," \ "upcomingMaintenance.canReschedule,upcomingMaintenance.type)'" + +UPDATED_UPC_MAINT_CMD = "gcloud alpha compute instances list --project={}" \ + " --filter='upcomingMaintenance:*' --format='value(name," \ + "upcomingMaintenance.latestWindowStartTime," \ + "upcomingMaintenance.windowEndTime," \ + "upcomingMaintenance.canReschedule,upcomingMaintenance.type)'" + PER_MAINT_CMD = "gcloud alpha compute instances list --project={}" \ " --filter=scheduling.maintenanceInterval:PERIODIC " \ " --format='value(name)'" @@ -72,6 +79,9 @@ def get_upcoming_maintenance(project: str) -> List[str]: err_msg = "Error getting upcoming maintenance list" res = run_command(UPC_MAINT_CMD.format(project), err_msg) + # Check if all output was received. If length is 3, two of the filters failed since the maintenance output is using new format. 
+ if len(res.stdout.split()) == 3: + res = run_command(UPDATED_UPC_MAINT_CMD.format(project), err_msg) upc_maint = [x.split() for x in res.stdout.split("\n")[:-1]] return upc_maint diff --git a/tools/python-integration-tests/test.py b/tools/python-integration-tests/test.py new file mode 100644 index 0000000000..00412f7d7b --- /dev/null +++ b/tools/python-integration-tests/test.py @@ -0,0 +1,344 @@ +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import logging +import shutil +import os +import re +import signal +import socket +import subprocess +import sys +import time +import paramiko +from collections import defaultdict +import argparse +import yaml + +def run_command(cmd: str, err_msg: str = None) -> subprocess.CompletedProcess: + res = subprocess.run(cmd, shell=True, universal_newlines=True, check=True, + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if res.returncode != 0: + raise subprocess.SubprocessError(f"{err_msg}:\n{res.stderr}") + + return res + +def parse_blueprint(file_path: str): + with open(file_path, 'r') as file: + content = yaml.safe_load(file) + return content["vars"]["deployment_name"], content["vars"]["zone"] + +def get_account_info(): + # Extract the username from posixAccounts + result = run_command(f"gcloud compute os-login describe-profile --format=json").stdout + posixAccounts = json.loads(result) + + for account in posixAccounts.get('posixAccounts', []): + if 'accountId' in account: + project_id = account['accountId'] + username = account['username'] + return project_id, username + +def create_deployment(blueprint: str): + project_id, username = get_account_info() + deployment_name, zone = parse_blueprint(blueprint) + return Deployment(blueprint, project_id, username, deployment_name, zone) + +def test_simple_job_completion(blueprint: str): + deployment = create_deployment(blueprint) + deployment.deploy() + try: + # Waiting to let the login node finish set up or ssh will fail. + print("Wait 60 seconds") + time.sleep(60) + + ssh = deployment.ssh() + test = Test(ssh, deployment) + test.check_simple_job_completion() + finally: + deployment.close_tunnel() + deployment.destroy() + +def test_topology(blueprint: str): + deployment = create_deployment(blueprint) + deployment.deploy() + try: + # Waiting to let the login node finish set up or ssh will fail. 
+ print("Wait 60 seconds") + time.sleep(60) + ssh = deployment.ssh() + test = Test(ssh, deployment) + test.check_topology() + finally: + deployment.close_tunnel() + deployment.destroy() + +class Deployment: + def __init__(self, blueprint: str, project_id: str, username: str, deployment_name: str, zone: str): + self.blueprint_yaml = blueprint + self.project_id = project_id + self.state_bucket = "daily-tests-tf-state" + self.workspace = "" + self.username = username + self.deployment_name = deployment_name + self.zone = zone + self.test_name = deployment_name + self.tunnel = None + + def get_workspace(self): + return os.path.abspath(os.getcwd().strip()) + + def create_blueprint(self): + self.workspace = self.get_workspace() + + cmd = [ + "./gcluster", + "create", + "-l", + "ERROR", + self.blueprint_yaml, + "--backend-config", + f"bucket={self.state_bucket}", + "--vars", + f"project_id={self.project_id}", + "--vars", + f"deployment_name={self.deployment_name}" + ] + + subprocess.run(cmd, check=True, cwd=self.workspace) + + def compress_blueprint(self): + cmd = [ + "tar", + "-czf", + "%s.tgz" % (self.deployment_name), + "%s" % (self.deployment_name), + ] + + subprocess.run(cmd, check=True, cwd=self.workspace) + + def upload_deployment(self): + cmd = [ + "gsutil", + "cp", + "%s.tgz" % (self.deployment_name), + "gs://%s/%s/" % (self.state_bucket, self.test_name) + ] + + subprocess.run(cmd, check=True, cwd=self.workspace) + + def print_download_command(self): + print("gcloud storage cp gs://%s/%s/%s.tgz ." % (self.state_bucket, self.test_name, self.deployment_name)) + + def create_deployment_directory(self): + self.create_blueprint() + self.compress_blueprint() + self.upload_deployment() + self.print_download_command() + + def deploy(self): + # Create deployment directory + self.create_deployment_directory() + cmd = [ + "./gcluster", + "deploy", + self.deployment_name, + "--auto-approve" + ] + + subprocess.run(cmd, check=True, cwd=self.workspace) + + def ssh(self) -> paramiko.SSHClient: + instance_name = self.deployment_name.replace("-", "")[:10] + "-slurm-login-001" + + # Use existing SSH key pair (assuming it's already in ~/.ssh/google_compute_engine) + key_path = os.path.expanduser("~/.ssh/google_compute_engine") + + # Add the public key to OS Login + public_key_path = key_path + ".pub" + subprocess.run( + [ + "gcloud", "compute", "os-login", "ssh-keys", "describe", + "--key-file", public_key_path + ], + check=True, capture_output=True + ) + + # Construct the gcloud command to create the IAP tunnel + iap_tunnel_cmd = [ + "gcloud", "compute", "start-iap-tunnel", instance_name, + "22", "--project", self.project_id, "--zone", self.zone, + "--local-host-port=localhost:10022" + ] + + # Create the IAP tunnel process + self.tunnel = subprocess.Popen(iap_tunnel_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + # Sleep to give the tunnel a few seconds to set up + time.sleep(3) + + # Create an SSH client + ssh = paramiko.SSHClient() + ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + + # Load the private key + key = paramiko.RSAKey.from_private_key_file(key_path) + + # Connect to the VM + ssh.connect("localhost", port=10022, username=self.username, pkey=key) + + return ssh + + def close_tunnel(self): + if self.tunnel: + self.tunnel.terminate() + self.tunnel.wait() + self.tunnel = None + + def destroy(self): + cmd = [ + "./gcluster", + "destroy", + self.deployment_name, + "--auto-approve" + ] + + subprocess.run(cmd, check=True, cwd=self.workspace) + 
os.remove(f"{self.deployment_name}.tgz") + shutil.rmtree(self.deployment_name) + + +class Test: + def __init__(self, ssh, deployment): + self.ssh = ssh + self.deployment = deployment + self.job_list = {} + + def get_slurm_topology(self): + stdin, stdout, stderr = self.ssh.exec_command("scontrol show topo") + return stdout.read().decode() + + def monitor_squeue(self): + # Monitors squeue and updates self.job_list until all running jobs are complete. + lines = [] + + while True: + stdin, stdout, stderr = self.ssh.exec_command('squeue') + + lines = stdout.read().decode().splitlines()[1:] # Skip header + + if not lines: + break + for line in lines: + parts = line.split() + job_id, partition, _, _, state, times, nodes, nodelist = line.split() + + if job_id not in self.job_list: + print(f"Job id {job_id} is not recognized.") + else: + self.job_list[job_id].update({ + "partition": partition, + "state": state, + "time": times, + "nodes": nodes, + "nodelist": nodelist, + }) + time.sleep(5) + + def is_job_complete(self, job_id: str): + # Checks if a job successfully completed. + stdin, stdout, stderr = self.ssh.exec_command(f'scontrol show job {job_id} --json') + content = json.load(stdout) + return content["jobs"][0]["job_state"][0] == "COMPLETED" + + def submit_job(self, cmd: str): + stdin, stdout, stderr = self.ssh.exec_command(cmd) + jobID = stdout.read().decode().split()[-1] + self.job_list[jobID] = {} + + def get_node_depth(self, switch_name: str): + return switch_name.count("_") + + def get_real_rack(self, node: str): + result = run_command(f"gcloud compute instances describe {node} --zone={self.deployment.zone} --project={self.deployment.project_id} --format='value(resourceStatus.physicalHost)'") + return result.stdout.split("/")[1] + + def get_slurm_rack(self, node: str): + stdin, stdout, stderr = self.ssh.exec_command(f"scontrol show topology {node} | tail -1 | cut -d' ' -f1") + switch_name = stdout.read().decode() + self.assert_equal(self.get_node_depth(switch_name), 2, f"{node} does not have the expected topology depth of 2."), + return switch_name + + def get_nodes(self): + nodes = [] + stdin, stdout, stderr = self.ssh.exec_command("scontrol show node| grep NodeName") + for line in stdout.read().decode().splitlines(): + nodes.append(line.split()[0].split("=")[1]) + return nodes + + def assert_equal(self, value1, value2, message=None): + if value1 != value2: + if message is None: + message = f"Assertion failed: {value1} != {value2}" + raise AssertionError(message) + + def check_simple_job_completion(self): + # Submits 5 jobs and checks if they are successful. + for i in range(5): + self.submit_job('sbatch -N 1 --wrap "sleep 20"') + self.monitor_squeue() + + for job_id in self.job_list.keys(): + result = self.is_job_complete(job_id) + self.assert_equal(True, result, f"Something went wrong with JobID:{job_id}.") + print(f"JobID {job_id} finished successfully.") + + def check_topology(self): + # Checks isomorphism of last layer of nodes to determine topology. 
+ r_rack, s_rack = defaultdict(set), defaultdict(set) + nodes = self.get_nodes() + + for node in nodes: + r_rack[self.get_real_rack(node)].add(node) + s_rack[self.get_slurm_rack(node)].add(node) + + r_rack_set = [set(v) for v in r_rack.values()] + s_rack_set = [set(v) for v in s_rack.values()] + + self.assert_equal(r_rack_set, s_rack_set, "The two sets did not match.") + +def main(simple_test_blueprints, topo_test_blueprints) -> None: + if simple_test_blueprints: + for blueprint in simple_test_blueprints: + test_simple_job_completion(blueprint) + print(f'{blueprint} passed simple slurm test.') + + if topo_test_blueprints: + for blueprint in topo_test_blueprints: + test_topology(blueprint) + print(f'{blueprint} passed topology test.') + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog='test.py', + description="", + formatter_class=argparse.RawTextHelpFormatter + ) + parser.add_argument("--simple", nargs="+", help="File path(s) to blueprint(s) to do the simple slurm test on.") + parser.add_argument("--topo", nargs="+", help="File path(s) to blueprint(s) to do the topology test on.") + + args = parser.parse_args() + + main(args.simple, args.topo) diff --git a/tools/validate_configs/golden_copies/configs/versioned_blueprint.yaml b/tools/validate_configs/golden_copies/configs/versioned_blueprint.yaml index 5240404a3c..711c9f72e0 100644 --- a/tools/validate_configs/golden_copies/configs/versioned_blueprint.yaml +++ b/tools/validate_configs/golden_copies/configs/versioned_blueprint.yaml @@ -27,7 +27,7 @@ vars: slurm_image: # Visit https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family # for a list of valid family options with Slurm - family: slurm-gcp-6-7-hpc-rocky-linux-8 + family: slurm-gcp-6-8-hpc-rocky-linux-8 project: schedmd-slurm-public # If image above is changed to use custom image, then setting below must be set to true instance_image_custom: false diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml index c3f9926b11..1db9c66495 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml @@ -38,14 +38,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 6.8.0' + version: ~> 6.10.0 configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 6.8.0' + version: ~> 6.10.0 configuration: project: ((var.project_id)) region: ((var.region)) @@ -62,6 +62,7 @@ deployment_groups: deployment_name: ((var.deployment_name)) enable_iap_rdp_ingress: true enable_iap_winrm_ingress: true + labels: ((var.labels)) project_id: ((var.project_id)) region: ((var.region)) - source: modules/file-system/filestore diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/image.pkr.hcl b/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/image.pkr.hcl index e4f30dfb58..9282cf7433 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/image.pkr.hcl +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/image.pkr.hcl @@ -21,6 +21,9 @@ locals { image_name_default = "${local.image_family}-${formatdate("YYYYMMDD't'hhmmss'z'", 
timestamp())}" image_name = var.image_name != null ? var.image_name : local.image_name_default + # construct vm image name for use when getting logs + instance_name = "packer-${substr(uuidv4(), 0, 6)}" + # default to explicit var.communicator, otherwise in-order: ssh/winrm/none shell_script_communicator = length(var.shell_scripts) > 0 ? "ssh" : "" ansible_playbook_communicator = length(var.ansible_playbooks) > 0 ? "ssh" : "" @@ -96,6 +99,7 @@ source "googlecompute" "toolkit_image" { image_name = local.image_name image_family = local.image_family image_labels = local.labels + instance_name = local.instance_name machine_type = var.machine_type accelerator_type = local.accelerator_type accelerator_count = var.accelerator_count @@ -189,12 +193,24 @@ build { } } - # if the jq command is present, this will print the image name to stdout - # if jq is not present, this exits silently with code 0 - post-processor "shell-local" { + # If there is an error during image creation, print out command for getting packer VM logs + error-cleanup-provisioner "shell-local" { + environment_vars = [ + "PRJ_ID=${var.project_id}", + "INST_NAME=${local.instance_name}", + "ZONE=${var.zone}", + ] + inline_shebang = "/bin/bash -e" inline = [ - "command -v jq > /dev/null || exit 0", - "echo \"Image built: $(jq -r '.builds[-1].artifact_id' ${var.manifest_file} | cut -d ':' -f2)\"", + "type -P gcloud > /dev/null || exit 0", + "INST_ID=$(gcloud compute instances describe $INST_NAME --project $PRJ_ID --format=\"value(id)\" --zone=$ZONE)", + "echo 'Error building image try checking logs:'", + join(" ", ["echo \"gcloud logging --project $PRJ_ID read", + "'logName=(\\\"projects/$PRJ_ID/logs/GCEMetadataScripts\\\" OR \\\"projects/$PRJ_ID/logs/google_metadata_script_runner\\\") AND resource.labels.instance_id=$INST_ID'", + "--format=\\\"table(timestamp, resource.labels.instance_id, jsonPayload.message)\\\"", + "--order=asc\"" + ] + ) ] } } diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/main.tf b/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/main.tf index 516a553e29..83763abfd4 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/main.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/main.tf @@ -19,6 +19,7 @@ module "network0" { deployment_name = var.deployment_name enable_iap_rdp_ingress = true enable_iap_winrm_ingress = true + labels = var.labels project_id = var.project_id region = var.region } diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf index 3dd3e12681..ed7b1bb3ba 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 6.8.0" + version = "~> 6.10.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 6.8.0" + version = "~> 6.10.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml index d9c215a457..fd6bd3e490 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml +++ 
b/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml @@ -44,14 +44,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 6.8.0' + version: ~> 6.10.0 configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 6.8.0' + version: ~> 6.10.0 configuration: project: ((var.project_id)) region: ((var.region)) @@ -68,6 +68,7 @@ deployment_groups: sensitive: true settings: deployment_name: ((var.deployment_name)) + labels: ((var.labels)) project_id: ((var.project_id)) region: ((var.region)) - group: one @@ -79,14 +80,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 6.8.0' + version: ~> 6.10.0 configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 6.8.0' + version: ~> 6.10.0 configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf index 3dd3e12681..ed7b1bb3ba 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 6.8.0" + version = "~> 6.10.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 6.8.0" + version = "~> 6.10.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/zero/main.tf b/tools/validate_configs/golden_copies/expectations/igc_tf/zero/main.tf index 5904dcf49c..b76daf0303 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/zero/main.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/zero/main.tf @@ -24,6 +24,7 @@ terraform { module "network0" { source = "./modules/embedded/modules/network/vpc" deployment_name = var.deployment_name + labels = var.labels project_id = var.project_id region = var.region } diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf index 3dd3e12681..ed7b1bb3ba 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 6.8.0" + version = "~> 6.10.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 6.8.0" + version = "~> 6.10.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml index 46614b02e6..208cdde2ac 100644 --- a/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml @@ -39,14 +39,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 6.8.0' + version: ~> 6.10.0 configuration: project: ((var.project_id)) region: 
diff --git a/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml
index 46614b02e6..208cdde2ac 100644
--- a/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml
+++ b/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml
@@ -39,14 +39,14 @@ deployment_groups:
   terraform_providers:
     google:
       source: hashicorp/google
-      version: '>= 4.84.0, < 6.8.0'
+      version: ~> 6.10.0
       configuration:
         project: ((var.project_id))
         region: ((var.region))
         zone: ((var.zone))
     google-beta:
       source: hashicorp/google-beta
-      version: '>= 4.84.0, < 6.8.0'
+      version: ~> 6.10.0
       configuration:
         project: ((var.project_id))
         region: ((var.region))
@@ -57,6 +57,7 @@ deployment_groups:
     id: network
     settings:
       deployment_name: ((var.deployment_name))
+      labels: ((var.labels))
       project_id: ((var.project_id))
       region: ((var.region))
 - source: modules/file-system/filestore
diff --git a/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/main.tf b/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/main.tf
index 214ee3b73c..7e0b0290af 100644
--- a/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/main.tf
+++ b/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/main.tf
@@ -17,6 +17,7 @@
 module "network" {
   source          = "./modules/embedded/modules/network/vpc"
   deployment_name = var.deployment_name
+  labels          = var.labels
   project_id      = var.project_id
   region          = var.region
 }
diff --git a/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf
index 3dd3e12681..ed7b1bb3ba 100644
--- a/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf
+++ b/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf
@@ -20,11 +20,11 @@ terraform {
   required_providers {
     google = {
       source  = "hashicorp/google"
-      version = ">= 4.84.0, < 6.8.0"
+      version = "~> 6.10.0"
     }
     google-beta = {
       source  = "hashicorp/google-beta"
-      version = ">= 4.84.0, < 6.8.0"
+      version = "~> 6.10.0"
     }
   }
 }
diff --git a/tools/validate_configs/golden_copies/expectations/text_escape/zero/lime/image.pkr.hcl b/tools/validate_configs/golden_copies/expectations/text_escape/zero/lime/image.pkr.hcl
index e4f30dfb58..9282cf7433 100644
--- a/tools/validate_configs/golden_copies/expectations/text_escape/zero/lime/image.pkr.hcl
+++ b/tools/validate_configs/golden_copies/expectations/text_escape/zero/lime/image.pkr.hcl
@@ -21,6 +21,9 @@ locals {
   image_name_default = "${local.image_family}-${formatdate("YYYYMMDD't'hhmmss'z'", timestamp())}"
   image_name         = var.image_name != null ? var.image_name : local.image_name_default
 
+  # construct vm image name for use when getting logs
+  instance_name = "packer-${substr(uuidv4(), 0, 6)}"
+
   # default to explicit var.communicator, otherwise in-order: ssh/winrm/none
   shell_script_communicator     = length(var.shell_scripts) > 0 ? "ssh" : ""
   ansible_playbook_communicator = length(var.ansible_playbooks) > 0 ? "ssh" : ""
@@ -96,6 +99,7 @@ source "googlecompute" "toolkit_image" {
   image_name        = local.image_name
   image_family      = local.image_family
   image_labels      = local.labels
+  instance_name     = local.instance_name
   machine_type      = var.machine_type
   accelerator_type  = local.accelerator_type
   accelerator_count = var.accelerator_count
@@ -189,12 +193,24 @@ build {
     }
   }
 
-  # if the jq command is present, this will print the image name to stdout
-  # if jq is not present, this exits silently with code 0
-  post-processor "shell-local" {
+  # If there is an error during image creation, print out command for getting packer VM logs
+  error-cleanup-provisioner "shell-local" {
+    environment_vars = [
+      "PRJ_ID=${var.project_id}",
+      "INST_NAME=${local.instance_name}",
+      "ZONE=${var.zone}",
+    ]
+    inline_shebang = "/bin/bash -e"
     inline = [
-      "command -v jq > /dev/null || exit 0",
-      "echo \"Image built: $(jq -r '.builds[-1].artifact_id' ${var.manifest_file} | cut -d ':' -f2)\"",
+      "type -P gcloud > /dev/null || exit 0",
+      "INST_ID=$(gcloud compute instances describe $INST_NAME --project $PRJ_ID --format=\"value(id)\" --zone=$ZONE)",
+      "echo 'Error building image try checking logs:'",
+      join(" ", ["echo \"gcloud logging --project $PRJ_ID read",
+        "'logName=(\\\"projects/$PRJ_ID/logs/GCEMetadataScripts\\\" OR \\\"projects/$PRJ_ID/logs/google_metadata_script_runner\\\") AND resource.labels.instance_id=$INST_ID'",
+        "--format=\\\"table(timestamp, resource.labels.instance_id, jsonPayload.message)\\\"",
+        "--order=asc\""
+      ]
+      )
     ]
   }
 }
diff --git a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml
index 2c5e9ca64a..d8414f6db3 100644
--- a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml
+++ b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml
@@ -39,7 +39,7 @@ vars:
   project_id: invalid-project
   region: us-central1
   slurm_image:
-    family: slurm-gcp-6-7-hpc-rocky-linux-8
+    family: slurm-gcp-6-8-hpc-rocky-linux-8
     project: schedmd-slurm-public
   zone: us-central1-a
 deployment_groups:
@@ -47,14 +47,14 @@ deployment_groups:
   terraform_providers:
     google:
       source: hashicorp/google
-      version: '>= 4.84.0, < 6.8.0'
+      version: ~> 6.10.0
       configuration:
         project: ((var.project_id))
         region: ((var.region))
         zone: ((var.zone))
     google-beta:
       source: hashicorp/google-beta
-      version: '>= 4.84.0, < 6.8.0'
+      version: ~> 6.10.0
       configuration:
         project: ((var.project_id))
         region: ((var.region))
diff --git a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/terraform.tfvars b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/terraform.tfvars
index 39fad882b4..1a3c91cac2 100644
--- a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/terraform.tfvars
+++ b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/terraform.tfvars
@@ -30,7 +30,7 @@ project_id = "invalid-project"
 
 region = "us-central1"
 
 slurm_image = {
-  family  = "slurm-gcp-6-7-hpc-rocky-linux-8"
+  family  = "slurm-gcp-6-8-hpc-rocky-linux-8"
   project = "schedmd-slurm-public"
 }
diff --git a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf
index 3dd3e12681..ed7b1bb3ba 100644
--- a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf
+++ b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf
@@ -20,11 +20,11 @@ terraform {
   required_providers {
     google = {
       source  = "hashicorp/google"
-      version = ">= 4.84.0, < 6.8.0"
+      version = "~> 6.10.0"
     }
     google-beta = {
       source  = "hashicorp/google-beta"
-      version = ">= 4.84.0, < 6.8.0"
+      version = "~> 6.10.0"
     }
   }
 }
diff --git a/tools/validate_configs/test_configs/cloud-batch-cft-instance-template.yaml b/tools/validate_configs/test_configs/cloud-batch-cft-instance-template.yaml
index 2df9ca1276..4938bf1f76 100644
--- a/tools/validate_configs/test_configs/cloud-batch-cft-instance-template.yaml
+++ b/tools/validate_configs/test_configs/cloud-batch-cft-instance-template.yaml
@@ -44,7 +44,7 @@ deployment_groups:
         echo "Hello World" > /sw/hello.txt
 
   - id: batch-compute-template
-    source: github.com/terraform-google-modules/terraform-google-vm//modules/instance_template?ref=v7.8.0
+    source: github.com/terraform-google-modules/terraform-google-vm//modules/instance_template?ref=v12.1.0
     use: [batch-startup-script]
     settings:
       # Boiler plate to work with Cloud Foundation Toolkit
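# Editor's note: two sanity checks that may help when reviewing the pinned-version
# bumps above; both are standard gcloud/git usage, not part of this change.
# Confirm the slurm-gcp-6-8 image family referenced by the updated configs is published:
gcloud compute images describe-from-family slurm-gcp-6-8-hpc-rocky-linux-8 \
  --project schedmd-slurm-public --format="value(name)"
# Confirm the terraform-google-vm tag used by the new module ref exists:
git ls-remote --tags https://github.com/terraform-google-modules/terraform-google-vm v12.1.0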
diff --git a/tools/validate_configs/test_configs/new_project.yaml b/tools/validate_configs/test_configs/new_project.yaml
deleted file mode 100644
index 6a352b169d..0000000000
--- a/tools/validate_configs/test_configs/new_project.yaml
+++ /dev/null
@@ -1,31 +0,0 @@
-# Copyright 2022 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
----
-
-blueprint_name: new_project
-
-vars:
-  project_id: test_project
-  deployment_name: new_project_deployment
-
-deployment_groups:
-- group: primary
-  modules:
-  - id: project
-    source: community/modules/project/new-project
-    settings:
-      folder_id: 334688113020 # random number
-      billing_account: 111110-M2N704-854685 # random billing number
-      org_id: 123456789 # random org id
diff --git a/tools/validate_configs/test_configs/node-groups.yaml b/tools/validate_configs/test_configs/node-groups.yaml
index cfb166cbb5..962d1e3130 100644
--- a/tools/validate_configs/test_configs/node-groups.yaml
+++ b/tools/validate_configs/test_configs/node-groups.yaml
@@ -64,7 +64,7 @@ deployment_groups:
       name: c30
       machine_type: c2-standard-30
       instance_image:
-        family: slurm-gcp-6-7-debian-11
+        family: slurm-gcp-6-8-debian-11
         project: schedmd-slurm-public
       instance_image_custom: true
 
@@ -75,7 +75,7 @@ deployment_groups:
       name: c60
       machine_type: c2-standard-60
       instance_image:
-        family: slurm-gcp-6-7-hpc-rocky-linux-8
+        family: slurm-gcp-6-8-hpc-rocky-linux-8
         project: schedmd-slurm-public
 
   - id: nodeset_3
@@ -85,7 +85,7 @@ deployment_groups:
       name: cd112
       machine_type: c2d-standard-112
       instance_image:
-        family: slurm-gcp-6-7-hpc-rocky-linux-8
+        family: slurm-gcp-6-8-hpc-rocky-linux-8
         project: schedmd-slurm-public
       instance_image_custom: true
       enable_smt: true
diff --git a/tools/validate_configs/test_configs/test_outputs.yaml b/tools/validate_configs/test_configs/test_outputs.yaml
index 6bb0bb48d2..b36aad81cb 100644
--- a/tools/validate_configs/test_configs/test_outputs.yaml
+++ b/tools/validate_configs/test_configs/test_outputs.yaml
@@ -83,31 +83,6 @@ deployment_groups:
     - subnetwork_address
     - nat_ips
 
-  - id: new-project
-    source: community/modules/project/new-project
-    outputs:
-    - project_name
-    - project_id
-    - project_number
-    - domain
-    - group_email
-    - service_account_id
-    - service_account_display_name
-    - service_account_email
-    - service_account_name
-    - service_account_unique_id
-    - project_bucket_self_link
-    - project_bucket_url
-    - api_s_account
-    - api_s_account_fmt
-    - enabled_apis
-    - enabled_api_identities
-    - budget_name
-    settings:
-      folder_id: 334688113020 # random number
-      billing_account: "111110-M2N704-854685" # random billing number
-      org_id: 123456789 # random org id
-
   - id: sa
     source: community/modules/project/service-account
     outputs: