From 2fa185ccf920f53017374284599d329778ca0f26 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 15 Oct 2024 13:30:26 -0500 Subject: [PATCH] Update Slurm-GCP to 6.8.2 Brings in new default NVIDIA driver 550.90.12 which solves several known issues, including NCCL Timeout errors. https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-550-90-12/index.html --- community/examples/hpc-build-slurm-image.yaml | 2 +- .../README.md | 2 +- .../main.tf | 2 +- .../schedmd-slurm-gcp-v6-controller/README.md | 18 +++++++++--------- .../controller.tf | 4 ++-- .../schedmd-slurm-gcp-v6-controller/login.tf | 4 ++-- .../partition.tf | 4 ++-- .../schedmd-slurm-gcp-v6-login/README.md | 8 ++++---- .../a3-highgpu-8g/ml-slurm-a3-1-image.yaml | 2 +- .../a3-megagpu-8g/slurm-a3mega-image.yaml | 2 +- modules/README.md | 2 +- 11 files changed, 25 insertions(+), 25 deletions(-) diff --git a/community/examples/hpc-build-slurm-image.yaml b/community/examples/hpc-build-slurm-image.yaml index 45e6bd1612..a1fa81767e 100644 --- a/community/examples/hpc-build-slurm-image.yaml +++ b/community/examples/hpc-build-slurm-image.yaml @@ -23,7 +23,7 @@ vars: image_build_machine_type: n2d-standard-16 build_from_image_family: hpc-rocky-linux-8 build_from_image_project: cloud-hpc-image-public - build_from_git_ref: 6.7.0 + build_from_git_ref: 6.8.2 built_image_family: my-custom-slurm built_instance_image: family: $(vars.built_image_family) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md index 4d790fe703..d251dff2af 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md @@ -74,7 +74,7 @@ modules. For support with the underlying modules, see the instructions in the | Name | Source | Version | |------|--------|---------| -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.1 | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.2 | ## Resources diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf index 3f0ee54af8..7ca868a049 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf @@ -56,7 +56,7 @@ locals { } module "slurm_nodeset_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.2" project_id = var.project_id region = var.region diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 37b5da93da..9f4933a1fa 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -11,9 +11,9 @@ The [user guide][slurm-ug] provides detailed instructions on customizing and enhancing the Slurm on GCP cluster as well as recommendations on configuring the controller for optimal performance at different scales. -[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.7.0 -[slurm\_controller\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.7.0/terraform/slurm_cluster/modules/slurm_controller_instance -[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.7.0/terraform/slurm_cluster/modules/slurm_instance_template +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2 +[slurm\_controller\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2/terraform/slurm_cluster/modules/slurm_controller_instance +[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2/terraform/slurm_cluster/modules/slurm_instance_template [slurm-ug]: https://goo.gle/slurm-gcp-user-guide. [enable\_cleanup\_compute]: #input\_enable\_cleanup\_compute [enable\_cleanup\_subscriptions]: #input\_enable\_cleanup\_subscriptions @@ -238,13 +238,13 @@ limitations under the License. | [daos\_network\_storage\_scripts](#module\_daos\_network\_storage\_scripts) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.39.0&depth=1 | | [nodeset\_cleanup](#module\_nodeset\_cleanup) | ./modules/cleanup_compute | n/a | | [nodeset\_cleanup\_tpu](#module\_nodeset\_cleanup\_tpu) | ./modules/cleanup_tpu | n/a | -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.8.1 | -| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.1 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.8.2 | +| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.2 | | [slurm\_files](#module\_slurm\_files) | ./modules/slurm_files | n/a | -| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.8.1 | -| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.1 | -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.1 | -| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.8.1 | +| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.8.2 | +| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.2 | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.2 | +| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.8.2 | ## Resources diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf index 9b105d7f39..1ce6ed158f 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf @@ -43,7 +43,7 @@ locals { # INSTANCE TEMPLATE module "slurm_controller_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.2" project_id = var.project_id region = var.region @@ -99,7 +99,7 @@ locals { } module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.8.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.8.2" access_config = var.enable_controller_public_ips ? [local.access_config] : [] add_hostname_suffix = false diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf index de97810316..998a8e0867 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf @@ -14,7 +14,7 @@ # TEMPLATE module "slurm_login_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.2" for_each = { for x in var.login_nodes : x.name_prefix => x } @@ -56,7 +56,7 @@ module "slurm_login_template" { # INSTANCE module "slurm_login_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.8.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.8.2" for_each = { for x in var.login_nodes : x.name_prefix => x } access_config = each.value.access_config diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf index 7254551072..0d05c71f91 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf @@ -26,7 +26,7 @@ locals { # NODESET # TODO: remove dependency on slurm-gcp repo, move to local template module module "slurm_nodeset_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.2" for_each = local.nodeset_map project_id = var.project_id @@ -102,7 +102,7 @@ locals { # NODESET TPU module "slurm_nodeset_tpu" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.8.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.8.2" for_each = local.nodeset_tpu_map project_id = var.project_id diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md index 0afd0bfee7..4ad20a6352 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md @@ -5,9 +5,9 @@ This module creates a login node for a Slurm cluster based on the terraform modules. The login node is used in conjunction with the [Slurm controller](../schedmd-slurm-gcp-v5-controller/README.md). -[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.7.0 -[slurm\_login\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.7.0/terraform/slurm_cluster/modules/slurm_login_instance -[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.7.0/terraform/slurm_cluster/modules/slurm_instance_template +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2 +[slurm\_login\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2/terraform/slurm_cluster/modules/slurm_login_instance +[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2/terraform/slurm_cluster/modules/slurm_instance_template ### Example @@ -53,7 +53,7 @@ modules. For support with the underlying modules, see the instructions in the [slurm-gcp README][slurm-gcp-readme]. [slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/7 -[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.7.0#slurm-on-google-cloud-platform +[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2#slurm-on-google-cloud-platform ## Requirements diff --git a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml index b817972331..705e1299eb 100644 --- a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml +++ b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml @@ -94,7 +94,7 @@ deployment_groups: set -e -o pipefail ansible-galaxy role install googlecloudplatform.google_cloud_ops_agents ansible-pull \ - -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.8.1 \ + -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.8.2 \ -i localhost, --limit localhost --connection=local \ -e @/var/tmp/slurm_vars.json \ ansible/playbook.yml diff --git a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml index 67f33cde7d..dfc4d4ab4c 100644 --- a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml +++ b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml @@ -108,7 +108,7 @@ deployment_groups: apt-get install -y git ansible-galaxy role install googlecloudplatform.google_cloud_ops_agents ansible-pull \ - -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.8.1 \ + -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.8.2 \ -i localhost, --limit localhost --connection=local \ -e @/var/tmp/slurm_vars.json \ ansible/playbook.yml diff --git a/modules/README.md b/modules/README.md index b0575ec3db..722449e6e6 100644 --- a/modules/README.md +++ b/modules/README.md @@ -230,7 +230,7 @@ Pub/Sub subscription. Primarily used for [FSI - MonteCarlo Tutorial][fsi-monteca [schedmd-slurm-gcp-v5-login]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md [schedmd-slurm-gcp-v5-hybrid]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md [slurm-gcp-version-5]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0 -[slurm-gcp-version-6]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.7.0 +[slurm-gcp-version-6]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2 [pbspro-client]: ../community/modules/scheduler/pbspro-client/README.md [pbspro-server]: ../community/modules/scheduler/pbspro-server/README.md