From 8ee645c78eea870a93364d3f4e6d936f5063491c Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Mon, 5 Feb 2024 07:37:53 -0600 Subject: [PATCH] Update Slurm-GCP release to 5.10.2 --- community/examples/AMD/hpc-amd-slurm.yaml | 2 +- .../examples/hpc-slurm-chromedesktop.yaml | 5 ++-- community/examples/hpc-slurm-local-ssd.yaml | 1 + .../examples/hpc-slurm-ramble-gromacs.yaml | 1 + community/examples/hpc-slurm-ubuntu2004.yaml | 5 ++-- community/examples/htc-slurm.yaml | 3 +- .../examples/tutorial-starccm-slurm.yaml | 1 + .../schedmd-slurm-gcp-v5-node-group/README.md | 7 +++-- .../schedmd-slurm-gcp-v5-node-group/main.tf | 1 + .../source_image_logic.tf | 12 ++++---- .../variables.tf | 16 ++++++++-- .../README.md | 6 ++-- .../main.tf | 2 +- .../variables.tf | 2 +- .../schedmd-slurm-gcp-v5-partition/README.md | 8 ++--- .../schedmd-slurm-gcp-v5-partition/main.tf | 2 +- .../variables.tf | 19 ++++++------ .../schedmd-slurm-gcp-v6-nodeset/README.md | 4 +-- .../schedmd-slurm-gcp-v6-partition/README.md | 4 +-- .../schedmd-slurm-gcp-v5-controller/README.md | 30 +++++++++---------- .../schedmd-slurm-gcp-v5-controller/main.tf | 4 +-- .../source_image_logic.tf | 12 ++++---- .../variables.tf | 9 +++--- .../schedmd-slurm-gcp-v5-hybrid/README.md | 14 ++++----- .../schedmd-slurm-gcp-v5-hybrid/main.tf | 2 +- .../schedmd-slurm-gcp-v5-login/README.md | 16 +++++----- .../schedmd-slurm-gcp-v5-login/main.tf | 4 +-- .../source_image_logic.tf | 12 ++++---- .../schedmd-slurm-gcp-v5-login/variables.tf | 4 +-- .../schedmd-slurm-gcp-v6-controller/README.md | 12 ++++---- .../schedmd-slurm-gcp-v6-login/README.md | 10 +++---- docs/gpu-support.md | 2 +- ...demo-with-cloud-controller-instructions.md | 2 +- .../deploy-instructions.md | 4 +-- .../on-prem-instructions.md | 20 ++++++------- docs/image-building.md | 6 ++-- docs/vm-images.md | 4 +-- examples/README.md | 8 ++--- examples/cae/cae-slurm.yaml | 7 +++-- examples/hpc-enterprise-slurm.yaml | 9 ++---- examples/image-builder.yaml | 5 ++-- 
examples/ml-slurm.yaml | 5 ++-- modules/README.md | 4 +-- .../daily-tests/blueprints/lustre-slurm.yaml | 4 +-- .../daily-tests/tests/slurm-v5-debian.yml | 2 +- .../daily-tests/tests/slurm-v5-rocky8.yml | 2 +- .../slurm-filestore.yaml | 10 +++---- .../os_compatibility_tests/slurm-lustre.yaml | 8 ++--- .../os_compatibility_tests/slurm-startup.yaml | 10 +++---- .../test_configs/node-groups.yaml | 5 ++-- .../test_configs/slurm-static-test.yaml | 10 +++---- 51 files changed, 190 insertions(+), 167 deletions(-) diff --git a/community/examples/AMD/hpc-amd-slurm.yaml b/community/examples/AMD/hpc-amd-slurm.yaml index 4f68b4de41..3a52e74d42 100644 --- a/community/examples/AMD/hpc-amd-slurm.yaml +++ b/community/examples/AMD/hpc-amd-slurm.yaml @@ -171,7 +171,7 @@ deployment_groups: # these images must match the images used by Slurm modules below because # we are building OpenMPI with PMI support in libraries contained in # Slurm installation - family: slurm-gcp-5-9-hpc-centos-7 + family: slurm-gcp-5-10-hpc-centos-7 project: schedmd-slurm-public - id: low_cost_node_group diff --git a/community/examples/hpc-slurm-chromedesktop.yaml b/community/examples/hpc-slurm-chromedesktop.yaml index 8e6b816cb2..0e1a9c6e36 100644 --- a/community/examples/hpc-slurm-chromedesktop.yaml +++ b/community/examples/hpc-slurm-chromedesktop.yaml @@ -17,15 +17,16 @@ blueprint_name: slurm-crd vars: + enable_devel: true project_id: ## Set GCP Project ID Here ## deployment_name: slurm-crd-01 region: us-central1 zone: us-central1-c instance_image_crd: - family: slurm-gcp-5-9-debian-11 + family: slurm-gcp-5-10-debian-11 project: schedmd-slurm-public instance_image: - family: slurm-gcp-5-9-hpc-centos-7 + family: slurm-gcp-5-10-hpc-centos-7 project: schedmd-slurm-public # Documentation for each of the modules used below can be found at diff --git a/community/examples/hpc-slurm-local-ssd.yaml b/community/examples/hpc-slurm-local-ssd.yaml index c8b18d1f8f..e3ebcacc56 100644 --- 
a/community/examples/hpc-slurm-local-ssd.yaml +++ b/community/examples/hpc-slurm-local-ssd.yaml @@ -17,6 +17,7 @@ blueprint_name: hpc-slurm-local-ssd vars: + enable_devel: true project_id: ## Set GCP Project ID Here ## deployment_name: hpc-localssd region: us-central1 diff --git a/community/examples/hpc-slurm-ramble-gromacs.yaml b/community/examples/hpc-slurm-ramble-gromacs.yaml index 15e6577c95..7efb91079d 100644 --- a/community/examples/hpc-slurm-ramble-gromacs.yaml +++ b/community/examples/hpc-slurm-ramble-gromacs.yaml @@ -17,6 +17,7 @@ blueprint_name: hpc-slurm-ramble-gromacs vars: + enable_devel: true project_id: ## Set GCP Project ID Here ## deployment_name: hpc-slurm-ramble-gromacs region: us-central1 diff --git a/community/examples/hpc-slurm-ubuntu2004.yaml b/community/examples/hpc-slurm-ubuntu2004.yaml index 261376e816..ae2deeb205 100644 --- a/community/examples/hpc-slurm-ubuntu2004.yaml +++ b/community/examples/hpc-slurm-ubuntu2004.yaml @@ -17,14 +17,15 @@ blueprint_name: hpc-slurm-ubuntu2004 vars: + enable_devel: true project_id: ## Set GCP Project ID Here ## deployment_name: slurm-gcp-v5 region: us-west4 zone: us-west4-c instance_image: # Please refer to the following link for the latest images: - # https://github.com/SchedMD/slurm-gcp/blob/master/docs/images.md#supported-operating-systems - family: slurm-gcp-5-9-ubuntu-2004-lts + # https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#supported-operating-systems + family: slurm-gcp-5-10-ubuntu-2004-lts project: schedmd-slurm-public instance_image_custom: true diff --git a/community/examples/htc-slurm.yaml b/community/examples/htc-slurm.yaml index 554448b115..53a1afc833 100644 --- a/community/examples/htc-slurm.yaml +++ b/community/examples/htc-slurm.yaml @@ -17,12 +17,13 @@ # This blueprint provisions a cluster using the Slurm scheduler configured to # efficiently run many short duration, loosely-coupled (non-MPI) jobs. 
See also: -# https://github.com/SchedMD/slurm-gcp/blob/master/docs/htc.md +# https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/htc.md # https://slurm.schedmd.com/high_throughput.html blueprint_name: htc-slurm vars: + enable_devel: true project_id: ## Set GCP Project ID Here ## deployment_name: htc-slurm region: us-west4 diff --git a/community/examples/tutorial-starccm-slurm.yaml b/community/examples/tutorial-starccm-slurm.yaml index db18855352..e450f59d21 100644 --- a/community/examples/tutorial-starccm-slurm.yaml +++ b/community/examples/tutorial-starccm-slurm.yaml @@ -17,6 +17,7 @@ blueprint_name: starccm-on-slurm vars: + enable_devel: true project_id: ## Set GCP Project ID Here ## deployment_name: starccm-slurm region: us-central1 diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md index 7d9387f3aa..d37f6d947a 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md @@ -72,8 +72,8 @@ The HPC Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform modules. For support with the underlying modules, see the instructions in the [slurm-gcp README][slurm-gcp-readme]. -[slurm-on-gcp]: https://github.com/SchedMD/slurm-gcp -[slurm-gcp-readme]: https://github.com/SchedMD/slurm-gcp#slurm-on-google-cloud-platform +[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp +[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp#slurm-on-google-cloud-platform ## License @@ -136,11 +136,12 @@ No modules. | [enable\_spot\_vm](#input\_enable\_spot\_vm) | Enable the partition to use spot VMs (https://cloud.google.com/spot-vms). | `bool` | `false` | no | | [gpu](#input\_gpu) | DEPRECATED: use var.guest\_accelerator |
object({
type = string
count = number
})
| `null` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-5-9-hpc-centos-7",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-5-10-hpc-centos-7",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [instance\_template](#input\_instance\_template) | Self link to a custom instance template. If set, other VM definition
variables such as machine\_type and instance\_image will be ignored in favor
of the provided instance template.

For more information on creating custom images for the instance template
that comply with Slurm on GCP see the "Slurm on GCP Custom Images" section
in docs/vm-images.md. | `string` | `null` | no | | [labels](#input\_labels) | Labels to add to partition compute instances. Key-value pairs. | `map(string)` | `{}` | no | | [machine\_type](#input\_machine\_type) | Compute Platform machine type to use for this partition compute nodes. | `string` | `"c2-standard-60"` | no | +| [maintenance\_interval](#input\_maintenance\_interval) | Specifies the frequency of planned maintenance events. Must be unset (null) or "PERIODIC". | `string` | `null` | no | | [metadata](#input\_metadata) | Metadata, provided as a map. | `map(string)` | `{}` | no | | [min\_cpu\_platform](#input\_min\_cpu\_platform) | The name of the minimum CPU platform that you want the instance to use. | `string` | `null` | no | | [name](#input\_name) | Name of the node group. | `string` | `"ghpc"` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/main.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/main.tf index a382a4232a..825f3c0a4a 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/main.tf @@ -58,6 +58,7 @@ locals { gpu = one(local.guest_accelerator) labels = local.labels machine_type = var.machine_type + maintenance_interval = var.maintenance_interval metadata = var.metadata min_cpu_platform = var.min_cpu_platform on_host_maintenance = var.on_host_maintenance diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/source_image_logic.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/source_image_logic.tf index 3acb583f3b..ddcb1ff6ee 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/source_image_logic.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/source_image_logic.tf @@ -18,12 +18,12 @@ locals { # Currently supported images and projects known_project_families = { schedmd-slurm-public = [ - "slurm-gcp-5-9-debian-11", - "slurm-gcp-5-9-hpc-rocky-linux-8", 
- "slurm-gcp-5-9-ubuntu-2004-lts", - "slurm-gcp-5-9-ubuntu-2204-lts-arm64", - "slurm-gcp-5-9-hpc-centos-7-k80", - "slurm-gcp-5-9-hpc-centos-7" + "slurm-gcp-5-10-debian-11", + "slurm-gcp-5-10-hpc-rocky-linux-8", + "slurm-gcp-5-10-ubuntu-2004-lts", + "slurm-gcp-5-10-ubuntu-2204-lts-arm64", + "slurm-gcp-5-10-hpc-centos-7-k80", + "slurm-gcp-5-10-hpc-centos-7" ] } diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf index 2a38a2e64b..bbf3848b43 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf @@ -15,7 +15,7 @@ */ # Most variables have been sourced and modified from the SchedMD/slurm-gcp -# github repository: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 +# github repository: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2 variable "project_id" { description = "Project in which the HPC deployment will be created." @@ -96,7 +96,7 @@ variable "instance_image" { type = map(string) default = { project = "schedmd-slurm-public" - family = "slurm-gcp-5-9-hpc-centos-7" + family = "slurm-gcp-5-10-hpc-centos-7" } validation { @@ -413,6 +413,18 @@ variable "additional_networks" { })) } +variable "maintenance_interval" { + description = "Specifies the frequency of planned maintenance events. Must be unset (null) or \"PERIODIC\"." + default = null + type = string + nullable = true + + validation { + condition = var.maintenance_interval == null || var.maintenance_interval == "PERIODIC" + error_message = "var.maintenance_interval must be unset (null) or set to \"PERIODIC\"" + } +} + variable "disable_public_ips" { description = "If set to false. The node group VMs will have a random public IP assigned to it. Ignored if access_config is set." 
type = bool diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md index de0dbdb267..51e49f42d6 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md @@ -35,8 +35,8 @@ The HPC Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform modules. For support with the underlying modules, see the instructions in the [slurm-gcp README][slurm-gcp-readme]. -[slurm-on-gcp]: https://github.com/SchedMD/slurm-gcp -[slurm-gcp-readme]: https://github.com/SchedMD/slurm-gcp#slurm-on-google-cloud-platform +[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp +[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp#slurm-on-google-cloud-platform ## License @@ -69,7 +69,7 @@ No providers. | Name | Source | Version | |------|--------|---------| -| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 5.9.1 | +| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 5.10.2 | ## Resources diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf index 5bf9b93c91..6483eb2e0c 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf @@ -29,7 +29,7 @@ locals { } module "slurm_partition" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=5.9.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=5.10.2" slurm_cluster_name = 
local.slurm_cluster_name enable_job_exclusive = var.exclusive diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/variables.tf index 55d82b07d1..137023ee26 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/variables.tf @@ -15,7 +15,7 @@ */ # Most variables have been sourced and modified from the SchedMD/slurm-gcp -# github repository: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 +# github repository: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2 variable "deployment_name" { description = "Name of the deployment." diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md index ca57874c31..ba8af335f8 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md @@ -110,8 +110,8 @@ The HPC Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform modules. For support with the underlying modules, see the instructions in the [slurm-gcp README][slurm-gcp-readme]. -[slurm-on-gcp]: https://github.com/SchedMD/slurm-gcp -[slurm-gcp-readme]: https://github.com/SchedMD/slurm-gcp#slurm-on-google-cloud-platform +[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp +[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp#slurm-on-google-cloud-platform ## License @@ -146,7 +146,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 5.9.1 | +| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 5.10.2 | ## Resources @@ -164,7 +164,7 @@ limitations under the License. | [exclusive](#input\_exclusive) | Exclusive job access to nodes. | `bool` | `true` | no | | [is\_default](#input\_is\_default) | Sets this partition as the default partition by updating the partition\_conf.
If "Default" is already set in partition\_conf, this variable will have no effect. | `bool` | `false` | no | | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on the partition compute nodes. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
| `[]` | no | -| [node\_groups](#input\_node\_groups) | A list of node groups associated with this partition. See
schedmd-slurm-gcp-v5-node-group for more information on defining a node
group in a blueprint. |
list(object({
node_count_static = number
node_count_dynamic_max = number
group_name = string
node_conf = map(string)
access_config = list(object({
nat_ip = string
network_tier = string
}))
additional_disks = list(object({
disk_name = string
device_name = string
disk_size_gb = number
disk_type = string
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
additional_networks = list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
bandwidth_tier = string
can_ip_forward = bool
disable_smt = bool
disk_auto_delete = bool
disk_labels = map(string)
disk_size_gb = number
disk_type = string
enable_confidential_vm = bool
enable_oslogin = bool
enable_shielded_vm = bool
enable_spot_vm = bool
gpu = object({
count = number
type = string
})
instance_template = string
labels = map(string)
machine_type = string
metadata = map(string)
min_cpu_platform = string
on_host_maintenance = string
preemptible = bool
reservation_name = string
service_account = object({
email = string
scopes = list(string)
})
shielded_instance_config = object({
enable_integrity_monitoring = bool
enable_secure_boot = bool
enable_vtpm = bool
})
spot_instance_config = object({
termination_action = string
})
source_image_family = string
source_image_project = string
source_image = string
tags = list(string)
}))
| `[]` | no | +| [node\_groups](#input\_node\_groups) | A list of node groups associated with this partition. See
schedmd-slurm-gcp-v5-node-group for more information on defining a node
group in a blueprint. |
list(object({
node_count_static = number
node_count_dynamic_max = number
group_name = string
node_conf = map(string)
access_config = list(object({
nat_ip = string
network_tier = string
}))
additional_disks = list(object({
disk_name = string
device_name = string
disk_size_gb = number
disk_type = string
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
additional_networks = list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
bandwidth_tier = string
can_ip_forward = bool
disable_smt = bool
disk_auto_delete = bool
disk_labels = map(string)
disk_size_gb = number
disk_type = string
enable_confidential_vm = bool
enable_oslogin = bool
enable_shielded_vm = bool
enable_spot_vm = bool
gpu = object({
count = number
type = string
})
instance_template = string
labels = map(string)
machine_type = string
maintenance_interval = string
metadata = map(string)
min_cpu_platform = string
on_host_maintenance = string
preemptible = bool
reservation_name = string
service_account = object({
email = string
scopes = list(string)
})
shielded_instance_config = object({
enable_integrity_monitoring = bool
enable_secure_boot = bool
enable_vtpm = bool
})
spot_instance_config = object({
termination_action = string
})
source_image_family = string
source_image_project = string
source_image = string
tags = list(string)
}))
| `[]` | no | | [partition\_conf](#input\_partition\_conf) | Slurm partition configuration as a map.
See https://slurm.schedmd.com/slurm.conf.html#SECTION_PARTITION-CONFIGURATION | `map(string)` | `{}` | no | | [partition\_name](#input\_partition\_name) | The name of the slurm partition. | `string` | n/a | yes | | [partition\_startup\_scripts\_timeout](#input\_partition\_startup\_scripts\_timeout) | The timeout (seconds) applied to the partition startup script. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf index 643e4f3ac1..80f6b7a6eb 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf @@ -38,7 +38,7 @@ data "google_compute_zones" "available" { } module "slurm_partition" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=5.9.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=5.10.2" slurm_cluster_name = local.slurm_cluster_name partition_nodes = var.node_groups diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf index 698dbd5c60..7c06a1edb5 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf @@ -15,7 +15,7 @@ */ # Most variables have been sourced and modified from the SchedMD/slurm-gcp -# github repository: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 +# github repository: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2 variable "deployment_name" { description = "Name of the deployment." 
@@ -240,14 +240,15 @@ variable "node_groups" { count = number type = string }) - instance_template = string - labels = map(string) - machine_type = string - metadata = map(string) - min_cpu_platform = string - on_host_maintenance = string - preemptible = bool - reservation_name = string + instance_template = string + labels = map(string) + machine_type = string + maintenance_interval = string + metadata = map(string) + min_cpu_platform = string + on_host_maintenance = string + preemptible = bool + reservation_name = string service_account = object({ email = string scopes = list(string) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md index d7f0ee21b0..ee0ab788ae 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md @@ -122,8 +122,8 @@ The HPC Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform modules. For support with the underlying modules, see the instructions in the [slurm-gcp README][slurm-gcp-readme]. -[slurm-on-gcp]: https://github.com/SchedMD/slurm-gcp -[slurm-gcp-readme]: https://github.com/SchedMD/slurm-gcp#slurm-on-google-cloud-platform +[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp +[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp#slurm-on-google-cloud-platform ## Requirements diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md index 523bf0d997..00731800cf 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md @@ -56,8 +56,8 @@ The HPC Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform modules. 
For support with the underlying modules, see the instructions in the [slurm-gcp README][slurm-gcp-readme]. -[slurm-on-gcp]: https://github.com/SchedMD/slurm-gcp -[slurm-gcp-readme]: https://github.com/SchedMD/slurm-gcp#slurm-on-google-cloud-platform +[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp +[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp#slurm-on-google-cloud-platform ## Requirements diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md index 213c12975e..34e96043f6 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md @@ -17,14 +17,14 @@ controller for optimal performance at different scales. > > ```shell > # Install Python3 and run -> pip3 install -r https://raw.githubusercontent.com/SchedMD/slurm-gcp/5.9.1/scripts/requirements.txt +> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.10.2/scripts/requirements.txt > ``` -[SchedMD/slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 -[slurm\_controller\_instance]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1/terraform/slurm_cluster/modules/slurm_controller_instance -[slurm\_instance\_template]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1/terraform/slurm_cluster/modules/slurm_instance_template +[SchedMD/slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2 +[slurm\_controller\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2/terraform/slurm_cluster/modules/slurm_controller_instance +[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2/terraform/slurm_cluster/modules/slurm_instance_template [slurm-ug]: https://goo.gle/slurm-gcp-user-guide. 
-[requirements.txt]: https://github.com/SchedMD/slurm-gcp/blob/5.9.1/scripts/requirements.txt +[requirements.txt]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.10.2/scripts/requirements.txt [enable\_cleanup\_compute]: #input\_enable\_cleanup\_compute [enable\_cleanup\_subscriptions]: #input\_enable\_cleanup\_subscriptions [enable\_reconfigure]: #input\_enable\_reconfigure @@ -94,12 +94,12 @@ This option has some additional requirements: development environment deploying the cluster. One can use following commands: ```bash - pip3 install -r https://raw.githubusercontent.com/SchedMD/slurm-gcp/5.9.1/scripts/requirements.txt + pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.10.2/scripts/requirements.txt ``` For more information, see the [description][optdeps] of this module. -[optdeps]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1/terraform/slurm_cluster#optional +[optdeps]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2/terraform/slurm_cluster#optional ## Custom Images @@ -178,8 +178,8 @@ The HPC Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform modules. For support with the underlying modules, see the instructions in the [slurm-gcp README][slurm-gcp-readme]. -[slurm-on-gcp]: https://github.com/SchedMD/slurm-gcp -[slurm-gcp-readme]: https://github.com/SchedMD/slurm-gcp#slurm-on-google-cloud-platform +[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp +[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp#slurm-on-google-cloud-platform ## License @@ -215,8 +215,8 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_instance | 5.9.1 | -| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 5.9.1 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_instance | 5.10.2 | +| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 5.10.2 | ## Resources @@ -248,8 +248,8 @@ limitations under the License. | [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB. | `number` | `50` | no | | [disk\_type](#input\_disk\_type) | Boot disk type, can be either pd-ssd, pd-standard, pd-balanced, or pd-extreme. | `string` | `"pd-ssd"` | no | | [enable\_bigquery\_load](#input\_enable\_bigquery\_load) | Enable loading of cluster job usage into big query. | `bool` | `false` | no | -| [enable\_cleanup\_compute](#input\_enable\_cleanup\_compute) | Enables automatic cleanup of compute nodes and resource policies (e.g.
placement groups) managed by this module, when cluster is destroyed.

NOTE: Requires Python and pip packages listed at the following link:
https://github.com/SchedMD/slurm-gcp/blob/3979e81fc5e4f021b5533a23baa474490f4f3614/scripts/requirements.txt

*WARNING*: Toggling this may impact the running workload. Deployed compute nodes
may be destroyed and their jobs will be requeued. | `bool` | `false` | no | -| [enable\_cleanup\_subscriptions](#input\_enable\_cleanup\_subscriptions) | Enables automatic cleanup of pub/sub subscriptions managed by this module, when
cluster is destroyed.

NOTE: Requires Python and pip packages listed at the following link:
https://github.com/SchedMD/slurm-gcp/blob/3979e81fc5e4f021b5533a23baa474490f4f3614/scripts/requirements.txt

*WARNING*: Toggling this may temporarily impact var.enable\_reconfigure behavior. | `bool` | `false` | no | +| [enable\_cleanup\_compute](#input\_enable\_cleanup\_compute) | Enables automatic cleanup of compute nodes and resource policies (e.g.
placement groups) managed by this module, when cluster is destroyed.

NOTE: Requires Python and pip packages listed at the following link:
https://github.com/GoogleCloudPlatform/slurm-gcp/blob/3979e81fc5e4f021b5533a23baa474490f4f3614/scripts/requirements.txt

*WARNING*: Toggling this may impact the running workload. Deployed compute nodes
may be destroyed and their jobs will be requeued. | `bool` | `false` | no | +| [enable\_cleanup\_subscriptions](#input\_enable\_cleanup\_subscriptions) | Enables automatic cleanup of pub/sub subscriptions managed by this module, when
cluster is destroyed.

NOTE: Requires Python and pip packages listed at the following link:
https://github.com/GoogleCloudPlatform/slurm-gcp/blob/3979e81fc5e4f021b5533a23baa474490f4f3614/scripts/requirements.txt

*WARNING*: Toggling this may temporarily impact var.enable\_reconfigure behavior. | `bool` | `false` | no | | [enable\_confidential\_vm](#input\_enable\_confidential\_vm) | Enable the Confidential VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [enable\_devel](#input\_enable\_devel) | Enables development mode. Not for production use. | `bool` | `false` | no | | [enable\_oslogin](#input\_enable\_oslogin) | Enables Google Cloud os-login for user login and authentication for VMs.
See https://cloud.google.com/compute/docs/oslogin | `bool` | `true` | no | @@ -259,7 +259,7 @@ limitations under the License. | [epilog\_scripts](#input\_epilog\_scripts) | List of scripts to be used for Epilog. Programs for the slurmd to execute
on every node when a user's job completes.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Epilog. |
list(object({
filename = string
content = string
}))
| `[]` | no | | [gpu](#input\_gpu) | DEPRECATED: use var.guest\_accelerator |
object({
type = string
count = number
})
| `null` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-5-9-hpc-centos-7",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-5-10-hpc-centos-7",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [instance\_template](#input\_instance\_template) | Self link to a custom instance template. If set, other VM definition
variables such as machine\_type and instance\_image will be ignored in favor
of the provided instance template.

For more information on creating custom images for the instance template
that comply with Slurm on GCP see the "Slurm on GCP Custom Images" section
in docs/vm-images.md. | `string` | `null` | no | | [labels](#input\_labels) | Labels, provided as a map. | `map(string)` | `{}` | no | @@ -271,7 +271,7 @@ limitations under the License. | [network\_self\_link](#input\_network\_self\_link) | Network to deploy to. Either network\_self\_link or subnetwork\_self\_link must be specified. | `string` | `null` | no | | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on all instances. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
| `[]` | no | | [on\_host\_maintenance](#input\_on\_host\_maintenance) | Instance availability Policy. | `string` | `"MIGRATE"` | no | -| [partition](#input\_partition) | Cluster partitions as a list. |
list(object({
compute_list = list(string)
partition = object({
enable_job_exclusive = bool
enable_placement_groups = bool
network_storage = list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
}))
partition_conf = map(string)
partition_feature = string
partition_name = string
partition_nodes = map(object({
access_config = list(object({
network_tier = string
}))
bandwidth_tier = string
node_count_dynamic_max = number
node_count_static = number
enable_spot_vm = bool
group_name = string
instance_template = string
node_conf = map(string)
reservation_name = string
spot_instance_config = object({
termination_action = string
})
}))
partition_startup_scripts_timeout = number
subnetwork = string
zone_policy_allow = list(string)
zone_policy_deny = list(string)
zone_target_shape = string
})
}))
| `[]` | no | +| [partition](#input\_partition) | Cluster partitions as a list. |
list(object({
compute_list = list(string)
partition = object({
enable_job_exclusive = bool
enable_placement_groups = bool
network_storage = list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
}))
partition_conf = map(string)
partition_feature = string
partition_name = string
partition_nodes = map(object({
access_config = list(object({
network_tier = string
}))
bandwidth_tier = string
node_count_dynamic_max = number
node_count_static = number
enable_spot_vm = bool
group_name = string
instance_template = string
maintenance_interval = string
node_conf = map(string)
reservation_name = string
spot_instance_config = object({
termination_action = string
})
}))
partition_startup_scripts_timeout = number
subnetwork = string
zone_policy_allow = list(string)
zone_policy_deny = list(string)
zone_target_shape = string
})
}))
| `[]` | no | | [preemptible](#input\_preemptible) | Allow the instance to be preempted. | `bool` | `false` | no | | [project\_id](#input\_project\_id) | Project ID to create resources in. | `string` | n/a | yes | | [prolog\_scripts](#input\_prolog\_scripts) | List of scripts to be used for Prolog. Programs for the slurmd to execute
whenever it is asked to run a job step from a new job allocation.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Prolog. |
list(object({
filename = string
content = string
}))
| `[]` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf index 834b00240b..fbbc0c0b5c 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf @@ -55,7 +55,7 @@ data "google_compute_default_service_account" "default" { } module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_instance?ref=5.9.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_instance?ref=5.10.2" access_config = local.access_config slurm_cluster_name = local.slurm_cluster_name @@ -92,7 +92,7 @@ module "slurm_controller_instance" { } module "slurm_controller_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=5.9.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=5.10.2" additional_disks = local.additional_disks can_ip_forward = var.can_ip_forward diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/source_image_logic.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/source_image_logic.tf index 3acb583f3b..ddcb1ff6ee 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/source_image_logic.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/source_image_logic.tf @@ -18,12 +18,12 @@ locals { # Currently supported images and projects known_project_families = { schedmd-slurm-public = [ - "slurm-gcp-5-9-debian-11", - "slurm-gcp-5-9-hpc-rocky-linux-8", - "slurm-gcp-5-9-ubuntu-2004-lts", - "slurm-gcp-5-9-ubuntu-2204-lts-arm64", - "slurm-gcp-5-9-hpc-centos-7-k80", - "slurm-gcp-5-9-hpc-centos-7" + "slurm-gcp-5-10-debian-11", + 
"slurm-gcp-5-10-hpc-rocky-linux-8", + "slurm-gcp-5-10-ubuntu-2004-lts", + "slurm-gcp-5-10-ubuntu-2204-lts-arm64", + "slurm-gcp-5-10-hpc-centos-7-k80", + "slurm-gcp-5-10-hpc-centos-7" ] } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf index 9f3c5810ed..27fac71324 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf @@ -15,7 +15,7 @@ */ # Most variables have been sourced and modified from the SchedMD/slurm-gcp -# github repository: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 +# github repository: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2 variable "access_config" { description = "Access configurations, i.e. IPs via which the VM instance can be accessed via the Internet." @@ -214,7 +214,7 @@ variable "enable_cleanup_compute" { placement groups) managed by this module, when cluster is destroyed. NOTE: Requires Python and pip packages listed at the following link: - https://github.com/SchedMD/slurm-gcp/blob/3979e81fc5e4f021b5533a23baa474490f4f3614/scripts/requirements.txt + https://github.com/GoogleCloudPlatform/slurm-gcp/blob/3979e81fc5e4f021b5533a23baa474490f4f3614/scripts/requirements.txt *WARNING*: Toggling this may impact the running workload. Deployed compute nodes may be destroyed and their jobs will be requeued. @@ -229,7 +229,7 @@ variable "enable_cleanup_subscriptions" { cluster is destroyed. NOTE: Requires Python and pip packages listed at the following link: - https://github.com/SchedMD/slurm-gcp/blob/3979e81fc5e4f021b5533a23baa474490f4f3614/scripts/requirements.txt + https://github.com/GoogleCloudPlatform/slurm-gcp/blob/3979e81fc5e4f021b5533a23baa474490f4f3614/scripts/requirements.txt *WARNING*: Toggling this may temporarily impact var.enable_reconfigure behavior. 
EOD @@ -413,6 +413,7 @@ variable "partition" { enable_spot_vm = bool group_name = string instance_template = string + maintenance_interval = string node_conf = map(string) reservation_name = string spot_instance_config = object({ @@ -552,7 +553,7 @@ variable "instance_image" { type = map(string) default = { project = "schedmd-slurm-public" - family = "slurm-gcp-5-9-hpc-centos-7" + family = "slurm-gcp-5-10-hpc-centos-7" } validation { diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md index e50e1baddf..8a897be889 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md @@ -38,7 +38,7 @@ manually. This will require addition configuration and verification of permissions. For more information see the [hybrid.md] documentation on [slurm-gcp]. -[slurm-controller-hybrid]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1/terraform/slurm_cluster/modules/slurm_controller_hybrid +[slurm-controller-hybrid]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2/terraform/slurm_cluster/modules/slurm_controller_hybrid > **_NOTE:_** The hybrid module requires the following dependencies to be > installed on the system deploying the module: @@ -58,15 +58,15 @@ permissions. For more information see the [hybrid.md] documentation on [pyyaml]: https://pypi.org/project/PyYAML/ [google-api-python-client]: https://pypi.org/project/google-api-python-client/ [google-cloud-pubsub]: https://pypi.org/project/google-cloud-pubsub/ -[requirements.txt]: https://github.com/SchedMD/slurm-gcp/blob/5.9.1/scripts/requirements.txt +[requirements.txt]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.10.2/scripts/requirements.txt ### Manual Configuration This module *does not* complete the installation of hybrid partitions on your slurm cluster. 
After deploying, you must follow the steps listed out in the [hybrid.md] documentation under [manual steps]. -[hybrid.md]: https://github.com/SchedMD/slurm-gcp/blob/5.9.1/docs/hybrid.md -[manual steps]: https://github.com/SchedMD/slurm-gcp/blob/5.9.1/docs/hybrid.md#manual-configurations +[hybrid.md]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.10.2/docs/hybrid.md +[manual steps]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.10.2/docs/hybrid.md#manual-configurations ### Example Usage The hybrid module can be added to a blueprint as follows: @@ -146,10 +146,10 @@ strongly advise only using versions 21 or 22 when using this module. Attempting to use this module with any version older than 21 may lead to unexpected results. -[slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2 [pre-existing-network-storage]: ../../../../modules/file-system/pre-existing-network-storage/ [schedmd-slurm-gcp-v5-partition]: ../../compute/schedmd-slurm-gcp-v5-partition/ -[packer templates]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1/packer +[packer templates]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2/packer ## License @@ -181,7 +181,7 @@ No providers. 
| Name | Source | Version | |------|--------|---------| -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid | 5.9.1 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid | 5.10.2 | ## Resources diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf index a3d30a1f24..411cec0dd0 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf @@ -28,7 +28,7 @@ locals { } module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid?ref=5.9.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid?ref=5.10.2" project_id = var.project_id slurm_cluster_name = local.slurm_cluster_name diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md index 09979f2320..f4d39e56a0 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md @@ -5,9 +5,9 @@ This module creates a login node for a Slurm cluster based on the terraform modules. The login node is used in conjunction with the [Slurm controller](../schedmd-slurm-gcp-v5-controller/README.md). 
-[SchedMD/slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 -[slurm\_login\_instance]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1/terraform/slurm_cluster/modules/slurm_login_instance -[slurm\_instance\_template]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1/terraform/slurm_cluster/modules/slurm_instance_template +[SchedMD/slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2 +[slurm\_login\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2/terraform/slurm_cluster/modules/slurm_login_instance +[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2/terraform/slurm_cluster/modules/slurm_instance_template ### Example @@ -46,8 +46,8 @@ The HPC Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform modules. For support with the underlying modules, see the instructions in the [slurm-gcp README][slurm-gcp-readme]. -[slurm-on-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 -[slurm-gcp-readme]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1#slurm-on-google-cloud-platform +[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2 +[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2#slurm-on-google-cloud-platform ## License @@ -82,8 +82,8 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance | 5.9.1 | -| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 5.9.1 | +| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance | 5.10.2 | +| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 5.10.2 | ## Resources @@ -113,7 +113,7 @@ limitations under the License. | [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [gpu](#input\_gpu) | DEPRECATED: use var.guest\_accelerator |
object({
type = string
count = number
})
| `null` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm login node VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-5-9-hpc-centos-7",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm login node VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-5-10-hpc-centos-7",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [instance\_template](#input\_instance\_template) | Self link to a custom instance template. If set, other VM definition
variables such as machine\_type and instance\_image will be ignored in favor
of the provided instance template.

For more information on creating custom images for the instance template
that comply with Slurm on GCP see the "Slurm on GCP Custom Images" section
in docs/vm-images.md. | `string` | `null` | no | | [labels](#input\_labels) | Labels, provided as a map. | `map(string)` | `{}` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf index 214019af31..9888a764d6 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf @@ -50,7 +50,7 @@ data "google_compute_default_service_account" "default" { } module "slurm_login_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=5.9.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=5.10.2" additional_disks = local.additional_disks can_ip_forward = var.can_ip_forward @@ -88,7 +88,7 @@ module "slurm_login_template" { } module "slurm_login_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance?ref=5.9.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance?ref=5.10.2" access_config = local.access_config slurm_cluster_name = local.slurm_cluster_name diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/source_image_logic.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/source_image_logic.tf index 3acb583f3b..ddcb1ff6ee 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/source_image_logic.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/source_image_logic.tf @@ -18,12 +18,12 @@ locals { # Currently supported images and projects known_project_families = { schedmd-slurm-public = [ - "slurm-gcp-5-9-debian-11", - "slurm-gcp-5-9-hpc-rocky-linux-8", - "slurm-gcp-5-9-ubuntu-2004-lts", - "slurm-gcp-5-9-ubuntu-2204-lts-arm64", - "slurm-gcp-5-9-hpc-centos-7-k80", - 
"slurm-gcp-5-9-hpc-centos-7" + "slurm-gcp-5-10-debian-11", + "slurm-gcp-5-10-hpc-rocky-linux-8", + "slurm-gcp-5-10-ubuntu-2004-lts", + "slurm-gcp-5-10-ubuntu-2204-lts-arm64", + "slurm-gcp-5-10-hpc-centos-7-k80", + "slurm-gcp-5-10-hpc-centos-7" ] } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf index 0b43011968..709df950be 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf @@ -15,7 +15,7 @@ */ # Most variables have been sourced and modified from the SchedMD/slurm-gcp -# github repository: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 +# github repository: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2 variable "project_id" { type = string @@ -296,7 +296,7 @@ variable "instance_image" { type = map(string) default = { project = "schedmd-slurm-public" - family = "slurm-gcp-5-9-hpc-centos-7" + family = "slurm-gcp-5-10-hpc-centos-7" } validation { diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index fd7033b56f..230e5661af 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -11,11 +11,11 @@ The [user guide][slurm-ug] provides detailed instructions on customizing and enhancing the Slurm on GCP cluster as well as recommendations on configuring the controller for optimal performance at different scales. 
-[SchedMD/slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/6.2.0 -[slurm\_controller\_instance]: https://github.com/SchedMD/slurm-gcp/tree/6.2.0/terraform/slurm_cluster/modules/slurm_controller_instance -[slurm\_instance\_template]: https://github.com/SchedMD/slurm-gcp/tree/6.2.0/terraform/slurm_cluster/modules/slurm_instance_template +[SchedMD/slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.2.0 +[slurm\_controller\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.2.0/terraform/slurm_cluster/modules/slurm_controller_instance +[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.2.0/terraform/slurm_cluster/modules/slurm_instance_template [slurm-ug]: https://goo.gle/slurm-gcp-user-guide. -[requirements.txt]: https://github.com/SchedMD/slurm-gcp/blob/6.2.0/scripts/requirements.txt +[requirements.txt]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/6.2.0/scripts/requirements.txt [enable\_cleanup\_compute]: #input\_enable\_cleanup\_compute [enable\_cleanup\_subscriptions]: #input\_enable\_cleanup\_subscriptions [enable\_reconfigure]: #input\_enable\_reconfigure @@ -87,8 +87,8 @@ The HPC Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform modules. For support with the underlying modules, see the instructions in the [slurm-gcp README][slurm-gcp-readme]. 
-[slurm-on-gcp]: https://github.com/SchedMD/slurm-gcp -[slurm-gcp-readme]: https://github.com/SchedMD/slurm-gcp#slurm-on-google-cloud-platform +[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp +[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp#slurm-on-google-cloud-platform ## License diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md index 682a4c2a68..8fad372646 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md @@ -5,9 +5,9 @@ This module creates a login node for a Slurm cluster based on the terraform modules. The login node is used in conjunction with the [Slurm controller](../schedmd-slurm-gcp-v5-controller/README.md). -[SchedMD/slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/6.2.0 -[slurm\_login\_instance]: https://github.com/SchedMD/slurm-gcp/tree/6.2.0/terraform/slurm_cluster/modules/slurm_login_instance -[slurm\_instance\_template]: https://github.com/SchedMD/slurm-gcp/tree/6.2.0/terraform/slurm_cluster/modules/slurm_instance_template +[SchedMD/slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.2.0 +[slurm\_login\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.2.0/terraform/slurm_cluster/modules/slurm_login_instance +[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.2.0/terraform/slurm_cluster/modules/slurm_instance_template ### Example @@ -52,8 +52,8 @@ The HPC Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform modules. For support with the underlying modules, see the instructions in the [slurm-gcp README][slurm-gcp-readme]. 
-[slurm-on-gcp]: https://github.com/SchedMD/slurm-gcp/tree/6.2.0 -[slurm-gcp-readme]: https://github.com/SchedMD/slurm-gcp/tree/6.2.0#slurm-on-google-cloud-platform +[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.2.0 +[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.2.0#slurm-on-google-cloud-platform ## Requirements diff --git a/docs/gpu-support.md b/docs/gpu-support.md index bf600f96dc..c1aa4989a3 100644 --- a/docs/gpu-support.md +++ b/docs/gpu-support.md @@ -132,7 +132,7 @@ information, see the SchedMD documentation: * [srun Documentation](https://slurm.schedmd.com/srun.html) * [sbatch Documentation](https://slurm.schedmd.com/sbatch.html) -[slurm-gcp]: https://github.com/SchedMD/slurm-gcp +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp [cloud_parameters]: https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/community/modules/scheduler/schedmd-slurm-gcp-v5-controller#input_cloud_parameters ## Further Reading diff --git a/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md b/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md index 0fa3b5595f..5bd753d2e7 100644 --- a/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md +++ b/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md @@ -22,7 +22,7 @@ for use with an on-premise slurm-cluster. > further testing is done, documentation on applying the hybrid module to > on-premise slurm clusters will be added and expanded. 
-[slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2 ## Definitions diff --git a/docs/hybrid-slurm-cluster/deploy-instructions.md b/docs/hybrid-slurm-cluster/deploy-instructions.md index 1b3f60a354..ada2606dea 100644 --- a/docs/hybrid-slurm-cluster/deploy-instructions.md +++ b/docs/hybrid-slurm-cluster/deploy-instructions.md @@ -264,8 +264,8 @@ sudo systemctl restart slurmctld If the restart did not succeed, the logs at `/var/log/slurm/slurmctld.log` should point you in the right direction. -[slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 -[slurm-gcp-hybrid]: https://github.com/SchedMD/slurm-gcp/blob/5.9.1/docs/hybrid.md +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2 +[slurm-gcp-hybrid]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.10.2/docs/hybrid.md [demo-with-cloud-controller-instructions.md]: ./demo-with-cloud-controller-instructions.md ## Validate the Hybrid Cluster diff --git a/docs/hybrid-slurm-cluster/on-prem-instructions.md b/docs/hybrid-slurm-cluster/on-prem-instructions.md index 1ab5f94d4b..037019e887 100644 --- a/docs/hybrid-slurm-cluster/on-prem-instructions.md +++ b/docs/hybrid-slurm-cluster/on-prem-instructions.md @@ -39,9 +39,9 @@ detail, as well as how to customize many of these assumptions to fit your needs. deployments in their [hybrid.md] documentation. 
[hybridmodule]: ../../community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md -[slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 -[slurm\_controller\_hybrid]: https://github.com/SchedMD/slurm-gcp/tree/master/terraform/slurm_cluster/modules/slurm_controller_hybrid -[hybrid.md]: https://github.com/SchedMD/slurm-gcp/blob/5.9.1/docs/hybrid.md +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2 +[slurm\_controller\_hybrid]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/master/terraform/slurm_cluster/modules/slurm_controller_hybrid +[hybrid.md]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.10.2/docs/hybrid.md ### NFS Mounts @@ -224,7 +224,7 @@ image created with slurm 21.08.8: node_count_dynamic_max: 20 instance_image: project: $(vars.project_id) - family: slurm-gcp-5-9-hpc-centos-7 + family: slurm-gcp-5-10-hpc-centos-7 - id: compute-partition source: community/modules/compute/schedmd-slurm-gcp-v5-partition @@ -235,12 +235,12 @@ image created with slurm 21.08.8: partition_name: compute ``` -[slurmgcppacker]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1/packer -[example.pkrvars.hcl]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1/packer/example.pkrvars.hcl -[slurmversion]: https://github.com/SchedMD/slurm-gcp/blob/5.9.1/packer/variables.pkr.hcl#L97 -[`service_account_scopes`]: https://github.com/SchedMD/slurm-gcp/blob/5.9.1/packer/variables.pkr.hcl#L166 -[`munge_user`]: https://github.com/SchedMD/slurm-gcp/blob/5.9.1/ansible/roles/munge/defaults/main.yml#L17 -[`slurm_user`]: https://github.com/SchedMD/slurm-gcp/blob/5.9.1/ansible/roles/slurm/defaults/main.yml#L31 +[slurmgcppacker]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2/packer +[example.pkrvars.hcl]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2/packer/example.pkrvars.hcl +[slurmversion]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.10.2/packer/variables.pkr.hcl#L97 +[`service_account_scopes`]: 
https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.10.2/packer/variables.pkr.hcl#L166 +[`munge_user`]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.10.2/ansible/roles/munge/defaults/main.yml#L17 +[`slurm_user`]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.10.2/ansible/roles/slurm/defaults/main.yml#L31 ## On Premise Setup diff --git a/docs/image-building.md b/docs/image-building.md index 46ce0064b1..e8ff335f8d 100644 --- a/docs/image-building.md +++ b/docs/image-building.md @@ -15,7 +15,7 @@ operating system and your HPC applications. A typical custom image workflow is: [images]: https://cloud.google.com/compute/docs/images [standard-os]: https://cloud.google.com/compute/docs/images/os-details -[slurm-images]: https://github.com/SchedMD/slurm-gcp/blob/master/docs/images.md#public-image +[slurm-images]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#public-image ## Examples @@ -154,7 +154,7 @@ a subdirectory. > to Ansible playbooks by a relative path (`../ansible`) that will not be > downloaded. 
-[schedmd-packer]: https://github.com/SchedMD/slurm-gcp/tree/master/packer#readme +[schedmd-packer]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/master/packer#readme For example, to address the issue noted above: @@ -168,7 +168,7 @@ deployment_groups: - group: packer modules: - id: custom-image - source: github.com/SchedMD/slurm-gcp//packer?ref=5.9.1&depth=1 + source: github.com/GoogleCloudPlatform/slurm-gcp//packer?ref=5.10.2&depth=1 kind: packer settings: use_iap: true diff --git a/docs/vm-images.md b/docs/vm-images.md index 8989c43953..89f2d87d05 100644 --- a/docs/vm-images.md +++ b/docs/vm-images.md @@ -313,8 +313,8 @@ These instructions apply to the following modules: * [schedmd-slurm-gcp-v5-login] * [schedmd-slurm-gcp-v5-node-group] -[slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/v5 -[slurm-gcp-packer]: https://github.com/SchedMD/slurm-gcp/tree/v5/packer +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/v5 +[slurm-gcp-packer]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/v5/packer [slurm-gcp-images]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md [slurm-gcp-published-images]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family [gcloud-compute-images]: https://cloud.google.com/sdk/gcloud/reference/compute/images/create diff --git a/examples/README.md b/examples/README.md index 97cd4d8000..15fc1bb50f 100644 --- a/examples/README.md +++ b/examples/README.md @@ -605,11 +605,11 @@ The blueprint contains 3 groups: > > ```shell > # Install Python3 and run -> pip3 install -r https://raw.githubusercontent.com/SchedMD/slurm-gcp/5.9.1/scripts/requirements.txt +> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.10.2/scripts/requirements.txt > ``` Similar to the [hpc-slurm.yaml] example, but using Ubuntu 20.04 instead of CentOS 7. 
-[Other operating systems] are supported by SchedMD for the the Slurm on GCP project and images are listed [here](https://github.com/SchedMD/slurm-gcp/blob/master/docs/images.md#published-image-family). Only the examples listed in this page been tested by the Cloud HPC Toolkit team. +[Other operating systems] are supported by SchedMD for the Slurm on GCP project and images are listed [here](https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family). Only the examples listed in this page have been tested by the Cloud HPC Toolkit team. The cluster will support 2 partitions named `debug` and `compute`. The `debug` partition is the default partition and runs on smaller @@ -618,7 +618,7 @@ specifying in the `srun` command via the `--partition` flag. The `compute` partition runs on compute optimized nodes of type `cs-standard-60`. The `compute` partition may require additional quota before using. -[Other operating systems]: https://github.com/SchedMD/slurm-gcp/blob/master/docs/images.md#supported-operating-systems +[Other operating systems]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#supported-operating-systems [hpc-slurm-ubuntu2004.yaml]: ../community/examples/hpc-slurm-ubuntu2004.yaml #### Quota Requirements for hpc-slurm-ubuntu2004.yaml @@ -910,7 +910,7 @@ tuned for the execution of many short-duration, loosely-coupled (non-MPI) jobs.
For more information see: -* [Slurm on Google Cloud High Throughput documentation](https://github.com/SchedMD/slurm-gcp/blob/master/docs/htc.md) +* [Slurm on Google Cloud High Throughput documentation](https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/htc.md) * [General Slurm High Throughput documentation](https://slurm.schedmd.com/high_throughput.html) [htc-slurm.yaml]: ../community/examples/htc-slurm.yaml diff --git a/examples/cae/cae-slurm.yaml b/examples/cae/cae-slurm.yaml index 5da17f2777..ab641f96f6 100644 --- a/examples/cae/cae-slurm.yaml +++ b/examples/cae/cae-slurm.yaml @@ -28,6 +28,7 @@ # blueprint_name: cae-slurm vars: + enable_devel: true project_id: ## Set GCP Project ID Here ## deployment_name: cae-slurm # check here for other regions with H3 deployments: https://cloud.google.com/compute/docs/regions-zones @@ -36,14 +37,14 @@ vars: # zone: europe-west4-b region: us-central1 zone: us-central1-a - # Visit https://github.com/SchedMD/slurm-gcp/blob/master/docs/images.md#published-image-family + # Visit https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family # for a list of valid family options with Slurm; note: the image types for the compute nodes # and the Chrome Remote Desktop (CRD) need to have the same Slurm base. 
instance_image: - family: slurm-gcp-5-9-hpc-centos-7 + family: slurm-gcp-5-10-hpc-centos-7 project: schedmd-slurm-public crd_instance_image: - family: slurm-gcp-5-9-debian-11 # must be Debian for CRD + family: slurm-gcp-5-10-debian-11 # must be Debian for CRD project: schedmd-slurm-public # Documentation for each of the modules used below can be found at diff --git a/examples/hpc-enterprise-slurm.yaml b/examples/hpc-enterprise-slurm.yaml index fefe24cc2b..fb710ee028 100644 --- a/examples/hpc-enterprise-slurm.yaml +++ b/examples/hpc-enterprise-slurm.yaml @@ -17,15 +17,16 @@ blueprint_name: hpc-enterprise-slurm vars: + enable_devel: true project_id: ## Set GCP Project ID Here ## deployment_name: hpc01 region: us-central1 zone: us-central1-a gpu_zones: [us-central1-a, us-central1-b, us-central1-c, us-central1-f] slurm_image: - # Visit https://github.com/SchedMD/slurm-gcp/blob/master/docs/images.md#published-image-family + # Visit https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family # for a list of valid family options with Slurm - family: slurm-gcp-5-9-hpc-centos-7 + family: slurm-gcp-5-10-hpc-centos-7 project: schedmd-slurm-public # Set to true for active cluster reconfiguration. # Note that setting this option requires additional dependencies to be installed locally. @@ -89,16 +90,12 @@ deployment_groups: source: modules/file-system/filestore use: [network1] settings: - filestore_tier: BASIC_SSD - size_gb: 2560 # smallest size for BASIC_SSD local_mount: /home - id: projectsfs source: modules/file-system/filestore use: [network1] settings: - filestore_tier: HIGH_SCALE_SSD - size_gb: 10240 # smallest size for HIGH_SCALE_SSD local_mount: /projects # This file system has an associated license cost. 
diff --git a/examples/image-builder.yaml b/examples/image-builder.yaml index 4eeae609b5..fb842e009b 100644 --- a/examples/image-builder.yaml +++ b/examples/image-builder.yaml @@ -21,6 +21,7 @@ blueprint_name: image-builder vars: + enable_devel: true project_id: ## Set GCP Project ID Here ## deployment_name: image-builder-001 region: us-central1 @@ -59,8 +60,8 @@ deployment_groups: - scripts_for_image settings: source_image_project_id: [schedmd-slurm-public] - # see latest in https://github.com/SchedMD/slurm-gcp/blob/master/docs/images.md#published-image-family - source_image_family: slurm-gcp-5-9-hpc-centos-7 + # see latest in https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family + source_image_family: slurm-gcp-5-10-hpc-centos-7 # You can find size of source image by using following command # gcloud compute images describe-from-family --project schedmd-slurm-public disk_size: $(vars.disk_size) diff --git a/examples/ml-slurm.yaml b/examples/ml-slurm.yaml index aa06aaddbb..780b4b722b 100644 --- a/examples/ml-slurm.yaml +++ b/examples/ml-slurm.yaml @@ -16,6 +16,7 @@ blueprint_name: ml-slurm vars: + enable_devel: true project_id: ## Set project id here deployment_name: ml-example region: asia-southeast1 @@ -135,8 +136,8 @@ deployment_groups: # w/o new VPC omit_external_ip: false source_image_project_id: [schedmd-slurm-public] - # see latest in https://github.com/SchedMD/slurm-gcp/blob/master/docs/images.md#published-image-family - source_image_family: slurm-gcp-5-9-debian-11 + # see latest in https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family + source_image_family: slurm-gcp-5-10-debian-11 # You can find size of source image by using following command # gcloud compute images describe-from-family --project schedmd-slurm-public disk_size: $(vars.disk_size_gb) diff --git a/modules/README.md b/modules/README.md index 12ddaa4557..9448af47e6 100644 --- a/modules/README.md +++ 
b/modules/README.md @@ -209,8 +209,8 @@ Pub/Sub subscription. Primarily used for [FSI - MonteCarlo Tutorial][fsi-monteca [schedmd-slurm-gcp-v5-controller]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md [schedmd-slurm-gcp-v5-login]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md [schedmd-slurm-gcp-v5-hybrid]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md -[slurm-gcp-version-5]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 -[slurm-gcp-version-6]: https://github.com/SchedMD/slurm-gcp/tree/6.2.0 +[slurm-gcp-version-5]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2 +[slurm-gcp-version-6]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.2.0 [pbspro-client]: ../community/modules/scheduler/pbspro-client/README.md [pbspro-server]: ../community/modules/scheduler/pbspro-server/README.md diff --git a/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml b/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml index 3db7e9fa56..10d06b03f2 100644 --- a/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml +++ b/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml @@ -27,7 +27,7 @@ vars: # on_host_maintenance: MIGRATE num_nodes: 1 rocky_image: - family: slurm-gcp-5-9-hpc-rocky-linux-8 + family: slurm-gcp-5-10-hpc-rocky-linux-8 project: schedmd-slurm-public deployment_groups: @@ -85,7 +85,7 @@ deployment_groups: # settings: # node_count_dynamic_max: $(vars.num_nodes) # instance_image: - # family: slurm-gcp-5-9-ubuntu-2004-lts + # family: slurm-gcp-5-10-ubuntu-2004-lts # project: schedmd-slurm-public # instance_image_custom: true diff --git a/tools/cloud-build/daily-tests/tests/slurm-v5-debian.yml b/tools/cloud-build/daily-tests/tests/slurm-v5-debian.yml index d356e5f380..c6e271cdc6 100644 --- a/tools/cloud-build/daily-tests/tests/slurm-v5-debian.yml +++ b/tools/cloud-build/daily-tests/tests/slurm-v5-debian.yml @@ -21,7 +21,7 @@ deployment_name: "debi-v5-{{ build }}" 
slurm_cluster_name: "debiv5{{ build[0:4] }}" cli_deployment_vars: - instance_image: "{family: slurm-gcp-5-9-debian-11, project: schedmd-slurm-public}" + instance_image: "{family: slurm-gcp-5-10-debian-11, project: schedmd-slurm-public}" region: us-west4 zone: us-west4-c diff --git a/tools/cloud-build/daily-tests/tests/slurm-v5-rocky8.yml b/tools/cloud-build/daily-tests/tests/slurm-v5-rocky8.yml index 8f593332bf..5d26b72fdb 100644 --- a/tools/cloud-build/daily-tests/tests/slurm-v5-rocky8.yml +++ b/tools/cloud-build/daily-tests/tests/slurm-v5-rocky8.yml @@ -21,7 +21,7 @@ deployment_name: "rock-8-{{ build }}" slurm_cluster_name: "rock8{{ build[0:5] }}" cli_deployment_vars: - instance_image: "{family: slurm-gcp-5-9-hpc-rocky-linux-8, project: schedmd-slurm-public}" + instance_image: "{family: slurm-gcp-5-10-hpc-rocky-linux-8, project: schedmd-slurm-public}" region: us-west4 zone: us-west4-c diff --git a/tools/validate_configs/os_compatibility_tests/slurm-filestore.yaml b/tools/validate_configs/os_compatibility_tests/slurm-filestore.yaml index c56a226980..06eafcb4bd 100644 --- a/tools/validate_configs/os_compatibility_tests/slurm-filestore.yaml +++ b/tools/validate_configs/os_compatibility_tests/slurm-filestore.yaml @@ -24,11 +24,11 @@ vars: machine_type: n1-standard-2 instance_image: # Please refer to the following link for the latest images: - # https://github.com/SchedMD/slurm-gcp/blob/master/docs/images.md#supported-operating-systems - # family: slurm-gcp-5-9-ubuntu-2004-lts - # family: slurm-gcp-5-9-hpc-centos-7 - family: slurm-gcp-5-9-hpc-rocky-linux-8 - # family: slurm-gcp-5-9-debian-11 + # https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#supported-operating-systems + # family: slurm-gcp-5-10-ubuntu-2004-lts + # family: slurm-gcp-5-10-hpc-centos-7 + family: slurm-gcp-5-10-hpc-rocky-linux-8 + # family: slurm-gcp-5-10-debian-11 project: schedmd-slurm-public instance_image_custom: true diff --git 
a/tools/validate_configs/os_compatibility_tests/slurm-lustre.yaml b/tools/validate_configs/os_compatibility_tests/slurm-lustre.yaml index 593442b137..bab5ee5183 100644 --- a/tools/validate_configs/os_compatibility_tests/slurm-lustre.yaml +++ b/tools/validate_configs/os_compatibility_tests/slurm-lustre.yaml @@ -24,10 +24,10 @@ vars: machine_type: n1-standard-2 instance_image: # Please refer to the following link for the latest images: - # https://github.com/SchedMD/slurm-gcp/blob/master/docs/images.md#supported-operating-systems - # family: slurm-gcp-5-9-ubuntu-2004-lts - # family: slurm-gcp-5-9-hpc-centos-7 - family: slurm-gcp-5-9-hpc-rocky-linux-8 + # https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#supported-operating-systems + # family: slurm-gcp-5-10-ubuntu-2004-lts + # family: slurm-gcp-5-10-hpc-centos-7 + family: slurm-gcp-5-10-hpc-rocky-linux-8 project: schedmd-slurm-public instance_image_custom: true diff --git a/tools/validate_configs/os_compatibility_tests/slurm-startup.yaml b/tools/validate_configs/os_compatibility_tests/slurm-startup.yaml index 7ab7513756..6c5164de61 100644 --- a/tools/validate_configs/os_compatibility_tests/slurm-startup.yaml +++ b/tools/validate_configs/os_compatibility_tests/slurm-startup.yaml @@ -24,11 +24,11 @@ vars: machine_type: n1-standard-2 instance_image: # Please refer to the following link for the latest images: - # https://github.com/SchedMD/slurm-gcp/blob/master/docs/images.md#supported-operating-systems - # family: slurm-gcp-5-9-ubuntu-2004-lts - # family: slurm-gcp-5-9-hpc-centos-7 - family: slurm-gcp-5-9-hpc-rocky-linux-8 - # family: slurm-gcp-5-9-debian-11 + # https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#supported-operating-systems + # family: slurm-gcp-5-10-ubuntu-2004-lts + # family: slurm-gcp-5-10-hpc-centos-7 + family: slurm-gcp-5-10-hpc-rocky-linux-8 + # family: slurm-gcp-5-10-debian-11 project: schedmd-slurm-public instance_image_custom: true diff --git 
a/tools/validate_configs/test_configs/node-groups.yaml b/tools/validate_configs/test_configs/node-groups.yaml index 9b611679e6..28c776f4c1 100644 --- a/tools/validate_configs/test_configs/node-groups.yaml +++ b/tools/validate_configs/test_configs/node-groups.yaml @@ -64,7 +64,7 @@ deployment_groups: name: c30 machine_type: c2-standard-30 instance_image: - family: slurm-gcp-5-9-debian-11 + family: slurm-gcp-5-10-debian-11 project: schedmd-slurm-public instance_image_custom: true @@ -83,7 +83,7 @@ deployment_groups: name: cd112 machine_type: c2d-standard-112 instance_image: - family: slurm-gcp-5-9-hpc-centos-7 + family: slurm-gcp-5-10-hpc-centos-7 project: schedmd-slurm-public instance_image_custom: true enable_smt: true @@ -139,6 +139,7 @@ deployment_groups: instance_template: null labels: $(vars.labels) machine_type: n2-standard-16 + maintenance_interval: null metadata: {} min_cpu_platform: null on_host_maintenance: TERMINATE diff --git a/tools/validate_configs/test_configs/slurm-static-test.yaml b/tools/validate_configs/test_configs/slurm-static-test.yaml index 5d5ed3cf4c..7e3adcbb9a 100644 --- a/tools/validate_configs/test_configs/slurm-static-test.yaml +++ b/tools/validate_configs/test_configs/slurm-static-test.yaml @@ -24,11 +24,11 @@ vars: machine_type: n1-standard-2 instance_image: # Please refer to the following link for the latest images: - # https://github.com/SchedMD/slurm-gcp/blob/master/docs/images.md#supported-operating-systems - # family: slurm-gcp-5-9-ubuntu-2004-lts - # family: slurm-gcp-5-9-hpc-centos-7 - family: slurm-gcp-5-9-hpc-rocky-linux-8 - # family: slurm-gcp-5-9-debian-11 + # https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#supported-operating-systems + # family: slurm-gcp-5-10-ubuntu-2004-lts + # family: slurm-gcp-5-10-hpc-centos-7 + family: slurm-gcp-5-10-hpc-rocky-linux-8 + # family: slurm-gcp-5-10-debian-11 project: schedmd-slurm-public instance_image_custom: true enable_reconfigure: true